Compare commits
5 Commits
25f8c4c686 ... 5d94991167
| Author | SHA1 | Date |
|---|---|---|
| | 5d94991167 | |
| | 482d23dd4f | |
| | 452de87f35 | |
| | 73cb883151 | |
| | 5c933fc5c9 | |
89
scraper.py
@@ -1 +1,88 @@
#!/usr/bin/env python

import re
import argparse
import urllib.request
from bs4 import BeautifulSoup


class WebPage(object):

    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}

    def __init__(self, args):
        self.url = args['url']

    def get_source(self, args=None):
        # headers is a class attribute, so it has to be reached through self.
        request = urllib.request.Request(self.url, headers=self.headers)
        page = urllib.request.urlopen(request)
        self.source = page.read()

    def find_links(self, args=None, source=None):
        soup = BeautifulSoup(self.source, 'html.parser')
        links = soup.find_all('a')
        hrefs = []

        for link in links:
            if link['href'].startswith('/'):
                hrefs.append("".join([self.url, link['href']]))
            else:
                hrefs.append(link['href'])

        return hrefs


def run(args=None):
    source = get_source(args)
    urls = find_links(args, source)
    local_urls = parse_urls(args, urls)

    print(local_urls)


def get_source(args=None):
    url = args.url
    useragent = 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'
    headers = {'User-Agent': useragent}
    request = urllib.request.Request(url, headers=headers)
    page = urllib.request.urlopen(request)
    source = page.read()

    return source


def find_links(args=None, source=None):
    soup = BeautifulSoup(source, 'html.parser')
    links = soup.find_all('a')
    hrefs = []

    for link in links:
        if link['href'].startswith('/'):
            hrefs.append("".join([args.url, link['href']]))
        else:
            hrefs.append(link['href'])

    return hrefs


def parse_urls(args=None, urls=None):
    local_urls = []

    for url in urls:
        if url.startswith(args.url):
            local_urls.append(url)

    return local_urls


if __name__ == "__main__":

    parser = argparse.ArgumentParser(description='Recursive web scraper')
    parser.add_argument("-u", "--url", required=True, help="Base url to scrape")
    args = parser.parse_args()

    if not args.url.startswith('http'):
        raise SystemExit('URL must start with a protocol (http(s)).')

    run(args)
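For reference, a minimal sketch of exercising the module-level helpers above outside the argparse entry point; the SimpleNamespace stand-in for parser.parse_args(), the example URL, and the assumption that scraper.py is importable are illustrative only, not part of these commits.

from types import SimpleNamespace

import scraper  # assumes the scraper.py shown above is on the import path

# Stand-in for the namespace that parser.parse_args() builds in __main__.
args = SimpleNamespace(url='https://example.com')

source = scraper.get_source(args)        # fetch the raw HTML with the custom User-Agent
urls = scraper.find_links(args, source)  # collect hrefs, prefixing site-relative ones
print(scraper.parse_urls(args, urls))    # keep only URLs under the base URL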
0
utils/__init__.py
Normal file
@@ -3,6 +3,35 @@
Utilities to provide various misc functions.
'''


class UrlPoolManager(object):
    '''
    Object to manage the lifecycle of a pool of URLs.
    '''

    def __init__(self):
        self.url_pool = dict()
        self.not_crawled = 0
        self.crawled = 1
        self.invalid = 2

    def check_duplicate(self, new_url):
        # A URL is a duplicate if it is already tracked in the pool.
        return new_url in self.url_pool

    def invalidate_url(self, url):
        self.url_pool[url] = self.invalid

    def add_to_list(self, url):
        self.url_pool[url] = self.not_crawled
        # calculate depth
        # add link, crawled status to url_pool

    def mark_as_crawled(self, url):
        self.url_pool[url] = self.crawled


def clean_base_url(url):
    '''
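As a quick illustration of the intended lifecycle, a hedged sketch of driving UrlPoolManager from a crawl loop; the import path and the example URLs are assumptions, since this diff does not show how the class is wired into the scraper.

from utils import UrlPoolManager  # assuming the class is importable from the utils package

pool = UrlPoolManager()

for url in ['https://example.com/', 'https://example.com/about']:
    if not pool.check_duplicate(url):
        pool.add_to_list(url)  # queued with the not_crawled status

pool.mark_as_crawled('https://example.com/')      # fetched successfully
pool.invalidate_url('https://example.com/about')  # e.g. the request failed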