too many changes to make a sensible commit message

2018-09-04 09:21:26 +01:00
parent abc628106d
commit 05e907ecec
2 changed files with 88 additions and 28 deletions

File 1 of 2:

@@ -5,35 +5,70 @@ Need a docstring.
 import argparse
 from utils.helpers import (UrlPool, WebPage, sanitise_url)
+from pprint import pprint


 def init_crawler(base_url=None):
     '''
     needs a docstring
     '''
     uncrawled_urls, crawled_urls = UrlPool(), UrlPool()
-    initial_page = WebPage(base_url)
+    initial_page = WebPage(url=base_url, base_url=base_url)
     try:
-        initial_urls = initial_page.run()
+        initial_page.run()
     except Exception as e:
         print(e)
+    initial_urls = initial_page.list_urls()
+    # ensure the base URL isn't crawled again
+    try:
+        initial_urls.remove(base_url)
+    except KeyError:
+        pass
+    # Add the base URL to the crawled pool
+    crawled_urls.add_to_pool(base_url)
     for url in initial_urls:
-        try:
-            uncrawled_urls.add_to_pool(url)
-        except Exception as e:
-            print(e)
-    print(uncrawled_urls.url_pool)
+        sanitised_url = sanitise_url(url=url)
+        if sanitised_url not in crawled_urls.pool:
+            uncrawled_urls.add_to_pool(sanitised_url)
+    return(uncrawled_urls, crawled_urls)
+
+
+def process_pool(base_url=None, uncrawled_urls=None, crawled_urls=None):
+    '''
+    Needs a docstring
+    '''
+    while uncrawled_urls.pool:
+        # pop url from pool
+        new_url = uncrawled_urls.remove_from_pool()
+        # create a WebPage object for the URL
+        current_page = WebPage(url=new_url, base_url=base_url)
+        try:
+            current_page.run()
+            _urls = current_page.list_urls()
+            crawled_urls.add_to_pool(new_url)
+        except Exception as e:
+            print(e)
+        for url in _urls:
+            sanitised_url = sanitise_url(url=url)
+            if sanitised_url not in crawled_urls.pool:
+                uncrawled_urls.add_to_pool(url)


 def run(args=None):
     '''
     needs a docstring.
     '''
-    base_url = sanitise_url(args.url)
-    init_crawler(base_url)
+    base_url = sanitise_url(args.url, base_url=True)
+    uncrawled_urls, crawled_urls = init_crawler(base_url)
+    process_pool(base_url, uncrawled_urls, crawled_urls)
+    pprint(crawled_urls.pool)
+    print('{0} URLs crawled'.format(len(crawled_urls.pool)))


 if __name__ == '__main__':
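The hunk above cuts off at the `if __name__ == '__main__':` guard, so the argument parsing itself isn't shown. A minimal sketch of how `run()` is presumably wired up, assuming only that argparse supplies the `url` attribute that `run()` reads from `args.url`; the argument description is illustrative, not from the commit:

# Hypothetical __main__ block for the crawler module above. argparse is already
# imported at the top of the file; only the `url` attribute is known from run(),
# everything else here is an assumption.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Crawl a site and list its same-domain URLs.')
    parser.add_argument('url', help='base URL to start crawling from')
    run(args=parser.parse_args())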

File 2 of 2:

@@ -14,22 +14,29 @@ class UrlPool(object):
     '''
     def __init__(self):
-        self.url_pool = set()
+        self.pool = set()

     def check_duplicate(self, new_url):
         '''
         Checks if a URL exists in the current pool.
         '''
-        if new_url in self.url_pool:
+        if new_url in self.pool:
             return True
         else:
             return False

-    def remove_from_pool(self, url):
-        self.url_pool.remove(url)
+    def remove_from_pool(self):
+        '''
+        Remove a URL from the pool and return it to be crawled.
+        '''
+        return(self.pool.pop())

     def add_to_pool(self, url):
-        self.url_pool.add(url)
+        self.pool.add(url)
+
+    def list_pool(self):
+        pool = self.pool
+        return pool


 class WebPage(object):
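A note on the reworked pool API above: `remove_from_pool()` now wraps `set.pop()`, which removes and returns an arbitrary member, so the crawl order is unspecified, and `add_to_pool()` silently collapses duplicates because the pool is a set. A standalone sketch, not part of the commit; the import path is taken from the crawler's own import line, and the URLs are illustrative:

from utils.helpers import UrlPool

pool = UrlPool()
pool.add_to_pool('http://example.com/a')
pool.add_to_pool('http://example.com/a')  # duplicate, absorbed by the set
pool.add_to_pool('http://example.com/b')
next_url = pool.remove_from_pool()        # arbitrary member -- no ordering guarantee
print(next_url, pool.pool)                # one URL is left behind in the pool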
@@ -40,8 +47,9 @@ class WebPage(object):
     headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}

-    def __init__(self, url):
+    def __init__(self, url=None, base_url=None):
         self.url = url
+        self.base_url = base_url

     def get_source(self):
@@ -50,7 +58,7 @@ class WebPage(object):
         '''
         request = urllib.request.Request(self.url, headers=self.headers)
-        page = urllib.request.urlopen(request)
+        page = urllib.request.urlopen(request, timeout=5)
         self.source = page.read()
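With the `timeout=5` added above, a stalled server now raises after five seconds instead of blocking the crawl indefinitely; depending on where the stall happens it surfaces as `socket.timeout` or a `URLError` wrapping it, and the broad `except Exception` around `WebPage.run()` already absorbs either. A standalone sketch of the same pattern, not part of the commit:

import socket
import urllib.error
import urllib.request

def fetch(url, timeout=5):
    # Hypothetical helper mirroring get_source(): return the page body, or
    # None if the request errors out or exceeds the timeout.
    request = urllib.request.Request(url)
    try:
        return urllib.request.urlopen(request, timeout=timeout).read()
    except (socket.timeout, urllib.error.URLError) as e:
        print(e)
        return None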
@@ -62,7 +70,7 @@ class WebPage(object):
         hrefs = set()
         soup = BeautifulSoup(self.source, 'html.parser')
-        links = soup.find_all('a')
+        links = soup.find_all('a', href=True)
         for link in links:
             if link['href'].startswith('/'):
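The `href=True` filter above restricts the match to anchor tags that actually carry an href attribute, so the `link['href']` lookup in the loop can no longer raise `KeyError` on bare anchors such as `<a name="top">`. A quick standalone check (illustrative markup only):

from bs4 import BeautifulSoup

html = '<a href="/about">About</a><a name="top">Top</a>'
soup = BeautifulSoup(html, 'html.parser')
print(len(soup.find_all('a')))             # 2 -- includes the bare named anchor
print(len(soup.find_all('a', href=True)))  # 1 -- only the tag with an href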
@@ -78,11 +86,20 @@ class WebPage(object):
         Iterate through the list of discovered URLs and add them to the
         pool if they start with the base URL.
         '''
         self.urls_to_crawl = set()
         for url in self.discovered_hrefs:
             if url.startswith(self.url):
-                self.urls_to_crawl.add(url)
+                sanitised_url = sanitise_url(url=url)
+                self.urls_to_crawl.add(sanitised_url)
+
+    def list_urls(self):
+        '''
+        Returns the contents of the urls_to_crawl set.
+        '''
+        return self.urls_to_crawl

     def run(self):
@@ -101,24 +118,32 @@ class WebPage(object):
         except Exception as e:
             print(e)
-        return self.urls_to_crawl


-def sanitise_url(url):
+def sanitise_url(url, base_url=False):
     '''
-    Attempt to standardise the base url to ensure it can be prepended to
-    relative URLs. If no scheme provided then we default to http as any
-    sane https-only site should 301 redirect http > https.
-    Returns a corrected base URL as a string.
+    If `base_url` is True, we attempt to standardise `url` to ensure it can be
+    prepended to relative URLs. If no scheme has been provided then we default
+    to http as any sane https-only site should 301 redirect http > https.
+    If `base_url` is False, we sanitise URLs to strip queries and fragments (we
+    don't want to scrape in-page anchors etc).
+    Returns a sanitised URL as a string.
     '''
     default_proto = 'http'
     delim = '://'
     split_url = urlsplit(url)
-    if split_url.scheme and split_url.scheme.startswith('http'):
-        base_url = "".join([split_url.scheme, delim, split_url.netloc])
-    elif (split_url.path and not split_url.scheme and not split_url.netloc):
-        base_url = "".join([default_proto, delim, split_url.path])
-    return base_url
+    if base_url:
+        # This will sanitise the initial url for the initial page crawl.
+        if split_url.scheme and split_url.scheme.startswith('http'):
+            sanitised_url = "".join([split_url.scheme, delim, split_url.netloc])
+        elif (split_url.path and not split_url.scheme and not split_url.netloc):
+            sanitised_url = "".join([default_proto, delim, split_url.path])
+    else:
+        # Sanitise discovered URLs. We already expect them in the format
+        # protocol://base_url/path
+        sanitised_url = "".join([split_url.scheme, delim, split_url.netloc, split_url.path])
+    return sanitised_url
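Worked examples of the two `sanitise_url()` branches above, using the values `urlsplit()` actually produces. The URLs are illustrative and the import path is taken from the crawler's own import line:

from utils.helpers import sanitise_url

# base_url=True with a scheme present: keep scheme + netloc, drop the path.
print(sanitise_url('https://example.com/about', base_url=True))   # https://example.com

# base_url=True with a bare host: urlsplit() parses 'example.com' as a path,
# so the elif branch prepends the default http scheme.
print(sanitise_url('example.com', base_url=True))                 # http://example.com

# base_url=False (a discovered URL): the query and fragment are dropped because
# only scheme, netloc and path are joined back together.
print(sanitise_url('https://example.com/page?q=1#top'))           # https://example.com/page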