diff --git a/crawler.py b/crawler.py
index abe3d93..782b7bc 100644
--- a/crawler.py
+++ b/crawler.py
@@ -5,35 +5,70 @@ Need a docstring.
 import argparse
 from utils.helpers import (UrlPool, WebPage, sanitise_url)
+from pprint import pprint
 
 
 def init_crawler(base_url=None):
     '''
     needs a docstring
     '''
     uncrawled_urls, crawled_urls = UrlPool(), UrlPool()
-    initial_page = WebPage(base_url)
+    initial_page = WebPage(url=base_url, base_url=base_url)
     try:
-        initial_urls = initial_page.run()
+        initial_page.run()
     except Exception as e:
         print(e)
+    initial_urls = initial_page.list_urls()
+    # Ensure the base URL isn't crawled again.
+    try:
+        initial_urls.remove(base_url)
+    except KeyError:
+        pass
+    # Add the base URL to the crawled pool.
+    crawled_urls.add_to_pool(base_url)
+    for url in initial_urls:
+        sanitised_url = sanitise_url(url=url)
+        if sanitised_url not in crawled_urls.pool:
+            uncrawled_urls.add_to_pool(sanitised_url)
+
+    return uncrawled_urls, crawled_urls
+
+
+def process_pool(base_url=None, uncrawled_urls=None, crawled_urls=None):
+    '''
+    Crawl every URL in the uncrawled pool, queueing newly discovered URLs as
+    we go, until the pool is empty.
+    '''
+    while uncrawled_urls.pool:
+        # Pop a URL from the pool.
+        new_url = uncrawled_urls.remove_from_pool()
+        # Create a WebPage object for the URL.
+        current_page = WebPage(url=new_url, base_url=base_url)
         try:
-            uncrawled_urls.add_to_pool(url)
+            current_page.run()
+            _urls = current_page.list_urls()
+            crawled_urls.add_to_pool(new_url)
+            for url in _urls:
+                sanitised_url = sanitise_url(url=url)
+                if sanitised_url not in crawled_urls.pool:
+                    uncrawled_urls.add_to_pool(sanitised_url)
         except Exception as e:
             print(e)
-    print(uncrawled_urls.url_pool)
 
 
 def run(args=None):
     '''
     needs a docstring.
     '''
-    base_url = sanitise_url(args.url)
+    base_url = sanitise_url(args.url, base_url=True)
 
-    init_crawler(base_url)
+    uncrawled_urls, crawled_urls = init_crawler(base_url)
+    process_pool(base_url, uncrawled_urls, crawled_urls)
+
+    pprint(crawled_urls.pool)
+    print('{0} URLs crawled'.format(len(crawled_urls.pool)))
 
 
 if __name__ == '__main__':
diff --git a/utils/helpers.py b/utils/helpers.py
index 75ec2e7..f85b1af 100644
--- a/utils/helpers.py
+++ b/utils/helpers.py
@@ -14,22 +14,29 @@ class UrlPool(object):
     '''
 
     def __init__(self):
-        self.url_pool = set()
+        self.pool = set()
 
 
     def check_duplicate(self, new_url):
         '''
         Checks if a URL exists in the current pool.
         '''
-        if new_url in self.url_pool:
+        if new_url in self.pool:
            return True
         else:
            return False
 
 
-    def remove_from_pool(self, url):
-        self.url_pool.remove(url)
+    def remove_from_pool(self):
+        '''
+        Remove a URL from the pool and return it to be crawled.
+        '''
+        return self.pool.pop()
 
 
     def add_to_pool(self, url):
-        self.url_pool.add(url)
+        self.pool.add(url)
+
+
+    def list_pool(self):
+        # Return the full pool of URLs.
+        return self.pool
 
 
 class WebPage(object):
@@ -40,8 +47,9 @@ class WebPage(object):
     headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
 
-    def __init__(self, url):
+    def __init__(self, url=None, base_url=None):
         self.url = url
+        self.base_url = base_url
 
 
     def get_source(self):
         '''
@@ -50,7 +58,7 @@
         '''
         request = urllib.request.Request(self.url, headers=self.headers)
 
-        page = urllib.request.urlopen(request)
+        page = urllib.request.urlopen(request, timeout=5)
 
         self.source = page.read()
 
@@ -62,7 +70,7 @@ class WebPage(object):
         hrefs = set()
         soup = BeautifulSoup(self.source, 'html.parser')
-        links = soup.find_all('a')
+        links = soup.find_all('a', href=True)
 
         for link in links:
             if link['href'].startswith('/'):
@@ -78,11 +86,20 @@
         Iterate through the list of discovered URLs and add them to the pool
         if they start with the base URL.
         '''
         self.urls_to_crawl = set()
         for url in self.discovered_hrefs:
             if url.startswith(self.url):
-                self.urls_to_crawl.add(url)
+                sanitised_url = sanitise_url(url=url)
+                self.urls_to_crawl.add(sanitised_url)
+
+
+    def list_urls(self):
+        '''
+        Returns the contents of the urls_to_crawl set.
+        '''
+
+        return self.urls_to_crawl
 
 
     def run(self):
@@ -101,24 +118,32 @@ class WebPage(object):
         except Exception as e:
             print(e)
 
-        return self.urls_to_crawl
-
 
-def sanitise_url(url):
+def sanitise_url(url, base_url=False):
     '''
-    Attempt to standardise the base url to ensure it can be prepended to
-    relative URLs. If no scheme provided then we default to http as any
-    sane https-only site should 301 redirect http > https.
+    If `base_url` is True, we attempt to standardise `url` to ensure it can be
+    prepended to relative URLs. If no scheme has been provided then we default
+    to http as any sane https-only site should 301 redirect http > https.
 
-    Returns a corrected base URL as a string.
+    If `base_url` is False, we sanitise URLs to strip queries and fragments (we
+    don't want to scrape in-page anchors etc).
+
+    Returns a sanitised URL as a string.
     '''
     default_proto = 'http'
     delim = '://'
     split_url = urlsplit(url)
 
-    if split_url.scheme and split_url.scheme.startswith('http'):
-        base_url = "".join([split_url.scheme, delim, split_url.netloc])
-    elif (split_url.path and not split_url.scheme and not split_url.netloc):
-        base_url = "".join([default_proto, delim, split_url.path])
-    return base_url
+    if base_url:
+        # Sanitise the user-supplied base URL for the initial page crawl.
+        if split_url.scheme and split_url.scheme.startswith('http'):
+            sanitised_url = "".join([split_url.scheme, delim, split_url.netloc])
+        elif (split_url.path and not split_url.scheme and not split_url.netloc):
+            sanitised_url = "".join([default_proto, delim, split_url.path])
+    else:
+        # Sanitise discovered URLs. We already expect them in the format
+        # protocol://base_url/path
+        sanitised_url = "".join([split_url.scheme, delim, split_url.netloc, split_url.path])
+
+    return sanitised_url
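
A minimal sketch of how the two `sanitise_url()` modes introduced above are expected to behave, assuming the package layout shown in the diff (so `utils.helpers` is importable and its dependencies are installed); the example.com URLs are illustrative only:

    # Illustrative only - relies on the utils/helpers.py shown in the diff above.
    from utils.helpers import sanitise_url

    # base_url=True: normalise the user-supplied start URL, falling back to
    # http when no scheme is given.
    print(sanitise_url('example.com', base_url=True))
    # -> http://example.com
    print(sanitise_url('https://example.com', base_url=True))
    # -> https://example.com

    # base_url=False (the default): strip the query string and fragment from
    # discovered links so in-page anchors aren't queued as separate URLs.
    print(sanitise_url('https://example.com/about?ref=nav#team'))
    # -> https://example.com/about

The crawl itself is presumably started through the argparse entry point in crawler.py (not shown in this diff), which passes the starting URL to run() as args.url.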