diff --git a/async_crawler.py b/async_crawler.py
index 5b48b3e..dc4d4a4 100644
--- a/async_crawler.py
+++ b/async_crawler.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 '''
 Need a docstring.
 
@@ -6,10 +7,10 @@ Need a docstring.
 import argparse
 import jinja2
 import os
+import sys
 import asyncio
 from datetime import datetime
-# from utils.helpers import (UrlPool, WebPage, RobotsTxt, sanitise_url)
-from utils.helpers import RobotsTxt, AsyncCrawler, sanitise_url
+from utils.helpers import RobotsTxt, AsyncCrawler, standardise_url
 
 
 def init_crawler(url=None):
@@ -17,10 +18,14 @@ def init_crawler(url=None):
     docstring
     '''
     # ensure we have a sensible URL to work with
-    baseurl = sanitise_url(url=url, base_url=True)
+    baseurl = standardise_url(url=url)
 
     # get robots.txt
     robots = RobotsTxt(base_url=baseurl)
 
+    # fail early if robots denies all crawling
+    if not robots.check(url=baseurl):
+        sys.exit("{baseurl} cannot be crawled (denied by robots.txt)".format(baseurl=baseurl))
+
     return(baseurl, robots)
 
@@ -44,7 +49,7 @@ def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
     print('Sitemap available at {0}/sitemap.html'.format(os.getcwd()))
 
 
-def main(args=None):
+def main():
     '''
     docstring
     '''
@@ -54,21 +59,25 @@ def main(args=None):
     # create a crawler
     async_crawler = AsyncCrawler(baseurl=baseurl, robots=robots, concurrency=args.concurrency)
-    # async_crawler.run()
+
     # run the crawler
-    crawler = asyncio.Task(async_crawler.run())
+    task = asyncio.Task(async_crawler.run_loop())
     loop = asyncio.get_event_loop()
-    loop.run_until_complete(crawler)
+    loop.run_until_complete(task)
     loop.close()
-    result = crawler.result()
-    print(len(result))
+    results = task.result()
+    print(results)
+    print(len(results))
+    runtime = int((datetime.now() - starttime).total_seconds())
+    print(runtime)
 
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Recursive web crawler')
     parser.add_argument("-u", "--url", required=True, help="Base url to crawl")
-    parser.add_argument("-s", "--concurrency", required=False, type=int, default=50, help="Max number of pages to crawl concurrently")
+    parser.add_argument("-c", "--concurrency", required=False, type=int,
+                        default=50, help="Max number of pages to crawl concurrently")
     args = parser.parse_args()
-    main(args)
+    main()
diff --git a/utils/helpers.py b/utils/helpers.py
index 05cc85c..f0791a4 100644
--- a/utils/helpers.py
+++ b/utils/helpers.py
@@ -26,65 +26,113 @@ class AsyncCrawler(object):
         self.robots = robots
         self.uncrawled = set()
         self.crawled = set()
-        self.session = aiohttp.ClientSession()
+        # self.headers = {'Accept-Encoding': 'gzip, deflate',
+        #                 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
+        self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
+        self.client_session = None
         self.semaphore = asyncio.BoundedSemaphore(concurrency)
-        # add the base URL to be crawled
-        self.uncrawled.add(baseurl)
-        self.headers = {'Accept-Encoding': 'gzip, deflate',
-                        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
-
-    def validate_url(self, url):
+    async def crawl_url(self, url=None):
         '''
-        Checks if the discovered URL is local to the base URL.
+        docstring
+        '''
+        urls = set()
+        async with self.semaphore:
+            source = await self.get_source(url)
+            if source:
+                self.crawled.add(url)
+                # for new_url in self.find_all_urls(source):
+                #     urls.add(new_url)
+                urls_to_crawl = self.find_all_urls(source)
+                # print('discovered {0} new URLs'.format(len(urls_to_crawl)))
+                for new_url in urls_to_crawl:
+                    urls.add(new_url)
+        # add the url we just crawled to the crawled pool.
+
+
+        return urls
+
+
+    def validate_url(self, url=None):
+        '''
+        Ensures we have a valid URL to crawl and that the site's robots.txt
+        allows it.
         '''
         # ensure the URL is in a sane format
-        url = sanitise_url(url=url)
+        url = standardise_url(url=url, base_url=self.baseurl)
 
-        if url.startswith(self.baseurl) and robots.check(url=url):
+        if url and self.robots.check(url=url):
+            # print('validated url: {0}'.format(url))
             return url
         else:
             return False
 
-    def get_source(self, url):
+    async def get_source(self, url=None):
         '''
         Obtains the page's source.
         '''
-        pass
-
-        return source
+        print('semaphore held for {0}'.format(url))
+        async with self.client_session.get(url, timeout=5) as resp:
+            try:
+                source = await resp.read()
+                return source
+            except Exception:
+                return None
 
-    def find_links(self, source):
+    def find_all_urls(self, source=None):
         '''
-        Find all links in a page's source.
+        Find all URLs in a page's source.
         '''
-        links = set()
+        urls = set()
         html = BeautifulSoup(source, 'lxml')
         hrefs = html.find_all('a', href=True)
 
+        # build a set of URLs which are valid and haven't been crawled yet
        for href in hrefs:
-            url = self.validate_url(url=href)
-            if url:
-                links.add(url)
+            url = self.validate_url(url=href['href'])
+            if url and url not in self.crawled:
+                urls.add(url)
 
-        return links
+        return urls
 
-    def run(self):
+    async def run_loop(self):
         '''
         function which runs the crawler
         '''
-        pass
+        print('Crawling: {}'.format(self.baseurl))
+        self.client_session = aiohttp.ClientSession(headers=self.headers)
+        # provide the starting URL to the crawler
+        self.uncrawled.add(self.baseurl)
+
+        while len(self.uncrawled) > 0:
+            # print('################################ there are {0} uncrawled urls in the pool'.format(
+            #     len(self.uncrawled)))
+            url = self.uncrawled.pop()
+            # print('################ url popped, there are now {0} uncrawled urls in the pool'.format(
+            #     len(self.uncrawled)))
+            new_urls = await self.crawl_url(url=url)
+            for url in new_urls:
+                # print('adding: {0}'.format(url))
+                self.uncrawled.add(url)
+
+        await self.client_session.close()
+        return self.crawled
+
+
+
+
+
+
+
+
+
+
 
-        for url in self.uncrawled:
-            validated = validate_url(url=url)
-            if validated:
-                source = get_source(url=url)
-                links = find_links(source=source)
@@ -183,8 +231,8 @@ class WebPage(object):
         '''
         for url in self.discovered_hrefs:
             if url.startswith(self.base_url) and self.robots.check(url):
-                sanitised_url = sanitise_url(url=url)
-                self.urls_to_crawl.add(sanitised_url)
+                standardised_url = standardise_url(url=url, base_url=self.base_url)
+                self.urls_to_crawl.add(standardised_url)
 
 
     def list_urls(self):
@@ -251,31 +299,35 @@ class RobotsTxt(object):
         return self.robots.can_fetch("*", url)
 
 
-def sanitise_url(url, base_url=False):
+def standardise_url(url=None, base_url=None):
     '''
-    If `base_url` is True, we attempt to standardise `url` to ensure it can be
-    prepended to relative URLs. If no scheme has been provided then we default
+    If `base_url` is None then we attempt to standardise the URL to ensure it can be
+    prepended to relative URLs. If no scheme has been provided then we default
     to http as any sane https-only site should 301 redirect http > https.
 
-    If `base_url` is False, we sanitise URLs to strip queries and fragments (we
-    don't want to scrape in-page anchors etc).
+    If `base_url` is set, we standardise URLs to strip queries and fragments (we
+    don't want to scrape in-page anchors etc). Any relative URLs will be appended
+    to the base url.
 
-    Returns a sanitised URL as a string.
+    Returns a standardised URL as a string.
     '''
     default_proto = 'http'
     delim = '://'
     split_url = urlsplit(url)
 
-    if base_url:
-        # This will sanitise the initial url for the initial page crawl.
+    if not base_url:
+        # This will sanitise the initial url provided by the user.
         if split_url.scheme and split_url.scheme.startswith('http'):
-            sanitised_url = "".join([split_url.scheme, delim, split_url.netloc])
+            return "".join([split_url.scheme, delim, split_url.netloc])
         elif (split_url.path and not split_url.scheme and not split_url.netloc):
-            sanitised_url = "".join([default_proto, delim, split_url.path])
+            return "".join([default_proto, delim, split_url.path])
     else:
         # Sanitise discovered URLs. We already expect them in the format
        # protocol://base_url/path
-        sanitised_url = "".join([split_url.scheme, delim, split_url.netloc, split_url.path])
+        if url.startswith('/'):
+            return urljoin(base_url, split_url.path)
+        elif url.startswith(base_url):
+            return "".join([split_url.scheme, delim, split_url.netloc, split_url.path])
 
-    return sanitised_url
+    return None
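
Note: for reference, the concurrency pattern this change adopts (one shared aiohttp.ClientSession, with the number of in-flight requests capped by an asyncio.BoundedSemaphore and driven by loop.run_until_complete) boils down to the minimal, self-contained sketch below. The names fetch, crawl and CONCURRENCY are illustrative only and are not part of this repo.

    import asyncio
    import aiohttp

    CONCURRENCY = 50  # mirrors the crawler's default --concurrency

    async def fetch(session, semaphore, url):
        # the semaphore caps how many requests are in flight at once
        async with semaphore:
            try:
                async with session.get(url, timeout=5) as resp:
                    return await resp.read()
            except Exception:
                return None

    async def crawl(urls):
        # one session shared by every request, one semaphore shared by every task
        semaphore = asyncio.BoundedSemaphore(CONCURRENCY)
        async with aiohttp.ClientSession() as session:
            return await asyncio.gather(*(fetch(session, semaphore, u) for u in urls))

    if __name__ == '__main__':
        loop = asyncio.get_event_loop()
        pages = loop.run_until_complete(crawl(['http://example.com']))
        loop.close()
        print(len(pages))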