From d6964672b6e7fc672dac12a334728999426262d9 Mon Sep 17 00:00:00 2001
From: Simon Weald
Date: Sat, 15 Sep 2018 21:30:02 +0100
Subject: [PATCH] commit of working async crawler

---
 async_crawler.py | 28 +++++++--------
 utils/helpers.py | 90 ++++++++++++++++++++++++++++++------------------
 2 files changed, 70 insertions(+), 48 deletions(-)

diff --git a/async_crawler.py b/async_crawler.py
index dc4d4a4..4f3c00b 100644
--- a/async_crawler.py
+++ b/async_crawler.py
@@ -13,12 +13,12 @@ from datetime import datetime
 from utils.helpers import RobotsTxt, AsyncCrawler, standardise_url
 
 
-def init_crawler(url=None):
+def sanity_checks(url=None):
     '''
-    docstring
+    Runs some basic sanity checks before the crawler is initialised.
     '''
     # ensure we have a sensible URL to work with
-    baseurl = standardise_url(url=url, base_url=url)
+    baseurl = standardise_url(url=url)
 
     # get robots.txt
     robots = RobotsTxt(base_url=baseurl)
@@ -31,16 +31,16 @@ def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
     '''
-    Renders the sitemap as an HTML file.
+    Renders the sitemap to an HTML file.
     '''
-    # urlcount = len(crawled_urls)
-    # sorted_urls = sorted(crawled_urls)
+    urlcount = len(crawled_urls)
+    sorted_urls = sorted(crawled_urls)
 
-    tmpl = jinja2.Environment(
+    template = jinja2.Environment(
         loader=jinja2.FileSystemLoader('templates')
     ).get_template('sitemap.html.j2')
 
-    rendered_html = tmpl.render(
+    rendered_html = template.render(
         base_url=base_url, urlcount=urlcount, urls=sorted_urls, runtime=runtime)
 
     with open('sitemap.html', 'w') as outfile:
@@ -55,21 +55,19 @@ def main():
     '''
     starttime = datetime.now()
 
-    baseurl, robots = init_crawler(url=args.url)
+    baseurl, robots = sanity_checks(url=args.url)
 
     # create a crawler
     async_crawler = AsyncCrawler(baseurl=baseurl, robots=robots, concurrency=args.concurrency)
 
-    # run the crawler
-    task = asyncio.Task(async_crawler.run_loop())
+    task = asyncio.Task(async_crawler.main())
     loop = asyncio.get_event_loop()
     loop.run_until_complete(task)
     loop.close()
 
     results = task.result()
-    print(results)
-    print(len(results))
     runtime = int((datetime.now() - starttime).total_seconds())
-    print(runtime)
+
+    render_sitemap(base_url=baseurl, crawled_urls=results, runtime=runtime)
 
 if __name__ == '__main__':
@@ -77,7 +75,7 @@
     parser = argparse.ArgumentParser(description='Recursive web crawler')
     parser.add_argument("-u", "--url", required=True, help="Base url to crawl")
     parser.add_argument("-c", "--concurrency", required=False, type=int,
-                        default=50, help="Max number of pages to crawl concurrently")
+                        default=100, help="Max number of pages to crawl concurrently")
 
     args = parser.parse_args()
     main()
diff --git a/utils/helpers.py b/utils/helpers.py
index 80d5130..ffd2cb4 100644
--- a/utils/helpers.py
+++ b/utils/helpers.py
@@ -3,10 +3,10 @@ Utilities to provide various misc functions.
 '''
-import urllib.request
-import urllib.error
-import gzip
-
+# import urllib.request
+# import urllib.error
+# import gzip
+# from time import sleep
 import aiohttp
@@ -26,9 +26,8 @@ class AsyncCrawler(object):
         self.robots = robots
         self.uncrawled = set()
         self.crawled = set()
-        # self.headers = {'Accept-Encoding': 'gzip, deflate',
-        #                 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
-        self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
+        self.headers = {'Accept-Encoding': 'gzip, deflate',
+                        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
         self.client_session = None
         self.semaphore = asyncio.BoundedSemaphore(concurrency)
@@ -37,13 +36,10 @@ class AsyncCrawler(object):
         '''
         docstring
         '''
-        urls = set()
+        urls = []
         source = await self.get_source(url)
 
         if source:
-            # add the URL we've just crawled
-            self.crawled.add(url)
-            for new_url in self.find_all_urls(source):
-                urls.add(new_url)
+            urls = self.find_all_urls(source)
 
         return urls
@@ -66,21 +62,29 @@ class AsyncCrawler(object):
         '''
         Obtains the page's source.
         '''
-        print('semaphore held for {0}'.format(url))
         async with self.semaphore:
-            async with self.client_session.get(url, timeout=5) as resp:
+            async with self.client_session.head(url, timeout=5) as head:
                 try:
-                    source = await resp.read()
-                    return source
-                except Exception:
-                    return None
+                    data = await head.read()
+                except Exception as e:
+                    print(e)
+                if 'text/html' in head.headers['Content-Type']:
+                    async with self.client_session.get(url, timeout=5) as resp:
+                        try:
+                            source = await resp.read()
+                            print('crawled {0}'.format(url))
+                            return source
+                        except Exception:
+                            return None
+                else:
+                    print('{0} - {1}'.format(head.headers['Content-Type'], url))
 
     def find_all_urls(self, source=None):
         '''
         Find all URLs in a page's source.
         '''
-        urls = set()
+        urls = []
         html = BeautifulSoup(source, 'lxml')
         hrefs = html.find_all('a', href=True)
@@ -89,27 +93,45 @@ class AsyncCrawler(object):
         for href in hrefs:
             url = self.validate_url(url=href['href'])
             if url and url not in self.crawled:
-                urls.add(url)
+                urls.append(url)
 
         return urls
 
-    async def run_loop(self):
-        '''
-        function which runs the crawler
-        '''
-        # print('Crawling: {}'.format(self.baseurl))
-        self.client_session = aiohttp.ClientSession(headers=self.headers)
-        # provide the starting URL to the crawler
-        self.uncrawled.add(self.baseurl)
+    async def run(self, urls=None):
+        tasks = []
+        all_urls = set()
+        for url in urls:
+            self.crawled.add(url)
+            tasks.append(self.crawl_url(url))
 
-        while len(self.uncrawled) > 0:
-            url = self.uncrawled.pop()
-            new_urls = await self.crawl_url(url=url)
-            for url in new_urls:
-                self.uncrawled.add(url)
+        for task in asyncio.as_completed(tasks):
+            urls = None
+            try:
+                # completed.append((await task))
+                urls = await task
+            except Exception as e:
+                print(e)
+
+            if urls:
+                for url in urls:
+                    all_urls.add(url)
+
+        return all_urls
+
+
+    async def main(self):
+        self.client_session = aiohttp.ClientSession(headers=self.headers)
+        to_crawl = []
+        to_crawl.append(self.baseurl)
+
+        while len(to_crawl) > 0:
+            discovered_urls = await self.run(urls=to_crawl)
+            to_crawl.clear()
+            to_crawl.extend(discovered_urls)
 
         await self.client_session.close()
+        return self.crawled
@@ -164,6 +186,7 @@ def standardise_url(url=None, base_url=None):
     '''
     default_proto = 'http'
     delim = '://'
+    file_extensions = ('/', 'htm', 'html', 'xhtml', 'php', 'asp', 'aspx', 'cfm')
 
     split_url = urlsplit(url)
@@ -174,6 +197,7 @@
     elif (split_url.path and not split_url.scheme and not split_url.netloc):
         return "".join([default_proto, delim, split_url.path])
     else:
+        # if url.endswith(file_extensions):
         # Sanitise discovered URLs. We already expect them in the format
         # protocol://base_url/path
         if url.startswith('/'):
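
Reviewer note, not part of the patch above: the crawl loop's core pattern is an asyncio.BoundedSemaphore capping in-flight requests while asyncio.as_completed drains a batch of fetch tasks, as in AsyncCrawler.run() and get_source() in the diff. The sketch below is a minimal, self-contained illustration of that pattern only; fetch() and demo() are hypothetical names and do not exist in this repository.

    import asyncio

    import aiohttp


    async def fetch(session, semaphore, url):
        # hypothetical helper: the semaphore caps how many requests are in flight at once
        async with semaphore:
            async with session.get(url, timeout=aiohttp.ClientTimeout(total=5)) as resp:
                return url, resp.status


    async def demo(urls, concurrency=10):
        semaphore = asyncio.BoundedSemaphore(concurrency)
        async with aiohttp.ClientSession() as session:
            tasks = [fetch(session, semaphore, u) for u in urls]
            # handle each response as soon as it finishes, as AsyncCrawler.run() does
            for task in asyncio.as_completed(tasks):
                try:
                    url, status = await task
                    print(url, status)
                except Exception as e:
                    print(e)


    if __name__ == '__main__':
        asyncio.run(demo(['https://example.com/']))

With the patch applied, the crawler itself would be invoked along the lines of
python async_crawler.py -u https://example.com -c 100 (flags per the argparse
definition above), and the crawled URLs are rendered to sitemap.html.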