From 91cd988f52ac0b86177ec81edc84a320eec36a7d Mon Sep 17 00:00:00 2001
From: Simon Weald
Date: Sun, 16 Sep 2018 15:26:49 +0100
Subject: [PATCH] more comments and progress output

---
 async_crawler.py | 7 +++++--
 utils/helpers.py | 3 +++
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/async_crawler.py b/async_crawler.py
index 975a1d7..ff66adf 100644
--- a/async_crawler.py
+++ b/async_crawler.py
@@ -51,7 +51,8 @@ def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
 
 def main():
     '''
-    docstring
+    Main function, responsible for prepping and running the crawler and
+    rendering the sitemap.
     '''
 
     starttime = datetime.now()
@@ -60,11 +61,13 @@ def main():
     # create a crawler
     async_crawler = AsyncCrawler(baseurl=baseurl, robots=robots, concurrency=args.concurrency)
 
+    # create a task to run the crawler, run the loop and then gather the results.
     task = asyncio.Task(async_crawler.main())
     loop = asyncio.get_event_loop()
     loop.run_until_complete(task)
     loop.close()
-    results = task.result()
+    results = sorted(task.result())
+
     runtime = int((datetime.now() - starttime).total_seconds())
 
     render_sitemap(base_url=baseurl, crawled_urls=results, runtime=runtime)
diff --git a/utils/helpers.py b/utils/helpers.py
index 505c6f4..f18d78a 100644
--- a/utils/helpers.py
+++ b/utils/helpers.py
@@ -119,6 +119,7 @@ class AsyncCrawler(object):
         # add the URLs to a set to be returned.
         if urls:
             for url in urls:
+                print('Found: {0}'.format(url))
                 all_urls.add(url)
 
         return all_urls
@@ -133,6 +134,7 @@ class AsyncCrawler(object):
         to_crawl = []
         to_crawl.append(self.baseurl)
+        print('Crawling: {0}'.format(self.baseurl))
 
         while len(to_crawl) > 0:
             discovered_urls = await self.run(urls=to_crawl)
             # empty the crawl list and then add all newly discovered URLs for
@@ -169,6 +171,7 @@ class RobotsTxt(object):
         try:
             response = urllib.request.urlopen(request, timeout=5)
         except urllib.error.HTTPError:
+            # if robots.txt doesn't exist then allow all URLs to be crawled.
             robots.allow_all = True
         else:
             data = response.read()
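
For context, the task/loop sequence edited in the async_crawler.py hunk follows the
pre-asyncio.run() idiom: wrap the coroutine in a Task, drive it with
run_until_complete(), then read the Task's result. Below is a minimal,
self-contained sketch of that pattern; fake_crawl() is a hypothetical stand-in
for AsyncCrawler.main(), not the crawler's real API.

import asyncio

async def fake_crawl():
    # hypothetical stand-in for AsyncCrawler.main(); it "discovers" a
    # fixed set of URLs instead of fetching anything over the network.
    await asyncio.sleep(0)
    return {'https://example.com/b', 'https://example.com/a'}

task = asyncio.Task(fake_crawl())    # schedule the coroutine as a Task
loop = asyncio.get_event_loop()      # the loop the Task is bound to
loop.run_until_complete(task)        # block until the crawl finishes
loop.close()
results = sorted(task.result())      # sort for a deterministic sitemap
print(results)

On Python 3.7+ the same sequence collapses to
results = sorted(asyncio.run(fake_crawl())).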
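
The new 'Crawling:' progress line sits at the top of the frontier loop in
AsyncCrawler.crawl(): crawl everything currently queued, then refill the queue
with only the newly discovered URLs so the loop ends once a pass finds nothing
new. A hypothetical synchronous sketch of that shape, with discover standing in
for `await self.run(urls=to_crawl)`:

def crawl_iteratively(baseurl, discover):
    # hypothetical sketch of the frontier loop in AsyncCrawler.crawl()
    crawled = set()
    to_crawl = [baseurl]
    while len(to_crawl) > 0:
        print('Crawling: {0}'.format(to_crawl))
        discovered_urls = discover(to_crawl)
        crawled.update(to_crawl)
        # empty the crawl list, then queue only URLs not yet crawled so
        # the loop terminates once a pass finds nothing new.
        to_crawl = [url for url in discovered_urls if url not in crawled]
    return crawled

links = {
    'https://example.com': ['https://example.com/a'],
    'https://example.com/a': ['https://example.com'],
}
pages = crawl_iteratively('https://example.com',
                          lambda urls: [l for u in urls for l in links.get(u, [])])
print(sorted(pages))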
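
Likewise, the comment added in the RobotsTxt hunk documents a fallback: a
robots.txt fetch that raises HTTPError (typically a 404) is treated as "no
rules", so every URL may be crawled. A rough sketch of that behaviour, assuming
a minimal holder object (the Robots class here is hypothetical; the repo's real
RobotsTxt object carries more state):

import urllib.error
import urllib.request

class Robots(object):
    # hypothetical stand-in for the repo's robots result object; only the
    # allow_all flag from the patched hunk is modelled.
    def __init__(self):
        self.allow_all = False

def fetch_robots(url):
    robots = Robots()
    request = urllib.request.Request(url)
    try:
        response = urllib.request.urlopen(request, timeout=5)
    except urllib.error.HTTPError:
        # no robots.txt (e.g. a 404): every URL may be crawled.
        robots.allow_all = True
    else:
        data = response.read()
        # parsing of `data` into allow/deny rules would happen here.
    return robots

# usage: fetch_robots('https://example.com/robots.txt').allow_all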