From 91cd988f52ac0b86177ec81edc84a320eec36a7d Mon Sep 17 00:00:00 2001
From: Simon Weald
Date: Sun, 16 Sep 2018 15:26:49 +0100
Subject: [PATCH] more comments and progress output

---
 async_crawler.py | 7 +++++--
 utils/helpers.py | 3 +++
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/async_crawler.py b/async_crawler.py
index 975a1d7..ff66adf 100644
--- a/async_crawler.py
+++ b/async_crawler.py
@@ -51,7 +51,8 @@ def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
 
 def main():
     '''
-    docstring
+    Main function, responsible for prepping and running the crawler and
+    rendering the sitemap.
     '''
 
     starttime = datetime.now()
@@ -60,11 +61,13 @@ def main():
     # create a crawler
     async_crawler = AsyncCrawler(baseurl=baseurl, robots=robots, concurrency=args.concurrency)
 
+    # create a task to run the crawler, run the loop and then gather the results.
     task = asyncio.Task(async_crawler.main())
     loop = asyncio.get_event_loop()
     loop.run_until_complete(task)
     loop.close()
-    results = task.result()
+    results = sorted(task.result())
+
     runtime = int((datetime.now() - starttime).total_seconds())
 
     render_sitemap(base_url=baseurl, crawled_urls=results, runtime=runtime)
diff --git a/utils/helpers.py b/utils/helpers.py
index 505c6f4..f18d78a 100644
--- a/utils/helpers.py
+++ b/utils/helpers.py
@@ -119,6 +119,7 @@ class AsyncCrawler(object):
         # add the URLs to a set to be returned.
         if urls:
             for url in urls:
+                print('Found: {0}'.format(url))
                 all_urls.add(url)
 
         return all_urls
@@ -133,6 +134,7 @@ class AsyncCrawler(object):
         to_crawl = []
         to_crawl.append(self.baseurl)
+        print('Crawling: {0}'.format(self.baseurl))
 
         while len(to_crawl) > 0:
             discovered_urls = await self.run(urls=to_crawl)
             # empty the crawl list and then add all newly discovered URLs for
@@ -169,6 +171,7 @@ class RobotsTxt(object):
         try:
             response = urllib.request.urlopen(request, timeout=5)
         except urllib.error.HTTPError:
+            # if robots.txt doesn't exist then allow all URLs to be crawled.
             robots.allow_all = True
         else:
             data = response.read()
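
For context, the task/loop sequence edited in the async_crawler.py hunk follows the
pre-asyncio.run() idiom: wrap the coroutine in a Task, drive it with
run_until_complete(), then read the Task's result. Below is a minimal,
self-contained sketch of that pattern; fake_crawl() is a hypothetical stand-in
for AsyncCrawler.main(), not the crawler's real API.

import asyncio

async def fake_crawl():
    # hypothetical stand-in for AsyncCrawler.main(); it "discovers" a
    # fixed set of URLs instead of fetching anything over the network.
    await asyncio.sleep(0)
    return {'https://example.com/b', 'https://example.com/a'}

task = asyncio.Task(fake_crawl())    # schedule the coroutine as a Task
loop = asyncio.get_event_loop()      # the loop the Task is bound to
loop.run_until_complete(task)        # block until the crawl finishes
loop.close()
results = sorted(task.result())      # sort for a deterministic sitemap
print(results)

On Python 3.7+ the same sequence collapses to
results = sorted(asyncio.run(fake_crawl())).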
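
The new 'Crawling:' progress line sits at the top of the frontier loop in
AsyncCrawler.crawl(): crawl everything currently queued, then refill the queue
with only the newly discovered URLs so the loop ends once a pass finds nothing
new. A hypothetical synchronous sketch of that shape, with discover standing in
for `await self.run(urls=to_crawl)`:

def crawl_iteratively(baseurl, discover):
    # hypothetical sketch of the frontier loop in AsyncCrawler.crawl()
    crawled = set()
    to_crawl = [baseurl]
    while len(to_crawl) > 0:
        print('Crawling: {0}'.format(to_crawl))
        discovered_urls = discover(to_crawl)
        crawled.update(to_crawl)
        # empty the crawl list, then queue only URLs not yet crawled so
        # the loop terminates once a pass finds nothing new.
        to_crawl = [url for url in discovered_urls if url not in crawled]
    return crawled

links = {
    'https://example.com': ['https://example.com/a'],
    'https://example.com/a': ['https://example.com'],
}
pages = crawl_iteratively('https://example.com',
                          lambda urls: [l for u in urls for l in links.get(u, [])])
print(sorted(pages))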
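
Likewise, the comment added in the RobotsTxt hunk documents a fallback: a
robots.txt fetch that raises HTTPError (typically a 404) is treated as "no
rules", so every URL may be crawled. A rough sketch of that behaviour, assuming
a minimal holder object (the Robots class here is hypothetical; the repo's real
RobotsTxt object carries more state):

import urllib.error
import urllib.request

class Robots(object):
    # hypothetical stand-in for the repo's robots result object; only the
    # allow_all flag from the patched hunk is modelled.
    def __init__(self):
        self.allow_all = False

def fetch_robots(url):
    robots = Robots()
    request = urllib.request.Request(url)
    try:
        response = urllib.request.urlopen(request, timeout=5)
    except urllib.error.HTTPError:
        # no robots.txt (e.g. a 404): every URL may be crawled.
        robots.allow_all = True
    else:
        data = response.read()
        # parsing of `data` into allow/deny rules would happen here.
    return robots

# usage: fetch_robots('https://example.com/robots.txt').allow_all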