From 7bc9fe0679bb373f2dddd677f7dfebc9860b0c66 Mon Sep 17 00:00:00 2001
From: Simon Weald
Date: Sun, 16 Sep 2018 08:56:44 +0100
Subject: [PATCH] improve documentation and remove unneeded set

---
 utils/helpers.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/utils/helpers.py b/utils/helpers.py
index f23ce6e..e17a927 100644
--- a/utils/helpers.py
+++ b/utils/helpers.py
@@ -18,7 +18,6 @@ class AsyncCrawler(object):
     def __init__(self, baseurl=None, robots=None, concurrency=None):
         self.baseurl = baseurl
         self.robots = robots
-        self.uncrawled = set()
         self.crawled = set()
         self.headers = {'Accept-Encoding': 'gzip, deflate',
                         'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
@@ -28,7 +27,7 @@ class AsyncCrawler(object):
 
     async def crawl_url(self, url=None):
         '''
-        docstring
+        Crawls the given URL and finds all new URLs in the initial page.
         '''
         urls = []
         source = await self.get_source(url)
@@ -54,7 +53,8 @@ class AsyncCrawler(object):
 
     async def get_source(self, url=None):
         '''
-        Obtains the URL's source, provided it is HTML. Usage of semaphores
+        Obtains the URL's source, provided it is HTML. Usage of semaphores
+        ensures only a certain number of coroutines can run at once.
         '''
         async with self.semaphore:
            async with self.client_session.head(url, timeout=5) as head:
@@ -66,7 +66,6 @@ class AsyncCrawler(object):
             async with self.client_session.get(url, timeout=5) as resp:
                 try:
                     source = await resp.read()
-                    print('crawled {0}'.format(url))
                     return source
                 except Exception:
                     return None
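
Note (not part of the patch): below is a minimal, self-contained sketch of the semaphore pattern the new get_source docstring describes, assuming aiohttp and an asyncio.Semaphore sized to the crawler's concurrency argument. The fetch/crawl_all helper names are illustrative only and do not appear in utils/helpers.py.

import asyncio

import aiohttp


async def fetch(session, semaphore, url):
    # At most `concurrency` coroutines can hold the semaphore at once,
    # so no more than that many requests are ever in flight.
    async with semaphore:
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=5)) as resp:
            return await resp.read()


async def crawl_all(urls, concurrency=5):
    semaphore = asyncio.Semaphore(concurrency)
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(
            *(fetch(session, semaphore, url) for url in urls))


if __name__ == '__main__':
    asyncio.run(crawl_all(['https://example.com/']))

Every coroutine is created up front, but the `async with semaphore` line blocks all but a fixed number of them from reaching the network, which is the throttling behaviour the docstring addition documents.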