From 7bc9fe0679bb373f2dddd677f7dfebc9860b0c66 Mon Sep 17 00:00:00 2001
From: Simon Weald
Date: Sun, 16 Sep 2018 08:56:44 +0100
Subject: [PATCH] improve documentation and remove unneeded set

---
 utils/helpers.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/utils/helpers.py b/utils/helpers.py
index f23ce6e..e17a927 100644
--- a/utils/helpers.py
+++ b/utils/helpers.py
@@ -18,7 +18,6 @@ class AsyncCrawler(object):
     def __init__(self, baseurl=None, robots=None, concurrency=None):
         self.baseurl = baseurl
         self.robots = robots
-        self.uncrawled = set()
         self.crawled = set()
         self.headers = {'Accept-Encoding': 'gzip, deflate',
                         'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
@@ -28,7 +27,7 @@ class AsyncCrawler(object):
 
     async def crawl_url(self, url=None):
         '''
-        docstring
+        Crawls the given URL and finds all new URLs in the initial page.
         '''
         urls = []
         source = await self.get_source(url)
@@ -54,7 +53,8 @@ class AsyncCrawler(object):
 
     async def get_source(self, url=None):
         '''
-        Obtains the URL's source, provided it is HTML. Usage of semaphores
+        Obtains the URL's source, provided it is HTML. Usage of semaphores
+        ensures only a certain number of coroutines can run at once.
         '''
         async with self.semaphore:
            async with self.client_session.head(url, timeout=5) as head:
@@ -66,7 +66,6 @@ class AsyncCrawler(object):
             async with self.client_session.get(url, timeout=5) as resp:
                 try:
                     source = await resp.read()
-                    print('crawled {0}'.format(url))
                     return source
                 except Exception:
                     return None
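
Note (not part of the patch): below is a minimal, self-contained sketch of the semaphore pattern the new get_source docstring describes, assuming aiohttp and an asyncio.Semaphore sized to the crawler's concurrency argument. The fetch/crawl_all helper names are illustrative only and do not appear in utils/helpers.py.

import asyncio

import aiohttp


async def fetch(session, semaphore, url):
    # At most `concurrency` coroutines can hold the semaphore at once,
    # so no more than that many requests are ever in flight.
    async with semaphore:
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=5)) as resp:
            return await resp.read()


async def crawl_all(urls, concurrency=5):
    semaphore = asyncio.Semaphore(concurrency)
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(
            *(fetch(session, semaphore, url) for url in urls))


if __name__ == '__main__':
    asyncio.run(crawl_all(['https://example.com/']))

Every coroutine is created up front, but the `async with semaphore` line blocks all but a fixed number of them from reaching the network, which is the throttling behaviour the docstring addition documents.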