diff --git a/utils/helpers.py b/utils/helpers.py
index f23ce6e..e17a927 100644
--- a/utils/helpers.py
+++ b/utils/helpers.py
@@ -18,7 +18,6 @@ class AsyncCrawler(object):
     def __init__(self, baseurl=None, robots=None, concurrency=None):
         self.baseurl = baseurl
         self.robots = robots
-        self.uncrawled = set()
         self.crawled = set()
         self.headers = {'Accept-Encoding': 'gzip, deflate',
                         'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
@@ -28,7 +27,7 @@ class AsyncCrawler(object):
 
     async def crawl_url(self, url=None):
         '''
-        docstring
+        Crawls the given URL and finds all new URLs in the initial page.
         '''
         urls = []
         source = await self.get_source(url)
@@ -54,7 +53,8 @@ class AsyncCrawler(object):
 
     async def get_source(self, url=None):
         '''
-        Obtains the URL's source, provided it is HTML.
+        Obtains the URL's source, provided it is HTML. Usage of semaphores
+        ensures only a certain number of coroutines can run at once.
         '''
         async with self.semaphore:
             async with self.client_session.head(url, timeout=5) as head:
@@ -66,7 +66,6 @@ class AsyncCrawler(object):
             async with self.client_session.get(url, timeout=5) as resp:
                 try:
                     source = await resp.read()
-                    print('crawled {0}'.format(url))
                     return source
                 except Exception:
                     return None