From 3808f72f731d398a449f1d363c0a43e557ed2254 Mon Sep 17 00:00:00 2001
From: Simon Weald
Date: Fri, 14 Sep 2018 16:06:17 +0100
Subject: [PATCH] correct semaphore usage

---
 utils/helpers.py | 29 +++++++++++++----------------
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/utils/helpers.py b/utils/helpers.py
index be1bda3..80d5130 100644
--- a/utils/helpers.py
+++ b/utils/helpers.py
@@ -38,16 +38,12 @@ class AsyncCrawler(object):
         docstring
         '''
         urls = set()
-        async with self.semaphore:
-            source = await self.get_source(url)
-            if source:
-                # add the URL we've just crawled
-                self.crawled.add(url)
-                for new_url in self.find_all_urls(source):
-                    urls.add(new_url)
-                # urls_to_crawl = self.find_all_urls(source)
-                # for new_url in urls_to_crawl:
-                #     urls.add(new_url)
+        source = await self.get_source(url)
+        if source:
+            # add the URL we've just crawled
+            self.crawled.add(url)
+            for new_url in self.find_all_urls(source):
+                urls.add(new_url)
 
         return urls
 
@@ -71,12 +67,13 @@ class AsyncCrawler(object):
        Obtains the page's source.
        '''
        print('semaphore held for {0}'.format(url))
-        async with self.client_session.get(url, timeout=5) as resp:
-            try:
-                source = await resp.read()
-                return source
-            except Exception:
-                return None
+        async with self.semaphore:
+            async with self.client_session.get(url, timeout=5) as resp:
+                try:
+                    source = await resp.read()
+                    return source
+                except Exception:
+                    return None
 
     def find_all_urls(self, source=None):
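
For context, a minimal standalone sketch of the pattern this patch moves get_source() towards: hold the asyncio.Semaphore only while the HTTP request is in flight, so the limit applies to concurrent network requests rather than to whole crawl steps. The names fetch, crawl_all, and the limit value are illustrative and not part of utils/helpers.py; it assumes aiohttp 3.x.

import asyncio

import aiohttp


async def fetch(session, semaphore, url):
    # Acquire the shared semaphore only for the duration of the request,
    # mirroring the corrected get_source() in the patch above.
    async with semaphore:
        try:
            async with session.get(url, timeout=aiohttp.ClientTimeout(total=5)) as resp:
                return await resp.read()
        except Exception:
            return None


async def crawl_all(urls, limit=10):
    # At most `limit` requests run concurrently; parsing and bookkeeping
    # happen outside the semaphore.
    semaphore = asyncio.Semaphore(limit)
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch(session, semaphore, u) for u in urls))


if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    pages = loop.run_until_complete(crawl_all(['https://example.com']))
    print([len(p) if p else None for p in pages])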