correct semaphore usage

2018-09-14 16:06:17 +01:00
parent 7ebe4855b8
commit 3808f72f73


@@ -38,16 +38,12 @@ class AsyncCrawler(object):
         docstring
         '''
         urls = set()
-        async with self.semaphore:
-            source = await self.get_source(url)
-            if source:
-                # add the URL we've just crawled
-                self.crawled.add(url)
-                for new_url in self.find_all_urls(source):
-                    urls.add(new_url)
-            # urls_to_crawl = self.find_all_urls(source)
-            # for new_url in urls_to_crawl:
-            #     urls.add(new_url)
+        source = await self.get_source(url)
+        if source:
+            # add the URL we've just crawled
+            self.crawled.add(url)
+            for new_url in self.find_all_urls(source):
+                urls.add(new_url)
         return urls
@@ -71,12 +67,13 @@ class AsyncCrawler(object):
         Obtains the page's source.
         '''
         print('semaphore held for {0}'.format(url))
-        async with self.client_session.get(url, timeout=5) as resp:
-            try:
-                source = await resp.read()
-                return source
-            except Exception:
-                return None
+        async with self.semaphore:
+            async with self.client_session.get(url, timeout=5) as resp:
+                try:
+                    source = await resp.read()
+                    return source
+                except Exception:
+                    return None
 
     def find_all_urls(self, source=None):
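
The fix narrows the critical section: before this commit the semaphore was held across the whole crawl step, including link extraction and bookkeeping, so permits stayed tied up while no request was in flight. Now it is acquired inside get_source and held only around the HTTP request, which is what the limit is meant to bound. Below is a minimal runnable sketch of the resulting pattern, assuming the parts of AsyncCrawler this diff does not show (the constructor, the aiohttp session wiring, and the body of find_all_urls); names such as max_connections and the regex-based link extraction are illustrative, not code from this repository.

    import asyncio
    import re

    import aiohttp


    class AsyncCrawler(object):
        '''
        Sketch of the crawler after this commit: the semaphore caps
        concurrent HTTP requests only, not parsing or bookkeeping.
        '''

        def __init__(self, max_connections=5):
            # max_connections is an assumed name; the real constructor
            # is not part of this diff.
            self.semaphore = asyncio.Semaphore(max_connections)
            self.client_session = None
            self.crawled = set()

        async def crawl(self, url):
            urls = set()
            source = await self.get_source(url)
            if source:
                # add the URL we've just crawled
                self.crawled.add(url)
                for new_url in self.find_all_urls(source):
                    urls.add(new_url)
            return urls

        async def get_source(self, url):
            # The semaphore is acquired here and released as soon as the
            # response body has been read (or the read fails).
            async with self.semaphore:
                async with self.client_session.get(url, timeout=5) as resp:
                    try:
                        return await resp.read()
                    except Exception:
                        return None

        def find_all_urls(self, source=None):
            # Illustrative stand-in for the real link extraction.
            if not source:
                return set()
            return set(re.findall(rb'href="(http[^"]+)"', source))


    async def main():
        crawler = AsyncCrawler(max_connections=5)
        async with aiohttp.ClientSession() as session:
            crawler.client_session = session
            print(await crawler.crawl('https://example.com'))


    if __name__ == '__main__':
        asyncio.run(main())

With the semaphore scoped this way, at most max_connections requests are in flight at once while any number of crawl coroutines can parse already-fetched pages concurrently; aiohttp's own connector limit could also be used for the same purpose, but that is a separate design choice from this commit.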