correct semaphore usage
This commit is contained in:
@@ -38,16 +38,12 @@ class AsyncCrawler(object):
|
|||||||
docstring
|
docstring
|
||||||
'''
|
'''
|
||||||
urls = set()
|
urls = set()
|
||||||
async with self.semaphore:
|
source = await self.get_source(url)
|
||||||
source = await self.get_source(url)
|
if source:
|
||||||
if source:
|
# add the URL we've just crawled
|
||||||
# add the URL we've just crawled
|
self.crawled.add(url)
|
||||||
self.crawled.add(url)
|
for new_url in self.find_all_urls(source):
|
||||||
for new_url in self.find_all_urls(source):
|
urls.add(new_url)
|
||||||
urls.add(new_url)
|
|
||||||
# urls_to_crawl = self.find_all_urls(source)
|
|
||||||
# for new_url in urls_to_crawl:
|
|
||||||
# urls.add(new_url)
|
|
||||||
|
|
||||||
return urls
|
return urls
|
||||||
|
|
||||||
@@ -71,12 +67,13 @@ class AsyncCrawler(object):
|
|||||||
Obtains the page's source.
|
Obtains the page's source.
|
||||||
'''
|
'''
|
||||||
print('semaphore held for {0}'.format(url))
|
print('semaphore held for {0}'.format(url))
|
||||||
async with self.client_session.get(url, timeout=5) as resp:
|
async with self.semaphore:
|
||||||
try:
|
async with self.client_session.get(url, timeout=5) as resp:
|
||||||
source = await resp.read()
|
try:
|
||||||
return source
|
source = await resp.read()
|
||||||
except Exception:
|
return source
|
||||||
return None
|
except Exception:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
def find_all_urls(self, source=None):
|
def find_all_urls(self, source=None):
|
||||||
|
|||||||
Reference in New Issue
Block a user