Improve documentation and remove unneeded set
This commit is contained in:
@@ -18,7 +18,6 @@ class AsyncCrawler(object):
|
||||
def __init__(self, baseurl=None, robots=None, concurrency=None):
|
||||
self.baseurl = baseurl
|
||||
self.robots = robots
|
||||
self.uncrawled = set()
|
||||
self.crawled = set()
|
||||
self.headers = {'Accept-Encoding': 'gzip, deflate',
|
||||
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
|
||||
@@ -28,7 +27,7 @@ class AsyncCrawler(object):
|
||||
|
||||
async def crawl_url(self, url=None):
|
||||
'''
|
||||
docstring
|
||||
Crawls the given URL and finds all new URLs in the initial page.
|
||||
'''
|
||||
urls = []
|
||||
source = await self.get_source(url)
|
||||
@@ -54,7 +53,8 @@ class AsyncCrawler(object):
|
||||
|
||||
async def get_source(self, url=None):
|
||||
'''
|
||||
Obtains the URL's source, provided it is HTML.
|
||||
Obtains the URL's source, provided it is HTML. Usage of semaphores
|
||||
ensures only a certain number of coroutines can run at once.
|
||||
'''
|
||||
async with self.semaphore:
|
||||
async with self.client_session.head(url, timeout=5) as head:
|
||||
@@ -66,7 +66,6 @@ class AsyncCrawler(object):
|
||||
async with self.client_session.get(url, timeout=5) as resp:
|
||||
try:
|
||||
source = await resp.read()
|
||||
print('crawled {0}'.format(url))
|
||||
return source
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
Reference in New Issue
Block a user