improve documentation
@@ -54,7 +54,7 @@ class AsyncCrawler(object):

     async def get_source(self, url=None):
         '''
-        Obtains the page's source.
+        Obtains the URL's source, provided it is HTML.
         '''
         async with self.semaphore:
             async with self.client_session.head(url, timeout=5) as head:
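The updated docstring matches what the method actually does: it issues a HEAD request first and only treats the URL as crawlable when the server reports HTML. A minimal standalone sketch of that pattern (get_html_source, the explicit session and semaphore arguments, and the text/html check are illustrative assumptions, not the project's API):

    import aiohttp

    async def get_html_source(session, semaphore, url):
        # Hypothetical helper mirroring the pattern above: a cheap HEAD
        # request decides whether the body is worth downloading at all.
        async with semaphore:
            async with session.head(url, timeout=aiohttp.ClientTimeout(total=5)) as head:
                if 'text/html' not in head.headers.get('Content-Type', ''):
                    return None
            async with session.get(url) as response:
                return await response.text()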
@@ -70,13 +70,12 @@ class AsyncCrawler(object):
                         return source
                 except Exception:
                     return None
             else:
                 print('{0} - {1}'.format(head.headers['Content-Type'], url))


     def find_all_urls(self, source=None):
         '''
-        Find all URLs in a page's source.
+        Find all URLs in a page's source. Returns a list of URLs which have
+        been validated as local to the starting URL.
         '''
         urls = []
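The expanded find_all_urls docstring pins down what "validated as local" means: only URLs on the same host as the starting URL are kept. One way to express that check with the standard library (is_local is an illustrative helper, not part of the commit):

    from urllib.parse import urljoin, urlparse

    def is_local(candidate, baseurl):
        # Resolve relative links against the starting URL, then compare
        # network locations; anything on another host is discarded.
        resolved = urljoin(baseurl, candidate)
        return urlparse(resolved).netloc == urlparse(baseurl).netloc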
@@ -93,12 +92,21 @@ class AsyncCrawler(object):


     async def run(self, urls=None):
+        '''
+        Crawls a batch of URLs of any size (resource usage is bounded by a
+        semaphore of size n, where n = concurrency). Returns a set of URLs to
+        be added to the list of URLs which need to be crawled (find_all_urls
+        only returns unseen URLs).
+        '''
         tasks = []
         all_urls = set()
         for url in urls:
+            # mark the URL as seen.
             self.crawled.add(url)
+            # create a coroutine to crawl the URL.
             tasks.append(self.crawl_url(url))

+        # wait for all tasks to complete.
         for task in asyncio.as_completed(tasks):
             urls = None
             try:
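The new docstring and comments describe the batching pattern: every URL in the batch becomes a coroutine, and results are consumed in completion order. A stripped-down sketch of the same flow, with a hypothetical fetch() standing in for crawl_url():

    import asyncio

    async def fetch(url):
        # Stand-in for crawl_url(): pretend each page links back to itself.
        await asyncio.sleep(0)
        return {url}

    async def run_batch(urls):
        tasks = [fetch(url) for url in urls]
        discovered = set()
        # asyncio.as_completed yields awaitables as they finish, so slow
        # pages do not block the handling of fast ones.
        for task in asyncio.as_completed(tasks):
            try:
                discovered |= await task
            except Exception as e:
                print(e)
        return discovered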
@@ -107,6 +115,7 @@ class AsyncCrawler(object):
             except Exception as e:
                 print(e)

+            # add the URLs to a set to be returned.
             if urls:
                 for url in urls:
                     all_urls.add(url)
@@ -115,15 +124,22 @@ class AsyncCrawler(object):


     async def main(self):
+        '''
+        Runs a crawl with batches of URLs. Once complete, returns the set of
+        all crawled URLs.
+        '''
         self.client_session = aiohttp.ClientSession(headers=self.headers)
         to_crawl = []
         to_crawl.append(self.baseurl)

         while len(to_crawl) > 0:
             discovered_urls = await self.run(urls=to_crawl)
+            # empty the to_crawl list and then add all newly discovered URLs
+            # for the next iteration.
             to_crawl.clear()
             to_crawl.extend(discovered_urls)

+        # close the session once all URLs have been crawled.
         await self.client_session.close()

         return self.crawled
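Because main() owns the ClientSession and loops until a batch discovers nothing new, the whole crawl can be driven from a single entry point. A hedged usage sketch (the constructor arguments baseurl and concurrency are assumptions about the class, not documented here):

    import asyncio

    # Assumed constructor signature; the real class may take different arguments.
    crawler = AsyncCrawler(baseurl='https://example.com', concurrency=5)
    crawled = asyncio.run(crawler.main())
    print('{0} URLs crawled'.format(len(crawled)))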