improve documentation

2018-09-15 21:48:50 +01:00
parent 0244435fea
commit 6548f55416


@@ -54,7 +54,7 @@ class AsyncCrawler(object):
async def get_source(self, url=None):
'''
Obtains the page's source.
Obtains the URL's source, provided it is HTML.
'''
async with self.semaphore:
async with self.client_session.head(url, timeout=5) as head:
@@ -70,13 +70,12 @@ class AsyncCrawler(object):
return source
except Exception:
return None
else:
print('{0} - {1}'.format(head.headers['Content-Type'], url))
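
As the updated docstring says, get_source only fetches a body when the URL turns out to be HTML: a HEAD request is made under the semaphore first, and the Content-Type decides whether a GET follows. The GET half of the method is not visible in this diff, so the following is a minimal sketch of the whole pattern (shown outside the class for brevity), assuming self.semaphore is an asyncio.Semaphore and self.client_session is the aiohttp.ClientSession created in main below:

    import aiohttp

    async def get_source(self, url=None):
        '''
        Obtains the URL's source, provided it is HTML.
        '''
        async with self.semaphore:
            # cheap HEAD request first, to inspect the Content-Type without
            # downloading the body.
            async with self.client_session.head(url, timeout=5) as head:
                if 'text/html' in head.headers.get('Content-Type', ''):
                    try:
                        # the page is HTML, so fetch the full source with a GET.
                        async with self.client_session.get(url, timeout=5) as response:
                            source = await response.text()
                            return source
                    except Exception:
                        return None
                else:
                    # non-HTML resources are reported and skipped.
                    print('{0} - {1}'.format(head.headers['Content-Type'], url))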
def find_all_urls(self, source=None):
'''
Find all URLs in a page's source.
Find all URLs in a page's source. Returns a list of URLs which have
been validated as local to the starting URL.
'''
urls = []
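
The new docstring promises that only URLs local to the starting URL are returned. The body of find_all_urls is not part of this diff; a minimal sketch of that filtering, assuming self.baseurl and the self.crawled set used elsewhere in the class, and using a simple href regex in place of whatever parsing the real code does, might look like:

    import re
    from urllib.parse import urljoin, urlparse

    def find_all_urls(self, source=None):
        '''
        Find all URLs in a page's source. Returns a list of URLs which have
        been validated as local to the starting URL.
        '''
        urls = []
        if not source:
            return urls
        base_host = urlparse(self.baseurl).netloc
        for href in re.findall(r'href=[\'"]([^\'"]+)[\'"]', source):
            # resolve relative links against the starting URL.
            url = urljoin(self.baseurl, href)
            # keep only same-host URLs that haven't already been crawled.
            if urlparse(url).netloc == base_host and url not in self.crawled:
                urls.append(url)
        return urls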
@@ -93,12 +92,21 @@ class AsyncCrawler(object):
async def run(self, urls=None):
'''
Crawls a batch of URLs of any size (resource usage is bounded by n
semaphores, where n = concurrency). Returns a set of URLs to be added
to the list of URLs which need to be crawled (find_all_urls only returns
unseen URLs).
'''
tasks = []
all_urls = set()
for url in urls:
# mark the URL as seen.
self.crawled.add(url)
# create a coroutine to crawl the URL.
tasks.append(self.crawl_url(url))
# wait for all tasks to complete.
for task in asyncio.as_completed(tasks):
urls = None
try:
@@ -107,6 +115,7 @@ class AsyncCrawler(object):
except Exception as e:
print(e)
# add the URLs to a set to be returned.
if urls:
for url in urls:
all_urls.add(url)
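
run awaits self.crawl_url(url) for every URL in the batch. That helper is not touched by this commit, but a plausible shape for it, simply chaining the two methods above, would be:

    async def crawl_url(self, url=None):
        # fetch the page source; None means the URL was not HTML or the
        # request failed.
        source = await self.get_source(url=url)
        if source is None:
            return None
        # return the local, unseen URLs found in the source.
        return self.find_all_urls(source=source)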
@@ -115,15 +124,22 @@ class AsyncCrawler(object):
async def main(self):
'''
Runs a crawl with batches of URLs. Once complete returns a list of all
crawled URLs.
'''
self.client_session = aiohttp.ClientSession(headers=self.headers)
to_crawl = []
to_crawl.append(self.baseurl)
while len(to_crawl) > 0:
discovered_urls = await self.run(urls=to_crawl)
# empty the to_crawl list and then add all newly discovered URLs for
# the next iteration.
to_crawl.clear()
to_crawl.extend(discovered_urls)
# close the session once all URLs have been crawled.
await self.client_session.close()
return self.crawled
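
For completeness, a hypothetical way to drive the crawler end to end; the constructor arguments below are assumptions, since the class's __init__ is not part of this diff:

    import asyncio

    crawler = AsyncCrawler(baseurl='https://example.com', concurrency=5)
    loop = asyncio.get_event_loop()
    crawled = loop.run_until_complete(crawler.main())
    print('{0} URLs crawled'.format(len(crawled)))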