improve documentation

2018-09-15 21:48:50 +01:00
parent 0244435fea
commit 6548f55416


@@ -54,7 +54,7 @@ class AsyncCrawler(object):
     async def get_source(self, url=None):
         '''
-        Obtains the page's source.
+        Obtains the URL's source, provided it is HTML.
         '''
         async with self.semaphore:
             async with self.client_session.head(url, timeout=5) as head:
@@ -70,13 +70,12 @@ class AsyncCrawler(object):
                             return source
                     except Exception:
                         return None
-                else:
-                    print('{0} - {1}'.format(head.headers['Content-Type'], url))
 
     def find_all_urls(self, source=None):
         '''
-        Find all URLs in a page's source.
+        Find all URLs in a page's source. Returns a list of URLs which have
+        been validated as local to the starting URL.
         '''
         urls = []
@@ -93,12 +92,21 @@ class AsyncCrawler(object):
     async def run(self, urls=None):
+        '''
+        Crawls a batch of URLs of any size (resource usage is bounded by n
+        semaphores, where n = concurrency). Returns a set of URLs to be added
+        to the list of URLs which need to be crawled (find_all_urls only returns
+        unseen URLs).
+        '''
         tasks = []
         all_urls = set()
         for url in urls:
+            # mark the URL as seen.
             self.crawled.add(url)
+            # create a coroutine to crawl the URL.
             tasks.append(self.crawl_url(url))
+        # wait for all tasks to complete.
         for task in asyncio.as_completed(tasks):
             urls = None
             try:
@@ -107,6 +115,7 @@ class AsyncCrawler(object):
             except Exception as e:
                 print(e)
+            # add the URLs to a set to be returned.
             if urls:
                 for url in urls:
                     all_urls.add(url)
@@ -115,15 +124,22 @@ class AsyncCrawler(object):
     async def main(self):
+        '''
+        Runs a crawl with batches of URLs. Once complete, returns a list of all
+        crawled URLs.
+        '''
         self.client_session = aiohttp.ClientSession(headers=self.headers)
         to_crawl = []
         to_crawl.append(self.baseurl)
         while len(to_crawl) > 0:
             discovered_urls = await self.run(urls=to_crawl)
+            # empty the to_crawl list and then add all newly discovered URLs for
+            # the next iteration.
             to_crawl.clear()
             to_crawl.extend(discovered_urls)
+        # close the session once all URLs have been crawled.
         await self.client_session.close()
         return self.crawled
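
For readers landing on this commit without the rest of the file: the sketch below is a minimal, self-contained illustration of the batch-crawl pattern the new docstrings describe (a semaphore bounds concurrent requests, asyncio.as_completed collects each batch, and a driver loop repeats until no new URLs are discovered). It is not the project's code: fetch(), crawl() and extract_links() are hypothetical stand-ins for crawl_url(), main() and find_all_urls(), whose bodies are not shown in this diff, and aiohttp 3.x is assumed.

# Minimal sketch only; names below are illustrative, not the project's API.
import asyncio

import aiohttp


def extract_links(source):
    '''Placeholder for find_all_urls(): parse same-site links out of the source.'''
    return set()


async def fetch(session, semaphore, url):
    '''Fetch a URL's HTML, bounded by the shared semaphore (cf. get_source()).'''
    async with semaphore:
        timeout = aiohttp.ClientTimeout(total=5)
        # Check the Content-Type with a HEAD request before downloading the body.
        async with session.head(url, timeout=timeout) as head:
            if 'text/html' not in head.headers.get('Content-Type', ''):
                return None
        async with session.get(url, timeout=timeout) as response:
            return await response.text()


async def crawl(baseurl, concurrency=5):
    '''Crawl in batches: each pass fetches every URL discovered by the last (cf. main()).'''
    semaphore = asyncio.Semaphore(concurrency)
    crawled = set()
    to_crawl = [baseurl]
    async with aiohttp.ClientSession() as session:
        while to_crawl:
            # Mark the whole batch as seen, then crawl it concurrently.
            crawled.update(to_crawl)
            tasks = [fetch(session, semaphore, url) for url in to_crawl]
            discovered = set()
            for task in asyncio.as_completed(tasks):
                try:
                    source = await task
                except Exception as e:
                    print(e)
                    continue
                if source:
                    # Queue only URLs that have not been crawled yet.
                    discovered.update(extract_links(source) - crawled)
            to_crawl = list(discovered)
    return crawled


if __name__ == '__main__':
    print(asyncio.run(crawl('https://example.com')))

asyncio.as_completed is used here rather than asyncio.gather so each page can be handled as soon as its request finishes, which matches the per-task try/except shown in run() above.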