improve documentation
@@ -54,7 +54,7 @@ class AsyncCrawler(object):
 
     async def get_source(self, url=None):
         '''
-        Obtains the page's source.
+        Obtains the URL's source, provided it is HTML.
         '''
         async with self.semaphore:
             async with self.client_session.head(url, timeout=5) as head:
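The HEAD request shown in this hunk is what lets get_source honour its new docstring: the Content-Type is checked before the body is downloaded. A minimal, standalone sketch of that HEAD-then-GET pattern, not part of the commit (the class attributes become plain parameters here, and the rest of the method's body is only partly visible in this diff):

import asyncio

import aiohttp

async def get_source(session, semaphore, url):
    '''Return the page source for url, or None if it is not HTML or unreachable.'''
    async with semaphore:  # limits the number of requests in flight
        try:
            async with session.head(url, timeout=5) as head:
                if 'text/html' not in head.headers.get('Content-Type', ''):
                    return None  # skip images, PDFs, etc.
            async with session.get(url, timeout=5) as response:
                return await response.text()
        except Exception:
            return None  # any network failure is treated as "no source"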
@@ -70,13 +70,12 @@ class AsyncCrawler(object):
                         return source
                 except Exception:
                     return None
-                else:
-                    print('{0} - {1}'.format(head.headers['Content-Type'], url))


     def find_all_urls(self, source=None):
         '''
-        Find all URLs in a page's source.
+        Find all URLs in a page's source. Returns a list of URLs which have
+        been validated as local to the starting URL.
         '''
         urls = []

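The expanded find_all_urls docstring promises URLs "validated as local to the starting URL". A rough, standard-library sketch of that validation step, not part of the commit (the regex extraction is illustrative only; the method's actual parsing is not shown in this diff):

import re
from urllib.parse import urljoin, urlparse

def find_all_urls(source, baseurl):
    '''Return the hrefs in source that resolve to the same host as baseurl.'''
    urls = []
    base_host = urlparse(baseurl).netloc
    for href in re.findall(r'href=[\'"]([^\'"]+)[\'"]', source):
        absolute = urljoin(baseurl, href)  # resolve relative links against the base
        if urlparse(absolute).netloc == base_host:
            urls.append(absolute)  # keep only URLs local to the starting site
    return urls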
@@ -93,12 +92,21 @@ class AsyncCrawler(object):


     async def run(self, urls=None):
+        '''
+        Crawls a batch of URLs of any size; resource usage is bounded by a
+        semaphore of size n (where n = concurrency). Returns a set of URLs to
+        be added to the list of URLs which need to be crawled (find_all_urls
+        only returns unseen URLs).
+        '''
         tasks = []
         all_urls = set()
         for url in urls:
+            # mark the URL as seen.
             self.crawled.add(url)
+            # create a coroutine to crawl the URL.
             tasks.append(self.crawl_url(url))

+        # wait for all tasks to complete.
         for task in asyncio.as_completed(tasks):
             urls = None
             try:
@@ -107,6 +115,7 @@ class AsyncCrawler(object):
             except Exception as e:
                 print(e)

+            # add the URLs to a set to be returned.
             if urls:
                 for url in urls:
                     all_urls.add(url)
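Taken together, the comments added to run() describe a simple batch pattern: mark every URL in the batch as seen, crawl them concurrently, and collect whatever new URLs they yield. A hedged, standalone sketch of that pattern, not part of the commit (crawl_url is assumed to return an iterable of unseen URLs or None, as the diff implies, and is passed in rather than read from self):

import asyncio

async def run(crawl_url, crawled, urls):
    '''Crawl one batch of URLs and return the set of newly discovered URLs.'''
    crawled.update(urls)                      # mark the whole batch as seen
    tasks = [crawl_url(url) for url in urls]  # one coroutine per URL
    discovered = set()
    for task in asyncio.as_completed(tasks):  # handle results as they finish
        try:
            found = await task
        except Exception as e:
            print(e)
            found = None
        if found:
            discovered.update(found)          # queued for the next batch
    return discovered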
@@ -115,15 +124,22 @@ class AsyncCrawler(object):


     async def main(self):
+        '''
+        Runs a crawl with batches of URLs. Once complete, returns the set of
+        all crawled URLs.
+        '''
         self.client_session = aiohttp.ClientSession(headers=self.headers)
         to_crawl = []
         to_crawl.append(self.baseurl)

         while len(to_crawl) > 0:
             discovered_urls = await self.run(urls=to_crawl)
+            # empty the to_crawl list and then add all newly discovered URLs
+            # for the next iteration.
             to_crawl.clear()
             to_crawl.extend(discovered_urls)

+        # close the session once all URLs have been crawled.
         await self.client_session.close()

         return self.crawled
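main()'s loop is effectively a breadth-first traversal in batches: crawl everything in to_crawl, then replace the list with that batch's discoveries, until a batch discovers nothing new. A self-contained sketch of just that loop, not part of the commit (discover is a hypothetical stand-in for crawling a single URL; the real class fetches pages over aiohttp instead):

import asyncio

async def discover(url, crawled):
    '''Hypothetical stand-in: fetch url and return any unseen local URLs.'''
    await asyncio.sleep(0)  # placeholder for the real fetch/parse work
    return set()

async def main(baseurl):
    crawled = set()
    to_crawl = [baseurl]
    while to_crawl:
        crawled.update(to_crawl)  # mark the batch as seen
        batches = await asyncio.gather(*(discover(url, crawled) for url in to_crawl))
        # rebuild the to-crawl list from this batch's discoveries only
        to_crawl = [url for found in batches for url in found if url not in crawled]
    return crawled

print(asyncio.run(main('https://example.com')))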