diff --git a/utils/helpers.py b/utils/helpers.py
index a63690c..f23ce6e 100644
--- a/utils/helpers.py
+++ b/utils/helpers.py
@@ -54,7 +54,7 @@ class AsyncCrawler(object):
 
     async def get_source(self, url=None):
         '''
-        Obtains the page's source.
+        Obtains the URL's source, provided it is HTML.
         '''
         async with self.semaphore:
             async with self.client_session.head(url, timeout=5) as head:
@@ -70,13 +70,12 @@ class AsyncCrawler(object):
                             return source
                         except Exception:
                             return None
-                else:
-                    print('{0} - {1}'.format(head.headers['Content-Type'], url))
 
 
     def find_all_urls(self, source=None):
         '''
-        Find all URLs in a page's source.
+        Find all URLs in a page's source. Returns a list of URLs which have
+        been validated as local to the starting URL.
         '''
 
         urls = []
@@ -93,12 +92,21 @@ class AsyncCrawler(object):
 
     async def run(self, urls=None):
+        '''
+        Crawls a batch of URLs of any size; resource usage is bounded by a
+        semaphore of size n (where n = concurrency). Returns a set of URLs to
+        be added to the list of URLs which still need to be crawled
+        (find_all_urls only returns unseen URLs).
+        '''
         tasks = []
         all_urls = set()
 
         for url in urls:
+            # mark the URL as seen.
             self.crawled.add(url)
+            # create a coroutine to crawl the URL.
             tasks.append(self.crawl_url(url))
 
+        # wait for all tasks to complete.
         for task in asyncio.as_completed(tasks):
             urls = None
             try:
@@ -107,6 +115,7 @@ class AsyncCrawler(object):
             except Exception as e:
                 print(e)
 
+            # add the URLs to a set to be returned.
             if urls:
                 for url in urls:
                     all_urls.add(url)
@@ -115,15 +124,22 @@ class AsyncCrawler(object):
 
     async def main(self):
+        '''
+        Runs the crawl in batches of URLs. Once complete, returns the set of
+        all crawled URLs.
+        '''
         self.client_session = aiohttp.ClientSession(headers=self.headers)
 
         to_crawl = []
         to_crawl.append(self.baseurl)
 
         while len(to_crawl) > 0:
             discovered_urls = await self.run(urls=to_crawl)
+            # empty the to_crawl list and then add all newly discovered URLs
+            # for the next iteration.
             to_crawl.clear()
             to_crawl.extend(discovered_urls)
 
         await self.client_session.close()
 
         return self.crawled
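
For context, below is a minimal usage sketch of how this crawler might be driven. The AsyncCrawler constructor is not part of this diff, so the baseurl and concurrency parameter names are assumptions inferred from the attributes the methods use (self.baseurl, self.semaphore, self.headers).

    # Minimal usage sketch; the constructor signature shown here is assumed,
    # not part of the diff above.
    import asyncio

    from utils.helpers import AsyncCrawler


    async def crawl_site():
        # 'baseurl' and 'concurrency' are hypothetical parameter names
        # inferred from the attributes used in the diff.
        crawler = AsyncCrawler(baseurl='https://example.com', concurrency=5)
        # main() drives the batch loop and returns the set of crawled URLs.
        crawled = await crawler.main()
        print('Crawled {0} URLs'.format(len(crawled)))


    if __name__ == '__main__':
        asyncio.run(crawl_site())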