diff --git a/async_crawler.py b/async_crawler.py
index f8285ad..2dae874 100644
--- a/async_crawler.py
+++ b/async_crawler.py
@@ -24,6 +24,15 @@ from utils.helpers import RobotsTxt, AsyncCrawler, standardise_url
 def sanity_checks(url=None):
     '''
     Runs some basic sanity checks before the crawler is initialised.
+
+    Accepts:
+        url: the root URL to be crawled.
+
+    Returns:
+        baseurl: a validated and cleaned version of the initial URL.
+                 (type=string)
+        robots: an object which allows us to query whether a site may be crawled.
+                (type=RobotsTxt)
     '''
     # ensure we have a sensible URL to work with
     baseurl = standardise_url(url=url)
@@ -41,6 +50,11 @@ def sanity_checks(url=None):
 def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
     '''
     Renders the sitemap to an HTML file.
+
+    Accepts:
+        base_url:
+        crawled_urls:
+        runtime:
     '''
     urlcount = len(crawled_urls)
     sorted_urls = sorted(crawled_urls)
diff --git a/utils/helpers.py b/utils/helpers.py
index 5739b7f..58770be 100644
--- a/utils/helpers.py
+++ b/utils/helpers.py
@@ -58,7 +58,7 @@ class AsyncCrawler(object):
         async with self.semaphore:
             async with self.client_session.head(url, timeout=5) as head:
                 try:
-                    data = await head.read()
+                    _ = await head.read()
                 except Exception as e:
                     print(e)
                 if 'text/html' in head.headers['Content-Type']:
@@ -130,7 +130,7 @@ class AsyncCrawler(object):
         print('Crawling: {0}'.format(self.baseurl))
         while len(to_crawl) > 0:
             discovered_urls = await self.run(urls=to_crawl)
-            # empty toe crawl list and then add all newly discovered URLs for
+            # empty to_crawl list and then add all newly discovered URLs for
             # the next iteration.
             to_crawl.clear()
             to_crawl.extend(discovered_urls)
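
For readers following along, the Content-Type check touched by the first utils/helpers.py hunk follows a common aiohttp pattern: issue a HEAD request under a semaphore, discard any body, and decide from the response headers whether the URL is worth crawling. Below is a minimal standalone sketch of that pattern, not the project's AsyncCrawler code; is_html, the semaphore size, and the example URL are illustrative only.

import asyncio

import aiohttp


async def is_html(session: aiohttp.ClientSession, sem: asyncio.Semaphore, url: str) -> bool:
    # Limit concurrency with the semaphore, then issue a HEAD request and
    # decide from the Content-Type header whether the URL points at an HTML page.
    async with sem:
        async with session.head(url, timeout=aiohttp.ClientTimeout(total=5)) as head:
            return 'text/html' in head.headers.get('Content-Type', '')


async def main() -> None:
    sem = asyncio.Semaphore(10)
    async with aiohttp.ClientSession() as session:
        print(await is_html(session, sem, 'https://example.com/'))


if __name__ == '__main__':
    asyncio.run(main())

The sketch never reads the response body: a HEAD response is expected to be empty, and exiting the context manager releases the connection, which is why the hunk above can safely throw the result of head.read() away.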