small improvements to docs and variables
@@ -24,6 +24,15 @@ from utils.helpers import RobotsTxt, AsyncCrawler, standardise_url
 def sanity_checks(url=None):
     '''
     Runs some basic sanity checks before the crawler is initialised.
+
+    Accepts:
+        url: the root URL to be crawled.
+
+    Returns:
+        baseurl: a validated and cleaned version of the initial URL.
+        (type=string)
+        robots: an object which allows us to query whether a site may be crawled.
+        (type=RobotsTxt)
     '''
     # ensure we have a sensible URL to work with
     baseurl = standardise_url(url=url)
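The new docstring promises a validated and cleaned baseurl, but standardise_url itself lives in utils.helpers and its body is not part of this diff. A minimal sketch of what such a helper might do, assuming it defaults a missing scheme to https and strips trailing slashes (both assumptions, not the real implementation):

from urllib.parse import urlparse, urlunparse

def standardise_url(url=None):
    # default to https when no scheme is given, then drop any trailing
    # slash so equivalent URLs compare equal (both behaviours assumed)
    if not url:
        raise ValueError('a root URL is required')
    if '://' not in url:
        url = 'https://{0}'.format(url)
    parsed = urlparse(url)
    return urlunparse(parsed._replace(path=parsed.path.rstrip('/')))

print(standardise_url('example.com/'))              # https://example.com
print(standardise_url('http://example.com/docs/'))  # http://example.com/docs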
@@ -41,6 +50,11 @@ def sanity_checks(url=None):
 def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
     '''
     Renders the sitemap to an HTML file.
+
+    Accepts:
+        base_url: the root URL the crawl started from.
+        crawled_urls: the URLs discovered during the crawl.
+        runtime: how long the crawl took.
     '''
     urlcount = len(crawled_urls)
     sorted_urls = sorted(crawled_urls)
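Only the URL count and sort are visible in this hunk, so the rest of render_sitemap is guesswork. A minimal sketch of what a renderer with this signature might produce; the HTML template, the sitemap.html output path and the runtime wording are all illustrative assumptions:

def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
    urlcount = len(crawled_urls)
    sorted_urls = sorted(crawled_urls)
    rows = '\n'.join('    <li><a href="{0}">{0}</a></li>'.format(u)
                     for u in sorted_urls)
    html = ('<h1>Sitemap for {0}</h1>\n'
            '<p>{1} URLs crawled in {2} seconds.</p>\n'
            '<ul>\n{3}\n</ul>\n').format(base_url, urlcount, runtime, rows)
    # the output filename is an assumption; the real destination is not shown
    with open('sitemap.html', 'w') as sitemap:
        sitemap.write(html)

render_sitemap(base_url='https://example.com',
               crawled_urls=['https://example.com', 'https://example.com/a'],
               runtime=1.5)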
@@ -58,7 +58,7 @@ class AsyncCrawler(object):
         async with self.semaphore:
             async with self.client_session.head(url, timeout=5) as head:
                 try:
-                    data = await head.read()
+                    _ = await head.read()
                 except Exception as e:
                     print(e)
                 if 'text/html' in head.headers['Content-Type']:
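Renaming data to _ is the "variables" half of the commit message: the HEAD body is read only to drain the response, the value is never used, and _ makes that explicit. A self-contained sketch of the same check, assuming aiohttp; the example URL and concurrency limit are illustrative, and headers.get is used here where the diff indexes headers directly:

import asyncio
import aiohttp

async def is_html(session, semaphore, url):
    async with semaphore:
        async with session.head(url, timeout=5) as head:
            try:
                # HEAD responses carry no body; reading just drains the
                # connection, so the result is deliberately discarded
                _ = await head.read()
            except Exception as e:
                print(e)
            return 'text/html' in head.headers.get('Content-Type', '')

async def main():
    semaphore = asyncio.Semaphore(5)
    async with aiohttp.ClientSession() as session:
        print(await is_html(session, semaphore, 'https://example.com/'))

asyncio.run(main())

Using .get sidesteps a KeyError on responses that omit a Content-Type header, which the indexed lookup in the diff would raise.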
@@ -130,7 +130,7 @@ class AsyncCrawler(object):
         print('Crawling: {0}'.format(self.baseurl))
         while len(to_crawl) > 0:
             discovered_urls = await self.run(urls=to_crawl)
-            # empty toe crawl list and then add all newly discovered URLs for
+            # empty to_crawl list and then add all newly discovered URLs for
             # the next iteration.
             to_crawl.clear()
             to_crawl.extend(discovered_urls)
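The corrected comment describes a breadth-first frontier: each pass crawls the current to_crawl list, then the list is emptied and refilled with whatever run() newly discovered, so the loop ends once a pass finds nothing new. A minimal sketch of that pattern against a fake in-memory link graph; the graph, the run() signature and the seen-set bookkeeping are assumptions (the real run() crawls over HTTP):

import asyncio

# fake in-memory link graph standing in for real HTTP crawling
LINKS = {
    'https://example.com': ['https://example.com/a', 'https://example.com/b'],
    'https://example.com/a': ['https://example.com/b'],
    'https://example.com/b': [],
}

async def run(urls=None, seen=None):
    # report only URLs not seen before, mirroring what the class's
    # run() is assumed to do over HTTP
    discovered = []
    for url in urls:
        for link in LINKS.get(url, []):
            if link not in seen:
                seen.add(link)
                discovered.append(link)
    return discovered

async def crawl(baseurl):
    seen = {baseurl}
    to_crawl = [baseurl]
    print('Crawling: {0}'.format(baseurl))
    while len(to_crawl) > 0:
        discovered_urls = await run(urls=to_crawl, seen=seen)
        # empty to_crawl and refill it with the newly discovered URLs,
        # so each pass of the loop handles one breadth-first layer
        to_crawl.clear()
        to_crawl.extend(discovered_urls)
    return seen

print(sorted(asyncio.run(crawl('https://example.com'))))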