Compare commits

2 Commits: eb2395d461 ... 51f988e1bc

| Author | SHA1 | Date |
|---|---|---|
|  | 51f988e1bc |  |
|  | 73c21e5bd3 |  |

```diff
@@ -24,6 +24,15 @@ from utils.helpers import RobotsTxt, AsyncCrawler, standardise_url
 def sanity_checks(url=None):
     '''
     Runs some basic sanity checks before the crawler is initialised.
+
+    Accepts:
+        url: the root URL to be crawled.
+
+    Returns:
+        baseurl: a validated and cleaned version of the initial URL.
+                 (type=string)
+        robots: an object which allows us to query whether a site may be crawled.
+                (type=RobotsTxt)
     '''
     # ensure we have a sensible URL to work with
     baseurl = standardise_url(url=url)
```

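Note: the hunk above only adds documentation; the rest of the function body is outside this diff. A minimal sketch of how the documented contract could be completed and exercised, assuming `sanity_checks` returns the `(baseurl, robots)` pair the docstring describes and that `RobotsTxt` is constructed and queried the way the new tests below do:

```python
from utils.helpers import RobotsTxt, standardise_url

def sanity_checks(url=None):
    # Sketch only: the real body continues beyond this hunk.
    baseurl = standardise_url(url=url)
    robots = RobotsTxt(base_url=baseurl)  # construction assumed, mirroring the new tests
    return baseurl, robots

# Hypothetical caller, following the docstring's Accepts/Returns contract.
baseurl, robots = sanity_checks(url='eu.httpbin.org')
print(baseurl)                # e.g. 'http://eu.httpbin.org'
print(robots.check(url='/'))  # True if the root path may be crawled
```
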
```diff
@@ -41,6 +50,11 @@ def sanity_checks(url=None):
 def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
     '''
     Renders the sitemap to an HTML file.
+
+    Accepts:
+        base_url:
+        crawled_urls:
+        runtime:
     '''
     urlcount = len(crawled_urls)
     sorted_urls = sorted(crawled_urls)
```

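Only the opening lines of `render_sitemap` are visible here, and the new docstring leaves the parameter descriptions blank. Purely as illustration, one possible completion that turns the sorted URL list into an HTML file; the real template and output filename are not shown in this diff and may well differ:

```python
def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
    # Hypothetical completion of the function shown in the hunk above.
    urlcount = len(crawled_urls)
    sorted_urls = sorted(crawled_urls)
    rows = '\n'.join('<li><a href="{0}">{0}</a></li>'.format(u) for u in sorted_urls)
    html = ('<html><body><h1>Sitemap for {0}</h1>'
            '<p>{1} URLs crawled in {2:.2f}s</p>'
            '<ul>\n{3}\n</ul></body></html>').format(base_url, urlcount, runtime, rows)
    with open('sitemap.html', 'w') as f:  # output filename is an assumption
        f.write(html)
```
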
```diff
@@ -1,34 +1,54 @@
 #!/usr/bin/env python
 
 import unittest
-from utils.helpers import (sanitise_url)
+from utils.helpers import (RobotsTxt, standardise_url)
 
 
+class TestRobots(unittest.TestCase):
+
+    base_url = 'http://eu.httpbin.org'
+
+    test_paths = (('/', True), ('/deny', False))
+
+    robots = RobotsTxt(base_url=base_url)
+
+    def test_robots_txt_deny(self):
+        '''
+        Asserts result is True or False.
+        '''
+        for path, allowed in self.test_paths:
+            result = self.robots.check(url=path)
+            self.assertIs(result, allowed)
+
+
 class TestUrls(unittest.TestCase):
 
+    base_url = 'http://eu.httpbin.org'
+
     base_url_list = (('eu.httpbin.org', 'http://eu.httpbin.org'),
-                     ('www.simonweald.com', 'http://www.simonweald.com'),
-                     ('http://www.github.com/', 'http://www.github.com'),
-                     ('https://www.github.com', 'https://www.github.com'))
+                     ('http://eu.httpbin.org/', 'http://eu.httpbin.org'),
+                     ('https://eu.httpbin.org', 'https://eu.httpbin.org'))
 
-    urls_to_clean = (('https://www.github.com/', 'https://www.github.com/'),
-                     ('https://github.com/?foo=bar', 'https://github.com/'),
-                     ('https://github.com/#anchor', 'https://github.com/'))
+    urls_to_clean = (('http://eu.httpbin.org', 'http://eu.httpbin.org'),
+                     ('http://eu.httpbin.org/some/path/', 'http://eu.httpbin.org/some/path/'),
+                     ('http://eu.httpbin.org/index.html', 'http://eu.httpbin.org/index.html'),
+                     ('http://eu.httpbin.org/index.html?foo=bar', 'http://eu.httpbin.org/index.html'),
+                     ('http://eu.httpbin.org/index.html#anchor', 'http://eu.httpbin.org/index.html'))
 
-    def test_sanitise_base_url(self):
+    def test_standardise_base_url(self):
         '''
-        Tests whether a URL's protocol can be discovered if not provided.
+        Tests whether a base URL can be standardised to the format
+        proto://[sub].domain.tld.
         '''
         for url, target in self.base_url_list:
-            result = sanitise_url(url, base_url=True)
+            result = standardise_url(url)
             self.assertEqual(result, target)
 
-    def test_sanitise_url(self):
+    def test_standardise_url(self):
         '''
-        Tests whether a URL's protocol can be discovered if not provided.
+        Ensure that fragments/anchors etc are stripped.
         '''
         for url, target in self.urls_to_clean:
-            result = sanitise_url(url)
+            result = standardise_url(url, base_url=self.base_url)
             self.assertEqual(result, target)
 
-
```

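The rewritten tests pin down the behaviour expected of `standardise_url`, whose implementation lives in `utils.helpers` and is not part of this diff. A minimal sketch that would satisfy the new cases, inferred from the test data only (with no base_url it adds a scheme and drops a trailing slash; with a base_url it strips query strings and fragments):

```python
from urllib.parse import urlparse, urlunparse

def standardise_url(url, base_url=None):
    # Sketch only: behaviour inferred from the test cases above, not the
    # project's actual implementation (which might also resolve relative links).
    if base_url is None:
        # Base-URL mode: ensure a scheme and drop any trailing slash.
        if '://' not in url:
            url = 'http://' + url
        return url.rstrip('/')
    # Crawl-URL mode: keep scheme, host and path; strip query and fragment.
    parts = urlparse(url)
    return urlunparse((parts.scheme, parts.netloc, parts.path, '', '', ''))
```
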
```diff
@@ -58,7 +58,7 @@ class AsyncCrawler(object):
         async with self.semaphore:
             async with self.client_session.head(url, timeout=5) as head:
                 try:
-                    data = await head.read()
+                    _ = await head.read()
                 except Exception as e:
                     print(e)
                 if 'text/html' in head.headers['Content-Type']:
```

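The only change in this hunk is that the unused body of the HEAD response is bound to `_` instead of `data`. For context, a standalone sketch of the same pattern (bounded concurrency, HEAD request, Content-Type check), assuming `client_session` is an `aiohttp.ClientSession`, which the `async with` / `head()` / `headers` usage suggests; the URL and semaphore size are illustrative:

```python
import asyncio
import aiohttp

async def is_html(url, session, semaphore):
    # Limit concurrency, issue a HEAD request and inspect the Content-Type
    # header. A HEAD response carries no body, so the result of read() is
    # deliberately discarded, as in the hunk above.
    async with semaphore:
        async with session.head(url, timeout=aiohttp.ClientTimeout(total=5)) as head:
            try:
                _ = await head.read()
            except Exception as e:
                print(e)
            return 'text/html' in head.headers.get('Content-Type', '')

async def main():
    semaphore = asyncio.Semaphore(5)
    async with aiohttp.ClientSession() as session:
        print(await is_html('http://eu.httpbin.org/', session, semaphore))

asyncio.run(main())
```
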
```diff
@@ -130,7 +130,7 @@ class AsyncCrawler(object):
         print('Crawling: {0}'.format(self.baseurl))
         while len(to_crawl) > 0:
             discovered_urls = await self.run(urls=to_crawl)
-            # empty toe crawl list and then add all newly discovered URLs for
+            # empty to_crawl list and then add all newly discovered URLs for
             # the next iteration.
             to_crawl.clear()
             to_crawl.extend(discovered_urls)
```

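This hunk only fixes a typo in the comment (`toe crawl` becomes `to_crawl`). The loop it documents crawls in waves: each pass fetches the current frontier, then replaces it with the URLs discovered on that pass until nothing new turns up. A synchronous stand-in for the same pattern, independent of the `AsyncCrawler` internals; the `fetch_links` callable is hypothetical:

```python
def crawl_in_waves(start_urls, fetch_links):
    # fetch_links(url) is assumed to return an iterable of URLs found on that page.
    seen = set(start_urls)
    to_crawl = list(start_urls)
    while len(to_crawl) > 0:
        discovered_urls = []
        for url in to_crawl:
            for link in fetch_links(url):
                if link not in seen:
                    seen.add(link)
                    discovered_urls.append(link)
        # empty to_crawl list and then add all newly discovered URLs for
        # the next iteration.
        to_crawl.clear()
        to_crawl.extend(discovered_urls)
    return seen
```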