Compare commits


3 Commits

SHA1        Message                             Date
5f7d66912f  add test files                      2018-09-19 08:39:05 +01:00
d4cd93e3d4  update docs                         2018-09-19 08:38:49 +01:00
f5f6afd1a4  correct tests with new arg names    2018-09-19 08:37:55 +01:00
3 changed files with 20 additions and 29 deletions


@@ -13,7 +13,7 @@ pip install -r requirements.txt
 Run:
 ```bash
-python crawler.py -u https://urltocrawl.com [-c 100]
+python async_crawler.py -u https://urltocrawl.com [-c 100]
 ```
 Flags:
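The usage line above takes a start URL via -u and an optional numeric -c value; judging by the concurrency argument exercised in the test file further down, -c most plausibly caps the number of concurrent requests, though that reading is an assumption. The flag list itself sits outside the displayed hunk and async_crawler.py's argument handling is not part of this compare, so the following is only a minimal sketch of a CLI consistent with the usage shown:

```python
# Hypothetical CLI sketch; option names beyond -u and -c are assumptions,
# not taken from async_crawler.py (which this diff does not show).
import argparse

parser = argparse.ArgumentParser(description='Asynchronous site crawler')
parser.add_argument('-u', '--url', required=True,
                    help='root URL to start crawling from')
parser.add_argument('-c', '--concurrency', type=int, default=10,
                    help='maximum concurrent requests (assumed meaning of -c)')
args = parser.parse_args()
print(f'crawling {args.url} with concurrency {args.concurrency}')
```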


@@ -0,0 +1,10 @@
+<html>
+<body>
+<p>
+<ul>
+<li><a href="http://eu.httpbin.org/a/">http://eu.httpbin.org/a/</a></li>
+<li><a href="http://eu.httpbin.org/b/">http://eu.httpbin.org/b/</a></li>
+<li><a href="http://eu.httpbin.org/c/">http://eu.httpbin.org/c/</a></li>
+</ul>
+</body>
+</html>
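The new fixture above lists three links under eu.httpbin.org and looks like the kind of page the test_find_all_urls test (shown as removed further down) reads from test/files/. The project's own AsyncCrawler.find_all_urls implementation is not included in this compare, so purely as an illustration, a minimal href extractor over such a fixture might look like this (class and function names here are placeholders, and any filtering of already-crawled URLs is an assumption):

```python
# Illustration only: extracts href attributes from anchor tags in an HTML string,
# which is roughly what a find_all_urls-style helper needs to do with the fixture.
from html.parser import HTMLParser

class HrefCollector(HTMLParser):
    def __init__(self):
        super().__init__()
        self.urls = []

    def handle_starttag(self, tag, attrs):
        # Keep the href of every <a> tag encountered.
        if tag == 'a':
            self.urls.extend(value for name, value in attrs if name == 'href')

def find_all_urls(source):
    parser = HrefCollector()
    parser.feed(source)
    return parser.urls
```

Run against the fixture, this returns all three /a/, /b/ and /c/ links; the removed test additionally expects the already-crawled /a/ URL to be excluded, which the real helper presumably handles.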


@@ -2,37 +2,18 @@
 import unittest
 from unittest import mock
-from utils.helpers import AsyncCrawler, RobotsTxt, standardise_url
-class TestAsyncCrawler(unittest.TestCase):
-    base_url = 'http://eu.httpbin.org'
-    concurrency = 10
-    testcrawler = AsyncCrawler(baseurl=base_url, concurrency=concurrency)
-    expected_urls = ['http://eu.httpbin.org/b/', 'http://eu.httpbin.org/c/']
-    crawled = set()
-    crawled.add('https://eu.httpbin.org/a/')
-    @mock.patch('utils.helpers.AsyncCrawler.validate_url', response=True)
-    def test_find_all_urls(self, validate_url):
-        with open('test/files/find_all_urls.html', 'r') as f:
-            source = f.read()
-        urls = self.testcrawler.find_all_urls(source=source)
-        self.assertEqual(urls, self.expected_urls)
+from utils.helpers import RobotsTxt, standardise_url
 class TestRobots(unittest.TestCase):
-    base_url = 'http://eu.httpbin.org'
+    rooturl = 'http://eu.httpbin.org'
     no_robots = 'https://www.simonweald.com'
     test_paths = (('/', True), ('/deny', False))
-    robots = RobotsTxt(base_url=base_url)
-    norobots = RobotsTxt(base_url=no_robots)
+    robots = RobotsTxt(rooturl=rooturl)
+    norobots = RobotsTxt(rooturl=no_robots)
     def test_robots_txt_deny(self):
         '''
@@ -52,9 +33,9 @@ class TestRobots(unittest.TestCase):
 class TestUrls(unittest.TestCase):
-    base_url = 'http://eu.httpbin.org'
+    rooturl = 'http://eu.httpbin.org'
-    base_url_list = (('eu.httpbin.org', 'http://eu.httpbin.org'),
+    rooturl_list = (('eu.httpbin.org', 'http://eu.httpbin.org'),
                     ('http://eu.httpbin.org/', 'http://eu.httpbin.org'),
                     ('https://eu.httpbin.org', 'https://eu.httpbin.org'))
@@ -64,12 +45,12 @@ class TestUrls(unittest.TestCase):
                      ('http://eu.httpbin.org/index.html?foo=bar', 'http://eu.httpbin.org/index.html'),
                      ('http://eu.httpbin.org/index.html#anchor', 'http://eu.httpbin.org/index.html'))
-    def test_standardise_base_url(self):
+    def test_standardise_rooturl(self):
         '''
         Tests whether a base URL can be standardised to the format
         proto://[sub].domain.tld.
         '''
-        for url, target in self.base_url_list:
+        for url, target in self.rooturl_list:
             result = standardise_url(url)
             self.assertEqual(result, target)
@@ -78,7 +59,7 @@ class TestUrls(unittest.TestCase):
         Ensure that fragments/anchors etc are stripped.
         '''
         for url, target in self.urls_to_clean:
-            result = standardise_url(url, base_url=self.base_url)
+            result = standardise_url(url, rooturl=self.rooturl)
             self.assertEqual(result, target)
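
Taken together, the visible test data pins down what standardise_url has to do: default a bare hostname to http://, strip a trailing slash from a root URL, leave an explicit https scheme alone, and drop query strings and fragments when a rooturl keyword is supplied. The real utils.helpers implementation is not part of this compare view; the following is only a sketch of one function that would satisfy the cases shown above.

```python
# Sketch only: one possible standardise_url satisfying the visible test data;
# utils.helpers' actual implementation is not shown in this diff.
from urllib.parse import urljoin, urlsplit

def standardise_url(url, rooturl=None):
    if rooturl:
        # Resolve against the root URL, then drop query string and fragment.
        parts = urlsplit(urljoin(rooturl, url))
        return f'{parts.scheme}://{parts.netloc}{parts.path}'
    # Root URLs: assume http:// when no scheme is given, strip trailing slashes.
    if '://' not in url:
        url = 'http://' + url
    return url.rstrip('/')
```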