Compare commits

...

2 Commits

SHA1        Message                                     Date
51f988e1bc  added more tests                            2018-09-17 21:44:20 +01:00
73c21e5bd3  small improvements to docs and variables    2018-09-17 21:44:04 +01:00
3 changed files with 50 additions and 16 deletions

View File

@@ -24,6 +24,15 @@ from utils.helpers import RobotsTxt, AsyncCrawler, standardise_url
 def sanity_checks(url=None):
     '''
     Runs some basic sanity checks before the crawler is initialised.
+    Accepts:
+        url: the root URL to be crawled.
+    Returns:
+        baseurl: a validated and cleaned version of the initial URL.
+                 (type=string)
+        robots: an object which allows us to query whether a site may be crawled.
+                (type=RobotsTxt)
     '''
     # ensure we have a sensible URL to work with
     baseurl = standardise_url(url=url)
@@ -41,6 +50,11 @@ def sanity_checks(url=None):
 def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
     '''
     Renders the sitemap to an HTML file.
+    Accepts:
+        base_url:
+        crawled_urls:
+        runtime:
     '''
     urlcount = len(crawled_urls)
     sorted_urls = sorted(crawled_urls)
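The expanded docstrings above pin down the sanity_checks() contract: a root URL in, a cleaned base URL plus a RobotsTxt object out. A minimal usage sketch of that contract, assuming the helpers live in utils.helpers as the hunk header suggests; the entry point and example URL below are illustrative, not part of this change:

    # Usage sketch only; main() and the example URL are assumptions.
    from utils.helpers import RobotsTxt, standardise_url

    def main():
        # clean the root URL into a canonical base URL (string)
        baseurl = standardise_url(url='http://eu.httpbin.org')
        # object which lets us ask whether a path may be crawled
        robots = RobotsTxt(base_url=baseurl)
        if robots.check(url='/'):
            print('Crawling permitted for {0}'.format(baseurl))
        else:
            print('robots.txt denies crawling {0}'.format(baseurl))

    if __name__ == '__main__':
        main()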

View File

@@ -1,34 +1,54 @@
 #!/usr/bin/env python
 import unittest
-from utils.helpers import (sanitise_url)
+from utils.helpers import (RobotsTxt, standardise_url)
+class TestRobots(unittest.TestCase):
+    base_url = 'http://eu.httpbin.org'
+    test_paths = (('/', True), ('/deny', False))
+    robots = RobotsTxt(base_url=base_url)
+    def test_robots_txt_deny(self):
+        '''
+        Asserts result is True or False.
+        '''
+        for path, allowed in self.test_paths:
+            result = self.robots.check(url=path)
+            self.assertIs(result, allowed)
 class TestUrls(unittest.TestCase):
+    base_url = 'http://eu.httpbin.org'
     base_url_list = (('eu.httpbin.org', 'http://eu.httpbin.org'),
-                     ('www.simonweald.com', 'http://www.simonweald.com'),
-                     ('http://www.github.com/', 'http://www.github.com'),
-                     ('https://www.github.com', 'https://www.github.com'))
-    urls_to_clean = (('https://www.github.com/', 'https://www.github.com/'),
-                     ('https://github.com/?foo=bar', 'https://github.com/'),
-                     ('https://github.com/#anchor', 'https://github.com/'))
+                     ('http://eu.httpbin.org/', 'http://eu.httpbin.org'),
+                     ('https://eu.httpbin.org', 'https://eu.httpbin.org'))
+    urls_to_clean = (('http://eu.httpbin.org', 'http://eu.httpbin.org'),
+                     ('http://eu.httpbin.org/some/path/', 'http://eu.httpbin.org/some/path/'),
+                     ('http://eu.httpbin.org/index.html','http://eu.httpbin.org/index.html'),
+                     ('http://eu.httpbin.org/index.html?foo=bar', 'http://eu.httpbin.org/index.html'),
+                     ('http://eu.httpbin.org/index.html#anchor', 'http://eu.httpbin.org/index.html'))
-    def test_sanitise_base_url(self):
+    def test_standardise_base_url(self):
         '''
-        Tests whether a URL's protocol can be discovered if not provided.
+        Tests whether a base URL can be standardised to the format
+        proto://[sub].domain.tld.
         '''
         for url, target in self.base_url_list:
-            result = sanitise_url(url, base_url=True)
+            result = standardise_url(url)
             self.assertEqual(result, target)
-    def test_sanitise_url(self):
+    def test_standardise_url(self):
         '''
-        Tests whether a URL's protocol can be discovered if not provided.
+        Ensure that fragments/anchors etc are stripped.
         '''
         for url, target in self.urls_to_clean:
-            result = sanitise_url(url)
+            result = standardise_url(url, base_url=self.base_url)
             self.assertEqual(result, target)
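The new TestRobots case exercises RobotsTxt.check() against live paths on eu.httpbin.org, but the RobotsTxt implementation itself is not part of this compare view. A sketch of one shape that would satisfy the test, assuming it wraps the standard-library robotparser; the user-agent and robots.txt location are assumptions:

    # Sketch only: not the repository's actual utils.helpers implementation.
    from urllib.parse import urljoin
    from urllib.robotparser import RobotFileParser

    class RobotsTxt(object):
        def __init__(self, base_url=None):
            # fetch and parse <base_url>/robots.txt once, up front
            self.parser = RobotFileParser()
            self.parser.set_url(urljoin(base_url, '/robots.txt'))
            self.parser.read()

        def check(self, url=None):
            # True if the given path may be fetched, False otherwise
            return self.parser.can_fetch('*', url)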

View File

@@ -58,7 +58,7 @@ class AsyncCrawler(object):
         async with self.semaphore:
             async with self.client_session.head(url, timeout=5) as head:
                 try:
-                    data = await head.read()
+                    _ = await head.read()
                 except Exception as e:
                     print(e)
                 if 'text/html' in head.headers['Content-Type']:
@@ -130,7 +130,7 @@ class AsyncCrawler(object):
         print('Crawling: {0}'.format(self.baseurl))
         while len(to_crawl) > 0:
             discovered_urls = await self.run(urls=to_crawl)
-            # empty toe crawl list and then add all newly discovered URLs for
+            # empty to_crawl list and then add all newly discovered URLs for
             # the next iteration.
             to_crawl.clear()
             to_crawl.extend(discovered_urls)
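For context, the HEAD-request pattern in the first hunk above, condensed into a standalone sketch: a semaphore-limited HEAD request whose Content-Type header decides whether a URL is worth parsing for further links. The function names and concurrency limit here are illustrative, not the project's:

    # Standalone sketch of the pattern above; names and limit are assumptions.
    import asyncio

    import aiohttp

    async def is_html(session, semaphore, url):
        async with semaphore:
            async with session.head(url, timeout=5) as head:
                try:
                    _ = await head.read()
                except Exception as e:
                    print(e)
                # only text/html responses are worth crawling for links
                return 'text/html' in head.headers.get('Content-Type', '')

    async def main():
        semaphore = asyncio.Semaphore(10)
        async with aiohttp.ClientSession() as session:
            print(await is_html(session, semaphore, 'http://eu.httpbin.org/'))

    if __name__ == '__main__':
        asyncio.run(main())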