Files
web-scraper/test_helpers.py
2018-09-17 21:44:20 +01:00

57 lines
1.8 KiB
Python

#!/usr/bin/env python
import unittest
from utils.helpers import (RobotsTxt, standardise_url)
class TestRobots(unittest.TestCase):
    '''
    Checks RobotsTxt.check() against a fixed set of paths on base_url.
    '''

    base_url = 'http://eu.httpbin.org'

    # (path, expected result of RobotsTxt.check) pairs.
    # NOTE(review): presumably mirrors the robots.txt served by base_url
    # (root allowed, /deny disallowed) - confirm against the live host.
    test_paths = (('/', True), ('/deny', False))

    # Built once, when the class body is executed, and shared by all tests.
    # NOTE(review): if RobotsTxt fetches robots.txt in its constructor,
    # importing this module requires network access - confirm.
    robots = RobotsTxt(base_url=base_url)

    def test_robots_txt_deny(self):
        '''
        Asserts check() returns exactly the expected boolean (True or
        False) for every path in test_paths.
        '''
        for path, allowed in self.test_paths:
            result = self.robots.check(url=path)
            # assertIs: the result must be the bool itself, not merely a
            # truthy/falsy value. The message names the failing path.
            self.assertIs(result, allowed,
                          msg='unexpected result for path {!r}'.format(path))
class TestUrls(unittest.TestCase):
    '''
    Checks standardise_url() against fixed (input, expected output) pairs.
    '''

    base_url = 'http://eu.httpbin.org'

    # (input, expected) pairs for base URLs: a bare host is expected to gain
    # an http:// prefix, a trailing slash is expected to be dropped, and an
    # https scheme is expected to be kept.
    base_url_list = (
        ('eu.httpbin.org', 'http://eu.httpbin.org'),
        ('http://eu.httpbin.org/', 'http://eu.httpbin.org'),
        ('https://eu.httpbin.org', 'https://eu.httpbin.org'),
    )

    # (input, expected) pairs for full URLs: paths (including a trailing
    # slash on a path) are expected to be kept, while query strings and
    # fragments are expected to be removed.
    urls_to_clean = (
        ('http://eu.httpbin.org', 'http://eu.httpbin.org'),
        ('http://eu.httpbin.org/some/path/',
         'http://eu.httpbin.org/some/path/'),
        ('http://eu.httpbin.org/index.html',
         'http://eu.httpbin.org/index.html'),
        ('http://eu.httpbin.org/index.html?foo=bar',
         'http://eu.httpbin.org/index.html'),
        ('http://eu.httpbin.org/index.html#anchor',
         'http://eu.httpbin.org/index.html'),
    )

    def test_standardise_base_url(self):
        '''
        Tests whether a base URL can be standardised to the format
        proto://[sub].domain.tld.
        '''
        for url, target in self.base_url_list:
            result = standardise_url(url)
            # The message names the input so a failure identifies the case.
            self.assertEqual(result, target,
                             msg='input was {!r}'.format(url))

    def test_standardise_url(self):
        '''
        Ensure that fragments/anchors etc are stripped when a URL is
        standardised with base_url supplied.
        '''
        for url, target in self.urls_to_clean:
            result = standardise_url(url, base_url=self.base_url)
            # The message names the input so a failure identifies the case.
            self.assertEqual(result, target,
                             msg='input was {!r}'.format(url))
# Run the test cases in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()