#!/usr/bin/env python
import unittest
from unittest import mock

from utils.helpers import RobotsTxt, standardise_url


class TestRobots(unittest.TestCase):
    # NOTE: these tests talk to live endpoints, so they need network
    # access; an offline alternative is sketched at the end of this file.
    rooturl = 'http://eu.httpbin.org'
    no_robots = 'https://www.simonweald.com'
    test_paths = (('/', True), ('/deny', False))
    robots = RobotsTxt(rooturl=rooturl)
    norobots = RobotsTxt(rooturl=no_robots)

    def test_robots_txt_deny(self):
        ''' Assert each path is allowed or denied per robots.txt. '''
        for path, allowed in self.test_paths:
            result = self.robots.check(url=path)
            self.assertIs(result, allowed)

    def test_no_robots_txt(self):
        ''' Ensure we can crawl if robots.txt isn't present. '''
        result = self.norobots.check(url='/')
        self.assertTrue(result)


class TestUrls(unittest.TestCase):
    rooturl = 'http://eu.httpbin.org'
    rooturl_list = (('eu.httpbin.org', 'http://eu.httpbin.org'),
                    ('http://eu.httpbin.org/', 'http://eu.httpbin.org'),
                    ('https://eu.httpbin.org', 'https://eu.httpbin.org'))
    urls_to_clean = (('http://eu.httpbin.org', 'http://eu.httpbin.org'),
                     ('http://eu.httpbin.org/some/path/', 'http://eu.httpbin.org/some/path/'),
                     ('http://eu.httpbin.org/index.html', 'http://eu.httpbin.org/index.html'),
                     ('http://eu.httpbin.org/index.html?foo=bar', 'http://eu.httpbin.org/index.html'),
                     ('http://eu.httpbin.org/index.html#anchor', 'http://eu.httpbin.org/index.html'))

    def test_standardise_rooturl(self):
        ''' Ensure a base URL is standardised to the form
        proto://[sub].domain.tld. '''
        for url, target in self.rooturl_list:
            result = standardise_url(url)
            self.assertEqual(result, target)

    def test_standardise_url(self):
        ''' Ensure query strings and fragments/anchors are stripped. '''
        for url, target in self.urls_to_clean:
            result = standardise_url(url, rooturl=self.rooturl)
            self.assertEqual(result, target)


if __name__ == '__main__':
    unittest.main()
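
# The live-network dependency in TestRobots can be avoided without
# knowing the internals of utils.helpers. This is a minimal sketch
# (illustrative only, using the otherwise-unused 'mock' import) of how
# code that consumes RobotsTxt could be tested against an autospecced
# stand-in instead of the real class:
#
#     fake_robots = mock.create_autospec(RobotsTxt, instance=True)
#     fake_robots.check.return_value = False
#     assert fake_robots.check(url='/deny') is False
#     fake_robots.check.assert_called_once_with(url='/deny')
#
# The autospec mirrors RobotsTxt's real signatures, so any call that
# drifts from the public surface exercised above would fail loudly.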