#!/usr/bin/env python
''' Unit tests for utils.helpers: AsyncCrawler, RobotsTxt and standardise_url. '''
import unittest
from unittest import mock

from utils.helpers import AsyncCrawler, RobotsTxt, standardise_url


class TestAsyncCrawler(unittest.TestCase):
    ''' Tests for AsyncCrawler link extraction. '''
    base_url = 'http://eu.httpbin.org'
    concurrency = 10
    testcrawler = AsyncCrawler(baseurl=base_url, concurrency=concurrency)
    expected_urls = ['http://eu.httpbin.org/b/', 'http://eu.httpbin.org/c/']
    crawled = set()
    crawled.add('https://eu.httpbin.org/a/')

    # Fix: the original used ``response=True``, which is not a mock.patch
    # parameter — extra kwargs just set attributes on the created mock
    # (mock.response = True). ``return_value=True`` is what makes the
    # patched validate_url() accept every URL during extraction.
    @mock.patch('utils.helpers.AsyncCrawler.validate_url', return_value=True)
    def test_find_all_urls(self, validate_url):
        ''' Ensure the expected URLs are extracted from a sample HTML page. '''
        with open('test/files/find_all_urls.html', 'r') as f:
            source = f.read()

        urls = self.testcrawler.find_all_urls(source=source)
        self.assertEqual(urls, self.expected_urls)


class TestRobots(unittest.TestCase):
    ''' Tests for robots.txt parsing and path-permission checks. '''
    base_url = 'http://eu.httpbin.org'
    # Site known to serve no robots.txt at all.
    no_robots = 'https://www.simonweald.com'
    # (path, expected_allowed) pairs.
    test_paths = (('/', True),
                  ('/deny', False))
    robots = RobotsTxt(base_url=base_url)
    norobots = RobotsTxt(base_url=no_robots)

    def test_robots_txt_deny(self):
        ''' Asserts result is True or False. '''
        for path, allowed in self.test_paths:
            result = self.robots.check(url=path)
            self.assertIs(result, allowed)

    def test_no_robots_txt(self):
        ''' Ensure we can crawl if robots.txt isn't present. '''
        result = self.norobots.check(url='/')
        self.assertTrue(result)


class TestUrls(unittest.TestCase):
    ''' Tests for URL standardisation. '''
    base_url = 'http://eu.httpbin.org'
    # (input, expected) pairs for base-URL normalisation.
    base_url_list = (('eu.httpbin.org', 'http://eu.httpbin.org'),
                     ('http://eu.httpbin.org/', 'http://eu.httpbin.org'),
                     ('https://eu.httpbin.org', 'https://eu.httpbin.org'))
    # (input, expected) pairs — query strings and fragments must be stripped.
    urls_to_clean = (('http://eu.httpbin.org', 'http://eu.httpbin.org'),
                     ('http://eu.httpbin.org/some/path/', 'http://eu.httpbin.org/some/path/'),
                     ('http://eu.httpbin.org/index.html', 'http://eu.httpbin.org/index.html'),
                     ('http://eu.httpbin.org/index.html?foo=bar', 'http://eu.httpbin.org/index.html'),
                     ('http://eu.httpbin.org/index.html#anchor', 'http://eu.httpbin.org/index.html'))

    def test_standardise_base_url(self):
        ''' Tests whether a base URL can be standardised to the format proto://[sub].domain.tld. '''
        for url, target in self.base_url_list:
            result = standardise_url(url)
            self.assertEqual(result, target)

    def test_standardise_url(self):
        ''' Ensure that fragments/anchors etc are stripped. '''
        for url, target in self.urls_to_clean:
            result = standardise_url(url, base_url=self.base_url)
            self.assertEqual(result, target)


if __name__ == '__main__':
    unittest.main()