#!/usr/bin/env python
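'''
Unit tests for the AsyncCrawler, RobotsTxt and standardise_url helpers
provided by utils.helpers.
'''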

import unittest
from unittest import mock

from utils.helpers import AsyncCrawler, RobotsTxt, standardise_url


class TestAsyncCrawler(unittest.TestCase):

    base_url = 'http://eu.httpbin.org'
    concurrency = 10
    testcrawler = AsyncCrawler(baseurl=base_url, concurrency=concurrency)
    expected_urls = ['http://eu.httpbin.org/b/', 'http://eu.httpbin.org/c/']
    crawled = {'https://eu.httpbin.org/a/'}

    # Patch validate_url to always return True so this test exercises
    # URL extraction in isolation.
    @mock.patch('utils.helpers.AsyncCrawler.validate_url', return_value=True)
    def test_find_all_urls(self, validate_url):
        '''
        Ensure all expected URLs are extracted from the page source.
        '''
        with open('test/files/find_all_urls.html', 'r') as f:
            source = f.read()

        urls = self.testcrawler.find_all_urls(source=source)
        self.assertEqual(urls, self.expected_urls)
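
# Usage sketch for find_all_urls, inferred from the fixture expectations
# above (an assumption about utils.helpers, not its documented API):
#
#   crawler = AsyncCrawler(baseurl='http://eu.httpbin.org', concurrency=10)
#   crawler.find_all_urls(source='<a href="/b/"></a><a href="/c/"></a>')
#   # -> ['http://eu.httpbin.org/b/', 'http://eu.httpbin.org/c/']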


class TestRobots(unittest.TestCase):

    base_url = 'http://eu.httpbin.org'
    no_robots = 'https://www.simonweald.com'

    # (path, expected result) pairs: httpbin's robots.txt denies '/deny'.
    test_paths = (('/', True), ('/deny', False))

    robots = RobotsTxt(base_url=base_url)
    norobots = RobotsTxt(base_url=no_robots)

    def test_robots_txt_deny(self):
        '''
        Assert that check() honours robots.txt: allowed paths return
        True and denied paths return False.
        '''
        for path, allowed in self.test_paths:
            result = self.robots.check(url=path)
            self.assertIs(result, allowed)

    def test_no_robots_txt(self):
        '''
        Ensure we can crawl if robots.txt isn't present.
        '''
        result = self.norobots.check(url='/')
        self.assertTrue(result)
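
# Usage sketch for RobotsTxt.check, inferred from the tests above (an
# assumption, not the documented API): check() takes a path under base_url
# and returns True when crawling is allowed, False when it is denied.
#
#   robots = RobotsTxt(base_url='http://eu.httpbin.org')
#   robots.check(url='/')      # -> True
#   robots.check(url='/deny')  # -> False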


class TestUrls(unittest.TestCase):

    base_url = 'http://eu.httpbin.org'

    # (input, expected) pairs for standardising a bare base URL.
    base_url_list = (('eu.httpbin.org', 'http://eu.httpbin.org'),
                     ('http://eu.httpbin.org/', 'http://eu.httpbin.org'),
                     ('https://eu.httpbin.org', 'https://eu.httpbin.org'))

    # (input, expected) pairs: query strings and fragments are stripped.
    urls_to_clean = (('http://eu.httpbin.org', 'http://eu.httpbin.org'),
                     ('http://eu.httpbin.org/some/path/', 'http://eu.httpbin.org/some/path/'),
                     ('http://eu.httpbin.org/index.html', 'http://eu.httpbin.org/index.html'),
                     ('http://eu.httpbin.org/index.html?foo=bar', 'http://eu.httpbin.org/index.html'),
                     ('http://eu.httpbin.org/index.html#anchor', 'http://eu.httpbin.org/index.html'))

    def test_standardise_base_url(self):
        '''
        Tests whether a base URL can be standardised to the format
        proto://[sub].domain.tld.
        '''
        for url, target in self.base_url_list:
            result = standardise_url(url)
            self.assertEqual(result, target)

    def test_standardise_url(self):
        '''
        Ensure that fragments, anchors and query strings are stripped.
        '''
        for url, target in self.urls_to_clean:
            result = standardise_url(url, base_url=self.base_url)
            self.assertEqual(result, target)
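
# Usage sketch for standardise_url, inferred from the test data above (an
# assumption, not the documented API):
#
#   standardise_url('eu.httpbin.org')
#   # -> 'http://eu.httpbin.org' (scheme added, trailing slash dropped)
#   standardise_url('http://eu.httpbin.org/index.html?foo=bar',
#                   base_url='http://eu.httpbin.org')
#   # -> 'http://eu.httpbin.org/index.html' (query string stripped)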


if __name__ == '__main__':
    unittest.main()