# web-scraper/test_helpers.py
# (68 lines, 2.1 KiB, Python)
#!/usr/bin/env python
import unittest
from unittest import mock
from utils.helpers import RobotsTxt, standardise_url
class TestRobots(unittest.TestCase):
    """Tests for RobotsTxt allow/deny decisions against live sites."""

    rooturl = 'http://eu.httpbin.org'
    # Site known to serve no robots.txt at all.
    no_robots = 'https://www.simonweald.com'
    # (path, expected verdict) pairs for the root site's robots.txt.
    test_paths = (('/', True), ('/deny', False))

    @classmethod
    def setUpClass(cls):
        # Construct the RobotsTxt helpers once per test class instead of in
        # the class body: building them fetches robots.txt over the network,
        # and doing that at module-import time would break test discovery
        # whenever the network is unavailable.
        cls.robots = RobotsTxt(rooturl=cls.rooturl)
        cls.norobots = RobotsTxt(rooturl=cls.no_robots)

    def test_robots_txt_deny(self):
        '''
        Asserts result is True or False.
        '''
        for path, allowed in self.test_paths:
            # subTest: report every failing path instead of stopping at the
            # first failed assertion.
            with self.subTest(path=path):
                result = self.robots.check(url=path)
                self.assertIs(result, allowed)

    def test_no_robots_txt(self):
        '''
        Ensure we can crawl if robots.txt isn't present.
        '''
        result = self.norobots.check(url='/')
        self.assertTrue(result)
class TestUrls(unittest.TestCase):
    """Tests for standardise_url normalisation of root and page URLs."""

    rooturl = 'http://eu.httpbin.org'
    # (input, expected) pairs: bare host, trailing slash, and https scheme
    # should all normalise to proto://host with no trailing slash.
    rooturl_list = (('eu.httpbin.org', 'http://eu.httpbin.org'),
                    ('http://eu.httpbin.org/', 'http://eu.httpbin.org'),
                    ('https://eu.httpbin.org', 'https://eu.httpbin.org'))
    # (input, expected) pairs: paths survive, query strings and fragments
    # are stripped.
    urls_to_clean = (('http://eu.httpbin.org', 'http://eu.httpbin.org'),
                     ('http://eu.httpbin.org/some/path/', 'http://eu.httpbin.org/some/path/'),
                     ('http://eu.httpbin.org/index.html', 'http://eu.httpbin.org/index.html'),
                     ('http://eu.httpbin.org/index.html?foo=bar', 'http://eu.httpbin.org/index.html'),
                     ('http://eu.httpbin.org/index.html#anchor', 'http://eu.httpbin.org/index.html'))

    def test_standardise_rooturl(self):
        '''
        Tests whether a base URL can be standardised to the format
        proto://[sub].domain.tld.
        '''
        for url, target in self.rooturl_list:
            # subTest: report every failing URL instead of stopping at the
            # first failed assertion.
            with self.subTest(url=url):
                result = standardise_url(url)
                self.assertEqual(result, target)

    def test_standardise_url(self):
        '''
        Ensure that fragments/anchors etc are stripped.
        '''
        for url, target in self.urls_to_clean:
            with self.subTest(url=url):
                result = standardise_url(url, rooturl=self.rooturl)
                self.assertEqual(result, target)
# Allow running this test module directly: `python test_helpers.py`.
if __name__ == '__main__':
    unittest.main()