From 25f8c4c68601d05cc2b9e21554a48d5783033a61 Mon Sep 17 00:00:00 2001
From: Simon Weald
Date: Tue, 28 Aug 2018 17:22:52 +0100
Subject: [PATCH] remove testing url with requests and assume that the user is
 correct

---
 test_helpers.py  | 29 ++++++++++++++++-------------
 utils/helpers.py | 46 ++++++++++++++++++++--------------------------
 2 files changed, 36 insertions(+), 39 deletions(-)

diff --git a/test_helpers.py b/test_helpers.py
index 670170d..1aacdee 100644
--- a/test_helpers.py
+++ b/test_helpers.py
@@ -1,34 +1,37 @@
 #!/usr/bin/env python
 
 import unittest
 
 from utils.helpers import (url_validation, standardise_base_url)
-from utils.helpers import (url_validation, standardise_base_url)
+from utils.helpers import (clean_base_url)
 
 
 class TestUrls(unittest.TestCase):
     base_url = "github.com"
+
     base_url_list = (('eu.httpbin.org', 'http://eu.httpbin.org'),
-                     ('www.simonweald.com', 'https://www.simonweald.com'),
-                     ('http://www.github.com', 'http://www.github.com'))
+                     ('www.simonweald.com', 'http://www.simonweald.com'),
+                     ('http://www.github.com/', 'http://www.github.com'),
+                     ('https://www.github.com', 'https://www.github.com'))
+
     valid_urls = ["https://www.github.com", "http://www.github.com",
                   "github.com", "/some/url/", "index.html"]
 
-    def test_url_standardisation(self):
+    def test_clean_base_url(self):
         '''
         Tests whether a URL's protocol can be discovered
         if not provided.
         '''
         for url, target in self.base_url_list:
-            result = standardise_base_url(url)
+            result = clean_base_url(url)
             self.assertEqual(result, target)
 
-    def test_url_validation(self):
-        '''
-        Passes when given a valid URL. A valid URL is qualified
-        by being local to the domain to be crawled.
-        '''
-        for url in self.valid_urls:
-            result = url_validation(self.base_url, url)
-            self.assertTrue(result)
+    # def test_url_validation(self):
+    #     '''
+    #     Passes when given a valid URL. A valid URL is qualified
+    #     by being local to the domain to be crawled.
+    #     '''
+    #     for url in self.valid_urls:
+    #         result = url_validation(self.base_url, url)
+    #         self.assertTrue(result)
 
 if __name__ == '__main__':
diff --git a/utils/helpers.py b/utils/helpers.py
index bd38305..593494d 100644
--- a/utils/helpers.py
+++ b/utils/helpers.py
@@ -1,44 +1,38 @@
 #!/usr/bin/env python
-
-import re
-import requests
+'''
+Utilities to provide various misc functions.
+'''
 
 
-def standardise_base_url(url):
+def clean_base_url(url):
     '''
     Standardise the URL to be scraped to ensure
     it is added to relative URLs in a consistent
     manner.
     '''
-    match_protocol = r'http(s?)\:\/\/'
+    protocol = 'http://'
 
-    if re.match(match_protocol, url):
+    if url.startswith('http'):
         base_url = url
     else:
-        http_url = 'http://{0}'.format(url)
-        https_url = 'https://{0}'.format(url)
-        # attempt to discover which protocol is being used.
-        try:
-            result = requests.get(http_url)
-            if result.url.startswith('http'):
-                base_url = http_url
-            if result.url.startswith('https'):
-                base_url = https_url
-        except requests.exceptions.RequestException as e:
-            base_url = https_url
+        # otherwise assume HTTP as any sane site should upgrade
+        # to HTTPS via a 301 redirect.
+        base_url = "".join([protocol, url])
 
+    # strip the trailing slash to allow us to append
+    # relative URLs.
     if base_url.endswith('/'):
        base_url = base_url[:-1]
 
     return base_url
 
 
-def get_url_validation(base_url=None, url=None):
-    '''
-    Checks if a URL is valid. Can be absolute or relative.
-    '''
+# def get_url_validation(base_url=None, url=None):
+#     '''
+#     Checks if a URL is valid. Can be absolute or relative.
+#     '''
 
-    if url.startswith('/'):
-        full_url = '{0}{1}'.format(base_url, url)
-    if url.startswith(ffbase_url):
-        full_url = url
-    elif url.startswith('/'):
+#     if url.startswith('/'):
+#         full_url = '{0}{1}'.format(base_url, url)
+#     if url.startswith(ffbase_url):
+#         full_url = url
+#     elif url.startswith('/'):
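
---

For reviewers, a minimal sketch of the new contract, assuming the patched
utils.helpers module shown above is importable. The expected values are
lifted directly from base_url_list in the updated test.

    from utils.helpers import clean_base_url

    # a bare hostname is assumed to be HTTP; a site serving HTTPS is
    # expected to upgrade the request itself via a 301 redirect.
    assert clean_base_url('eu.httpbin.org') == 'http://eu.httpbin.org'

    # an explicit scheme is passed through untouched.
    assert clean_base_url('https://www.github.com') == 'https://www.github.com'

    # a trailing slash is stripped so relative URLs can be appended cleanly.
    assert clean_base_url('http://www.github.com/') == 'http://www.github.com'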