#!/usr/bin/env python
#
# utils/helpers.py -- recovered from a git patch whose line breaks were lost.
# Patch metadata: commit 79b10798a35c666de38e12cb6e25864df3299ab7,
# author Simon Weald, dated Mon, 27 Aug 2018 19:37:41 +0100,
# subject "[PATCH] initial commit of utils".

import re


def standardise_base_url(url, timeout=10):
    '''
    Standardise the URL to be scraped to ensure it
    is added to relative URLs in a consistent manner.

    If ``url`` already starts with ``http://`` or ``https://`` (matched
    case-insensitively) it is used as given. Otherwise a GET request is
    sent to ``http://<url>`` and the scheme of the final response URL
    decides which prefix is applied; if that request fails for any
    reason, ``https://`` is assumed.

    :param url: base URL or bare host, with or without a scheme.
    :param timeout: seconds to wait for the probe request before
        falling back to ``https://``.
    :returns: the URL with a scheme and without trailing slashes.
    '''
    if re.match(r'https?://', url, re.IGNORECASE):
        base_url = url
    else:
        # Imported lazily: only the probing path needs the HTTP client.
        import requests

        http_url = 'http://{0}'.format(url)
        https_url = 'https://{0}'.format(url)
        # Attempt to discover which protocol is being used.
        try:
            result = requests.get(http_url, timeout=timeout)
        except requests.exceptions.RequestException:
            # Unreachable over plain HTTP (or timed out): assume HTTPS.
            base_url = https_url
        else:
            # result.url is the URL that produced the response, so an
            # https scheme here means the host redirected us to HTTPS.
            if result.url.lower().startswith('https://'):
                base_url = https_url
            else:
                base_url = http_url

    # Drop every trailing slash so that joining with a relative path
    # beginning with '/' never yields a doubled slash.
    return base_url.rstrip('/')


# NOTE(review): the patch also began a second helper, get_url_validation, but
# its text is cut off mid-statement in the recovered patch, so it cannot be
# reconstructed here. The surviving fragment, verbatim:
#
#   def get_url_validation(base_url=None, url=None):
#       ''' Checks if a URL is valid. Can be absolute or relative. '''
#       if url.startswith('/'): full_url = '{0}{1}'.format(base_url, url)
#       if url.startswith(ffbase_url): full_url = url
#       elif url.startswith('/'):