import re


def standardise_base_url(url, timeout=10):
    '''
    Standardise the URL to be scraped to ensure it
    is added to relative URLs in a consistent manner.

    If ``url`` already starts with ``http://`` or ``https://`` (matched
    case-insensitively) it is kept as given. Otherwise a GET request is
    sent to the plain-http form of the URL and the URL reported on the
    response decides which protocol prefix is used; if that request
    fails for any reason, ``https://`` is assumed. A single trailing
    ``/`` is removed from the result.

    :param url: absolute URL, or a host/path given without a protocol.
    :param timeout: seconds to wait for the probe request before giving
        up and falling back to ``https://``.
    :returns: the URL with a protocol prefix and no trailing slash.
    '''
    # URL schemes are case-insensitive, so 'HTTPS://host' must count as
    # already having a protocol rather than being prefixed a second time.
    if re.match(r'https?://', url, re.IGNORECASE):
        base_url = url
    else:
        # Imported lazily: only this probing branch needs the HTTP client.
        import requests

        http_url = 'http://{0}'.format(url)
        https_url = 'https://{0}'.format(url)
        # Attempt to discover which protocol is being used.
        try:
            # Without a timeout an unresponsive host would block forever.
            result = requests.get(http_url, timeout=timeout)
        except requests.exceptions.RequestException:
            # Plain http could not be reached (or timed out): assume https.
            base_url = https_url
        else:
            # The response URL shows whether the server moved us to https.
            if result.url.startswith('https'):
                base_url = https_url
            else:
                base_url = http_url

    if base_url.endswith('/'):
        base_url = base_url[:-1]

    return base_url