#!/usr/bin/env python
"""Helpers for normalising base URLs before scraping."""
import re


def standardise_base_url(url):
    """
    Standardise the URL to be scraped so it can be prepended to
    relative URLs in a consistent manner.

    If *url* already carries an ``http://`` or ``https://`` scheme it is
    used as-is; otherwise the host is probed over plain HTTP and the
    final (post-redirect) URL decides which scheme the site serves.
    Any trailing slash is stripped so callers can safely append paths
    that start with ``/``.

    :param url: host name or full URL, e.g. ``example.com`` or
        ``https://example.com/``.
    :return: the base URL with a scheme and no trailing slash.
    """
    if re.match(r'https?://', url):
        base_url = url
    else:
        # The import is local so the module stays importable (and the
        # scheme-already-present fast path stays usable) when `requests`
        # is not installed.
        import requests

        http_url = 'http://{0}'.format(url)
        https_url = 'https://{0}'.format(url)
        try:
            # Probe over HTTP and follow redirects; the final URL tells
            # us which scheme the site actually serves. A timeout keeps
            # the probe from hanging forever on a dead host.
            result = requests.get(http_url, timeout=10)
            if result.url.startswith('https'):
                base_url = https_url
            else:
                base_url = http_url
        except requests.exceptions.RequestException:
            # Host unreachable over plain HTTP — fall back to HTTPS.
            base_url = https_url
    # Drop a single trailing slash so '{base}{/path}' joins cleanly.
    if base_url.endswith('/'):
        base_url = base_url[:-1]
    return base_url


def get_url_validation(base_url=None, url=None):
    """
    Resolve *url* (absolute or site-relative) against *base_url* and
    return the resulting absolute URL.

    NOTE(review): the original source was truncated mid-function; the
    branches below are reconstructed from the visible fragments
    (``ffbase_url`` was clearly a typo for ``base_url``). Confirm the
    intended behaviour — and the return value — against the upstream
    file.

    :param base_url: standardised base URL (no trailing slash).
    :param url: URL to validate; may be absolute or start with ``/``.
    :return: the absolute URL.
    """
    if url.startswith(base_url):
        # Already absolute and on the scraped site.
        full_url = url
    elif url.startswith('/'):
        # Site-relative path: join onto the standardised base URL.
        full_url = '{0}{1}'.format(base_url, url)
    else:
        # Presumably a bare path relative to the site root — TODO confirm.
        full_url = '{0}/{1}'.format(base_url, url)
    return full_url