#!/usr/bin/env python
'''
Utilities to provide various misc functions.
'''


def clean_base_url(url):
    '''
    Standardise the URL to be scraped so that it can be prepended to
    relative URLs in a consistent manner.
    '''
    protocol = 'http://'
    if url.startswith('http'):
        base_url = url
    else:
        # Otherwise assume HTTP, as any sane site should upgrade
        # to HTTPS via a 301 redirect.
        base_url = "".join([protocol, url])
    # Strip the trailing slash so relative URLs can be appended
    # without producing a double slash.
    if base_url.endswith('/'):
        base_url = base_url[:-1]
    return base_url


# Incomplete draft, kept commented out: it builds full_url but never
# performs any validation and has no return value.
# def get_url_validation(base_url=None, url=None):
#     '''
#     Checks if a URL is valid. Can be absolute or relative.
#     '''
#     if url.startswith(base_url):
#         full_url = url
#     elif url.startswith('/'):
#         full_url = '{0}{1}'.format(base_url, url)
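

# The commented-out stub above stops before doing any validation. Below is
# a minimal sketch of one way it could be completed; treating "valid" as
# "the resolved URL has a scheme and a host" is an assumption, not the
# original author's confirmed intent, and urllib.parse assumes Python 3.
from urllib.parse import urlparse


def get_url_validation(base_url=None, url=None):
    '''
    Check whether a URL is valid. Relative URLs (leading slash) are
    resolved against base_url before checking.
    '''
    if url.startswith(base_url):
        full_url = url
    elif url.startswith('/'):
        full_url = '{0}{1}'.format(base_url, url)
    else:
        full_url = url
    parsed = urlparse(full_url)
    # Treat the URL as valid when it has both a scheme and a network
    # location (host); this rule is an assumed definition of "valid".
    return bool(parsed.scheme) and bool(parsed.netloc)


# Example usage (assumed, following the behaviour of clean_base_url above):
#   base = clean_base_url('example.com/')   # -> 'http://example.com'
#   get_url_validation(base, '/about')      # -> True
#   get_url_validation(base, 'not a url')   # -> False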