# web-scraper/utils/helpers.py

#!/usr/bin/env python
'''
Miscellaneous helper functions.
'''


def clean_base_url(url):
    '''
    Standardise the URL to be scraped so that relative URLs
    can be appended to it in a consistent manner.

    If ``url`` does not begin with an explicit ``http://`` or
    ``https://`` scheme (compared case-insensitively), ``http://``
    is prepended. A single trailing slash, if present, is then
    removed.

    Args:
        url: The site URL or bare host as a string, e.g.
            ``'example.com'`` or ``'https://example.com/'``.

    Returns:
        The URL with a scheme and without one trailing slash.
    '''
    schemes = ('http://', 'https://')

    # Match the complete scheme rather than just the 'http' prefix:
    # a bare host such as 'httpbin.org' starts with 'http' but still
    # needs a protocol. Lower-case only for the comparison so the
    # caller's original casing is preserved in the result.
    if url.lower().startswith(schemes):
        base_url = url
    else:
        # otherwise assume HTTP as any sane site should upgrade
        # to HTTPS via a 301 redirect.
        base_url = 'http://' + url

    # strip the trailing slash to allow us to append
    # relative URLs.
    if base_url.endswith('/'):
        base_url = base_url[:-1]

    return base_url


# def get_url_validation(base_url=None, url=None):
#     '''
#     Checks if a URL is valid. Can be absolute or relative.
#     '''

#     if url.startswith('/'):
#         full_url = '{0}{1}'.format(base_url, url)
#     if url.startswith(base_url):
#         full_url = url
#     elif url.startswith('/'):