# web-scraper/utils/helpers.py

#!/usr/bin/env python
'''
Miscellaneous helper utilities for the web scraper: tracking a pool
of URLs and normalising the base URL to be scraped.
'''

class UrlPoolManager(object):
    '''
    Object to manage the lifecycle of a pool of URLs.

    The pool is held in a set, so each distinct URL is stored at
    most once.
    '''

    def __init__(self):
        # Every URL currently tracked by this manager.
        self.url_pool = set()

    def check_duplicate(self, new_url):
        '''
        Checks if a URL exists in the current pool.

        Returns True when new_url is already in the pool, otherwise
        False.
        '''
        return new_url in self.url_pool

    def invalidate_url(self, url):
        '''
        Removes a URL from the pool.

        A URL that is not in the pool is ignored instead of raising
        KeyError, so invalidating the same URL twice (or one that was
        never added) is harmless.
        '''
        self.url_pool.discard(url)

    def add_to_list(self, url):
        '''
        Adds a URL to the pool.

        Adding a URL that is already present leaves the pool unchanged.
        '''
        self.url_pool.add(url)


def clean_base_url(url):
    '''
    Standardise the URL to be scraped to ensure it
    is added to relative URLs in a consistent manner.

    The returned URL starts with an HTTP(S) scheme and has one
    trailing slash removed, so a relative path such as '/about' can
    be appended to it directly.
    '''
    default_scheme = 'http://'

    # Match the complete scheme prefix, ignoring case. Testing for a
    # bare 'http' prefix would treat a scheme-less host such as
    # 'httpbin.org' as already having a scheme, and a case-sensitive
    # test would miss 'HTTP://example.com'.
    if url.lower().startswith(('http://', 'https://')):
        base_url = url
    else:
        # otherwise assume HTTP as any sane site should upgrade
        # to HTTPS via a 301 redirect.
        base_url = default_scheme + url

    # strip the trailing slash to allow us to append
    # relative URLs.
    if base_url.endswith('/'):
        base_url = base_url[:-1]

    return base_url


# def get_url_validation(base_url=None, url=None):
#     '''
#     Checks if a URL is valid. Can be absolute or relative.
#     '''

#     if url.startswith('/'):
#         full_url = '{0}{1}'.format(base_url, url)
#     if url.startswith(ffbase_url):
#         full_url = url
#     elif url.startswith('/'):