# web-scraper/utils/helpers.py

#!/usr/bin/env python
'''
Utilities to provide various misc functions.
'''

from urllib.parse import urljoin

class UrlPoolManager:
    '''
    Keeps track of a pool of URLs, stored as a set, and offers
    helpers to query, add and remove entries.
    '''

    def __init__(self):
        # The pool starts out empty; a set keeps entries unique.
        self.url_pool = set()

    def check_duplicate(self, new_url):
        '''
        Return True if ``new_url`` is already in the pool, else False.
        '''
        return new_url in self.url_pool

    def invalidate_url(self, url):
        '''
        Remove ``url`` from the pool.

        Raises KeyError if ``url`` is not currently in the pool.
        '''
        self.url_pool.remove(url)

    def add_to_list(self, url):
        '''
        Add ``url`` to the pool; adding an existing URL is a no-op.
        '''
        self.url_pool.add(url)


def clean_base_url(url):
    '''
    Standardise the URL to be scraped to ensure it
    is added to relative URLs in a consistent manner.

    If ``url`` already begins with an ``http://`` or ``https://``
    scheme (matched case-insensitively) it is returned unchanged.
    Otherwise ``http://`` is prepended.
    '''
    default_scheme = 'http://'

    # Match the complete scheme prefix rather than just 'http', so that
    # bare hosts whose name begins with "http" (e.g. 'httpbin.org') still
    # get a scheme. Lower-casing makes 'HTTP://...' count as well.
    if url.lower().startswith(('http://', 'https://')):
        return url

    # Otherwise assume HTTP as any sane site should upgrade
    # to HTTPS via a 301 redirect.
    return default_scheme + url


def get_url_validation(base_url=None, url=None):
    '''
    Ensure any URLs discovered are absolute. If relative (starting
    with '/'), they will be joined onto the base URL.

    Returns:
        - the result of ``urljoin(base_url, url)`` when ``url`` starts
          with '/';
        - ``url`` unchanged when it starts with ``base_url``;
        - ``None`` in every other case, including when ``url`` is None.

    NOTE(review): URLs that return None are presumably meant to be
    skipped by the caller (e.g. off-site links) -- confirm against
    the call sites.
    '''
    # Both parameters default to None; with no URL there is nothing to
    # validate, so return None rather than failing on None.startswith.
    if url is None:
        return None

    if url.startswith('/'):
        return urljoin(base_url, url)

    # str.startswith(None) raises TypeError, so only compare against
    # the base when one was actually supplied.
    if base_url is not None and url.startswith(base_url):
        return url

    return None