#!/usr/bin/env python
'''
Utilities to provide various misc functions.
'''


class UrlPoolManager(object):
    '''
    Object to manage the lifecycle of a pool of URLs.
    '''

    def __init__(self):
        # Maps each URL to its crawl status.
        self.url_pool = dict()
        # Status values stored against each URL in url_pool.
        self.not_crawled = 0
        self.crawled = 1
        self.invalid = 2

    def check_duplicate(self, new_url):
        '''
        Return True if new_url is already present in the pool.
        '''
        return new_url in self.url_pool

    def invalidate_url(self, url):
        self.url_pool[url] = self.invalid

    def add_to_list(self, url):
        self.url_pool[url] = self.not_crawled
        # TODO: calculate depth
        # TODO: add link, crawled status to url_pool

    def mark_as_crawled(self, url):
        self.url_pool[url] = self.crawled
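

# A minimal usage sketch of UrlPoolManager, assuming the crawler drives
# it roughly like this ('http://example.com/page' is a placeholder URL):
#
#     pool = UrlPoolManager()
#     pool.add_to_list('http://example.com/page')
#     pool.check_duplicate('http://example.com/page')   # -> True
#     pool.mark_as_crawled('http://example.com/page')
#     pool.invalidate_url('http://example.com/page')    # e.g. after a 404

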
def clean_base_url(url):
    '''
    Standardise the URL to be scraped to ensure it
    is added to relative URLs in a consistent manner.
    '''
    protocol = 'http://'

    if url.startswith('http'):
        base_url = url
    else:
        # otherwise assume HTTP as any sane site should upgrade
        # to HTTPS via a 301 redirect.
        base_url = "".join([protocol, url])

    # strip the trailing slash to allow us to append
    # relative URLs.
    if base_url.endswith('/'):
        base_url = base_url[:-1]

    return base_url
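

# Illustrative behaviour of clean_base_url ('example.com' is a
# placeholder domain):
#
#     clean_base_url('example.com/')         # -> 'http://example.com'
#     clean_base_url('https://example.com')  # -> 'https://example.com'

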
# def get_url_validation(base_url=None, url=None):
#     '''
#     Checks if a URL is valid. Can be absolute or relative.
#     '''

#     if url.startswith('/'):
#         full_url = '{0}{1}'.format(base_url, url)
#     if url.startswith(base_url):
#         full_url = url
#     elif url.startswith('/'):