#!/usr/bin/env python
'''
Utilities to provide various misc functions.
'''


class UrlPoolManager(object):
    '''
    Object to manage the lifecycle of a pool of URLs.
    '''

    def __init__(self):
        self.url_pool = set()

    def check_duplicate(self, new_url):
        '''
        Checks if a URL exists in the current pool.
        '''
        return new_url in self.url_pool

    def invalidate_url(self, url):
        '''
        Removes a URL from the pool.
        '''
        self.url_pool.remove(url)

    def add_to_list(self, url):
        '''
        Adds a URL to the pool.
        '''
        self.url_pool.add(url)


def clean_base_url(url):
    '''
    Standardise the URL to be scraped to ensure it is added to
    relative URLs in a consistent manner.
    '''
    protocol = 'http://'
    if url.startswith('http'):
        base_url = url
    else:
        # Otherwise assume HTTP, as any sane site should upgrade
        # to HTTPS via a 301 redirect.
        base_url = "".join([protocol, url])
    # Strip the trailing slash to allow us to append
    # relative URLs.
    if base_url.endswith('/'):
        base_url = base_url[:-1]
    return base_url


def get_url_validation(base_url=None, url=None):
    '''
    Checks if a URL is valid. Can be absolute or relative.
    Returns the absolute form of the URL, or None when it is
    neither absolute under base_url nor relative.
    '''
    if url.startswith(base_url):
        # Already an absolute URL under the base.
        full_url = url
    elif url.startswith('/'):
        # Relative URL; prepend the cleaned base.
        full_url = '{0}{1}'.format(base_url, url)
    else:
        full_url = None
    return full_url
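

# A minimal usage sketch of the helpers above: the example URL
# 'example.com/' is hypothetical and only illustrates the intended
# call pattern. The __main__ guard keeps the demo out of the way
# when the module is imported.
if __name__ == '__main__':
    base_url = clean_base_url('example.com/')
    print(base_url)  # 'http://example.com' - protocol added, slash stripped

    pool = UrlPoolManager()
    pool.add_to_list(base_url)
    print(pool.check_duplicate(base_url))  # True - URL is in the pool
    pool.invalidate_url(base_url)
    print(pool.check_duplicate(base_url))  # False - URL was removed

    # Resolve a relative link against the cleaned base.
    print(get_url_validation(base_url=base_url, url='/about'))
    # 'http://example.com/about'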