#!/usr/bin/env python3
'''
Utilities to provide various misc functions.
'''
from urllib.parse import urljoin


class UrlPoolManager:
    '''
    Object to manage the lifecycle of a pool of URLs.
    '''

    def __init__(self):
        self.url_pool = set()

    def check_duplicate(self, new_url):
        '''
        Check whether a URL already exists in the current pool.
        '''
        return new_url in self.url_pool

    def invalidate_url(self, url):
        '''
        Remove a URL from the pool.
        '''
        self.url_pool.remove(url)

    def add_to_list(self, url):
        '''
        Add a URL to the pool.
        '''
        self.url_pool.add(url)


def clean_base_url(url):
    '''
    Standardise the base URL to be scraped so that it can be joined
    to relative URLs in a consistent manner.
    '''
    protocol = 'http://'
    if url.startswith('http'):
        return url
    # Otherwise assume HTTP, as any sane site should upgrade
    # to HTTPS via a 301 redirect.
    return ''.join([protocol, url])


def get_url_validation(base_url=None, url=None):
    '''
    Ensure any URLs discovered are absolute. Relative URLs are
    appended to the base URL; URLs that do not fall under the base
    URL are discarded (None is returned).
    '''
    if url.startswith('/'):
        return urljoin(base_url, url)
    if url.startswith(base_url):
        return url
    # External or malformed URLs fall through and are rejected.
    return None
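

if __name__ == '__main__':
    # Minimal usage sketch (an assumed demo, not part of the original
    # module): normalise a bare hostname, resolve a relative link
    # against it, and record the result in the pool if it is new.
    pool = UrlPoolManager()
    base = clean_base_url('example.com')          # -> 'http://example.com'
    link = get_url_validation(base_url=base, url='/about')
    if link and not pool.check_duplicate(link):
        pool.add_to_list(link)
    print(link)  # http://example.com/about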