#!/usr/bin/env python3
'''
Utilities to provide various miscellaneous functions.
'''

import urllib.request
from urllib.parse import urljoin, urlsplit

from bs4 import BeautifulSoup


class UrlPool(object):
    '''
    Object to manage a pool of URLs.
    '''

    def __init__(self):
        self.url_pool = set()

    def check_duplicate(self, new_url):
        '''
        Check whether a URL already exists in the current pool.
        '''
        return new_url in self.url_pool

    def remove_from_pool(self, url):
        self.url_pool.remove(url)

    def add_to_pool(self, url):
        self.url_pool.add(url)


class WebPage(object):
    '''
    Object to manage the common operations required to return the data
    from each individual page.
    '''

    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) '
                             'Gecko/20100101 Firefox/61.0'}

    def __init__(self, url):
        self.url = url
        # Initialise the attributes populated by get_source(),
        # find_links() and parse_urls() so run() can safely return
        # even if an earlier step fails.
        self.source = None
        self.discovered_hrefs = set()
        self.urls_to_crawl = set()

    def get_source(self):
        '''
        Retrieve a page's source.
        '''
        request = urllib.request.Request(self.url, headers=self.headers)
        with urllib.request.urlopen(request) as page:
            self.source = page.read()

    def find_links(self):
        '''
        Find all URLs on a page and ensure they are absolute. Relative
        links are joined onto the base URL.
        '''
        hrefs = set()
        soup = BeautifulSoup(self.source, 'html.parser')
        for link in soup.find_all('a'):
            # Skip anchors without an href attribute rather than
            # raising a KeyError on link['href'].
            href = link.get('href')
            if not href:
                continue
            if href.startswith('/'):
                hrefs.add(urljoin(self.url, href))
            else:
                hrefs.add(href)
        self.discovered_hrefs = hrefs

    def parse_urls(self):
        '''
        Iterate through the set of discovered URLs and add them to the
        pool if they start with the base URL.
        '''
        self.urls_to_crawl = set()
        for url in self.discovered_hrefs:
            if url.startswith(self.url):
                self.urls_to_crawl.add(url)

    def run(self):
        # Each step depends on the previous one, so stop at the first
        # failure instead of cascading misleading exceptions, and
        # return whatever has been gathered so far.
        try:
            self.get_source()
            self.find_links()
            self.parse_urls()
        except Exception as e:
            print(e)
        return self.urls_to_crawl


def sanitise_url(url):
    '''
    Attempt to standardise the base URL to ensure it can be prepended
    to relative URLs. If no scheme is provided we default to http, as
    any sane https-only site should 301 redirect http -> https.

    Returns a corrected base URL as a string.
    '''
    default_proto = 'http'
    delim = '://'
    split_url = urlsplit(url)
    if split_url.scheme and split_url.scheme.startswith('http'):
        return "".join([split_url.scheme, delim, split_url.netloc])
    if split_url.path and not split_url.scheme and not split_url.netloc:
        # A bare hostname such as 'example.com' lands in .path when
        # there is no scheme to anchor the netloc.
        return "".join([default_proto, delim, split_url.path])
    # Previously base_url could be unbound here, raising a confusing
    # UnboundLocalError; fail explicitly instead.
    raise ValueError('Unable to sanitise URL: {}'.format(url))
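

# Example usage (a minimal sketch, not part of the original module): it
# assumes a reachable site at http://example.com and shows how
# sanitise_url, WebPage and UrlPool are intended to combine into a
# simple single-level crawl of same-site links.
if __name__ == '__main__':
    base_url = sanitise_url('example.com')

    pool = UrlPool()
    pool.add_to_pool(base_url)

    # Fetch the base page and queue any same-site links not already
    # seen in the pool.
    page = WebPage(base_url)
    for url in page.run():
        if not pool.check_duplicate(url):
            pool.add_to_pool(url)

    print('URLs in pool: {}'.format(len(pool.url_pool)))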