#!/usr/bin/env python

'''
Crawler utilities: a pool to track URLs awaiting a crawl, a WebPage object
to fetch and parse individual pages, and a helper to sanitise URLs.
'''

import urllib.request
from urllib.parse import urljoin, urlsplit

from bs4 import BeautifulSoup


class UrlPool(object):
    '''
    Object to manage a pool of URLs.
    '''

    def __init__(self):
        self.pool = set()

    def check_duplicate(self, new_url):
        '''
        Check whether a URL already exists in the current pool.
        '''
        return new_url in self.pool

    def remove_from_pool(self):
        '''
        Remove an arbitrary URL from the pool and return it to be crawled.
        '''
        return self.pool.pop()

    def add_to_pool(self, url):
        '''
        Add a URL to the pool.
        '''
        self.pool.add(url)

    def list_pool(self):
        '''
        Return the current contents of the pool.
        '''
        return self.pool


class WebPage(object):
    '''
    Object to manage the common operations required to return the data from
    each individual page.
    '''

    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) '
                             'Gecko/20100101 Firefox/61.0'}

    def __init__(self, url=None, base_url=None):
        self.url = url
        self.base_url = base_url

    def get_source(self):
        '''
        Retrieve a page's source.
        '''
        request = urllib.request.Request(self.url, headers=self.headers)
        page = urllib.request.urlopen(request, timeout=5)
        self.source = page.read()

    def find_links(self):
        '''
        Find all URLs on a page and ensure they are absolute. Relative
        hrefs are resolved against the page's own URL.
        '''
        hrefs = set()
        soup = BeautifulSoup(self.source, 'html.parser')
        links = soup.find_all('a', href=True)
        for link in links:
            # urljoin resolves relative hrefs against self.url and leaves
            # absolute URLs untouched.
            hrefs.add(urljoin(self.url, link['href']))
        self.discovered_hrefs = hrefs

    def parse_urls(self):
        '''
        Iterate through the set of discovered URLs and add them to the set
        of URLs to crawl if they start with the page's URL, keeping the
        crawl on the same site.
        '''
        self.urls_to_crawl = set()
        for url in self.discovered_hrefs:
            if url.startswith(self.url):
                sanitised_url = sanitise_url(url=url)
                self.urls_to_crawl.add(sanitised_url)

    def list_urls(self):
        '''
        Return the set of URLs to crawl built by parse_urls().
        '''
        return self.urls_to_crawl

    def run(self):
        '''
        Fetch the page, extract its links and filter them, printing (rather
        than raising) any errors so a single bad page does not stop a crawl.
        '''
        try:
            self.get_source()
        except Exception as e:
            print(e)
        try:
            self.find_links()
        except Exception as e:
            print(e)
        try:
            self.parse_urls()
        except Exception as e:
            print(e)


def sanitise_url(url, base_url=False):
    '''
    If `base_url` is True, we attempt to standardise `url` to ensure it can
    be prepended to relative URLs. If no scheme has been provided then we
    default to http, as any sane https-only site should 301 redirect
    http > https.

    If `base_url` is False, we sanitise URLs to strip queries and fragments
    (we don't want to scrape in-page anchors etc.).

    Returns a sanitised URL as a string.
    '''
    default_proto = 'http'
    delim = '://'
    split_url = urlsplit(url)
    if base_url:
        # This will sanitise the initial URL for the initial page crawl.
        if split_url.scheme and split_url.scheme.startswith('http'):
            sanitised_url = "".join([split_url.scheme, delim,
                                     split_url.netloc])
        elif (split_url.path and not split_url.scheme
                and not split_url.netloc):
            # A bare hostname such as 'example.com' is parsed as a path.
            sanitised_url = "".join([default_proto, delim, split_url.path])
        else:
            # Guard against non-http schemes, which would otherwise leave
            # sanitised_url unbound.
            raise ValueError('Unable to sanitise base URL: {}'.format(url))
    else:
        # Sanitise discovered URLs. We already expect them in the format
        # protocol://base_url/path
        sanitised_url = "".join([split_url.scheme, delim, split_url.netloc,
                                 split_url.path])
    return sanitised_url
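

# A minimal usage sketch, assuming the module is run directly against a
# reachable site. 'example.com' below is a placeholder; swap in the site you
# want to crawl. It ties the pieces together: sanitise the starting URL,
# fetch and parse the page, then feed any same-site links into a UrlPool.
if __name__ == '__main__':
    start_url = sanitise_url('example.com', base_url=True)  # placeholder host
    page = WebPage(url=start_url, base_url=start_url)
    page.run()

    pool = UrlPool()
    for discovered in page.list_urls():
        # Skip anything we have already queued.
        if not pool.check_duplicate(discovered):
            pool.add_to_pool(discovered)

    print('URLs queued for crawling:')
    for queued in pool.list_pool():
        print(queued)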