import urllib.request
from urllib.parse import urljoin, urlsplit


class WebPage(object):
    '''
    A single web page: downloads its HTML and extracts the links on it.

    Intended call order is get_source() -> find_links() -> parse_urls();
    each step reads the attribute stored by the previous one
    (self.source, then self.hrefs).
    '''

    # Browser-like User-Agent header sent with every request.
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}

    def __init__(self, url):
        '''
        Remember the URL of the page. No network access happens here.
        '''
        self.url = url

    def get_source(self):
        '''
        Download the page at self.url and store the raw response body
        (bytes) in self.source.

        Any exception raised by urllib (e.g. URLError, HTTPError) is
        propagated to the caller.
        '''
        request = urllib.request.Request(self.url, headers=self.headers)
        # The context manager closes the response even if read() fails.
        with urllib.request.urlopen(request) as page:
            self.source = page.read()

    def find_links(self):
        '''
        Parse self.source and store the href of every <a> element in
        self.hrefs (a list of strings, in document order).

        Hrefs starting with '/' are resolved against self.url; all other
        hrefs are stored exactly as they appear in the page.
        '''
        # Imported lazily: bs4 is only needed for link extraction, so the
        # rest of the module stays usable when it is not installed.
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(self.source, 'html.parser')
        hrefs = []

        # href=True skips anchors without an href attribute (for example
        # <a name="top">), which would otherwise raise KeyError below.
        for link in soup.find_all('a', href=True):
            href = link['href']
            if href.startswith('/'):
                # urljoin resolves both root-relative ('/about') and
                # scheme-relative ('//cdn.example.com/x') references, and
                # never produces a doubled slash.
                hrefs.append(urljoin(self.url, href))
            else:
                hrefs.append(href)

        self.hrefs = hrefs

    def parse_urls(self):
        '''
        Return the entries of self.hrefs that start with self.url, i.e.
        the links that point back into the same site, as a list.
        '''
        return [url for url in self.hrefs if url.startswith(self.url)]


def sanitise_url(url):
    '''
    Attempt to standardise the base URL to ensure it can be prepended to
    relative URLs. If no scheme is provided then we default to http, as
    any sane https-only site should 301 redirect http > https.

    Path, query and fragment are always dropped, so the result has the
    form 'scheme://host[:port]' whether or not a scheme was supplied.

    Returns a corrected base URL as a string.
    Raises ValueError if no usable base URL can be derived (empty input,
    a non-http(s) scheme, or no host part).
    '''
    default_proto = 'http'
    delim = '://'

    split_url = urlsplit(url)

    if split_url.scheme.startswith('http') and split_url.netloc:
        return "".join([split_url.scheme, delim, split_url.netloc])

    if not split_url.scheme:
        # '//example.com/x' already has a netloc. A bare 'example.com/x' is
        # parsed entirely as a path, so re-parse it with the default scheme
        # in front to separate the host from the path.
        netloc = split_url.netloc or urlsplit(
            "".join([default_proto, delim, split_url.path])).netloc
        if netloc:
            return "".join([default_proto, delim, netloc])

    # Previously this case fell through and crashed with UnboundLocalError.
    raise ValueError("cannot derive a base URL from {!r}".format(url))