rework url sanitiser to use urllib modules, move WebPage object to helpers

2018-08-31 18:26:25 +01:00
parent 453331d69d
commit 915def3a5d


@@ -3,11 +3,14 @@
 Utilities to provide various misc functions.
 '''
-from urllib.parse import urljoin
+import urllib.request
+from bs4 import BeautifulSoup
+from urllib.parse import (urljoin, urlsplit)
 
 
-class UrlPoolManager(object):
+class UrlPool(object):
     '''
-    Object to manage the lifecycle of a pool of URLs.
+    Object to manage a pool of URLs.
     '''
     def __init__(self):
@@ -29,27 +32,65 @@ class UrlPoolManager(object):
         self.url_pool.add(url)
 
 
-def clean_base_url(url):
-    '''
-    Standardise the URL to be scraped to ensure it
-    is added to relative URLs in a consistent manner.
-    '''
-    protocol = 'http://'
-    if url.startswith('http'):
-        base_url = url
-    else:
-        # otherwise assume HTTP as any sane site should upgrade
-        # to HTTPS via a 301 redirect.
-        base_url = "".join([protocol, url])
-    return base_url
+class WebPage(object):
+
+    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
+
+    def __init__(self, url):
+        self.url = url
+
+    def get_source(self):
+        request = urllib.request.Request(self.url, headers=self.headers)
+        page = urllib.request.urlopen(request)
+        self.source = page.read()
+
+    def find_links(self):
+        soup = BeautifulSoup(self.source, 'html.parser')
+        links = soup.find_all('a')
+        hrefs = []
+        for link in links:
+            if link['href'].startswith('/'):
+                hrefs.append("".join([self.url, link['href']]))
+            else:
+                hrefs.append(link['href'])
+        self.hrefs = hrefs
+
+    def parse_urls(self):
+        local_urls = []
+        for url in self.hrefs:
+            if url.startswith(self.url):
+                local_urls.append(url)
+        return local_urls
+
+
+def sanitise_url(url):
+    '''
+    Attempt to standardise the base url to ensure it can be prepended to
+    relative URLs. If no scheme provided then we default to http as any
+    sane https-only site should 301 redirect http > https.
+    Returns a corrected base URL as a string.
+    '''
+    default_proto = 'http'
+    delim = '://'
+    split_url = urlsplit(url)
+    if split_url.scheme and split_url.scheme.startswith('http'):
+        base_url = "".join([split_url.scheme, delim, split_url.netloc])
+    elif (split_url.path and not split_url.scheme and not split_url.netloc):
+        base_url = "".join([default_proto, delim, split_url.path])
+    return base_url
 
 
-def get_url_validation(base_url=None, url=None):
+def qualify_url(base_url=None, url=None):
     '''
     Ensure any URLs discovered are absolute. If relative,
-    they will be appended to the base URL.
+    they will be appended to the base URL. Returns an
+    absolute URL as a string.
     '''
     if url.startswith('/'):
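
Below is a minimal usage sketch of the reworked helpers. It assumes the file in this diff is helpers.py (per the commit message) and that the example.com URLs are purely illustrative; the expected results follow the docstrings and urlsplit semantics rather than a verified run.

from helpers import WebPage, sanitise_url, qualify_url

# sanitise_url keeps an existing http(s) scheme and strips any path;
# urlsplit('example.com') puts a scheme-less host into .path, so the
# elif branch prepends the default http:// protocol.
print(sanitise_url('https://example.com/about'))  # 'https://example.com'
print(sanitise_url('example.com'))                # 'http://example.com'
# Note: a non-http scheme (e.g. ftp://example.com) matches neither branch,
# so base_url is never assigned and the function raises UnboundLocalError.

# qualify_url, per its docstring, joins a relative URL onto the base.
print(qualify_url(base_url='http://example.com', url='/contact'))
# expected: 'http://example.com/contact'

# WebPage: fetch the page with the spoofed User-Agent, collect the
# href of every <a> tag, then keep only links local to the start URL.
page = WebPage(sanitise_url('example.com'))
page.get_source()          # sets page.source via urllib.request
page.find_links()          # sets page.hrefs via BeautifulSoup
print(page.parse_urls())   # hrefs that start with the base URL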