rework url sanitiser to use urllib modules, move WebPage object to helpers

2018-08-31 18:26:25 +01:00
parent 453331d69d
commit 915def3a5d


@@ -3,11 +3,14 @@
 Utilities to provide various misc functions.
 '''
-from urllib.parse import urljoin
+import urllib.request
+
+from bs4 import BeautifulSoup
+from urllib.parse import (urljoin, urlsplit)


-class UrlPoolManager(object):
+class UrlPool(object):
     '''
-    Object to manage the lifecycle of a pool of URLs.
+    Object to manage a pool of URLs.
     '''
     def __init__(self):
@@ -29,27 +32,65 @@ class UrlPoolManager(object):
         self.url_pool.add(url)


-def clean_base_url(url):
-    '''
-    Standardise the URL to be scraped to ensure it
-    is added to relative URLs in a consistent manner.
-    '''
-    protocol = 'http://'
-    if url.startswith('http'):
-        base_url = url
-    else:
-        # otherwise assume HTTP as any sane site should upgrade
-        # to HTTPS via a 301 redirect.
-        base_url = "".join([protocol, url])
-    return base_url
+class WebPage(object):
+
+    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
+
+    def __init__(self, url):
+        self.url = url
+
+    def get_source(self):
+        request = urllib.request.Request(self.url, headers=self.headers)
+        page = urllib.request.urlopen(request)
+        self.source = page.read()
+
+    def find_links(self):
+        soup = BeautifulSoup(self.source, 'html.parser')
+        links = soup.find_all('a')
+        hrefs = []
+        for link in links:
+            if link['href'].startswith('/'):
+                hrefs.append("".join([self.url, link['href']]))
+            else:
+                hrefs.append(link['href'])
+        self.hrefs = hrefs
+
+    def parse_urls(self):
+        local_urls = []
+        for url in self.hrefs:
+            if url.startswith(self.url):
+                local_urls.append(url)
+        return local_urls
+
+
+def sanitise_url(url):
+    '''
+    Attempt to standardise the base url to ensure it can be prepended to
+    relative URLs. If no scheme provided then we default to http as any
+    sane https-only site should 301 redirect http > https.
+    Returns a corrected base URL as a string.
+    '''
+    default_proto = 'http'
+    delim = '://'
+    split_url = urlsplit(url)
+    if split_url.scheme and split_url.scheme.startswith('http'):
+        base_url = "".join([split_url.scheme, delim, split_url.netloc])
+    elif (split_url.path and not split_url.scheme and not split_url.netloc):
+        base_url = "".join([default_proto, delim, split_url.path])
+    return base_url


-def get_url_validation(base_url=None, url=None):
+def qualify_url(base_url=None, url=None):
     '''
     Ensure any URLs discovered are absolute. If relative,
-    they will be appended to the base URL.
+    they will be appended to the base URL. Returns an
+    absolute URL as a string.
     '''
     if url.startswith('/'):
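
The reworked sanitise_url leans on a urlsplit() quirk worth spelling out: urlsplit only recognises a netloc after '//', so a scheme-less hostname lands in .path rather than .netloc, which is why the elif branch reads the host out of split_url.path. Below is a minimal sketch of that behaviour plus a hypothetical driver composing the new helpers; the module name helpers is an assumption taken from the commit message, and the driver itself is illustrative rather than part of this commit.

    from urllib.parse import urlsplit

    # urlsplit() only treats text after '//' as a netloc, so a bare
    # hostname ends up in .path:
    urlsplit('https://example.com/about')
    # SplitResult(scheme='https', netloc='example.com', path='/about', query='', fragment='')
    urlsplit('example.com')
    # SplitResult(scheme='', netloc='', path='example.com', query='', fragment='')

    # Hypothetical usage of the helpers introduced above (module name assumed):
    from helpers import WebPage, sanitise_url

    base_url = sanitise_url('example.com')   # -> 'http://example.com'
    page = WebPage(base_url)
    page.get_source()            # fetch the page source with the spoofed UA header
    page.find_links()            # populate page.hrefs from <a> tags
    local = page.parse_urls()    # keep only URLs starting with base_url

One caveat the sketch assumes callers respect: for an input whose scheme is present but not http(s), such as 'ftp://example.com', neither branch of sanitise_url assigns base_url, so the return raises UnboundLocalError.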