diff --git a/utils/helpers.py b/utils/helpers.py
index b07cf21..9bd6392 100644
--- a/utils/helpers.py
+++ b/utils/helpers.py
@@ -3,11 +3,14 @@
 Utilities to provide various misc functions.
 '''
 
-from urllib.parse import urljoin
+import urllib.request
+from bs4 import BeautifulSoup
+from urllib.parse import (urljoin, urlsplit)
 
-class UrlPoolManager(object):
+
+class UrlPool(object):
     '''
-    Object to manage the lifecycle of a pool of URLs.
+    Object to manage a pool of URLs.
     '''
 
     def __init__(self):
@@ -29,27 +32,65 @@ class UrlPoolManager(object):
         self.url_pool.add(url)
 
 
-def clean_base_url(url):
-    '''
-    Standardise the URL to be scraped to ensure it
-    is added to relative URLs in a consistent manner.
-    '''
-    protocol = 'http://'
+class WebPage(object):
+    '''Fetch a single page and collect the links found in its source.'''
 
-    if url.startswith('http'):
-        base_url = url
-    else:
-        # otherwise assume HTTP as any sane site should upgrade
-        # to HTTPS via a 301 redirect.
-        base_url = "".join([protocol, url])
+    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
+
+    def __init__(self, url):
+        self.url = url
+
+    def get_source(self):
+        '''Download the page body and store the raw bytes in self.source.'''
+        request = urllib.request.Request(self.url, headers=self.headers)
+        # Close the response once it is read so the socket is not leaked.
+        with urllib.request.urlopen(request) as page:
+            self.source = page.read()
+
+    def find_links(self):
+        '''Store the href of every anchor in self.source as self.hrefs.'''
+        soup = BeautifulSoup(self.source, 'html.parser')
+        hrefs = []
+        # href=True skips anchors with no href, which raised KeyError.
+        for link in soup.find_all('a', href=True):
+            href = link['href']
+            if href.startswith('/'):
+                # urljoin also resolves '//host/path' links correctly.
+                href = urljoin(self.url, href)
+            hrefs.append(href)
+
+        self.hrefs = hrefs
+
+    def parse_urls(self):
+        '''Return the stored hrefs that start with this page's URL.'''
+        return [url for url in self.hrefs if url.startswith(self.url)]
+
+
+def sanitise_url(url):
+    '''
+    Attempt to standardise the base url to ensure it can be prepended to
+    relative URLs. If no scheme provided then we default to http as any
+    sane https-only site should 301 redirect http > https.
+
+    Returns a corrected base URL as a string; raises ValueError if the
+    URL fits neither form (this used to fail with UnboundLocalError).
+    '''
+    split_url = urlsplit(url)
+    if split_url.scheme and split_url.scheme.startswith('http'):
+        base_url = "://".join([split_url.scheme, split_url.netloc])
+    elif (split_url.path and not split_url.scheme and not split_url.netloc):
+        base_url = "://".join(['http', split_url.path])
+    else:
+        raise ValueError("Cannot derive a base URL from %r" % (url,))
 
     return base_url
 
 
-def get_url_validation(base_url=None, url=None):
+def qualify_url(base_url=None, url=None):
     '''
     Ensure any URLs discovered are absolute. If relative,
-    they will be appended to the base URL.
+    they will be appended to the base URL. Returns an
+    absolute URL as a string.
     '''
 
     if url.startswith('/'):