From 79b10798a35c666de38e12cb6e25864df3299ab7 Mon Sep 17 00:00:00 2001
From: Simon Weald <simon@simonweald.com>
Date: Mon, 27 Aug 2018 19:37:41 +0100
Subject: [PATCH] initial commit of utils

---
 utils/helpers.py | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100644 utils/helpers.py

diff --git a/utils/helpers.py b/utils/helpers.py
new file mode 100644
index 0000000..bd38305
--- /dev/null
+++ b/utils/helpers.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+
+import re
+import requests
+
+
+def standardise_base_url(url):
+    '''
+    Standardise the URL to be scraped to ensure it
+    is added to relative URLs in a consistent manner.
+    '''
+    match_protocol = r'http(s?)\:\/\/'
+
+    if re.match(match_protocol, url):
+        base_url = url
+    else:
+        http_url = 'http://{0}'.format(url)
+        https_url = 'https://{0}'.format(url)
+        # attempt to discover which protocol is being used.
+        try:
+            result = requests.get(http_url)
+            if result.url.startswith('http'):
+                base_url = http_url
+            if result.url.startswith('https'):
+                base_url = https_url
+        except requests.exceptions.RequestException as e:
+            base_url = https_url
+
+    if base_url.endswith('/'):
+        base_url = base_url[:-1]
+
+    return base_url
+
+
+def get_url_validation(base_url=None, url=None):
+    '''
+    Checks if a URL is valid. Can be absolute or relative.
+    '''
+
+    if url.startswith('/'):
+        full_url = '{0}{1}'.format(base_url, url)
+    if url.startswith(ffbase_url):
+        full_url = url
+    elif url.startswith('/'):