From 453331d69df9a7d410c80888989e52f1664463b3 Mon Sep 17 00:00:00 2001
From: Simon Weald
Date: Wed, 29 Aug 2018 22:27:26 +0100
Subject: [PATCH] simplified url qualifier

---
 utils/helpers.py | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/utils/helpers.py b/utils/helpers.py
index db184ea..b07cf21 100644
--- a/utils/helpers.py
+++ b/utils/helpers.py
@@ -3,6 +3,8 @@ Utilities to provide various misc functions.
 '''
 
 
+from urllib.parse import urljoin
+
 class UrlPoolManager(object):
     '''
     Object to manage the lifecycle of a pool of URLs.
@@ -41,21 +43,16 @@ def clean_base_url(url):
     # to HTTPS via a 301 redirect.
     base_url = "".join([protocol, url])
 
-    # strip the trailing slash to allow us to append
-    # relative URLs.
-    if base_url.endswith('/'):
-        base_url = base_url[:-1]
-
     return base_url
 
 
-# def get_url_validation(base_url=None, url=None):
-#     '''
-#     Checks if a URL is valid. Can be absolute or relative.
-#     '''
+def get_url_validation(base_url=None, url=None):
+    '''
+    Ensure any URLs discovered are absolute. If relative,
+    they will be appended to the base URL.
+    '''
 
-#     if url.startswith('/'):
-#         full_url = '{0}{1}'.format(base_url, url)
-#     if url.startswith(ffbase_url):
-#         full_url = url
-#     elif url.startswith('/'):
+    if url.startswith('/'):
+        return urljoin(base_url, url)
+    if url.startswith(base_url):
+        return url