rework url sanitiser to use urllib modules, move WebPage object to helpers

2018-08-31 18:26:25 +01:00
parent 453331d69d
commit 915def3a5d


@@ -3,11 +3,14 @@
 Utilities to provide various misc functions.
 '''
-from urllib.parse import urljoin
+import urllib.request
+from bs4 import BeautifulSoup
+from urllib.parse import (urljoin, urlsplit)
 
 
-class UrlPoolManager(object):
+class UrlPool(object):
     '''
-    Object to manage the lifecycle of a pool of URLs.
+    Object to manage a pool of URLs.
     '''
     def __init__(self):
@@ -29,27 +32,65 @@ class UrlPoolManager(object):
         self.url_pool.add(url)
 
 
-def clean_base_url(url):
-    '''
-    Standardise the URL to be scraped to ensure it
-    is added to relative URLs in a consistent manner.
-    '''
-    protocol = 'http://'
-    if url.startswith('http'):
-        base_url = url
-    else:
-        # otherwise assume HTTP as any sane site should upgrade
-        # to HTTPS via a 301 redirect.
-        base_url = "".join([protocol, url])
-    return base_url
+class WebPage(object):
+
+    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
+
+    def __init__(self, url):
+        self.url = url
+
+    def get_source(self):
+        request = urllib.request.Request(self.url, headers=self.headers)
+        page = urllib.request.urlopen(request)
+        self.source = page.read()
+
+    def find_links(self):
+        soup = BeautifulSoup(self.source, 'html.parser')
+        links = soup.find_all('a')
+        hrefs = []
+        for link in links:
+            if link['href'].startswith('/'):
+                hrefs.append("".join([self.url, link['href']]))
+            else:
+                hrefs.append(link['href'])
+        self.hrefs = hrefs
+
+    def parse_urls(self):
+        local_urls = []
+        for url in self.hrefs:
+            if url.startswith(self.url):
+                local_urls.append(url)
+        return local_urls
+
+
+def sanitise_url(url):
+    '''
+    Attempt to standardise the base url to ensure it can be prepended to
+    relative URLs. If no scheme provided then we default to http as any
+    sane https-only site should 301 redirect http > https.
+    Returns a corrected base URL as a string.
+    '''
+    default_proto = 'http'
+    delim = '://'
+    split_url = urlsplit(url)
+    if split_url.scheme and split_url.scheme.startswith('http'):
+        base_url = "".join([split_url.scheme, delim, split_url.netloc])
+    elif (split_url.path and not split_url.scheme and not split_url.netloc):
+        base_url = "".join([default_proto, delim, split_url.path])
+    return base_url
 
 
-def get_url_validation(base_url=None, url=None):
+def qualify_url(base_url=None, url=None):
     '''
     Ensure any URLs discovered are absolute. If relative,
-    they will be appended to the base URL.
+    they will be appended to the base URL. Returns an
+    absolute URL as a string.
     '''
     if url.startswith('/'):
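
Below is a minimal usage sketch of the reworked helpers. It assumes the file in this diff is helpers.py (per the commit message) and that the example.com URLs are purely illustrative; the expected results follow the docstrings and urlsplit semantics rather than a verified run.

from helpers import WebPage, sanitise_url, qualify_url

# sanitise_url keeps an existing http(s) scheme and strips any path;
# urlsplit('example.com') puts a scheme-less host into .path, so the
# elif branch prepends the default http:// protocol.
print(sanitise_url('https://example.com/about'))  # 'https://example.com'
print(sanitise_url('example.com'))                # 'http://example.com'
# Note: a non-http scheme (e.g. ftp://example.com) matches neither branch,
# so base_url is never assigned and the function raises UnboundLocalError.

# qualify_url, per its docstring, joins a relative URL onto the base.
print(qualify_url(base_url='http://example.com', url='/contact'))
# expected: 'http://example.com/contact'

# WebPage: fetch the page with the spoofed User-Agent, collect the
# href of every <a> tag, then keep only links local to the start URL.
page = WebPage(sanitise_url('example.com'))
page.get_source()          # sets page.source via urllib.request
page.find_links()          # sets page.hrefs via BeautifulSoup
print(page.parse_urls())   # hrefs that start with the base URL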