rework url sanitiser to use urllib modules, move WebPage object to helpers

2018-08-31 18:26:25 +01:00
parent 453331d69d
commit 915def3a5d


@@ -3,11 +3,14 @@
 Utilities to provide various misc functions.
 '''
-from urllib.parse import urljoin
+import urllib.request
+
+from bs4 import BeautifulSoup
+from urllib.parse import (urljoin, urlsplit)


-class UrlPoolManager(object):
+class UrlPool(object):
     '''
-    Object to manage the lifecycle of a pool of URLs.
+    Object to manage a pool of URLs.
     '''
     def __init__(self):
@@ -29,27 +32,65 @@ class UrlPoolManager(object):
         self.url_pool.add(url)


-def clean_base_url(url):
-    '''
-    Standardise the URL to be scraped to ensure it
-    is added to relative URLs in a consistent manner.
-    '''
-    protocol = 'http://'
-    if url.startswith('http'):
-        base_url = url
-    else:
-        # otherwise assume HTTP as any sane site should upgrade
-        # to HTTPS via a 301 redirect.
-        base_url = "".join([protocol, url])
-    return base_url
+class WebPage(object):
+
+    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
+
+    def __init__(self, url):
+        self.url = url
+
+    def get_source(self):
+        request = urllib.request.Request(self.url, headers=self.headers)
+        page = urllib.request.urlopen(request)
+        self.source = page.read()
+
+    def find_links(self):
+        soup = BeautifulSoup(self.source, 'html.parser')
+        links = soup.find_all('a')
+        hrefs = []
+        for link in links:
+            if link['href'].startswith('/'):
+                hrefs.append("".join([self.url, link['href']]))
+            else:
+                hrefs.append(link['href'])
+        self.hrefs = hrefs
+
+    def parse_urls(self):
+        local_urls = []
+        for url in self.hrefs:
+            if url.startswith(self.url):
+                local_urls.append(url)
+        return local_urls
+
+
+def sanitise_url(url):
+    '''
+    Attempt to standardise the base url to ensure it can be prepended to
+    relative URLs. If no scheme provided then we default to http as any
+    sane https-only site should 301 redirect http > https.
+    Returns a corrected base URL as a string.
+    '''
+    default_proto = 'http'
+    delim = '://'
+    split_url = urlsplit(url)
+    if split_url.scheme and split_url.scheme.startswith('http'):
+        base_url = "".join([split_url.scheme, delim, split_url.netloc])
+    elif (split_url.path and not split_url.scheme and not split_url.netloc):
+        base_url = "".join([default_proto, delim, split_url.path])
+    return base_url


-def get_url_validation(base_url=None, url=None):
+def qualify_url(base_url=None, url=None):
     '''
     Ensure any URLs discovered are absolute. If relative,
-    they will be appended to the base URL.
+    they will be appended to the base URL. Returns an
+    absolute URL as a string.
     '''
     if url.startswith('/'):
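
The reworked sanitise_url leans on a urlsplit() quirk worth spelling out: urlsplit only recognises a netloc after '//', so a scheme-less hostname lands in .path rather than .netloc, which is why the elif branch reads the host out of split_url.path. Below is a minimal sketch of that behaviour plus a hypothetical driver composing the new helpers; the module name helpers is an assumption taken from the commit message, and the driver itself is illustrative rather than part of this commit.

    from urllib.parse import urlsplit

    # urlsplit() only treats text after '//' as a netloc, so a bare
    # hostname ends up in .path:
    urlsplit('https://example.com/about')
    # SplitResult(scheme='https', netloc='example.com', path='/about', query='', fragment='')
    urlsplit('example.com')
    # SplitResult(scheme='', netloc='', path='example.com', query='', fragment='')

    # Hypothetical usage of the helpers introduced above (module name assumed):
    from helpers import WebPage, sanitise_url

    base_url = sanitise_url('example.com')   # -> 'http://example.com'
    page = WebPage(base_url)
    page.get_source()            # fetch the page source with the spoofed UA header
    page.find_links()            # populate page.hrefs from <a> tags
    local = page.parse_urls()    # keep only URLs starting with base_url

One caveat the sketch assumes callers respect: for an input whose scheme is present but not http(s), such as 'ftp://example.com', neither branch of sanitise_url assigns base_url, so the return raises UnboundLocalError.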