Compare commits

...

2 Commits

2 changed files with 86 additions and 17 deletions

crawler.py · new file · 28 additions
@@ -0,0 +1,28 @@
#!/usr/bin/env python
'''
Need a docstring.
'''
import argparse

from utils.helpers import (UrlPool, WebPage, sanitise_url, qualify_url)


def init_crawler(base_url=None):
    '''
    needs a docstring
    '''


def run(args=None):
    '''
    needs a docstring.
    '''
    base_url = sanitise_url(args.url)
    print(base_url)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Recursive web crawler')
    parser.add_argument("-u", "--url", required=True, help="Base url to crawl")
    args = parser.parse_args()
    run(args)

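As a rough usage sketch (the hostname is illustrative, not part of the changeset): with the helpers from the second file in place, the entry point so far only normalises and echoes the base URL, since init_crawler is still a stub:

$ python crawler.py --url example.com
http://example.com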
utils/helpers.py · 58 additions, 17 deletions

@@ -3,11 +3,14 @@
 Utilities to provide various misc functions.
 '''
-from urllib.parse import urljoin
+import urllib.request
+from bs4 import BeautifulSoup
+from urllib.parse import (urljoin, urlsplit)

-class UrlPoolManager(object):
+class UrlPool(object):
     '''
-    Object to manage the lifecycle of a pool of URLs.
+    Object to manage a pool of URLs.
     '''
     def __init__(self):
@@ -29,27 +32,65 @@ class UrlPoolManager(object):
         self.url_pool.add(url)

-def clean_base_url(url):
-    '''
-    Standardise the URL to be scraped to ensure it
-    is added to relative URLs in a consistent manner.
-    '''
-    protocol = 'http://'
-    if url.startswith('http'):
-        base_url = url
-    else:
-        # otherwise assume HTTP as any sane site should upgrade
-        # to HTTPS via a 301 redirect.
-        base_url = "".join([protocol, url])
+class WebPage(object):
+
+    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
+
+    def __init__(self, url):
+        self.url = url
+
+    def get_source(self):
+        request = urllib.request.Request(self.url, headers=self.headers)
+        page = urllib.request.urlopen(request)
+        self.source = page.read()
+
+    def find_links(self):
+        soup = BeautifulSoup(self.source, 'html.parser')
+        links = soup.find_all('a')
+        hrefs = []
+        for link in links:
+            if link['href'].startswith('/'):
+                hrefs.append("".join([self.url, link['href']]))
+            else:
+                hrefs.append(link['href'])
+        self.hrefs = hrefs
+
+    def parse_urls(self):
+        local_urls = []
+        for url in self.hrefs:
+            if url.startswith(self.url):
+                local_urls.append(url)
+        return local_urls
+
+def sanitise_url(url):
+    '''
+    Attempt to standardise the base url to ensure it can be prepended to
+    relative URLs. If no scheme provided then we default to http as any
+    sane https-only site should 301 redirect http > https.
+    Returns a corrected base URL as a string.
+    '''
+    default_proto = 'http'
+    delim = '://'
+    split_url = urlsplit(url)
+    if split_url.scheme and split_url.scheme.startswith('http'):
+        base_url = "".join([split_url.scheme, delim, split_url.netloc])
+    elif (split_url.path and not split_url.scheme and not split_url.netloc):
+        base_url = "".join([default_proto, delim, split_url.path])
     return base_url

-def get_url_validation(base_url=None, url=None):
+def qualify_url(base_url=None, url=None):
     '''
     Ensure any URLs discovered are absolute. If relative,
-    they will be appended to the base URL.
+    they will be appended to the base URL. Returns an
+    absolute URL as a string.
     '''
     if url.startswith('/'):
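For orientation, a minimal sketch of how the new helpers are meant to compose, pieced together from the docstrings and method bodies above; the target URL is illustrative, and crawler.py does not wire any of this up yet:

from utils.helpers import WebPage, sanitise_url, qualify_url

base_url = sanitise_url('example.com')   # no scheme given -> 'http://example.com'

page = WebPage(base_url)
page.get_source()                        # fetch the page with the class-level User-Agent header
page.find_links()                        # collect hrefs, absolutising any that start with '/'
local_urls = page.parse_urls()           # keep only URLs under base_url

# qualify_url() handles any remaining relative links; per its docstring,
# qualify_url(base_url=base_url, url='/about') should return
# 'http://example.com/about'.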