Compare commits

...

2 Commits

2 changed files with 86 additions and 17 deletions

crawler.py · new file · 28 additions
@@ -0,0 +1,28 @@
#!/usr/bin/env python
'''
Need a docstring.
'''
import argparse

from utils.helpers import (UrlPool, WebPage, sanitise_url, qualify_url)


def init_crawler(base_url=None):
    '''
    needs a docstring
    '''


def run(args=None):
    '''
    needs a docstring.
    '''
    base_url = sanitise_url(args.url)
    print(base_url)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Recursive web crawler')
    parser.add_argument("-u", "--url", required=True, help="Base url to crawl")
    args = parser.parse_args()
    run(args)

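As a rough usage sketch (the hostname is illustrative, not part of the changeset): with the helpers from the second file in place, the entry point so far only normalises and echoes the base URL, since init_crawler is still a stub:

$ python crawler.py --url example.com
http://example.com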
utils/helpers.py · 58 additions, 17 deletions

@@ -3,11 +3,14 @@
 Utilities to provide various misc functions.
 '''
-from urllib.parse import urljoin
+import urllib.request
+from bs4 import BeautifulSoup
+from urllib.parse import (urljoin, urlsplit)

-class UrlPoolManager(object):
+class UrlPool(object):
     '''
-    Object to manage the lifecycle of a pool of URLs.
+    Object to manage a pool of URLs.
     '''
     def __init__(self):
@@ -29,27 +32,65 @@ class UrlPoolManager(object):
         self.url_pool.add(url)

-def clean_base_url(url):
-    '''
-    Standardise the URL to be scraped to ensure it
-    is added to relative URLs in a consistent manner.
-    '''
-    protocol = 'http://'
-    if url.startswith('http'):
-        base_url = url
-    else:
-        # otherwise assume HTTP as any sane site should upgrade
-        # to HTTPS via a 301 redirect.
-        base_url = "".join([protocol, url])
+class WebPage(object):
+
+    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
+
+    def __init__(self, url):
+        self.url = url
+
+    def get_source(self):
+        request = urllib.request.Request(self.url, headers=self.headers)
+        page = urllib.request.urlopen(request)
+        self.source = page.read()
+
+    def find_links(self):
+        soup = BeautifulSoup(self.source, 'html.parser')
+        links = soup.find_all('a')
+        hrefs = []
+        for link in links:
+            if link['href'].startswith('/'):
+                hrefs.append("".join([self.url, link['href']]))
+            else:
+                hrefs.append(link['href'])
+        self.hrefs = hrefs
+
+    def parse_urls(self):
+        local_urls = []
+        for url in self.hrefs:
+            if url.startswith(self.url):
+                local_urls.append(url)
+        return local_urls
+
+def sanitise_url(url):
+    '''
+    Attempt to standardise the base url to ensure it can be prepended to
+    relative URLs. If no scheme provided then we default to http as any
+    sane https-only site should 301 redirect http > https.
+    Returns a corrected base URL as a string.
+    '''
+    default_proto = 'http'
+    delim = '://'
+    split_url = urlsplit(url)
+    if split_url.scheme and split_url.scheme.startswith('http'):
+        base_url = "".join([split_url.scheme, delim, split_url.netloc])
+    elif (split_url.path and not split_url.scheme and not split_url.netloc):
+        base_url = "".join([default_proto, delim, split_url.path])
     return base_url

-def get_url_validation(base_url=None, url=None):
+def qualify_url(base_url=None, url=None):
     '''
     Ensure any URLs discovered are absolute. If relative,
-    they will be appended to the base URL.
+    they will be appended to the base URL. Returns an
+    absolute URL as a string.
     '''
     if url.startswith('/'):
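For orientation, a minimal sketch of how the new helpers are meant to compose, pieced together from the docstrings and method bodies above; the target URL is illustrative, and crawler.py does not wire any of this up yet:

from utils.helpers import WebPage, sanitise_url, qualify_url

base_url = sanitise_url('example.com')   # no scheme given -> 'http://example.com'

page = WebPage(base_url)
page.get_source()                        # fetch the page with the class-level User-Agent header
page.find_links()                        # collect hrefs, absolutising any that start with '/'
local_urls = page.parse_urls()           # keep only URLs under base_url

# qualify_url() handles any remaining relative links; per its docstring,
# qualify_url(base_url=base_url, url='/about') should return
# 'http://example.com/about'.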