web-scraper/utils/helpers.py

#!/usr/bin/env python
'''
Helper utilities for the web scraper: URL pool management, page
fetching and same-site link extraction.
'''
import urllib.request
from urllib.parse import urljoin, urlsplit

from bs4 import BeautifulSoup


class UrlPool(object):
    '''
    Object to manage a pool of URLs.
    '''

    def __init__(self):
        self.url_pool = set()

    def check_duplicate(self, new_url):
        '''
        Checks whether a URL already exists in the current pool.
        '''
        return new_url in self.url_pool

    def remove_from_pool(self, url):
        '''
        Removes a URL from the pool; raises KeyError if it is absent.
        '''
        self.url_pool.remove(url)

    def add_to_pool(self, url):
        '''
        Adds a URL to the pool.
        '''
        self.url_pool.add(url)
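
# A minimal usage sketch for UrlPool (illustrative only; the URL below
# is a placeholder):
#
#     pool = UrlPool()
#     pool.add_to_pool('http://example.com/about')
#     pool.check_duplicate('http://example.com/about')  # True
#     pool.remove_from_pool('http://example.com/about')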


class WebPage(object):
    '''
    Represents a single page: fetches its source and extracts links.
    '''
    headers = {'User-Agent': ('Mozilla/5.0 (X11; Linux x86_64; rv:61.0) '
                              'Gecko/20100101 Firefox/61.0')}

    def __init__(self, url):
        self.url = url

    def get_source(self):
        '''Fetches the page and stores its raw source on the instance.'''
        request = urllib.request.Request(self.url, headers=self.headers)
        with urllib.request.urlopen(request) as page:
            self.source = page.read()

    def find_links(self):
        '''
        Collects the href of every anchor tag in the source,
        resolving root-relative links against the page URL.
        '''
        soup = BeautifulSoup(self.source, 'html.parser')
        hrefs = set()
        for link in soup.find_all('a'):
            href = link.get('href')
            if not href:
                continue  # skip anchors without an href attribute
            if href.startswith('/'):
                hrefs.add(urljoin(self.url, href))
            else:
                hrefs.add(href)
        self.discovered_hrefs = hrefs

    def parse_urls(self):
        '''Keeps only the discovered hrefs under the page's own URL.'''
        self.urls_to_crawl = set()
        for url in self.discovered_hrefs:
            if url.startswith(self.url):
                self.urls_to_crawl.add(url)

    def run(self):
        '''
        Fetches the page, then extracts and filters its links. Returns
        the set of same-site URLs to crawl; empty if any step fails.
        '''
        self.urls_to_crawl = set()
        try:
            self.get_source()
            self.find_links()
            self.parse_urls()
        except Exception as e:
            print(e)
        return self.urls_to_crawl
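
# A minimal usage sketch for WebPage (illustrative only; example.com
# is a placeholder target):
#
#     page = WebPage('http://example.com')
#     internal_urls = page.run()  # set of same-site URLs to crawl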


def sanitise_url(url):
    '''
    Attempt to standardise the base URL to ensure it can be prepended
    to relative URLs. If no scheme is provided we default to http, as
    any sane https-only site should 301 redirect http to https.
    Returns a corrected base URL as a string.
    '''
    default_proto = 'http'
    delim = '://'
    split_url = urlsplit(url)
    if split_url.scheme and split_url.scheme.startswith('http'):
        base_url = ''.join([split_url.scheme, delim, split_url.netloc])
    elif split_url.path and not split_url.scheme and not split_url.netloc:
        # A bare hostname ends up in .path when no scheme is given.
        base_url = ''.join([default_proto, delim, split_url.path])
    else:
        raise ValueError('Cannot sanitise URL: {}'.format(url))
    return base_url


def qualify_url(base_url, url):
    '''
    Ensure any URL discovered is absolute. Root-relative URLs are
    joined to the base URL; same-site absolute URLs are returned
    as-is. Returns None for anything else (e.g. external links).
    '''
    if url.startswith('/'):
        return urljoin(base_url, url)
    if url.startswith(base_url):
        return url
    return None
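

if __name__ == '__main__':
    # Illustrative smoke test, not part of the original module:
    # 'example.com' is a placeholder host, and no network request is
    # made since only the pure URL helpers are exercised here.
    base = sanitise_url('example.com')
    print(base)                                      # http://example.com
    print(qualify_url(base_url=base, url='/about'))  # http://example.com/about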