manually retrieve robots.txt to ensure we can set the user-agent

2018-09-07 12:40:12 +01:00
parent ab0ab0a010
commit fdd84a8786


@@ -6,6 +6,7 @@ Utilities to provide various misc functions.
 from bs4 import BeautifulSoup
 import urllib.request
 import urllib.robotparser
+import urllib.error
 from urllib.parse import (urljoin, urlsplit)
@@ -60,8 +61,9 @@ class WebPage(object):
         '''
         request = urllib.request.Request(self.url, headers=self.headers)
-        page = urllib.request.urlopen(request, timeout=5)
+        page = urllib.request.urlopen(request, timeout=5)  # handle
         headers = page.info()
+        print(headers['content-type'])
         if "text/html" in headers['content-type']:
             self.source = page.read()
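The "# handle" note above appears to flag missing error handling around urlopen and the content-type check. A minimal sketch of what that could look like, written as a standalone helper rather than the committed WebPage method; the name fetch_source and its signature are placeholders, not part of this commit:

    import urllib.error
    import urllib.request

    def fetch_source(url, headers):
        # Hypothetical helper mirroring WebPage's fetch step.
        request = urllib.request.Request(url, headers=headers)
        try:
            page = urllib.request.urlopen(request, timeout=5)
        except urllib.error.URLError as err:
            # URLError also covers HTTPError; log and give up on this page.
            print(err)
            return None
        if "text/html" in page.info().get('content-type', ''):
            return page.read()
        # Non-HTML responses (images, PDFs, ...) are skipped.
        return None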
@@ -73,7 +75,7 @@ class WebPage(object):
         '''
         hrefs = set()
-        soup = BeautifulSoup(self.source, 'html.parser')
+        soup = BeautifulSoup(self.source, 'html.parser')  # handle no source
         links = soup.find_all('a', href=True)
         for link in links:
@@ -92,7 +94,7 @@ class WebPage(object):
         '''
         self.urls_to_crawl = set()
-        for url in self.discovered_hrefs:
+        for url in self.discovered_hrefs:  #handle no hrefs found
             if url.startswith(self.url):
                 if self.robots.check(url):
                     sanitised_url = sanitise_url(url=url)
@@ -131,13 +133,22 @@ class RobotsTxt(object):
     def __init__(self, base_url=None):
         self.base_url = base_url
+        self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
+        robots_url = urljoin(self.base_url, 'robots.txt')
+        request = urllib.request.Request(robots_url, headers=self.headers)
+        try:
+            response = urllib.request.urlopen(request, timeout=5)
+        except urllib.error.HTTPError as err:
+            print(err)
+        else:
+            data = response.read()
+            decoded_data = data.decode("utf-8").splitlines()
         robots = urllib.robotparser.RobotFileParser()
-        robots.set_url(urljoin(self.base_url, 'robots.txt'))
-        try:
-            robots.read()
-        except Exception as e:
-            print(e)
+        robots.set_url(robots_url)
+        robots.parse(decoded_data)
         self.robots = robots
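Taken together, the change works around the fact that RobotFileParser.read() fetches robots.txt itself and offers no way to set request headers. A minimal standalone sketch of the same pattern; the function name fetch_robots and the example URL are illustrative only, not part of the commit:

    import urllib.error
    import urllib.request
    import urllib.robotparser
    from urllib.parse import urljoin

    def fetch_robots(base_url, user_agent):
        # Fetch robots.txt ourselves so the request carries our User-Agent.
        robots_url = urljoin(base_url, 'robots.txt')
        request = urllib.request.Request(robots_url, headers={'User-Agent': user_agent})
        robots = urllib.robotparser.RobotFileParser()
        robots.set_url(robots_url)
        try:
            response = urllib.request.urlopen(request, timeout=5)
        except urllib.error.URLError as err:
            # Could not fetch robots.txt; the parser is left unpopulated.
            print(err)
        else:
            # Feed the decoded lines to the stdlib parser instead of calling read().
            robots.parse(response.read().decode("utf-8").splitlines())
        return robots

    # Usage: ask whether our user-agent may crawl a given URL.
    ua = 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'
    robots = fetch_robots('https://example.com/', ua)
    print(robots.can_fetch(ua, 'https://example.com/some/page'))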