From fdd84a8786cc32af90b62b63e10e87d5ec707140 Mon Sep 17 00:00:00 2001
From: Simon Weald
Date: Fri, 7 Sep 2018 12:40:12 +0100
Subject: [PATCH] manually retrieve robots.txt to ensure we can set the user-agent

---
 utils/helpers.py | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/utils/helpers.py b/utils/helpers.py
index 94fe187..dc522fa 100644
--- a/utils/helpers.py
+++ b/utils/helpers.py
@@ -6,6 +6,7 @@ Utilities to provide various misc functions.
 from bs4 import BeautifulSoup
 import urllib.request
 import urllib.robotparser
+import urllib.error
 
 from urllib.parse import (urljoin, urlsplit)
 
@@ -60,8 +61,9 @@ class WebPage(object):
         '''
 
         request = urllib.request.Request(self.url, headers=self.headers)
-        page = urllib.request.urlopen(request, timeout=5)
+        page = urllib.request.urlopen(request, timeout=5) # handle
         headers = page.info()
+        print(headers['content-type'])
 
         if "text/html" in headers['content-type']:
             self.source = page.read()
@@ -73,7 +75,7 @@ class WebPage(object):
         '''
 
         hrefs = set()
-        soup = BeautifulSoup(self.source, 'html.parser')
+        soup = BeautifulSoup(self.source, 'html.parser') # handle no source
         links = soup.find_all('a', href=True)
 
         for link in links:
@@ -92,7 +94,7 @@ class WebPage(object):
         '''
 
         self.urls_to_crawl = set()
-        for url in self.discovered_hrefs:
+        for url in self.discovered_hrefs: #handle no hrefs found
             if url.startswith(self.url):
                 if self.robots.check(url):
                     sanitised_url = sanitise_url(url=url)
@@ -131,13 +133,22 @@ class RobotsTxt(object):
     def __init__(self, base_url=None):
         self.base_url = base_url
+        self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
+
+        robots_url = urljoin(self.base_url, 'robots.txt')
+        request = urllib.request.Request(robots_url, headers=self.headers)
+
+        try:
+            response = urllib.request.urlopen(request, timeout=5)
+        except urllib.error.HTTPError as err:
+            print(err)
+        else:
+            data = response.read()
+            decoded_data = data.decode("utf-8").splitlines()
 
         robots = urllib.robotparser.RobotFileParser()
-        robots.set_url(urljoin(self.base_url, 'robots.txt'))
-        try:
-            robots.read()
-        except Exception as e:
-            print(e)
+        robots.set_url(robots_url)
+        robots.parse(decoded_data)
 
         self.robots = robots
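
For reference, a minimal standalone sketch of the technique this patch uses: fetch robots.txt yourself with an explicit User-Agent header, then hand the body to RobotFileParser.parse() instead of calling read(), which would re-fetch the file with Python's default User-Agent. BASE_URL, USER_AGENT and the empty-ruleset fallback below are illustrative assumptions of the sketch, not part of the commit.

# Sketch: manual robots.txt retrieval with a custom User-Agent.
import urllib.error
import urllib.request
import urllib.robotparser
from urllib.parse import urljoin

BASE_URL = "https://example.com/"          # hypothetical site
USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0"

robots_url = urljoin(BASE_URL, "robots.txt")
request = urllib.request.Request(robots_url, headers={"User-Agent": USER_AGENT})

robots = urllib.robotparser.RobotFileParser()
robots.set_url(robots_url)

try:
    response = urllib.request.urlopen(request, timeout=5)
except urllib.error.URLError as err:
    # Assumption of this sketch: treat an unreachable or missing robots.txt
    # as "no rules", i.e. everything is allowed.
    print(err)
    robots.parse([])
else:
    # Feed the manually fetched body to the parser line by line.
    robots.parse(response.read().decode("utf-8").splitlines())

# can_fetch() consults the parsed rules for the given user agent.
print(robots.can_fetch(USER_AGENT, urljoin(BASE_URL, "some/page")))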