adjust robots handling to deal with 404s and send a user agent, which allows us to obtain robots.txt in the first place

2018-09-09 09:57:16 +01:00
parent fdd84a8786
commit 738ab8e441


@@ -138,16 +138,16 @@ class RobotsTxt(object):
 robots_url = urljoin(self.base_url, 'robots.txt')
 request = urllib.request.Request(robots_url, headers=self.headers)
+robots = urllib.robotparser.RobotFileParser()
+robots.set_url(robots_url)
 try:
     response = urllib.request.urlopen(request, timeout=5)
-except urllib.error.HTTPError as err:
-    print(err)
+except urllib.error.HTTPError:
+    robots.allow_all = True
 else:
     data = response.read()
     decoded_data = data.decode("utf-8").splitlines()
-    robots = urllib.robotparser.RobotFileParser()
-    robots.set_url(robots_url)
     robots.parse(decoded_data)
 self.robots = robots
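For context, below is a minimal, self-contained sketch of how the patched fetch could look end to end. The constructor signature, the fetch_robots and can_fetch method names, and the example User-Agent string are assumptions for illustration; the commit only shows the body of the fetch itself.

import urllib.error
import urllib.request
import urllib.robotparser
from urllib.parse import urljoin


class RobotsTxt:
    """Sketch of the patched robots.txt handling (names partly assumed)."""

    def __init__(self, base_url, user_agent="example-crawler/0.1"):
        self.base_url = base_url
        # Send an explicit User-Agent; some servers reject the default
        # Python one, which would otherwise block us before we can even
        # read robots.txt.
        self.headers = {"User-Agent": user_agent}
        self.robots = None

    def fetch_robots(self):
        robots_url = urljoin(self.base_url, 'robots.txt')
        request = urllib.request.Request(robots_url, headers=self.headers)
        # Build the parser up front so it exists even when the request fails.
        robots = urllib.robotparser.RobotFileParser()
        robots.set_url(robots_url)
        try:
            response = urllib.request.urlopen(request, timeout=5)
        except urllib.error.HTTPError:
            # No usable robots.txt (e.g. a 404): treat every URL as allowed.
            robots.allow_all = True
        else:
            data = response.read()
            decoded_data = data.decode("utf-8").splitlines()
            robots.parse(decoded_data)
        self.robots = robots

    def can_fetch(self, url):
        return self.robots.can_fetch(self.headers["User-Agent"], url)

Note that, mirroring the diff, only urllib.error.HTTPError is caught: HTTP error responses such as a 404 fall back to allow_all, while network-level failures (urllib.error.URLError) still propagate.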