adjust robots handling to deal with 404s and send a user agent, which allows us to obtain robots.txt in the first place

2018-09-09 09:57:16 +01:00
parent fdd84a8786
commit 738ab8e441


@@ -138,16 +138,16 @@ class RobotsTxt(object):
 robots_url = urljoin(self.base_url, 'robots.txt')
 request = urllib.request.Request(robots_url, headers=self.headers)
+robots = urllib.robotparser.RobotFileParser()
+robots.set_url(robots_url)
 try:
     response = urllib.request.urlopen(request, timeout=5)
-except urllib.error.HTTPError as err:
-    print(err)
+except urllib.error.HTTPError:
+    robots.allow_all = True
 else:
     data = response.read()
     decoded_data = data.decode("utf-8").splitlines()
-    robots = urllib.robotparser.RobotFileParser()
-    robots.set_url(robots_url)
     robots.parse(decoded_data)
 self.robots = robots
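For context, below is a minimal, self-contained sketch of how the patched fetch could look end to end. The constructor signature, the fetch_robots and can_fetch method names, and the example User-Agent string are assumptions for illustration; the commit only shows the body of the fetch itself.

import urllib.error
import urllib.request
import urllib.robotparser
from urllib.parse import urljoin


class RobotsTxt:
    """Sketch of the patched robots.txt handling (names partly assumed)."""

    def __init__(self, base_url, user_agent="example-crawler/0.1"):
        self.base_url = base_url
        # Send an explicit User-Agent; some servers reject the default
        # Python one, which would otherwise block us before we can even
        # read robots.txt.
        self.headers = {"User-Agent": user_agent}
        self.robots = None

    def fetch_robots(self):
        robots_url = urljoin(self.base_url, 'robots.txt')
        request = urllib.request.Request(robots_url, headers=self.headers)
        # Build the parser up front so it exists even when the request fails.
        robots = urllib.robotparser.RobotFileParser()
        robots.set_url(robots_url)
        try:
            response = urllib.request.urlopen(request, timeout=5)
        except urllib.error.HTTPError:
            # No usable robots.txt (e.g. a 404): treat every URL as allowed.
            robots.allow_all = True
        else:
            data = response.read()
            decoded_data = data.decode("utf-8").splitlines()
            robots.parse(decoded_data)
        self.robots = robots

    def can_fetch(self, url):
        return self.robots.can_fetch(self.headers["User-Agent"], url)

Note that, mirroring the diff, only urllib.error.HTTPError is caught: HTTP error responses such as a 404 fall back to allow_all, while network-level failures (urllib.error.URLError) still propagate.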