adjust robots handling to deal with 404s and enforce a user agent which allows us to obtain the robots.txt in the first place
@@ -138,16 +138,16 @@ class RobotsTxt(object):
         robots_url = urljoin(self.base_url, 'robots.txt')
         request = urllib.request.Request(robots_url, headers=self.headers)
 
+        robots = urllib.robotparser.RobotFileParser()
+        robots.set_url(robots_url)
+
         try:
             response = urllib.request.urlopen(request, timeout=5)
-        except urllib.error.HTTPError as err:
-            print(err)
+        except urllib.error.HTTPError:
+            robots.allow_all = True
         else:
             data = response.read()
             decoded_data = data.decode("utf-8").splitlines()
-
-            robots = urllib.robotparser.RobotFileParser()
-            robots.set_url(robots_url)
             robots.parse(decoded_data)
 
         self.robots = robots
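For context, a minimal self-contained sketch of the same pattern outside the class: fetch robots.txt with an explicit User-Agent header, treat an HTTP error such as a 404 as "allow everything", and otherwise feed the body to the parser. The function name fetch_robots and the example agent string are illustrative assumptions, not taken from this repository; allow_all is a real attribute of urllib.robotparser.RobotFileParser that can_fetch() honours.

    # Sketch only: names below are illustrative, not from this repo.
    import urllib.error
    import urllib.request
    import urllib.robotparser
    from urllib.parse import urljoin

    def fetch_robots(base_url, headers=None):
        robots_url = urljoin(base_url, 'robots.txt')
        request = urllib.request.Request(robots_url, headers=headers or {})

        robots = urllib.robotparser.RobotFileParser()
        robots.set_url(robots_url)

        try:
            response = urllib.request.urlopen(request, timeout=5)
        except urllib.error.HTTPError:
            # robots.txt missing or blocked (e.g. 404): fall back to allowing all URLs
            robots.allow_all = True
        else:
            decoded_data = response.read().decode("utf-8").splitlines()
            robots.parse(decoded_data)

        return robots

    # Example usage (hypothetical agent string):
    # robots = fetch_robots("https://example.com/", {"User-Agent": "my-crawler/1.0"})
    # robots.can_fetch("my-crawler/1.0", "https://example.com/some/page")

The manual fetch is the point of the commit: RobotFileParser.read() already maps a 4xx response to allow_all, but it cannot send custom request headers, so fetching with urllib.request.Request and then calling parse() is what lets a site that rejects the default urllib user agent still serve its robots.txt.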