From fdd84a8786cc32af90b62b63e10e87d5ec707140 Mon Sep 17 00:00:00 2001
From: Simon Weald
Date: Fri, 7 Sep 2018 12:40:12 +0100
Subject: [PATCH] manually retrieve robots.txt to ensure we can set the user-agent

---
 utils/helpers.py | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/utils/helpers.py b/utils/helpers.py
index 94fe187..dc522fa 100644
--- a/utils/helpers.py
+++ b/utils/helpers.py
@@ -6,6 +6,7 @@ Utilities to provide various misc functions.
 from bs4 import BeautifulSoup
 import urllib.request
 import urllib.robotparser
+import urllib.error
 
 from urllib.parse import (urljoin, urlsplit)
 
@@ -60,8 +61,9 @@ class WebPage(object):
         '''
 
         request = urllib.request.Request(self.url, headers=self.headers)
-        page = urllib.request.urlopen(request, timeout=5)
+        page = urllib.request.urlopen(request, timeout=5) # handle
         headers = page.info()
+        print(headers['content-type'])
 
         if "text/html" in headers['content-type']:
             self.source = page.read()
@@ -73,7 +75,7 @@ class WebPage(object):
         '''
 
         hrefs = set()
-        soup = BeautifulSoup(self.source, 'html.parser')
+        soup = BeautifulSoup(self.source, 'html.parser') # handle no source
         links = soup.find_all('a', href=True)
 
         for link in links:
@@ -92,7 +94,7 @@ class WebPage(object):
         '''
 
         self.urls_to_crawl = set()
-        for url in self.discovered_hrefs:
+        for url in self.discovered_hrefs: #handle no hrefs found
             if url.startswith(self.url):
                 if self.robots.check(url):
                     sanitised_url = sanitise_url(url=url)
@@ -131,13 +133,22 @@ class RobotsTxt(object):
     def __init__(self, base_url=None):
         self.base_url = base_url
+        self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
+
+        robots_url = urljoin(self.base_url, 'robots.txt')
+        request = urllib.request.Request(robots_url, headers=self.headers)
+
+        try:
+            response = urllib.request.urlopen(request, timeout=5)
+        except urllib.error.HTTPError as err:
+            print(err)
+        else:
+            data = response.read()
+            decoded_data = data.decode("utf-8").splitlines()
 
         robots = urllib.robotparser.RobotFileParser()
-        robots.set_url(urljoin(self.base_url, 'robots.txt'))
-        try:
-            robots.read()
-        except Exception as e:
-            print(e)
+        robots.set_url(robots_url)
+        robots.parse(decoded_data)
 
         self.robots = robots
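
For reference, a minimal standalone sketch of the technique this patch uses: fetch robots.txt yourself with an explicit User-Agent header, then hand the body to RobotFileParser.parse() instead of calling read(), which would re-fetch the file with Python's default User-Agent. BASE_URL, USER_AGENT and the empty-ruleset fallback below are illustrative assumptions of the sketch, not part of the commit.

# Sketch: manual robots.txt retrieval with a custom User-Agent.
import urllib.error
import urllib.request
import urllib.robotparser
from urllib.parse import urljoin

BASE_URL = "https://example.com/"          # hypothetical site
USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0"

robots_url = urljoin(BASE_URL, "robots.txt")
request = urllib.request.Request(robots_url, headers={"User-Agent": USER_AGENT})

robots = urllib.robotparser.RobotFileParser()
robots.set_url(robots_url)

try:
    response = urllib.request.urlopen(request, timeout=5)
except urllib.error.URLError as err:
    # Assumption of this sketch: treat an unreachable or missing robots.txt
    # as "no rules", i.e. everything is allowed.
    print(err)
    robots.parse([])
else:
    # Feed the manually fetched body to the parser line by line.
    robots.parse(response.read().decode("utf-8").splitlines())

# can_fetch() consults the parsed rules for the given user agent.
print(robots.can_fetch(USER_AGENT, urljoin(BASE_URL, "some/page")))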