manually retrieve robots.txt to ensure we can set the user-agent

2018-09-07 12:40:12 +01:00
parent ab0ab0a010
commit fdd84a8786


@@ -6,6 +6,7 @@ Utilities to provide various misc functions.
 from bs4 import BeautifulSoup
 import urllib.request
 import urllib.robotparser
+import urllib.error
 from urllib.parse import (urljoin, urlsplit)
@@ -60,8 +61,9 @@ class WebPage(object):
         '''
         request = urllib.request.Request(self.url, headers=self.headers)
-        page = urllib.request.urlopen(request, timeout=5)
+        page = urllib.request.urlopen(request, timeout=5)  # handle
         headers = page.info()
+        print(headers['content-type'])
         if "text/html" in headers['content-type']:
             self.source = page.read()
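The "# handle" note above appears to flag missing error handling around urlopen and the content-type check. A minimal sketch of what that could look like, written as a standalone helper rather than the committed WebPage method; the name fetch_source and its signature are placeholders, not part of this commit:

    import urllib.error
    import urllib.request

    def fetch_source(url, headers):
        # Hypothetical helper mirroring WebPage's fetch step.
        request = urllib.request.Request(url, headers=headers)
        try:
            page = urllib.request.urlopen(request, timeout=5)
        except urllib.error.URLError as err:
            # URLError also covers HTTPError; log and give up on this page.
            print(err)
            return None
        if "text/html" in page.info().get('content-type', ''):
            return page.read()
        # Non-HTML responses (images, PDFs, ...) are skipped.
        return None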
@@ -73,7 +75,7 @@ class WebPage(object):
         '''
         hrefs = set()
-        soup = BeautifulSoup(self.source, 'html.parser')
+        soup = BeautifulSoup(self.source, 'html.parser')  # handle no source
         links = soup.find_all('a', href=True)
         for link in links:
@@ -92,7 +94,7 @@ class WebPage(object):
         '''
         self.urls_to_crawl = set()
-        for url in self.discovered_hrefs:
+        for url in self.discovered_hrefs:  #handle no hrefs found
             if url.startswith(self.url):
                 if self.robots.check(url):
                     sanitised_url = sanitise_url(url=url)
@@ -131,13 +133,22 @@ class RobotsTxt(object):
     def __init__(self, base_url=None):
         self.base_url = base_url
+        self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
+        robots_url = urljoin(self.base_url, 'robots.txt')
+        request = urllib.request.Request(robots_url, headers=self.headers)
+        try:
+            response = urllib.request.urlopen(request, timeout=5)
+        except urllib.error.HTTPError as err:
+            print(err)
+        else:
+            data = response.read()
+            decoded_data = data.decode("utf-8").splitlines()
         robots = urllib.robotparser.RobotFileParser()
-        robots.set_url(urljoin(self.base_url, 'robots.txt'))
-        try:
-            robots.read()
-        except Exception as e:
-            print(e)
+        robots.set_url(robots_url)
+        robots.parse(decoded_data)
         self.robots = robots
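Taken together, the change works around the fact that RobotFileParser.read() fetches robots.txt itself and offers no way to set request headers. A minimal standalone sketch of the same pattern; the function name fetch_robots and the example URL are illustrative only, not part of the commit:

    import urllib.error
    import urllib.request
    import urllib.robotparser
    from urllib.parse import urljoin

    def fetch_robots(base_url, user_agent):
        # Fetch robots.txt ourselves so the request carries our User-Agent.
        robots_url = urljoin(base_url, 'robots.txt')
        request = urllib.request.Request(robots_url, headers={'User-Agent': user_agent})
        robots = urllib.robotparser.RobotFileParser()
        robots.set_url(robots_url)
        try:
            response = urllib.request.urlopen(request, timeout=5)
        except urllib.error.URLError as err:
            # Could not fetch robots.txt; the parser is left unpopulated.
            print(err)
        else:
            # Feed the decoded lines to the stdlib parser instead of calling read().
            robots.parse(response.read().decode("utf-8").splitlines())
        return robots

    # Usage: ask whether our user-agent may crawl a given URL.
    ua = 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'
    robots = fetch_robots('https://example.com/', ua)
    print(robots.can_fetch(ua, 'https://example.com/some/page'))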