Manually retrieve robots.txt so the request can carry a custom User-Agent header

2018-09-07 12:40:12 +01:00
parent ab0ab0a010
commit fdd84a8786
1 changed file with 19 additions and 8 deletions
@@ -6,6 +6,7 @@ Utilities to provide various misc functions.
 from bs4 import BeautifulSoup
 import urllib.request
 import urllib.robotparser
+import urllib.error
 from urllib.parse import (urljoin, urlsplit)


@@ -60,8 +61,9 @@ class WebPage(object):
        '''

        request = urllib.request.Request(self.url, headers=self.headers)
-        page = urllib.request.urlopen(request, timeout=5)
+        page = urllib.request.urlopen(request, timeout=5) # handle
        headers = page.info()
+        print(headers['content-type'])
        if "text/html" in headers['content-type']:
            self.source = page.read()

@@ -73,7 +75,7 @@ class WebPage(object):
        '''
        hrefs = set()

-        soup = BeautifulSoup(self.source, 'html.parser')
+        soup = BeautifulSoup(self.source, 'html.parser') # handle no source
        links = soup.find_all('a', href=True)

        for link in links:
@@ -92,7 +94,7 @@ class WebPage(object):
        '''
        self.urls_to_crawl = set()

-        for url in self.discovered_hrefs:
+        for url in self.discovered_hrefs: #handle no hrefs found
            if url.startswith(self.url):
                if self.robots.check(url):
                    sanitised_url = sanitise_url(url=url)
@@ -131,13 +133,22 @@ class RobotsTxt(object):

    def __init__(self, base_url=None):
        self.base_url = base_url
+        self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
+
+        robots_url = urljoin(self.base_url, 'robots.txt')
+        request = urllib.request.Request(robots_url, headers=self.headers)
+
+        try:
+            response = urllib.request.urlopen(request, timeout=5)
+        except urllib.error.HTTPError as err:
+            print(err)
+        else:
+            data = response.read()
+            decoded_data = data.decode("utf-8").splitlines()

        robots = urllib.robotparser.RobotFileParser()
-        robots.set_url(urljoin(self.base_url, 'robots.txt'))
-        try:
-            robots.read()
-        except Exception as e:
-            print(e)
+        robots.set_url(robots_url)
+        robots.parse(decoded_data)

        self.robots = robots