diff --git a/utils/helpers.py b/utils/helpers.py
index e17a927..505c6f4 100644
--- a/utils/helpers.py
+++ b/utils/helpers.py
@@ -6,7 +6,9 @@ Utilities to provide various misc functions.
 
 import aiohttp
 import asyncio
 from bs4 import BeautifulSoup
+import urllib.error
 from urllib.parse import urljoin, urlsplit
+import urllib.request
 import urllib.robotparser
 
@@ -151,7 +153,9 @@ class RobotsTxt(object):
 
     def __init__(self, base_url=None):
        '''
-        Manually retrieve robots.txt to allow us to set the user-agent.
+        Manually retrieve robots.txt to allow us to set the user-agent (works
+        around sites which disallow access to robots.txt without a sane
+        user-agent).
        '''
        self.base_url = base_url
        self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
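
Note: the diff only shows the new imports and the expanded docstring, not the fetch itself. As a rough sketch of the approach the docstring describes (retrieving robots.txt manually with urllib.request so a custom User-Agent can be sent, then handing the body to urllib.robotparser), something like the following would work. The helper name fetch_robots_txt and its fallback behaviour are illustrative assumptions, not the repository's actual implementation.

    import urllib.error
    import urllib.request
    import urllib.robotparser
    from urllib.parse import urljoin

    def fetch_robots_txt(base_url, user_agent):
        '''Fetch robots.txt with an explicit User-Agent and return a parser.'''
        robots_url = urljoin(base_url, '/robots.txt')
        request = urllib.request.Request(robots_url,
                                         headers={'User-Agent': user_agent})
        parser = urllib.robotparser.RobotFileParser(robots_url)
        try:
            with urllib.request.urlopen(request) as response:
                body = response.read().decode('utf-8', errors='ignore')
        except urllib.error.URLError:
            # If robots.txt cannot be fetched, parse no rules so every URL is
            # allowed (an assumption about the desired fallback behaviour).
            parser.parse([])
            return parser
        parser.parse(body.splitlines())
        return parser

    # Example usage (hypothetical):
    # rp = fetch_robots_txt('https://example.com/',
    #                       'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0')
    # print(rp.can_fetch('Mozilla/5.0', 'https://example.com/some/page'))

Sending a browser-like User-Agent on the robots.txt request itself is the point of the workaround: some sites return 403 or junk to the default Python-urllib agent, which would otherwise poison the parser.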