more documentation and add back some required imports

2018-09-16 09:00:43 +01:00
parent 7bc9fe0679
commit 336517e84a


@@ -6,7 +6,9 @@ Utilities to provide various misc functions.
 import aiohttp
 import asyncio
 from bs4 import BeautifulSoup
+import urllib.error
 from urllib.parse import urljoin, urlsplit
+import urllib.request
 import urllib.robotparser
@@ -151,7 +153,9 @@ class RobotsTxt(object):
     def __init__(self, base_url=None):
         '''
-        Manually retrieve robots.txt to allow us to set the user-agent.
+        Manually retrieve robots.txt to allow us to set the user-agent (works
+        around sites which disallow access to robots.txt without a sane
+        user-agent).
         '''
         self.base_url = base_url
         self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
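
A minimal sketch of the pattern the re-added imports suggest: fetch robots.txt ourselves with urllib.request so we control the User-Agent header, then hand the body to urllib.robotparser instead of letting RobotFileParser.read() announce itself as the default Python-urllib agent (which some sites reject). Only __init__, base_url, and headers appear in the diff; the _fetch helper, the parser attribute, and the can_fetch wrapper below are assumptions for illustration, not the actual implementation.

import urllib.error
import urllib.request
import urllib.robotparser
from urllib.parse import urljoin


class RobotsTxt(object):
    def __init__(self, base_url=None):
        '''
        Manually retrieve robots.txt to allow us to set the user-agent (works
        around sites which disallow access to robots.txt without a sane
        user-agent).
        '''
        self.base_url = base_url
        self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; '
                                      'rv:61.0) Gecko/20100101 Firefox/61.0'}
        # Hypothetical: hold a stdlib parser and populate it ourselves.
        self.parser = urllib.robotparser.RobotFileParser()
        if base_url is not None:
            self._fetch()

    def _fetch(self):
        # Hypothetical helper: request robots.txt with our own headers
        # rather than via RobotFileParser.read(), which would send the
        # default Python-urllib user-agent.
        url = urljoin(self.base_url, '/robots.txt')
        request = urllib.request.Request(url, headers=self.headers)
        try:
            with urllib.request.urlopen(request) as response:
                body = response.read().decode('utf-8', errors='replace')
        except urllib.error.URLError:
            # Treat an unreadable robots.txt as allowing everything.
            self.parser.parse([])
            return
        self.parser.parse(body.splitlines())

    def can_fetch(self, user_agent, url):
        # Hypothetical convenience wrapper around RobotFileParser.
        return self.parser.can_fetch(user_agent, url)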