more documentation and add back some required imports
@@ -6,7 +6,9 @@ Utilities to provide various misc functions.
 import aiohttp
 import asyncio
 from bs4 import BeautifulSoup
+import urllib.error
 from urllib.parse import urljoin, urlsplit
+import urllib.request
 import urllib.robotparser
 
 
@@ -151,7 +153,9 @@ class RobotsTxt(object):
 
     def __init__(self, base_url=None):
         '''
-        Manually retrieve robots.txt to allow us to set the user-agent.
+        Manually retrieve robots.txt to allow us to set the user-agent (works
+        around sites which disallow access to robots.txt without a sane
+        user-agent).
         '''
         self.base_url = base_url
         self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
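
The diff shows only the imports and the expanded docstring; the retrieval code itself is not visible here. As a rough sketch of the workaround the docstring describes, the class could fetch robots.txt with urllib.request using the custom User-Agent header and feed the result to urllib.robotparser. The class name RobotsTxtSketch, the _fetch helper, and the can_fetch wrapper below are assumptions for illustration, not taken from the repository:

import urllib.error
import urllib.request
import urllib.robotparser
from urllib.parse import urljoin


class RobotsTxtSketch(object):
    def __init__(self, base_url=None):
        '''
        Manually retrieve robots.txt to allow us to set the user-agent (works
        around sites which disallow access to robots.txt without a sane
        user-agent).
        '''
        self.base_url = base_url
        self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
        self.parser = urllib.robotparser.RobotFileParser()
        if base_url is not None:
            self._fetch()

    def _fetch(self):
        # Fetch robots.txt ourselves so the request carries our User-Agent,
        # then hand the lines to RobotFileParser.parse() rather than letting
        # RobotFileParser.read() fetch it with the default Python user-agent.
        robots_url = urljoin(self.base_url, '/robots.txt')
        request = urllib.request.Request(robots_url, headers=self.headers)
        try:
            with urllib.request.urlopen(request) as response:
                lines = response.read().decode('utf-8', errors='ignore').splitlines()
        except urllib.error.URLError:
            # An unreachable or missing robots.txt is treated as allowing everything.
            lines = []
        self.parser.parse(lines)

    def can_fetch(self, url):
        return self.parser.can_fetch(self.headers['User-Agent'], url)

Usage would look like rt = RobotsTxtSketch('https://example.com') followed by rt.can_fetch('https://example.com/some/page'); again, whether the real class exposes a can_fetch-style wrapper is not shown in this diff.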