added comments and docstrings
@@ -48,6 +48,7 @@ class WebPage(object):
     the data from each individual page.
     '''
 
+    # set a sane user-agent and request compression if available.
     headers = {'Accept-Encoding': 'gzip, deflate',
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
 
@@ -67,6 +68,7 @@ class WebPage(object):
         request = urllib.request.Request(self.url, headers=self.headers)
         page = urllib.request.urlopen(request, timeout=5)
 
+        # handle the content encoding in case it needs decompressing.
         if 'text/html' in page.info().get('Content-Type'):
             if page.info().get('Content-Encoding'):
                 if page.info().get('Content-Encoding') == 'gzip':
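The body of the gzip branch is not shown in this hunk; a minimal sketch of the decompression step it presumably performs, using only the standard library (the names raw and source are illustrative, page is the response object from the lines above):

import gzip

# read the raw body once; it may be compressed because of the Accept-Encoding header set earlier.
raw = page.read()
if page.info().get('Content-Encoding') == 'gzip':
    # gzip-encoded bodies decompress directly with the standard library.
    raw = gzip.decompress(raw)
source = raw.decode('utf-8', errors='replace')

A 'deflate' response, which the Accept-Encoding header also permits, would need the zlib module instead.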
@@ -101,8 +103,7 @@ class WebPage(object):
         Iterate through the list of discovered URLs and add them to the
         pool if they start with the base URL.
         '''
 
-        for url in self.discovered_hrefs: #handle no hrefs found
+        for url in self.discovered_hrefs:
             if url.startswith(self.url):
                 if self.robots.check(url):
                     sanitised_url = sanitise_url(url=url)
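sanitise_url itself is not part of this diff; purely as an illustration of the kind of normalisation such a helper might do before a link is queued (this is an assumption, not the project's implementation):

from urllib.parse import urldefrag

def sanitise_url(url):
    # drop any '#fragment' and a trailing slash so the same page isn't queued twice.
    url, _fragment = urldefrag(url)
    return url.rstrip('/')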
@@ -111,16 +112,21 @@ class WebPage(object):
 
     def list_urls(self):
         '''
-        Returns the contents of the
+        Returns all valid discovered URLs.
         '''
 
         return self.urls_to_crawl
 
 
     def run(self):
+        '''
+        Attempt to get the page's source and if successful, iterate through it
+        to find any links we can crawl.
+        '''
         try:
             self.get_source()
         except Exception:
+            # skip if we didn't retrieve the source.
             pass
 
         if self.source:
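Taken together, run() drives a single page: fetch the source, then filter the discovered links. A hypothetical usage sketch; the constructor's real signature is not shown in this diff, so the argument here is an assumption:

page = WebPage(url='https://example.com/')
page.run()
for url in page.list_urls():
    print(url)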
@@ -157,7 +163,7 @@ class RobotsTxt(object):
 
     def check(self, url):
         '''
-        needs a docstring
+        Test if robots allows us to crawl that URL.
         '''
         return self.robots.can_fetch("*", url)
 
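For context, can_fetch comes from the standard library's urllib.robotparser. A minimal wrapper along these lines would support the check() call above; only check() appears in the diff, so the constructor shown here is an assumption:

import urllib.robotparser

class RobotsTxt(object):

    def __init__(self, base_url):
        # fetch and parse <base_url>/robots.txt once, up front.
        self.robots = urllib.robotparser.RobotFileParser()
        self.robots.set_url(base_url.rstrip('/') + '/robots.txt')
        self.robots.read()

    def check(self, url):
        '''
        Test if robots allows us to crawl that URL.
        '''
        return self.robots.can_fetch("*", url)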