added comments and docstrings

2018-09-09 22:49:55 +01:00
parent f16f82fdfb
commit 9e125dfae0


@@ -48,6 +48,7 @@ class WebPage(object):
     the data from each individual page.
     '''
+    # set a sane user-agent and request compression if available.
     headers = {'Accept-Encoding': 'gzip, deflate',
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
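As a hedged aside (not part of this commit): headers defined this way are typically attached to the request via urllib's Request object, roughly as below; the URL is a placeholder.

import urllib.request

headers = {'Accept-Encoding': 'gzip, deflate',
           'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
request = urllib.request.Request('https://example.com/', headers=headers)
with urllib.request.urlopen(request, timeout=5) as page:
    body = page.read()  # may still be gzip/deflate compressed at this point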
@@ -67,6 +68,7 @@ class WebPage(object):
         request = urllib.request.Request(self.url, headers=self.headers)
         page = urllib.request.urlopen(request, timeout=5)
+        # handle the content encoding in case it needs decompressing.
         if 'text/html' in page.info().get('Content-Type'):
             if page.info().get('Content-Encoding'):
                 if page.info().get('Content-Encoding') == 'gzip':
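A rough sketch (not part of this commit) of the decompression step the new comment refers to, written as a standalone helper; `page` is the response returned by urlopen, and the deflate branch is an assumption since only gzip is visible here.

import gzip
import zlib

def decode_body(page):
    '''Return the response body as text, decompressing it first if needed.'''
    raw = page.read()
    encoding = page.info().get('Content-Encoding')
    if encoding == 'gzip':
        raw = gzip.decompress(raw)
    elif encoding == 'deflate':
        # zlib-wrapped deflate; a raw deflate stream would need wbits=-15.
        raw = zlib.decompress(raw)
    return raw.decode('utf-8', errors='replace')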
@@ -101,8 +103,7 @@ class WebPage(object):
         Iterate through the list of discovered URLs and add them to the
         pool if they start with the base URL.
         '''
-        for url in self.discovered_hrefs: #handle no hrefs found
+        for url in self.discovered_hrefs:
             if url.startswith(self.url):
                 if self.robots.check(url):
                     sanitised_url = sanitise_url(url=url)
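The same filtering idea, sketched as a standalone function (not part of this commit); `robots` and `sanitise_url` stand in for the project's RobotsTxt helper and URL cleaner, and returning a set is an assumption used here to avoid duplicates.

def filter_hrefs(base_url, hrefs, robots, sanitise_url):
    '''Keep only same-site links that robots.txt allows us to crawl.'''
    allowed = set()
    for url in hrefs:
        # only follow links under the base URL, and only if robots permits it
        if url.startswith(base_url) and robots.check(url):
            allowed.add(sanitise_url(url=url))
    return allowed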
@@ -111,16 +112,21 @@ class WebPage(object):
     def list_urls(self):
         '''
-        Returns the contents of the
+        Returns all valid discovered URLs.
         '''
         return self.urls_to_crawl

     def run(self):
+        '''
+        Attempt to get the page's source and if successful, iterate through it
+        to find any links we can crawl.
+        '''
         try:
             self.get_source()
         except Exception:
+            # skip if we didn't retrieve the source.
             pass
         if self.source:
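For orientation, a hedged usage sketch (not part of this commit) of how a WebPage is driven; the constructor arguments are assumptions inferred from the attributes used above (self.url, self.robots).

# Hypothetical constructor signature; only run() and list_urls() appear in the diff.
page = WebPage(url='https://example.com/', robots=robots)
page.run()                     # fetch the source; fetch errors are swallowed
for link in page.list_urls():  # valid, crawlable links found on the page
    print(link)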
@@ -157,7 +163,7 @@ class RobotsTxt(object):
     def check(self, url):
         '''
-        needs a docstring
+        Test if robots allows us to crawl that URL.
         '''
         return self.robots.can_fetch("*", url)
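The can_fetch() call used above matches the standard library's urllib.robotparser API, so RobotsTxt is presumably a thin wrapper around it; a minimal sketch of that parser (not part of this commit), with a placeholder site:

import urllib.robotparser

robots = urllib.robotparser.RobotFileParser()
robots.set_url('https://example.com/robots.txt')
robots.read()
print(robots.can_fetch('*', 'https://example.com/some/page'))  # True if allowed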