added comments and docstrings

2018-09-09 22:49:55 +01:00
parent f16f82fdfb
commit 9e125dfae0


@@ -48,6 +48,7 @@ class WebPage(object):
         the data from each individual page.
         '''
+        # set a sane user-agent and request compression if available.
         headers = {'Accept-Encoding': 'gzip, deflate',
                    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
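
For context, this is roughly how a header dict like the one above gets attached to a request with the standard library; the URL below is only a placeholder, not part of the project.

import urllib.request

# ask the server for a compressed body and present a browser-like user-agent.
headers = {'Accept-Encoding': 'gzip, deflate',
           'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}

request = urllib.request.Request('https://example.com/', headers=headers)
with urllib.request.urlopen(request, timeout=5) as page:
    # the server reports how (or whether) it compressed the body.
    print(page.info().get('Content-Encoding'))
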
@@ -67,6 +68,7 @@ class WebPage(object):
         request = urllib.request.Request(self.url, headers=self.headers)
         page = urllib.request.urlopen(request, timeout=5)
+        # handle the content encoding in case it needs decompressing.
         if 'text/html' in page.info().get('Content-Type'):
             if page.info().get('Content-Encoding'):
                 if page.info().get('Content-Encoding') == 'gzip':
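
A small sketch of the decompression step this branch is leading into, assuming the body has already been read as bytes: gzip.decompress handles gzip, and zlib covers the usual deflate case.

import gzip
import zlib

def decode_body(raw_bytes, content_encoding):
    # gzip bodies carry a gzip header that gzip.decompress understands.
    if content_encoding == 'gzip':
        return gzip.decompress(raw_bytes)
    # most servers send 'deflate' as a zlib-wrapped stream.
    if content_encoding == 'deflate':
        return zlib.decompress(raw_bytes)
    # no Content-Encoding (or an unknown one): return the bytes untouched.
    return raw_bytes
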
@@ -101,8 +103,7 @@ class WebPage(object):
         Iterate through the list of discovered URLs and add them to the
         pool if they start with the base URL.
         '''
-        for url in self.discovered_hrefs:
+        for url in self.discovered_hrefs: #handle no hrefs found
             if url.startswith(self.url):
                 if self.robots.check(url):
                     sanitised_url = sanitise_url(url=url)
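
The same filtering logic, sketched as a standalone function. Note that sanitise_url() is the project's own helper; urldefrag below is only a hypothetical stand-in that strips '#fragment' parts.

from urllib.parse import urldefrag

def filter_hrefs(base_url, discovered_hrefs, robots):
    urls_to_crawl = set()
    for url in discovered_hrefs:
        # only follow links that stay on the site we started from.
        if not url.startswith(base_url):
            continue
        # respect robots.txt before queueing the link.
        if not robots.check(url):
            continue
        # stand-in sanitising: drop fragments so near-duplicates collapse.
        sanitised_url, _ = urldefrag(url)
        urls_to_crawl.add(sanitised_url)
    return urls_to_crawl
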
@@ -111,16 +112,21 @@ class WebPage(object):
     def list_urls(self):
         '''
-        Returns the contents of the
+        Returns all valid discovered URLs.
         '''
         return self.urls_to_crawl

     def run(self):
+        '''
+        Attempt to get the page's source and if successful, iterate through it
+        to find any links we can crawl.
+        '''
         try:
             self.get_source()
         except Exception:
+            # skip if we didn't retrieve the source.
             pass

         if self.source:
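
In isolation, the run() pattern above (fetch, swallow any failure, and only carry on when something came back) looks like this; the URL is a placeholder.

import urllib.request

source = None
try:
    with urllib.request.urlopen('https://example.com/', timeout=5) as page:
        source = page.read()
except Exception:
    # skip this page if we couldn't retrieve the source.
    pass

if source:
    # only go looking for links when the fetch actually succeeded.
    print(len(source), 'bytes fetched')
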
@@ -157,7 +163,7 @@ class RobotsTxt(object):
     def check(self, url):
         '''
-        needs a docstring
+        Test if robots allows us to crawl that URL.
         '''
         return self.robots.can_fetch("*", url)
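
can_fetch("*", url) matches the signature of the standard library's robots.txt parser, so self.robots is presumably a urllib.robotparser.RobotFileParser; used on its own it looks like this (placeholder URLs).

import urllib.robotparser

robots = urllib.robotparser.RobotFileParser()
robots.set_url('https://example.com/robots.txt')
robots.read()   # fetch and parse robots.txt
# True if the rules allow any user-agent ('*') to crawl the page.
print(robots.can_fetch('*', 'https://example.com/some/page'))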