added comments and docstrings
@@ -48,6 +48,7 @@ class WebPage(object):
     the data from each individual page.
     '''
 
+    # set a sane user-agent and request compression if available.
     headers = {'Accept-Encoding': 'gzip, deflate',
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
 
@@ -67,6 +68,7 @@ class WebPage(object):
         request = urllib.request.Request(self.url, headers=self.headers)
         page = urllib.request.urlopen(request, timeout=5)
 
+        # handle the content encoding in case it needs decompressing.
         if 'text/html' in page.info().get('Content-Type'):
             if page.info().get('Content-Encoding'):
                 if page.info().get('Content-Encoding') == 'gzip':
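The comment added in this hunk refers to decompressing the response body when the server honours the gzip Accept-Encoding header requested earlier. As a rough, self-contained sketch of that pattern (the fetch_html helper below is illustrative only and not part of this project; like the diff, it handles gzip but not deflate):

import gzip
import urllib.request

def fetch_html(url, headers, timeout=5):
    # Illustrative helper (not from the project): fetch a page and
    # gunzip the body if the server compressed it.
    request = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(request, timeout=timeout) as page:
        if 'text/html' not in (page.info().get('Content-Type') or ''):
            return None
        body = page.read()
        if page.info().get('Content-Encoding') == 'gzip':
            body = gzip.decompress(body)
        return body.decode('utf-8', errors='replace')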
@@ -101,8 +103,7 @@ class WebPage(object):
         Iterate through the list of discovered URLs and add them to the
         pool if they start with the base URL.
         '''
-
-        for url in self.discovered_hrefs:
+        for url in self.discovered_hrefs: #handle no hrefs found
             if url.startswith(self.url):
                 if self.robots.check(url):
                     sanitised_url = sanitise_url(url=url)
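The loop changed here keeps only links that sit under the base URL and that robots.txt permits. A standalone sketch of that filtering step, assuming an object with a check(url) method like the RobotsTxt class later in this commit (the filter_crawlable name is made up for illustration, and the project's sanitise_url() step is left out):

def filter_crawlable(discovered_hrefs, base_url, robots):
    # Keep same-site links that the robots rules allow; mirrors the
    # loop in the hunk above, minus the sanitising step.
    urls_to_crawl = set()
    for url in discovered_hrefs:
        if url.startswith(base_url) and robots.check(url):
            urls_to_crawl.add(url)
    return urls_to_crawl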
@@ -111,16 +112,21 @@ class WebPage(object):
 
     def list_urls(self):
         '''
-        Returns the contents of the
+        Returns all valid discovered URLs.
         '''
 
         return self.urls_to_crawl
 
 
     def run(self):
+        '''
+        Attempt to get the page's source and if successful, iterate through it
+        to find any links we can crawl.
+        '''
         try:
             self.get_source()
         except Exception:
+            # skip if we didn't retrieve the source.
             pass
 
         if self.source:
@@ -157,7 +163,7 @@ class RobotsTxt(object):
 
     def check(self, url):
         '''
-        needs a docstring
+        Test if robots allows us to crawl that URL.
         '''
         return self.robots.can_fetch("*", url)
 
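The call self.robots.can_fetch("*", url) matches the standard library's urllib.robotparser API. A minimal illustration of that API on its own (how the project actually constructs its parser isn't shown in this hunk, so the setup below is an assumption):

from urllib import robotparser

parser = robotparser.RobotFileParser()
parser.set_url('https://example.com/robots.txt')
parser.read()  # fetch and parse robots.txt

# True if the rules let any user agent ("*") fetch this URL.
allowed = parser.can_fetch('*', 'https://example.com/some/page')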