added comments and docstrings
@@ -48,6 +48,7 @@ class WebPage(object):
     the data from each individual page.
     '''
 
+    # set a sane user-agent and request compression if available.
     headers = {'Accept-Encoding': 'gzip, deflate',
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
 
@@ -67,6 +68,7 @@ class WebPage(object):
         request = urllib.request.Request(self.url, headers=self.headers)
         page = urllib.request.urlopen(request, timeout=5)
 
+        # handle the content encoding in case it needs decompressing.
         if 'text/html' in page.info().get('Content-Type'):
             if page.info().get('Content-Encoding'):
                 if page.info().get('Content-Encoding') == 'gzip':
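The body of the gzip branch is not shown in this hunk; a minimal sketch of the decompression step it presumably performs, using only the standard library (the names raw and source are illustrative, page is the response object from the lines above):

import gzip

# read the raw body once; it may be compressed because of the Accept-Encoding header set earlier.
raw = page.read()
if page.info().get('Content-Encoding') == 'gzip':
    # gzip-encoded bodies decompress directly with the standard library.
    raw = gzip.decompress(raw)
source = raw.decode('utf-8', errors='replace')

A 'deflate' response, which the Accept-Encoding header also permits, would need the zlib module instead.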
@@ -101,8 +103,7 @@ class WebPage(object):
         Iterate through the list of discovered URLs and add them to the
         pool if they start with the base URL.
         '''
 
-        for url in self.discovered_hrefs: #handle no hrefs found
+        for url in self.discovered_hrefs:
             if url.startswith(self.url):
                 if self.robots.check(url):
                     sanitised_url = sanitise_url(url=url)
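sanitise_url itself is not part of this diff; purely as an illustration of the kind of normalisation such a helper might do before a link is queued (this is an assumption, not the project's implementation):

from urllib.parse import urldefrag

def sanitise_url(url):
    # drop any '#fragment' and a trailing slash so the same page isn't queued twice.
    url, _fragment = urldefrag(url)
    return url.rstrip('/')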
@@ -111,16 +112,21 @@ class WebPage(object):
 
     def list_urls(self):
         '''
-        Returns the contents of the
+        Returns all valid discovered URLs.
         '''
 
         return self.urls_to_crawl
 
 
     def run(self):
+        '''
+        Attempt to get the page's source and if successful, iterate through it
+        to find any links we can crawl.
+        '''
         try:
             self.get_source()
         except Exception:
+            # skip if we didn't retrieve the source.
             pass
 
         if self.source:
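Taken together, run() drives a single page: fetch the source, then filter the discovered links. A hypothetical usage sketch; the constructor's real signature is not shown in this diff, so the argument here is an assumption:

page = WebPage(url='https://example.com/')
page.run()
for url in page.list_urls():
    print(url)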
@@ -157,7 +163,7 @@ class RobotsTxt(object):
 
     def check(self, url):
         '''
-        needs a docstring
+        Test if robots allows us to crawl that URL.
         '''
         return self.robots.can_fetch("*", url)
 
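For context, can_fetch comes from the standard library's urllib.robotparser. A minimal wrapper along these lines would support the check() call above; only check() appears in the diff, so the constructor shown here is an assumption:

import urllib.robotparser

class RobotsTxt(object):

    def __init__(self, base_url):
        # fetch and parse <base_url>/robots.txt once, up front.
        self.robots = urllib.robotparser.RobotFileParser()
        self.robots.set_url(base_url.rstrip('/') + '/robots.txt')
        self.robots.read()

    def check(self, url):
        '''
        Test if robots allows us to crawl that URL.
        '''
        return self.robots.can_fetch("*", url)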