diff --git a/utils/helpers.py b/utils/helpers.py
index 0945238..6a43cba 100644
--- a/utils/helpers.py
+++ b/utils/helpers.py
@@ -48,6 +48,7 @@ class WebPage(object):
         the data from each individual page.
         '''
 
+        # set a sane user-agent and request compression if available.
         headers = {'Accept-Encoding': 'gzip, deflate',
                    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
 
@@ -67,6 +68,7 @@ class WebPage(object):
         request = urllib.request.Request(self.url, headers=self.headers)
         page = urllib.request.urlopen(request, timeout=5)
 
+        # check the Content-Encoding header in case the response needs decompressing.
         if 'text/html' in page.info().get('Content-Type'):
             if page.info().get('Content-Encoding'):
                 if page.info().get('Content-Encoding') == 'gzip':
@@ -101,8 +103,7 @@ class WebPage(object):
         Iterate through the list of discovered URLs and add them to the pool
         if they start with the base URL.
         '''
-
-        for url in self.discovered_hrefs: #handle no hrefs found
+        for url in self.discovered_hrefs:
             if url.startswith(self.url):
                 if self.robots.check(url):
                     sanitised_url = sanitise_url(url=url)
@@ -111,16 +112,21 @@ class WebPage(object):
 
     def list_urls(self):
         '''
-        Returns the contents of the
+        Returns all valid discovered URLs.
         '''
 
         return self.urls_to_crawl
 
     def run(self):
+        '''
+        Attempt to get the page's source and, if successful, iterate through it
+        to find any links we can crawl.
+        '''
 
         try:
             self.get_source()
         except Exception:
+            # skip if we didn't retrieve the source.
             pass
 
         if self.source:
@@ -157,7 +163,7 @@ class RobotsTxt(object):
 
     def check(self, url):
         '''
-        needs a docstring
+        Check whether robots.txt allows us to crawl the given URL.
         '''
 
         return self.robots.can_fetch("*", url)
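
For reference, the decompression that the second hunk's comment refers to could look roughly like the sketch below. This is a minimal, hypothetical example and not part of the patch: it assumes the response comes from urllib.request.urlopen() with the Accept-Encoding header set as above, and the URL is a placeholder.

# Minimal sketch (not part of the patch): decompressing a urllib response
# based on its Content-Encoding header. The URL below is a placeholder.
import gzip
import urllib.request
import zlib

request = urllib.request.Request('https://example.com',
                                 headers={'Accept-Encoding': 'gzip, deflate'})
page = urllib.request.urlopen(request, timeout=5)

raw = page.read()
encoding = page.info().get('Content-Encoding')

if encoding == 'gzip':
    # gzip.decompress strips the gzip wrapper and returns the raw bytes.
    source = gzip.decompress(raw).decode('utf-8', errors='replace')
elif encoding == 'deflate':
    # servers send either zlib-wrapped or raw deflate; try both.
    try:
        source = zlib.decompress(raw).decode('utf-8', errors='replace')
    except zlib.error:
        source = zlib.decompress(raw, -zlib.MAX_WBITS).decode('utf-8', errors='replace')
else:
    source = raw.decode('utf-8', errors='replace')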