Improve detection and handling of gzip/deflate-encoded response data

2018-09-09 11:21:46 +01:00
parent 1b005570ee
commit 9e754a5584
1 changed files with 10 additions and 7 deletions
--- a/utils/helpers.py
+++ b/utils/helpers.py
@@ -65,13 +65,16 @@ class WebPage(object):
        '''

        request = urllib.request.Request(self.url, headers=self.headers)
-        page = urllib.request.urlopen(request, timeout=5) # handle
-        headers = page.info()
-        print(headers['content-type'])
-        if "gzip" in headers['content-type']:
-            self.source = gzip.decompress(page.read())
-        elif "text/html" in headers['content-type'] or "deflate" in headers['content-type']:
-            self.source = page.read()
+        page = urllib.request.urlopen(request, timeout=5)
+
+        if 'text/html' in page.info().get('Content-Type'):
+            if page.info().get('Content-Encoding'):
+                if page.info().get('Content-Encoding') == 'gzip':
+                    self.source = gzip.decompress(page.read())
+                elif page.info().get('Content-Encoding') == 'deflate':
+                    self.source = page.read()
+            else:
+                self.source = page.read()


    def find_links(self):