Request gzip compression and handle gzip-compressed responses

2018-09-09 10:53:09 +01:00
parent 17fa9f93f9
commit 1b005570ee
1 changed file with 6 additions and 2 deletions
--- a/utils/helpers.py
+++ b/utils/helpers.py
@@ -7,6 +7,7 @@ from bs4 import BeautifulSoup
 import urllib.request
 import urllib.robotparser
 import urllib.error
 import gzip
 from urllib.parse import (urljoin, urlsplit)
@@ -47,7 +48,8 @@ class WebPage(object):
    the data from each individual page.
    '''
-    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
+    headers = {'Accept-Encoding': 'gzip, deflate',
               'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
    def __init__(self, url=None, base_url=None, robots=None):
        self.url = url
@@ -66,7 +68,9 @@ class WebPage(object):
        page = urllib.request.urlopen(request, timeout=5) # handle
        headers = page.info()
        print(headers['content-type'])
-        if "text/html" in headers['content-type']:
+        if "gzip" in headers['content-type']:
            self.source = gzip.decompress(page.read())
        elif "text/html" in headers['content-type'] or "deflate" in headers['content-type']:
            self.source = page.read()