From 1b005570ee19679377be6a7a5dffd34a1c56f337 Mon Sep 17 00:00:00 2001
From: Simon Weald <simon@simonweald.com>
Date: Sun, 9 Sep 2018 10:53:09 +0100
Subject: [PATCH] request gzip-compressed responses and decompress them

---
 utils/helpers.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/utils/helpers.py b/utils/helpers.py
index 10f0d5f..a21639a 100644
--- a/utils/helpers.py
+++ b/utils/helpers.py
@@ -7,6 +7,7 @@ from bs4 import BeautifulSoup
 import urllib.request
 import urllib.robotparser
 import urllib.error
+import gzip
 from urllib.parse import (urljoin, urlsplit)
 
 
@@ -47,7 +48,8 @@ class WebPage(object):
     the data from each individual page.
     '''
 
-    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
+    headers = {'Accept-Encoding': 'gzip',  # advertise only what we can decode
+               'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
 
     def __init__(self, url=None, base_url=None, robots=None):
         self.url = url
@@ -66,7 +68,9 @@ class WebPage(object):
         page = urllib.request.urlopen(request, timeout=5) # handle
         headers = page.info()
         print(headers['content-type'])
-        if "text/html" in headers['content-type']:
+        if "text/html" in headers['content-type'] and "gzip" in headers.get('Content-Encoding', ''):
+            self.source = gzip.decompress(page.read())  # compression is signalled by Content-Encoding
+        elif "text/html" in headers['content-type']:
             self.source = page.read()