From 1b005570ee19679377be6a7a5dffd34a1c56f337 Mon Sep 17 00:00:00 2001
From: Simon Weald
Date: Sun, 9 Sep 2018 10:53:09 +0100
Subject: [PATCH] implement gzip compression requests and handling

Ask servers for gzip-compressed responses and decompress them before the
page source is stored.

Compression is announced in the Content-Encoding response header, not in
Content-Type, so the gzip check reads Content-Encoding and still requires
a text/html Content-Type before anything is stored. Only gzip is listed in
Accept-Encoding: urllib hands back the body exactly as received, and this
change adds no deflate decoder, so deflate must not be requested.

---
Notes (ignored by git am):
- This copy of the patch had lost its line breaks and indentation. Both
  were reconstructed; context whitespace may not match the target file and
  should be checked before applying.
- The last hunk arrived without its final two context lines. Its header was
  reduced from "-66,7 +68,9" to "-66,5 +68,7" to match the lines present.
- The post-image blob id on the index line predates the header fix.

 utils/helpers.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/utils/helpers.py b/utils/helpers.py
index 10f0d5f..a21639a 100644
--- a/utils/helpers.py
+++ b/utils/helpers.py
@@ -7,6 +7,7 @@ from bs4 import BeautifulSoup
 import urllib.request
 import urllib.robotparser
 import urllib.error
+import gzip
 from urllib.parse import (urljoin, urlsplit)
 
 
@@ -47,7 +48,8 @@ class WebPage(object):
     the data from each individual page.
     '''
 
-    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
+    headers = {'Accept-Encoding': 'gzip',
+               'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
 
     def __init__(self, url=None, base_url=None, robots=None):
         self.url = url
@@ -66,5 +68,7 @@ class WebPage(object):
             page = urllib.request.urlopen(request, timeout=5)  # handle
             headers = page.info()
             print(headers['content-type'])
-            if "text/html" in headers['content-type']:
+            if "text/html" in headers['content-type'] and "gzip" in headers.get('Content-Encoding', '').lower():
+                self.source = gzip.decompress(page.read())
+            elif "text/html" in headers['content-type']:
                 self.source = page.read()