Compare commits

...

2 Commits

Author SHA1 Message Date
1b005570ee implement gzip compression requests and handling 2018-09-09 10:53:09 +01:00
17fa9f93f9 tick off gzip encoding 2018-09-09 10:52:37 +01:00
2 changed files with 9 additions and 4 deletions

View File

@@ -12,6 +12,7 @@
* ~~remove base url from initial urls with and without trailing slash~~
* ~~investigate using [tldextract](https://github.com/john-kurkowski/tldextract) to match urls~~ #wontfix
* ~~implement parsing of [robots.txt](http://docs.w3cub.com/python~3.6/library/urllib.robotparser/)~~
-* investigate [gzip encoding](https://stackoverflow.com/questions/36383227/avoid-downloading-images-using-beautifulsoup-and-urllib-request)
+* ~~investigate [gzip encoding](https://stackoverflow.com/questions/36383227/avoid-downloading-images-using-beautifulsoup-and-urllib-request)~~
* implement some kind of progress display
* async
* async
* better exception handling

View File

@@ -7,6 +7,7 @@ from bs4 import BeautifulSoup
import urllib.request
import urllib.robotparser
import urllib.error
+import gzip
from urllib.parse import (urljoin, urlsplit)
@@ -47,7 +48,8 @@ class WebPage(object):
the data from each individual page.
'''
-headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
+headers = {'Accept-Encoding': 'gzip, deflate',
+'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
def __init__(self, url=None, base_url=None, robots=None):
self.url = url
@@ -66,7 +68,9 @@ class WebPage(object):
page = urllib.request.urlopen(request, timeout=5) # handle
headers = page.info()
print(headers['content-type'])
-if "text/html" in headers['content-type']:
+if "gzip" in headers['content-type']:
+self.source = gzip.decompress(page.read())
+elif "text/html" in headers['content-type'] or "deflate" in headers['content-type']:
self.source = page.read()