Implement requesting and handling of gzip-compressed responses

This commit is contained in:
2018-09-09 10:53:09 +01:00
parent 17fa9f93f9
commit 1b005570ee

View File

@@ -7,6 +7,7 @@ from bs4 import BeautifulSoup
import urllib.request import urllib.request
import urllib.robotparser import urllib.robotparser
import urllib.error import urllib.error
import gzip
from urllib.parse import (urljoin, urlsplit) from urllib.parse import (urljoin, urlsplit)
@@ -47,7 +48,8 @@ class WebPage(object):
the data from each individual page. the data from each individual page.
''' '''
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'} headers = {'Accept-Encoding': 'gzip, deflate',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
def __init__(self, url=None, base_url=None, robots=None): def __init__(self, url=None, base_url=None, robots=None):
self.url = url self.url = url
@@ -66,7 +68,9 @@ class WebPage(object):
page = urllib.request.urlopen(request, timeout=5) # handle page = urllib.request.urlopen(request, timeout=5) # handle
headers = page.info() headers = page.info()
print(headers['content-type']) print(headers['content-type'])
if "text/html" in headers['content-type']: if "gzip" in headers['content-type']:
self.source = gzip.decompress(page.read())
elif "text/html" in headers['content-type'] or "deflate" in headers['content-type']:
self.source = page.read() self.source = page.read()