Implement gzip-compressed requests and response handling
This commit is contained in:
@@ -7,6 +7,7 @@ from bs4 import BeautifulSoup
|
|||||||
import urllib.request
|
import urllib.request
|
||||||
import urllib.robotparser
|
import urllib.robotparser
|
||||||
import urllib.error
|
import urllib.error
|
||||||
|
import gzip
|
||||||
from urllib.parse import (urljoin, urlsplit)
|
from urllib.parse import (urljoin, urlsplit)
|
||||||
|
|
||||||
|
|
||||||
@@ -47,7 +48,8 @@ class WebPage(object):
|
|||||||
the data from each individual page.
|
the data from each individual page.
|
||||||
'''
|
'''
|
||||||
|
|
||||||
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
|
headers = {'Accept-Encoding': 'gzip, deflate',
|
||||||
|
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
|
||||||
|
|
||||||
def __init__(self, url=None, base_url=None, robots=None):
|
def __init__(self, url=None, base_url=None, robots=None):
|
||||||
self.url = url
|
self.url = url
|
||||||
@@ -66,7 +68,9 @@ class WebPage(object):
|
|||||||
page = urllib.request.urlopen(request, timeout=5) # handle
|
page = urllib.request.urlopen(request, timeout=5) # handle
|
||||||
headers = page.info()
|
headers = page.info()
|
||||||
print(headers['content-type'])
|
print(headers['content-type'])
|
||||||
if "text/html" in headers['content-type']:
|
if "gzip" in headers['content-type']:
|
||||||
|
self.source = gzip.decompress(page.read())
|
||||||
|
elif "text/html" in headers['content-type'] or "deflate" in headers['content-type']:
|
||||||
self.source = page.read()
|
self.source = page.read()
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user