Compare commits

2 Commits

1e51e10db2...1b005570ee
| Author | SHA1 | Date |
|---|---|---|
|  | 1b005570ee |  |
|  | 17fa9f93f9 |  |
notes.md (3 changes)
```diff
@@ -12,6 +12,7 @@
 * ~~remove base url from initial urls with and without trailing slash~~
 * ~~investigate using [tldextract](https://github.com/john-kurkowski/tldextract) to match urls~~ #wontfix
 * ~~implement parsing of [robots.txt](http://docs.w3cub.com/python~3.6/library/urllib.robotparser/)~~
-* investigate [gzip encoding](https://stackoverflow.com/questions/36383227/avoid-downloading-images-using-beautifulsoup-and-urllib-request)
+* ~~investigate [gzip encoding](https://stackoverflow.com/questions/36383227/avoid-downloading-images-using-beautifulsoup-and-urllib-request)~~
+* implement some kind of progress display
 * async
 * better exception handling
```
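The crossed-off robots.txt item in these notes corresponds to Python's `urllib.robotparser` module, which is also imported in the code diff below. As a rough illustration of how that module is typically used, here is a minimal sketch; the URL and user agent are placeholders, not values taken from this repository:

```python
# Minimal sketch of urllib.robotparser usage (placeholder URL and user agent).
import urllib.robotparser

robots = urllib.robotparser.RobotFileParser()
robots.set_url("https://example.com/robots.txt")
robots.read()  # download and parse the site's robots.txt

# Ask whether this user agent may fetch a given URL before crawling it.
if robots.can_fetch("*", "https://example.com/some/page.html"):
    print("allowed by robots.txt")
else:
    print("disallowed by robots.txt")
```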
```diff
@@ -7,6 +7,7 @@ from bs4 import BeautifulSoup
 import urllib.request
 import urllib.robotparser
 import urllib.error
+import gzip
 from urllib.parse import (urljoin, urlsplit)
 
 
```
```diff
@@ -47,7 +48,8 @@ class WebPage(object):
     the data from each individual page.
     '''
 
-    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
+    headers = {'Accept-Encoding': 'gzip, deflate',
+               'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
 
     def __init__(self, url=None, base_url=None, robots=None):
         self.url = url
```
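The new `Accept-Encoding` entry advertises to servers that the client accepts compressed responses. As a rough sketch of how such a headers dict is usually attached to a request with `urllib` (the URL is a placeholder and the variable names are illustrative):

```python
# Sketch: sending an Accept-Encoding header with urllib (placeholder URL).
import urllib.request

headers = {'Accept-Encoding': 'gzip, deflate',
           'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}

request = urllib.request.Request("https://example.com/", headers=headers)
with urllib.request.urlopen(request, timeout=5) as page:
    info = page.info()                    # response headers (email.message.Message)
    print(info.get('Content-Type'))       # e.g. "text/html; charset=utf-8"
    print(info.get('Content-Encoding'))   # "gzip" when the server compressed the body
```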
```diff
@@ -66,7 +68,9 @@ class WebPage(object):
         page = urllib.request.urlopen(request, timeout=5) # handle
         headers = page.info()
         print(headers['content-type'])
-        if "text/html" in headers['content-type']:
+        if "gzip" in headers['content-type']:
+            self.source = gzip.decompress(page.read())
+        elif "text/html" in headers['content-type'] or "deflate" in headers['content-type']:
             self.source = page.read()
 
 
```
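For reference, here is a self-contained sketch of gzip-aware fetching along these lines. It is an illustration under stated assumptions rather than this repository's code: it inspects the `Content-Encoding` response header (the header that conventionally signals compression) instead of `Content-Type`, advertises only `gzip` so that every advertised encoding is actually decompressed, and uses a placeholder function name and URL:

```python
# Sketch of a gzip-aware page fetch (hypothetical fetch_html, placeholder URL).
import gzip
import urllib.request

def fetch_html(url, timeout=5):
    headers = {'Accept-Encoding': 'gzip',  # only advertise what we can decompress
               'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
    request = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(request, timeout=timeout) as page:
        info = page.info()
        body = page.read()
        # Decompress only if the server actually gzip-compressed the body.
        if 'gzip' in (info.get('Content-Encoding') or ''):
            body = gzip.decompress(body)
        return body

# Usage (placeholder URL):
# html = fetch_html("https://example.com/")
```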