diff --git a/notes.md b/notes.md index 6e4f3f2..36d72d5 100644 --- a/notes.md +++ b/notes.md @@ -5,7 +5,7 @@ * ~~use `pop()` on the set instead of `.remove()`~~ * ~~return false once the set is empty~~ * ~~`WebPage.parse_urls()` needs to compare startswith to base url~~ - * ignore any links which aren't to pages + * ~~ignore any links which aren't to pages~~ * better url checking to get bare domain * ~~remove base url from initial urls with and without trailing slash~~ * investigate using [tldextract](https://github.com/john-kurkowski/tldextract) to match urls diff --git a/utils/helpers.py b/utils/helpers.py index ac52193..2c65a0f 100644 --- a/utils/helpers.py +++ b/utils/helpers.py @@ -61,7 +61,9 @@ class WebPage(object): request = urllib.request.Request(self.url, headers=self.headers) page = urllib.request.urlopen(request, timeout=5) - self.source = page.read() + headers = page.info() + if headers['content-type'] == "text/html": + self.source = page.read() def find_links(self):