From e57a86c60aeffce82fbd04abcae6ac3369fd7746 Mon Sep 17 00:00:00 2001 From: Simon Weald Date: Thu, 6 Sep 2018 16:30:11 +0100 Subject: [PATCH] only attempt to read html --- notes.md | 2 +- utils/helpers.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/notes.md b/notes.md index 6e4f3f2..36d72d5 100644 --- a/notes.md +++ b/notes.md @@ -5,7 +5,7 @@ * ~~use `pop()` on the set instead of `.remove()`~~ * ~~return false once the set is empty~~ * ~~`WebPage.parse_urls()` needs to compare startswith to base url~~ - * ignore any links which aren't to pages + * ~~ignore any links which aren't to pages~~ * better url checking to get bare domain * ~~remove base url from initial urls with and without trailing slash~~ * investigate using [tldextract](https://github.com/john-kurkowski/tldextract) to match urls diff --git a/utils/helpers.py b/utils/helpers.py index ac52193..2c65a0f 100644 --- a/utils/helpers.py +++ b/utils/helpers.py @@ -61,7 +61,9 @@ class WebPage(object): request = urllib.request.Request(self.url, headers=self.headers) page = urllib.request.urlopen(request, timeout=5) - self.source = page.read() + headers = page.info() + if headers['content-type'] == "text/html": + self.source = page.read() def find_links(self):