From e57a86c60aeffce82fbd04abcae6ac3369fd7746 Mon Sep 17 00:00:00 2001
From: Simon Weald <simon@simonweald.com>
Date: Thu, 6 Sep 2018 16:30:11 +0100
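Subject: [PATCH] only attempt to read HTML responses

Check the Content-Type header of the response in WebPage and call
page.read() only when the header is exactly "text/html". This closes
the notes.md item "ignore any links which aren't to pages".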

---
 notes.md         | 2 +-
 utils/helpers.py | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/notes.md b/notes.md
index 6e4f3f2..36d72d5 100644
--- a/notes.md
+++ b/notes.md
@@ -5,7 +5,7 @@
   * ~~use `pop()` on the set instead of `.remove()`~~
     * ~~return false once the set is empty~~
   * ~~`WebPage.parse_urls()` needs to compare startswith to base url~~
-  * ignore any links which aren't to pages
+  * ~~ignore any links which aren't to pages~~
   * better url checking to get bare domain
   * ~~remove base url from initial urls with and without trailing slash~~
   * investigate using [tldextract](https://github.com/john-kurkowski/tldextract) to match urls
diff --git a/utils/helpers.py b/utils/helpers.py
index ac52193..2c65a0f 100644
--- a/utils/helpers.py
+++ b/utils/helpers.py
@@ -61,7 +61,9 @@ class WebPage(object):
 
         request = urllib.request.Request(self.url, headers=self.headers)
         page = urllib.request.urlopen(request, timeout=5)
-        self.source = page.read()
+        headers = page.info()
+        if headers['content-type'] == "text/html":
+            self.source = page.read()
 
 
     def find_links(self):