Only attempt to read the response body when it is HTML

2018-09-06 16:30:11 +01:00
parent a3ec9451e3
commit e57a86c60a
2 changed files with 4 additions and 2 deletions
@@ -5,7 +5,7 @@
  * ~~use `pop()` on the set instead of `.remove()`~~
    * ~~return false once the set is empty~~
  * ~~`WebPage.parse_urls()` needs to compare startswith to base url~~
-  * ignore any links which aren't to pages
+  * ~~ignore any links which aren't to pages~~
  * better url checking to get bare domain
  * ~~remove base url from initial urls with and without trailing slash~~
  * investigate using [tldextract](https://github.com/john-kurkowski/tldextract) to match urls
@@ -61,7 +61,9 @@ class WebPage(object):

        request = urllib.request.Request(self.url, headers=self.headers)
        page = urllib.request.urlopen(request, timeout=5)
-        self.source = page.read()
+        headers = page.info()
+        if headers['content-type'] == "text/html":
+            self.source = page.read()


    def find_links(self):