update notes

various minor improvements to exception handling
2018-09-09 10:16:22 +01:00 · 2018-09-09 10:16:03 +01:00
2 changed files with 7 additions and 13 deletions
@@ -8,16 +8,9 @@
  * ~~ignore any links which aren't to pages~~
  * better url checking to get bare domain
  * remove trailing slash from any discovered url
-  * investigate lxml parser
+  * ~~investigate lxml parser~~
  * ~~remove base url from initial urls with and without trailing slash~~
  * investigate using [tldextract](https://github.com/john-kurkowski/tldextract) to match urls
  * ~~implement parsing of [robots.txt](http://docs.w3cub.com/python~3.6/library/urllib.robotparser/)~~
  * investigate [gzip encoding](https://stackoverflow.com/questions/36383227/avoid-downloading-images-using-beautifulsoup-and-urllib-request)
-
-
-```
-text/html; charset=utf-8
-application/xhtml+xml
-'WebPage' object has no attribute 'source'
-'WebPage' object has no attribute 'discovered_hrefs'
-```
+  * implement some kind of progress display
@@ -53,6 +53,8 @@ class WebPage(object):
        self.url = url
        self.base_url = base_url
        self.robots = robots
+        self.source = None
+        self.urls_to_crawl = set()


    def get_source(self):
@@ -75,7 +77,7 @@ class WebPage(object):
        '''
        hrefs = set()

-        soup = BeautifulSoup(self.source, 'lxml') # handle no source
+        soup = BeautifulSoup(self.source, 'lxml')
        links = soup.find_all('a', href=True)

        for link in links:
@@ -92,7 +94,6 @@ class WebPage(object):
        Iterate through the list of discovered URLs and add them to the
        pool if they start with the base URL.
        '''
-        self.urls_to_crawl = set()

        for url in self.discovered_hrefs: #handle no hrefs found
            if url.startswith(self.url):
@@ -112,8 +113,8 @@ class WebPage(object):
    def run(self):
        try:
            self.get_source()
-        except Exception as e:
-            print(e)
+        except Exception:
+            pass

        if self.source:
            self.find_links()
Author	SHA1	Message	Date
simon	69f5788745	update notes	2018-09-09 10:16:22 +01:00
simon	b5d644a223	various minor improvements to exception handling	2018-09-09 10:16:03 +01:00