Compare commits


2 Commits

SHA1 Message Date
69f5788745 update notes 2018-09-09 10:16:22 +01:00
b5d644a223 various minor improvements to exception handling 2018-09-09 10:16:03 +01:00
2 changed files with 7 additions and 13 deletions

View File

@@ -8,16 +8,9 @@
 * ~~ignore any links which aren't to pages~~
 * better url checking to get bare domain
 * remove trailing slash from any discovered url
-* investigate lxml parser
+* ~~investigate lxml parser~~
 * ~~remove base url from initial urls with and without trailing slash~~
 * investigate using [tldextract](https://github.com/john-kurkowski/tldextract) to match urls
 * ~~implement parsing of [robots.txt](http://docs.w3cub.com/python~3.6/library/urllib.robotparser/)~~
 * investigate [gzip encoding](https://stackoverflow.com/questions/36383227/avoid-downloading-images-using-beautifulsoup-and-urllib-request)
-* implement some kind of progress display
-```
-text/html; charset=utf-8
-application/xhtml+xml
-'WebPage' object has no attribute 'source'
-'WebPage' object has no attribute 'discovered_hrefs'
-```
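The still-open note about tldextract suggests matching URLs by their registered domain rather than by string prefix. Below is a minimal sketch of that idea, assuming tldextract is installed (`pip install tldextract`); the `same_registered_domain` helper is illustrative and not part of this repository.

```python
# Illustrative sketch of the tldextract idea from the note above;
# the helper name is hypothetical and not part of this project.
import tldextract

def same_registered_domain(url_a, url_b):
    '''True when both URLs share the same registered domain and suffix.'''
    a = tldextract.extract(url_a)  # ExtractResult with subdomain, domain, suffix
    b = tldextract.extract(url_b)
    return (a.domain, a.suffix) == (b.domain, b.suffix)

print(same_registered_domain('https://www.example.co.uk/about',
                             'https://blog.example.co.uk/'))  # True
print(same_registered_domain('https://example.co.uk/',
                             'https://example.com/'))         # False
```

Plain prefix matching (`url.startswith(self.url)`) misses subdomains such as `blog.example.co.uk`, which is presumably why this note is still open.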

View File

@@ -53,6 +53,8 @@ class WebPage(object):
         self.url = url
         self.base_url = base_url
         self.robots = robots
+        self.source = None
+        self.urls_to_crawl = set()

     def get_source(self):
@@ -75,7 +77,7 @@ class WebPage(object):
         '''
         hrefs = set()
-        soup = BeautifulSoup(self.source, 'lxml') # handle no source
+        soup = BeautifulSoup(self.source, 'lxml')
         links = soup.find_all('a', href=True)
         for link in links:
@@ -92,7 +94,6 @@ class WebPage(object):
         Iterate through the list of discovered URLs and add them to the
         pool if they start with the base URL.
         '''
-        self.urls_to_crawl = set()
         for url in self.discovered_hrefs: #handle no hrefs found
             if url.startswith(self.url):
@@ -112,8 +113,8 @@ class WebPage(object):
     def run(self):
         try:
             self.get_source()
-        except Exception as e:
-            print(e)
+        except Exception:
+            pass
         if self.source:
             self.find_links()
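Taken together, these changes move the `source` and `urls_to_crawl` initialisation into `__init__` (which addresses the `'WebPage' object has no attribute 'source'` error recorded in the old notes) and make `run()` swallow fetch errors, parsing only when a page was actually retrieved. Below is a condensed sketch of the resulting flow; only `__init__` and `run()` mirror the diff, while the fetch and parse bodies are simplified placeholders.

```python
# Condensed sketch of the WebPage flow after this change.
# Only __init__ and run() follow the diff; the other method bodies are assumptions.
from urllib.request import urlopen

from bs4 import BeautifulSoup


class WebPage(object):
    def __init__(self, url):
        self.url = url
        self.source = None           # defined up front, so later checks never AttributeError
        self.urls_to_crawl = set()

    def get_source(self):
        self.source = urlopen(self.url).read()

    def find_links(self):
        soup = BeautifulSoup(self.source, 'lxml')
        self.urls_to_crawl = {a['href'] for a in soup.find_all('a', href=True)
                              if a['href'].startswith(self.url)}

    def run(self):
        try:
            self.get_source()
        except Exception:
            pass                     # a failed fetch simply leaves self.source as None
        if self.source:              # only parse pages that were actually fetched
            self.find_links()
```

Passing silently on every exception is the simplest option, though logging the failed URL instead would make dead links easier to spot later.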