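Various minor improvements to exception handling: initialise `source` and `urls_to_crawl` alongside the other `WebPage` attributes, and ignore errors from `get_source()` in `run()` instead of printing them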

This commit is contained in:
2018-09-09 10:16:03 +01:00
parent 6508156aa4
commit b5d644a223

View File

@@ -53,6 +53,8 @@ class WebPage(object):
self.url = url self.url = url
self.base_url = base_url self.base_url = base_url
self.robots = robots self.robots = robots
self.source = None
self.urls_to_crawl = set()
def get_source(self): def get_source(self):
@@ -75,7 +77,7 @@ class WebPage(object):
''' '''
hrefs = set() hrefs = set()
soup = BeautifulSoup(self.source, 'lxml') # handle no source soup = BeautifulSoup(self.source, 'lxml')
links = soup.find_all('a', href=True) links = soup.find_all('a', href=True)
for link in links: for link in links:
@@ -92,7 +94,6 @@ class WebPage(object):
Iterate through the list of discovered URLs and add them to the Iterate through the list of discovered URLs and add them to the
pool if they start with the base URL. pool if they start with the base URL.
''' '''
self.urls_to_crawl = set()
for url in self.discovered_hrefs: #handle no hrefs found for url in self.discovered_hrefs: #handle no hrefs found
if url.startswith(self.url): if url.startswith(self.url):
@@ -112,8 +113,8 @@ class WebPage(object):
def run(self): def run(self):
try: try:
self.get_source() self.get_source()
except Exception as e: except Exception:
print(e) pass
if self.source: if self.source:
self.find_links() self.find_links()