use lxml as the parser and only find links on a page if we've got the source

This commit is contained in:
2018-09-09 10:06:25 +01:00
parent 738ab8e441
commit 6508156aa4
2 changed files with 3 additions and 8 deletions

View File

@@ -4,4 +4,5 @@ certifi==2018.8.13
 chardet==3.0.4
 idna==2.7
 Jinja2==2.10
+lxml==4.2.4
 MarkupSafe==1.0

View File

@@ -75,7 +75,7 @@ class WebPage(object):
         '''
         hrefs = set()
-        soup = BeautifulSoup(self.source, 'html.parser')  # handle no source
+        soup = BeautifulSoup(self.source, 'lxml')  # handle no source
         links = soup.find_all('a', href=True)
         for link in links:
@@ -115,15 +115,9 @@ class WebPage(object):
         except Exception as e:
             print(e)
-        try:
+        if self.source:
             self.find_links()
-        except Exception as e:
-            print(e)
-        try:
             self.parse_urls()
-        except Exception as e:
-            print(e)

 class RobotsTxt(object):