use lxml as the parser and only find links on a page if we've got the source

2018-09-09 10:06:25 +01:00
parent 738ab8e441
commit 6508156aa4
2 changed files with 3 additions and 8 deletions

@@ -4,4 +4,5 @@ certifi==2018.8.13
 chardet==3.0.4
 idna==2.7
 Jinja2==2.10
+lxml==4.2.4
 MarkupSafe==1.0
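With lxml pinned alongside the existing dependencies, BeautifulSoup can be asked for the lxml tree builder by name. A minimal sketch of the parser selection, assuming nothing beyond what bs4 documents (the html.parser fallback is illustrative only, not part of this commit):

from bs4 import BeautifulSoup, FeatureNotFound

html = "<a href='/about'>About</a><a name='x'>no href</a>"
try:
    # Prefer lxml, which is generally faster and more forgiving of broken markup.
    soup = BeautifulSoup(html, 'lxml')
except FeatureNotFound:
    # Illustrative fallback only: use the stdlib parser if lxml is not installed.
    soup = BeautifulSoup(html, 'html.parser')

# href=True skips anchors that have no href attribute.
print([a['href'] for a in soup.find_all('a', href=True)])  # ['/about']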

@@ -75,7 +75,7 @@ class WebPage(object):
         '''
         hrefs = set()
-        soup = BeautifulSoup(self.source, 'html.parser') # handle no source
+        soup = BeautifulSoup(self.source, 'lxml') # handle no source
         links = soup.find_all('a', href=True)
         for link in links:
@@ -115,15 +115,9 @@ class WebPage(object):
         except Exception as e:
             print(e)
-        try:
+        if self.source:
             self.find_links()
-        except Exception as e:
-            print(e)
-        try:
             self.parse_urls()
-        except Exception as e:
-            print(e)

 class RobotsTxt(object):
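Taken together, the two hunks change the WebPage flow so that parsing only happens when page source was actually fetched. A sketch of the surrounding class after this commit, with assumptions called out in comments (the fetch step, attribute names and the parse_urls body are not shown in the diff; requests is inferred from the certifi/chardet/idna pins):

import requests
from bs4 import BeautifulSoup

class WebPage(object):
    def __init__(self, url):
        self.url = url
        self.source = None
        self.hrefs = set()
        try:
            # Assumed fetch step; the real one sits above the second hunk.
            self.source = requests.get(self.url, timeout=10).text
        except Exception as e:
            print(e)
        # The new guard: only parse when a response body was captured.
        if self.source:
            self.find_links()
            self.parse_urls()

    def find_links(self):
        # First hunk: lxml replaces html.parser as the tree builder.
        soup = BeautifulSoup(self.source, 'lxml')
        for link in soup.find_all('a', href=True):
            self.hrefs.add(link['href'])

    def parse_urls(self):
        # Placeholder; parse_urls itself is untouched by this commit.
        pass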