Use lxml as the parser, and only find links on a page if we've got its source
This commit is contained in:
@@ -4,4 +4,5 @@ certifi==2018.8.13
|
||||
chardet==3.0.4
|
||||
idna==2.7
|
||||
Jinja2==2.10
|
||||
lxml==4.2.4
|
||||
MarkupSafe==1.0
|
||||
|
||||
@@ -75,7 +75,7 @@ class WebPage(object):
|
||||
'''
|
||||
hrefs = set()
|
||||
|
||||
soup = BeautifulSoup(self.source, 'html.parser') # handle no source
|
||||
soup = BeautifulSoup(self.source, 'lxml') # handle no source
|
||||
links = soup.find_all('a', href=True)
|
||||
|
||||
for link in links:
|
||||
@@ -115,15 +115,9 @@ class WebPage(object):
|
||||
except Exception as e:
|
||||
print(e)
|
||||
|
||||
try:
|
||||
if self.source:
|
||||
self.find_links()
|
||||
except Exception as e:
|
||||
print(e)
|
||||
|
||||
try:
|
||||
self.parse_urls()
|
||||
except Exception as e:
|
||||
print(e)
|
||||
|
||||
|
||||
class RobotsTxt(object):
|
||||
|
||||
Reference in New Issue
Block a user