use lxml as the parser and only find links on a page if we've got the source
@@ -75,7 +75,7 @@ class WebPage(object):
         '''
         hrefs = set()
 
-        soup = BeautifulSoup(self.source, 'html.parser')  # handle no source
+        soup = BeautifulSoup(self.source, 'lxml')  # handle no source
         links = soup.find_all('a', href=True)
 
         for link in links:
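The hunk above only swaps the parser backend that BeautifulSoup delegates to; the link-finding logic is unchanged. A minimal standalone sketch of the same call pattern, assuming the third-party lxml package is installed (pip install lxml) and falling back to the stdlib parser when it is not; the sample markup is illustrative only:

from bs4 import BeautifulSoup, FeatureNotFound

html = '<html><body><a href="/about">About</a><a name="x">no href</a></body></html>'

try:
    # 'lxml' is a third-party parser; bs4 raises FeatureNotFound if the
    # named parser is not installed
    soup = BeautifulSoup(html, 'lxml')
except FeatureNotFound:
    soup = BeautifulSoup(html, 'html.parser')  # stdlib fallback

# href=True matches only <a> tags that actually carry an href attribute
hrefs = {link['href'] for link in soup.find_all('a', href=True)}
print(hrefs)  # {'/about'}

lxml is a C-backed parser and is generally much faster on large pages than the pure-Python html.parser, which is the usual reason for this swap.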
@@ -115,15 +115,9 @@ class WebPage(object):
         except Exception as e:
             print(e)
 
         try:
-            self.find_links()
+            if self.source:
+                self.find_links()
         except Exception as e:
             print(e)
 
         try:
             self.parse_urls()
         except Exception as e:
             print(e)
 
 
 class RobotsTxt(object):
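The second hunk is the guard from the commit title: find_links() feeds self.source to BeautifulSoup, so calling it after a failed fetch would hand None to the parser and raise. A condensed, hypothetical sketch of how the guarded call site fits together; the requests fetch, the constructor shape, and the hrefs attribute are assumptions, since the diff only shows the call sites:

import requests
from bs4 import BeautifulSoup


class WebPage(object):
    '''Hypothetical condensed version of the class being patched; only
    the pieces visible in the diff are real, the rest is assumed.'''

    def __init__(self, url):
        self.url = url
        self.source = None
        self.hrefs = set()

        try:
            self.source = requests.get(self.url, timeout=10).text
        except Exception as e:
            print(e)

        try:
            if self.source:  # the new guard: skip parsing after a failed fetch
                self.find_links()
        except Exception as e:
            print(e)

    def find_links(self):
        soup = BeautifulSoup(self.source, 'lxml')
        for link in soup.find_all('a', href=True):
            self.hrefs.add(link['href'])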