Use lxml as the parser, and only look for links on a page when its source is available
This commit is contained in:
@@ -4,4 +4,5 @@ certifi==2018.8.13
|
|||||||
chardet==3.0.4
|
chardet==3.0.4
|
||||||
idna==2.7
|
idna==2.7
|
||||||
Jinja2==2.10
|
Jinja2==2.10
|
||||||
|
lxml==4.2.4
|
||||||
MarkupSafe==1.0
|
MarkupSafe==1.0
|
||||||
|
|||||||
@@ -75,7 +75,7 @@ class WebPage(object):
|
|||||||
'''
|
'''
|
||||||
hrefs = set()
|
hrefs = set()
|
||||||
|
|
||||||
soup = BeautifulSoup(self.source, 'html.parser') # handle no source
|
soup = BeautifulSoup(self.source, 'lxml') # handle no source
|
||||||
links = soup.find_all('a', href=True)
|
links = soup.find_all('a', href=True)
|
||||||
|
|
||||||
for link in links:
|
for link in links:
|
||||||
@@ -115,15 +115,9 @@ class WebPage(object):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(e)
|
print(e)
|
||||||
|
|
||||||
try:
|
if self.source:
|
||||||
self.find_links()
|
self.find_links()
|
||||||
except Exception as e:
|
|
||||||
print(e)
|
|
||||||
|
|
||||||
try:
|
|
||||||
self.parse_urls()
|
self.parse_urls()
|
||||||
except Exception as e:
|
|
||||||
print(e)
|
|
||||||
|
|
||||||
|
|
||||||
class RobotsTxt(object):
|
class RobotsTxt(object):
|
||||||
|
|||||||
Reference in New Issue
Block a user