diff --git a/requirements.txt b/requirements.txt index acc8f43..bb93a64 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,5 @@ certifi==2018.8.13 chardet==3.0.4 idna==2.7 Jinja2==2.10 +lxml==4.2.4 MarkupSafe==1.0 diff --git a/utils/helpers.py b/utils/helpers.py index 8133cb5..0a2275b 100644 --- a/utils/helpers.py +++ b/utils/helpers.py @@ -75,7 +75,7 @@ class WebPage(object): ''' hrefs = set() - soup = BeautifulSoup(self.source, 'html.parser') # handle no source + soup = BeautifulSoup(self.source, 'lxml') # handle no source links = soup.find_all('a', href=True) for link in links: @@ -115,15 +115,9 @@ class WebPage(object): except Exception as e: print(e) - try: + if self.source: self.find_links() - except Exception as e: - print(e) - - try: self.parse_urls() - except Exception as e: - print(e) class RobotsTxt(object):