Use more explicit names (rename `self.hrefs` to `self.discovered_hrefs`); use `urljoin` to combine URLs

This commit is contained in:
2018-08-31 19:12:58 +01:00
parent 0517e5bc56
commit 759f965e95

View File

@@ -53,16 +53,16 @@ class WebPage(object):
for link in links: for link in links:
if link['href'].startswith('/'): if link['href'].startswith('/'):
hrefs.add("".join([self.url, link['href']])) hrefs.add(urljoin(self.url, link['href']))
else: else:
hrefs.add(link['href']) hrefs.add(link['href'])
self.hrefs = hrefs self.discovered_hrefs = hrefs
def parse_urls(self): def parse_urls(self):
self.urls_to_crawl = set() self.urls_to_crawl = set()
for url in self.hrefs: for url in self.discovered_hrefs:
if url.startswith(self.url): if url.startswith(self.url):
self.urls_to_crawl.add(url) self.urls_to_crawl.add(url)