Use more explicit names; use urljoin to combine URLs
This commit is contained in:
@@ -53,16 +53,16 @@ class WebPage(object):
|
||||
|
||||
for link in links:
|
||||
if link['href'].startswith('/'):
|
||||
hrefs.add("".join([self.url, link['href']]))
|
||||
hrefs.add(urljoin(self.url, link['href']))
|
||||
else:
|
||||
hrefs.add(link['href'])
|
||||
|
||||
self.hrefs = hrefs
|
||||
self.discovered_hrefs = hrefs
|
||||
|
||||
|
||||
def parse_urls(self):
|
||||
self.urls_to_crawl = set()
|
||||
for url in self.hrefs:
|
||||
for url in self.discovered_hrefs:
|
||||
if url.startswith(self.url):
|
||||
self.urls_to_crawl.add(url)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user