Use more explicit names; use urljoin to combine URLs
This commit is contained in:
@@ -53,16 +53,16 @@ class WebPage(object):
|
|||||||
|
|
||||||
for link in links:
|
for link in links:
|
||||||
if link['href'].startswith('/'):
|
if link['href'].startswith('/'):
|
||||||
hrefs.add("".join([self.url, link['href']]))
|
hrefs.add(urljoin(self.url, link['href']))
|
||||||
else:
|
else:
|
||||||
hrefs.add(link['href'])
|
hrefs.add(link['href'])
|
||||||
|
|
||||||
self.hrefs = hrefs
|
self.discovered_hrefs = hrefs
|
||||||
|
|
||||||
|
|
||||||
def parse_urls(self):
|
def parse_urls(self):
|
||||||
self.urls_to_crawl = set()
|
self.urls_to_crawl = set()
|
||||||
for url in self.hrefs:
|
for url in self.discovered_hrefs:
|
||||||
if url.startswith(self.url):
|
if url.startswith(self.url):
|
||||||
self.urls_to_crawl.add(url)
|
self.urls_to_crawl.add(url)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user