From 1b18aa83eb08ff9543c2eeb3746608d6d6a45cb5 Mon Sep 17 00:00:00 2001
From: Simon Weald
Date: Fri, 31 Aug 2018 19:01:35 +0100
Subject: [PATCH] corrected some small errors and added runner function

---
 utils/helpers.py | 36 ++++++++++++++++++++++++++++--------
 1 file changed, 28 insertions(+), 8 deletions(-)

diff --git a/utils/helpers.py b/utils/helpers.py
index 9bd6392..2b9a762 100644
--- a/utils/helpers.py
+++ b/utils/helpers.py
@@ -25,10 +25,10 @@ class UrlPool(object):
         else:
             return False
 
-    def invalidate_url(self, url):
+    def remove_from_pool(self, url):
         self.url_pool.remove(url)
 
-    def add_to_list(self, url):
+    def add_to_pool(self, url):
         self.url_pool.add(url)
 
 
@@ -39,31 +39,51 @@ class WebPage(object):
     def __init__(self, url):
         self.url = url
 
+
     def get_source(self):
         request = urllib.request.Request(self.url, headers=self.headers)
         page = urllib.request.urlopen(request)
         self.source = page.read()
 
+
     def find_links(self):
         soup = BeautifulSoup(self.source, 'html.parser')
         links = soup.find_all('a')
-        hrefs = []
+        hrefs = set()
 
         for link in links:
             if link['href'].startswith('/'):
-                hrefs.append("".join([self.url, link['href']]))
+                hrefs.add("".join([self.url, link['href']]))
             else:
-                hrefs.append(link['href'])
+                hrefs.add(link['href'])
 
         self.hrefs = hrefs
 
+
     def parse_urls(self):
-        local_urls = []
+        self.urls_to_crawl = set()
 
         for url in self.hrefs:
             if url.startswith(self.url):
-                local_urls.append(url)
+                self.urls_to_crawl.add(url)
 
-        return local_urls
+
+    def run(self):
+        try:
+            self.get_source()
+        except Exception as e:
+            print(e)
+
+        try:
+            self.find_links()
+        except Exception as e:
+            print(e)
+
+        try:
+            self.parse_urls()
+        except Exception as e:
+            print(e)
+
+        return self.urls_to_crawl
 
 def sanitise_url(url):
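
A minimal driver sketch for the new run() method follows. It is not part of
the patch: it assumes UrlPool keeps its pending URLs in a set attribute named
url_pool (as the methods above suggest), and the crawled-set bookkeeping is
illustrative only. Note also that run() swallows exceptions, so a failed
get_source() still falls through to the final return, where urls_to_crawl
may not exist; the sketch assumes the fetch succeeds.

    from utils.helpers import UrlPool, WebPage

    pool = UrlPool()
    pool.add_to_pool('https://example.com')
    crawled = set()  # illustrative bookkeeping, not part of UrlPool

    while pool.url_pool:
        url = pool.url_pool.pop()  # set.pop() yields an arbitrary pending URL
        crawled.add(url)
        page = WebPage(url)
        # run() raises AttributeError if get_source() failed (see note above)
        for found in page.run():   # same-site URLs collected by parse_urls()
            if found not in crawled:
                pool.add_to_pool(found)

Returning a set from run() means duplicate links on a page are deduplicated
for free, which is presumably the motivation for switching find_links() and
parse_urls() from lists to sets.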