From 2b812da26a8aaa1f03c1f6a304298b84a8cdb4b7 Mon Sep 17 00:00:00 2001
From: Simon Weald
Date: Wed, 29 Aug 2018 21:49:15 +0100
Subject: [PATCH] simplify UrlPoolManager to use a set instead of a dict

---
 utils/helpers.py | 26 ++++++++++----------------
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/utils/helpers.py b/utils/helpers.py
index 1161848..db184ea 100644
--- a/utils/helpers.py
+++ b/utils/helpers.py
@@ -9,28 +9,22 @@ class UrlPoolManager(object):
     '''
 
     def __init__(self):
-        self.url_pool = dict()
-        self.not_crawled = 0
-        self.crawled = 1
-        self.invalid = 2
+        self.url_pool = set()
 
     def check_duplicate(self, new_url):
-        for url, status in self.url_pool.items():
-            if url == new_url:
-                return True
-            else:
-                return False
+        '''
+        Checks if a URL exists in the current pool.
+        '''
+        if new_url in self.url_pool:
+            return True
+        else:
+            return False
 
     def invalidate_url(self, url):
-        self.url_pool[url] = self.invalid
+        self.url_pool.remove(url)
 
     def add_to_list(self, url):
-        self.url_pool[url] = self.not_crawled
-        # calculate depth
-        # add link, crawled status to url_pool
-
-    def mark_as_crawled(self, url):
-        self.url_pool[url] = self.crawled
+        self.url_pool.add(url)
 
 
 def clean_base_url(url):
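
Note: below is a minimal usage sketch of the simplified class, assuming utils/helpers.py is importable from the project root. The example URL is hypothetical and only illustrates the behaviour after this patch: duplicates are detected by set membership, and invalidating a URL now simply drops it from the pool (remove() raises KeyError for unknown URLs), since the old per-URL status codes are gone.

from utils.helpers import UrlPoolManager

pool = UrlPoolManager()
pool.add_to_list('https://example.com/')               # URL stored in the set
print(pool.check_duplicate('https://example.com/'))    # True: already in the pool
pool.invalidate_url('https://example.com/')            # dropped from the set entirely
print(pool.check_duplicate('https://example.com/'))    # False: no 'invalid' status survives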