diff --git a/utils/helpers.py b/utils/helpers.py
index 1161848..db184ea 100644
--- a/utils/helpers.py
+++ b/utils/helpers.py
@@ -9,28 +9,26 @@ class UrlPoolManager(object):
     '''
 
     def __init__(self):
-        self.url_pool = dict()
-        self.not_crawled = 0
-        self.crawled = 1
-        self.invalid = 2
+        self.url_pool = set()
 
     def check_duplicate(self, new_url):
-        for url, status in self.url_pool.items():
-            if url == new_url:
-                return True
-            else:
-                return False
+        '''
+        Checks if a URL exists in the current pool.
+        '''
+        return new_url in self.url_pool
 
     def invalidate_url(self, url):
-        self.url_pool[url] = self.invalid
+        '''
+        Removes a URL from the pool. A URL that is not in the pool
+        is ignored rather than raising KeyError.
+        '''
+        self.url_pool.discard(url)
 
     def add_to_list(self, url):
-        self.url_pool[url] = self.not_crawled
-        # calculate depth
-        # add link, crawled status to url_pool
-
-    def mark_as_crawled(self, url):
-        self.url_pool[url] = self.crawled
+        '''
+        Adds a URL to the pool.
+        '''
+        self.url_pool.add(url)
 
 
 def clean_base_url(url):