class ListManager(object):
    '''
    Object to manage the lifecycle of a pool of URLs.

    Every URL is a key in ``url_pool``; its value is one of the status
    codes stored on the instance (``not_crawled``, ``crawled``,
    ``invalid``).
    '''

    def __init__(self):
        # Maps url -> status code (one of the three values below).
        self.url_pool = {}
        # Status codes, kept as instance attributes so existing callers
        # can keep comparing against manager.not_crawled / .crawled / .invalid.
        self.not_crawled = 0
        self.crawled = 1
        self.invalid = 2

    def check_duplicate(self, new_url):
        '''
        Return True if new_url is already in the pool, False otherwise.

        The result does not depend on the URL's status: an invalidated or
        already-crawled URL still counts as a duplicate.
        '''
        # Dict membership checks every key (and is O(1)); a loop that
        # returns False on the first non-matching key would only ever
        # inspect a single entry, and would return None on an empty pool.
        return new_url in self.url_pool

    def invalidate_url(self, url):
        '''
        Set the status of url to invalid, adding it to the pool if absent.
        '''
        self.url_pool[url] = self.invalid

    def add_to_list(self, url):
        '''
        Store url in the pool with status not_crawled.

        If url is already present its status is overwritten; call
        check_duplicate first when an existing status must be kept.
        '''
        self.url_pool[url] = self.not_crawled
        # TODO: calculate depth and record it with the link and its
        # crawled status in url_pool (not implemented yet).

    def mark_as_crawled(self, url):
        '''
        Set the status of url to crawled, adding it to the pool if absent.
        '''
        self.url_pool[url] = self.crawled