# Reconstructed from patch 73cb883151f35fdf83ec4d9b5e5b017e89135456
# ("add a list manager object", Simon Weald, 2018-08-28), which inserts
# ListManager into utils/helpers.py just above clean_base_url().


class ListManager(object):
    '''
    Object to manage the lifecycle of a pool of URLs.

    Every known URL is a key in ``url_pool``; its value is one of the
    status codes stored on the instance (``not_crawled``, ``crawled``,
    ``invalid``).
    '''

    def __init__(self):
        '''
        Create an empty pool and define the status codes.
        '''
        # Maps url -> status code.
        self.url_pool = {}
        # Status codes stored as the values of url_pool.
        self.not_crawled = 0
        self.crawled = 1
        self.invalid = 2

    def check_duplicate(self, new_url):
        '''
        Report whether new_url is already in the pool.

        Returns True if new_url is a key of url_pool (whatever its
        status), False otherwise - including when the pool is empty.
        '''
        # Dict membership checks every key; a hand-written scan that
        # returns from inside the loop only ever looks at the first one.
        return new_url in self.url_pool

    def invalidate_url(self, url):
        '''
        Set the status of url to invalid, adding it if it is not present.
        '''
        self.url_pool[url] = self.invalid

    def add_to_list(self, url):
        '''
        Store url in the pool with the not_crawled status.

        An existing entry for url is overwritten, so its status is reset
        to not_crawled.
        '''
        self.url_pool[url] = self.not_crawled
        # TODO: calculate depth and store it alongside the crawl status.

    def mark_as_crawled(self, url):
        '''
        Set the status of url to crawled, adding it if it is not present.
        '''
        self.url_pool[url] = self.crawled