Compare commits

...

2 Commits

2 changed files with 46 additions and 9 deletions

View File

@@ -10,6 +10,21 @@ def init_crawler(base_url=None):
     '''
     needs a docstring
     '''
+    uncrawled_urls, crawled_urls = UrlPool(), UrlPool()
+    initial_page = WebPage(base_url)
+    try:
+        initial_urls = initial_page.run()
+    except Exception as e:
+        print(e)
+    for url in initial_urls:
+        try:
+            uncrawled_urls.add_to_pool(url)
+        except Exception as e:
+            print(e)
+    print(uncrawled_urls.url_pool)

 def run(args=None):
@@ -17,7 +32,9 @@ def run(args=None):
     needs a docstring.
     '''
     base_url = sanitise_url(args.url)
+    print(base_url)
+    init_crawler(base_url)

 if __name__ == '__main__':
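
For context, the new init_crawler() bootstrap is driven from run(), which only needs an object carrying a url attribute. A minimal sketch of how this entry point might be invoked from the command line, assuming an argparse-based parser (the parser setup below is illustrative and not part of this diff):

import argparse

def main():
    # Hypothetical CLI wiring: run() expects an object with a .url attribute.
    parser = argparse.ArgumentParser(description='Simple crawler entry point')
    parser.add_argument('url', help='base URL to start crawling from')
    args = parser.parse_args()
    run(args)  # sanitises args.url, prints it, then calls init_crawler()

if __name__ == '__main__':
    main()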

View File

@@ -25,10 +25,10 @@ class UrlPool(object):
         else:
             return False

-    def invalidate_url(self, url):
+    def remove_from_pool(self, url):
         self.url_pool.remove(url)

-    def add_to_list(self, url):
+    def add_to_pool(self, url):
         self.url_pool.add(url)
@@ -39,31 +39,51 @@ class WebPage(object):
     def __init__(self, url):
         self.url = url

     def get_source(self):
         request = urllib.request.Request(self.url, headers=self.headers)
         page = urllib.request.urlopen(request)
         self.source = page.read()

     def find_links(self):
         soup = BeautifulSoup(self.source, 'html.parser')
         links = soup.find_all('a')
-        hrefs = []
+        hrefs = set()
         for link in links:
             if link['href'].startswith('/'):
-                hrefs.append("".join([self.url, link['href']]))
+                hrefs.add("".join([self.url, link['href']]))
             else:
-                hrefs.append(link['href'])
+                hrefs.add(link['href'])
         self.hrefs = hrefs

     def parse_urls(self):
-        local_urls = []
+        self.urls_to_crawl = set()
         for url in self.hrefs:
             if url.startswith(self.url):
-                local_urls.append(url)
-        return local_urls
+                self.urls_to_crawl.add(url)
+
+    def run(self):
+        try:
+            self.get_source()
+        except Exception as e:
+            print(e)
+        try:
+            self.find_links()
+        except Exception as e:
+            print(e)
+        try:
+            self.parse_urls()
+        except Exception as e:
+            print(e)
+        return self.urls_to_crawl

 def sanitise_url(url):
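
Taken together, the renamed UrlPool methods and the new WebPage.run() pipeline can be exercised as below. This is a minimal usage sketch, assuming both classes are importable from the crawler module (the module name and example URL are placeholders, not taken from this diff):

from crawler import UrlPool, WebPage

pool = UrlPool()
page = WebPage('https://example.com')

# run() chains get_source() -> find_links() -> parse_urls() and returns
# urls_to_crawl, the set of links that start with the page's own URL.
for url in page.run():
    pool.add_to_pool(url)          # renamed from add_to_list()

# Once a URL has been visited it can be dropped from the pool.
for url in set(pool.url_pool):     # copy first, since remove_from_pool() mutates the set
    pool.remove_from_pool(url)     # renamed from invalidate_url()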