Compare commits


2 Commits

2 changed files with 46 additions and 9 deletions


@@ -10,6 +10,21 @@ def init_crawler(base_url=None):
     '''
     needs a docstring
     '''
+    uncrawled_urls, crawled_urls = UrlPool(), UrlPool()
+    initial_page = WebPage(base_url)
+    try:
+        initial_urls = initial_page.run()
+    except Exception as e:
+        print(e)
+    for url in initial_urls:
+        try:
+            uncrawled_urls.add_to_pool(url)
+        except Exception as e:
+            print(e)
+    print(uncrawled_urls.url_pool)
 
 
 def run(args=None):
@@ -17,7 +32,9 @@ def run(args=None):
     needs a docstring.
     '''
     base_url = sanitise_url(args.url)
+    print(base_url)
+    init_crawler(base_url)
 
 
 if __name__ == '__main__':


@@ -25,10 +25,10 @@ class UrlPool(object):
         else:
             return False
 
-    def invalidate_url(self, url):
+    def remove_from_pool(self, url):
         self.url_pool.remove(url)
 
-    def add_to_list(self, url):
+    def add_to_pool(self, url):
         self.url_pool.add(url)
@@ -39,31 +39,51 @@ class WebPage(object):
     def __init__(self, url):
         self.url = url
 
     def get_source(self):
         request = urllib.request.Request(self.url, headers=self.headers)
         page = urllib.request.urlopen(request)
         self.source = page.read()
 
     def find_links(self):
         soup = BeautifulSoup(self.source, 'html.parser')
         links = soup.find_all('a')
-        hrefs = []
+        hrefs = set()
        for link in links:
             if link['href'].startswith('/'):
-                hrefs.append("".join([self.url, link['href']]))
+                hrefs.add("".join([self.url, link['href']]))
             else:
-                hrefs.append(link['href'])
+                hrefs.add(link['href'])
         self.hrefs = hrefs
 
     def parse_urls(self):
-        local_urls = []
+        self.urls_to_crawl = set()
         for url in self.hrefs:
             if url.startswith(self.url):
-                local_urls.append(url)
+                self.urls_to_crawl.add(url)
-        return local_urls
 
+    def run(self):
+        try:
+            self.get_source()
+        except Exception as e:
+            print(e)
+        try:
+            self.find_links()
+        except Exception as e:
+            print(e)
+        try:
+            self.parse_urls()
+        except Exception as e:
+            print(e)
+        return self.urls_to_crawl
 
 
 def sanitise_url(url):
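
Taken together, the commits rename the UrlPool helpers (add_to_list becomes add_to_pool, invalidate_url becomes remove_from_pool), switch the link collections from lists to sets, add a WebPage.run() wrapper, and call it from init_crawler(). Below is a minimal usage sketch of the refactored pieces; the module name `crawler` and the example URL are assumptions, everything else comes from the diff above.

# Illustrative sketch only; `crawler` as the module name and the URL are assumptions.
from crawler import UrlPool, WebPage

base_url = 'https://example.com'
uncrawled_urls = UrlPool()

# WebPage.run() fetches the page, extracts its links, and returns the set of
# same-site URLs it collected in urls_to_crawl.
initial_page = WebPage(base_url)
for url in initial_page.run():
    uncrawled_urls.add_to_pool(url)      # renamed from add_to_list()

print(uncrawled_urls.url_pool)

# Once a URL has been crawled, drop it from the pool (renamed from invalidate_url()).
next_url = next(iter(uncrawled_urls.url_pool), None)
if next_url is not None:
    uncrawled_urls.remove_from_pool(next_url)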