crawler now initialises URL pools and populates the uncrawled pool with urls it finds
crawler.py | 19
@@ -10,6 +10,21 @@ def init_crawler(base_url=None):
     '''
     needs a docstring
     '''
+    uncrawled_urls, crawled_urls = UrlPool(), UrlPool()
+    initial_page = WebPage(base_url)
+
+    try:
+        initial_urls = initial_page.run()
+    except Exception as e:
+        print(e)
+
+    for url in initial_urls:
+        try:
+            uncrawled_urls.add_to_pool(url)
+        except Exception as e:
+            print(e)
+
+    print(uncrawled_urls.url_pool)
+
+
 def run(args=None):
@@ -17,7 +32,9 @@ def run(args=None):
     needs a docstring.
     '''
     base_url = sanitise_url(args.url)
     print(base_url)
+
+    init_crawler(base_url)


 if __name__ == '__main__':
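The diff relies on a UrlPool class with an add_to_pool() method and a url_pool attribute, and a WebPage class whose run() returns the URLs found on a page; neither is shown in this commit. Below is a minimal sketch of how those two pieces might look, using only the names and calls that appear in the diff; the bodies are assumptions, not the project's actual implementation.

# Sketch of the classes init_crawler() depends on. The real UrlPool and
# WebPage are not part of this commit, so everything below is assumed
# from the calls made in the diff.
from urllib.request import urlopen
from html.parser import HTMLParser


class UrlPool:
    '''Holds a set of URLs waiting to be (or already) crawled.'''

    def __init__(self):
        self.url_pool = set()

    def add_to_pool(self, url):
        if not url:
            raise ValueError('empty url')
        self.url_pool.add(url)


class _LinkParser(HTMLParser):
    '''Collects href values from anchor tags.'''

    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            self.links.extend(v for k, v in attrs if k == 'href' and v)


class WebPage:
    '''Fetches a page and returns the URLs found on it.'''

    def __init__(self, url):
        self.url = url

    def run(self):
        # Download the page and extract every anchor href.
        with urlopen(self.url) as response:
            html = response.read().decode('utf-8', errors='replace')
        parser = _LinkParser()
        parser.feed(html)
        return parser.links

One caveat the sketch does not cover: in the diff above, if initial_page.run() raises, initial_urls is never bound and the following for loop fails with a NameError; binding initial_urls = [] before the try block would guard against that.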