From 1b9b207a2852a690b791c8aa8df8ccfda2e23f18 Mon Sep 17 00:00:00 2001
From: Simon Weald <simon@simonweald.com>
Date: Tue, 4 Sep 2018 13:57:52 +0100
Subject: [PATCH] attempt to remove base url with trailing slash (if
 discovered)

---
 crawler.py | 9 +++++++++
 notes.md   | 2 +-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/crawler.py b/crawler.py
index 782b7bc..d2011b6 100644
--- a/crawler.py
+++ b/crawler.py
@@ -7,6 +7,7 @@ import argparse
 from utils.helpers import (UrlPool, WebPage, sanitise_url)
 from pprint import pprint
 
+
 def init_crawler(base_url=None):
     '''
     needs a docstring
@@ -20,11 +21,19 @@ def init_crawler(base_url=None):
         print(e)
 
     initial_urls = initial_page.list_urls()
+
     # ensure the base URL isn't crawled again
     try:
         initial_urls.remove(base_url)
     except KeyError:
         pass
+    # also ensure base URL wasn't discovered with a trailing slash on the
+    # initial page scrape
+    try:
+        initial_urls.remove("".join([base_url, '/']))
+    except KeyError:
+        pass
+
     # Add the base URL to the crawled pool
     crawled_urls.add_to_pool(base_url)
 
diff --git a/notes.md b/notes.md
index 737221e..1575f85 100644
--- a/notes.md
+++ b/notes.md
@@ -7,4 +7,4 @@
   * ~~`WebPage.parse_urls()` needs to compare startswith to base url~~
   * ignore any links which aren't to pages
   * better url checking to get bare domain
-  * remove base url from initial urls with and without trailing slash
\ No newline at end of file
+  * ~~remove base url from initial urls with and without trailing slash~~