attempt to remove base url with trailing slash (if discovered)

This commit is contained in:
2018-09-04 13:57:52 +01:00
parent 6abe7d68e0
commit 1b9b207a28
2 changed files with 10 additions and 1 deletion


@@ -7,6 +7,7 @@ import argparse
from utils.helpers import (UrlPool, WebPage, sanitise_url)
from pprint import pprint

def init_crawler(base_url=None):
    '''
    needs a docstring
@@ -20,11 +21,19 @@ def init_crawler(base_url=None):
        print(e)
    initial_urls = initial_page.list_urls()
    # ensure the base URL isn't crawled again
    try:
        initial_urls.remove(base_url)
    except KeyError:
        pass
    # also ensure base URL wasn't discovered with a trailing slash on the
    # initial page scrape
    try:
        initial_urls.remove("".join([base_url, '/']))
    except KeyError:
        pass
    # Add the base URL to the crawled pool
    crawled_urls.add_to_pool(base_url)
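
For context, the two try/except blocks above remove the bare base URL and its trailing-slash variant one at a time. A minimal sketch of an equivalent approach that catches both forms in a single pass, assuming `initial_urls` is a set of absolute URL strings (the `except KeyError` in the diff suggests a set); the function name below is illustrative and not part of this repository:

```python
def drop_base_url(initial_urls, base_url):
    """Remove the base URL from a set of discovered URLs,
    whether or not it was scraped with a trailing slash."""
    normalised_base = base_url.rstrip('/')
    # keep only URLs that don't collapse to the base URL once the
    # trailing slash is ignored
    return {url for url in initial_urls if url.rstrip('/') != normalised_base}


if __name__ == '__main__':
    urls = {'http://example.com', 'http://example.com/', 'http://example.com/about'}
    print(drop_base_url(urls, 'http://example.com'))
    # {'http://example.com/about'}
```

The same normalisation idea would also make the `"".join([base_url, '/'])` removal unnecessary, since comparing stripped URLs treats both variants as the same page.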


@@ -7,4 +7,4 @@
* ~~`WebPage.parse_urls()` needs to compare startswith to base url~~
* ignore any links which aren't to pages
* better url checking to get bare domain
* remove base url from initial urls with and without trailing slash
* ~~remove base url from initial urls with and without trailing slash~~
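
The remaining "better url checking to get bare domain" item could plausibly be handled with the standard library's `urllib.parse`; a rough sketch under that assumption (the helper names are hypothetical and do not exist in this repository):

```python
from urllib.parse import urlparse


def get_bare_domain(url):
    """Return the scheme and host of a URL, dropping path, query and fragment."""
    parts = urlparse(url)
    return f"{parts.scheme}://{parts.netloc}"


def is_same_domain(url, base_url):
    """True if url points at the same host as base_url."""
    return urlparse(url).netloc == urlparse(base_url).netloc


print(get_bare_domain('http://example.com/about?page=1'))                   # http://example.com
print(is_same_domain('http://example.com/contact', 'http://example.com/'))  # True
```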