From 1b9b207a2852a690b791c8aa8df8ccfda2e23f18 Mon Sep 17 00:00:00 2001 From: Simon Weald Date: Tue, 4 Sep 2018 13:57:52 +0100 Subject: [PATCH] attempt to remove base url with trailing slash (if discovered) --- crawler.py | 9 +++++++++ notes.md | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/crawler.py b/crawler.py index 782b7bc..d2011b6 100644 --- a/crawler.py +++ b/crawler.py @@ -7,6 +7,7 @@ import argparse from utils.helpers import (UrlPool, WebPage, sanitise_url) from pprint import pprint + def init_crawler(base_url=None): ''' needs a docstring @@ -20,11 +21,19 @@ def init_crawler(base_url=None): print(e) initial_urls = initial_page.list_urls() + # ensure the base URL isn't crawled again try: initial_urls.remove(base_url) except KeyError: pass + # also ensure base URL wasn't discovered with a trailing slash on the + # initial page scrape + try: + initial_urls.remove("".join([base_url, '/'])) + except KeyError: + pass + # Add the base URL to the crawled pool crawled_urls.add_to_pool(base_url) diff --git a/notes.md b/notes.md index 737221e..1575f85 100644 --- a/notes.md +++ b/notes.md @@ -7,4 +7,4 @@ * ~~`WebPage.parse_urls()` needs to compare startswith to base url~~ * ignore any links which aren't to pages * better url checking to get bare domain - * remove base url from initial urls with and without trailing slash \ No newline at end of file + * ~~remove base url from initial urls with and without trailing slash~~