attempt to remove base url with trailing slash (if discovered)

This commit is contained in:
2018-09-04 13:57:52 +01:00
parent 6abe7d68e0
commit 1b9b207a28
2 changed files with 10 additions and 1 deletion


@@ -7,6 +7,7 @@ import argparse
from utils.helpers import (UrlPool, WebPage, sanitise_url)
from pprint import pprint

def init_crawler(base_url=None):
    '''
    needs a docstring
@@ -20,11 +21,19 @@ def init_crawler(base_url=None):
        print(e)
    initial_urls = initial_page.list_urls()
    # ensure the base URL isn't crawled again
    try:
        initial_urls.remove(base_url)
    except KeyError:
        pass
    # also ensure base URL wasn't discovered with a trailing slash on the
    # initial page scrape
    try:
        initial_urls.remove("".join([base_url, '/']))
    except KeyError:
        pass
    # Add the base URL to the crawled pool
    crawled_urls.add_to_pool(base_url)
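
For context, the two try/except blocks above remove the bare base URL and its trailing-slash variant one at a time. A minimal sketch of an equivalent approach that catches both forms in a single pass, assuming `initial_urls` is a set of absolute URL strings (the `except KeyError` in the diff suggests a set); the function name below is illustrative and not part of this repository:

```python
def drop_base_url(initial_urls, base_url):
    """Remove the base URL from a set of discovered URLs,
    whether or not it was scraped with a trailing slash."""
    normalised_base = base_url.rstrip('/')
    # keep only URLs that don't collapse to the base URL once the
    # trailing slash is ignored
    return {url for url in initial_urls if url.rstrip('/') != normalised_base}


if __name__ == '__main__':
    urls = {'http://example.com', 'http://example.com/', 'http://example.com/about'}
    print(drop_base_url(urls, 'http://example.com'))
    # {'http://example.com/about'}
```

The same normalisation idea would also make the `"".join([base_url, '/'])` removal unnecessary, since comparing stripped URLs treats both variants as the same page.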


@@ -7,4 +7,4 @@
* ~~`WebPage.parse_urls()` needs to compare startswith to base url~~
* ignore any links which aren't to pages
* better url checking to get bare domain
* remove base url from initial urls with and without trailing slash
* ~~remove base url from initial urls with and without trailing slash~~
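
The remaining "better url checking to get bare domain" item could plausibly be handled with the standard library's `urllib.parse`; a rough sketch under that assumption (the helper names are hypothetical and do not exist in this repository):

```python
from urllib.parse import urlparse


def get_bare_domain(url):
    """Return the scheme and host of a URL, dropping path, query and fragment."""
    parts = urlparse(url)
    return f"{parts.scheme}://{parts.netloc}"


def is_same_domain(url, base_url):
    """True if url points at the same host as base_url."""
    return urlparse(url).netloc == urlparse(base_url).netloc


print(get_bare_domain('http://example.com/about?page=1'))                   # http://example.com
print(is_same_domain('http://example.com/contact', 'http://example.com/'))  # True
```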