Attempt to remove the base URL with a trailing slash (if discovered)

This commit is contained in:
2018-09-04 13:57:52 +01:00
parent 6abe7d68e0
commit 1b9b207a28
2 changed files with 10 additions and 1 deletions

View File

@@ -7,6 +7,7 @@ import argparse
from utils.helpers import (UrlPool, WebPage, sanitise_url)
from pprint import pprint
def init_crawler(base_url=None):
'''
needs a docstring
@@ -20,11 +21,19 @@ def init_crawler(base_url=None):
print(e)
initial_urls = initial_page.list_urls()
# ensure the base URL isn't crawled again
try:
initial_urls.remove(base_url)
except KeyError:
pass
# also ensure base URL wasn't discovered with a trailing slash on the
# initial page scrape
try:
initial_urls.remove("".join([base_url, '/']))
except KeyError:
pass
# Add the base URL to the crawled pool
crawled_urls.add_to_pool(base_url)

View File

@@ -7,4 +7,4 @@
* ~~`WebPage.parse_urls()` needs to compare startswith to base url~~
* ignore any links which aren't to pages
* better url checking to get bare domain
* remove base url from initial urls with and without trailing slash * ~~remove base url from initial urls with and without trailing slash~~