Attempt to remove base URL with trailing slash (if discovered)
This commit is contained in:
@@ -7,6 +7,7 @@ import argparse
|
||||
from utils.helpers import (UrlPool, WebPage, sanitise_url)
|
||||
from pprint import pprint
|
||||
|
||||
|
||||
def init_crawler(base_url=None):
|
||||
'''
|
||||
needs a docstring
|
||||
@@ -20,11 +21,19 @@ def init_crawler(base_url=None):
|
||||
print(e)
|
||||
|
||||
initial_urls = initial_page.list_urls()
|
||||
|
||||
# ensure the base URL isn't crawled again
|
||||
try:
|
||||
initial_urls.remove(base_url)
|
||||
except KeyError:
|
||||
pass
|
||||
# also ensure base URL wasn't discovered with a trailing slash on the
|
||||
# initial page scrape
|
||||
try:
|
||||
initial_urls.remove("".join([base_url, '/']))
|
||||
except KeyError:
|
||||
pass
|
||||
|
||||
# Add the base URL to the crawled pool
|
||||
crawled_urls.add_to_pool(base_url)
|
||||
|
||||
|
||||
2
notes.md
2
notes.md
@@ -7,4 +7,4 @@
|
||||
* ~~`WebPage.parse_urls()` needs to compare startswith to base url~~
|
||||
* ignore any links which aren't to pages
|
||||
* better url checking to get bare domain
|
||||
* remove base url from initial urls with and without trailing slash
|
||||
* ~~remove base url from initial urls with and without trailing slash~~
|
||||
|
||||
Reference in New Issue
Block a user