Attempt to remove the base URL with a trailing slash (if discovered)
This commit is contained in:
@@ -7,6 +7,7 @@ import argparse
|
|||||||
from utils.helpers import (UrlPool, WebPage, sanitise_url)
|
from utils.helpers import (UrlPool, WebPage, sanitise_url)
|
||||||
from pprint import pprint
|
from pprint import pprint
|
||||||
|
|
||||||
|
|
||||||
def init_crawler(base_url=None):
|
def init_crawler(base_url=None):
|
||||||
'''
|
'''
|
||||||
needs a docstring
|
needs a docstring
|
||||||
@@ -20,11 +21,19 @@ def init_crawler(base_url=None):
|
|||||||
print(e)
|
print(e)
|
||||||
|
|
||||||
initial_urls = initial_page.list_urls()
|
initial_urls = initial_page.list_urls()
|
||||||
|
|
||||||
# ensure the base URL isn't crawled again
|
# ensure the base URL isn't crawled again
|
||||||
try:
|
try:
|
||||||
initial_urls.remove(base_url)
|
initial_urls.remove(base_url)
|
||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
|
# also ensure base URL wasn't discovered with a trailing slash on the
|
||||||
|
# initial page scrape
|
||||||
|
try:
|
||||||
|
initial_urls.remove("".join([base_url, '/']))
|
||||||
|
except KeyError:
|
||||||
|
pass
|
||||||
|
|
||||||
# Add the base URL to the crawled pool
|
# Add the base URL to the crawled pool
|
||||||
crawled_urls.add_to_pool(base_url)
|
crawled_urls.add_to_pool(base_url)
|
||||||
|
|
||||||
|
|||||||
2
notes.md
2
notes.md
@@ -7,4 +7,4 @@
|
|||||||
* ~~`WebPage.parse_urls()` needs to compare startswith to base url~~
|
* ~~`WebPage.parse_urls()` needs to compare startswith to base url~~
|
||||||
* ignore any links which aren't to pages
|
* ignore any links which aren't to pages
|
||||||
* better url checking to get bare domain
|
* better url checking to get bare domain
|
||||||
* remove base url from initial urls with and without trailing slash
|
* ~~remove base url from initial urls with and without trailing slash~~
|
||||||
|
|||||||
Reference in New Issue
Block a user