small improvements to docs and variables
@@ -24,6 +24,15 @@ from utils.helpers import RobotsTxt, AsyncCrawler, standardise_url
def sanity_checks(url=None):
    '''
    Runs some basic sanity checks before the crawler is initialised.

    Accepts:
        url: the root URL to be crawled.

    Returns:
        baseurl: a validated and cleaned version of the initial URL.
                 (type=string)
        robots: an object which allows us to query whether a site may be crawled.
                (type=RobotsTxt)
    '''
    # ensure we have a sensible URL to work with
    baseurl = standardise_url(url=url)
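For context, the check this docstring describes (clean the root URL, then obtain an object that can answer robots.txt queries) can be sketched with nothing but the standard library. This is an illustrative sketch only: the function name and logic below are assumptions, not the project's actual standardise_url/RobotsTxt implementation, whose internals are not shown in this diff.

    # Illustrative sketch: uses urllib from the standard library rather than the
    # project's own standardise_url/RobotsTxt helpers, whose interfaces are not
    # visible in this diff.
    from urllib.parse import urlparse, urlunparse
    from urllib.robotparser import RobotFileParser

    def sanity_checks_sketch(url=None):
        if not url:
            raise ValueError('a root URL is required')
        # assume https if no scheme was supplied
        parsed = urlparse(url if '://' in url else 'https://' + url)
        if not parsed.netloc:
            raise ValueError('could not extract a hostname from %r' % url)
        # normalise to scheme://host/ with no path, query or fragment
        baseurl = urlunparse((parsed.scheme, parsed.netloc, '/', '', '', ''))
        robots = RobotFileParser()
        robots.set_url(baseurl + 'robots.txt')
        robots.read()  # fetches and parses robots.txt over the network
        return baseurl, robots

Calling robots.can_fetch('*', some_url) then answers whether a given URL may be crawled, which is the role the docstring above assigns to the returned RobotsTxt object.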
@@ -41,6 +50,11 @@ def sanity_checks(url=None):
def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
    '''
    Renders the sitemap to an HTML file.

    Accepts:
        base_url: the root URL that was crawled.
        crawled_urls: the URLs discovered during the crawl.
        runtime: how long the crawl took.
    '''
    urlcount = len(crawled_urls)
    sorted_urls = sorted(crawled_urls)
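The diff cuts off after the first two lines of the body, so as a rough illustration of what "renders the sitemap to an HTML file" might involve, here is a hedged sketch. The output filename (sitemap.html), the markup, and the assumption that runtime is a number of seconds are all illustrative choices, not taken from the diff.

    # Hedged sketch of a render_sitemap body; the filename and markup are
    # assumptions, not the project's actual template.
    import html

    def render_sitemap_sketch(base_url=None, crawled_urls=None, runtime=None):
        urlcount = len(crawled_urls)
        sorted_urls = sorted(crawled_urls)
        # one list item per crawled URL, escaped for safe inclusion in HTML
        items = '\n'.join('<li><a href="{0}">{0}</a></li>'.format(html.escape(u))
                          for u in sorted_urls)
        page = (
            '<h1>Sitemap for {base}</h1>\n'
            '<p>{count} URLs crawled in {runtime:.2f}s</p>\n'
            '<ul>\n{items}\n</ul>\n'
        ).format(base=html.escape(base_url), count=urlcount,
                 runtime=runtime, items=items)
        with open('sitemap.html', 'w') as outfile:
            outfile.write(page)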