report runtime of script in generated sitemap

commit d1c1e17f4f
parent 816a727d79
Date: 2018-09-06 17:20:59 +01:00

2 changed files with 9 additions and 5 deletions


@@ -5,8 +5,8 @@ Need a docstring.
 import argparse
 import jinja2
+from datetime import datetime
 from utils.helpers import (UrlPool, WebPage, RobotsTxt, sanitise_url)
 from pprint import pprint

 def init_crawler(base_url=None, robots=None):
@@ -68,7 +68,7 @@ def process_pool(base_url=None, uncrawled_urls=None, crawled_urls=None, robots=N
         uncrawled_urls.add_to_pool(url)

-def render_sitemap(base_url=None, crawled_urls=None):
+def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
     '''
     Renders the sitemap as an HTML file.
     '''
@@ -79,7 +79,7 @@ def render_sitemap(base_url=None, crawled_urls=None):
         loader=jinja2.FileSystemLoader('templates')
     ).get_template('sitemap.html.j2')
-    rendered_html = tmpl.render(base_url=base_url, urlcount=urlcount, urls=sorted_urls)
+    rendered_html = tmpl.render(base_url=base_url, urlcount=urlcount, urls=sorted_urls, runtime=runtime)
     with open('sitemap.html', 'w') as outfile:
         outfile.write(rendered_html)
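
This hunk only shows the render side; the second changed file is presumably templates/sitemap.html.j2 (not shown in this diff), which has to reference the new runtime variable for it to appear in the generated page. A minimal, self-contained sketch of the same render step, using a made-up inline template as a stand-in for sitemap.html.j2 and hypothetical sample values:

import jinja2

# Inline stand-in for templates/sitemap.html.j2; the real template's markup
# is not part of this diff, only the variables passed to tmpl.render() are.
template_source = (
    '<h1>Sitemap for {{ base_url }}</h1>\n'
    '<p>{{ urlcount }} URLs crawled in {{ runtime }} seconds.</p>\n'
    '<ul>{% for url in urls %}<li>{{ url }}</li>{% endfor %}</ul>'
)

tmpl = jinja2.Environment().from_string(template_source)
print(tmpl.render(
    base_url='https://example.com',  # hypothetical values for illustration
    urlcount=2,
    urls=['https://example.com/', 'https://example.com/about'],
    runtime=3,
))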
@@ -91,13 +91,17 @@ def run(args=None):
     '''
     needs a docstring.
     '''
+    starttime = datetime.now()
     base_url = sanitise_url(args.url, base_url=True)
     robots = RobotsTxt(base_url=base_url)
     uncrawled_urls, crawled_urls = init_crawler(base_url, robots)
     process_pool(base_url, uncrawled_urls, crawled_urls, robots)
-    render_sitemap(base_url=base_url, crawled_urls=crawled_urls.pool)
+    runtime = int((datetime.now() - starttime).total_seconds())
+    render_sitemap(base_url=base_url, crawled_urls=crawled_urls.pool, runtime=runtime)
     # pprint(crawled_urls.pool)
     # print('{0} URLs crawled'.format(len(crawled_urls.pool)))
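
Note that int(...) truncates the elapsed time to whole seconds, so a 59.9-second crawl reports as 59. A standalone sketch of the timing pattern introduced in run() above; the time.monotonic() variant is shown as a possible alternative (an assumption, not something this commit does) because it cannot jump if the wall clock is adjusted while the crawl runs:

from datetime import datetime
import time

starttime = datetime.now()
time.sleep(1.5)  # stand-in for the crawl
runtime = int((datetime.now() - starttime).total_seconds())  # truncates: 1.5 -> 1
print('finished in {0} seconds'.format(runtime))

# Alternative: time.monotonic() measures elapsed time and is immune to
# wall-clock changes (NTP adjustments, DST) during a long crawl.
start = time.monotonic()
time.sleep(1.5)
print('finished in {0} seconds'.format(int(time.monotonic() - start)))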