diff --git a/crawler.py b/crawler.py
index 3401c26..3c6bf61 100644
--- a/crawler.py
+++ b/crawler.py
@@ -5,8 +5,8 @@ Need a docstring.
 import argparse
 import jinja2
+from datetime import datetime
 
 from utils.helpers import (UrlPool, WebPage, RobotsTxt, sanitise_url)
-from pprint import pprint
 
 
 def init_crawler(base_url=None, robots=None):
@@ -68,7 +68,7 @@ def process_pool(base_url=None, uncrawled_urls=None, crawled_urls=None, robots=N
                 uncrawled_urls.add_to_pool(url)
 
 
-def render_sitemap(base_url=None, crawled_urls=None):
+def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
     '''
     Renders the sitemap as an HTML file.
     '''
@@ -79,7 +79,7 @@ def render_sitemap(base_url=None, crawled_urls=None):
         loader=jinja2.FileSystemLoader('templates')
     ).get_template('sitemap.html.j2')
 
-    rendered_html = tmpl.render(base_url=base_url, urlcount=urlcount, urls=sorted_urls)
+    rendered_html = tmpl.render(base_url=base_url, urlcount=urlcount, urls=sorted_urls, runtime=runtime)
 
     with open('sitemap.html', 'w') as outfile:
         outfile.write(rendered_html)
@@ -91,13 +91,17 @@ def run(args=None):
     '''
     needs a docstring.
     '''
+    starttime = datetime.now()
+
     base_url = sanitise_url(args.url, base_url=True)
     robots = RobotsTxt(base_url=base_url)
 
     uncrawled_urls, crawled_urls = init_crawler(base_url, robots)
     process_pool(base_url, uncrawled_urls, crawled_urls, robots)
 
-    render_sitemap(base_url=base_url, crawled_urls=crawled_urls.pool)
+    runtime = int((datetime.now() - starttime).total_seconds())
+
+    render_sitemap(base_url=base_url, crawled_urls=crawled_urls.pool, runtime=runtime)
 
     # pprint(crawled_urls.pool)
     # print('{0} URLs crawled'.format(len(crawled_urls.pool)))
diff --git a/templates/sitemap.html.j2 b/templates/sitemap.html.j2
index 606a846..c9822ae 100644
--- a/templates/sitemap.html.j2
+++ b/templates/sitemap.html.j2
@@ -4,7 +4,7 @@
-Crawled {{ urlcount }} URLs on {{ base_url }}
+Crawled {{ urlcount }} URLs on {{ base_url }} in ~{{ runtime }} seconds.