report runtime of script in generated sitemap

This commit is contained in:
2018-09-06 17:20:59 +01:00
parent 816a727d79
commit d1c1e17f4f
2 changed files with 9 additions and 5 deletions

View File

@@ -5,8 +5,8 @@ Need a docstring.
import argparse import argparse
import jinja2 import jinja2
from datetime import datetime
from utils.helpers import (UrlPool, WebPage, RobotsTxt, sanitise_url) from utils.helpers import (UrlPool, WebPage, RobotsTxt, sanitise_url)
from pprint import pprint
def init_crawler(base_url=None, robots=None): def init_crawler(base_url=None, robots=None):
@@ -68,7 +68,7 @@ def process_pool(base_url=None, uncrawled_urls=None, crawled_urls=None, robots=N
uncrawled_urls.add_to_pool(url) uncrawled_urls.add_to_pool(url)
-def render_sitemap(base_url=None, crawled_urls=None):
+def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
''' '''
Renders the sitemap as an HTML file. Renders the sitemap as an HTML file.
''' '''
@@ -79,7 +79,7 @@ def render_sitemap(base_url=None, crawled_urls=None):
loader=jinja2.FileSystemLoader('templates') loader=jinja2.FileSystemLoader('templates')
).get_template('sitemap.html.j2') ).get_template('sitemap.html.j2')
-rendered_html = tmpl.render(base_url=base_url, urlcount=urlcount, urls=sorted_urls)
+rendered_html = tmpl.render(base_url=base_url, urlcount=urlcount, urls=sorted_urls, runtime=runtime)
with open('sitemap.html', 'w') as outfile: with open('sitemap.html', 'w') as outfile:
outfile.write(rendered_html) outfile.write(rendered_html)
@@ -91,13 +91,17 @@ def run(args=None):
''' '''
needs a docstring. needs a docstring.
''' '''
starttime = datetime.now()
base_url = sanitise_url(args.url, base_url=True) base_url = sanitise_url(args.url, base_url=True)
robots = RobotsTxt(base_url=base_url) robots = RobotsTxt(base_url=base_url)
uncrawled_urls, crawled_urls = init_crawler(base_url, robots) uncrawled_urls, crawled_urls = init_crawler(base_url, robots)
process_pool(base_url, uncrawled_urls, crawled_urls, robots) process_pool(base_url, uncrawled_urls, crawled_urls, robots)
-render_sitemap(base_url=base_url, crawled_urls=crawled_urls.pool)
+runtime = int((datetime.now() - starttime).total_seconds())
+render_sitemap(base_url=base_url, crawled_urls=crawled_urls.pool, runtime=runtime)
# pprint(crawled_urls.pool) # pprint(crawled_urls.pool)
# print('{0} URLs crawled'.format(len(crawled_urls.pool))) # print('{0} URLs crawled'.format(len(crawled_urls.pool)))

View File

@@ -4,7 +4,7 @@
</head> </head>
<body> <body>
<p> <p>
-Crawled {{ urlcount }} URLs on {{ base_url }}
+Crawled {{ urlcount }} URLs on {{ base_url }} in ~{{ runtime }} seconds.
<ul> <ul>
{% for url in urls %} {% for url in urls %}
<li><a href="{{ url }}">{{ url }}</a></li> <li><a href="{{ url }}">{{ url }}</a></li>