report runtime of script in generated sitemap
 crawler.py                | 12 ++++++++----
 templates/sitemap.html.j2 |  2 +-
 2 files changed, 9 insertions(+), 5 deletions(-)

--- a/crawler.py
+++ b/crawler.py
@@ -5,8 +5,8 @@ Need a docstring.
 import argparse
 import jinja2
+from datetime import datetime
 from utils.helpers import (UrlPool, WebPage, RobotsTxt, sanitise_url)
-from pprint import pprint
 
 
 
 def init_crawler(base_url=None, robots=None):
@@ -68,7 +68,7 @@ def process_pool(base_url=None, uncrawled_urls=None, crawled_urls=None, robots=N
             uncrawled_urls.add_to_pool(url)
 
 
-def render_sitemap(base_url=None, crawled_urls=None):
+def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
     '''
     Renders the sitemap as an HTML file.
     '''
@@ -79,7 +79,7 @@ def render_sitemap(base_url=None, crawled_urls=None):
         loader=jinja2.FileSystemLoader('templates')
     ).get_template('sitemap.html.j2')
 
-    rendered_html = tmpl.render(base_url=base_url, urlcount=urlcount, urls=sorted_urls)
+    rendered_html = tmpl.render(base_url=base_url, urlcount=urlcount, urls=sorted_urls, runtime=runtime)
 
     with open('sitemap.html', 'w') as outfile:
         outfile.write(rendered_html)
@@ -91,13 +91,17 @@ def run(args=None):
     '''
     needs a docstring.
     '''
+    starttime = datetime.now()
+
     base_url = sanitise_url(args.url, base_url=True)
     robots = RobotsTxt(base_url=base_url)
 
     uncrawled_urls, crawled_urls = init_crawler(base_url, robots)
     process_pool(base_url, uncrawled_urls, crawled_urls, robots)
 
-    render_sitemap(base_url=base_url, crawled_urls=crawled_urls.pool)
+    runtime = int((datetime.now() - starttime).total_seconds())
+
+    render_sitemap(base_url=base_url, crawled_urls=crawled_urls.pool, runtime=runtime)
 
     # pprint(crawled_urls.pool)
     # print('{0} URLs crawled'.format(len(crawled_urls.pool)))
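
The timing pattern this commit introduces is small enough to demonstrate on its own: snapshot datetime.now() before the crawl, subtract it afterwards, truncate total_seconds() to an int, and hand the result to the template context. A minimal, self-contained sketch of that flow, with an inline template and a sleep standing in for templates/sitemap.html.j2 and the real crawl:

from datetime import datetime
import time

import jinja2

# Same timing approach as run(): capture a start timestamp, do the work,
# then truncate the elapsed time to whole seconds.
starttime = datetime.now()
time.sleep(1.2)  # stand-in for the actual crawl
runtime = int((datetime.now() - starttime).total_seconds())

# The real code loads templates/sitemap.html.j2 via
# jinja2.FileSystemLoader('templates'); an inline template keeps the
# sketch self-contained.
tmpl = jinja2.Template(
    'Crawled {{ urlcount }} URLs on {{ base_url }} in ~{{ runtime }} seconds.'
)
print(tmpl.render(base_url='https://example.com', urlcount=42, runtime=runtime))
# -> Crawled 42 URLs on https://example.com in ~1 seconds.

Since int() truncates, a crawl that finishes in under a second reports "~0 seconds"; the "~" added to the template hedges exactly that imprecision.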

--- a/templates/sitemap.html.j2
+++ b/templates/sitemap.html.j2
@@ -4,7 +4,7 @@
   </head>
   <body>
     <p>
-      Crawled {{ urlcount }} URLs on {{ base_url }}
+      Crawled {{ urlcount }} URLs on {{ base_url }} in ~{{ runtime }} seconds.
     <ul>
     {% for url in urls %}
       <li><a href="{{ url }}">{{ url }}</a></li>
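
One edge case worth noting: render_sitemap() keeps runtime=None as its default, and Jinja2 renders None literally, so a call site that omits the new argument would emit "in ~None seconds.". A hedged sketch of a template-side guard (the conditional is a suggestion, not part of this commit):

import jinja2

# `is not none` is a built-in Jinja2 test, so a legitimate 0-second run
# still renders while an absent value drops the clause entirely.
tmpl = jinja2.Template(
    'Crawled {{ urlcount }} URLs on {{ base_url }}'
    '{% if runtime is not none %} in ~{{ runtime }} seconds{% endif %}.'
)
print(tmpl.render(base_url='https://example.com', urlcount=3, runtime=None))
# -> Crawled 3 URLs on https://example.com.
print(tmpl.render(base_url='https://example.com', urlcount=3, runtime=0))
# -> Crawled 3 URLs on https://example.com in ~0 seconds.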