diff --git a/crawler.py b/crawler.py
index b791ba0..3401c26 100644
--- a/crawler.py
+++ b/crawler.py
@@ -4,6 +4,7 @@
 Need a docstring.
 '''
 import argparse
+import jinja2
 
 from utils.helpers import (UrlPool, WebPage, RobotsTxt, sanitise_url)
 from pprint import pprint
@@ -67,6 +68,25 @@ def process_pool(base_url=None, uncrawled_urls=None, crawled_urls=None, robots=N
         uncrawled_urls.add_to_pool(url)
 
 
+def render_sitemap(base_url=None, crawled_urls=None):
+    '''
+    Renders the sitemap as an HTML file.
+    '''
+    urlcount = len(crawled_urls)
+    sorted_urls = sorted(crawled_urls)
+
+    tmpl = jinja2.Environment(
+        loader=jinja2.FileSystemLoader('templates')
+    ).get_template('sitemap.html.j2')
+
+    rendered_html = tmpl.render(base_url=base_url, urlcount=urlcount, urls=sorted_urls)
+
+    with open('sitemap.html', 'w') as outfile:
+        outfile.write(rendered_html)
+
+    print('Sitemap available at sitemap.html')
+
+
 def run(args=None):
     '''
     needs a docstring.
@@ -77,8 +97,10 @@ def run(args=None):
     uncrawled_urls, crawled_urls = init_crawler(base_url, robots)
     process_pool(base_url, uncrawled_urls, crawled_urls, robots)
 
-    pprint(crawled_urls.pool)
-    print('{0} URLs crawled'.format(len(crawled_urls.pool)))
+    render_sitemap(base_url=base_url, crawled_urls=crawled_urls.pool)
+
+    # pprint(crawled_urls.pool)
+    # print('{0} URLs crawled'.format(len(crawled_urls.pool)))
 
 
 if __name__ == '__main__':
diff --git a/templates/sitemap.html.j2 b/templates/sitemap.html.j2
new file mode 100644
index 0000000..606a846
--- /dev/null
+++ b/templates/sitemap.html.j2
@@ -0,0 +1,14 @@
+<!DOCTYPE html>
+<html>
+  <head>
+    <title>Sitemap for {{ base_url }}</title>
+  </head>
+  <body>
+    <h1>Crawled {{ urlcount }} URLs on {{ base_url }}</h1>
+    <ul>
+      {% for url in urls %}
+      <li><a href="{{ url }}">{{ url }}</a></li>
+      {% endfor %}
+    </ul>
+  </body>
+</html>
\ No newline at end of file