render results as HTML
This commit is contained in:
26
crawler.py
26
crawler.py
@@ -4,6 +4,7 @@ Need a docstring.
|
||||
'''
|
||||
|
||||
import argparse
|
||||
import jinja2
|
||||
from utils.helpers import (UrlPool, WebPage, RobotsTxt, sanitise_url)
|
||||
from pprint import pprint
|
||||
|
||||
@@ -67,6 +68,25 @@ def process_pool(base_url=None, uncrawled_urls=None, crawled_urls=None, robots=N
|
||||
uncrawled_urls.add_to_pool(url)
|
||||
|
||||
|
||||
def render_sitemap(base_url=None, crawled_urls=None):
    '''
    Renders the sitemap as an HTML file.

    Loads the ``sitemap.html.j2`` template from the ``templates``
    directory, renders it with the base URL, the number of crawled URLs
    and the URLs in sorted order, then writes the result to
    ``sitemap.html`` in the current working directory.

    base_url: passed through to the template unchanged.
    crawled_urls: iterable of crawled URLs; ``None`` is treated as empty.
    '''
    # Sort once and derive the count from the same list so the reported
    # total always matches what is rendered. ``None`` (the default) is
    # treated as "nothing crawled" instead of raising on len(None).
    sorted_urls = sorted(crawled_urls or ())
    urlcount = len(sorted_urls)

    # Crawled URLs come from arbitrary pages, so escape them when they are
    # interpolated into the HTML. The template name ends in ``.j2``, which
    # extension-based autoescape selection would not match, so autoescaping
    # is switched on unconditionally.
    env = jinja2.Environment(
        loader=jinja2.FileSystemLoader('templates'),
        autoescape=True,
    )
    tmpl = env.get_template('sitemap.html.j2')

    rendered_html = tmpl.render(
        base_url=base_url,
        urlcount=urlcount,
        urls=sorted_urls,
    )

    # Explicit encoding: URLs may contain non-ASCII characters and the
    # platform default encoding is not guaranteed to be able to encode them.
    with open('sitemap.html', 'w', encoding='utf-8') as outfile:
        outfile.write(rendered_html)

    print('Sitemap available at sitemap.html')
|
||||
|
||||
|
||||
def run(args=None):
|
||||
'''
|
||||
needs a docstring.
|
||||
@@ -77,8 +97,10 @@ def run(args=None):
|
||||
uncrawled_urls, crawled_urls = init_crawler(base_url, robots)
|
||||
process_pool(base_url, uncrawled_urls, crawled_urls, robots)
|
||||
|
||||
pprint(crawled_urls.pool)
|
||||
print('{0} URLs crawled'.format(len(crawled_urls.pool)))
|
||||
render_sitemap(base_url=base_url, crawled_urls=crawled_urls.pool)
|
||||
|
||||
# pprint(crawled_urls.pool)
|
||||
# print('{0} URLs crawled'.format(len(crawled_urls.pool)))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
Reference in New Issue
Block a user