Compare commits

..

2 Commits

Author SHA1 Message Date
84ab27a75e render results as HTML 2018-09-06 17:08:26 +01:00
6d9103c154 improved content-type detection 2018-09-06 17:08:12 +01:00
3 changed files with 39 additions and 3 deletions

View File

@@ -4,6 +4,7 @@ Need a docstring.
'''
import argparse
import jinja2
from utils.helpers import (UrlPool, WebPage, RobotsTxt, sanitise_url)
from pprint import pprint
@@ -67,6 +68,25 @@ def process_pool(base_url=None, uncrawled_urls=None, crawled_urls=None, robots=N
uncrawled_urls.add_to_pool(url)
def render_sitemap(base_url=None, crawled_urls=None, template_dir='templates',
                   output_path='sitemap.html'):
    '''
    Renders the sitemap as an HTML file.

    Args:
        base_url: Root URL of the crawled site, displayed in the page heading.
        crawled_urls: Iterable of crawled URL strings to list in the sitemap.
        template_dir: Directory holding the Jinja2 template (default 'templates').
        output_path: Path the rendered HTML is written to (default 'sitemap.html').
    '''
    # Tolerate None so len()/sorted() below don't raise TypeError on an empty crawl.
    crawled_urls = crawled_urls if crawled_urls is not None else []
    urlcount = len(crawled_urls)
    sorted_urls = sorted(crawled_urls)
    # autoescape: URLs containing '&', '<', '"' etc. must be escaped when
    # interpolated into HTML attributes/text, or they corrupt the markup.
    tmpl = jinja2.Environment(
        loader=jinja2.FileSystemLoader(template_dir),
        autoescape=True,
    ).get_template('sitemap.html.j2')
    rendered_html = tmpl.render(base_url=base_url, urlcount=urlcount, urls=sorted_urls)
    # Explicit encoding keeps the output byte-identical across platforms.
    with open(output_path, 'w', encoding='utf-8') as outfile:
        outfile.write(rendered_html)
    print('Sitemap available at {0}'.format(output_path))
def run(args=None):
'''
needs a docstring.
@@ -77,8 +97,10 @@ def run(args=None):
uncrawled_urls, crawled_urls = init_crawler(base_url, robots)
process_pool(base_url, uncrawled_urls, crawled_urls, robots)
pprint(crawled_urls.pool)
print('{0} URLs crawled'.format(len(crawled_urls.pool)))
render_sitemap(base_url=base_url, crawled_urls=crawled_urls.pool)
# pprint(crawled_urls.pool)
# print('{0} URLs crawled'.format(len(crawled_urls.pool)))
if __name__ == '__main__':

14
templates/sitemap.html.j2 Normal file
View File

@@ -0,0 +1,14 @@
<html>
  <head>
    <title>Sitemap for {{ base_url }}</title>
  </head>
  <body>
    <p>
      Crawled {{ urlcount }} URLs on {{ base_url }}
    </p>
    {# The <p> must be closed before the list: a <p> element cannot contain
       flow content such as <ul>, and browsers would force-close it anyway. #}
    <ul>
      {% for url in urls %}
      <li><a href="{{ url }}">{{ url }}</a></li>
      {% endfor %}
    </ul>
  </body>
</html>

View File

@@ -62,7 +62,7 @@ class WebPage(object):
request = urllib.request.Request(self.url, headers=self.headers)
page = urllib.request.urlopen(request, timeout=5)
headers = page.info()
if headers['content-type'] == "text/html":
if "text/html" in headers['content-type']:
self.source = page.read()