Compare commits

2 Commits: e57a86c60a ... 84ab27a75e

| Author | SHA1 | Date |
|---|---|---|
| | 84ab27a75e | |
| | 6d9103c154 | |

crawler.py (26 changes)

```diff
@@ -4,6 +4,7 @@ Need a docstring.
 '''
 
 import argparse
+import jinja2
 from utils.helpers import (UrlPool, WebPage, RobotsTxt, sanitise_url)
 from pprint import pprint
 
@@ -67,6 +68,25 @@ def process_pool(base_url=None, uncrawled_urls=None, crawled_urls=None, robots=N
             uncrawled_urls.add_to_pool(url)
 
 
+def render_sitemap(base_url=None, crawled_urls=None):
+    '''
+    Renders the sitemap as an HTML file.
+    '''
+    urlcount = len(crawled_urls)
+    sorted_urls = sorted(crawled_urls)
+
+    tmpl = jinja2.Environment(
+        loader=jinja2.FileSystemLoader('templates')
+    ).get_template('sitemap.html.j2')
+
+    rendered_html = tmpl.render(base_url=base_url, urlcount=urlcount, urls=sorted_urls)
+
+    with open('sitemap.html', 'w') as outfile:
+        outfile.write(rendered_html)
+
+    print('Sitemap available at sitemap.html')
+
+
 def run(args=None):
     '''
     needs a docstring.
@@ -77,8 +97,10 @@ def run(args=None):
     uncrawled_urls, crawled_urls = init_crawler(base_url, robots)
     process_pool(base_url, uncrawled_urls, crawled_urls, robots)
 
-    pprint(crawled_urls.pool)
-    print('{0} URLs crawled'.format(len(crawled_urls.pool)))
+    render_sitemap(base_url=base_url, crawled_urls=crawled_urls.pool)
+
+    # pprint(crawled_urls.pool)
+    # print('{0} URLs crawled'.format(len(crawled_urls.pool)))
 
 
 if __name__ == '__main__':
```
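
A note on the new `render_sitemap()`: `jinja2.FileSystemLoader('templates')` resolves template names relative to the `templates/` directory, so `get_template('sitemap.html.j2')` picks up the file added below. To poke at the rendering step outside the repository layout, here is a minimal self-contained sketch; the inline template and demo URLs are illustrative stand-ins, not code from this commit:

```python
import jinja2

# Stand-in for templates/sitemap.html.j2, inlined so the sketch runs
# without the repository's templates/ directory.
template_source = '''Crawled {{ urlcount }} URLs on {{ base_url }}
{% for url in urls %}- {{ url }}
{% endfor %}'''

tmpl = jinja2.Environment().from_string(template_source)

# Made-up data; the real call passes crawled_urls.pool.
demo_urls = sorted(['https://example.com/about', 'https://example.com/'])
print(tmpl.render(base_url='https://example.com',
                  urlcount=len(demo_urls),
                  urls=demo_urls))
```
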
templates/sitemap.html.j2 (new file, 14 additions)

```diff
@@ -0,0 +1,14 @@
+<html>
+<head>
+    <title>Sitemap for {{ base_url }}</title>
+</head>
+<body>
+    <p>
+    Crawled {{ urlcount }} URLs on {{ base_url }}
+    <ul>
+    {% for url in urls %}
+        <li><a href="{{ url }}">{{ url }}</a></li>
+    {% endfor %}
+    </ul>
+</body>
+</html>
```
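
One detail worth flagging in review: `jinja2.Environment` defaults to `autoescape=False`, so `{{ url }}` values land in the `href` attribute unescaped. If crawled URLs can contain characters like `&` or `"`, enabling autoescaping is a one-line change to the `Environment` built in `render_sitemap()`; a sketch, not part of this commit:

```python
tmpl = jinja2.Environment(
    loader=jinja2.FileSystemLoader('templates'),
    autoescape=True,  # HTML-escape &, <, > and quotes in {{ url }} values
).get_template('sitemap.html.j2')
```
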
utils/helpers.py

```diff
@@ -62,7 +62,7 @@ class WebPage(object):
         request = urllib.request.Request(self.url, headers=self.headers)
         page = urllib.request.urlopen(request, timeout=5)
         headers = page.info()
-        if headers['content-type'] == "text/html":
+        if "text/html" in headers['content-type']:
             self.source = page.read()
 
 
```
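
The content-type fix matters because servers typically send a charset parameter along with the media type, e.g. `Content-Type: text/html; charset=utf-8`, so the old equality check skipped most real HTML responses. A quick illustration:

```python
header = "text/html; charset=utf-8"  # a typical Content-Type value

print(header == "text/html")   # False: the old exact match missed this page
print("text/html" in header)   # True: the substring check now accepts it
```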