web-scraper/async_crawler.py

#!/usr/bin/env python3
'''
Asynchronous web crawler for Python 3.5+.

This script respects the site's `robots.txt`, if one exists. If not, all
discovered URLs will be crawled.

The crawler takes two arguments (concurrency is optional):
    url: the root URL to begin the crawl from.
    concurrency: the maximum number of pages that may be crawled concurrently.
'''
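# Example invocation (the URL and concurrency value are illustrative only):
#   python3 async_crawler.py --url https://example.com --concurrency 50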
import argparse
import asyncio
import os
import sys
from datetime import datetime

import jinja2

from utils.helpers import RobotsTxt, AsyncCrawler, standardise_url
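# The helpers above are assumed (judging by how they are used below) to provide:
#   standardise_url(url)            -> cleaned/normalised root URL string
#   RobotsTxt(rooturl).check(url)   -> True if `url` may be crawled
#   AsyncCrawler(rooturl, robots, concurrency).main()
#                                   -> coroutine returning the set of crawled URLs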


def sanity_checks(url=None):
    '''
    Runs some basic sanity checks before the crawler is initialised.

    Accepts:
        url: the root URL to be crawled.

    Returns:
        rooturl: a string containing a validated and cleaned version of the
            initial URL.
        robots: an object which allows us to query whether a site may be
            crawled.
    '''
    # ensure we have a sensible URL to work with
    rooturl = standardise_url(url=url)
    # get robots.txt
    robots = RobotsTxt(rooturl=rooturl)
    # fail early if robots denies all crawling
    if not robots.check(url=rooturl):
        sys.exit("{0} cannot be crawled (denied by robots.txt)".format(
            rooturl))
    return rooturl, robots
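
# Illustrative only (the exact return value depends on utils.helpers): a call
# such as sanity_checks(url='example.com') might return something like
# ('http://example.com', <RobotsTxt instance>), or exit early if the site's
# robots.txt forbids crawling the root URL.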


def render_sitemap(rooturl=None, crawled_urls=None, runtime=None):
    '''
    Renders the sitemap to an HTML file.

    Accepts:
        rooturl: string containing the root URL.
        crawled_urls: set containing discovered URLs.
        runtime: int representing the run time of the crawl, in seconds.
    '''
    urlcount = len(crawled_urls)
    sorted_urls = sorted(crawled_urls)
    # load the Jinja2 template from the local `templates` directory
    template = jinja2.Environment(
        loader=jinja2.FileSystemLoader('templates')
    ).get_template('sitemap.html.j2')
    rendered_html = template.render(rooturl=rooturl, urlcount=urlcount,
                                    urls=sorted_urls, runtime=runtime)
    with open('sitemap.html', 'w') as outfile:
        outfile.write(rendered_html)
    print('Sitemap available at {0}/sitemap.html'.format(os.getcwd()))
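
# The template is expected at templates/sitemap.html.j2, relative to the
# working directory. A minimal sketch (illustrative only; the variable names
# match the render() call above):
#   <h1>Sitemap for {{ rooturl }}</h1>
#   <p>{{ urlcount }} URLs crawled in {{ runtime }} seconds</p>
#   <ul>{% for url in urls %}<li>{{ url }}</li>{% endfor %}</ul>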


def main():
    '''
    Main function, responsible for prepping and running the crawler and
    rendering the sitemap.
    '''
    starttime = datetime.now()
    rooturl, robots = sanity_checks(url=args.url)
    # create a crawler
    async_crawler = AsyncCrawler(rooturl=rooturl, robots=robots,
                                 concurrency=args.concurrency)
    # schedule the crawler on the event loop, run the loop to completion and
    # then gather the results
    loop = asyncio.get_event_loop()
    task = loop.create_task(async_crawler.main())
    loop.run_until_complete(task)
    loop.close()
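    # Note: on Python 3.7+ the manual loop handling above could be replaced by
    # `results = sorted(asyncio.run(async_crawler.main()))`; the explicit loop
    # is presumably kept for the Python 3.5+ support noted in the module
    # docstring.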
    results = sorted(task.result())
    runtime = int((datetime.now() - starttime).total_seconds())
    render_sitemap(rooturl=rooturl, crawled_urls=results, runtime=runtime)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Recursive web crawler')
    parser.add_argument("-u", "--url", required=True, help="Initial URL")
    parser.add_argument("-c", "--concurrency", required=False, type=int,
                        default=100, help="Max pages to crawl concurrently")
    args = parser.parse_args()
    main()