#!/usr/bin/env python3
'''
Asynchronous web crawler written in Python 3.5+.

This script will respect the site's `robots.txt`, if one exists. If not,
all URLs discovered will be crawled.

The crawler takes two arguments (concurrency is optional):

    url: the base URL to begin the crawl from.
    concurrency: the maximum number of pages which may be crawled concurrently.

An example invocation can be found at the end of this file.
'''
import argparse
import asyncio
import os
import sys
from datetime import datetime

import jinja2

from utils.helpers import RobotsTxt, AsyncCrawler, standardise_url


def sanity_checks(url=None):
    '''
    Runs some basic sanity checks before the crawler is initialised.
    '''
    # ensure we have a sensible URL to work with
    baseurl = standardise_url(url=url)
    # get robots.txt
    robots = RobotsTxt(base_url=baseurl)
    # fail early if robots.txt denies all crawling
    if not robots.check(url=baseurl):
        sys.exit("{baseurl} cannot be crawled (denied by robots.txt)".format(baseurl=baseurl))
    return baseurl, robots


def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
    '''
    Renders the sitemap to an HTML file.
    '''
    urlcount = len(crawled_urls)
    sorted_urls = sorted(crawled_urls)
    template = jinja2.Environment(
        loader=jinja2.FileSystemLoader('templates')
    ).get_template('sitemap.html.j2')
    rendered_html = template.render(
        base_url=base_url,
        urlcount=urlcount,
        urls=sorted_urls,
        runtime=runtime)
    with open('sitemap.html', 'w') as outfile:
        outfile.write(rendered_html)
    print('Sitemap available at {0}/sitemap.html'.format(os.getcwd()))


def main():
    '''
    Main function, responsible for prepping and running the crawler and
    rendering the sitemap.
    '''
    starttime = datetime.now()
    baseurl, robots = sanity_checks(url=args.url)
    # create a crawler
    async_crawler = AsyncCrawler(baseurl=baseurl, robots=robots, concurrency=args.concurrency)
    # schedule the crawl on the event loop, run it to completion and gather the results
    loop = asyncio.get_event_loop()
    task = loop.create_task(async_crawler.main())
    loop.run_until_complete(task)
    loop.close()
    results = sorted(task.result())
    runtime = int((datetime.now() - starttime).total_seconds())
    render_sitemap(base_url=baseurl, crawled_urls=results, runtime=runtime)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Recursive web crawler')
    parser.add_argument("-u", "--url", required=True, help="Initial URL to crawl")
    parser.add_argument("-c", "--concurrency", required=False, type=int, default=100,
                        help="Max number of pages to crawl concurrently")
    args = parser.parse_args()
    main()
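
# Example invocation (a sketch only: the script filename, site URL and
# concurrency value below are assumptions, not taken from the project):
#
#     python3 crawl.py --url https://example.com --concurrency 50
#
# This would crawl example.com with at most 50 pages in flight at once and
# write sitemap.html to the current working directory.
#
# The helpers imported from utils.helpers are expected to provide, at minimum
# (interface inferred from the calls above; the real implementations live in
# utils/helpers.py and may differ):
#
#     standardise_url(url)                        -> normalised base URL string
#     RobotsTxt(base_url).check(url)              -> True if `url` may be crawled
#     AsyncCrawler(baseurl, robots, concurrency)  -> crawler whose main()
#                                                    coroutine returns the
#                                                    collection of crawled URLs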