#!/usr/bin/env python3
'''
Asynchronous web crawler written in Python 3.5+.

This script will respect the site's `robots.txt`, if one exists. If not, all
URLs discovered will be crawled.

The crawler takes two arguments (concurrency is optional):

    url: the root URL to begin the crawl from.
    concurrency: the maximum number of pages which may be crawled concurrently.
'''

import argparse
import asyncio
from datetime import datetime
import os
import sys

import jinja2

from utils.helpers import RobotsTxt, AsyncCrawler, standardise_url


def sanity_checks(url=None):
    '''
    Runs some basic sanity checks before the crawler is initialised.

    Accepts:
        url: the root URL to be crawled.

    Returns:
        rooturl: a string containing a validated and cleaned version of the
            initial URL.
        robots: an object which allows us to query whether a site may be
            crawled.
    '''
    # ensure we have a sensible URL to work with
    rooturl = standardise_url(url=url)

    # get robots.txt
    robots = RobotsTxt(rooturl=rooturl)

    # fail early if robots.txt denies all crawling
    if not robots.check(url=rooturl):
        sys.exit("{0} cannot be crawled (denied by robots.txt)".format(
            rooturl))

    return rooturl, robots


def render_sitemap(rooturl=None, crawled_urls=None, runtime=None):
    '''
    Renders the sitemap to an HTML file.

    Accepts:
        rooturl: string containing the root URL
        crawled_urls: set containing discovered URLs
        runtime: int representing run time of AsyncCrawler, in seconds
    '''
    urlcount = len(crawled_urls)
    sorted_urls = sorted(crawled_urls)

    template = jinja2.Environment(
        loader=jinja2.FileSystemLoader('templates')
    ).get_template('sitemap.html.j2')

    rendered_html = template.render(rooturl=rooturl, urlcount=urlcount,
                                    urls=sorted_urls, runtime=runtime)

    with open('sitemap.html', 'w') as outfile:
        outfile.write(rendered_html)

    print('Sitemap available at {0}/sitemap.html'.format(os.getcwd()))


def main():
    '''
    Main function, responsible for prepping and running the crawler and
    rendering the sitemap.
    '''
    starttime = datetime.now()

    rooturl, robots = sanity_checks(url=args.url)

    # create a crawler
    async_crawler = AsyncCrawler(rooturl=rooturl, robots=robots,
                                 concurrency=args.concurrency)

    # create a task to run the crawler, run the loop and then gather the
    # results.
    loop = asyncio.get_event_loop()
    task = loop.create_task(async_crawler.main())
    loop.run_until_complete(task)
    loop.close()
    results = sorted(task.result())

    runtime = int((datetime.now() - starttime).total_seconds())

    render_sitemap(rooturl=rooturl, crawled_urls=results, runtime=runtime)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Recursive web crawler')
    parser.add_argument("-u", "--url", required=True, help="Initial URL")
    parser.add_argument("-c", "--concurrency", required=False, type=int,
                        default=100, help="Max pages to crawl concurrently")
    args = parser.parse_args()

    main()
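
# Usage sketch (not part of the script's logic): the filename crawler.py and
# the example URL below are illustrative assumptions, and utils.helpers is
# assumed to be importable from the working directory.
#
#   python3 crawler.py --url https://www.example.com --concurrency 50
#
# This crawls https://www.example.com with at most 50 pages fetched
# concurrently and writes sitemap.html to the current working directory,
# rendered from templates/sitemap.html.j2.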