#!/usr/bin/env python3
'''
Asynchronous web crawler written in Python 3.5+.

This script will respect the site's `robots.txt`, if one exists. If not, all
URLs discovered will be crawled.

The crawler takes a total of two arguments (concurrency is optional):

    url: the root URL to begin the crawl from.
    concurrency: the maximum number of pages which may be crawled concurrently.
'''
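
# Example invocation (the filename is illustrative; the flags match the
# argparse options defined at the bottom of this script):
#
#   ./crawler.py --url https://www.example.com --concurrency 50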

import argparse
import asyncio
import os
import sys
from datetime import datetime

import jinja2

from utils.helpers import RobotsTxt, AsyncCrawler, standardise_url


def sanity_checks(url=None):
    '''
    Runs some basic sanity checks before the crawler is initialised.

    Accepts:
        url: the root URL to be crawled.

    Returns:
        rooturl: a string containing a validated and cleaned version of the
                 initial URL.
        robots: an object which allows us to query whether a site may be crawled.
    '''
    # ensure we have a sensible URL to work with
    rooturl = standardise_url(url=url)

    # get robots.txt
    robots = RobotsTxt(rooturl=rooturl)

    # fail early if robots denies all crawling
    if not robots.check(url=rooturl):
        sys.exit("{0} cannot be crawled (denied by robots.txt)".format(
            rooturl))

    return rooturl, robots


def render_sitemap(rooturl=None, crawled_urls=None, runtime=None):
    '''
    Renders the sitemap to an HTML file.

    Accepts:
        rooturl: string containing the root URL
        crawled_urls: set containing discovered URLs
        runtime: int representing run time of AsyncCrawler
    '''
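    # NOTE: the 'templates' directory (and the 'sitemap.html.j2' template
    # inside it) is resolved relative to the current working directory, so
    # run this script from the directory containing 'templates/'.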
    urlcount = len(crawled_urls)
    sorted_urls = sorted(crawled_urls)

    template = jinja2.Environment(
        loader=jinja2.FileSystemLoader('templates')
    ).get_template('sitemap.html.j2')

    rendered_html = template.render(rooturl=rooturl, urlcount=urlcount,
                                    urls=sorted_urls, runtime=runtime)

    with open('sitemap.html', 'w') as outfile:
        outfile.write(rendered_html)

    print('Sitemap available at {0}/sitemap.html'.format(os.getcwd()))


def main():
    '''
    Main function, responsible for prepping and running the crawler and
    rendering the sitemap.
    '''
    starttime = datetime.now()

    rooturl, robots = sanity_checks(url=args.url)

    # create a crawler
    async_crawler = AsyncCrawler(rooturl=rooturl, robots=robots,
                                 concurrency=args.concurrency)

    # create a task to run the crawler, run the loop and then gather the
    # results.
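    # NOTE: on Python 3.7+ this could simply be asyncio.run(async_crawler.main());
    # the explicit event loop handling below is kept for 3.5/3.6 compatibility.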
    task = asyncio.ensure_future(async_crawler.main())
    loop = asyncio.get_event_loop()
    loop.run_until_complete(task)
    loop.close()
    results = sorted(task.result())

    runtime = int((datetime.now() - starttime).total_seconds())

    render_sitemap(rooturl=rooturl, crawled_urls=results, runtime=runtime)


if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='Recursive web crawler')
    parser.add_argument("-u", "--url", required=True, help="Initial URL")
    parser.add_argument("-c", "--concurrency", required=False, type=int,
                        default=100, help="Max pages to crawl concurrently")
    args = parser.parse_args()

    main()