#!/usr/bin/env python3
'''
Asynchronous web crawler written in Python 3.5+.

This script will respect the site's `robots.txt`, if one exists. If not,
all URLs discovered will be crawled.

The crawler takes two arguments (concurrency is optional):

    url: the base URL to begin the crawl from.
    concurrency: the maximum number of pages which may be crawled concurrently.

An example invocation can be found at the end of this file.
'''
import argparse
import asyncio
import os
import sys
from datetime import datetime

import jinja2

from utils.helpers import RobotsTxt, AsyncCrawler, standardise_url


def sanity_checks(url=None):
    '''
    Runs some basic sanity checks before the crawler is initialised.
    '''
    # ensure we have a sensible URL to work with
    baseurl = standardise_url(url=url)
    # get robots.txt
    robots = RobotsTxt(base_url=baseurl)
    # fail early if robots.txt denies all crawling
    if not robots.check(url=baseurl):
        sys.exit("{baseurl} cannot be crawled (denied by robots.txt)".format(baseurl=baseurl))
    return baseurl, robots


def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
    '''
    Renders the sitemap to an HTML file.
    '''
    urlcount = len(crawled_urls)
    sorted_urls = sorted(crawled_urls)
    template = jinja2.Environment(
        loader=jinja2.FileSystemLoader('templates')
    ).get_template('sitemap.html.j2')
    rendered_html = template.render(
        base_url=base_url,
        urlcount=urlcount,
        urls=sorted_urls,
        runtime=runtime)
    with open('sitemap.html', 'w') as outfile:
        outfile.write(rendered_html)
    print('Sitemap available at {0}/sitemap.html'.format(os.getcwd()))


def main():
    '''
    Main function, responsible for prepping and running the crawler and
    rendering the sitemap.
    '''
    starttime = datetime.now()
    baseurl, robots = sanity_checks(url=args.url)
    # create a crawler
    async_crawler = AsyncCrawler(baseurl=baseurl, robots=robots, concurrency=args.concurrency)
    # schedule the crawl on the event loop, run it to completion and gather the results
    loop = asyncio.get_event_loop()
    task = loop.create_task(async_crawler.main())
    loop.run_until_complete(task)
    loop.close()
    results = sorted(task.result())
    runtime = int((datetime.now() - starttime).total_seconds())
    render_sitemap(base_url=baseurl, crawled_urls=results, runtime=runtime)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Recursive web crawler')
    parser.add_argument("-u", "--url", required=True, help="Initial URL to crawl")
    parser.add_argument("-c", "--concurrency", required=False, type=int, default=100,
                        help="Max number of pages to crawl concurrently")
    args = parser.parse_args()
    main()
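
# Example invocation (a sketch only: the script filename, site URL and
# concurrency value below are assumptions, not taken from the project):
#
#     python3 crawl.py --url https://example.com --concurrency 50
#
# This would crawl example.com with at most 50 pages in flight at once and
# write sitemap.html to the current working directory.
#
# The helpers imported from utils.helpers are expected to provide, at minimum
# (interface inferred from the calls above; the real implementations live in
# utils/helpers.py and may differ):
#
#     standardise_url(url)                        -> normalised base URL string
#     RobotsTxt(base_url).check(url)              -> True if `url` may be crawled
#     AsyncCrawler(baseurl, robots, concurrency)  -> crawler whose main()
#                                                    coroutine returns the
#                                                    collection of crawled URLs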