#!/usr/bin/env python
'''
Recursive web crawler: crawls all pages under a base URL and renders the
discovered URLs as an HTML sitemap.
'''
import argparse
from datetime import datetime

import jinja2

from utils.helpers import (UrlPool, WebPage, RobotsTxt, sanitise_url)


def init_crawler(base_url=None, robots=None):
    '''
    Initialises the crawler by crawling the base URL and seeding the
    uncrawled pool with the URLs found there.
    '''
    uncrawled_urls, crawled_urls = UrlPool(), UrlPool()
    initial_page = WebPage(url=base_url, base_url=base_url, robots=robots)
    try:
        initial_page.run()
    except Exception as e:
        print(e)
    initial_urls = initial_page.list_urls()
    # Ensure the base URL isn't crawled again
    try:
        initial_urls.remove(base_url)
    except KeyError:
        pass
    # Also ensure the base URL wasn't discovered with a trailing slash on
    # the initial page scrape
    try:
        initial_urls.remove("".join([base_url, '/']))
    except KeyError:
        pass
    # Add the base URL to the crawled pool
    crawled_urls.add_to_pool(base_url)
    for url in initial_urls:
        sanitised_url = sanitise_url(url=url)
        if sanitised_url not in crawled_urls.pool:
            uncrawled_urls.add_to_pool(sanitised_url)
    return uncrawled_urls, crawled_urls


def process_pool(base_url=None, uncrawled_urls=None, crawled_urls=None,
                 robots=None):
    '''
    Iterates over the pool of uncrawled URLs, crawling each one and adding
    any newly discovered URLs back into the pool.
    '''
    while uncrawled_urls.pool:
        # Pop a URL from the pool
        new_url = uncrawled_urls.remove_from_pool()
        # Create a WebPage object for the URL and crawl it
        current_page = WebPage(url=new_url, base_url=base_url, robots=robots)
        # Default to an empty list so a failed crawl doesn't leave _urls
        # undefined below
        _urls = []
        try:
            current_page.run()
            _urls = current_page.list_urls()
            crawled_urls.add_to_pool(new_url)
        except Exception as e:
            print(e)
        for url in _urls:
            sanitised_url = sanitise_url(url=url)
            if sanitised_url not in crawled_urls.pool:
                uncrawled_urls.add_to_pool(sanitised_url)
        print('{0} URLs crawled, {1} remaining'.format(
            len(crawled_urls.pool), len(uncrawled_urls.pool)))


def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
    '''
    Renders the sitemap as an HTML file.
    '''
    urlcount = len(crawled_urls)
    sorted_urls = sorted(crawled_urls)
    tmpl = jinja2.Environment(
        loader=jinja2.FileSystemLoader('templates')
    ).get_template('sitemap.html.j2')
    rendered_html = tmpl.render(base_url=base_url, urlcount=urlcount,
                                urls=sorted_urls, runtime=runtime)
    with open('sitemap.html', 'w') as outfile:
        outfile.write(rendered_html)
    print('Sitemap available at sitemap.html')


def run(args=None):
    '''
    Crawls the site rooted at args.url and renders an HTML sitemap of the
    crawled URLs, reporting the total runtime in seconds.
    '''
    starttime = datetime.now()
    base_url = sanitise_url(args.url, base_url=True)
    robots = RobotsTxt(base_url=base_url)
    uncrawled_urls, crawled_urls = init_crawler(base_url, robots)
    process_pool(base_url, uncrawled_urls, crawled_urls, robots)
    runtime = int((datetime.now() - starttime).total_seconds())
    render_sitemap(base_url=base_url, crawled_urls=crawled_urls.pool,
                   runtime=runtime)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Recursive web crawler')
    parser.add_argument("-u", "--url", required=True,
                        help="Base URL to crawl")
    args = parser.parse_args()
    run(args)
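
# Example invocation (the script filename and target URL below are
# illustrative assumptions; the only required argument is -u/--url):
#
#   python crawler.py --url https://example.com
#
# The rendered sitemap is written to sitemap.html in the working directory.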