#!/usr/bin/env python
'''
Recursive web crawler.

Starting from a base URL, crawl every same-site link reachable from it and
print the full set of URLs visited.
'''
import argparse
from pprint import pprint

from utils.helpers import UrlPool, WebPage, sanitise_url


def init_crawler(base_url=None):
    '''
    Crawl the base URL and seed the URL pools.

    Returns a tuple of (uncrawled_urls, crawled_urls) UrlPool objects: the
    crawled pool holds the base URL, and the uncrawled pool holds the
    sanitised links found on the base page.
    '''
    uncrawled_urls, crawled_urls = UrlPool(), UrlPool()
    initial_page = WebPage(url=base_url, base_url=base_url)
    try:
        initial_page.run()
    except Exception as e:
        print('Failed to fetch {0}: {1}'.format(base_url, e))
    initial_urls = initial_page.list_urls()
    # Ensure the base URL isn't crawled again
    try:
        initial_urls.remove(base_url)
    except KeyError:
        pass
    # Add the base URL to the crawled pool
    crawled_urls.add_to_pool(base_url)
    for url in initial_urls:
        sanitised_url = sanitise_url(url=url)
        if sanitised_url not in crawled_urls.pool:
            uncrawled_urls.add_to_pool(sanitised_url)
    return uncrawled_urls, crawled_urls


def process_pool(base_url=None, uncrawled_urls=None, crawled_urls=None):
    '''
    Crawl every URL in the uncrawled pool.

    Newly discovered links are sanitised and added to the uncrawled pool,
    so the loop runs until no unvisited URLs remain.
    '''
    while uncrawled_urls.pool:
        # Pop a URL from the pool and fetch it
        new_url = uncrawled_urls.remove_from_pool()
        current_page = WebPage(url=new_url, base_url=base_url)
        try:
            current_page.run()
            _urls = current_page.list_urls()
            crawled_urls.add_to_pool(new_url)
        except Exception as e:
            # Skip pages that fail to load; without this, _urls would be
            # unbound (or stale) in the loop below.
            print('Failed to fetch {0}: {1}'.format(new_url, e))
            continue
        for url in _urls:
            sanitised_url = sanitise_url(url=url)
            if sanitised_url not in crawled_urls.pool:
                uncrawled_urls.add_to_pool(sanitised_url)


def run(args=None):
    '''
    Entry point: sanitise the base URL, crawl the site and report results.
    '''
    base_url = sanitise_url(args.url, base_url=True)
    uncrawled_urls, crawled_urls = init_crawler(base_url)
    process_pool(base_url, uncrawled_urls, crawled_urls)
    pprint(crawled_urls.pool)
    print('{0} URLs crawled'.format(len(crawled_urls.pool)))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Recursive web crawler')
    parser.add_argument("-u", "--url", required=True, help="Base URL to crawl")
    args = parser.parse_args()
    run(args)
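
# ---------------------------------------------------------------------------
# Assumed interface of utils/helpers.py
#
# The imports at the top come from a utils.helpers module that is not part
# of this file. The sketch below is inferred purely from how the names are
# used in this script; the attribute and method names (.pool, add_to_pool,
# remove_from_pool, run, list_urls) reflect that usage, not the real
# implementation.
#
#   class UrlPool:
#       pool                      # container of URLs (set-like)
#       add_to_pool(url)          # add a URL to the pool
#       remove_from_pool()        # pop and return an arbitrary URL
#
#   class WebPage:
#       __init__(url, base_url)   # page to fetch, plus the site's base URL
#       run()                     # fetch the page and extract its links
#       list_urls()               # return the links found by run()
#
#   sanitise_url(url, base_url=False)   # normalise a URL; called with
#                                       # base_url=True to normalise the
#                                       # site's base URL
# ---------------------------------------------------------------------------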