From 5e0d9fd56880f52ccb2f46cb2ac0ffe2c6b7ca88 Mon Sep 17 00:00:00 2001
From: Simon Weald
Date: Fri, 31 Aug 2018 18:26:49 +0100
Subject: [PATCH] initial commit of crawler skeleton

---
Review notes (this section is ignored when the patch is applied):
* crawler.py is a skeleton. init_crawler() contains only a placeholder
  docstring, and run() passes args.url through sanitise_url() and prints
  the result.
* run() declares args=None but reads args.url unconditionally, so calling
  run() without an argument raises AttributeError. The __main__ guard
  always passes the parsed argparse namespace, so the script entry point
  is unaffected.
* UrlPool, WebPage and qualify_url are imported from utils.helpers but
  are not used anywhere in this commit.
* The module and function docstrings are placeholders and still need to
  be written.

 crawler.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 crawler.py

diff --git a/crawler.py b/crawler.py
new file mode 100644
index 0000000..8d3839c
--- /dev/null
+++ b/crawler.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python
+'''
+Need a docstring.
+'''
+
+import argparse
+from utils.helpers import (UrlPool, WebPage, sanitise_url, qualify_url)
+
+def init_crawler(base_url=None):
+    '''
+    needs a docstring
+    '''
+
+
+def run(args=None):
+    '''
+    needs a docstring.
+    '''
+    base_url = sanitise_url(args.url)
+    print(base_url)
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser(description='Recursive web crawler')
+    parser.add_argument("-u", "--url", required=True, help="Base url to crawl")
+    args = parser.parse_args()
+
+    run(args)