#!/usr/bin/env python
'''Recursive web crawler entry point.

Parses a base URL from the command line, fetches that page, and seeds
the pool of uncrawled URLs with the links it contains.
'''

import argparse

from utils.helpers import UrlPool, WebPage, sanitise_url


def init_crawler(base_url=None):
    '''Fetch ``base_url`` and seed the pool of uncrawled URLs with the
    links found on that page; return the (uncrawled, crawled) pools.
    '''
    uncrawled_urls, crawled_urls = UrlPool(), UrlPool()
    initial_page = WebPage(base_url)

    try:
        initial_urls = initial_page.run()
    except Exception as e:
        # Bail out early: initial_urls is never bound when the fetch
        # fails, so the loop below would otherwise raise a NameError.
        print(e)
        return uncrawled_urls, crawled_urls

    for url in initial_urls:
        try:
            uncrawled_urls.add_to_pool(url)
        except Exception as e:
            print(e)

    print(uncrawled_urls.url_pool)
    return uncrawled_urls, crawled_urls


def run(args=None):
    '''Sanitise the URL supplied on the command line and start the crawl.'''
    base_url = sanitise_url(args.url)
    init_crawler(base_url)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Recursive web crawler')
    parser.add_argument("-u", "--url", required=True, help="Base url to crawl")
    args = parser.parse_args()
    run(args)
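
# ---------------------------------------------------------------------------
# utils/helpers.py is not shown above. A minimal sketch of the interface this
# script relies on (UrlPool with .url_pool and add_to_pool(), WebPage with
# run(), and sanitise_url()) might look like the following; every detail
# beyond those observed names and call sites is an assumption. With such a
# module on the path, the script runs as, e.g.:
#     python crawler.py -u https://example.com    (filename assumed)

# utils/helpers.py (hypothetical sketch)
import re
from urllib.parse import urlparse
from urllib.request import urlopen


def sanitise_url(url):
    '''Assumed behaviour: trim whitespace and default to an http scheme.'''
    url = url.strip()
    if not urlparse(url).scheme:
        url = 'http://' + url
    return url


class UrlPool:
    '''A set of URLs; add_to_pool raises on duplicates, which would explain
    the try/except around it in init_crawler.'''

    def __init__(self):
        self.url_pool = set()

    def add_to_pool(self, url):
        if url in self.url_pool:
            raise ValueError('already pooled: %s' % url)
        self.url_pool.add(url)


class WebPage:
    '''Fetches self.url; run() returns the href targets found in the HTML.'''

    def __init__(self, url):
        self.url = url

    def run(self):
        html = urlopen(self.url).read().decode('utf-8', errors='replace')
        return re.findall(r'href=[\'"]?([^\'" >]+)', html)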