From 36e1f7693f5f1a4ec565086f20f1e78c536481cd Mon Sep 17 00:00:00 2001
From: Simon Weald
Date: Wed, 12 Sep 2018 22:54:12 +0100
Subject: [PATCH] initial foray into asynchronous crawling

---
 async_crawler.py | 74 +++++++++++++++++++++++++++++++++++++++++
 utils/helpers.py | 85 +++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 155 insertions(+), 4 deletions(-)
 create mode 100644 async_crawler.py

diff --git a/async_crawler.py b/async_crawler.py
new file mode 100644
index 0000000..5b48b3e
--- /dev/null
+++ b/async_crawler.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python
+'''
+Asynchronous recursive web crawler which maps all pages under a base URL.
+'''
+
+import argparse
+import jinja2
+import os
+import asyncio
+from datetime import datetime
+from utils.helpers import RobotsTxt, AsyncCrawler, sanitise_url
+
+
+def init_crawler(url=None):
+    '''
+    Sanitises the base URL and fetches its robots.txt.
+    '''
+    # ensure we have a sensible URL to work with
+    baseurl = sanitise_url(url=url, base_url=True)
+    # get robots.txt
+    robots = RobotsTxt(base_url=baseurl)
+
+    return baseurl, robots
+
+
+def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
+    '''
+    Renders the sitemap as an HTML file.
+    '''
+    urlcount = len(crawled_urls)
+    sorted_urls = sorted(crawled_urls)
+
+    tmpl = jinja2.Environment(
+        loader=jinja2.FileSystemLoader('templates')
+    ).get_template('sitemap.html.j2')
+
+    rendered_html = tmpl.render(
+        base_url=base_url, urlcount=urlcount, urls=sorted_urls, runtime=runtime)
+
+    with open('sitemap.html', 'w') as outfile:
+        outfile.write(rendered_html)
+
+    print('Sitemap available at {0}/sitemap.html'.format(os.getcwd()))
+
+
+def main(args=None):
+    '''
+    Sets up the crawler, runs it and reports how many pages were found.
+    '''
+    starttime = datetime.now()
+
+    baseurl, robots = init_crawler(url=args.url)
+
+    # create a crawler and run it to completion on the event loop
+    async_crawler = AsyncCrawler(baseurl=baseurl, robots=robots,
+                                 concurrency=args.concurrency)
+
+    loop = asyncio.get_event_loop()
+    crawler = loop.create_task(async_crawler.run())
+    loop.run_until_complete(crawler)
+    loop.close()
+    result = crawler.result()
+    print(len(result))
+
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser(description='Recursive web crawler')
+    parser.add_argument("-u", "--url", required=True, help="Base url to crawl")
+    parser.add_argument("-s", "--concurrency", required=False, type=int, default=50,
+                        help="Max number of pages to crawl concurrently")
+    args = parser.parse_args()
+
+    main(args)
diff --git a/utils/helpers.py b/utils/helpers.py
index 8da18f0..05cc85c 100644
--- a/utils/helpers.py
+++ b/utils/helpers.py
@@ -3,13 +3,90 @@ Utilities to provide various misc functions.
 '''
-from bs4 import BeautifulSoup
-import aiohttp
 import urllib.request
-import urllib.robotparser
 import urllib.error
 import gzip
-from urllib.parse import (urljoin, urlsplit)
+
+
+
+import aiohttp
+import asyncio
+from bs4 import BeautifulSoup
+from urllib.parse import urljoin, urlsplit
+import urllib.robotparser
+
+
+class AsyncCrawler(object):
+    '''
+    Asynchronously crawls all local pages under a base URL, honouring robots.txt.
+    '''
+
+    def __init__(self, baseurl=None, robots=None, concurrency=None):
+        self.baseurl = baseurl
+        self.robots = robots
+        self.uncrawled = set()
+        self.crawled = set()
+        self.session = aiohttp.ClientSession()
+        self.semaphore = asyncio.BoundedSemaphore(concurrency)
+        # add the base URL to be crawled
+        self.uncrawled.add(baseurl)
+        self.headers = {'Accept-Encoding': 'gzip, deflate',
+                        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
+
+
+    def validate_url(self, url):
+        '''
+        Checks if the discovered URL is local to the base URL.
+        '''
+        # ensure the URL is in a sane format
+        url = sanitise_url(url=url)
+
+        if url.startswith(self.baseurl) and self.robots.check(url=url):
+            return url
+        else:
+            return False
+
+
+    async def get_source(self, url):
+        '''
+        Obtains the page's source, with the semaphore capping concurrent requests.
+        '''
+        async with self.semaphore:
+            async with self.session.get(url, headers=self.headers) as response:
+                return await response.text()
+
+
+    def find_links(self, source):
+        '''
+        Find all links in a page's source.
+        '''
+        links = set()
+
+        html = BeautifulSoup(source, 'lxml')
+        hrefs = html.find_all('a', href=True)
+
+        for href in hrefs:
+            url = self.validate_url(url=urljoin(self.baseurl, href['href']))
+            if url:
+                links.add(url)
+
+        return links
+
+
+    async def run(self):
+        '''
+        Crawls outstanding URLs in concurrent batches until none remain.
+        '''
+        while self.uncrawled:
+            batch, self.uncrawled = self.uncrawled, set()
+            self.crawled.update(batch)
+            sources = await asyncio.gather(*(self.get_source(url=url) for url in batch))
+            for source in sources:
+                self.uncrawled.update(self.find_links(source=source) - self.crawled)
+        await self.session.close()
+        return self.crawled
+
+
 class UrlPool(object):