initial foray into asynchronous crawling

2018-09-12 22:54:12 +01:00
parent 8698c21fda
commit 36e1f7693f
2 changed files with 155 additions and 4 deletions

async_crawler.py (new file, 74 additions)

@@ -0,0 +1,74 @@
#!/usr/bin/env python
'''
Asynchronous web crawler that builds an HTML sitemap for a given base URL.
'''
import argparse
import asyncio
import os
from datetime import datetime

import jinja2

from utils.helpers import RobotsTxt, AsyncCrawler, sanitise_url


def init_crawler(url=None):
    '''
    Sanitises the supplied URL and fetches its robots.txt.
    '''
    # ensure we have a sensible URL to work with
    baseurl = sanitise_url(url=url, base_url=True)
    # get robots.txt
    robots = RobotsTxt(base_url=baseurl)
    return baseurl, robots


def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
    '''
    Renders the sitemap as an HTML file.
    '''
    urlcount = len(crawled_urls)
    sorted_urls = sorted(crawled_urls)
    tmpl = jinja2.Environment(
        loader=jinja2.FileSystemLoader('templates')
    ).get_template('sitemap.html.j2')
    rendered_html = tmpl.render(
        base_url=base_url, urlcount=urlcount, urls=sorted_urls, runtime=runtime)
    with open('sitemap.html', 'w') as outfile:
        outfile.write(rendered_html)
    print('Sitemap available at {0}/sitemap.html'.format(os.getcwd()))


def main(args=None):
    '''
    Builds the crawler and drives the event loop to completion.
    '''
    starttime = datetime.now()
    baseurl, robots = init_crawler(url=args.url)
    # create a crawler
    async_crawler = AsyncCrawler(
        baseurl=baseurl, robots=robots, concurrency=args.concurrency)
    # schedule the crawl and run it to completion
    loop = asyncio.get_event_loop()
    crawler = loop.create_task(async_crawler.run())
    loop.run_until_complete(crawler)
    loop.close()
    result = crawler.result()
    print(len(result))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Recursive web crawler')
    parser.add_argument("-u", "--url", required=True, help="Base URL to crawl")
    parser.add_argument("-s", "--concurrency", required=False, type=int, default=50,
                        help="Max number of pages to crawl concurrently")
    args = parser.parse_args()
    main(args)
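
A quick usage sketch for the script above (https://example.com is just a stand-in hostname):

    python async_crawler.py --url https://example.com --concurrency 20

This crawls up to 20 pages concurrently under the base URL and prints the number of URLs found once the crawl completes.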

utils/helpers.py

@@ -3,13 +3,90 @@
Utilities to provide various misc functions.
'''
import asyncio
import gzip
import urllib.error
import urllib.request
import urllib.robotparser
from urllib.parse import urljoin, urlsplit

import aiohttp
from bs4 import BeautifulSoup

class AsyncCrawler(object):
    '''
    Crawls pages under a base URL concurrently using aiohttp.
    '''
    def __init__(self, baseurl=None, robots=None, concurrency=None):
        self.baseurl = baseurl
        self.robots = robots
        self.uncrawled = set()
        self.crawled = set()
        self.session = aiohttp.ClientSession()
        self.semaphore = asyncio.BoundedSemaphore(concurrency)
        # add the base URL to be crawled
        self.uncrawled.add(baseurl)
        self.headers = {
            'Accept-Encoding': 'gzip, deflate',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}

    def validate_url(self, url):
        '''
        Checks if the discovered URL is local to the base URL and allowed by robots.txt.
        '''
        # ensure the URL is in a sane format
        url = sanitise_url(url=url)
        if url.startswith(self.baseurl) and self.robots.check(url=url):
            return url
        return False

    async def get_source(self, url):
        '''
        Obtains the page's source, with the semaphore capping concurrent requests.
        '''
        async with self.semaphore:
            async with self.session.get(url, headers=self.headers) as response:
                source = await response.text()
        return source

    def find_links(self, source):
        '''
        Find all links in a page's source.
        '''
        links = set()
        html = BeautifulSoup(source, 'lxml')
        hrefs = html.find_all('a', href=True)
        for href in hrefs:
            # pass the href attribute value rather than the whole tag
            url = self.validate_url(url=href['href'])
            if url:
                links.add(url)
        return links

    async def run(self):
        '''
        Runs the crawler until no uncrawled URLs remain and returns the crawled set.
        '''
        while self.uncrawled:
            url = self.uncrawled.pop()
            self.crawled.add(url)
            validated = self.validate_url(url=url)
            if validated:
                source = await self.get_source(url=validated)
                links = self.find_links(source=source)
                # queue newly discovered links that haven't been crawled yet
                self.uncrawled.update(links - self.crawled)
        await self.session.close()
        return self.crawled
class UrlPool(object):
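
A small standalone check of the link-extraction approach used in find_links, assuming lxml is installed for the parser named above:

    from bs4 import BeautifulSoup

    sample = '<a href="/about">About</a> <a name="top">no href</a>'
    soup = BeautifulSoup(sample, 'lxml')
    print([a['href'] for a in soup.find_all('a', href=True)])  # prints ['/about']

Only anchors that actually carry an href attribute are returned, which is why find_links can index href['href'] safely.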