async crawler in a mostly-working state

2018-09-14 16:01:12 +01:00
parent 36e1f7693f
commit db986b0eba
2 changed files with 114 additions and 53 deletions

View File

@@ -1,3 +1,4 @@
#!/usr/bin/env python
'''
Need a docstring.
@@ -6,10 +7,10 @@ Need a docstring.
import argparse
import jinja2
import os
import sys
import asyncio
from datetime import datetime
# from utils.helpers import (UrlPool, WebPage, RobotsTxt, sanitise_url)
from utils.helpers import RobotsTxt, AsyncCrawler, sanitise_url
from utils.helpers import RobotsTxt, AsyncCrawler, standardise_url
def init_crawler(url=None):
@@ -17,10 +18,14 @@ def init_crawler(url=None):
docstring
'''
# ensure we have a sensible URL to work with
baseurl = sanitise_url(url=url, base_url=True)
baseurl = standardise_url(url=url, base_url=url)
# get robots.txt
robots = RobotsTxt(base_url=baseurl)
# fail early if robots denies all crawling
if not robots.check(url=baseurl):
sys.exit("{baseurl} cannot be crawled (denied by robots.txt)".format(baseurl=baseurl))
return(baseurl, robots)
@@ -44,7 +49,7 @@ def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
print('Sitemap available at {0}/sitemap.html'.format(os.getcwd()))
def main(args=None):
def main():
'''
docstring
'''
@@ -54,21 +59,25 @@ def main(args=None):
# create a crawler
async_crawler = AsyncCrawler(baseurl=baseurl, robots=robots, concurrency=args.concurrency)
# async_crawler.run()
# run the crawler
crawler = asyncio.Task(async_crawler.run())
task = asyncio.Task(async_crawler.run_loop())
loop = asyncio.get_event_loop()
loop.run_until_complete(crawler)
loop.run_until_complete(task)
loop.close()
result = crawler.result()
print(len(result))
results = task.result()
print(results)
print(len(results))
runtime = int((datetime.now() - starttime).total_seconds())
print(runtime)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Recursive web crawler')
parser.add_argument("-u", "--url", required=True, help="Base url to crawl")
parser.add_argument("-s", "--concurrency", required=False, type=int, default=50, help="Max number of pages to crawl concurrently")
parser.add_argument("-c", "--concurrency", required=False, type=int,
default=50, help="Max number of pages to crawl concurrently")
args = parser.parse_args()
main(args)
main()
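
As a standalone sketch of the Task / run_until_complete wiring the new main() uses, mirroring the loop handling above (the crawl() coroutine below is only a stand-in for AsyncCrawler.run_loop(), not the project's class):

import asyncio

async def crawl():
    # stand-in for AsyncCrawler.run_loop(): pretend we crawled a single page
    await asyncio.sleep(0)
    return {'http://example.com'}

loop = asyncio.get_event_loop()
task = loop.create_task(crawl())          # schedule the coroutine as a Task
results = loop.run_until_complete(task)   # same value task.result() gives once done
loop.close()
print(len(results))
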

View File

@@ -26,65 +26,113 @@ class AsyncCrawler(object):
self.robots = robots
self.uncrawled = set()
self.crawled = set()
self.session = aiohttp.ClientSession()
# self.headers = {'Accept-Encoding': 'gzip, deflate',
# 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
self.client_session = None
self.semaphore = asyncio.BoundedSemaphore(concurrency)
# add the base URL to be crawled
self.uncrawled.add(baseurl)
self.headers = {'Accept-Encoding': 'gzip, deflate',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
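
A design note on the constructor change: the session is now created lazily (client_session = None here, with the aiohttp.ClientSession built inside run_loop()), presumably both to attach the headers and because aiohttp recommends creating sessions while an event loop is running. A minimal sketch of that lazy-creation pattern, using an illustrative Fetcher class rather than the project's AsyncCrawler:

import asyncio
import aiohttp

class Fetcher:                              # illustrative, not the project's class
    def __init__(self):
        self.session = None                 # created lazily, once a loop is running

    async def run(self):
        self.session = aiohttp.ClientSession()
        try:
            async with self.session.get('http://example.com') as resp:
                return resp.status
        finally:
            await self.session.close()      # always release the connector

loop = asyncio.get_event_loop()
print(loop.run_until_complete(Fetcher().run()))
loop.close()
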
def validate_url(self, url):
async def crawl_url(self, url=None):
'''
Checks if the discovered URL is local to the base URL.
docstring
'''
urls = set()
async with self.semaphore:
source = await self.get_source(url)
if source:
self.crawled.add(url)
# for new_url in self.find_all_urls(source):
# urls.add(new_url)
urls_to_crawl = self.find_all_urls(source)
# print('discovered {0} new URLs'.format(len(urls_to_crawl)))
for new_url in urls_to_crawl:
urls.add(new_url)
# add the url we just crawled to the crawled pool.
return urls
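
The BoundedSemaphore acquired in crawl_url() only limits concurrency when several of these coroutines are scheduled at once; a minimal standalone sketch of that pattern (fetch() and run_all() are illustrative names, and sleep stands in for the HTTP request):

import asyncio

async def fetch(sem, url):
    async with sem:                   # at most `concurrency` of these run at a time
        await asyncio.sleep(0.1)      # stands in for the real request
        return url

async def run_all(urls, concurrency=3):
    sem = asyncio.BoundedSemaphore(concurrency)
    return await asyncio.gather(*(fetch(sem, url) for url in urls))

loop = asyncio.get_event_loop()
urls = ['http://example.com/{0}'.format(i) for i in range(10)]
print(loop.run_until_complete(run_all(urls)))
loop.close()
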
def validate_url(self, url=None):
'''
Ensures we have a valid URL to crawl and that the site's robots.txt
allows it.
'''
# ensure the URL is in a sane format
url = sanitise_url(url=url)
url = standardise_url(url=url, base_url=self.baseurl)
if url.startswith(self.baseurl) and robots.check(url=url):
if url and self.robots.check(url=url):
# print('validated url: {0}'.format(url))
return url
else:
return False
def get_source(self, url):
async def get_source(self, url=None):
'''
Obtains the page's source.
'''
pass
return source
print('semaphore held for {0}'.format(url))
async with self.client_session.get(url, timeout=5) as resp:
try:
source = await resp.read()
return source
except Exception:
return None
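
Note that with the try block nested inside the async with, errors raised by client_session.get() itself (DNS failures, refused connections, a timeout before the response arrives) still propagate; only resp.read() is protected. A hedged sketch of a fetch helper that covers both, assuming aiohttp 3.x and an already-open session, and catching the specific aiohttp/asyncio errors rather than a bare Exception:

import asyncio
import aiohttp

async def get_source(session, url):
    # `session` is assumed to be an open aiohttp.ClientSession
    try:
        async with session.get(url, timeout=5) as resp:
            return await resp.read()
    except (aiohttp.ClientError, asyncio.TimeoutError):
        return None                   # caller treats None as "could not fetch"
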
def find_links(self, source):
def find_all_urls(self, source=None):
'''
Find all links in a page's source.
Find all URLs in a page's source.
'''
links = set()
urls = set()
html = BeautifulSoup(source, 'lxml')
hrefs = html.find_all('a', href=True)
# build a set of URLs which are valid and haven't been crawled yet
for href in hrefs:
url = self.validate_url(url=href)
if url:
links.add(url)
url = self.validate_url(url=href['href'])
if url and url not in self.crawled:
urls.add(url)
return links
return urls
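
For reference, a tiny standalone illustration of the href extraction find_all_urls() performs (href=True skips anchors without an href attribute; the markup is made up):

from bs4 import BeautifulSoup

source = '<a href="/about">About</a> <a name="top">no href</a>'
soup = BeautifulSoup(source, 'lxml')                          # parsed with lxml, as above
print([a['href'] for a in soup.find_all('a', href=True)])    # ['/about']
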
def run(self):
async def run_loop(self):
'''
function which runs the crawler
'''
pass
print('Crawling: {}'.format(self.baseurl))
self.client_session = aiohttp.ClientSession(headers=self.headers)
# provide the starting URL to the crawler
self.uncrawled.add(self.baseurl)
while len(self.uncrawled) > 0:
# print('################################ there are {0} uncrawled urls in the pool'.format(
# len(self.uncrawled)))
url = self.uncrawled.pop()
# print('################ url popped, there are now {0} uncrawled urls in the pool'.format(
# len(self.uncrawled)))
new_urls = await self.crawl_url(url=url)
for url in new_urls:
# print('adding: {0}'.format(url))
self.uncrawled.add(url)
await self.client_session.close()
return self.crawled
for url in self.uncrawled:
validated = validate_url(url=url)
if validated:
source = get_source(url=url)
links = find_links(source=source)
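
Stripped of the async machinery, run_loop() is a standard worklist traversal: pop a URL, crawl it, queue any newly discovered URLs that have not been crawled yet. A minimal synchronous sketch (discover() stands in for crawl_url(), and the site dict is made-up data):

def crawl(start, discover):
    uncrawled, crawled = {start}, set()
    while uncrawled:
        url = uncrawled.pop()
        crawled.add(url)
        uncrawled |= discover(url) - crawled   # only queue URLs not yet crawled
    return crawled

site = {'/': {'/a', '/b'}, '/a': {'/'}, '/b': {'/a', '/c'}, '/c': set()}
print(crawl('/', lambda url: site.get(url, set())))   # all four pages reachable from '/'
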
@@ -183,8 +231,8 @@ class WebPage(object):
'''
for url in self.discovered_hrefs:
if url.startswith(self.base_url) and self.robots.check(url):
sanitised_url = sanitise_url(url=url)
self.urls_to_crawl.add(sanitised_url)
standardised_url = sanitise_url(url=url)
self.urls_to_crawl.add(standardised_url)
def list_urls(self):
@@ -251,31 +299,35 @@ class RobotsTxt(object):
return self.robots.can_fetch("*", url)
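
RobotsTxt.check() appears to delegate to the standard library's RobotFileParser, given the can_fetch call above; a minimal standalone sketch (the URL is illustrative):

from urllib.robotparser import RobotFileParser

robots = RobotFileParser('http://example.com/robots.txt')    # illustrative URL
robots.read()                                                # fetch and parse robots.txt
print(robots.can_fetch('*', 'http://example.com/private/'))
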
def sanitise_url(url, base_url=False):
def standardise_url(url=None, base_url=None):
'''
If `base_url` is True, we attempt to standardise `url` to ensure it can be
prepended to relative URLs. If no scheme has been provided then we default
If `base_url` is None then we attempt to standardise the URL to ensure it can
be prepended to relative URLs. If no scheme has been provided then we default
to http as any sane https-only site should 301 redirect http > https.
If `base_url` is False, we sanitise URLs to strip queries and fragments (we
don't want to scrape in-page anchors etc).
If `base_url` is set, we standardise URLs to strip queries and fragments (we
don't want to scrape in-page anchors etc). Any relative URLs will be appended
to the base url.
Returns a sanitised URL as a string.
Returns a standardised URL as a string.
'''
default_proto = 'http'
delim = '://'
split_url = urlsplit(url)
if base_url:
# This will sanitise the initial url for the initial page crawl.
if not base_url:
# This will sanitise the initial url provided by the user.
if split_url.scheme and split_url.scheme.startswith('http'):
sanitised_url = "".join([split_url.scheme, delim, split_url.netloc])
return "".join([split_url.scheme, delim, split_url.netloc])
elif (split_url.path and not split_url.scheme and not split_url.netloc):
sanitised_url = "".join([default_proto, delim, split_url.path])
return "".join([default_proto, delim, split_url.path])
else:
# Sanitise discovered URLs. We already expect them in the format
# protocol://base_url/path
sanitised_url = "".join([split_url.scheme, delim, split_url.netloc, split_url.path])
if url.startswith('/'):
return urljoin(base_url, split_url.path)
elif url.startswith(base_url):
return "".join([split_url.scheme, delim, split_url.netloc, split_url.path])
return sanitised_url
return None
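
Finally, a quick illustration of the urlsplit/urljoin behaviour standardise_url() builds on (urljoin and urlsplit come from urllib.parse; the URLs are illustrative):

from urllib.parse import urljoin, urlsplit

print(urlsplit('example.com'))                        # no scheme: the host ends up in .path
print(urljoin('http://example.com', '/about'))        # http://example.com/about
print(urlsplit('http://example.com/a?q=1#top')[:3])   # ('http', 'example.com', '/a')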