async crawler in a mostly-working state
@@ -1,3 +1,4 @@
 #!/usr/bin/env python
 '''
 Need a docstring.
@@ -6,10 +7,10 @@ Need a docstring.
 import argparse
 import jinja2
 import os
 import sys
+import asyncio
 from datetime import datetime
 # from utils.helpers import (UrlPool, WebPage, RobotsTxt, sanitise_url)
-from utils.helpers import RobotsTxt, AsyncCrawler, sanitise_url
+from utils.helpers import RobotsTxt, AsyncCrawler, standardise_url


 def init_crawler(url=None):
@@ -17,10 +18,14 @@ def init_crawler(url=None):
     docstring
     '''
     # ensure we have a sensible URL to work with
-    baseurl = sanitise_url(url=url, base_url=True)
+    baseurl = standardise_url(url=url, base_url=url)
     # get robots.txt
     robots = RobotsTxt(base_url=baseurl)

+    # fail early if robots denies all crawling
+    if not robots.check(url=baseurl):
+        sys.exit("{baseurl} cannot be crawled (denied by robots.txt)".format(baseurl=baseurl))
+
     return(baseurl, robots)


@@ -44,7 +49,7 @@ def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
     print('Sitemap available at {0}/sitemap.html'.format(os.getcwd()))


-def main(args=None):
+def main():
     '''
     docstring
     '''
@@ -54,21 +59,25 @@ def main(args=None):

     # create a crawler
     async_crawler = AsyncCrawler(baseurl=baseurl, robots=robots, concurrency=args.concurrency)
-    # async_crawler.run()
+    # run the crawler

-    crawler = asyncio.Task(async_crawler.run())
+    task = asyncio.Task(async_crawler.run_loop())
     loop = asyncio.get_event_loop()
-    loop.run_until_complete(crawler)
+    loop.run_until_complete(task)
     loop.close()
-    result = crawler.result()
-    print(len(result))
+    results = task.result()
+    print(results)
+    print(len(results))
+    runtime = int((datetime.now() - starttime).total_seconds())
+    print(runtime)


 if __name__ == '__main__':

     parser = argparse.ArgumentParser(description='Recursive web crawler')
     parser.add_argument("-u", "--url", required=True, help="Base url to crawl")
-    parser.add_argument("-s", "--concurrency", required=False, type=int, default=50, help="Max number of pages to crawl concurrently")
+    parser.add_argument("-c", "--concurrency", required=False, type=int,
+                        default=50, help="Max number of pages to crawl concurrently")
     args = parser.parse_args()

-    main(args)
+    main()
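For reference, the event-loop handling in the new main() follows the pre-asyncio.run() pattern: wrap the coroutine in a Task, drive it to completion on the default loop, then read the Task's result. A minimal sketch of that pattern, using a stand-in coroutine rather than the real AsyncCrawler.run_loop():

import asyncio


async def run_loop():
    # stand-in for AsyncCrawler.run_loop(): pretend two pages were crawled
    await asyncio.sleep(0)
    return {'http://example.com/', 'http://example.com/about'}


task = asyncio.Task(run_loop())   # schedule the coroutine on the default loop
loop = asyncio.get_event_loop()
loop.run_until_complete(task)     # block until run_loop() finishes
loop.close()
print(len(task.result()))         # the set returned by run_loop()

On Python 3.7+ the same thing is usually written as results = asyncio.run(run_loop()).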
utils/helpers.py
@@ -26,65 +26,113 @@ class AsyncCrawler(object):
         self.robots = robots
         self.uncrawled = set()
         self.crawled = set()
-        self.session = aiohttp.ClientSession()
-        # self.headers = {'Accept-Encoding': 'gzip, deflate',
-        #                 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
-        self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
+        self.client_session = None
+        self.semaphore = asyncio.BoundedSemaphore(concurrency)
-        # add the base URL to be crawled
-        self.uncrawled.add(baseurl)
+        self.headers = {'Accept-Encoding': 'gzip, deflate',
+                        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}


-    def validate_url(self, url):
+    async def crawl_url(self, url=None):
         '''
-        Checks if the discovered URL is local to the base URL.
+        docstring
         '''
+        urls = set()
+        async with self.semaphore:
+            source = await self.get_source(url)
+            if source:
+                self.crawled.add(url)
+                # for new_url in self.find_all_urls(source):
+                #     urls.add(new_url)
+                urls_to_crawl = self.find_all_urls(source)
+                # print('discovered {0} new URLs'.format(len(urls_to_crawl)))
+                for new_url in urls_to_crawl:
+                    urls.add(new_url)
+                # add the url we just crawled to the crawled pool.
+
+        return urls
+
+
+    def validate_url(self, url=None):
+        '''
+        Ensures we have a valid URL to crawl and that the site's robots.txt
+        allows it.
+        '''
         # ensure the URL is in a sane format
-        url = sanitise_url(url=url)
+        url = standardise_url(url=url, base_url=self.baseurl)

-        if url.startswith(self.baseurl) and robots.check(url=url):
+        if url and self.robots.check(url=url):
+            # print('validated url: {0}'.format(url))
             return url
         else:
             return False


-    def get_source(self, url):
+    async def get_source(self, url=None):
         '''
         Obtains the page's source.
         '''
-        pass
+        print('semaphore held for {0}'.format(url))
+        async with self.client_session.get(url, timeout=5) as resp:
+            try:
+                source = await resp.read()
+                return source
+            except Exception:
+                return None


-    def find_links(self, source):
+    def find_all_urls(self, source=None):
         '''
-        Find all links in a page's source.
+        Find all URLs in a page's source.
         '''
-        links = set()
+        urls = set()

         html = BeautifulSoup(source, 'lxml')
         hrefs = html.find_all('a', href=True)

+        # build a set of URLs which are valid and haven't been crawled yet
         for href in hrefs:
-            url = self.validate_url(url=href)
-            if url:
-                links.add(url)
+            url = self.validate_url(url=href['href'])
+            if url and url not in self.crawled:
+                urls.add(url)

-        return links
+        return urls


-    def run(self):
+    async def run_loop(self):
         '''
         function which runs the crawler
         '''
-        pass
+        print('Crawling: {}'.format(self.baseurl))
+        self.client_session = aiohttp.ClientSession(headers=self.headers)
+        # provide the starting URL to the crawler
+        self.uncrawled.add(self.baseurl)
+
+        while len(self.uncrawled) > 0:
+            # print('################################ there are {0} uncrawled urls in the pool'.format(
+            #     len(self.uncrawled)))
+            url = self.uncrawled.pop()
+            # print('################ url popped, there are now {0} uncrawled urls in the pool'.format(
+            #     len(self.uncrawled)))
+            new_urls = await self.crawl_url(url=url)
+            for url in new_urls:
+                # print('adding: {0}'.format(url))
+                self.uncrawled.add(url)
+
+        await self.client_session.close()
+        return self.crawled
-
-        for url in self.uncrawled:
-            validated = validate_url(url=url)
-
-            if validated:
-                source = get_source(url=url)
-                links = find_links(source=source)
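The concurrency model here pairs a single shared aiohttp.ClientSession with an asyncio.BoundedSemaphore: crawl_url() acquires the semaphore before get_source() issues its request, so at most `concurrency` fetches are in flight at any time. The same pattern in a self-contained sketch (not code from this commit; URLs and function names are placeholders):

import asyncio

import aiohttp


async def fetch(session, semaphore, url):
    # only `concurrency` coroutines can hold the semaphore at once,
    # which caps the number of simultaneous HTTP requests
    async with semaphore:
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=5)) as resp:
            return await resp.read()


async def fetch_all(urls, concurrency=50):
    semaphore = asyncio.BoundedSemaphore(concurrency)
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch(session, semaphore, u) for u in urls))


# e.g. asyncio.run(fetch_all(['http://example.com/']))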
@@ -183,8 +231,8 @@ class WebPage(object):
         '''
         for url in self.discovered_hrefs:
             if url.startswith(self.base_url) and self.robots.check(url):
-                sanitised_url = sanitise_url(url=url)
-                self.urls_to_crawl.add(sanitised_url)
+                standardised_url = sanitise_url(url=url)
+                self.urls_to_crawl.add(standardised_url)


     def list_urls(self):
@@ -251,31 +299,35 @@ class RobotsTxt(object):
         return self.robots.can_fetch("*", url)


-def sanitise_url(url, base_url=False):
+def standardise_url(url=None, base_url=None):
     '''
-    If `base_url` is True, we attempt to standardise `url` to ensure it can be
-    prepended to relative URLs. If no scheme has been provided then we default
+    If `base_url` is None then we attempt to standardise the URL to ensure it can
+    be prepended to relative URLs. If no scheme has been provided then we default
     to http as any sane https-only site should 301 redirect http > https.

-    If `base_url` is False, we sanitise URLs to strip queries and fragments (we
-    don't want to scrape in-page anchors etc).
+    If `base_url` is set, we standardise URLs to strip queries and fragments (we
+    don't want to scrape in-page anchors etc). Any relative URLs will be appended
+    to the base url.

-    Returns a sanitised URL as a string.
+    Returns a standardised URL as a string.
     '''
     default_proto = 'http'
     delim = '://'

     split_url = urlsplit(url)

-    if base_url:
-        # This will sanitise the initial url for the initial page crawl.
+    if not base_url:
+        # This will sanitise the initial url provided by the user.
         if split_url.scheme and split_url.scheme.startswith('http'):
-            sanitised_url = "".join([split_url.scheme, delim, split_url.netloc])
+            return "".join([split_url.scheme, delim, split_url.netloc])
         elif (split_url.path and not split_url.scheme and not split_url.netloc):
-            sanitised_url = "".join([default_proto, delim, split_url.path])
+            return "".join([default_proto, delim, split_url.path])
     else:
-        # Sanitise discovered URLs. We already expect them in the format
-        # protocol://base_url/path
-        sanitised_url = "".join([split_url.scheme, delim, split_url.netloc, split_url.path])
+        if url.startswith('/'):
+            return urljoin(base_url, split_url.path)
+        elif url.startswith(base_url):
+            return "".join([split_url.scheme, delim, split_url.netloc, split_url.path])

-    return sanitised_url
+    return None
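Traced through the branches above, the new standardise_url() behaves as follows. This is an illustrative usage sketch only: example.com is a placeholder and the import assumes the repository root is on the path.

from utils.helpers import standardise_url

# no base_url: normalise the user-supplied starting URL, defaulting to http
standardise_url(url='example.com')                      # -> 'http://example.com'
standardise_url(url='https://example.com/some/path')    # -> 'https://example.com'

# with base_url: resolve relative links and strip queries and fragments
base = 'http://example.com'
standardise_url(url='/about', base_url=base)                           # -> 'http://example.com/about'
standardise_url(url='http://example.com/page?q=1#top', base_url=base)  # -> 'http://example.com/page'

# anything that is neither relative nor under the base URL is dropped
standardise_url(url='http://other.example.org/', base_url=base)        # -> None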