async crawler in a mostly-working state
@@ -1,3 +1,4 @@
+
 #!/usr/bin/env python
 '''
 Need a docstring.
@@ -6,10 +7,10 @@ Need a docstring.
 import argparse
 import jinja2
 import os
+import sys
 import asyncio
 from datetime import datetime
-# from utils.helpers import (UrlPool, WebPage, RobotsTxt, sanitise_url)
-from utils.helpers import RobotsTxt, AsyncCrawler, sanitise_url
+from utils.helpers import RobotsTxt, AsyncCrawler, standardise_url


 def init_crawler(url=None):
@@ -17,10 +18,14 @@ def init_crawler(url=None):
     docstring
     '''
     # ensure we have a sensible URL to work with
-    baseurl = sanitise_url(url=url, base_url=True)
+    baseurl = standardise_url(url=url, base_url=url)
     # get robots.txt
     robots = RobotsTxt(base_url=baseurl)

+    # fail early if robots denies all crawling
+    if not robots.check(url=baseurl):
+        sys.exit("{baseurl} cannot be crawled (denied by robots.txt)".format(baseurl=baseurl))
+
     return(baseurl, robots)

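The robots.txt check above goes through the RobotsTxt helper, whose internals sit mostly outside this diff (later context shows it wrapping can_fetch("*", url)). A minimal sketch of what such a wrapper could look like, assuming it is built on the standard library's urllib.robotparser — an assumption, not something this commit confirms:

# Hypothetical sketch of a RobotsTxt-style wrapper around urllib.robotparser.
# The real helper in utils/helpers.py may differ; only check()/can_fetch()
# are visible in this diff.
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser


class RobotsTxtSketch(object):
    def __init__(self, base_url=None):
        self.robots = RobotFileParser()
        # robots.txt always lives at the site root
        self.robots.set_url(urljoin(base_url, '/robots.txt'))
        self.robots.read()

    def check(self, url=None):
        # True if the wildcard user agent is allowed to fetch this URL
        return self.robots.can_fetch("*", url)

With a wrapper of this shape, robots.check(url=baseurl) returning False lets init_crawler bail out before any crawling starts.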
@@ -44,7 +49,7 @@ def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
     print('Sitemap available at {0}/sitemap.html'.format(os.getcwd()))


-def main(args=None):
+def main():
     '''
     docstring
     '''
@@ -54,21 +59,25 @@ def main(args=None):

     # create a crawler
     async_crawler = AsyncCrawler(baseurl=baseurl, robots=robots, concurrency=args.concurrency)
-    # async_crawler.run()
+    # run the crawler

-    crawler = asyncio.Task(async_crawler.run())
+    task = asyncio.Task(async_crawler.run_loop())
     loop = asyncio.get_event_loop()
-    loop.run_until_complete(crawler)
+    loop.run_until_complete(task)
     loop.close()
-    result = crawler.result()
-    print(len(result))
+    results = task.result()
+    print(results)
+    print(len(results))
+    runtime = int((datetime.now() - starttime).total_seconds())
+    print(runtime)


 if __name__ == '__main__':

     parser = argparse.ArgumentParser(description='Recursive web crawler')
     parser.add_argument("-u", "--url", required=True, help="Base url to crawl")
-    parser.add_argument("-s", "--concurrency", required=False, type=int, default=50, help="Max number of pages to crawl concurrently")
+    parser.add_argument("-c", "--concurrency", required=False, type=int,
+                        default=50, help="Max number of pages to crawl concurrently")
     args = parser.parse_args()

-    main(args)
+    main()
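main() now drives the crawler with the pre-asyncio.run style of event-loop plumbing: wrap the coroutine in a Task, run the loop until it completes, then read the Task's result. In isolation, with a stand-in coroutine instead of AsyncCrawler.run_loop(), the pattern looks like this (a sketch, not code from the repo):

# Minimal sketch of the Task / run_until_complete pattern used in main().
# fake_crawl() stands in for AsyncCrawler.run_loop().
import asyncio


async def fake_crawl():
    await asyncio.sleep(0.1)            # pretend to do some crawling
    return {'http://example.com/'}      # pretend set of crawled URLs


task = asyncio.Task(fake_crawl())       # schedule the coroutine as a Task
loop = asyncio.get_event_loop()
loop.run_until_complete(task)           # block until the Task finishes
loop.close()
print(task.result())                    # the value returned by fake_crawl()

On Python 3.7+ the same thing can be written as results = asyncio.run(fake_crawl()), which creates and closes the loop for you.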
utils/helpers.py  (134 changed lines)
@@ -26,65 +26,113 @@ class AsyncCrawler(object):
         self.robots = robots
         self.uncrawled = set()
         self.crawled = set()
-        self.session = aiohttp.ClientSession()
+        # self.headers = {'Accept-Encoding': 'gzip, deflate',
+        #                 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
+        self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
+        self.client_session = None
         self.semaphore = asyncio.BoundedSemaphore(concurrency)
-        # add the base URL to be crawled
-        self.uncrawled.add(baseurl)
-        self.headers = {'Accept-Encoding': 'gzip, deflate',
-                        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}

-    def validate_url(self, url):
+    async def crawl_url(self, url=None):
         '''
-        Checks if the discovered URL is local to the base URL.
+        docstring
+        '''
+        urls = set()
+        async with self.semaphore:
+            source = await self.get_source(url)
+            if source:
+                self.crawled.add(url)
+                # for new_url in self.find_all_urls(source):
+                #     urls.add(new_url)
+                urls_to_crawl = self.find_all_urls(source)
+                # print('discovered {0} new URLs'.format(len(urls_to_crawl)))
+                for new_url in urls_to_crawl:
+                    urls.add(new_url)
+        # add the url we just crawled to the crawled pool.

+        return urls

+    def validate_url(self, url=None):
+        '''
+        Ensures we have a valid URL to crawl and that the site's robots.txt
+        allows it.
         '''
         # ensure the URL is in a sane format
-        url = sanitise_url(url=url)
+        url = standardise_url(url=url, base_url=self.baseurl)

-        if url.startswith(self.baseurl) and robots.check(url=url):
+        if url and self.robots.check(url=url):
+            # print('validated url: {0}'.format(url))
             return url
         else:
             return False

-    def get_source(self, url):
+    async def get_source(self, url=None):
         '''
         Obtains the page's source.
         '''
-        pass
+        print('semaphore held for {0}'.format(url))
+        async with self.client_session.get(url, timeout=5) as resp:
+            try:
+                source = await resp.read()
                 return source
+            except Exception:
+                return None

-    def find_links(self, source):
+    def find_all_urls(self, source=None):
         '''
-        Find all links in a page's source.
+        Find all URLs in a page's source.
         '''
-        links = set()
+        urls = set()

         html = BeautifulSoup(source, 'lxml')
         hrefs = html.find_all('a', href=True)

+        # build a set of URLs which are valid and haven't been crawled yet
         for href in hrefs:
-            url = self.validate_url(url=href)
-            if url:
-                links.add(url)
+            url = self.validate_url(url=href['href'])
+            if url and url not in self.crawled:
+                urls.add(url)

-        return links
+        return urls

-    def run(self):
+    async def run_loop(self):
         '''
         function which runs the crawler
         '''
-        pass
-
-        for url in self.uncrawled:
-            validated = validate_url(url=url)
-
-            if validated:
-                source = get_source(url=url)
-                links = find_links(source=source)
+        print('Crawling: {}'.format(self.baseurl))
+        self.client_session = aiohttp.ClientSession(headers=self.headers)
+        # provide the starting URL to the crawler
+        self.uncrawled.add(self.baseurl)
+
+        while len(self.uncrawled) > 0:
+            # print('################################ there are {0} uncrawled urls in the pool'.format(
+            #     len(self.uncrawled)))
+            url = self.uncrawled.pop()
+            # print('################ url popped, there are now {0} uncrawled urls in the pool'.format(
+            #     len(self.uncrawled)))
+            new_urls = await self.crawl_url(url=url)
+            for url in new_urls:
+                # print('adding: {0}'.format(url))
+                self.uncrawled.add(url)
+
+        await self.client_session.close()
+        return self.crawled
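crawl_url acquires self.semaphore before fetching, which only pays off once several fetches are in flight at once (run_loop as committed still awaits crawl_url one URL at a time). A self-contained sketch of the semaphore-bounded fetch pattern, with illustrative names and URLs that are not from this repo:

# Minimal sketch: a BoundedSemaphore capping concurrent aiohttp requests,
# mirroring the semaphore-then-get shape of crawl_url()/get_source().
import asyncio
import aiohttp

CONCURRENCY = 5
urls = ['http://example.com/{0}'.format(i) for i in range(20)]  # illustrative URLs


async def fetch(session, semaphore, url):
    async with semaphore:                   # at most CONCURRENCY holders at once
        # per-request timeout; the commit passes a bare number here instead
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=5)) as resp:
            return await resp.read()


async def fetch_all():
    semaphore = asyncio.BoundedSemaphore(CONCURRENCY)
    async with aiohttp.ClientSession() as session:
        # gather schedules every fetch; the semaphore throttles them
        return await asyncio.gather(*[fetch(session, semaphore, u) for u in urls],
                                    return_exceptions=True)


results = asyncio.run(fetch_all())

Each fetch() has to hold the semaphore before it may open a connection, so no more than CONCURRENCY requests are active even though gather schedules all of them at once.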
@@ -183,8 +231,8 @@ class WebPage(object):
         '''
         for url in self.discovered_hrefs:
             if url.startswith(self.base_url) and self.robots.check(url):
-                sanitised_url = sanitise_url(url=url)
-                self.urls_to_crawl.add(sanitised_url)
+                standardised_url = sanitise_url(url=url)
+                self.urls_to_crawl.add(standardised_url)


     def list_urls(self):
@@ -251,31 +299,35 @@ class RobotsTxt(object):
         return self.robots.can_fetch("*", url)


-def sanitise_url(url, base_url=False):
+def standardise_url(url=None, base_url=None):
     '''
-    If `base_url` is True, we attempt to standardise `url` to ensure it can be
-    prepended to relative URLs. If no scheme has been provided then we default
+    If `base_url` is None then we attempt to standarise the URL to ensure it can
+    be prepended to relative URLs. If no scheme has been provided then we default
     to http as any sane https-only site should 301 redirect http > https.

-    If `base_url` is False, we sanitise URLs to strip queries and fragments (we
-    don't want to scrape in-page anchors etc).
+    If `base_url` is set, we standardise URLs to strip queries and fragments (we
+    don't want to scrape in-page anchors etc). Any relative URLs will be appended
+    to the base url.

-    Returns a sanitised URL as a string.
+    Returns a standardised URL as a string.
     '''
     default_proto = 'http'
     delim = '://'

     split_url = urlsplit(url)

-    if base_url:
-        # This will sanitise the initial url for the initial page crawl.
+    if not base_url:
+        # This will sanitise the initial url provided by the user.
         if split_url.scheme and split_url.scheme.startswith('http'):
-            sanitised_url = "".join([split_url.scheme, delim, split_url.netloc])
+            return "".join([split_url.scheme, delim, split_url.netloc])
         elif (split_url.path and not split_url.scheme and not split_url.netloc):
-            sanitised_url = "".join([default_proto, delim, split_url.path])
+            return "".join([default_proto, delim, split_url.path])
     else:
         # Sanitise discovered URLs. We already expect them in the format
         # protocol://base_url/path
-        sanitised_url = "".join([split_url.scheme, delim, split_url.netloc, split_url.path])
+        if url.startswith('/'):
+            return urljoin(base_url, split_url.path)
+        elif url.startswith(base_url):
+            return "".join([split_url.scheme, delim, split_url.netloc, split_url.path])

-    return sanitised_url
+    return None
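standardise_url leans on urllib.parse.urlsplit and urljoin for its branches. A quick illustration of what those standard-library calls return for the cases the function distinguishes (the example URLs are made up):

# How urlsplit/urljoin behave for the cases standardise_url branches on.
# Example URLs are illustrative only.
from urllib.parse import urlsplit, urljoin

# Full URL with a scheme: scheme + '://' + netloc rebuilds the base.
print(urlsplit('https://example.com/about?q=1#top'))
# SplitResult(scheme='https', netloc='example.com', path='/about', query='q=1', fragment='top')

# Bare host with no scheme: everything lands in .path, so 'http://' + path is used.
print(urlsplit('example.com'))
# SplitResult(scheme='', netloc='', path='example.com', query='', fragment='')

# Relative href: urljoin resolves it against the base URL.
print(urljoin('http://example.com', '/docs/intro'))
# http://example.com/docs/intro

Rebuilding the result from scheme, netloc and path, rather than returning the input unchanged, is what strips queries and fragments from discovered links.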