Compare commits

...

2 Commits

3 changed files with 171 additions and 14 deletions

async_crawler.py (new file, 74 additions)

@@ -0,0 +1,74 @@
#!/usr/bin/env python
'''
Asynchronous recursive web crawler: crawls every page under a base URL
and renders the results as an HTML sitemap.
'''
import argparse
import jinja2
import os
import asyncio
from datetime import datetime

from utils.helpers import RobotsTxt, AsyncCrawler, sanitise_url


def init_crawler(url=None):
    '''
    Sanitises the base URL and fetches its robots.txt.
    '''
    # ensure we have a sensible URL to work with
    baseurl = sanitise_url(url=url, base_url=True)
    # get robots.txt
    robots = RobotsTxt(base_url=baseurl)
    return baseurl, robots


def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
    '''
    Renders the sitemap as an HTML file.
    '''
    urlcount = len(crawled_urls)
    sorted_urls = sorted(crawled_urls)
    tmpl = jinja2.Environment(
        loader=jinja2.FileSystemLoader('templates')
    ).get_template('sitemap.html.j2')
    rendered_html = tmpl.render(
        base_url=base_url, urlcount=urlcount, urls=sorted_urls, runtime=runtime)
    with open('sitemap.html', 'w') as outfile:
        outfile.write(rendered_html)
    print('Sitemap available at {0}/sitemap.html'.format(os.getcwd()))


def main(args=None):
    '''
    Builds the crawler, runs it to completion and renders the sitemap.
    '''
    starttime = datetime.now()
    baseurl, robots = init_crawler(url=args.url)
    # create a crawler
    async_crawler = AsyncCrawler(baseurl=baseurl, robots=robots,
                                 concurrency=args.concurrency)
    # run the crawl to completion on the event loop
    loop = asyncio.get_event_loop()
    result = loop.run_until_complete(async_crawler.run())
    loop.close()
    print(len(result))
    # render the sitemap from the crawl results
    # (assumes run() returns the set of crawled URLs)
    render_sitemap(base_url=baseurl, crawled_urls=result,
                   runtime=datetime.now() - starttime)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Recursive web crawler')
    parser.add_argument("-u", "--url", required=True, help="Base url to crawl")
    parser.add_argument("-s", "--concurrency", required=False, type=int, default=50,
                        help="Max number of pages to crawl concurrently")
    args = parser.parse_args()
    main(args)
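
For context, a minimal sketch of how the new entry point can be exercised without the CLI, reusing only the helpers imported above; the example.com URL and the concurrency value are illustrative, not part of the change:

import asyncio
from utils.helpers import AsyncCrawler, RobotsTxt, sanitise_url

# illustrative values only
baseurl = sanitise_url(url='https://example.com', base_url=True)
robots = RobotsTxt(base_url=baseurl)
crawler = AsyncCrawler(baseurl=baseurl, robots=robots, concurrency=10)

loop = asyncio.get_event_loop()
crawled = loop.run_until_complete(crawler.run())
loop.close()
print(len(crawled))

The equivalent command-line run, per the argparse setup above, would be python async_crawler.py -u https://example.com -s 10, with --concurrency defaulting to 50 when omitted.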

Second changed file (the existing synchronous crawler script)

@@ -6,6 +6,7 @@ Need a docstring.
 import argparse
 import jinja2
 import os
+import asyncio
 from datetime import datetime
 from utils.helpers import (UrlPool, WebPage, RobotsTxt, sanitise_url)
@@ -57,16 +58,18 @@ def process_pool(base_url=None, uncrawled_urls=None, crawled_urls=None, robots=N
         # create a WebPage object for the URL
         current_page = WebPage(url=new_url, base_url=base_url, robots=robots)
         try:
-            current_page.run()
-            _urls = current_page.list_urls()
-            crawled_urls.add_to_pool(new_url)
+            succeeded = current_page.run()
         except Exception as e:
             print(e)
-        for url in _urls:
-            sanitised_url = sanitise_url(url=url)
-            if sanitised_url not in crawled_urls.pool:
-                uncrawled_urls.add_to_pool(url)
+            succeeded = False
+        if succeeded:
+            _urls = current_page.list_urls()
+            crawled_urls.add_to_pool(new_url)
+            for url in _urls:
+                sanitised_url = sanitise_url(url=url)
+                if sanitised_url not in crawled_urls.pool:
+                    uncrawled_urls.add_to_pool(url)
         print('{0} URLs crawled, {1} remaining'.format(len(crawled_urls.pool),
                                                        len(uncrawled_urls.pool)))

Third changed file: utils/helpers.py

@@ -3,12 +3,90 @@
 Utilities to provide various misc functions.
 '''
-from bs4 import BeautifulSoup
 import urllib.request
-import urllib.robotparser
 import urllib.error
 import gzip
-from urllib.parse import (urljoin, urlsplit)
+import aiohttp
+import asyncio
+from bs4 import BeautifulSoup
+from urllib.parse import urljoin, urlsplit
+import urllib.robotparser
+
+
+class AsyncCrawler(object):
+    '''
+    Crawls all pages under a base URL concurrently using asyncio and aiohttp.
+    '''
+    def __init__(self, baseurl=None, robots=None, concurrency=None):
+        self.baseurl = baseurl
+        self.robots = robots
+        self.uncrawled = set()
+        self.crawled = set()
+        self.session = aiohttp.ClientSession()
+        self.semaphore = asyncio.BoundedSemaphore(concurrency)
+        # add the base URL to be crawled
+        self.uncrawled.add(baseurl)
+        self.headers = {'Accept-Encoding': 'gzip, deflate',
+                        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
+
+    def validate_url(self, url):
+        '''
+        Checks if the discovered URL is local to the base URL and allowed
+        by robots.txt.
+        '''
+        # ensure the URL is in a sane format
+        url = sanitise_url(url=url)
+        if url.startswith(self.baseurl) and self.robots.check(url=url):
+            return url
+        return False
+
+    async def get_source(self, url):
+        '''
+        Obtains the page's source, limiting concurrent requests with the
+        semaphore.
+        '''
+        async with self.semaphore:
+            async with self.session.get(url, headers=self.headers) as response:
+                source = await response.text()
+        return source
+
+    def find_links(self, source):
+        '''
+        Find all links in a page's source.
+        '''
+        links = set()
+        html = BeautifulSoup(source, 'lxml')
+        hrefs = html.find_all('a', href=True)
+        for href in hrefs:
+            url = self.validate_url(url=href['href'])
+            if url:
+                links.add(url)
+        return links
+
+    async def run(self):
+        '''
+        Crawls URLs from the uncrawled pool until it is empty and returns
+        the set of crawled URLs.
+        '''
+        while self.uncrawled:
+            url = self.uncrawled.pop()
+            validated = self.validate_url(url=url)
+            if validated and validated not in self.crawled:
+                source = await self.get_source(url=validated)
+                self.crawled.add(validated)
+                # queue any newly discovered, not yet crawled links
+                self.uncrawled.update(self.find_links(source=source) - self.crawled)
+        await self.session.close()
+        return self.crawled
+
+
 class UrlPool(object):
@@ -104,10 +182,9 @@ class WebPage(object):
         pool if they start with the base URL.
         '''
         for url in self.discovered_hrefs:
-            if url.startswith(self.url):
-                if self.robots.check(url):
-                    sanitised_url = sanitise_url(url=url)
-                    self.urls_to_crawl.add(sanitised_url)
+            if url.startswith(self.base_url) and self.robots.check(url):
+                sanitised_url = sanitise_url(url=url)
+                self.urls_to_crawl.add(sanitised_url)

     def list_urls(self):
@@ -132,6 +209,9 @@ class WebPage(object):
         if self.source:
             self.find_links()
             self.parse_urls()
+            return True
+        else:
+            return False


 class RobotsTxt(object):
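
A closing note on the shared filtering rule: the reworked WebPage.parse_urls and the new AsyncCrawler.validate_url both keep a discovered URL only if it sits under the base URL and robots.txt allows it. A standalone sketch of that check, using sanitise_url and a RobotsTxt instance as they appear in this diff; keep_url itself is a hypothetical name, not part of the changes:

def keep_url(href, base_url, robots):
    # hypothetical helper mirroring the filter shared by
    # WebPage.parse_urls and AsyncCrawler.validate_url
    url = sanitise_url(url=href)
    if url.startswith(base_url) and robots.check(url=url):
        return url
    return None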