Compare commits


5 Commits

SHA1 Message Date
d6964672b6 commit of working async crawler 2018-09-15 21:30:02 +01:00
3808f72f73 correct semaphore usage 2018-09-14 16:06:17 +01:00
7ebe4855b8 remove unecessary classes2 2018-09-14 16:02:20 +01:00
db986b0eba async crawler in a mostly-working state 2018-09-14 16:01:12 +01:00
36e1f7693f initial foray into asynchronous crawling 2018-09-12 22:54:12 +01:00
2 changed files with 209 additions and 124 deletions

async_crawler.py (new file, 81 lines added)

@@ -0,0 +1,81 @@
#!/usr/bin/env python
'''
Need a docstring.
'''
import argparse
import jinja2
import os
import sys
import asyncio
from datetime import datetime
from utils.helpers import RobotsTxt, AsyncCrawler, standardise_url

def sanity_checks(url=None):
    '''
    Runs some basic sanity checks before the crawler is initialised.
    '''
    # ensure we have a sensible URL to work with
    baseurl = standardise_url(url=url)
    # get robots.txt
    robots = RobotsTxt(base_url=baseurl)
    # fail early if robots denies all crawling
    if not robots.check(url=baseurl):
        sys.exit("{baseurl} cannot be crawled (denied by robots.txt)".format(baseurl=baseurl))
    return(baseurl, robots)


def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
    '''
    Renders the sitemap to an HTML file.
    '''
    urlcount = len(crawled_urls)
    sorted_urls = sorted(crawled_urls)
    template = jinja2.Environment(
        loader=jinja2.FileSystemLoader('templates')
    ).get_template('sitemap.html.j2')
    rendered_html = template.render(
        base_url=base_url, urlcount=urlcount, urls=sorted_urls, runtime=runtime)
    with open('sitemap.html', 'w') as outfile:
        outfile.write(rendered_html)
    print('Sitemap available at {0}/sitemap.html'.format(os.getcwd()))


def main():
    '''
    docstring
    '''
    starttime = datetime.now()
    baseurl, robots = sanity_checks(url=args.url)
    # create a crawler
    async_crawler = AsyncCrawler(baseurl=baseurl, robots=robots, concurrency=args.concurrency)
    task = asyncio.Task(async_crawler.main())
    loop = asyncio.get_event_loop()
    loop.run_until_complete(task)
    loop.close()
    results = task.result()
    runtime = int((datetime.now() - starttime).total_seconds())
    render_sitemap(base_url=baseurl, crawled_urls=results, runtime=runtime)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Recursive web crawler')
    parser.add_argument("-u", "--url", required=True, help="Base url to crawl")
    parser.add_argument("-c", "--concurrency", required=False, type=int,
                        default=100, help="Max number of pages to crawl concurrently")
    args = parser.parse_args()
    main()
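
The main() function above drives the crawl with an explicit Task and event loop. As a point of reference only, a minimal sketch of an equivalent driver, assuming Python 3.7+ (an assumption, not something this changeset targets), could lean on asyncio.run():

# Illustrative sketch only: assumes Python 3.7+ and the AsyncCrawler class above.
# The helper name run_crawler is hypothetical.
import asyncio

def run_crawler(async_crawler):
    # asyncio.run() creates the event loop, runs the coroutine to completion
    # and closes the loop, replacing the Task/get_event_loop() boilerplate.
    return asyncio.run(async_crawler.main())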

utils/helpers.py

@@ -3,140 +3,138 @@
 Utilities to provide various misc functions.
 '''
-from bs4 import BeautifulSoup
+# import urllib.request
+# import urllib.error
+# import gzip
+# from time import sleep
 import aiohttp
-import urllib.request
+import asyncio
+from bs4 import BeautifulSoup
+from urllib.parse import urljoin, urlsplit
 import urllib.robotparser
-import urllib.error
-import gzip
-from urllib.parse import (urljoin, urlsplit)
-class UrlPool(object):
+class AsyncCrawler(object):
     '''
-    Object to manage a pool of URLs.
+    docstring
     '''
-    def __init__(self):
-        self.pool = set()
-    def check_duplicate(self, new_url):
-        '''
-        Checks if a URL exists in the current pool.
-        '''
-        if new_url in self.pool:
-            return True
-        else:
-            return False
-    def remove_from_pool(self):
-        '''
-        Remove a URL from the pool and return it to be crawled.
-        '''
-        return(self.pool.pop())
-    def add_to_pool(self, url):
-        self.pool.add(url)
-    def list_pool(self):
-        pool = self.pool
-        return pool
-class WebPage(object):
-    '''
-    Object to manage common operations required to return
-    the data from each individual page.
-    '''
-    # set a sane user-agent and request compression if available.
-    headers = {'Accept-Encoding': 'gzip, deflate',
-               'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
-    def __init__(self, url=None, base_url=None, robots=None):
-        self.url = url
-        self.base_url = base_url
+    def __init__(self, baseurl=None, robots=None, concurrency=None):
+        self.baseurl = baseurl
         self.robots = robots
-        self.source = None
-        self.urls_to_crawl = set()
-    def get_source(self):
+        self.uncrawled = set()
+        self.crawled = set()
+        self.headers = {'Accept-Encoding': 'gzip, deflate',
+                        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
+        self.client_session = None
+        self.semaphore = asyncio.BoundedSemaphore(concurrency)
+    async def crawl_url(self, url=None):
         '''
-        Retrieve a page's source.
+        docstring
         '''
-        request = urllib.request.Request(self.url, headers=self.headers)
-        page = urllib.request.urlopen(request, timeout=5)
-        # handle the content encoding in case it needs decompressing.
-        if 'text/html' in page.info().get('Content-Type'):
-            if page.info().get('Content-Encoding'):
-                if page.info().get('Content-Encoding') == 'gzip':
-                    self.source = gzip.decompress(page.read())
-                elif page.info().get('Content-Encoding') == 'deflate':
-                    self.source = page.read()
-            else:
-                self.source = page.read()
-    def find_links(self):
+        urls = []
+        source = await self.get_source(url)
+        if source:
+            urls = self.find_all_urls(source)
+        return urls
+    def validate_url(self, url=None):
         '''
-        Find all URLs on a page and ensure they are absolute. If they are
-        relative then they will be appended to the base URL.
+        Ensures we have a valid URL to crawl and that the site's robots.txt
+        allows it.
         '''
-        hrefs = set()
-        soup = BeautifulSoup(self.source, 'lxml')
-        links = soup.find_all('a', href=True)
-        for link in links:
-            if link['href'].startswith('/'):
-                hrefs.add(urljoin(self.url, link['href']))
-            else:
-                hrefs.add(link['href'])
-        self.discovered_hrefs = hrefs
-    def parse_urls(self):
-        '''
-        Iterate through the list of discovered URLs and add them to the
-        pool if they start with the base URL.
-        '''
-        for url in self.discovered_hrefs:
-            if url.startswith(self.base_url) and self.robots.check(url):
-                sanitised_url = sanitise_url(url=url)
-                self.urls_to_crawl.add(sanitised_url)
-    def list_urls(self):
-        '''
-        Returns all valid discovered URLs.
-        '''
-        return self.urls_to_crawl
-    def run(self):
-        '''
-        Attempt to get the page's source and if successful, iterate through it
-        to find any links we can crawl.
-        '''
-        try:
-            self.get_source()
-        except Exception:
-            # skip if we didn't retrieve the source.
-            pass
-        if self.source:
-            self.find_links()
-            self.parse_urls()
-            return True
+        # ensure the URL is in a sane format
+        url = standardise_url(url=url, base_url=self.baseurl)
+        if url and self.robots.check(url=url):
+            return url
         else:
             return False
+    async def get_source(self, url=None):
+        '''
+        Obtains the page's source.
+        '''
+        async with self.semaphore:
+            async with self.client_session.head(url, timeout=5) as head:
+                try:
+                    data = await head.read()
+                except Exception as e:
+                    print(e)
+                if 'text/html' in head.headers['Content-Type']:
+                    async with self.client_session.get(url, timeout=5) as resp:
+                        try:
+                            source = await resp.read()
+                            print('crawled {0}'.format(url))
+                            return source
+                        except Exception:
+                            return None
+                else:
+                    print('{0} - {1}'.format(head.headers['Content-Type'], url))
+    def find_all_urls(self, source=None):
+        '''
+        Find all URLs in a page's source.
+        '''
+        urls = []
+        html = BeautifulSoup(source, 'lxml')
+        hrefs = html.find_all('a', href=True)
+        # build a set of URLs which are valid and haven't been crawled yet
+        for href in hrefs:
+            url = self.validate_url(url=href['href'])
+            if url and url not in self.crawled:
+                urls.append(url)
+        return urls
+    async def run(self, urls=None):
+        tasks = []
+        all_urls = set()
+        for url in urls:
+            self.crawled.add(url)
+            tasks.append(self.crawl_url(url))
+        for task in asyncio.as_completed(tasks):
+            urls = None
+            try:
+                # completed.append((await task))
+                urls = await task
+            except Exception as e:
+                print(e)
+            if urls:
+                for url in urls:
+                    all_urls.add(url)
+        return all_urls
+    async def main(self):
+        self.client_session = aiohttp.ClientSession(headers=self.headers)
+        to_crawl = []
+        to_crawl.append(self.baseurl)
+        while len(to_crawl) > 0:
+            discovered_urls = await self.run(urls=to_crawl)
+            to_crawl.clear()
+            to_crawl.extend(discovered_urls)
+        await self.client_session.close()
+        return self.crawled
 class RobotsTxt(object):
     '''
     needs a docstring
@@ -174,31 +172,37 @@ class RobotsTxt(object):
         return self.robots.can_fetch("*", url)
-def sanitise_url(url, base_url=False):
+def standardise_url(url=None, base_url=None):
     '''
-    If `base_url` is True, we attempt to standardise `url` to ensure it can be
-    prepended to relative URLs. If no scheme has been provided then we default
+    If `base_url` is None then we attempt to standarise the URL to ensure it can
+    be prepended to relative URLs. If no scheme has been provided then we default
     to http as any sane https-only site should 301 redirect http > https.
-    If `base_url` is False, we sanitise URLs to strip queries and fragments (we
-    don't want to scrape in-page anchors etc).
-    Returns a sanitised URL as a string.
+    If `base_url` is set, we standardise URLs to strip queries and fragments (we
+    don't want to scrape in-page anchors etc). Any relative URLs will be appended
+    to the base url.
+    Returns a standardised URL as a string.
     '''
     default_proto = 'http'
     delim = '://'
+    file_extensions = ('/', 'htm', 'html', 'xhtml', 'php', 'asp', 'aspx', 'cfm')
     split_url = urlsplit(url)
-    if base_url:
-        # This will sanitise the initial url for the initial page crawl.
+    if not base_url:
+        # This will sanitise the initial url provided by the user.
         if split_url.scheme and split_url.scheme.startswith('http'):
-            sanitised_url = "".join([split_url.scheme, delim, split_url.netloc])
+            return "".join([split_url.scheme, delim, split_url.netloc])
         elif (split_url.path and not split_url.scheme and not split_url.netloc):
-            sanitised_url = "".join([default_proto, delim, split_url.path])
+            return "".join([default_proto, delim, split_url.path])
     else:
+        # if url.endswith(file_extensions):
         # Sanitise discovered URLs. We already expect them in the format
        # protocol://base_url/path
-        sanitised_url = "".join([split_url.scheme, delim, split_url.netloc, split_url.path])
-    return sanitised_url
+        if url.startswith('/'):
+            return urljoin(base_url, split_url.path)
+        elif url.startswith(base_url):
+            return "".join([split_url.scheme, delim, split_url.netloc, split_url.path])
+    return None
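
The core of the new AsyncCrawler above is the bounded-semaphore pattern in get_source(): a BoundedSemaphore caps how many requests are in flight while asyncio.as_completed() drains each batch of tasks. A self-contained sketch of that pattern, assuming Python 3.7+ and aiohttp, with placeholder URLs and hypothetical helper names, might look like this:

# Illustrative sketch of the concurrency pattern, not part of the changeset.
import asyncio
import aiohttp

async def fetch(session, semaphore, url):
    # blocks here once `concurrency` requests are already in flight
    async with semaphore:
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=5)) as resp:
            return url, resp.status

async def crawl_batch(urls, concurrency=10):
    # the semaphore is created inside the running event loop
    semaphore = asyncio.BoundedSemaphore(concurrency)
    async with aiohttp.ClientSession() as session:
        tasks = [fetch(session, semaphore, url) for url in urls]
        for task in asyncio.as_completed(tasks):
            url, status = await task
            print(url, status)

if __name__ == '__main__':
    asyncio.run(crawl_batch(['http://example.com/']))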