Compare commits


21 Commits

Author SHA1 Message Date
5f7d66912f add test files 2018-09-19 08:39:05 +01:00
d4cd93e3d4 update docs 2018-09-19 08:38:49 +01:00
f5f6afd1a4 correct tests with new arg names 2018-09-19 08:37:55 +01:00
679b1b7b53 rename all instances of base_url to rooturl, add more documentation 2018-09-18 18:24:15 +01:00
32d7f1e54b add talking points 2018-09-18 18:23:12 +01:00
f6265f18a7 initial test for AsyncCrawler 2018-09-18 18:22:55 +01:00
9a4e9ddfc7 add test for missing robots.txt 2018-09-18 10:53:13 +01:00
51f988e1bc added more tests 2018-09-17 21:44:20 +01:00
73c21e5bd3 small improvements to docs and variables 2018-09-17 21:44:04 +01:00
eb2395d461 minor change to README 2018-09-17 08:11:26 +01:00
c53f62b55d add most changes suggested by pycodestyle 2018-09-16 16:10:38 +01:00
75d3756bbc fix errors discovered by pycyodestyle 2018-09-16 16:04:07 +01:00
5262c23281 add flags to README 2018-09-16 15:58:17 +01:00
524f6a45cd improve documentation 2018-09-16 15:53:47 +01:00
a926090bed update requirements 2018-09-16 15:44:30 +01:00
91cd988f52 more comments and progress output 2018-09-16 15:26:49 +01:00
f1855f5add re-order imports because I'm fussy 2018-09-16 09:06:30 +01:00
336517e84a more documentation and add back some required imports 2018-09-16 09:00:43 +01:00
7bc9fe0679 improved documentation and remove unneeded set 2018-09-16 08:56:44 +01:00
6548f55416 improve documentation 2018-09-15 21:48:50 +01:00
0244435fea remove unecessary imports 2018-09-15 21:38:51 +01:00
8 changed files with 203 additions and 122 deletions

View File

@@ -2,7 +2,7 @@
## Requirements ## Requirements
This crawler was written in 3.7.0 to take advantage of the latest `asyncio` features. This crawler requires at least Python 3.5 in order to utilise the async/await keywords from `asyncio`.
Install required modules: Install required modules:
@@ -13,9 +13,16 @@ pip install -r requirements.txt
Run: Run:
```bash ```bash
python crawler.py -u https://urltocrawl.com python async_crawler.py -u https://urltocrawl.com [-c 100]
``` ```
Flags:
- `-u/--url https://url.com`
- The base URL is required.
- `-c/--concurrency 100`
- Specifying concurrency value is optional (defaults to 100).
## Results ## Results
The resulting sitemap will be output in the root of this directory as `sitemap.html` The resulting sitemap will be output to the root of this directory as `sitemap.html`

View File

@@ -1,37 +1,59 @@
#!/usr/bin/env python #!/usr/bin/env python
''' '''
Need a docstring. Asynchronous web crawler written in Python 3.5+.
This script will respect the site's `robots.txt`, if one exists. If not, all
URLs discovered will be crawled.
The crawler takes a total of two arguments (concurrency is optional):
url: the root URL to begin the crawl from.
concurrency: the maximum number of pages which may be crawled concurrently.
''' '''
import argparse import argparse
import asyncio
from datetime import datetime
import jinja2 import jinja2
import os import os
import sys import sys
import asyncio
from datetime import datetime
from utils.helpers import RobotsTxt, AsyncCrawler, standardise_url from utils.helpers import RobotsTxt, AsyncCrawler, standardise_url
def sanity_checks(url=None): def sanity_checks(url=None):
''' '''
Runs some basic sanity checks before the crawler is initialised. Runs some basic sanity checks before the crawler is initialised.
Accepts:
url: the root URL to be crawled.
Returns:
rooturl: a string containing a validated and cleaned version of the
initial URL.
robots: an object which allows us to query whether a site may be crawled.
''' '''
# ensure we have a sensible URL to work with # ensure we have a sensible URL to work with
baseurl = standardise_url(url=url) rooturl = standardise_url(url=url)
# get robots.txt # get robots.txt
robots = RobotsTxt(base_url=baseurl) robots = RobotsTxt(rooturl=rooturl)
# fail early if robots denies all crawling # fail early if robots denies all crawling
if not robots.check(url=baseurl): if not robots.check(url=rooturl):
sys.exit("{baseurl} cannot be crawled (denied by robots.txt)".format(baseurl=baseurl)) sys.exit("{0} cannot be crawled (denied by robots.txt)".format(
rooturl))
return(baseurl, robots) return(rooturl, robots)
def render_sitemap(base_url=None, crawled_urls=None, runtime=None): def render_sitemap(rooturl=None, crawled_urls=None, runtime=None):
''' '''
Renders the sitemap to an HTML file. Renders the sitemap to an HTML file.
Accepts:
rooturl: string containing the root URL
crawled_urls: set containing discovered URLs
runtime: int representing run time of AsyncCrawler
''' '''
urlcount = len(crawled_urls) urlcount = len(crawled_urls)
sorted_urls = sorted(crawled_urls) sorted_urls = sorted(crawled_urls)
@@ -40,8 +62,8 @@ def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
loader=jinja2.FileSystemLoader('templates') loader=jinja2.FileSystemLoader('templates')
).get_template('sitemap.html.j2') ).get_template('sitemap.html.j2')
rendered_html = template.render( rendered_html = template.render(rooturl=rooturl, urlcount=urlcount,
base_url=base_url, urlcount=urlcount, urls=sorted_urls, runtime=runtime) urls=sorted_urls, runtime=runtime)
with open('sitemap.html', 'w') as outfile: with open('sitemap.html', 'w') as outfile:
outfile.write(rendered_html) outfile.write(rendered_html)
@@ -51,31 +73,36 @@ def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
def main(): def main():
''' '''
docstring Main function, responsible for prepping and running the crawler and
rendering the sitemap.
''' '''
starttime = datetime.now() starttime = datetime.now()
baseurl, robots = sanity_checks(url=args.url) rooturl, robots = sanity_checks(url=args.url)
# create a crawler # create a crawler
async_crawler = AsyncCrawler(baseurl=baseurl, robots=robots, concurrency=args.concurrency) async_crawler = AsyncCrawler(rooturl=rooturl, robots=robots,
concurrency=args.concurrency)
# create a task to run the crawler, run the loop and then gather the
# results.
task = asyncio.Task(async_crawler.main()) task = asyncio.Task(async_crawler.main())
loop = asyncio.get_event_loop() loop = asyncio.get_event_loop()
loop.run_until_complete(task) loop.run_until_complete(task)
loop.close() loop.close()
results = task.result() results = sorted(task.result())
runtime = int((datetime.now() - starttime).total_seconds()) runtime = int((datetime.now() - starttime).total_seconds())
render_sitemap(base_url=baseurl, crawled_urls=results, runtime=runtime) render_sitemap(rooturl=rooturl, crawled_urls=results, runtime=runtime)
if __name__ == '__main__': if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Recursive web crawler') parser = argparse.ArgumentParser(description='Recursive web crawler')
parser.add_argument("-u", "--url", required=True, help="Base url to crawl") parser.add_argument("-u", "--url", required=True, help="Initial url")
parser.add_argument("-c", "--concurrency", required=False, type=int, parser.add_argument("-c", "--concurrency", required=False, type=int,
default=100, help="Max number of pages to crawl concurrently") default=100, help="Max pages to crawl concurrently")
args = parser.parse_args() args = parser.parse_args()
main() main()
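For readers less familiar with `asyncio`, the task/loop plumbing added to `main()` can be seen in isolation below. This is a stand-alone sketch with a stub coroutine, not code from this repository; on Python 3.7+ `asyncio.run()` would do the same job, but the explicit loop keeps the stated 3.5 compatibility.
```python
import asyncio

async def crawl_stub():
    # stand-in for AsyncCrawler.main(): pretend to do async work, return results
    await asyncio.sleep(0.1)
    return {'http://example.com/', 'http://example.com/about/'}

task = asyncio.Task(crawl_stub())   # wrap the coroutine in a Task
loop = asyncio.get_event_loop()
loop.run_until_complete(task)       # drive the loop until the Task finishes
loop.close()
print(sorted(task.result()))        # results are available once the Task is done
```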

View File

@@ -18,39 +18,9 @@
* better exception handling * better exception handling
* randomise output filename * randomise output filename
### Async bits ### talking points
in `__main__`: - token bucket algo to enforce n requests per second
- read up on bucket algo types
```python - re-structuring AsyncCrawler to be more testable
loop = asyncio.get_event_loop() - use exponential backoff algo?
try:
loop.run_until_complete(main())
finally:
loop.close()
```
* initialises loop and runs it to completion
* needs to handle errors (try/except/finally)
```python
async def run(args=None):
tasks = []
for url in pool:
tasks.append(url)
# for i in range(10):
# tasks.append(asyncio.ensure_future(myCoroutine(i)))
# gather completed tasks
await asyncio.gather(*tasks)
```
Getting the contents of the page needs to be async too
```python
async def get_source():
blah
blah
await urlopen(url)
```
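The token-bucket talking point above ("enforce n requests per second") isn't implemented anywhere in this compare; a minimal `asyncio` sketch of the idea, with illustrative names and rates, might look like this:
```python
import asyncio
import time

class TokenBucket:
    '''Allow roughly `rate` acquisitions per second, bursting up to `capacity`.'''
    def __init__(self, rate=10, capacity=10):
        self.rate = rate
        self.capacity = capacity
        self.tokens = capacity
        self.updated = time.monotonic()

    async def acquire(self):
        while True:
            now = time.monotonic()
            # top the bucket up according to how much time has passed
            self.tokens = min(self.capacity,
                              self.tokens + (now - self.updated) * self.rate)
            self.updated = now
            if self.tokens >= 1:
                self.tokens -= 1
                return
            # not enough tokens yet; sleep until one should be available
            await asyncio.sleep((1 - self.tokens) / self.rate)
```
Each coroutine would `await bucket.acquire()` before issuing a request, complementing the existing `asyncio.BoundedSemaphore`, which caps concurrency but not request rate.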

View File

@@ -1,8 +1,12 @@
aiohttp==3.4.4
async-timeout==3.0.0
attrs==18.2.0
beautifulsoup4==4.6.3 beautifulsoup4==4.6.3
bs4==0.0.1 bs4==0.0.1
certifi==2018.8.13
chardet==3.0.4 chardet==3.0.4
idna==2.7 idna==2.7
Jinja2==2.10 Jinja2==2.10
lxml==4.2.4 lxml==4.2.4
MarkupSafe==1.0 MarkupSafe==1.0
multidict==4.4.0
yarl==1.2.6

View File

@@ -4,7 +4,7 @@
</head> </head>
<body> <body>
<p> <p>
Crawled {{ urlcount }} URLs on {{ base_url }} in ~{{ runtime }} seconds. Crawled {{ urlcount }} URLs on {{ rooturl }} in ~{{ runtime }} seconds.
<ul> <ul>
{% for url in urls %} {% for url in urls %}
<li><a href="{{ url }}">{{ url }}</a></li> <li><a href="{{ url }}">{{ url }}</a></li>

View File

@@ -0,0 +1,10 @@
<html>
<body>
<p>
<ul>
<li><a href="http://eu.httpbin.org/a/">http://eu.httpbin.org/a/</a></li>
<li><a href="http://eu.httpbin.org/b/">http://eu.httpbin.org/b/</a></li>
<li><a href="http://eu.httpbin.org/c/">http://eu.httpbin.org/c/</a></li>
</ul>
</body>
</html>

View File

@@ -1,34 +1,65 @@
#!/usr/bin/env python #!/usr/bin/env python
import unittest import unittest
from utils.helpers import (sanitise_url) from unittest import mock
from utils.helpers import RobotsTxt, standardise_url
class TestRobots(unittest.TestCase):
rooturl = 'http://eu.httpbin.org'
no_robots = 'https://www.simonweald.com'
test_paths = (('/', True), ('/deny', False))
robots = RobotsTxt(rooturl=rooturl)
norobots = RobotsTxt(rooturl=no_robots)
def test_robots_txt_deny(self):
'''
Asserts result is True or False.
'''
for path, allowed in self.test_paths:
result = self.robots.check(url=path)
self.assertIs(result, allowed)
def test_no_robots_txt(self):
'''
Ensure we can crawl if robots.txt isn't present.
'''
result = self.norobots.check(url='/')
self.assertTrue(result)
class TestUrls(unittest.TestCase): class TestUrls(unittest.TestCase):
base_url_list = (('eu.httpbin.org', 'http://eu.httpbin.org'), rooturl = 'http://eu.httpbin.org'
('www.simonweald.com', 'http://www.simonweald.com'),
('http://www.github.com/', 'http://www.github.com'),
('https://www.github.com', 'https://www.github.com'))
urls_to_clean = (('https://www.github.com/', 'https://www.github.com/'), rooturl_list = (('eu.httpbin.org', 'http://eu.httpbin.org'),
('https://github.com/?foo=bar', 'https://github.com/'), ('http://eu.httpbin.org/', 'http://eu.httpbin.org'),
('https://github.com/#anchor', 'https://github.com/')) ('https://eu.httpbin.org', 'https://eu.httpbin.org'))
urls_to_clean = (('http://eu.httpbin.org', 'http://eu.httpbin.org'),
('http://eu.httpbin.org/some/path/', 'http://eu.httpbin.org/some/path/'),
('http://eu.httpbin.org/index.html','http://eu.httpbin.org/index.html'),
('http://eu.httpbin.org/index.html?foo=bar', 'http://eu.httpbin.org/index.html'),
('http://eu.httpbin.org/index.html#anchor', 'http://eu.httpbin.org/index.html'))
def test_sanitise_base_url(self): def test_standardise_rooturl(self):
''' '''
Tests whether a URL's protocol can be discovered if not provided. Tests whether a base URL can be standardised to the format
proto://[sub].domain.tld.
''' '''
for url, target in self.base_url_list: for url, target in self.rooturl_list:
result = sanitise_url(url, base_url=True) result = standardise_url(url)
self.assertEqual(result, target) self.assertEqual(result, target)
def test_sanitise_url(self): def test_standardise_url(self):
''' '''
Tests whether a URL's protocol can be discovered if not provided. Ensure that fragments/anchors etc are stripped.
''' '''
for url, target in self.urls_to_clean: for url, target in self.urls_to_clean:
result = sanitise_url(url) result = standardise_url(url, rooturl=self.rooturl)
self.assertEqual(result, target) self.assertEqual(result, target)
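The new test module imports `mock` but does not use it yet, and the robots.txt tests above hit eu.httpbin.org and simonweald.com over the network. One possible way to exercise the missing-robots.txt path offline (a sketch only; `example.com` is a placeholder) would be to stub out `urllib.request.urlopen`, which `RobotsTxt.__init__` calls:
```python
import unittest
import urllib.error
from unittest import mock

from utils.helpers import RobotsTxt


class TestRobotsOffline(unittest.TestCase):
    def test_missing_robots_txt_allows_crawl(self):
        # Simulate a 404 for robots.txt without touching the network.
        error = urllib.error.HTTPError(
            url='http://example.com/robots.txt', code=404,
            msg='Not Found', hdrs=None, fp=None)
        with mock.patch('urllib.request.urlopen', side_effect=error):
            robots = RobotsTxt(rooturl='http://example.com')
        # A missing robots.txt should fall back to allowing everything.
        self.assertTrue(robots.check(url='/'))


if __name__ == '__main__':
    unittest.main()
```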

View File

@@ -3,38 +3,43 @@
Utilities to provide various misc functions. Utilities to provide various misc functions.
''' '''
# import urllib.request
# import urllib.error
# import gzip
# from time import sleep
import aiohttp import aiohttp
import asyncio import asyncio
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import urllib.error
from urllib.parse import urljoin, urlsplit from urllib.parse import urljoin, urlsplit
import urllib.request
import urllib.robotparser import urllib.robotparser
class AsyncCrawler(object): class AsyncCrawler(object):
''' '''
docstring A concurrent recursive web crawler.
A recursive web crawler which finds all URLs local to the domain specified
in the `rooturl` argument.
Arguments:
rooturl: Root domain to begin crawling.
robots: RobotsTxt object for the rooturl.
concurrency: number of concurrent pages to crawl.
Returns:
All discovered pages in a set.
''' '''
def __init__(self, baseurl=None, robots=None, concurrency=None): def __init__(self, rooturl=None, robots=None, concurrency=None):
self.baseurl = baseurl self.rooturl = rooturl
self.robots = robots self.robots = robots
self.uncrawled = set()
self.crawled = set() self.crawled = set()
self.headers = {'Accept-Encoding': 'gzip, deflate', self.headers = {'Accept-Encoding': 'gzip, deflate',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'} 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
self.client_session = None self.client_session = None
self.semaphore = asyncio.BoundedSemaphore(concurrency) self.semaphore = asyncio.BoundedSemaphore(concurrency)
async def crawl_url(self, url=None): async def crawl_url(self, url=None):
''' '''
docstring Crawls the given URL and finds all new URLs in the given page.
''' '''
urls = [] urls = []
source = await self.get_source(url) source = await self.get_source(url)
@@ -43,46 +48,45 @@ class AsyncCrawler(object):
return urls return urls
def validate_url(self, url=None): def validate_url(self, url=None):
''' '''
Ensures we have a valid URL to crawl and that the site's robots.txt Ensures we have a valid URL to crawl and that the site's robots.txt
allows it. allows it.
''' '''
# ensure the URL is in a sane format # ensure the URL is in a sane format
url = standardise_url(url=url, base_url=self.baseurl) url = standardise_url(url=url, rooturl=self.rooturl)
if url and self.robots.check(url=url): if url and self.robots.check(url=url):
return url return url
else: else:
return False return False
async def get_source(self, url=None): async def get_source(self, url=None):
''' '''
Obtains the page's source. Obtains the URL's source, provided it is HTML. Usage of semaphores
ensures only a certain number of coroutines can run at any given
time.
''' '''
async with self.semaphore: async with self.semaphore:
async with self.client_session.head(url, timeout=5) as head: async with self.client_session.head(url, timeout=5) as head:
try: try:
data = await head.read() _ = await head.read()
except Exception as e: except Exception:
print(e) pass
if 'text/html' in head.headers['Content-Type']: if 'text/html' in head.headers['Content-Type']:
async with self.client_session.get(url, timeout=5) as resp: async with self.client_session.get(url, timeout=5) as resp:
try: try:
source = await resp.read() source = await resp.read()
print('crawled {0}'.format(url))
return source return source
except Exception: except Exception:
return None return None
else: else:
print('{0} - {1}'.format(head.headers['Content-Type'], url)) return None
def find_all_urls(self, source=None): def find_all_urls(self, source=None):
''' '''
Find all URLs in a page's source. Find all URLs in a page's source. Returns a list of URLs which have
been validated as local to the starting URL.
''' '''
urls = [] urls = []
@@ -97,39 +101,58 @@ class AsyncCrawler(object):
return urls return urls
async def run(self, urls=None): async def run(self, urls=None):
'''
Crawls a batch of URLs of any size (resource usage is bounded by n
semaphores (where n = concurrency). Returns a set of URLs to be added
to the list of URLs which need to be crawled (find_all_urls only
returns unseen URLs).
'''
tasks = [] tasks = []
all_urls = set() all_urls = set()
for url in urls: for url in urls:
# mark the URL as seen.
self.crawled.add(url) self.crawled.add(url)
# create a task to crawl the URL.
tasks.append(self.crawl_url(url)) tasks.append(self.crawl_url(url))
# wait for all tasks to complete.
for task in asyncio.as_completed(tasks): for task in asyncio.as_completed(tasks):
urls = None urls = None
try: try:
# completed.append((await task)) # try getting all tasks as completed.
urls = await task urls = await task
except Exception as e: except Exception:
print(e) # skip until all tasks have completed.
pass
# add the URLs to a set to be returned.
if urls: if urls:
for url in urls: for url in urls:
all_urls.add(url) all_urls.add(url)
return all_urls return all_urls
async def main(self): async def main(self):
'''
Runs a crawl with batches of URLs. Once complete returns a list of all
crawled URLs.
'''
self.client_session = aiohttp.ClientSession(headers=self.headers) self.client_session = aiohttp.ClientSession(headers=self.headers)
to_crawl = [] to_crawl = []
to_crawl.append(self.baseurl) # add the root URL to initialise the crawler.
to_crawl.append(self.rooturl)
print('Crawling: {0}'.format(self.rooturl))
while len(to_crawl) > 0: while len(to_crawl) > 0:
discovered_urls = await self.run(urls=to_crawl) discovered_urls = await self.run(urls=to_crawl)
# empty to_crawl list and then add all newly discovered URLs for
# the next iteration.
to_crawl.clear() to_crawl.clear()
to_crawl.extend(discovered_urls) to_crawl.extend(discovered_urls)
print('{0} URLs crawled'.format(len(self.crawled)))
# close the session once all URLs have been crawled.
await self.client_session.close() await self.client_session.close()
return self.crawled return self.crawled
@@ -137,17 +160,23 @@ class AsyncCrawler(object):
class RobotsTxt(object): class RobotsTxt(object):
''' '''
needs a docstring Retrieve and query robots.txt for a given domain.
Retrieves and parses robots.txt for the given domain. Calling the check()
method returns True or False depending on whether crawling of that given
URL is allowed.
''' '''
def __init__(self, base_url=None): def __init__(self, rooturl=None):
''' '''
Manually retrieve robots.txt to allow us to set the user-agent. Manually retrieve robots.txt to allow us to set the user-agent (works
around sites which disallow access to robots.txt without a sane
user-agent).
''' '''
self.base_url = base_url self.rooturl = rooturl
self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'} self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
robots_url = urljoin(self.base_url, 'robots.txt') robots_url = urljoin(self.rooturl, 'robots.txt')
request = urllib.request.Request(robots_url, headers=self.headers) request = urllib.request.Request(robots_url, headers=self.headers)
robots = urllib.robotparser.RobotFileParser() robots = urllib.robotparser.RobotFileParser()
@@ -156,6 +185,7 @@ class RobotsTxt(object):
try: try:
response = urllib.request.urlopen(request, timeout=5) response = urllib.request.urlopen(request, timeout=5)
except urllib.error.HTTPError: except urllib.error.HTTPError:
# if robots.txt doesn't exist then allow all URLs to be crawled.
robots.allow_all = True robots.allow_all = True
else: else:
data = response.read() data = response.read()
@@ -164,7 +194,6 @@ class RobotsTxt(object):
self.robots = robots self.robots = robots
def check(self, url): def check(self, url):
''' '''
Test if robots allows us to crawl that URL. Test if robots allows us to crawl that URL.
@@ -172,25 +201,27 @@ class RobotsTxt(object):
return self.robots.can_fetch("*", url) return self.robots.can_fetch("*", url)
def standardise_url(url=None, base_url=None): def standardise_url(url=None, rooturl=None):
''' '''
If `base_url` is None then we attempt to standardise the URL to ensure it can If `rooturl` is None then we attempt to standardise the URL to ensure it
be prepended to relative URLs. If no scheme has been provided then we default can be prepended to relative URLs. If no scheme has been provided then we
to http as any sane https-only site should 301 redirect http > https. default to http as any sane https-only site should 301 redirect http to
https.
If `base_url` is set, we standardise URLs to strip queries and fragments (we If `rooturl` is set, we standardise URLs to strip queries and fragments
don't want to scrape in-page anchors etc). Any relative URLs will be appended (we don't want to scrape in-page anchors etc). Any relative URLs will be
to the base url. appended to the root url.
Returns a standardised URL as a string. Returns a standardised URL as a string.
''' '''
default_proto = 'http' default_proto = 'http'
delim = '://' delim = '://'
file_extensions = ('/', 'htm', 'html', 'xhtml', 'php', 'asp', 'aspx', 'cfm') file_extensions = ('/', 'htm', 'html', 'xhtml', 'php', 'asp', 'aspx',
'cfm')
split_url = urlsplit(url) split_url = urlsplit(url)
if not base_url: if not rooturl:
# This will sanitise the initial url provided by the user. # This will sanitise the initial url provided by the user.
if split_url.scheme and split_url.scheme.startswith('http'): if split_url.scheme and split_url.scheme.startswith('http'):
return "".join([split_url.scheme, delim, split_url.netloc]) return "".join([split_url.scheme, delim, split_url.netloc])
@@ -199,10 +230,11 @@ def standardise_url(url=None, base_url=None):
else: else:
# if url.endswith(file_extensions): # if url.endswith(file_extensions):
# Sanitise discovered URLs. We already expect them in the format # Sanitise discovered URLs. We already expect them in the format
# protocol://base_url/path # protocol://rooturl/path
if url.startswith('/'): if url.startswith('/'):
return urljoin(base_url, split_url.path) return urljoin(rooturl, split_url.path)
elif url.startswith(base_url): elif url.startswith(rooturl):
return "".join([split_url.scheme, delim, split_url.netloc, split_url.path]) return "".join([split_url.scheme, delim, split_url.netloc,
split_url.path])
return None return None
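Pulling the two branches of `standardise_url` together, the expected behaviour (as encoded in the updated tests and the code shown above) is roughly:
```python
from utils.helpers import standardise_url

# Initial user input: the scheme defaults to http and only the host is kept.
standardise_url(url='eu.httpbin.org')
# -> 'http://eu.httpbin.org'

# Discovered URLs: queries and fragments are stripped...
root = 'http://eu.httpbin.org'
standardise_url(url='http://eu.httpbin.org/index.html#anchor', rooturl=root)
# -> 'http://eu.httpbin.org/index.html'

# ...and relative paths are joined onto the root URL.
standardise_url(url='/a/', rooturl=root)
# -> 'http://eu.httpbin.org/a/'
```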