Compare commits


21 Commits

Author SHA1 Message Date
5f7d66912f add test files 2018-09-19 08:39:05 +01:00
d4cd93e3d4 update docs 2018-09-19 08:38:49 +01:00
f5f6afd1a4 correct tests with new arg names 2018-09-19 08:37:55 +01:00
679b1b7b53 rename all instances of base_url to rooturl, add more documentation 2018-09-18 18:24:15 +01:00
32d7f1e54b add talking points 2018-09-18 18:23:12 +01:00
f6265f18a7 initial test for AsyncCrawler 2018-09-18 18:22:55 +01:00
9a4e9ddfc7 add test for missing robots.txt 2018-09-18 10:53:13 +01:00
51f988e1bc added more tests 2018-09-17 21:44:20 +01:00
73c21e5bd3 small improvements to docs and variables 2018-09-17 21:44:04 +01:00
eb2395d461 minor change to README 2018-09-17 08:11:26 +01:00
c53f62b55d add most changes suggested by pycodestyle 2018-09-16 16:10:38 +01:00
75d3756bbc fix errors discovered by pycyodestyle 2018-09-16 16:04:07 +01:00
5262c23281 add flags to README 2018-09-16 15:58:17 +01:00
524f6a45cd improve documentation 2018-09-16 15:53:47 +01:00
a926090bed update requirements 2018-09-16 15:44:30 +01:00
91cd988f52 more comments and progress output 2018-09-16 15:26:49 +01:00
f1855f5add re-order imports because I'm fussy 2018-09-16 09:06:30 +01:00
336517e84a more documentation and add back some required imports 2018-09-16 09:00:43 +01:00
7bc9fe0679 improved documentation and remove unneeded set 2018-09-16 08:56:44 +01:00
6548f55416 improve documentation 2018-09-15 21:48:50 +01:00
0244435fea remove unecessary imports 2018-09-15 21:38:51 +01:00
8 changed files with 203 additions and 122 deletions

View File

@@ -2,7 +2,7 @@
## Requirements ## Requirements
This crawler was written in 3.7.0 to take advantage of the latest `asyncio` features. This crawler requires at least Python 3.5 in order to utilise the async/await keywords from `asyncio`.
Install required modules: Install required modules:
@@ -13,9 +13,16 @@ pip install -r requirements.txt
Run: Run:
```bash ```bash
python crawler.py -u https://urltocrawl.com python async_crawler.py -u https://urltocrawl.com [-c 100]
``` ```
Flags:
- `-u/--url https://url.com`
- The base URL is required.
- `-c/--concurrency 100`
- Specifying concurrency value is optional (defaults to 100).
## Results ## Results
The resulting sitemap will be output in the root of this directory as `sitemap.html` The resulting sitemap will be output to the root of this directory as `sitemap.html`

View File

@@ -1,37 +1,59 @@
#!/usr/bin/env python #!/usr/bin/env python
''' '''
Need a docstring. Asynchronous web crawler written in Python 3.5+.
This script will respect the site's `robots.txt`, if one exists. If not, all
URLs discovered will be crawled.
The crawler takes a total of two arguments (concurrency is optional):
url: the root URL to begin the crawl from.
concurrency: the maximum number of pages which may be crawled concurrently.
''' '''
import argparse import argparse
import asyncio
from datetime import datetime
import jinja2 import jinja2
import os import os
import sys import sys
import asyncio
from datetime import datetime
from utils.helpers import RobotsTxt, AsyncCrawler, standardise_url from utils.helpers import RobotsTxt, AsyncCrawler, standardise_url
def sanity_checks(url=None): def sanity_checks(url=None):
''' '''
Runs some basic sanity checks before the crawler is initialised. Runs some basic sanity checks before the crawler is initialised.
Accepts:
url: the root URL to be crawled.
Returns:
rooturl: a string containing a validated and cleaned version of the
initial URL.
robots: an object which allows us to query whether a site may be crawled.
''' '''
# ensure we have a sensible URL to work with # ensure we have a sensible URL to work with
baseurl = standardise_url(url=url) rooturl = standardise_url(url=url)
# get robots.txt # get robots.txt
robots = RobotsTxt(base_url=baseurl) robots = RobotsTxt(rooturl=rooturl)
# fail early if robots denies all crawling # fail early if robots denies all crawling
if not robots.check(url=baseurl): if not robots.check(url=rooturl):
sys.exit("{baseurl} cannot be crawled (denied by robots.txt)".format(baseurl=baseurl)) sys.exit("{0} cannot be crawled (denied by robots.txt)".format(
rooturl))
return(baseurl, robots) return(rooturl, robots)
def render_sitemap(base_url=None, crawled_urls=None, runtime=None): def render_sitemap(rooturl=None, crawled_urls=None, runtime=None):
''' '''
Renders the sitemap to an HTML file. Renders the sitemap to an HTML file.
Accepts:
rooturl: string containing the root URL
crawled_urls: set containing discovered URLs
runtime: int representing run time of AsyncCrawler
''' '''
urlcount = len(crawled_urls) urlcount = len(crawled_urls)
sorted_urls = sorted(crawled_urls) sorted_urls = sorted(crawled_urls)
@@ -40,8 +62,8 @@ def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
loader=jinja2.FileSystemLoader('templates') loader=jinja2.FileSystemLoader('templates')
).get_template('sitemap.html.j2') ).get_template('sitemap.html.j2')
rendered_html = template.render( rendered_html = template.render(rooturl=rooturl, urlcount=urlcount,
base_url=base_url, urlcount=urlcount, urls=sorted_urls, runtime=runtime) urls=sorted_urls, runtime=runtime)
with open('sitemap.html', 'w') as outfile: with open('sitemap.html', 'w') as outfile:
outfile.write(rendered_html) outfile.write(rendered_html)
@@ -51,31 +73,36 @@ def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
def main(): def main():
''' '''
docstring Main function, responsible for prepping and running the crawler and
rendering the sitemap.
''' '''
starttime = datetime.now() starttime = datetime.now()
baseurl, robots = sanity_checks(url=args.url) rooturl, robots = sanity_checks(url=args.url)
# create a crawler # create a crawler
async_crawler = AsyncCrawler(baseurl=baseurl, robots=robots, concurrency=args.concurrency) async_crawler = AsyncCrawler(rooturl=rooturl, robots=robots,
concurrency=args.concurrency)
# create a task to run the crawler, run the loop and then gather the
# results.
task = asyncio.Task(async_crawler.main()) task = asyncio.Task(async_crawler.main())
loop = asyncio.get_event_loop() loop = asyncio.get_event_loop()
loop.run_until_complete(task) loop.run_until_complete(task)
loop.close() loop.close()
results = task.result() results = sorted(task.result())
runtime = int((datetime.now() - starttime).total_seconds()) runtime = int((datetime.now() - starttime).total_seconds())
render_sitemap(base_url=baseurl, crawled_urls=results, runtime=runtime) render_sitemap(rooturl=rooturl, crawled_urls=results, runtime=runtime)
if __name__ == '__main__': if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Recursive web crawler') parser = argparse.ArgumentParser(description='Recursive web crawler')
parser.add_argument("-u", "--url", required=True, help="Base url to crawl") parser.add_argument("-u", "--url", required=True, help="Initial url")
parser.add_argument("-c", "--concurrency", required=False, type=int, parser.add_argument("-c", "--concurrency", required=False, type=int,
default=100, help="Max number of pages to crawl concurrently") default=100, help="Max pages to crawl concurrently")
args = parser.parse_args() args = parser.parse_args()
main() main()
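For readers less familiar with `asyncio`, the task/loop plumbing added to `main()` can be seen in isolation below. This is a stand-alone sketch with a stub coroutine, not code from this repository; on Python 3.7+ `asyncio.run()` would do the same job, but the explicit loop keeps the stated 3.5 compatibility.
```python
import asyncio

async def crawl_stub():
    # stand-in for AsyncCrawler.main(): pretend to do async work, return results
    await asyncio.sleep(0.1)
    return {'http://example.com/', 'http://example.com/about/'}

task = asyncio.Task(crawl_stub())   # wrap the coroutine in a Task
loop = asyncio.get_event_loop()
loop.run_until_complete(task)       # drive the loop until the Task finishes
loop.close()
print(sorted(task.result()))        # results are available once the Task is done
```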

View File

@@ -18,39 +18,9 @@
* better exception handling * better exception handling
* randomise output filename * randomise output filename
### Async bits ### talking points
in `__main__`: - token bucket algo to enforce n requests per second
- read up on bucket algo types
```python - re-structuring AsyncCrawler to be more testable
loop = asyncio.get_event_loop() - use exponential backoff algo?
try:
loop.run_until_complete(main())
finally:
loop.close()
```
* initialises loop and runs it to completion
* needs to handle errors (try/except/finally)
```python
async def run(args=None):
tasks = []
for url in pool:
tasks.append(url)
# for i in range(10):
# tasks.append(asyncio.ensure_future(myCoroutine(i)))
# gather completed tasks
await asyncio.gather(*tasks)
```
Getting the contents of the page needs to be async too
```python
async def get_source():
blah
blah
await urlopen(url)
```
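The token-bucket talking point above ("enforce n requests per second") isn't implemented anywhere in this compare; a minimal `asyncio` sketch of the idea, with illustrative names and rates, might look like this:
```python
import asyncio
import time

class TokenBucket:
    '''Allow roughly `rate` acquisitions per second, bursting up to `capacity`.'''
    def __init__(self, rate=10, capacity=10):
        self.rate = rate
        self.capacity = capacity
        self.tokens = capacity
        self.updated = time.monotonic()

    async def acquire(self):
        while True:
            now = time.monotonic()
            # top the bucket up according to how much time has passed
            self.tokens = min(self.capacity,
                              self.tokens + (now - self.updated) * self.rate)
            self.updated = now
            if self.tokens >= 1:
                self.tokens -= 1
                return
            # not enough tokens yet; sleep until one should be available
            await asyncio.sleep((1 - self.tokens) / self.rate)
```
Each coroutine would `await bucket.acquire()` before issuing a request, complementing the existing `asyncio.BoundedSemaphore`, which caps concurrency but not request rate.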

View File

@@ -1,8 +1,12 @@
aiohttp==3.4.4
async-timeout==3.0.0
attrs==18.2.0
beautifulsoup4==4.6.3 beautifulsoup4==4.6.3
bs4==0.0.1 bs4==0.0.1
certifi==2018.8.13
chardet==3.0.4 chardet==3.0.4
idna==2.7 idna==2.7
Jinja2==2.10 Jinja2==2.10
lxml==4.2.4 lxml==4.2.4
MarkupSafe==1.0 MarkupSafe==1.0
multidict==4.4.0
yarl==1.2.6

View File

@@ -4,7 +4,7 @@
</head> </head>
<body> <body>
<p> <p>
Crawled {{ urlcount }} URLs on {{ base_url }} in ~{{ runtime }} seconds. Crawled {{ urlcount }} URLs on {{ rooturl }} in ~{{ runtime }} seconds.
<ul> <ul>
{% for url in urls %} {% for url in urls %}
<li><a href="{{ url }}">{{ url }}</a></li> <li><a href="{{ url }}">{{ url }}</a></li>

View File

@@ -0,0 +1,10 @@
<html>
<body>
<p>
<ul>
<li><a href="http://eu.httpbin.org/a/">http://eu.httpbin.org/a/</a></li>
<li><a href="http://eu.httpbin.org/b/">http://eu.httpbin.org/b/</a></li>
<li><a href="http://eu.httpbin.org/c/">http://eu.httpbin.org/c/</a></li>
</ul>
</body>
</html>

View File

@@ -1,34 +1,65 @@
#!/usr/bin/env python #!/usr/bin/env python
import unittest import unittest
from utils.helpers import (sanitise_url) from unittest import mock
from utils.helpers import RobotsTxt, standardise_url
class TestRobots(unittest.TestCase):
rooturl = 'http://eu.httpbin.org'
no_robots = 'https://www.simonweald.com'
test_paths = (('/', True), ('/deny', False))
robots = RobotsTxt(rooturl=rooturl)
norobots = RobotsTxt(rooturl=no_robots)
def test_robots_txt_deny(self):
'''
Asserts result is True or False.
'''
for path, allowed in self.test_paths:
result = self.robots.check(url=path)
self.assertIs(result, allowed)
def test_no_robots_txt(self):
'''
Ensure we can crawl if robots.txt isn't present.
'''
result = self.norobots.check(url='/')
self.assertTrue(result)
class TestUrls(unittest.TestCase): class TestUrls(unittest.TestCase):
base_url_list = (('eu.httpbin.org', 'http://eu.httpbin.org'), rooturl = 'http://eu.httpbin.org'
('www.simonweald.com', 'http://www.simonweald.com'),
('http://www.github.com/', 'http://www.github.com'),
('https://www.github.com', 'https://www.github.com'))
urls_to_clean = (('https://www.github.com/', 'https://www.github.com/'), rooturl_list = (('eu.httpbin.org', 'http://eu.httpbin.org'),
('https://github.com/?foo=bar', 'https://github.com/'), ('http://eu.httpbin.org/', 'http://eu.httpbin.org'),
('https://github.com/#anchor', 'https://github.com/')) ('https://eu.httpbin.org', 'https://eu.httpbin.org'))
urls_to_clean = (('http://eu.httpbin.org', 'http://eu.httpbin.org'),
('http://eu.httpbin.org/some/path/', 'http://eu.httpbin.org/some/path/'),
('http://eu.httpbin.org/index.html','http://eu.httpbin.org/index.html'),
('http://eu.httpbin.org/index.html?foo=bar', 'http://eu.httpbin.org/index.html'),
('http://eu.httpbin.org/index.html#anchor', 'http://eu.httpbin.org/index.html'))
def test_sanitise_base_url(self): def test_standardise_rooturl(self):
''' '''
Tests whether a URL's protocol can be discovered if not provided. Tests whether a base URL can be standardised to the format
proto://[sub].domain.tld.
''' '''
for url, target in self.base_url_list: for url, target in self.rooturl_list:
result = sanitise_url(url, base_url=True) result = standardise_url(url)
self.assertEqual(result, target) self.assertEqual(result, target)
def test_sanitise_url(self): def test_standardise_url(self):
''' '''
Tests whether a URL's protocol can be discovered if not provided. Ensure that fragments/anchors etc are stripped.
''' '''
for url, target in self.urls_to_clean: for url, target in self.urls_to_clean:
result = sanitise_url(url) result = standardise_url(url, rooturl=self.rooturl)
self.assertEqual(result, target) self.assertEqual(result, target)
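The new test module imports `mock` but does not use it yet, and the robots.txt tests above hit eu.httpbin.org and simonweald.com over the network. One possible way to exercise the missing-robots.txt path offline (a sketch only; `example.com` is a placeholder) would be to stub out `urllib.request.urlopen`, which `RobotsTxt.__init__` calls:
```python
import unittest
import urllib.error
from unittest import mock

from utils.helpers import RobotsTxt


class TestRobotsOffline(unittest.TestCase):
    def test_missing_robots_txt_allows_crawl(self):
        # Simulate a 404 for robots.txt without touching the network.
        error = urllib.error.HTTPError(
            url='http://example.com/robots.txt', code=404,
            msg='Not Found', hdrs=None, fp=None)
        with mock.patch('urllib.request.urlopen', side_effect=error):
            robots = RobotsTxt(rooturl='http://example.com')
        # A missing robots.txt should fall back to allowing everything.
        self.assertTrue(robots.check(url='/'))


if __name__ == '__main__':
    unittest.main()
```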

View File

@@ -3,38 +3,43 @@
Utilities to provide various misc functions. Utilities to provide various misc functions.
''' '''
# import urllib.request
# import urllib.error
# import gzip
# from time import sleep
import aiohttp import aiohttp
import asyncio import asyncio
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import urllib.error
from urllib.parse import urljoin, urlsplit from urllib.parse import urljoin, urlsplit
import urllib.request
import urllib.robotparser import urllib.robotparser
class AsyncCrawler(object): class AsyncCrawler(object):
''' '''
docstring A concurrent recursive web crawler.
A recursive web crawler which finds all URLs local to the domain specified
in the `rooturl` argument.
Arguments:
rooturl: Root domain to begin crawling.
robots: RobotsTxt object for the rooturl.
concurrency: number of concurrent pages to crawl.
Returns:
All discovered pages in a set.
''' '''
def __init__(self, baseurl=None, robots=None, concurrency=None): def __init__(self, rooturl=None, robots=None, concurrency=None):
self.baseurl = baseurl self.rooturl = rooturl
self.robots = robots self.robots = robots
self.uncrawled = set()
self.crawled = set() self.crawled = set()
self.headers = {'Accept-Encoding': 'gzip, deflate', self.headers = {'Accept-Encoding': 'gzip, deflate',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'} 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
self.client_session = None self.client_session = None
self.semaphore = asyncio.BoundedSemaphore(concurrency) self.semaphore = asyncio.BoundedSemaphore(concurrency)
async def crawl_url(self, url=None): async def crawl_url(self, url=None):
''' '''
docstring Crawls the given URL and finds all new URLs in the given page.
''' '''
urls = [] urls = []
source = await self.get_source(url) source = await self.get_source(url)
@@ -43,46 +48,45 @@ class AsyncCrawler(object):
return urls return urls
def validate_url(self, url=None): def validate_url(self, url=None):
''' '''
Ensures we have a valid URL to crawl and that the site's robots.txt Ensures we have a valid URL to crawl and that the site's robots.txt
allows it. allows it.
''' '''
# ensure the URL is in a sane format # ensure the URL is in a sane format
url = standardise_url(url=url, base_url=self.baseurl) url = standardise_url(url=url, rooturl=self.rooturl)
if url and self.robots.check(url=url): if url and self.robots.check(url=url):
return url return url
else: else:
return False return False
async def get_source(self, url=None): async def get_source(self, url=None):
''' '''
Obtains the page's source. Obtains the URL's source, provided it is HTML. Usage of semaphores
ensures only a certain number of coroutines can run at any given
time.
''' '''
async with self.semaphore: async with self.semaphore:
async with self.client_session.head(url, timeout=5) as head: async with self.client_session.head(url, timeout=5) as head:
try: try:
data = await head.read() _ = await head.read()
except Exception as e: except Exception:
print(e) pass
if 'text/html' in head.headers['Content-Type']: if 'text/html' in head.headers['Content-Type']:
async with self.client_session.get(url, timeout=5) as resp: async with self.client_session.get(url, timeout=5) as resp:
try: try:
source = await resp.read() source = await resp.read()
print('crawled {0}'.format(url))
return source return source
except Exception: except Exception:
return None return None
else: else:
print('{0} - {1}'.format(head.headers['Content-Type'], url)) return None
def find_all_urls(self, source=None): def find_all_urls(self, source=None):
''' '''
Find all URLs in a page's source. Find all URLs in a page's source. Returns a list of URLs which have
been validated as local to the starting URL.
''' '''
urls = [] urls = []
@@ -97,39 +101,58 @@ class AsyncCrawler(object):
return urls return urls
async def run(self, urls=None): async def run(self, urls=None):
'''
Crawls a batch of URLs of any size (resource usage is bounded by n
semaphores (where n = concurrency). Returns a set of URLs to be added
to the list of URLs which need to be crawled (find_all_urls only
returns unseen URLs).
'''
tasks = [] tasks = []
all_urls = set() all_urls = set()
for url in urls: for url in urls:
# mark the URL as seen.
self.crawled.add(url) self.crawled.add(url)
# create a task to crawl the URL.
tasks.append(self.crawl_url(url)) tasks.append(self.crawl_url(url))
# wait for all tasks to complete.
for task in asyncio.as_completed(tasks): for task in asyncio.as_completed(tasks):
urls = None urls = None
try: try:
# completed.append((await task)) # try getting all tasks as completed.
urls = await task urls = await task
except Exception as e: except Exception:
print(e) # skip until all tasks have completed.
pass
# add the URLs to a set to be returned.
if urls: if urls:
for url in urls: for url in urls:
all_urls.add(url) all_urls.add(url)
return all_urls return all_urls
async def main(self): async def main(self):
'''
Runs a crawl with batches of URLs. Once complete returns a list of all
crawled URLs.
'''
self.client_session = aiohttp.ClientSession(headers=self.headers) self.client_session = aiohttp.ClientSession(headers=self.headers)
to_crawl = [] to_crawl = []
to_crawl.append(self.baseurl) # add the root URL to initialise the crawler.
to_crawl.append(self.rooturl)
print('Crawling: {0}'.format(self.rooturl))
while len(to_crawl) > 0: while len(to_crawl) > 0:
discovered_urls = await self.run(urls=to_crawl) discovered_urls = await self.run(urls=to_crawl)
# empty to_crawl list and then add all newly discovered URLs for
# the next iteration.
to_crawl.clear() to_crawl.clear()
to_crawl.extend(discovered_urls) to_crawl.extend(discovered_urls)
print('{0} URLs crawled'.format(len(self.crawled)))
# close the session once all URLs have been crawled.
await self.client_session.close() await self.client_session.close()
return self.crawled return self.crawled
@@ -137,17 +160,23 @@ class AsyncCrawler(object):
class RobotsTxt(object): class RobotsTxt(object):
''' '''
needs a docstring Retrieve and query robots.txt for a given domain.
Retrieves and parses robots.txt for the given domain. Calling the check()
method returns True or False depending on whether crawling of that given
URL is allowed.
''' '''
def __init__(self, base_url=None): def __init__(self, rooturl=None):
''' '''
Manually retrieve robots.txt to allow us to set the user-agent. Manually retrieve robots.txt to allow us to set the user-agent (works
around sites which disallow access to robots.txt without a sane
user-agent).
''' '''
self.base_url = base_url self.rooturl = rooturl
self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'} self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
robots_url = urljoin(self.base_url, 'robots.txt') robots_url = urljoin(self.rooturl, 'robots.txt')
request = urllib.request.Request(robots_url, headers=self.headers) request = urllib.request.Request(robots_url, headers=self.headers)
robots = urllib.robotparser.RobotFileParser() robots = urllib.robotparser.RobotFileParser()
@@ -156,6 +185,7 @@ class RobotsTxt(object):
try: try:
response = urllib.request.urlopen(request, timeout=5) response = urllib.request.urlopen(request, timeout=5)
except urllib.error.HTTPError: except urllib.error.HTTPError:
# if robots.txt doesn't exist then allow all URLs to be crawled.
robots.allow_all = True robots.allow_all = True
else: else:
data = response.read() data = response.read()
@@ -164,7 +194,6 @@ class RobotsTxt(object):
self.robots = robots self.robots = robots
def check(self, url): def check(self, url):
''' '''
Test if robots allows us to crawl that URL. Test if robots allows us to crawl that URL.
@@ -172,25 +201,27 @@ class RobotsTxt(object):
return self.robots.can_fetch("*", url) return self.robots.can_fetch("*", url)
def standardise_url(url=None, base_url=None): def standardise_url(url=None, rooturl=None):
''' '''
If `base_url` is None then we attempt to standardise the URL to ensure it can If `rooturl` is None then we attempt to standardise the URL to ensure it
be prepended to relative URLs. If no scheme has been provided then we default can be prepended to relative URLs. If no scheme has been provided then we
to http as any sane https-only site should 301 redirect http > https. default to http as any sane https-only site should 301 redirect http to
https.
If `base_url` is set, we standardise URLs to strip queries and fragments (we If `rooturl` is set, we standardise URLs to strip queries and fragments
don't want to scrape in-page anchors etc). Any relative URLs will be appended (we don't want to scrape in-page anchors etc). Any relative URLs will be
to the base url. appended to the root url.
Returns a standardised URL as a string. Returns a standardised URL as a string.
''' '''
default_proto = 'http' default_proto = 'http'
delim = '://' delim = '://'
file_extensions = ('/', 'htm', 'html', 'xhtml', 'php', 'asp', 'aspx', 'cfm') file_extensions = ('/', 'htm', 'html', 'xhtml', 'php', 'asp', 'aspx',
'cfm')
split_url = urlsplit(url) split_url = urlsplit(url)
if not base_url: if not rooturl:
# This will sanitise the initial url provided by the user. # This will sanitise the initial url provided by the user.
if split_url.scheme and split_url.scheme.startswith('http'): if split_url.scheme and split_url.scheme.startswith('http'):
return "".join([split_url.scheme, delim, split_url.netloc]) return "".join([split_url.scheme, delim, split_url.netloc])
@@ -199,10 +230,11 @@ def standardise_url(url=None, base_url=None):
else: else:
# if url.endswith(file_extensions): # if url.endswith(file_extensions):
# Sanitise discovered URLs. We already expect them in the format # Sanitise discovered URLs. We already expect them in the format
# protocol://base_url/path # protocol://rooturl/path
if url.startswith('/'): if url.startswith('/'):
return urljoin(base_url, split_url.path) return urljoin(rooturl, split_url.path)
elif url.startswith(base_url): elif url.startswith(rooturl):
return "".join([split_url.scheme, delim, split_url.netloc, split_url.path]) return "".join([split_url.scheme, delim, split_url.netloc,
split_url.path])
return None return None
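Pulling the two branches of `standardise_url` together, the expected behaviour (as encoded in the updated tests and the code shown above) is roughly:
```python
from utils.helpers import standardise_url

# Initial user input: the scheme defaults to http and only the host is kept.
standardise_url(url='eu.httpbin.org')
# -> 'http://eu.httpbin.org'

# Discovered URLs: queries and fragments are stripped...
root = 'http://eu.httpbin.org'
standardise_url(url='http://eu.httpbin.org/index.html#anchor', rooturl=root)
# -> 'http://eu.httpbin.org/index.html'

# ...and relative paths are joined onto the root URL.
standardise_url(url='/a/', rooturl=root)
# -> 'http://eu.httpbin.org/a/'
```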