add test files

update docs
correct tests with new arg names
2018-09-19 08:39:05 +01:00 · 2018-09-19 08:38:49 +01:00 · 2018-09-19 08:37:55 +01:00 · 2018-09-18 18:24:15 +01:00 · 2018-09-18 18:23:12 +01:00 · 2018-09-18 18:22:55 +01:00
8 changed files with 181 additions and 113 deletions
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 ## Requirements
-This crawler was written in 3.7.0 to take advantage of the latest `asyncio` features.
+This crawler requires at least Python 3.5 in order to utilise the `async`/`await` keywords alongside `asyncio`.
 Install required modules:
@@ -13,9 +13,16 @@ pip install -r requirements.txt
 Run:
 ```bash
-python crawler.py -u https://urltocrawl.com
+python async_crawler.py -u https://urltocrawl.com [-c 100]
 ```
 Flags:
  - `-u/--url https://url.com`
    - The base URL is required.
  - `-c/--concurrency 100`
    - Specifying a concurrency value is optional (defaults to 100).
 ## Results
-The resulting sitemap will be output in the root of this directory as `sitemap.html`
+The resulting sitemap will be output to the root of this directory as `sitemap.html`
--- a/async_crawler.py
+++ b/async_crawler.py
@@ -1,7 +1,15 @@
 #!/usr/bin/env python
 '''
-Need a docstring.
+Asynchronous web crawler written in Python 3.5+.
 This script will respect the site's `robots.txt`, if one exists. If not, all
 URLs discovered will be crawled.
 The crawler takes a total of two arguments (concurrency is optional):
    url: the root URL to begin the crawl from.
    concurrency: the maximum number of pages which may be crawled concurrently.
 '''
 import argparse
@@ -16,22 +24,36 @@ from utils.helpers import RobotsTxt, AsyncCrawler, standardise_url
 def sanity_checks(url=None):
    '''
    Runs some basic sanity checks before the crawler is initialised.
    Accepts:
      url: the root URL to be crawled.
    Returns:
      rooturl: a string containing a validated and cleaned version of the
               initial URL.
      robots: an object which allows us to query whether a site may be crawled.
    '''
    # ensure we have a sensible URL to work with
-    baseurl = standardise_url(url=url)
+    rooturl = standardise_url(url=url)
    # get robots.txt
-    robots = RobotsTxt(base_url=baseurl)
+    robots = RobotsTxt(rooturl=rooturl)
    # fail early if robots denies all crawling
-    if not robots.check(url=baseurl):
+    if not robots.check(url=rooturl):
-        sys.exit("{baseurl} cannot be crawled (denied by robots.txt)".format(baseurl=baseurl))
+        sys.exit("{0} cannot be crawled (denied by robots.txt)".format(
                 rooturl))
-    return(baseurl, robots)
+    return(rooturl, robots)
-def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
+def render_sitemap(rooturl=None, crawled_urls=None, runtime=None):
    '''
    Renders the sitemap to an HTML file.
    Accepts:
        rooturl: string containing the root URL
        crawled_urls: set containing discovered URLs
        runtime: int representing run time of AsyncCrawler
    '''
    urlcount = len(crawled_urls)
    sorted_urls = sorted(crawled_urls)
@@ -40,8 +62,8 @@ def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
        loader=jinja2.FileSystemLoader('templates')
    ).get_template('sitemap.html.j2')
-    rendered_html = template.render(
+    rendered_html = template.render(rooturl=rooturl, urlcount=urlcount,
-        base_url=base_url, urlcount=urlcount, urls=sorted_urls, runtime=runtime)
+                                    urls=sorted_urls, runtime=runtime)
    with open('sitemap.html', 'w') as outfile:
        outfile.write(rendered_html)
@@ -51,31 +73,36 @@ def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
 def main():
    '''
-    docstring
+    Main function, responsible for prepping and running the crawler and
    rendering the sitemap.
    '''
    starttime = datetime.now()
-    baseurl, robots = sanity_checks(url=args.url)
+    rooturl, robots = sanity_checks(url=args.url)
    # create a crawler
-    async_crawler = AsyncCrawler(baseurl=baseurl, robots=robots, concurrency=args.concurrency)
+    async_crawler = AsyncCrawler(rooturl=rooturl, robots=robots,
                                 concurrency=args.concurrency)
    # create a task to run the crawler, run the loop and then gather the
    # results.
    task = asyncio.Task(async_crawler.main())
    loop = asyncio.get_event_loop()
    loop.run_until_complete(task)
    loop.close()
-    results = task.result()
+    results = sorted(task.result())
    runtime = int((datetime.now() - starttime).total_seconds())
-    render_sitemap(base_url=baseurl, crawled_urls=results, runtime=runtime)
+    render_sitemap(rooturl=rooturl, crawled_urls=results, runtime=runtime)
 if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Recursive web crawler')
-    parser.add_argument("-u", "--url", required=True, help="Base url to crawl")
+    parser.add_argument("-u", "--url", required=True, help="Initial url")
    parser.add_argument("-c", "--concurrency", required=False, type=int,
-                        default=100, help="Max number of pages to crawl concurrently")
+                        default=100, help="Max pages to crawl concurrently")
    args = parser.parse_args()
    main()
--- a/notes.md
+++ b/notes.md
@@ -18,39 +18,9 @@
  * better exception handling
  * randomise output filename
-### Async bits
+### talking points
-in `__main__`:
+ - token bucket algo to enforce n requests per second
-
+   - read up on bucket algo types
-```python
+ - re-structuring AsyncCrawler to be more testable
-loop = asyncio.get_event_loop()
+ - use exponential backoff algo?
 try:
    loop.run_until_complete(main())
 finally:
    loop.close()
 ```
  * initialises loop and runs it to completion
  * needs to handle errors (try/except/finally)
 ```python
 async def run(args=None):
    tasks = []
    for url in pool:
        tasks.append(url)
    # for i in range(10):
        # tasks.append(asyncio.ensure_future(myCoroutine(i)))
    # gather completed tasks
    await asyncio.gather(*tasks)
 ```
 Getting the contents of the page needs to be async too
 ```python
 async def get_source():
    blah
    blah
    await urlopen(url)
 ```
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,12 @@
 aiohttp==3.4.4
 async-timeout==3.0.0
 attrs==18.2.0
 beautifulsoup4==4.6.3
 bs4==0.0.1
 certifi==2018.8.13
 chardet==3.0.4
 idna==2.7
 Jinja2==2.10
 lxml==4.2.4
 MarkupSafe==1.0
 multidict==4.4.0
 yarl==1.2.6
--- a/templates/sitemap.html.j2
+++ b/templates/sitemap.html.j2
@@ -4,7 +4,7 @@
 </head>
 <body>
 <p>
-Crawled {{ urlcount }} URLs on {{ base_url }} in ~{{ runtime }} seconds.
+Crawled {{ urlcount }} URLs on {{ rooturl }} in ~{{ runtime }} seconds.
 <ul>
 {% for url in urls %}
  <li><a href="{{ url }}">{{ url }}</a></li>
--- a/test/files/find_all_urls.html
+++ b/test/files/find_all_urls.html
@@ -0,0 +1,10 @@
 <html>
 <body>
 <p>
 <ul>
  <li><a href="http://eu.httpbin.org/a/">http://eu.httpbin.org/a/</a></li>
  <li><a href="http://eu.httpbin.org/b/">http://eu.httpbin.org/b/</a></li>
  <li><a href="http://eu.httpbin.org/c/">http://eu.httpbin.org/c/</a></li>
 </ul>
 </body>
 </html>
--- a/test_helpers.py
+++ b/test_helpers.py
@@ -1,34 +1,65 @@
 #!/usr/bin/env python
 import unittest
-from utils.helpers import (sanitise_url)
+from unittest import mock
 from utils.helpers import RobotsTxt, standardise_url
 class TestRobots(unittest.TestCase):
    rooturl = 'http://eu.httpbin.org'
    no_robots = 'https://www.simonweald.com'
    test_paths = (('/', True), ('/deny', False))
    robots = RobotsTxt(rooturl=rooturl)
    norobots = RobotsTxt(rooturl=no_robots)
    def test_robots_txt_deny(self):
        '''
        Asserts result is True or False.
        '''
        for path, allowed in self.test_paths:
            result = self.robots.check(url=path)
            self.assertIs(result, allowed)
    def test_no_robots_txt(self):
        '''
        Ensure we can crawl if robots.txt isn't present.
        '''
        result = self.norobots.check(url='/')
        self.assertTrue(result)
 class TestUrls(unittest.TestCase):
-    base_url_list = (('eu.httpbin.org', 'http://eu.httpbin.org'),
+    rooturl = 'http://eu.httpbin.org'
                     ('www.simonweald.com', 'http://www.simonweald.com'),
                     ('http://www.github.com/', 'http://www.github.com'),
                     ('https://www.github.com', 'https://www.github.com'))
-    urls_to_clean = (('https://www.github.com/', 'https://www.github.com/'),
+    rooturl_list = (('eu.httpbin.org', 'http://eu.httpbin.org'),
-                     ('https://github.com/?foo=bar', 'https://github.com/'),
+                     ('http://eu.httpbin.org/', 'http://eu.httpbin.org'),
-                     ('https://github.com/#anchor', 'https://github.com/'))
+                     ('https://eu.httpbin.org', 'https://eu.httpbin.org'))
    urls_to_clean = (('http://eu.httpbin.org', 'http://eu.httpbin.org'),
                     ('http://eu.httpbin.org/some/path/', 'http://eu.httpbin.org/some/path/'),
                     ('http://eu.httpbin.org/index.html','http://eu.httpbin.org/index.html'),
                     ('http://eu.httpbin.org/index.html?foo=bar', 'http://eu.httpbin.org/index.html'),
                     ('http://eu.httpbin.org/index.html#anchor', 'http://eu.httpbin.org/index.html'))
-    def test_sanitise_base_url(self):
+    def test_standardise_rooturl(self):
        '''
-        Tests whether a URL's protocol can be discovered if not provided.
+        Tests whether a base URL can be standardised to the format
        proto://[sub].domain.tld.
        '''
-        for url, target in self.base_url_list:
+        for url, target in self.rooturl_list:
-            result = sanitise_url(url, base_url=True)
+            result = standardise_url(url)
            self.assertEqual(result, target)
-    def test_sanitise_url(self):
+    def test_standardise_url(self):
        '''
-        Tests whether a URL's protocol can be discovered if not provided.
+        Ensure that fragments/anchors etc are stripped.
        '''
        for url, target in self.urls_to_clean:
-            result = sanitise_url(url)
+            result = standardise_url(url, rooturl=self.rooturl)
            self.assertEqual(result, target)
--- a/utils/helpers.py
+++ b/utils/helpers.py
@@ -14,11 +14,22 @@ import urllib.robotparser
 class AsyncCrawler(object):
    '''
-    docstring
+    A concurrent recursive web crawler.
    A recursive web crawler which finds all URLs local to the domains specified
    in the `rooturl` argument.
    Arguments:
        rooturl:        Root domain to begin crawling.
        robots:         RobotsTxt object for the rooturl.
        concurrency:    number of concurrent pages to crawl.
    Returns:
        All discovered pages in a set.
    '''
-    def __init__(self, baseurl=None, robots=None, concurrency=None):
+    def __init__(self, rooturl=None, robots=None, concurrency=None):
-        self.baseurl = baseurl
+        self.rooturl = rooturl
        self.robots = robots
        self.crawled = set()
        self.headers = {'Accept-Encoding': 'gzip, deflate',
@@ -26,10 +37,9 @@ class AsyncCrawler(object):
        self.client_session = None
        self.semaphore = asyncio.BoundedSemaphore(concurrency)
    async def crawl_url(self, url=None):
        '''
-        Crawls the given URL and finds all new URLs in the initial page.
+        Crawls the given URL and finds all new URLs in the given page.
        '''
        urls = []
        source = await self.get_source(url)
@@ -38,32 +48,31 @@ class AsyncCrawler(object):
        return urls
    def validate_url(self, url=None):
        '''
        Ensures we have a valid URL to crawl and that the site's robots.txt
        allows it.
        '''
        # ensure the URL is in a sane format
-        url = standardise_url(url=url, base_url=self.baseurl)
+        url = standardise_url(url=url, rooturl=self.rooturl)
        if url and self.robots.check(url=url):
            return url
        else:
            return False
    async def get_source(self, url=None):
        '''
        Obtains the URL's source, provided it is HTML. Usage of semaphores
-        ensures only a certain number of coroutines can run at once.
+        ensures only a certain number of coroutines can run at any given
        time.
        '''
        async with self.semaphore:
            async with self.client_session.head(url, timeout=5) as head:
                try:
-                    data = await head.read()
+                    _ = await head.read()
-                except Exception as e:
+                except Exception:
-                    print(e)
+                    pass
            if 'text/html' in head.headers['Content-Type']:
                async with self.client_session.get(url, timeout=5) as resp:
                    try:
@@ -71,7 +80,8 @@ class AsyncCrawler(object):
                        return source
                    except Exception:
                        return None
-
+            else:
                return None
    def find_all_urls(self, source=None):
        '''
@@ -91,30 +101,30 @@ class AsyncCrawler(object):
        return urls
    async def run(self, urls=None):
        '''
        Crawls a batch of URLs of any size (resource usage is bounded by n
        semaphores (where n = concurrency)). Returns a set of URLs to be added
-        to the list of URLs which need to be crawled (find_all_urls only returns
+        to the list of URLs which need to be crawled (find_all_urls only
-        unseen URLs).
+        returns unseen URLs).
        '''
        tasks = []
        all_urls = set()
        for url in urls:
            # mark the URL as seen.
            self.crawled.add(url)
-            # create an coroutine to crawl the URL.
+            # create a task to crawl the URL.
            tasks.append(self.crawl_url(url))
        # wait for all tasks to complete.
        for task in asyncio.as_completed(tasks):
            urls = None
            try:
-                # completed.append((await task))
+                # try getting all tasks as completed.
                urls = await task
-            except Exception as e:
+            except Exception:
-                print(e)
+                # skip until all tasks have completed.
                pass
            # add the URLs to a set to be returned.
            if urls:
@@ -123,7 +133,6 @@ class AsyncCrawler(object):
        return all_urls
    async def main(self):
        '''
        Runs a crawl with batches of URLs. Once complete returns a list of all
@@ -131,14 +140,17 @@ class AsyncCrawler(object):
        '''
        self.client_session = aiohttp.ClientSession(headers=self.headers)
        to_crawl = []
-        to_crawl.append(self.baseurl)
+        # add the root URL to initialise the crawler.
        to_crawl.append(self.rooturl)
        print('Crawling: {0}'.format(self.rooturl))
        while len(to_crawl) > 0:
            discovered_urls = await self.run(urls=to_crawl)
-            # empty toe crawl list and then add all newly discovered URLs for
+            # empty to_crawl list and then add all newly discovered URLs for
            # the next iteration.
            to_crawl.clear()
            to_crawl.extend(discovered_urls)
            print('{0} URLs crawled'.format(len(self.crawled)))
        # close the session once all URLs have been crawled.
        await self.client_session.close()
@@ -148,19 +160,23 @@ class AsyncCrawler(object):
 class RobotsTxt(object):
    '''
-    needs a docstring
+    Retrieve and query robots.txt for a given domain.
    Retrieves and parses robots.txt for the given domain. Calling the check()
    method returns True or False depending on whether crawling of that given
    URL is allowed.
    '''
-    def __init__(self, base_url=None):
+    def __init__(self, rooturl=None):
        '''
        Manually retrieve robots.txt to allow us to set the user-agent (works
        around sites which disallow access to robots.txt without a sane
        user-agent).
        '''
-        self.base_url = base_url
+        self.rooturl = rooturl
        self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
-        robots_url = urljoin(self.base_url, 'robots.txt')
+        robots_url = urljoin(self.rooturl, 'robots.txt')
        request = urllib.request.Request(robots_url, headers=self.headers)
        robots = urllib.robotparser.RobotFileParser()
@@ -169,6 +185,7 @@ class RobotsTxt(object):
        try:
            response = urllib.request.urlopen(request, timeout=5)
        except urllib.error.HTTPError:
            # if robots.txt doesn't exist then allow all URLs to be crawled.
            robots.allow_all = True
        else:
            data = response.read()
@@ -177,7 +194,6 @@ class RobotsTxt(object):
        self.robots = robots
    def check(self, url):
        '''
        Test if robots allows us to crawl that URL.
@@ -185,25 +201,27 @@ class RobotsTxt(object):
        return self.robots.can_fetch("*", url)
-def standardise_url(url=None, base_url=None):
+def standardise_url(url=None, rooturl=None):
    '''
-    If `base_url` is None then we attempt to standarise the URL to ensure it can
+    If `rooturl` is None then we attempt to standardise the URL to ensure it
-    be prepended to relative URLs. If no scheme has been provided then we default
+    can be prepended to relative URLs. If no scheme has been provided then we
-    to http as any sane https-only site should 301 redirect http > https.
+    default to http as any sane https-only site should 301 redirect http to
    https.
-    If `base_url` is set, we standardise URLs to strip queries and fragments (we
+    If `rooturl` is set, we standardise URLs to strip queries and fragments
-    don't want to scrape in-page anchors etc). Any relative URLs will be appended
+    (we don't want to scrape in-page anchors etc). Any relative URLs will be
-    to the base url.
+    appended to the root url.
    Returns a standardised URL as a string.
    '''
    default_proto = 'http'
    delim = '://'
-    file_extensions = ('/', 'htm', 'html', 'xhtml', 'php', 'asp', 'aspx', 'cfm')
+    file_extensions = ('/', 'htm', 'html', 'xhtml', 'php', 'asp', 'aspx',
                       'cfm')
    split_url = urlsplit(url)
-    if not base_url:
+    if not rooturl:
        # This will sanitise the initial url provided by the user.
        if split_url.scheme and split_url.scheme.startswith('http'):
            return "".join([split_url.scheme, delim, split_url.netloc])
@@ -212,10 +230,11 @@ def standardise_url(url=None, base_url=None):
    else:
        # if url.endswith(file_extensions):
        # Sanitise discovered URLs. We already expect them in the format
-        # protocol://base_url/path
+        # protocol://rooturl/path
        if url.startswith('/'):
-            return urljoin(base_url, split_url.path)
+            return urljoin(rooturl, split_url.path)
-        elif url.startswith(base_url):
+        elif url.startswith(rooturl):
-            return "".join([split_url.scheme, delim, split_url.netloc, split_url.path])
+            return "".join([split_url.scheme, delim, split_url.netloc,
                            split_url.path])
    return None
Author	SHA1	Message	Date
Simon Weald	5f7d66912f	add test files	2018-09-19 08:39:05 +01:00
Simon Weald	d4cd93e3d4	update docs	2018-09-19 08:38:49 +01:00
Simon Weald	f5f6afd1a4	correct tests with new arg names	2018-09-19 08:37:55 +01:00
Simon Weald	679b1b7b53	rename all instances of base_url to rooturl, add more documentation	2018-09-18 18:24:15 +01:00
Simon Weald	32d7f1e54b	add talking points	2018-09-18 18:23:12 +01:00
Simon Weald	f6265f18a7	initial test for AsyncCrawler	2018-09-18 18:22:55 +01:00
Simon Weald	9a4e9ddfc7	add test for missing robots.txt	2018-09-18 10:53:13 +01:00
Simon Weald	51f988e1bc	added more tests	2018-09-17 21:44:20 +01:00
Simon Weald	73c21e5bd3	small improvements to docs and variables	2018-09-17 21:44:04 +01:00
Simon Weald	eb2395d461	minor change to README	2018-09-17 08:11:26 +01:00
Simon Weald	c53f62b55d	add most changes suggested by pycodestyle	2018-09-16 16:10:38 +01:00
Simon Weald	75d3756bbc	fix errors discovered by pycyodestyle	2018-09-16 16:04:07 +01:00
Simon Weald	5262c23281	add flags to README	2018-09-16 15:58:17 +01:00
Simon Weald	524f6a45cd	improve documentation	2018-09-16 15:53:47 +01:00
Simon Weald	a926090bed	update requirements	2018-09-16 15:44:30 +01:00
Simon Weald	91cd988f52	more comments and progress output	2018-09-16 15:26:49 +01:00