rename all instances of base_url to rooturl, add more documentation

add talking points
initial test for AsyncCrawler
2018-09-18 18:24:15 +01:00 · 2018-09-18 18:23:12 +01:00 · 2018-09-18 18:22:55 +01:00
5 changed files with 92 additions and 81 deletions
@@ -8,7 +8,7 @@ URLs discovered will be crawled.

 The crawler takes a total of two arguments (concurrency is optional):

-    url: the base URL to begin the crawl from.
+    url: the root URL to begin the crawl from.
    concurrency: the maximum number of pages which may be crawled concurrently.
 '''

@@ -29,32 +29,31 @@ def sanity_checks(url=None):
      url: the root URL to be crawled.

    Returns:
-      baseurl: a validated and cleaned version of the initial URL.
-              (type=string)
+      rooturl: a string containing a validated and cleaned version of the
+               initial URL.
      robots: an object which allows us to query whether a site may be crawled.
-              (type=RobotsTxt)
    '''
    # ensure we have a sensible URL to work with
-    baseurl = standardise_url(url=url)
+    rooturl = standardise_url(url=url)
    # get robots.txt
-    robots = RobotsTxt(base_url=baseurl)
+    robots = RobotsTxt(rooturl=rooturl)

    # fail early if robots denies all crawling
-    if not robots.check(url=baseurl):
-        sys.exit("{baseurl} cannot be crawled (denied by robots.txt)".format(
-            baseurl=baseurl))
+    if not robots.check(url=rooturl):
+        sys.exit("{0} cannot be crawled (denied by robots.txt)".format(
+                 rooturl))

-    return(baseurl, robots)
+    return(rooturl, robots)


-def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
+def render_sitemap(rooturl=None, crawled_urls=None, runtime=None):
    '''
    Renders the sitemap to an HTML file.

    Accepts:
-        base_url:
-        crawled_urls:
-        runtime:
+        rooturl: string containing the root URL
+        crawled_urls: set containing discovered URLs
+        runtime: int representing run time of AsyncCrawler
    '''
    urlcount = len(crawled_urls)
    sorted_urls = sorted(crawled_urls)
@@ -63,7 +62,7 @@ def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
        loader=jinja2.FileSystemLoader('templates')
    ).get_template('sitemap.html.j2')

-    rendered_html = template.render(base_url=base_url, urlcount=urlcount,
+    rendered_html = template.render(rooturl=rooturl, urlcount=urlcount,
                                    urls=sorted_urls, runtime=runtime)

    with open('sitemap.html', 'w') as outfile:
@@ -79,10 +78,10 @@ def main():
    '''
    starttime = datetime.now()

-    baseurl, robots = sanity_checks(url=args.url)
+    rooturl, robots = sanity_checks(url=args.url)

    # create a crawler
-    async_crawler = AsyncCrawler(baseurl=baseurl, robots=robots,
+    async_crawler = AsyncCrawler(rooturl=rooturl, robots=robots,
                                 concurrency=args.concurrency)

    # create a task to run the crawler, run the loop and then gather the
@@ -95,7 +94,7 @@ def main():

    runtime = int((datetime.now() - starttime).total_seconds())

-    render_sitemap(base_url=baseurl, crawled_urls=results, runtime=runtime)
+    render_sitemap(rooturl=rooturl, crawled_urls=results, runtime=runtime)


 if __name__ == '__main__':
@@ -18,39 +18,9 @@
  * better exception handling
  * randomise output filename

-### Async bits
+### talking points

-in `__main__`:
-
-```python
-loop = asyncio.get_event_loop()
-try:
-    loop.run_until_complete(main())
-finally:
-    loop.close()
-```
-
-  * initialises loop and runs it to completion
-  * needs to handle errors (try/except/finally)
-
-```python
-async def run(args=None):
-    tasks = []
-
-    for url in pool:
-        tasks.append(url)
-    # for i in range(10):
-        # tasks.append(asyncio.ensure_future(myCoroutine(i)))
-
-    # gather completed tasks
-    await asyncio.gather(*tasks)
-```
-
-Getting the contents of the page needs to be async too
-
-```python
-async def get_source():
-    blah
-    blah
-    await urlopen(url)
-```
+ - token bucket algo to enforce n requests per second
+   - read up on bucket algo types
+ - re-structuring AsyncCrawler to be more testable
+ - use exponential backoff algo?
@@ -4,7 +4,7 @@
 </head>
 <body>
 <p>
-Crawled {{ urlcount }} URLs on {{ base_url }} in ~{{ runtime }} seconds.
+Crawled {{ urlcount }} URLs on {{ rooturl }} in ~{{ runtime }} seconds.
 <ul>
 {% for url in urls %}
  <li><a href="{{ url }}">{{ url }}</a></li>
@@ -1,7 +1,28 @@
 #!/usr/bin/env python

 import unittest
-from utils.helpers import (RobotsTxt, standardise_url)
+from unittest import mock
+from utils.helpers import AsyncCrawler, RobotsTxt, standardise_url
+
+
+class TestAsyncCrawler(unittest.TestCase):
+
+    base_url = 'http://eu.httpbin.org'
+    concurrency = 10
+    testcrawler = AsyncCrawler(baseurl=base_url, concurrency=concurrency)
+    expected_urls = ['http://eu.httpbin.org/b/', 'http://eu.httpbin.org/c/']
+    crawled = set()
+    crawled.add('https://eu.httpbin.org/a/')
+
+    @mock.patch('utils.helpers.AsyncCrawler.validate_url', response=True)
+    def test_find_all_urls(self, validate_url):
+
+        with open('test/files/find_all_urls.html', 'r') as f:
+            source = f.read()
+
+        urls = self.testcrawler.find_all_urls(source=source)
+        self.assertEqual(urls, self.expected_urls)
+

 class TestRobots(unittest.TestCase):

@@ -14,11 +14,22 @@ import urllib.robotparser

 class AsyncCrawler(object):
    '''
-    docstring
+    A concurrent recursive web crawler.
+
+    A recursive web crawler which finds all URLs local to the domains specified
+    in the `rooturl` argument.
+
+    Arguments:
+        rooturl:        Root domain to begin crawling.
+        robots:         RobotsTxt object for the rooturl.
+        concurrency:    number of concurrent pages to crawl.
+
+    Returns:
+        All discovered pages in a set.
    '''

-    def __init__(self, baseurl=None, robots=None, concurrency=None):
-        self.baseurl = baseurl
+    def __init__(self, rooturl=None, robots=None, concurrency=None):
+        self.rooturl = rooturl
        self.robots = robots
        self.crawled = set()
        self.headers = {'Accept-Encoding': 'gzip, deflate',
@@ -28,7 +39,7 @@ class AsyncCrawler(object):

    async def crawl_url(self, url=None):
        '''
-        Crawls the given URL and finds all new URLs in the initial page.
+        Crawls the given URL and finds all new URLs in the given page.
        '''
        urls = []
        source = await self.get_source(url)
@@ -43,7 +54,7 @@ class AsyncCrawler(object):
        allows it.
        '''
        # ensure the URL is in a sane format
-        url = standardise_url(url=url, base_url=self.baseurl)
+        url = standardise_url(url=url, rooturl=self.rooturl)

        if url and self.robots.check(url=url):
            return url
@@ -53,14 +64,15 @@ class AsyncCrawler(object):
    async def get_source(self, url=None):
        '''
        Obtains the URL's source, provided it is HTML. Usage of semaphores
-        ensures only a certain number of coroutines can run at once.
+        ensures only a certain number of coroutines can run at any given
+        time.
        '''
        async with self.semaphore:
            async with self.client_session.head(url, timeout=5) as head:
                try:
                    _ = await head.read()
-                except Exception as e:
-                    print(e)
+                except Exception:
+                    pass
            if 'text/html' in head.headers['Content-Type']:
                async with self.client_session.get(url, timeout=5) as resp:
                    try:
@@ -68,6 +80,8 @@ class AsyncCrawler(object):
                        return source
                    except Exception:
                        return None
+            else:
+                return None

    def find_all_urls(self, source=None):
        '''
@@ -99,21 +113,22 @@ class AsyncCrawler(object):
        for url in urls:
            # mark the URL as seen.
            self.crawled.add(url)
-            # create an coroutine to crawl the URL.
+            # create a task to crawl the URL.
            tasks.append(self.crawl_url(url))

        # wait for all tasks to complete.
        for task in asyncio.as_completed(tasks):
            urls = None
            try:
+                # try getting all tasks as completed.
                urls = await task
-            except Exception as e:
-                print(e)
+            except Exception:
+                # skip until all tasks have completed.
+                pass

            # add the URLs to a set to be returned.
            if urls:
                for url in urls:
-                    print('Found: {0}'.format(url))
                    all_urls.add(url)

        return all_urls
@@ -125,15 +140,17 @@ class AsyncCrawler(object):
        '''
        self.client_session = aiohttp.ClientSession(headers=self.headers)
        to_crawl = []
-        to_crawl.append(self.baseurl)
+        # add the root URL to initialise the crawler.
+        to_crawl.append(self.rooturl)

-        print('Crawling: {0}'.format(self.baseurl))
+        print('Crawling: {0}'.format(self.rooturl))
        while len(to_crawl) > 0:
            discovered_urls = await self.run(urls=to_crawl)
            # empty to_crawl list and then add all newly discovered URLs for
            # the next iteration.
            to_crawl.clear()
            to_crawl.extend(discovered_urls)
+            print('{0} URLs crawled'.format(len(self.crawled)))

        # close the sessions once all URLs have been crawled.
        await self.client_session.close()
@@ -143,19 +160,23 @@ class AsyncCrawler(object):

 class RobotsTxt(object):
    '''
-    needs a docstring
+    Retrieve and query robots.txt for a given domain.
+
+    Retrieves and parses robots.txt for the given domain. Calling the check()
+    method returns True or False depending on whether crawling of that given
+    URL is allowed.
    '''

-    def __init__(self, base_url=None):
+    def __init__(self, rooturl=None):
        '''
        Manually retrieve robots.txt to allow us to set the user-agent (works
        around sites which disallow access to robots.txt without a sane
        user-agent).
        '''
-        self.base_url = base_url
+        self.rooturl = rooturl
        self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}

-        robots_url = urljoin(self.base_url, 'robots.txt')
+        robots_url = urljoin(self.rooturl, 'robots.txt')
        request = urllib.request.Request(robots_url, headers=self.headers)

        robots = urllib.robotparser.RobotFileParser()
@@ -180,16 +201,16 @@ class RobotsTxt(object):
        return self.robots.can_fetch("*", url)


-def standardise_url(url=None, base_url=None):
+def standardise_url(url=None, rooturl=None):
    '''
-    If `base_url` is None then we attempt to standarise the URL to ensure it
+    If `rooturl` is None then we attempt to standardise the URL to ensure it
    can be prepended to relative URLs. If no scheme has been provided then we
    default to http as any sane https-only site should 301 redirect http to
    https.

-    If `base_url` is set, we standardise URLs to strip queries and fragments
+    If `rooturl` is set, we standardise URLs to strip queries and fragments
    (we don't want to scrape in-page anchors etc). Any relative URLs will be
-    appended to the base url.
+    appended to the root url.

    Returns a standardised URL as a string.
    '''
@@ -200,7 +221,7 @@ def standardise_url(url=None, base_url=None):

    split_url = urlsplit(url)

-    if not base_url:
+    if not rooturl:
        # This will sanitise the initial url provided by the user.
        if split_url.scheme and split_url.scheme.startswith('http'):
            return "".join([split_url.scheme, delim, split_url.netloc])
@@ -209,10 +230,10 @@ def standardise_url(url=None, base_url=None):
    else:
        # if url.endswith(file_extensions):
        # Sanitise discovered URLs. We already expect them in the format
-        # protocol://base_url/path
+        # protocol://rooturl/path
        if url.startswith('/'):
-            return urljoin(base_url, split_url.path)
-        elif url.startswith(base_url):
+            return urljoin(rooturl, split_url.path)
+        elif url.startswith(rooturl):
            return "".join([split_url.scheme, delim, split_url.netloc,
                            split_url.path])
Author	SHA1	Message	Date
simon	679b1b7b53	rename all instances of base_url to rooturl, add more documentation	2018-09-18 18:24:15 +01:00
simon	32d7f1e54b	add talking points	2018-09-18 18:23:12 +01:00
simon	f6265f18a7	initial test for AsyncCrawler	2018-09-18 18:22:55 +01:00