add test files

update docs
correct tests with new arg names
2018-09-19 08:39:05 +01:00 · 2018-09-19 08:38:49 +01:00 · 2018-09-19 08:37:55 +01:00 · 2018-09-18 18:24:15 +01:00 · 2018-09-18 18:23:12 +01:00 · 2018-09-18 18:22:55 +01:00
11 changed files with 548 additions and 192 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@
 venv/
 .vscode/*
 __pycache__/
+sitemap.html
--- a/README.md
+++ b/README.md
@@ -1 +1,28 @@
-# Concurrent web scraper
+# Concurrent web scraper
+
+## Requirements
+
+This crawler requires at least Python 3.5 in order to utilise the async/await keywords from `asyncio`.
+
+Install required modules:
+
+```bash
+pip install -r requirements.txt
+```
+
+Run:
+
+```bash
+python async_crawler.py -u https://urltocrawl.com [-c 100]
+```
+
+Flags:
+
+  - `-u/--url https://url.com`
+    - The base URL is required.
+  - `-c/--concurrency 100`
+    - Specifying concurrency value is optional (defaults to 100).
+
+## Results
+
+The resulting sitemap will be output to the root of this directory as `sitemap.html`.
--- a/async_crawler.py
+++ b/async_crawler.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python
+
+'''
+Asynchronous web crawler written in Python 3.5+.
+
+This script will respect the site's `robots.txt`, if one exists. If not, all
+URLs discovered will be crawled.
+
+The crawler takes a total of two arguments (concurrency is optional):
+
+    url: the root URL to begin the crawl from.
+    concurrency: the maximum number of pages which may be crawled concurrently.
+'''
+
+import argparse
+import asyncio
+from datetime import datetime
+import jinja2
+import os
+import sys
+from utils.helpers import RobotsTxt, AsyncCrawler, standardise_url
+
+
+def sanity_checks(url=None):
+    '''
+    Runs some basic sanity checks before the crawler is initialised.
+
+    Accepts:
+      url: the root URL to be crawled.
+
+    Returns:
+      rooturl: a string containing a validated and cleaned version of the
+               initial URL.
+      robots: an object which allows us to query whether a site may be crawled.
+    '''
+    # ensure we have a sensible URL to work with
+    rooturl = standardise_url(url=url)
+    # get robots.txt
+    robots = RobotsTxt(rooturl=rooturl)
+
+    # fail early if robots denies all crawling
+    if not robots.check(url=rooturl):
+        sys.exit("{0} cannot be crawled (denied by robots.txt)".format(
+                 rooturl))
+
+    return(rooturl, robots)
+
+
+def render_sitemap(rooturl=None, crawled_urls=None, runtime=None):
+    '''
+    Renders the sitemap to an HTML file.
+
+    Accepts:
+        rooturl: string containing the root URL
+        crawled_urls: set containing discovered URLs
+        runtime: int representing run time of AsyncCrawler in seconds
+    '''
+    urlcount = len(crawled_urls)
+    sorted_urls = sorted(crawled_urls)
+
+    template = jinja2.Environment(
+        loader=jinja2.FileSystemLoader('templates')
+    ).get_template('sitemap.html.j2')
+
+    rendered_html = template.render(rooturl=rooturl, urlcount=urlcount,
+                                    urls=sorted_urls, runtime=runtime)
+
+    with open('sitemap.html', 'w') as outfile:
+        outfile.write(rendered_html)
+
+    print('Sitemap available at {0}/sitemap.html'.format(os.getcwd()))
+
+
+def main():
+    '''
+    Main function, responsible for prepping and running the crawler and
+    rendering the sitemap.
+    '''
+    starttime = datetime.now()
+
+    rooturl, robots = sanity_checks(url=args.url)
+
+    # create a crawler
+    async_crawler = AsyncCrawler(rooturl=rooturl, robots=robots,
+                                 concurrency=args.concurrency)
+
+    # create a task to run the crawler, run the loop and then gather the
+    # results.
+    task = asyncio.Task(async_crawler.main())
+    loop = asyncio.get_event_loop()
+    loop.run_until_complete(task)
+    loop.close()
+    results = sorted(task.result())
+
+    runtime = int((datetime.now() - starttime).total_seconds())
+
+    render_sitemap(rooturl=rooturl, crawled_urls=results, runtime=runtime)
+
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser(description='Recursive web crawler')
+    parser.add_argument("-u", "--url", required=True, help="Initial url")
+    parser.add_argument("-c", "--concurrency", required=False, type=int,
+                        default=100, help="Max pages to crawl concurrently")
+    args = parser.parse_args()
+
+    main()
--- a/crawler.py
+++ b/crawler.py
@@ -4,20 +4,112 @@ Need a docstring.
 '''

 import argparse
-from utils.helpers import (UrlPool, WebPage, sanitise_url, qualify_url)
+import jinja2
+import os
+import asyncio
+from datetime import datetime
+from utils.helpers import (UrlPool, WebPage, RobotsTxt, sanitise_url)

-def init_crawler(base_url=None):
+
+def init_crawler(base_url=None, robots=None):
    '''
-    needs a docstring
+    Initialises the crawler by running the initial URL.
    '''
+    uncrawled_urls, crawled_urls = UrlPool(), UrlPool()
+    initial_page = WebPage(url=base_url, base_url=base_url, robots=robots)
+
+    try:
+        initial_page.run()
+    except Exception as e:
+        print(e)
+
+    initial_urls = initial_page.list_urls()
+
+    # ensure the base URL isn't crawled again
+    try:
+        initial_urls.remove(base_url)
+    except KeyError:
+        pass
+    # also ensure base URL wasn't discovered with a trailing slash on the
+    # initial page scrape
+    try:
+        initial_urls.remove("".join([base_url, '/']))
+    except KeyError:
+        pass
+
+    # Add the base URL to the crawled pool
+    crawled_urls.add_to_pool(base_url)
+
+    for url in initial_urls:
+        sanitised_url = sanitise_url(url=url)
+        if sanitised_url not in crawled_urls.pool:
+            uncrawled_urls.add_to_pool(sanitised_url)
+
+    return(uncrawled_urls, crawled_urls)
+
+
+def process_pool(base_url=None, uncrawled_urls=None, crawled_urls=None, robots=None):
+    '''
+    Iterates over the pool of URLs and adds any discovered URLs.
+    '''
+    while uncrawled_urls.pool:
+        # pop url from pool
+        new_url = uncrawled_urls.remove_from_pool()
+        # create a WebPage object for the URL
+        current_page = WebPage(url=new_url, base_url=base_url, robots=robots)
+        try:
+            succeeded = current_page.run()
+        except Exception as e:
+            print(e)
+
+        if succeeded:
+            _urls = current_page.list_urls()
+            crawled_urls.add_to_pool(new_url)
+
+            for url in _urls:
+                sanitised_url = sanitise_url(url=url)
+                if sanitised_url not in crawled_urls.pool:
+                    uncrawled_urls.add_to_pool(url)
+
+        print('{0} URLs crawled, {1} remaining'.format(len(crawled_urls.pool),
+                len(uncrawled_urls.pool)))
+
+
+def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
+    '''
+    Renders the sitemap as an HTML file.
+    '''
+    urlcount = len(crawled_urls)
+    sorted_urls = sorted(crawled_urls)
+
+    tmpl = jinja2.Environment(
+        loader=jinja2.FileSystemLoader('templates')
+        ).get_template('sitemap.html.j2')
+
+    rendered_html = tmpl.render(base_url=base_url, urlcount=urlcount, urls=sorted_urls, runtime=runtime)
+
+    with open('sitemap.html', 'w') as outfile:
+        outfile.write(rendered_html)
+
+    print('Sitemap available at {0}/sitemap.html'.format(os.getcwd()))


 def run(args=None):
    '''
    needs a docstring.
    '''
-    base_url = sanitise_url(args.url)
-    print(base_url)
+    starttime = datetime.now()
+
+    base_url = sanitise_url(args.url, base_url=True)
+    robots = RobotsTxt(base_url=base_url)
+
+    uncrawled_urls, crawled_urls = init_crawler(base_url, robots)
+    process_pool(base_url, uncrawled_urls, crawled_urls, robots)
+
+    runtime = int((datetime.now() - starttime).total_seconds())
+
+    render_sitemap(base_url=base_url, crawled_urls=crawled_urls.pool, runtime=runtime)
+

 if __name__ == '__main__':

--- a/notes.md
+++ b/notes.md
@@ -1,9 +1,26 @@
 ## Thoughts

-###### for each URL, do the following:
- * mark it as crawled
- * get page content
-   * if that fails, mark the link as invalid
- * find all links in the content
-   * check each link for dupes
-   * add to pool or discard
+  * ~~strip hashes and everything following (as they're in-page anchors)~~
+  * ~~strip args~~
+  * ~~use `pop()` on the set instead of `.remove()`~~
+    * ~~return false once the set is empty~~
+  * ~~`WebPage.parse_urls()` needs to compare startswith to base url~~
+  * ~~ignore any links which aren't to pages~~
+  * ~~better url checking to get bare domain~~ #wontfix
+  * ~~remove trailing slash from any discovered url~~
+  * ~~investigate lxml parser~~
+  * ~~remove base url from initial urls with and without trailing slash~~
+  * ~~investigate using [tldextract](https://github.com/john-kurkowski/tldextract) to match urls~~ #wontfix
+  * ~~implement parsing of [robots.txt](http://docs.w3cub.com/python~3.6/library/urllib.robotparser/)~~
+  * ~~investigate [gzip encoding](https://stackoverflow.com/questions/36383227/avoid-downloading-images-using-beautifulsoup-and-urllib-request)~~
+  * ~~implement some kind of progress display~~
+  * async
+  * better exception handling
+  * randomise output filename
+
+### talking points
+
+ - token bucket algo to enforce n requests per second
+   - read up on bucket algo types
+ - re-structuring AsyncCrawler to be more testable
+ - use exponential backoff algo?
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,12 @@
+aiohttp==3.4.4
+async-timeout==3.0.0
+attrs==18.2.0
 beautifulsoup4==4.6.3
 bs4==0.0.1
-certifi==2018.8.13
 chardet==3.0.4
 idna==2.7
-requests==2.19.1
-urllib3==1.23
+Jinja2==2.10
+lxml==4.2.4
+MarkupSafe==1.0
+multidict==4.4.0
+yarl==1.2.6
--- a/scraper.py
+++ b/scraper.py
@@ -1,88 +0,0 @@
-#!/usr/bin/env python
-
-import re
-import argparse
-import urllib.request
-from bs4 import BeautifulSoup
-
-class WebPage(object):
-
-    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
-
-    def __init__(self, args):
-        self.url = args['url']
-
-
-    def get_source(self, args=None):
-        request = urllib.request.Request(self.url, headers=headers)
-        page = urllib.request.urlopen(request)
-        self.source = page.read()
-
-
-    def find_links(self, args=None, source=None):
-        soup = BeautifulSoup(self.source, 'html.parser')
-        links = soup.find_all('a')
-        hrefs = []
-
-        for link in links:
-            if link['href'].startswith('/'):
-                hrefs.append("".join([self.url, link['href']]))
-            else:
-                hrefs.append(link['href'])
-
-        return hrefs
-
-
-def run(args=None):
-    source = get_source(args)
-    urls = find_links(args, source)
-    local_urls = parse_urls(args, urls)
-
-    print(local_urls)
-
-def get_source(args=None):
-    url = args.url
-    useragent = 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'
-    headers = {'User-Agent': useragent}
-    request = urllib.request.Request(url, headers=headers)
-    page = urllib.request.urlopen(request)
-    source = page.read()
-
-    return source
-
-
-def find_links(args=None, source=None):
-    soup = BeautifulSoup(source, 'html.parser')
-    links = soup.find_all('a')
-    hrefs = []
-
-    for link in links:
-        if link['href'].startswith('/'):
-            hrefs.append("".join([args.url, link['href']]))
-        else:
-            hrefs.append(link['href'])
-
-    return hrefs
-
-
-def parse_urls(args=None, urls=None):
-    local_urls = []
-
-    for url in urls:
-        if url.startswith(args.url):
-            local_urls.append(url)
-
-
-    return local_urls
-
-
-if __name__ == "__main__":
-
-    parser = argparse.ArgumentParser(description='Recursive web scraper')
-    parser.add_argument("-u", "--url", required=True, help="Base url to scrape")
-    args = parser.parse_args()
-
-    if not args.url.startswith('http'):
-        raise SystemExit('URL must start with a protocol (http(s)).')
-
-    run(args)
--- a/templates/sitemap.html.j2
+++ b/templates/sitemap.html.j2
@@ -0,0 +1,14 @@
+<html>
+<head>
+ <title>Sitemap for {{ base_url }}</title>
+</head>
+<body>
+<p>
+Crawled {{ urlcount }} URLs on {{ rooturl }} in ~{{ runtime }} seconds.
+<ul>
+{% for url in urls %}
+  <li><a href="{{ url }}">{{ url }}</a></li>
+{% endfor %}
+</ul>
+</body>
+</html>
--- a/test/files/find_all_urls.html
+++ b/test/files/find_all_urls.html
@@ -0,0 +1,10 @@
+<html>
+<body>
+<p>
+<ul>
+  <li><a href="http://eu.httpbin.org/a/">http://eu.httpbin.org/a/</a></li>
+  <li><a href="http://eu.httpbin.org/b/">http://eu.httpbin.org/b/</a></li>
+  <li><a href="http://eu.httpbin.org/c/">http://eu.httpbin.org/c/</a></li>
+</ul>
+</body>
+</html>
--- a/test_helpers.py
+++ b/test_helpers.py
@@ -1,37 +1,66 @@
 #!/usr/bin/env python

 import unittest
-from utils.helpers import (clean_base_url)
+from unittest import mock
+from utils.helpers import RobotsTxt, standardise_url
+
+
+class TestRobots(unittest.TestCase):
+
+    rooturl = 'http://eu.httpbin.org'
+    no_robots = 'https://www.simonweald.com'
+
+    test_paths = (('/', True), ('/deny', False))
+
+    robots = RobotsTxt(rooturl=rooturl)
+    norobots = RobotsTxt(rooturl=no_robots)
+
+    def test_robots_txt_deny(self):
+        '''
+        Asserts result is True or False.
+        '''
+        for path, allowed in self.test_paths:
+            result = self.robots.check(url=path)
+            self.assertIs(result, allowed)
+
+    def test_no_robots_txt(self):
+        '''
+        Ensure we can crawl if robots.txt isn't present.
+        '''
+        result = self.norobots.check(url='/')
+        self.assertTrue(result)
+

 class TestUrls(unittest.TestCase):

-    base_url = "github.com"
+    rooturl = 'http://eu.httpbin.org'

-    base_url_list = (('eu.httpbin.org', 'http://eu.httpbin.org'),
-                     ('www.simonweald.com', 'http://www.simonweald.com'),
-                     ('http://www.github.com/', 'http://www.github.com'),
-                     ('https://www.github.com', 'https://www.github.com'))
+    rooturl_list = (('eu.httpbin.org', 'http://eu.httpbin.org'),
+                     ('http://eu.httpbin.org/', 'http://eu.httpbin.org'),
+                     ('https://eu.httpbin.org', 'https://eu.httpbin.org'))

-    valid_urls = ["https://www.github.com", "http://www.github.com",
-                  "github.com", "/some/url/", "index.html"]
+    urls_to_clean = (('http://eu.httpbin.org', 'http://eu.httpbin.org'),
+                     ('http://eu.httpbin.org/some/path/', 'http://eu.httpbin.org/some/path/'),
+                     ('http://eu.httpbin.org/index.html','http://eu.httpbin.org/index.html'),
+                     ('http://eu.httpbin.org/index.html?foo=bar', 'http://eu.httpbin.org/index.html'),
+                     ('http://eu.httpbin.org/index.html#anchor', 'http://eu.httpbin.org/index.html'))

-
-    def test_clean_base_url(self):
+    def test_standardise_rooturl(self):
        '''
-        Tests whether a URL's protocol can be discovered if not provided.
+        Tests whether a base URL can be standardised to the format
+        proto://[sub].domain.tld.
        '''
-        for url, target in self.base_url_list:
-            result = clean_base_url(url)
+        for url, target in self.rooturl_list:
+            result = standardise_url(url)
            self.assertEqual(result, target)

-    # def test_url_validation(self):
-    #     '''
-    #     Passes when given a valid URL. A valid URL is qualified
-    #     by being local to the domain to be crawled.
-    #     '''
-    #     for url in self.valid_urls:
-    #         result = url_validation(self.base_url, url)
-    #         self.assertTrue(result)
+    def test_standardise_url(self):
+        '''
+        Ensure that fragments/anchors etc are stripped.
+        '''
+        for url, target in self.urls_to_clean:
+            result = standardise_url(url, rooturl=self.rooturl)
+            self.assertEqual(result, target)


 if __name__ == '__main__':
--- a/utils/helpers.py
+++ b/utils/helpers.py
@@ -3,97 +3,238 @@
 Utilities to provide various misc functions.
 '''

-import urllib.request
+import aiohttp
+import asyncio
 from bs4 import BeautifulSoup
-from urllib.parse import (urljoin, urlsplit)
+import urllib.error
+from urllib.parse import urljoin, urlsplit
+import urllib.request
+import urllib.robotparser


-class UrlPool(object):
+class AsyncCrawler(object):
    '''
-    Object to manage a pool of URLs.
+    A concurrent recursive web crawler.
+
+    A recursive web crawler which finds all URLs local to the domain specified
+    in the `rooturl` argument.
+
+    Arguments:
+        rooturl:        Root domain to begin crawling.
+        robots:         RobotsTxt object for the rooturl.
+        concurrency:    number of concurrent pages to crawl.
+
+    Returns:
+        All discovered pages in a set.
    '''

-    def __init__(self):
-        self.url_pool = set()
+    def __init__(self, rooturl=None, robots=None, concurrency=None):
+        self.rooturl = rooturl
+        self.robots = robots
+        self.crawled = set()
+        self.headers = {'Accept-Encoding': 'gzip, deflate',
+                        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
+        self.client_session = None
+        self.semaphore = asyncio.BoundedSemaphore(concurrency)

-    def check_duplicate(self, new_url):
+    async def crawl_url(self, url=None):
        '''
-        Checks if a URL exists in the current pool.
+        Crawls the given URL and finds all new URLs in the given page.
        '''
-        if new_url in self.url_pool:
-            return True
+        urls = []
+        source = await self.get_source(url)
+        if source:
+            urls = self.find_all_urls(source)
+
+        return urls
+
+    def validate_url(self, url=None):
+        '''
+        Ensures we have a valid URL to crawl and that the site's robots.txt
+        allows it.
+        '''
+        # ensure the URL is in a sane format
+        url = standardise_url(url=url, rooturl=self.rooturl)
+
+        if url and self.robots.check(url=url):
+            return url
        else:
            return False

-    def invalidate_url(self, url):
-        self.url_pool.remove(url)
-
-    def add_to_list(self, url):
-        self.url_pool.add(url)
-
-
-class WebPage(object):
-
-    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
-
-    def __init__(self, url):
-        self.url = url
-
-    def get_source(self):
-        request = urllib.request.Request(self.url, headers=self.headers)
-        page = urllib.request.urlopen(request)
-        self.source = page.read()
-
-    def find_links(self):
-        soup = BeautifulSoup(self.source, 'html.parser')
-        links = soup.find_all('a')
-        hrefs = []
-
-        for link in links:
-            if link['href'].startswith('/'):
-                hrefs.append("".join([self.url, link['href']]))
+    async def get_source(self, url=None):
+        '''
+        Obtains the URL's source, provided it is HTML. Usage of semaphores
+        ensures only a certain number of coroutines can run at any given
+        time.
+        '''
+        async with self.semaphore:
+            async with self.client_session.head(url, timeout=5) as head:
+                try:
+                    _ = await head.read()
+                except Exception:
+                    pass
+            if 'text/html' in head.headers['Content-Type']:
+                async with self.client_session.get(url, timeout=5) as resp:
+                    try:
+                        source = await resp.read()
+                        return source
+                    except Exception:
+                        return None
            else:
-                hrefs.append(link['href'])
+                return None

-        self.hrefs = hrefs
+    def find_all_urls(self, source=None):
+        '''
+        Find all URLs in a page's source. Returns a list of URLs which have
+        been validated as local to the starting URL.
+        '''
+        urls = []

-    def parse_urls(self):
-        local_urls = []
-        for url in self.hrefs:
-            if url.startswith(self.url):
-                local_urls.append(url)
+        html = BeautifulSoup(source, 'lxml')
+        hrefs = html.find_all('a', href=True)

-        return local_urls
+        # build a set of URLs which are valid and haven't been crawled yet
+        for href in hrefs:
+            url = self.validate_url(url=href['href'])
+            if url and url not in self.crawled:
+                urls.append(url)
+
+        return urls
+
+    async def run(self, urls=None):
+        '''
+        Crawls a batch of URLs of any size (resource usage is bounded by a
+        semaphore of size n, where n = concurrency). Returns a set of URLs to
+        be added to the list of URLs which need to be crawled (find_all_urls
+        only returns unseen URLs).
+        '''
+        tasks = []
+        all_urls = set()
+        for url in urls:
+            # mark the URL as seen.
+            self.crawled.add(url)
+            # create a task to crawl the URL.
+            tasks.append(self.crawl_url(url))
+
+        # wait for all tasks to complete.
+        for task in asyncio.as_completed(tasks):
+            urls = None
+            try:
+                # try getting all tasks as completed.
+                urls = await task
+            except Exception:
+                # skip until all tasks have completed.
+                pass
+
+            # add the URLs to a set to be returned.
+            if urls:
+                for url in urls:
+                    all_urls.add(url)
+
+        return all_urls
+
+    async def main(self):
+        '''
+        Runs a crawl with batches of URLs. Once complete returns a set of all
+        crawled URLs.
+        '''
+        self.client_session = aiohttp.ClientSession(headers=self.headers)
+        to_crawl = []
+        # add the root URL to initialise the crawler.
+        to_crawl.append(self.rooturl)
+
+        print('Crawling: {0}'.format(self.rooturl))
+        while len(to_crawl) > 0:
+            discovered_urls = await self.run(urls=to_crawl)
+            # empty to_crawl list and then add all newly discovered URLs for
+            # the next iteration.
+            to_crawl.clear()
+            to_crawl.extend(discovered_urls)
+            print('{0} URLs crawled'.format(len(self.crawled)))
+
+        # close the session once all URLs have been crawled.
+        await self.client_session.close()
+
+        return self.crawled


-def sanitise_url(url):
+class RobotsTxt(object):
    '''
-    Attempt to standardise the base url to ensure it can be prepended to
-    relative URLs. If no scheme provided then we default to http as any
-    sane https-only site should 301 redirect http > https.
+    Retrieve and query robots.txt for a given domain.

-    Returns a corrected base URL as a string.
+    Retrieves and parses robots.txt for the given domain. Calling the check()
+    method returns True or False depending on whether crawling of that given
+    URL is allowed.
+    '''
+
+    def __init__(self, rooturl=None):
+        '''
+        Manually retrieve robots.txt to allow us to set the user-agent (works
+        around sites which disallow access to robots.txt without a sane
+        user-agent).
+        '''
+        self.rooturl = rooturl
+        self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
+
+        robots_url = urljoin(self.rooturl, 'robots.txt')
+        request = urllib.request.Request(robots_url, headers=self.headers)
+
+        robots = urllib.robotparser.RobotFileParser()
+        robots.set_url(robots_url)
+
+        try:
+            response = urllib.request.urlopen(request, timeout=5)
+        except urllib.error.HTTPError:
+            # if robots.txt doesn't exist then allow all URLs to be crawled.
+            robots.allow_all = True
+        else:
+            data = response.read()
+            decoded_data = data.decode("utf-8").splitlines()
+            robots.parse(decoded_data)
+
+        self.robots = robots
+
+    def check(self, url):
+        '''
+        Test if robots allows us to crawl that URL.
+        '''
+        return self.robots.can_fetch("*", url)
+
+
+def standardise_url(url=None, rooturl=None):
+    '''
+    If `rooturl` is None then we attempt to standardise the URL to ensure it
+    can be prepended to relative URLs. If no scheme has been provided then we
+    default to http as any sane https-only site should 301 redirect http to
+    https.
+
+    If `rooturl` is set, we standardise URLs to strip queries and fragments
+    (we don't want to scrape in-page anchors etc). Any relative URLs will be
+    appended to the root url.
+
+    Returns a standardised URL as a string.
    '''
    default_proto = 'http'
    delim = '://'
+    file_extensions = ('/', 'htm', 'html', 'xhtml', 'php', 'asp', 'aspx',
+                       'cfm')

    split_url = urlsplit(url)
-    if split_url.scheme and split_url.scheme.startswith('http'):
-        base_url = "".join([split_url.scheme, delim, split_url.netloc])
-    elif (split_url.path and not split_url.scheme and not split_url.netloc):
-        base_url = "".join([default_proto, delim, split_url.path])

-    return base_url
+    if not rooturl:
+        # This will sanitise the initial url provided by the user.
+        if split_url.scheme and split_url.scheme.startswith('http'):
+            return "".join([split_url.scheme, delim, split_url.netloc])
+        elif (split_url.path and not split_url.scheme and not split_url.netloc):
+            return "".join([default_proto, delim, split_url.path])
+    else:
+        # if url.endswith(file_extensions):
+        # Sanitise discovered URLs. We already expect them in the format
+        # protocol://rooturl/path
+        if url.startswith('/'):
+            return urljoin(rooturl, split_url.path)
+        elif url.startswith(rooturl):
+            return "".join([split_url.scheme, delim, split_url.netloc,
+                            split_url.path])

-
-def qualify_url(base_url=None, url=None):
-    '''
-    Ensure any URLs discovered are absolute. If relative,
-    they will be appended to the base URL. Returns an
-    absolute URL as a string.
-    '''
-
-    if url.startswith('/'):
-        return urljoin(base_url, url)
-    if url.startswith(base_url):
-        return url
+    return None
Author	SHA1	Message	Date
Simon Weald	5f7d66912f	add test files	2018-09-19 08:39:05 +01:00
Simon Weald	d4cd93e3d4	update docs	2018-09-19 08:38:49 +01:00
Simon Weald	f5f6afd1a4	correct tests with new arg names	2018-09-19 08:37:55 +01:00
Simon Weald	679b1b7b53	rename all instances of base_url to rooturl, add more documentation	2018-09-18 18:24:15 +01:00
Simon Weald	32d7f1e54b	add talking points	2018-09-18 18:23:12 +01:00
Simon Weald	f6265f18a7	initial test for AsyncCrawler	2018-09-18 18:22:55 +01:00
Simon Weald	9a4e9ddfc7	add test for missing robots.txt	2018-09-18 10:53:13 +01:00
Simon Weald	51f988e1bc	added more tests	2018-09-17 21:44:20 +01:00
Simon Weald	73c21e5bd3	small improvements to docs and variables	2018-09-17 21:44:04 +01:00
Simon Weald	eb2395d461	minor change to README	2018-09-17 08:11:26 +01:00
Simon Weald	c53f62b55d	add most changes suggested by pycodestyle	2018-09-16 16:10:38 +01:00
Simon Weald	75d3756bbc	fix errors discovered by pycyodestyle	2018-09-16 16:04:07 +01:00
Simon Weald	5262c23281	add flags to README	2018-09-16 15:58:17 +01:00
Simon Weald	524f6a45cd	improve documentation	2018-09-16 15:53:47 +01:00
Simon Weald	a926090bed	update requirements	2018-09-16 15:44:30 +01:00
Simon Weald	91cd988f52	more comments and progress output	2018-09-16 15:26:49 +01:00
Simon Weald	f1855f5add	re-order imports because I'm fussy	2018-09-16 09:06:30 +01:00
Simon Weald	336517e84a	more documentation and add back some required imports	2018-09-16 09:00:43 +01:00
Simon Weald	7bc9fe0679	improved documentation and remove unneeded set	2018-09-16 08:56:44 +01:00
Simon Weald	6548f55416	improve documentation	2018-09-15 21:48:50 +01:00
Simon Weald	0244435fea	remove unecessary imports	2018-09-15 21:38:51 +01:00
Simon Weald	d6964672b6	commit of working async crawler	2018-09-15 21:30:02 +01:00
Simon Weald	3808f72f73	correct semaphore usage	2018-09-14 16:06:17 +01:00
Simon Weald	7ebe4855b8	remove unecessary classes2	2018-09-14 16:02:20 +01:00
Simon Weald	db986b0eba	async crawler in a mostly-working state	2018-09-14 16:01:12 +01:00
Simon Weald	36e1f7693f	initial foray into asynchronous crawling	2018-09-12 22:54:12 +01:00
Simon Weald	8698c21fda	return from WebPage to indicate whether a link was actually crawlable and only actually crawl it if it was	2018-09-12 08:03:08 +01:00
Simon Weald	273cf56a3b	add some basic tests	2018-09-11 13:42:15 +01:00
Simon Weald	1af26f50f2	added a docstring	2018-09-11 13:42:02 +01:00
Simon Weald	c40c5cea50	add async info	2018-09-10 21:29:46 +01:00
Simon Weald	a6224f9b6a	updated readme	2018-09-10 20:56:12 +01:00
Simon Weald	b64711973f	add new thoughts	2018-09-10 11:58:58 +01:00
Simon Weald	9e125dfae0	added comments and docstrings	2018-09-09 22:49:55 +01:00
Simon Weald	f16f82fdfb	improved completion message	2018-09-09 22:40:42 +01:00
Simon Weald	a523154848	display count of crawled/uncrawled URLs whilst running	2018-09-09 22:35:55 +01:00
Simon Weald	9e754a5584	improve handling of gzip/deflated data detection	2018-09-09 11:21:46 +01:00
Simon Weald	1b005570ee	implement gzip compression requests and handling	2018-09-09 10:53:09 +01:00
Simon Weald	17fa9f93f9	tick off gzip encoding	2018-09-09 10:52:37 +01:00
Simon Weald	1e51e10db2	update with changes	2018-09-09 10:22:18 +01:00
Simon Weald	225fd8b3ea	update with changes	2018-09-09 10:22:03 +01:00
Simon Weald	d686ae0bc4	update with changes	2018-09-09 10:21:45 +01:00
Simon Weald	69f5788745	update notes	2018-09-09 10:16:22 +01:00
Simon Weald	b5d644a223	various minor improvements to exception handling	2018-09-09 10:16:03 +01:00
Simon Weald	6508156aa4	use lxml as the parser and only find links on a page if we've got the source	2018-09-09 10:06:25 +01:00
Simon Weald	738ab8e441	adjust robots handling to deal with 404s and enforce a user agent which allows us to initially obtain the user agent	2018-09-09 09:57:16 +01:00
Simon Weald	fdd84a8786	manually retrieve robots.txt to ensure we can set the user-agent	2018-09-07 12:40:12 +01:00
Simon Weald	ab0ab0a010	add more thoughts	2018-09-07 11:50:53 +01:00
Simon Weald	6a1259aa7d	update plans to add gzip encoding	2018-09-06 17:33:10 +01:00
Simon Weald	164239b343	more thoughts	2018-09-06 17:31:12 +01:00
Simon Weald	ce1f2745c9	update thoughts	2018-09-06 17:30:28 +01:00
Simon Weald	e70bdc9ca1	update requirements.txt	2018-09-06 17:25:30 +01:00
Simon Weald	d1c1e17f4f	report runtime of script in generated sitemap	2018-09-06 17:20:59 +01:00
Simon Weald	816a727d79	ignore generated file	2018-09-06 17:08:56 +01:00
Simon Weald	84ab27a75e	render results as HTML	2018-09-06 17:08:26 +01:00
Simon Weald	6d9103c154	improved content-type detection	2018-09-06 17:08:12 +01:00
Simon Weald	e57a86c60a	only attempt to read html	2018-09-06 16:30:11 +01:00
Simon Weald	a3ec9451e3	implement parsing of robots.txt	2018-09-05 18:56:20 +01:00
Simon Weald	f2c294ebdb	added new ideas to implement	2018-09-04 15:40:11 +01:00
Simon Weald	1b9b207a28	attempt to remove base url with trailing slash (if discovered)	2018-09-04 13:57:52 +01:00
Simon Weald	6abe7d68e0	updated notes	2018-09-04 12:51:59 +01:00
Simon Weald	7d919039b6	removed unecessary modules	2018-09-04 10:14:27 +01:00
Simon Weald	0726bcccb0	removed original file	2018-09-04 09:21:55 +01:00
Simon Weald	05e907ecec	too many changes to make a sensible commit message	2018-09-04 09:21:26 +01:00
Simon Weald	abc628106d	added a docstring to the WebPage object	2018-08-31 19:18:00 +01:00
Simon Weald	c436016e0c	remove unecessary function	2018-08-31 19:16:08 +01:00
Simon Weald	03554fde80	add docstrings	2018-08-31 19:15:35 +01:00
Simon Weald	759f965e95	use more explicit names, use urljoin to combine urls	2018-08-31 19:12:58 +01:00
Simon Weald	0517e5bc56	crawler now initialises and populates crawled pool with urls it finds	2018-08-31 19:02:21 +01:00
Simon Weald	1b18aa83eb	corrected some small errors and added runner function	2018-08-31 19:01:35 +01:00