Compare commits

...

16 Commits

4 changed files with 253 additions and 132 deletions

README.md
View File

@@ -2,7 +2,7 @@
## Requirements
This crawler was written in 3.7.0 to take advantage of the latest `asyncio` features.
This crawler requires at least Python 3.5 in order to utilise the async/await keywords from `asyncio`.
Install required modules:
@@ -13,9 +13,16 @@ pip install -r requirements.txt
Run:
```bash
python crawler.py -u https://urltocrawl.com
python crawler.py -u https://urltocrawl.com [-c 100]
```
Flags:
- -u/--url https://url.com
  - The base URL is required.
- -c/--concurrency 100
  - Specifying a concurrency value is optional (defaults to 100).
## Results
The resulting sitemap will be output in the root of this directory as `sitemap.html`
The resulting sitemap will be output to the root of this directory as `sitemap.html`

async_crawler.py Normal file (95 additions)
View File

@@ -0,0 +1,95 @@
#!/usr/bin/env python
'''
Asynchronous web crawler written in Python 3.5+.

This script will respect the site's `robots.txt`, if one exists. If not, all
URLs discovered will be crawled.

The crawler takes a total of two arguments (concurrency is optional):
    url: the base URL to begin the crawl from.
    concurrency: the maximum number of pages which may be crawled concurrently.
'''
import argparse
import asyncio
from datetime import datetime
import jinja2
import os
import sys

from utils.helpers import RobotsTxt, AsyncCrawler, standardise_url


def sanity_checks(url=None):
    '''
    Runs some basic sanity checks before the crawler is initialised.
    '''
    # ensure we have a sensible URL to work with
    baseurl = standardise_url(url=url)
    # get robots.txt
    robots = RobotsTxt(base_url=baseurl)
    # fail early if robots denies all crawling
    if not robots.check(url=baseurl):
        sys.exit("{baseurl} cannot be crawled (denied by robots.txt)".format(
            baseurl=baseurl))
    return(baseurl, robots)


def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
    '''
    Renders the sitemap to an HTML file.
    '''
    urlcount = len(crawled_urls)
    sorted_urls = sorted(crawled_urls)
    template = jinja2.Environment(
        loader=jinja2.FileSystemLoader('templates')
    ).get_template('sitemap.html.j2')
    rendered_html = template.render(base_url=base_url, urlcount=urlcount,
                                    urls=sorted_urls, runtime=runtime)
    with open('sitemap.html', 'w') as outfile:
        outfile.write(rendered_html)
    print('Sitemap available at {0}/sitemap.html'.format(os.getcwd()))


def main():
    '''
    Main function, responsible for prepping and running the crawler and
    rendering the sitemap.
    '''
    starttime = datetime.now()
    baseurl, robots = sanity_checks(url=args.url)
    # create a crawler
    async_crawler = AsyncCrawler(baseurl=baseurl, robots=robots,
                                 concurrency=args.concurrency)
    # create a task to run the crawler, run the loop and then gather the
    # results.
    task = asyncio.Task(async_crawler.main())
    loop = asyncio.get_event_loop()
    loop.run_until_complete(task)
    loop.close()
    results = sorted(task.result())
    runtime = int((datetime.now() - starttime).total_seconds())
    render_sitemap(base_url=baseurl, crawled_urls=results, runtime=runtime)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Recursive web crawler')
    parser.add_argument("-u", "--url", required=True, help="Initial url")
    parser.add_argument("-c", "--concurrency", required=False, type=int,
                        default=100, help="Max pages to crawl concurrently")
    args = parser.parse_args()
    main()
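Note: `render_sitemap()` assumes a `templates/sitemap.html.j2` file that is not part of this diff; only the variables passed to it (`base_url`, `urlcount`, `urls`, `runtime`) are visible here. A minimal sketch of that rendering contract, using a hypothetical inline `DictLoader` template purely for illustration (the real template's markup may differ):

```python
import jinja2

# Hypothetical stand-in for templates/sitemap.html.j2; the real template is
# not part of this diff, only the variables render_sitemap() passes to it.
SITEMAP_TEMPLATE = '''<h1>Sitemap for {{ base_url }}</h1>
<p>{{ urlcount }} URLs crawled in {{ runtime }} seconds.</p>
<ul>
{% for url in urls %}<li><a href="{{ url }}">{{ url }}</a></li>
{% endfor %}</ul>'''

template = jinja2.Environment(
    loader=jinja2.DictLoader({'sitemap.html.j2': SITEMAP_TEMPLATE})
).get_template('sitemap.html.j2')

print(template.render(base_url='http://example.com', urlcount=2,
                      urls=['http://example.com/', 'http://example.com/about'],
                      runtime=3))
```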

requirements.txt
View File

@@ -1,8 +1,12 @@
aiohttp==3.4.4
async-timeout==3.0.0
attrs==18.2.0
beautifulsoup4==4.6.3
bs4==0.0.1
certifi==2018.8.13
chardet==3.0.4
idna==2.7
Jinja2==2.10
lxml==4.2.4
MarkupSafe==1.0
multidict==4.4.0
yarl==1.2.6

utils/helpers.py
View File

@@ -3,139 +3,143 @@
Utilities to provide various misc functions.
'''
from bs4 import BeautifulSoup
import aiohttp
import asyncio
from bs4 import BeautifulSoup
import urllib.error
from urllib.parse import urljoin, urlsplit
import urllib.request
import urllib.robotparser
import urllib.error
import gzip
from urllib.parse import (urljoin, urlsplit)
class UrlPool(object):
class AsyncCrawler(object):
'''
Object to manage a pool of URLs.
Asynchronous crawler which discovers and fetches all URLs local to the
base URL, with concurrency bounded by a semaphore.
'''
def __init__(self):
self.pool = set()
def check_duplicate(self, new_url):
'''
Checks if a URL exists in the current pool.
'''
if new_url in self.pool:
return True
else:
return False
def remove_from_pool(self):
'''
Remove a URL from the pool and return it to be crawled.
'''
return(self.pool.pop())
def add_to_pool(self, url):
self.pool.add(url)
def list_pool(self):
pool = self.pool
return pool
class WebPage(object):
'''
Object to manage common operations required to return
the data from each individual page.
'''
# set a sane user-agent and request compression if available.
headers = {'Accept-Encoding': 'gzip, deflate',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
def __init__(self, url=None, base_url=None, robots=None):
self.url = url
self.base_url = base_url
def __init__(self, baseurl=None, robots=None, concurrency=None):
self.baseurl = baseurl
self.robots = robots
self.source = None
self.urls_to_crawl = set()
self.crawled = set()
self.headers = {'Accept-Encoding': 'gzip, deflate',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
self.client_session = None
self.semaphore = asyncio.BoundedSemaphore(concurrency)
def get_source(self):
async def crawl_url(self, url=None):
'''
Retrieve a page's source.
Crawls the given URL and finds all new URLs on the retrieved page.
'''
urls = []
source = await self.get_source(url)
if source:
urls = self.find_all_urls(source)
request = urllib.request.Request(self.url, headers=self.headers)
page = urllib.request.urlopen(request, timeout=5)
return urls
# handle the content encoding in case it needs decompressing.
if 'text/html' in page.info().get('Content-Type'):
if page.info().get('Content-Encoding'):
if page.info().get('Content-Encoding') == 'gzip':
self.source = gzip.decompress(page.read())
elif page.info().get('Content-Encoding') == 'deflate':
self.source = page.read()
else:
self.source = page.read()
def find_links(self):
def validate_url(self, url=None):
'''
Find all URLs on a page and ensure they are absolute. If they are
relative then they will be appended to the base URL.
Ensures we have a valid URL to crawl and that the site's robots.txt
allows it.
'''
hrefs = set()
# ensure the URL is in a sane format
url = standardise_url(url=url, base_url=self.baseurl)
soup = BeautifulSoup(self.source, 'lxml')
links = soup.find_all('a', href=True)
for link in links:
if link['href'].startswith('/'):
hrefs.add(urljoin(self.url, link['href']))
else:
hrefs.add(link['href'])
self.discovered_hrefs = hrefs
def parse_urls(self):
'''
Iterate through the list of discovered URLs and add them to the
pool if they start with the base URL.
'''
for url in self.discovered_hrefs:
if url.startswith(self.base_url) and self.robots.check(url):
sanitised_url = sanitise_url(url=url)
self.urls_to_crawl.add(sanitised_url)
def list_urls(self):
'''
Returns all valid discovered URLs.
'''
return self.urls_to_crawl
def run(self):
'''
Attempt to get the page's source and if successful, iterate through it
to find any links we can crawl.
'''
try:
self.get_source()
except Exception:
# skip if we didn't retrieve the source.
pass
if self.source:
self.find_links()
self.parse_urls()
return True
if url and self.robots.check(url=url):
return url
else:
return False
async def get_source(self, url=None):
'''
Obtains the URL's source, provided it is HTML. Usage of semaphores
ensures only a certain number of coroutines can run at once.
'''
async with self.semaphore:
async with self.client_session.head(url, timeout=5) as head:
try:
data = await head.read()
except Exception as e:
print(e)
# .get() avoids a KeyError when the response carries no Content-Type header
if 'text/html' in head.headers.get('Content-Type', ''):
async with self.client_session.get(url, timeout=5) as resp:
try:
source = await resp.read()
return source
except Exception:
return None
def find_all_urls(self, source=None):
'''
Find all URLs in a page's source. Returns a list of URLs which have
been validated as local to the starting URL.
'''
urls = []
html = BeautifulSoup(source, 'lxml')
hrefs = html.find_all('a', href=True)
# build a set of URLs which are valid and haven't been crawled yet
for href in hrefs:
url = self.validate_url(url=href['href'])
if url and url not in self.crawled:
urls.append(url)
return urls
async def run(self, urls=None):
'''
Crawls a batch of URLs of any size (resource usage is bounded by a
semaphore of size n, where n = concurrency). Returns a set of URLs to be
added to the list of URLs which need to be crawled (find_all_urls only
returns unseen URLs).
'''
tasks = []
all_urls = set()
for url in urls:
# mark the URL as seen.
self.crawled.add(url)
# create a coroutine to crawl the URL.
tasks.append(self.crawl_url(url))
# wait for all tasks to complete.
for task in asyncio.as_completed(tasks):
urls = None
try:
urls = await task
except Exception as e:
print(e)
# add the URLs to a set to be returned.
if urls:
for url in urls:
print('Found: {0}'.format(url))
all_urls.add(url)
return all_urls
async def main(self):
'''
Runs a crawl with batches of URLs. Once complete, returns the set of all
crawled URLs.
'''
self.client_session = aiohttp.ClientSession(headers=self.headers)
to_crawl = []
to_crawl.append(self.baseurl)
print('Crawling: {0}'.format(self.baseurl))
while len(to_crawl) > 0:
discovered_urls = await self.run(urls=to_crawl)
# empty the to_crawl list and then add all newly discovered URLs for
# the next iteration.
to_crawl.clear()
to_crawl.extend(discovered_urls)
# close the session once all URLs have been crawled.
await self.client_session.close()
return self.crawled
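The concurrency model above pairs an `asyncio.BoundedSemaphore` (in `get_source`) with `asyncio.as_completed` (in `run`). A stripped-down, self-contained sketch of that pattern, with a dummy `fetch` coroutine standing in for the aiohttp calls:

```python
import asyncio

# fetch() stands in for AsyncCrawler.get_source(): the bounded semaphore
# caps how many coroutines may be "in flight" at once.
async def fetch(url, semaphore):
    async with semaphore:
        await asyncio.sleep(0.1)  # pretend network I/O
        return url

# crawl_batch() mirrors AsyncCrawler.run(): schedule every URL, then gather
# results as each task completes.
async def crawl_batch(urls, concurrency=3):
    semaphore = asyncio.BoundedSemaphore(concurrency)
    tasks = [fetch(url, semaphore) for url in urls]
    results = set()
    for task in asyncio.as_completed(tasks):
        results.add(await task)
    return results

loop = asyncio.get_event_loop()
print(loop.run_until_complete(crawl_batch(['/a', '/b', '/c', '/d'])))
loop.close()
```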
class RobotsTxt(object):
'''
@@ -144,7 +148,9 @@ class RobotsTxt(object):
def __init__(self, base_url=None):
'''
Manually retrieve robots.txt to allow us to set the user-agent.
Manually retrieve robots.txt to allow us to set the user-agent (works
around sites which disallow access to robots.txt without a sane
user-agent).
'''
self.base_url = base_url
self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
@@ -158,6 +164,7 @@ class RobotsTxt(object):
try:
response = urllib.request.urlopen(request, timeout=5)
except urllib.error.HTTPError:
# if robots.txt doesn't exist then allow all URLs to be crawled.
robots.allow_all = True
else:
data = response.read()
@@ -166,7 +173,6 @@ class RobotsTxt(object):
self.robots = robots
def check(self, url):
'''
Test if robots allows us to crawl that URL.
@@ -174,31 +180,40 @@ class RobotsTxt(object):
return self.robots.can_fetch("*", url)
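As the docstring notes, robots.txt is fetched manually so a browser-like user-agent can be sent, and the body is then handed to `urllib.robotparser`. A condensed sketch of that approach (`load_robots` is a hypothetical helper name; the error handling mirrors the class above):

```python
import urllib.error
import urllib.request
import urllib.robotparser

# load_robots() is a hypothetical, condensed version of the RobotsTxt class:
# fetch robots.txt with a browser-like User-Agent, then feed the body to
# RobotFileParser.
def load_robots(base_url, user_agent='Mozilla/5.0'):
    robots = urllib.robotparser.RobotFileParser()
    request = urllib.request.Request(base_url + '/robots.txt',
                                     headers={'User-Agent': user_agent})
    try:
        response = urllib.request.urlopen(request, timeout=5)
    except urllib.error.HTTPError:
        # no robots.txt: allow all URLs to be crawled
        robots.allow_all = True
    else:
        robots.parse(response.read().decode('utf-8').splitlines())
    return robots

robots = load_robots('http://example.com')
print(robots.can_fetch('*', 'http://example.com/some/page'))
```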
def sanitise_url(url, base_url=False):
def standardise_url(url=None, base_url=None):
'''
If `base_url` is True, we attempt to standardise `url` to ensure it can be
prepended to relative URLs. If no scheme has been provided then we default
to http as any sane https-only site should 301 redirect http > https.
If `base_url` is None then we attempt to standardise the URL to ensure it
can be prepended to relative URLs. If no scheme has been provided then we
default to http as any sane https-only site should 301 redirect http to
https.
If `base_url` is False, we sanitise URLs to strip queries and fragments (we
don't want to scrape in-page anchors etc).
If `base_url` is set, we standardise URLs to strip queries and fragments
(we don't want to scrape in-page anchors etc). Any relative URLs will be
appended to the base URL.
Returns a sanitised URL as a string.
Returns a standardised URL as a string.
'''
default_proto = 'http'
delim = '://'
file_extensions = ('/', 'htm', 'html', 'xhtml', 'php', 'asp', 'aspx',
'cfm')
split_url = urlsplit(url)
if base_url:
# This will sanitise the initial url for the initial page crawl.
if not base_url:
# This will sanitise the initial url provided by the user.
if split_url.scheme and split_url.scheme.startswith('http'):
sanitised_url = "".join([split_url.scheme, delim, split_url.netloc])
return "".join([split_url.scheme, delim, split_url.netloc])
elif (split_url.path and not split_url.scheme and not split_url.netloc):
sanitised_url = "".join([default_proto, delim, split_url.path])
return "".join([default_proto, delim, split_url.path])
else:
# if url.endswith(file_extensions):
# Sanitise discovered URLs. We already expect them in the format
# protocol://base_url/path
sanitised_url = "".join([split_url.scheme, delim, split_url.netloc, split_url.path])
if url.startswith('/'):
return urljoin(base_url, split_url.path)
elif url.startswith(base_url):
return "".join([split_url.scheme, delim, split_url.netloc,
split_url.path])
return sanitised_url
return None
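To make the two modes of `standardise_url` concrete, here are a few illustrative calls; the commented outputs are what the implementation shown above would return:

```python
from utils.helpers import standardise_url

# Sanitising the user-supplied starting URL (no base_url given):
print(standardise_url(url='example.com'))          # http://example.com
print(standardise_url(url='https://example.com'))  # https://example.com

# Standardising URLs discovered during the crawl (base_url given):
base = 'http://example.com'
print(standardise_url(url='/about', base_url=base))
# -> http://example.com/about
print(standardise_url(url='http://example.com/page?q=1#top', base_url=base))
# -> http://example.com/page (query and fragment stripped)
print(standardise_url(url='http://other.com/x', base_url=base))
# -> None (not local to the base URL)
```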