Compare commits
14 Commits
master...5262c23281

| SHA1 |
|---|
| 5262c23281 |
| 524f6a45cd |
| a926090bed |
| 91cd988f52 |
| f1855f5add |
| 336517e84a |
| 7bc9fe0679 |
| 6548f55416 |
| 0244435fea |
| d6964672b6 |
| 3808f72f73 |
| 7ebe4855b8 |
| db986b0eba |
| 36e1f7693f |
README.md (13 changed lines)

@@ -2,7 +2,7 @@
 
 ## Requirements
 
-This crawler was written in 3.7.0 to take advantage of the latest `asyncio` features.
+This crawler requires at least Python 3.5 in order to utilise the async/await keywords from `asyncio`.
 
 Install required modules:
 
@@ -13,9 +13,16 @@ pip install -r requirements.txt
 Run:
 
 ```bash
-python crawler.py -u https://urltocrawl.com
+python crawler.py -u https://urltocrawl.com [-c 100]
 ```
 
+Flags:
+
+- -u/--url https://url.com
+- The base URL is required.
+- -c/--concurrency 100
+- Specifying concurrency value is optional (defaults to 100).
+
 ## Results
 
-The resulting sitemap will be output in the root of this directory as `sitemap.html`
+The resulting sitemap will be output to the root of this directory as `sitemap.html`
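For example, an invocation with the concurrency flag spelled out (the URL below is purely illustrative):

```bash
python crawler.py -u https://example.com -c 50
```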
async_crawler.py (new file, 92 lines)

@@ -0,0 +1,92 @@
+#!/usr/bin/env python
+'''
+Asynchronous web crawler written in Python 3.5+.
+
+This script will respect the site's `robots.txt`, if one exists. If not, all
+URLs discovered will be crawled.
+
+The crawler takes a total of two arguments (concurrency is optional):
+
+url: the base URL to begin the crawl from.
+concurrency: the maximum number of pages which may be crawled concurrently.
+'''
+
+import argparse
+import asyncio
+from datetime import datetime
+import jinja2
+import os
+import sys
+from utils.helpers import RobotsTxt, AsyncCrawler, standardise_url
+
+
+def sanity_checks(url=None):
+    '''
+    Runs some basic sanity checks before the crawler is initialised.
+    '''
+    # ensure we have a sensible URL to work with
+    baseurl = standardise_url(url=url)
+    # get robots.txt
+    robots = RobotsTxt(base_url=baseurl)
+
+    # fail early if robots denies all crawling
+    if not robots.check(url=baseurl):
+        sys.exit("{baseurl} cannot be crawled (denied by robots.txt)".format(baseurl=baseurl))
+
+    return(baseurl, robots)
+
+
+def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
+    '''
+    Renders the sitemap to an HTML file.
+    '''
+    urlcount = len(crawled_urls)
+    sorted_urls = sorted(crawled_urls)
+
+    template = jinja2.Environment(
+        loader=jinja2.FileSystemLoader('templates')
+    ).get_template('sitemap.html.j2')
+
+    rendered_html = template.render(
+        base_url=base_url, urlcount=urlcount, urls=sorted_urls, runtime=runtime)
+
+    with open('sitemap.html', 'w') as outfile:
+        outfile.write(rendered_html)
+
+    print('Sitemap available at {0}/sitemap.html'.format(os.getcwd()))
+
+
+def main():
+    '''
+    Main function, responsible for prepping and running the crawler and
+    rendering the sitemap.
+    '''
+    starttime = datetime.now()
+
+    baseurl, robots = sanity_checks(url=args.url)
+
+    # create a crawler
+    async_crawler = AsyncCrawler(baseurl=baseurl, robots=robots, concurrency=args.concurrency)
+
+    # create a task to run the crawler, run the loop and then gather the results.
+    task = asyncio.Task(async_crawler.main())
+    loop = asyncio.get_event_loop()
+    loop.run_until_complete(task)
+    loop.close()
+    results = sorted(task.result())
+
+    runtime = int((datetime.now() - starttime).total_seconds())
+
+    render_sitemap(base_url=baseurl, crawled_urls=results, runtime=runtime)
+
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser(description='Recursive web crawler')
+    parser.add_argument("-u", "--url", required=True, help="Initial url to crawl")
+    parser.add_argument("-c", "--concurrency", required=False, type=int,
+                        default=100, help="Max number of pages to crawl concurrently")
+    args = parser.parse_args()
+
+    main()
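For reference, the task and event-loop pattern `main()` uses can be reduced to this minimal sketch; `crawl_stub` is a hypothetical stand-in for `AsyncCrawler.main()`:

```python
import asyncio

async def crawl_stub():
    # stand-in for AsyncCrawler.main(): pretend to crawl and return the URLs seen
    await asyncio.sleep(0)
    return {'http://example.com/'}

# schedule the coroutine as a task, drive the loop to completion, then read the result
task = asyncio.Task(crawl_stub())
loop = asyncio.get_event_loop()
loop.run_until_complete(task)
loop.close()
print(sorted(task.result()))
```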
requirements.txt

@@ -1,8 +1,12 @@
+aiohttp==3.4.4
+async-timeout==3.0.0
+attrs==18.2.0
 beautifulsoup4==4.6.3
 bs4==0.0.1
-certifi==2018.8.13
 chardet==3.0.4
 idna==2.7
 Jinja2==2.10
 lxml==4.2.4
 MarkupSafe==1.0
+multidict==4.4.0
+yarl==1.2.6
utils/helpers.py (268 changed lines)

@@ -3,140 +3,151 @@
 Utilities to provide various misc functions.
 '''
 
-from bs4 import BeautifulSoup
 import aiohttp
+import asyncio
+from bs4 import BeautifulSoup
+import urllib.error
+from urllib.parse import urljoin, urlsplit
 import urllib.request
 import urllib.robotparser
-import urllib.error
-import gzip
-from urllib.parse import (urljoin, urlsplit)
 
 
-class UrlPool(object):
+class AsyncCrawler(object):
     '''
-    Object to manage a pool of URLs.
+    Asynchronous crawler which fetches pages concurrently and collects the URLs it discovers.
     '''
 
-    def __init__(self):
-        self.pool = set()
-
-    def check_duplicate(self, new_url):
-        '''
-        Checks if a URL exists in the current pool.
-        '''
-        if new_url in self.pool:
-            return True
-        else:
-            return False
-
-    def remove_from_pool(self):
-        '''
-        Remove a URL from the pool and return it to be crawled.
-        '''
-        return(self.pool.pop())
-
-    def add_to_pool(self, url):
-        self.pool.add(url)
-
-    def list_pool(self):
-        pool = self.pool
-        return pool
-
-
-class WebPage(object):
-    '''
-    Object to manage common operations required to return
-    the data from each individual page.
-    '''
-
-    # set a sane user-agent and request compression if available.
-    headers = {'Accept-Encoding': 'gzip, deflate',
-               'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
-
-    def __init__(self, url=None, base_url=None, robots=None):
-        self.url = url
-        self.base_url = base_url
+    def __init__(self, baseurl=None, robots=None, concurrency=None):
+        self.baseurl = baseurl
         self.robots = robots
-        self.source = None
-        self.urls_to_crawl = set()
+        self.crawled = set()
+        self.headers = {'Accept-Encoding': 'gzip, deflate',
+                        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
+        self.client_session = None
+        self.semaphore = asyncio.BoundedSemaphore(concurrency)
 
-    def get_source(self):
+    async def crawl_url(self, url=None):
         '''
-        Retrieve a page's source.
+        Crawls the given URL and finds all new URLs in the initial page.
         '''
-
-        request = urllib.request.Request(self.url, headers=self.headers)
-        page = urllib.request.urlopen(request, timeout=5)
-
-        # handle the content encoding in case it needs decompressing.
-        if 'text/html' in page.info().get('Content-Type'):
-            if page.info().get('Content-Encoding'):
-                if page.info().get('Content-Encoding') == 'gzip':
-                    self.source = gzip.decompress(page.read())
-                elif page.info().get('Content-Encoding') == 'deflate':
-                    self.source = page.read()
-            else:
-                self.source = page.read()
+        urls = []
+        source = await self.get_source(url)
+        if source:
+            urls = self.find_all_urls(source)
+
+        return urls
 
-    def find_links(self):
+    def validate_url(self, url=None):
         '''
-        Find all URLs on a page and ensure they are absolute. If they are
-        relative then they will be appended to the base URL.
+        Ensures we have a valid URL to crawl and that the site's robots.txt
+        allows it.
         '''
-        hrefs = set()
-
-        soup = BeautifulSoup(self.source, 'lxml')
-        links = soup.find_all('a', href=True)
-
-        for link in links:
-            if link['href'].startswith('/'):
-                hrefs.add(urljoin(self.url, link['href']))
-            else:
-                hrefs.add(link['href'])
-
-        self.discovered_hrefs = hrefs
-
-    def parse_urls(self):
-        '''
-        Iterate through the list of discovered URLs and add them to the
-        pool if they start with the base URL.
-        '''
-        for url in self.discovered_hrefs:
-            if url.startswith(self.base_url) and self.robots.check(url):
-                sanitised_url = sanitise_url(url=url)
-                self.urls_to_crawl.add(sanitised_url)
-
-    def list_urls(self):
-        '''
-        Returns all valid discovered URLs.
-        '''
-        return self.urls_to_crawl
-
-    def run(self):
-        '''
-        Attempt to get the page's source and if successful, iterate through it
-        to find any links we can crawl.
-        '''
-        try:
-            self.get_source()
-        except Exception:
-            # skip if we didn't retrieve the source.
-            pass
-
-        if self.source:
-            self.find_links()
-            self.parse_urls()
-            return True
+        # ensure the URL is in a sane format
+        url = standardise_url(url=url, base_url=self.baseurl)
+
+        if url and self.robots.check(url=url):
+            return url
         else:
             return False
 
+    async def get_source(self, url=None):
+        '''
+        Obtains the URL's source, provided it is HTML. Usage of semaphores
+        ensures only a certain number of coroutines can run at once.
+        '''
+        async with self.semaphore:
+            async with self.client_session.head(url, timeout=5) as head:
+                try:
+                    data = await head.read()
+                except Exception as e:
+                    print(e)
+
+                if 'text/html' in head.headers['Content-Type']:
+                    async with self.client_session.get(url, timeout=5) as resp:
+                        try:
+                            source = await resp.read()
+                            return source
+                        except Exception:
+                            return None
+
+    def find_all_urls(self, source=None):
+        '''
+        Find all URLs in a page's source. Returns a list of URLs which have
+        been validated as local to the starting URL.
+        '''
+        urls = []
+
+        html = BeautifulSoup(source, 'lxml')
+        hrefs = html.find_all('a', href=True)
+
+        # build a set of URLs which are valid and haven't been crawled yet
+        for href in hrefs:
+            url = self.validate_url(url=href['href'])
+            if url and url not in self.crawled:
+                urls.append(url)
+
+        return urls
+
+    async def run(self, urls=None):
+        '''
+        Crawls a batch of URLs of any size (resource usage is bounded by n
+        semaphores, where n = concurrency). Returns a set of URLs to be added
+        to the list of URLs which need to be crawled (find_all_urls only returns
+        unseen URLs).
+        '''
+        tasks = []
+        all_urls = set()
+        for url in urls:
+            # mark the URL as seen.
+            self.crawled.add(url)
+            # create a coroutine to crawl the URL.
+            tasks.append(self.crawl_url(url))
+
+        # wait for all tasks to complete.
+        for task in asyncio.as_completed(tasks):
+            urls = None
+            try:
+                urls = await task
+            except Exception as e:
+                print(e)
+
+            # add the URLs to a set to be returned.
+            if urls:
+                for url in urls:
+                    print('Found: {0}'.format(url))
+                    all_urls.add(url)
+
+        return all_urls
+
+    async def main(self):
+        '''
+        Runs a crawl with batches of URLs. Once complete returns a list of all
+        crawled URLs.
+        '''
+        self.client_session = aiohttp.ClientSession(headers=self.headers)
+        to_crawl = []
+        to_crawl.append(self.baseurl)
+
+        print('Crawling: {0}'.format(self.baseurl))
+        while len(to_crawl) > 0:
+            discovered_urls = await self.run(urls=to_crawl)
+            # empty the to_crawl list and then add all newly discovered URLs for
+            # the next iteration.
+            to_crawl.clear()
+            to_crawl.extend(discovered_urls)
+
+        # close the session once all URLs have been crawled.
+        await self.client_session.close()
+
+        return self.crawled
 
 
 class RobotsTxt(object):
     '''
     needs a docstring
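As a side note, the semaphore-bounded fetching that `get_source` describes boils down to the following minimal, self-contained sketch (the URLs and the concurrency value are illustrative, and error handling is omitted):

```python
import asyncio
import aiohttp

async def fetch(session, semaphore, url):
    # the bounded semaphore caps how many fetches may run concurrently
    async with semaphore:
        async with session.get(url, timeout=5) as resp:
            return await resp.read()

async def fetch_all(urls, concurrency=2):
    semaphore = asyncio.BoundedSemaphore(concurrency)
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*[fetch(session, semaphore, u) for u in urls])

pages = asyncio.get_event_loop().run_until_complete(
    fetch_all(['http://example.com/', 'http://example.org/']))
print(len(pages))
```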
@@ -144,7 +155,9 @@ class RobotsTxt(object):
 
     def __init__(self, base_url=None):
         '''
-        Manually retrieve robots.txt to allow us to set the user-agent.
+        Manually retrieve robots.txt to allow us to set the user-agent (works
+        around sites which disallow access to robots.txt without a sane
+        user-agent).
         '''
         self.base_url = base_url
         self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}

@@ -158,6 +171,7 @@ class RobotsTxt(object):
         try:
             response = urllib.request.urlopen(request, timeout=5)
         except urllib.error.HTTPError:
+            # if robots.txt doesn't exist then allow all URLs to be crawled.
             robots.allow_all = True
         else:
             data = response.read()
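For context, a rough sketch of the fetch-then-parse approach `RobotsTxt` takes, using `urllib.robotparser` directly (the URL and user-agent string here are illustrative):

```python
import urllib.error
import urllib.request
import urllib.robotparser

headers = {'User-Agent': 'Mozilla/5.0'}
request = urllib.request.Request('http://example.com/robots.txt', headers=headers)
robots = urllib.robotparser.RobotFileParser()

try:
    data = urllib.request.urlopen(request, timeout=5).read()
except urllib.error.HTTPError:
    # no robots.txt: allow every URL to be crawled
    robots.allow_all = True
else:
    robots.parse(data.decode('utf-8').splitlines())

print(robots.can_fetch('*', 'http://example.com/some/page'))
```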
@@ -174,31 +188,37 @@ class RobotsTxt(object):
         return self.robots.can_fetch("*", url)
 
 
-def sanitise_url(url, base_url=False):
+def standardise_url(url=None, base_url=None):
     '''
-    If `base_url` is True, we attempt to standardise `url` to ensure it can be
-    prepended to relative URLs. If no scheme has been provided then we default
+    If `base_url` is None then we attempt to standardise the URL to ensure it can
+    be prepended to relative URLs. If no scheme has been provided then we default
     to http as any sane https-only site should 301 redirect http > https.
 
-    If `base_url` is False, we sanitise URLs to strip queries and fragments (we
-    don't want to scrape in-page anchors etc).
+    If `base_url` is set, we standardise URLs to strip queries and fragments (we
+    don't want to scrape in-page anchors etc). Any relative URLs will be appended
+    to the base url.
 
-    Returns a sanitised URL as a string.
+    Returns a standardised URL as a string.
     '''
     default_proto = 'http'
     delim = '://'
+    file_extensions = ('/', 'htm', 'html', 'xhtml', 'php', 'asp', 'aspx', 'cfm')
+
     split_url = urlsplit(url)
 
-    if base_url:
-        # This will sanitise the initial url for the initial page crawl.
+    if not base_url:
+        # This will sanitise the initial url provided by the user.
         if split_url.scheme and split_url.scheme.startswith('http'):
-            sanitised_url = "".join([split_url.scheme, delim, split_url.netloc])
+            return "".join([split_url.scheme, delim, split_url.netloc])
         elif (split_url.path and not split_url.scheme and not split_url.netloc):
-            sanitised_url = "".join([default_proto, delim, split_url.path])
+            return "".join([default_proto, delim, split_url.path])
     else:
+        # if url.endswith(file_extensions):
         # Sanitise discovered URLs. We already expect them in the format
         # protocol://base_url/path
-        sanitised_url = "".join([split_url.scheme, delim, split_url.netloc, split_url.path])
+        if url.startswith('/'):
+            return urljoin(base_url, split_url.path)
+        elif url.startswith(base_url):
+            return "".join([split_url.scheme, delim, split_url.netloc, split_url.path])
 
-    return sanitised_url
+    return None
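To illustrate the split-and-rejoin behaviour `standardise_url` relies on, a short example with `urlsplit` and `urljoin` (the URLs are illustrative):

```python
from urllib.parse import urljoin, urlsplit

# an absolute URL keeps only scheme://netloc/path; query and fragment are dropped
parts = urlsplit('http://example.com/page.html?q=1#section')
print("".join([parts.scheme, '://', parts.netloc, parts.path]))  # http://example.com/page.html

# a relative URL is appended to the base URL
print(urljoin('http://example.com', urlsplit('/about.html').path))  # http://example.com/about.html
```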