Compare commits


5 Commits

SHA1 Message Date
d6964672b6 commit of working async crawler 2018-09-15 21:30:02 +01:00
3808f72f73 correct semaphore usage 2018-09-14 16:06:17 +01:00
7ebe4855b8 remove unecessary classes2 2018-09-14 16:02:20 +01:00
db986b0eba async crawler in a mostly-working state 2018-09-14 16:01:12 +01:00
36e1f7693f initial foray into asynchronous crawling 2018-09-12 22:54:12 +01:00
2 changed files with 209 additions and 124 deletions

async_crawler.py (new file, 81 lines added)

@@ -0,0 +1,81 @@
#!/usr/bin/env python
'''
Need a docstring.
'''
import argparse
import jinja2
import os
import sys
import asyncio
from datetime import datetime
from utils.helpers import RobotsTxt, AsyncCrawler, standardise_url

def sanity_checks(url=None):
    '''
    Runs some basic sanity checks before the crawler is initialised.
    '''
    # ensure we have a sensible URL to work with
    baseurl = standardise_url(url=url)
    # get robots.txt
    robots = RobotsTxt(base_url=baseurl)
    # fail early if robots denies all crawling
    if not robots.check(url=baseurl):
        sys.exit("{baseurl} cannot be crawled (denied by robots.txt)".format(baseurl=baseurl))
    return(baseurl, robots)


def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
    '''
    Renders the sitemap to an HTML file.
    '''
    urlcount = len(crawled_urls)
    sorted_urls = sorted(crawled_urls)
    template = jinja2.Environment(
        loader=jinja2.FileSystemLoader('templates')
    ).get_template('sitemap.html.j2')
    rendered_html = template.render(
        base_url=base_url, urlcount=urlcount, urls=sorted_urls, runtime=runtime)
    with open('sitemap.html', 'w') as outfile:
        outfile.write(rendered_html)
    print('Sitemap available at {0}/sitemap.html'.format(os.getcwd()))


def main():
    '''
    docstring
    '''
    starttime = datetime.now()
    baseurl, robots = sanity_checks(url=args.url)
    # create a crawler
    async_crawler = AsyncCrawler(baseurl=baseurl, robots=robots, concurrency=args.concurrency)
    task = asyncio.Task(async_crawler.main())
    loop = asyncio.get_event_loop()
    loop.run_until_complete(task)
    loop.close()
    results = task.result()
    runtime = int((datetime.now() - starttime).total_seconds())
    render_sitemap(base_url=baseurl, crawled_urls=results, runtime=runtime)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Recursive web crawler')
    parser.add_argument("-u", "--url", required=True, help="Base url to crawl")
    parser.add_argument("-c", "--concurrency", required=False, type=int,
                        default=100, help="Max number of pages to crawl concurrently")
    args = parser.parse_args()
    main()
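
The main() function above drives the crawl with an explicit Task and event loop. As a point of reference only, a minimal sketch of an equivalent driver, assuming Python 3.7+ (an assumption, not something this changeset targets), could lean on asyncio.run():

# Illustrative sketch only: assumes Python 3.7+ and the AsyncCrawler class above.
# The helper name run_crawler is hypothetical.
import asyncio

def run_crawler(async_crawler):
    # asyncio.run() creates the event loop, runs the coroutine to completion
    # and closes the loop, replacing the Task/get_event_loop() boilerplate.
    return asyncio.run(async_crawler.main())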

utils/helpers.py

@@ -3,140 +3,138 @@
 Utilities to provide various misc functions.
 '''
-from bs4 import BeautifulSoup
+# import urllib.request
+# import urllib.error
+# import gzip
+# from time import sleep
 import aiohttp
-import urllib.request
+import asyncio
+from bs4 import BeautifulSoup
+from urllib.parse import urljoin, urlsplit
 import urllib.robotparser
-import urllib.error
-import gzip
-from urllib.parse import (urljoin, urlsplit)
-class UrlPool(object):
+class AsyncCrawler(object):
     '''
-    Object to manage a pool of URLs.
+    docstring
     '''
-    def __init__(self):
-        self.pool = set()
-    def check_duplicate(self, new_url):
-        '''
-        Checks if a URL exists in the current pool.
-        '''
-        if new_url in self.pool:
-            return True
-        else:
-            return False
-    def remove_from_pool(self):
-        '''
-        Remove a URL from the pool and return it to be crawled.
-        '''
-        return(self.pool.pop())
-    def add_to_pool(self, url):
-        self.pool.add(url)
-    def list_pool(self):
-        pool = self.pool
-        return pool
-class WebPage(object):
-    '''
-    Object to manage common operations required to return
-    the data from each individual page.
-    '''
-    # set a sane user-agent and request compression if available.
-    headers = {'Accept-Encoding': 'gzip, deflate',
-               'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
-    def __init__(self, url=None, base_url=None, robots=None):
-        self.url = url
-        self.base_url = base_url
+    def __init__(self, baseurl=None, robots=None, concurrency=None):
+        self.baseurl = baseurl
         self.robots = robots
-        self.source = None
-        self.urls_to_crawl = set()
-    def get_source(self):
+        self.uncrawled = set()
+        self.crawled = set()
+        self.headers = {'Accept-Encoding': 'gzip, deflate',
+                        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
+        self.client_session = None
+        self.semaphore = asyncio.BoundedSemaphore(concurrency)
+    async def crawl_url(self, url=None):
         '''
-        Retrieve a page's source.
+        docstring
         '''
-        request = urllib.request.Request(self.url, headers=self.headers)
-        page = urllib.request.urlopen(request, timeout=5)
-        # handle the content encoding in case it needs decompressing.
-        if 'text/html' in page.info().get('Content-Type'):
-            if page.info().get('Content-Encoding'):
-                if page.info().get('Content-Encoding') == 'gzip':
-                    self.source = gzip.decompress(page.read())
-                elif page.info().get('Content-Encoding') == 'deflate':
-                    self.source = page.read()
-            else:
-                self.source = page.read()
-    def find_links(self):
+        urls = []
+        source = await self.get_source(url)
+        if source:
+            urls = self.find_all_urls(source)
+        return urls
+    def validate_url(self, url=None):
         '''
-        Find all URLs on a page and ensure they are absolute. If they are
-        relative then they will be appended to the base URL.
+        Ensures we have a valid URL to crawl and that the site's robots.txt
+        allows it.
         '''
-        hrefs = set()
-        soup = BeautifulSoup(self.source, 'lxml')
-        links = soup.find_all('a', href=True)
-        for link in links:
-            if link['href'].startswith('/'):
-                hrefs.add(urljoin(self.url, link['href']))
-            else:
-                hrefs.add(link['href'])
-        self.discovered_hrefs = hrefs
-    def parse_urls(self):
-        '''
-        Iterate through the list of discovered URLs and add them to the
-        pool if they start with the base URL.
-        '''
-        for url in self.discovered_hrefs:
-            if url.startswith(self.base_url) and self.robots.check(url):
-                sanitised_url = sanitise_url(url=url)
-                self.urls_to_crawl.add(sanitised_url)
-    def list_urls(self):
-        '''
-        Returns all valid discovered URLs.
-        '''
-        return self.urls_to_crawl
-    def run(self):
-        '''
-        Attempt to get the page's source and if successful, iterate through it
-        to find any links we can crawl.
-        '''
-        try:
-            self.get_source()
-        except Exception:
-            # skip if we didn't retrieve the source.
-            pass
-        if self.source:
-            self.find_links()
-            self.parse_urls()
-            return True
+        # ensure the URL is in a sane format
+        url = standardise_url(url=url, base_url=self.baseurl)
+        if url and self.robots.check(url=url):
+            return url
         else:
             return False
+    async def get_source(self, url=None):
+        '''
+        Obtains the page's source.
+        '''
+        async with self.semaphore:
+            async with self.client_session.head(url, timeout=5) as head:
+                try:
+                    data = await head.read()
+                except Exception as e:
+                    print(e)
+                if 'text/html' in head.headers['Content-Type']:
+                    async with self.client_session.get(url, timeout=5) as resp:
+                        try:
+                            source = await resp.read()
+                            print('crawled {0}'.format(url))
+                            return source
+                        except Exception:
+                            return None
+                else:
+                    print('{0} - {1}'.format(head.headers['Content-Type'], url))
+    def find_all_urls(self, source=None):
+        '''
+        Find all URLs in a page's source.
+        '''
+        urls = []
+        html = BeautifulSoup(source, 'lxml')
+        hrefs = html.find_all('a', href=True)
+        # build a set of URLs which are valid and haven't been crawled yet
+        for href in hrefs:
+            url = self.validate_url(url=href['href'])
+            if url and url not in self.crawled:
+                urls.append(url)
+        return urls
+    async def run(self, urls=None):
+        tasks = []
+        all_urls = set()
+        for url in urls:
+            self.crawled.add(url)
+            tasks.append(self.crawl_url(url))
+        for task in asyncio.as_completed(tasks):
+            urls = None
+            try:
+                # completed.append((await task))
+                urls = await task
+            except Exception as e:
+                print(e)
+            if urls:
+                for url in urls:
+                    all_urls.add(url)
+        return all_urls
+    async def main(self):
+        self.client_session = aiohttp.ClientSession(headers=self.headers)
+        to_crawl = []
+        to_crawl.append(self.baseurl)
+        while len(to_crawl) > 0:
+            discovered_urls = await self.run(urls=to_crawl)
+            to_crawl.clear()
+            to_crawl.extend(discovered_urls)
+        await self.client_session.close()
+        return self.crawled
 class RobotsTxt(object):
     '''
     needs a docstring
@@ -174,31 +172,37 @@ class RobotsTxt(object):
         return self.robots.can_fetch("*", url)
-def sanitise_url(url, base_url=False):
+def standardise_url(url=None, base_url=None):
     '''
-    If `base_url` is True, we attempt to standardise `url` to ensure it can be
-    prepended to relative URLs. If no scheme has been provided then we default
+    If `base_url` is None then we attempt to standarise the URL to ensure it can
+    be prepended to relative URLs. If no scheme has been provided then we default
     to http as any sane https-only site should 301 redirect http > https.
-    If `base_url` is False, we sanitise URLs to strip queries and fragments (we
-    don't want to scrape in-page anchors etc).
-    Returns a sanitised URL as a string.
+    If `base_url` is set, we standardise URLs to strip queries and fragments (we
+    don't want to scrape in-page anchors etc). Any relative URLs will be appended
+    to the base url.
+    Returns a standardised URL as a string.
     '''
     default_proto = 'http'
     delim = '://'
+    file_extensions = ('/', 'htm', 'html', 'xhtml', 'php', 'asp', 'aspx', 'cfm')
     split_url = urlsplit(url)
-    if base_url:
-        # This will sanitise the initial url for the initial page crawl.
+    if not base_url:
+        # This will sanitise the initial url provided by the user.
         if split_url.scheme and split_url.scheme.startswith('http'):
-            sanitised_url = "".join([split_url.scheme, delim, split_url.netloc])
+            return "".join([split_url.scheme, delim, split_url.netloc])
         elif (split_url.path and not split_url.scheme and not split_url.netloc):
-            sanitised_url = "".join([default_proto, delim, split_url.path])
+            return "".join([default_proto, delim, split_url.path])
     else:
+        # if url.endswith(file_extensions):
         # Sanitise discovered URLs. We already expect them in the format
        # protocol://base_url/path
-        sanitised_url = "".join([split_url.scheme, delim, split_url.netloc, split_url.path])
-    return sanitised_url
+        if url.startswith('/'):
+            return urljoin(base_url, split_url.path)
+        elif url.startswith(base_url):
+            return "".join([split_url.scheme, delim, split_url.netloc, split_url.path])
+    return None
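
The core of the new AsyncCrawler above is the bounded-semaphore pattern in get_source(): a BoundedSemaphore caps how many requests are in flight while asyncio.as_completed() drains each batch of tasks. A self-contained sketch of that pattern, assuming Python 3.7+ and aiohttp, with placeholder URLs and hypothetical helper names, might look like this:

# Illustrative sketch of the concurrency pattern, not part of the changeset.
import asyncio
import aiohttp

async def fetch(session, semaphore, url):
    # blocks here once `concurrency` requests are already in flight
    async with semaphore:
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=5)) as resp:
            return url, resp.status

async def crawl_batch(urls, concurrency=10):
    # the semaphore is created inside the running event loop
    semaphore = asyncio.BoundedSemaphore(concurrency)
    async with aiohttp.ClientSession() as session:
        tasks = [fetch(session, semaphore, url) for url in urls]
        for task in asyncio.as_completed(tasks):
            url, status = await task
            print(url, status)

if __name__ == '__main__':
    asyncio.run(crawl_batch(['http://example.com/']))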