async crawler in a mostly-working state
@@ -1,3 +1,4 @@
+
 #!/usr/bin/env python
 '''
 Need a docstring.
@@ -6,10 +7,10 @@ Need a docstring.
 import argparse
 import jinja2
 import os
+import sys
 import asyncio
 from datetime import datetime
-# from utils.helpers import (UrlPool, WebPage, RobotsTxt, sanitise_url)
-from utils.helpers import RobotsTxt, AsyncCrawler, sanitise_url
+from utils.helpers import RobotsTxt, AsyncCrawler, standardise_url


 def init_crawler(url=None):
@@ -17,10 +18,14 @@ def init_crawler(url=None):
     docstring
     '''
     # ensure we have a sensible URL to work with
-    baseurl = sanitise_url(url=url, base_url=True)
+    baseurl = standardise_url(url=url, base_url=url)
     # get robots.txt
     robots = RobotsTxt(base_url=baseurl)

+    # fail early if robots denies all crawling
+    if not robots.check(url=baseurl):
+        sys.exit("{baseurl} cannot be crawled (denied by robots.txt)".format(baseurl=baseurl))
+
     return(baseurl, robots)

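The robots.txt check above goes through the RobotsTxt helper, whose internals sit mostly outside this diff (later context shows it wrapping can_fetch("*", url)). A minimal sketch of what such a wrapper could look like, assuming it is built on the standard library's urllib.robotparser — an assumption, not something this commit confirms:

# Hypothetical sketch of a RobotsTxt-style wrapper around urllib.robotparser.
# The real helper in utils/helpers.py may differ; only check()/can_fetch()
# are visible in this diff.
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser


class RobotsTxtSketch(object):
    def __init__(self, base_url=None):
        self.robots = RobotFileParser()
        # robots.txt always lives at the site root
        self.robots.set_url(urljoin(base_url, '/robots.txt'))
        self.robots.read()

    def check(self, url=None):
        # True if the wildcard user agent is allowed to fetch this URL
        return self.robots.can_fetch("*", url)

With a wrapper of this shape, robots.check(url=baseurl) returning False lets init_crawler bail out before any crawling starts.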
@@ -44,7 +49,7 @@ def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
     print('Sitemap available at {0}/sitemap.html'.format(os.getcwd()))


-def main(args=None):
+def main():
     '''
     docstring
     '''
@@ -54,21 +59,25 @@ def main(args=None):

     # create a crawler
     async_crawler = AsyncCrawler(baseurl=baseurl, robots=robots, concurrency=args.concurrency)
-    # async_crawler.run()
+    # run the crawler

-    crawler = asyncio.Task(async_crawler.run())
+    task = asyncio.Task(async_crawler.run_loop())
     loop = asyncio.get_event_loop()
-    loop.run_until_complete(crawler)
+    loop.run_until_complete(task)
     loop.close()
-    result = crawler.result()
-    print(len(result))
+    results = task.result()
+    print(results)
+    print(len(results))
+    runtime = int((datetime.now() - starttime).total_seconds())
+    print(runtime)


 if __name__ == '__main__':

     parser = argparse.ArgumentParser(description='Recursive web crawler')
     parser.add_argument("-u", "--url", required=True, help="Base url to crawl")
-    parser.add_argument("-s", "--concurrency", required=False, type=int, default=50, help="Max number of pages to crawl concurrently")
+    parser.add_argument("-c", "--concurrency", required=False, type=int,
+                        default=50, help="Max number of pages to crawl concurrently")
     args = parser.parse_args()

-    main(args)
+    main()
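main() now drives the crawler with the pre-asyncio.run style of event-loop plumbing: wrap the coroutine in a Task, run the loop until it completes, then read the Task's result. In isolation, with a stand-in coroutine instead of AsyncCrawler.run_loop(), the pattern looks like this (a sketch, not code from the repo):

# Minimal sketch of the Task / run_until_complete pattern used in main().
# fake_crawl() stands in for AsyncCrawler.run_loop().
import asyncio


async def fake_crawl():
    await asyncio.sleep(0.1)            # pretend to do some crawling
    return {'http://example.com/'}      # pretend set of crawled URLs


task = asyncio.Task(fake_crawl())       # schedule the coroutine as a Task
loop = asyncio.get_event_loop()
loop.run_until_complete(task)           # block until the Task finishes
loop.close()
print(task.result())                    # the value returned by fake_crawl()

On Python 3.7+ the same thing can be written as results = asyncio.run(fake_crawl()), which creates and closes the loop for you.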
utils/helpers.py  (134 changed lines)
@@ -26,65 +26,113 @@ class AsyncCrawler(object):
         self.robots = robots
         self.uncrawled = set()
         self.crawled = set()
-        self.session = aiohttp.ClientSession()
+        # self.headers = {'Accept-Encoding': 'gzip, deflate',
+        #                 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
+        self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
+        self.client_session = None
         self.semaphore = asyncio.BoundedSemaphore(concurrency)
-        # add the base URL to be crawled
-        self.uncrawled.add(baseurl)
-        self.headers = {'Accept-Encoding': 'gzip, deflate',
-                        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}

-    def validate_url(self, url):
+    async def crawl_url(self, url=None):
         '''
-        Checks if the discovered URL is local to the base URL.
+        docstring
+        '''
+        urls = set()
+        async with self.semaphore:
+            source = await self.get_source(url)
+            if source:
+                self.crawled.add(url)
+                # for new_url in self.find_all_urls(source):
+                #     urls.add(new_url)
+                urls_to_crawl = self.find_all_urls(source)
+                # print('discovered {0} new URLs'.format(len(urls_to_crawl)))
+                for new_url in urls_to_crawl:
+                    urls.add(new_url)
+        # add the url we just crawled to the crawled pool.

+        return urls

+    def validate_url(self, url=None):
+        '''
+        Ensures we have a valid URL to crawl and that the site's robots.txt
+        allows it.
         '''
         # ensure the URL is in a sane format
-        url = sanitise_url(url=url)
+        url = standardise_url(url=url, base_url=self.baseurl)

-        if url.startswith(self.baseurl) and robots.check(url=url):
+        if url and self.robots.check(url=url):
+            # print('validated url: {0}'.format(url))
             return url
         else:
             return False

-    def get_source(self, url):
+    async def get_source(self, url=None):
         '''
         Obtains the page's source.
         '''
-        pass
+        print('semaphore held for {0}'.format(url))
+        async with self.client_session.get(url, timeout=5) as resp:
+            try:
+                source = await resp.read()
                 return source
+            except Exception:
+                return None

-    def find_links(self, source):
+    def find_all_urls(self, source=None):
         '''
-        Find all links in a page's source.
+        Find all URLs in a page's source.
         '''
-        links = set()
+        urls = set()

         html = BeautifulSoup(source, 'lxml')
         hrefs = html.find_all('a', href=True)

+        # build a set of URLs which are valid and haven't been crawled yet
         for href in hrefs:
-            url = self.validate_url(url=href)
-            if url:
-                links.add(url)
+            url = self.validate_url(url=href['href'])
+            if url and url not in self.crawled:
+                urls.add(url)

-        return links
+        return urls

-    def run(self):
+    async def run_loop(self):
         '''
         function which runs the crawler
         '''
-        pass
-
-        for url in self.uncrawled:
-            validated = validate_url(url=url)
-
-            if validated:
-                source = get_source(url=url)
-                links = find_links(source=source)
+        print('Crawling: {}'.format(self.baseurl))
+        self.client_session = aiohttp.ClientSession(headers=self.headers)
+        # provide the starting URL to the crawler
+        self.uncrawled.add(self.baseurl)
+
+        while len(self.uncrawled) > 0:
+            # print('################################ there are {0} uncrawled urls in the pool'.format(
+            #     len(self.uncrawled)))
+            url = self.uncrawled.pop()
+            # print('################ url popped, there are now {0} uncrawled urls in the pool'.format(
+            #     len(self.uncrawled)))
+            new_urls = await self.crawl_url(url=url)
+            for url in new_urls:
+                # print('adding: {0}'.format(url))
+                self.uncrawled.add(url)
+
+        await self.client_session.close()
+        return self.crawled
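crawl_url acquires self.semaphore before fetching, which only pays off once several fetches are in flight at once (run_loop as committed still awaits crawl_url one URL at a time). A self-contained sketch of the semaphore-bounded fetch pattern, with illustrative names and URLs that are not from this repo:

# Minimal sketch: a BoundedSemaphore capping concurrent aiohttp requests,
# mirroring the semaphore-then-get shape of crawl_url()/get_source().
import asyncio
import aiohttp

CONCURRENCY = 5
urls = ['http://example.com/{0}'.format(i) for i in range(20)]  # illustrative URLs


async def fetch(session, semaphore, url):
    async with semaphore:                   # at most CONCURRENCY holders at once
        # per-request timeout; the commit passes a bare number here instead
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=5)) as resp:
            return await resp.read()


async def fetch_all():
    semaphore = asyncio.BoundedSemaphore(CONCURRENCY)
    async with aiohttp.ClientSession() as session:
        # gather schedules every fetch; the semaphore throttles them
        return await asyncio.gather(*[fetch(session, semaphore, u) for u in urls],
                                    return_exceptions=True)


results = asyncio.run(fetch_all())

Each fetch() has to hold the semaphore before it may open a connection, so no more than CONCURRENCY requests are active even though gather schedules all of them at once.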
@@ -183,8 +231,8 @@ class WebPage(object):
         '''
         for url in self.discovered_hrefs:
             if url.startswith(self.base_url) and self.robots.check(url):
-                sanitised_url = sanitise_url(url=url)
-                self.urls_to_crawl.add(sanitised_url)
+                standardised_url = sanitise_url(url=url)
+                self.urls_to_crawl.add(standardised_url)


     def list_urls(self):
@@ -251,31 +299,35 @@ class RobotsTxt(object):
         return self.robots.can_fetch("*", url)


-def sanitise_url(url, base_url=False):
+def standardise_url(url=None, base_url=None):
     '''
-    If `base_url` is True, we attempt to standardise `url` to ensure it can be
-    prepended to relative URLs. If no scheme has been provided then we default
+    If `base_url` is None then we attempt to standarise the URL to ensure it can
+    be prepended to relative URLs. If no scheme has been provided then we default
     to http as any sane https-only site should 301 redirect http > https.

-    If `base_url` is False, we sanitise URLs to strip queries and fragments (we
-    don't want to scrape in-page anchors etc).
+    If `base_url` is set, we standardise URLs to strip queries and fragments (we
+    don't want to scrape in-page anchors etc). Any relative URLs will be appended
+    to the base url.

-    Returns a sanitised URL as a string.
+    Returns a standardised URL as a string.
     '''
     default_proto = 'http'
     delim = '://'

     split_url = urlsplit(url)

-    if base_url:
-        # This will sanitise the initial url for the initial page crawl.
+    if not base_url:
+        # This will sanitise the initial url provided by the user.
         if split_url.scheme and split_url.scheme.startswith('http'):
-            sanitised_url = "".join([split_url.scheme, delim, split_url.netloc])
+            return "".join([split_url.scheme, delim, split_url.netloc])
         elif (split_url.path and not split_url.scheme and not split_url.netloc):
-            sanitised_url = "".join([default_proto, delim, split_url.path])
+            return "".join([default_proto, delim, split_url.path])
     else:
         # Sanitise discovered URLs. We already expect them in the format
         # protocol://base_url/path
-        sanitised_url = "".join([split_url.scheme, delim, split_url.netloc, split_url.path])
+        if url.startswith('/'):
+            return urljoin(base_url, split_url.path)
+        elif url.startswith(base_url):
+            return "".join([split_url.scheme, delim, split_url.netloc, split_url.path])

-    return sanitised_url
+    return None
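standardise_url leans on urllib.parse.urlsplit and urljoin for its branches. A quick illustration of what those standard-library calls return for the cases the function distinguishes (the example URLs are made up):

# How urlsplit/urljoin behave for the cases standardise_url branches on.
# Example URLs are illustrative only.
from urllib.parse import urlsplit, urljoin

# Full URL with a scheme: scheme + '://' + netloc rebuilds the base.
print(urlsplit('https://example.com/about?q=1#top'))
# SplitResult(scheme='https', netloc='example.com', path='/about', query='q=1', fragment='top')

# Bare host with no scheme: everything lands in .path, so 'http://' + path is used.
print(urlsplit('example.com'))
# SplitResult(scheme='', netloc='', path='example.com', query='', fragment='')

# Relative href: urljoin resolves it against the base URL.
print(urljoin('http://example.com', '/docs/intro'))
# http://example.com/docs/intro

Rebuilding the result from scheme, netloc and path, rather than returning the input unchanged, is what strips queries and fragments from discovered links.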