async crawler in a mostly-working state
@@ -1,3 +1,4 @@
 #!/usr/bin/env python
 '''
 Need a docstring.
@@ -6,10 +7,10 @@ Need a docstring.
 import argparse
 import jinja2
 import os
 import sys
+import asyncio
 from datetime import datetime
 # from utils.helpers import (UrlPool, WebPage, RobotsTxt, sanitise_url)
-from utils.helpers import RobotsTxt, AsyncCrawler, sanitise_url
+from utils.helpers import RobotsTxt, AsyncCrawler, standardise_url


 def init_crawler(url=None):
@@ -17,10 +18,14 @@ def init_crawler(url=None):
     docstring
     '''
     # ensure we have a sensible URL to work with
-    baseurl = sanitise_url(url=url, base_url=True)
+    baseurl = standardise_url(url=url, base_url=url)
     # get robots.txt
     robots = RobotsTxt(base_url=baseurl)

+    # fail early if robots denies all crawling
+    if not robots.check(url=baseurl):
+        sys.exit("{baseurl} cannot be crawled (denied by robots.txt)".format(baseurl=baseurl))
+
     return(baseurl, robots)


@@ -44,7 +49,7 @@ def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
     print('Sitemap available at {0}/sitemap.html'.format(os.getcwd()))


-def main(args=None):
+def main():
     '''
     docstring
     '''
@@ -54,21 +59,25 @@ def main(args=None):

     # create a crawler
     async_crawler = AsyncCrawler(baseurl=baseurl, robots=robots, concurrency=args.concurrency)
-    # async_crawler.run()
+    # run the crawler

-    crawler = asyncio.Task(async_crawler.run())
+    task = asyncio.Task(async_crawler.run_loop())
     loop = asyncio.get_event_loop()
-    loop.run_until_complete(crawler)
+    loop.run_until_complete(task)
     loop.close()
-    result = crawler.result()
-    print(len(result))
+    results = task.result()
+    print(results)
+    print(len(results))
+    runtime = int((datetime.now() - starttime).total_seconds())
+    print(runtime)


 if __name__ == '__main__':

     parser = argparse.ArgumentParser(description='Recursive web crawler')
     parser.add_argument("-u", "--url", required=True, help="Base url to crawl")
-    parser.add_argument("-s", "--concurrency", required=False, type=int, default=50, help="Max number of pages to crawl concurrently")
+    parser.add_argument("-c", "--concurrency", required=False, type=int,
+                        default=50, help="Max number of pages to crawl concurrently")
     args = parser.parse_args()

-    main(args)
+    main()
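For reference, the event-loop handling in the new main() follows the pre-asyncio.run() pattern: wrap the coroutine in a Task, drive it to completion on the default loop, then read the Task's result. A minimal sketch of that pattern, using a stand-in coroutine rather than the real AsyncCrawler.run_loop():

import asyncio


async def run_loop():
    # stand-in for AsyncCrawler.run_loop(): pretend two pages were crawled
    await asyncio.sleep(0)
    return {'http://example.com/', 'http://example.com/about'}


task = asyncio.Task(run_loop())   # schedule the coroutine on the default loop
loop = asyncio.get_event_loop()
loop.run_until_complete(task)     # block until run_loop() finishes
loop.close()
print(len(task.result()))         # the set returned by run_loop()

On Python 3.7+ the same thing is usually written as results = asyncio.run(run_loop()).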
utils/helpers.py
@@ -26,65 +26,113 @@ class AsyncCrawler(object):
         self.robots = robots
         self.uncrawled = set()
         self.crawled = set()
-        self.session = aiohttp.ClientSession()
-        # self.headers = {'Accept-Encoding': 'gzip, deflate',
-        #                 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
-        self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
+        self.client_session = None
+        self.semaphore = asyncio.BoundedSemaphore(concurrency)
-        # add the base URL to be crawled
-        self.uncrawled.add(baseurl)
+        self.headers = {'Accept-Encoding': 'gzip, deflate',
+                        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}


-    def validate_url(self, url):
+    async def crawl_url(self, url=None):
         '''
-        Checks if the discovered URL is local to the base URL.
+        docstring
         '''
+        urls = set()
+        async with self.semaphore:
+            source = await self.get_source(url)
+            if source:
+                self.crawled.add(url)
+                # for new_url in self.find_all_urls(source):
+                #     urls.add(new_url)
+                urls_to_crawl = self.find_all_urls(source)
+                # print('discovered {0} new URLs'.format(len(urls_to_crawl)))
+                for new_url in urls_to_crawl:
+                    urls.add(new_url)
+                # add the url we just crawled to the crawled pool.
+
+        return urls
+
+
+    def validate_url(self, url=None):
+        '''
+        Ensures we have a valid URL to crawl and that the site's robots.txt
+        allows it.
+        '''
         # ensure the URL is in a sane format
-        url = sanitise_url(url=url)
+        url = standardise_url(url=url, base_url=self.baseurl)

-        if url.startswith(self.baseurl) and robots.check(url=url):
+        if url and self.robots.check(url=url):
+            # print('validated url: {0}'.format(url))
             return url
         else:
             return False


-    def get_source(self, url):
+    async def get_source(self, url=None):
         '''
         Obtains the page's source.
         '''
-        pass
+        print('semaphore held for {0}'.format(url))
+        async with self.client_session.get(url, timeout=5) as resp:
+            try:
+                source = await resp.read()
+                return source
+            except Exception:
+                return None


-    def find_links(self, source):
+    def find_all_urls(self, source=None):
         '''
-        Find all links in a page's source.
+        Find all URLs in a page's source.
         '''
-        links = set()
+        urls = set()

         html = BeautifulSoup(source, 'lxml')
         hrefs = html.find_all('a', href=True)

+        # build a set of URLs which are valid and haven't been crawled yet
         for href in hrefs:
-            url = self.validate_url(url=href)
-            if url:
-                links.add(url)
+            url = self.validate_url(url=href['href'])
+            if url and url not in self.crawled:
+                urls.add(url)

-        return links
+        return urls


-    def run(self):
+    async def run_loop(self):
         '''
         function which runs the crawler
         '''
-        pass
+        print('Crawling: {}'.format(self.baseurl))
+        self.client_session = aiohttp.ClientSession(headers=self.headers)
+        # provide the starting URL to the crawler
+        self.uncrawled.add(self.baseurl)
+
+        while len(self.uncrawled) > 0:
+            # print('################################ there are {0} uncrawled urls in the pool'.format(
+            #     len(self.uncrawled)))
+            url = self.uncrawled.pop()
+            # print('################ url popped, there are now {0} uncrawled urls in the pool'.format(
+            #     len(self.uncrawled)))
+            new_urls = await self.crawl_url(url=url)
+            for url in new_urls:
+                # print('adding: {0}'.format(url))
+                self.uncrawled.add(url)
+
+        await self.client_session.close()
+        return self.crawled
-
-        for url in self.uncrawled:
-            validated = validate_url(url=url)
-
-            if validated:
-                source = get_source(url=url)
-                links = find_links(source=source)
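The concurrency model here pairs a single shared aiohttp.ClientSession with an asyncio.BoundedSemaphore: crawl_url() acquires the semaphore before get_source() issues its request, so at most `concurrency` fetches are in flight at any time. The same pattern in a self-contained sketch (not code from this commit; URLs and function names are placeholders):

import asyncio

import aiohttp


async def fetch(session, semaphore, url):
    # only `concurrency` coroutines can hold the semaphore at once,
    # which caps the number of simultaneous HTTP requests
    async with semaphore:
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=5)) as resp:
            return await resp.read()


async def fetch_all(urls, concurrency=50):
    semaphore = asyncio.BoundedSemaphore(concurrency)
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch(session, semaphore, u) for u in urls))


# e.g. asyncio.run(fetch_all(['http://example.com/']))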
@@ -183,8 +231,8 @@ class WebPage(object):
         '''
         for url in self.discovered_hrefs:
             if url.startswith(self.base_url) and self.robots.check(url):
-                sanitised_url = sanitise_url(url=url)
-                self.urls_to_crawl.add(sanitised_url)
+                standardised_url = sanitise_url(url=url)
+                self.urls_to_crawl.add(standardised_url)


     def list_urls(self):
@@ -251,31 +299,35 @@ class RobotsTxt(object):
         return self.robots.can_fetch("*", url)


-def sanitise_url(url, base_url=False):
+def standardise_url(url=None, base_url=None):
     '''
-    If `base_url` is True, we attempt to standardise `url` to ensure it can be
-    prepended to relative URLs. If no scheme has been provided then we default
+    If `base_url` is None then we attempt to standardise the URL to ensure it can
+    be prepended to relative URLs. If no scheme has been provided then we default
     to http as any sane https-only site should 301 redirect http > https.

-    If `base_url` is False, we sanitise URLs to strip queries and fragments (we
-    don't want to scrape in-page anchors etc).
+    If `base_url` is set, we standardise URLs to strip queries and fragments (we
+    don't want to scrape in-page anchors etc). Any relative URLs will be appended
+    to the base url.

-    Returns a sanitised URL as a string.
+    Returns a standardised URL as a string.
     '''
     default_proto = 'http'
     delim = '://'

     split_url = urlsplit(url)

-    if base_url:
-        # This will sanitise the initial url for the initial page crawl.
+    if not base_url:
+        # This will sanitise the initial url provided by the user.
         if split_url.scheme and split_url.scheme.startswith('http'):
-            sanitised_url = "".join([split_url.scheme, delim, split_url.netloc])
+            return "".join([split_url.scheme, delim, split_url.netloc])
         elif (split_url.path and not split_url.scheme and not split_url.netloc):
-            sanitised_url = "".join([default_proto, delim, split_url.path])
+            return "".join([default_proto, delim, split_url.path])
     else:
-        # Sanitise discovered URLs. We already expect them in the format
-        # protocol://base_url/path
-        sanitised_url = "".join([split_url.scheme, delim, split_url.netloc, split_url.path])
+        if url.startswith('/'):
+            return urljoin(base_url, split_url.path)
+        elif url.startswith(base_url):
+            return "".join([split_url.scheme, delim, split_url.netloc, split_url.path])

-    return sanitised_url
+    return None
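Traced through the branches above, the new standardise_url() behaves as follows. This is an illustrative usage sketch only: example.com is a placeholder and the import assumes the repository root is on the path.

from utils.helpers import standardise_url

# no base_url: normalise the user-supplied starting URL, defaulting to http
standardise_url(url='example.com')                      # -> 'http://example.com'
standardise_url(url='https://example.com/some/path')    # -> 'https://example.com'

# with base_url: resolve relative links and strip queries and fragments
base = 'http://example.com'
standardise_url(url='/about', base_url=base)                           # -> 'http://example.com/about'
standardise_url(url='http://example.com/page?q=1#top', base_url=base)  # -> 'http://example.com/page'

# anything that is neither relative nor under the base URL is dropped
standardise_url(url='http://other.example.org/', base_url=base)        # -> None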