async crawler in a mostly-working state
@@ -1,3 +1,4 @@
#!/usr/bin/env python
'''
Need a docstring.
@@ -6,10 +7,10 @@ Need a docstring.
import argparse
import jinja2
import os
import sys
import asyncio
from datetime import datetime
# from utils.helpers import (UrlPool, WebPage, RobotsTxt, sanitise_url)
-from utils.helpers import RobotsTxt, AsyncCrawler, sanitise_url
+from utils.helpers import RobotsTxt, AsyncCrawler, standardise_url

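utils/helpers.py is not touched by this commit, so the new standardise_url is only visible from its call sites. Purely as a hypothetical sketch of what such a helper could look like (names and behaviour are assumptions, not taken from the repo):

# Hypothetical sketch only: the real standardise_url lives in utils/helpers.py,
# which this diff does not include, and may behave differently.
from urllib.parse import urlsplit, urlunsplit

def standardise_url(url=None, base_url=None):
    '''Normalise a URL; when base_url is given, reduce it to scheme://host.'''
    if '://' not in url:
        url = 'http://' + url                      # assume http when no scheme is supplied
    parts = urlsplit(url)
    if base_url:
        return urlunsplit((parts.scheme, parts.netloc.lower(), '', '', ''))
    return urlunsplit((parts.scheme, parts.netloc.lower(), parts.path, parts.query, ''))
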
def init_crawler(url=None):
@@ -17,10 +18,14 @@ def init_crawler(url=None):
    docstring
    '''
    # ensure we have a sensible URL to work with
-    baseurl = sanitise_url(url=url, base_url=True)
+    baseurl = standardise_url(url=url, base_url=url)
    # get robots.txt
    robots = RobotsTxtI(base_url=baseurl)

+    # fail early if robots denies all crawling
+    if not robots.check(url=baseurl):
+        sys.exit("{baseurl} cannot be crawled (denied by robots.txt)".format(baseurl=baseurl))
+
    return(baseurl, robots)
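The RobotsTxt class also comes from utils/helpers.py and is not shown here. A minimal stand-in with the same check() call shape, built on the standard library's urllib.robotparser, might look roughly like this (an assumption, not the project's actual code):

# Rough stand-in for the RobotsTxt helper used by init_crawler(); the real
# implementation in utils/helpers.py may differ.
from urllib.robotparser import RobotFileParser

class RobotsTxt(object):
    def __init__(self, base_url=None):
        self.parser = RobotFileParser()
        self.parser.set_url('{0}/robots.txt'.format(base_url.rstrip('/')))
        self.parser.read()                         # fetch and parse robots.txt

    def check(self, url=None, agent='*'):
        '''Return True if the given user agent may fetch url.'''
        return self.parser.can_fetch(agent, url)
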
@@ -44,7 +49,7 @@ def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
    print('Sitemap available at {0}/sitemap.html'.format(os.getcwd()))


-def main(args=None):
+def main():
    '''
    docstring
    '''
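Only the final print() of render_sitemap() falls inside this hunk; the body that writes sitemap.html is outside the diff. Given the jinja2 import above, it presumably renders a template along these lines (template name and context keys are guesses, not taken from this commit):

# Illustrative guess at the jinja2 rendering inside render_sitemap(); the
# template name and variables are assumptions.
import os
import jinja2

def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
    env = jinja2.Environment(loader=jinja2.FileSystemLoader('templates'))
    html = env.get_template('sitemap.html').render(
        base_url=base_url, urls=sorted(crawled_urls), runtime=runtime)
    with open('sitemap.html', 'w') as outfile:
        outfile.write(html)
    print('Sitemap available at {0}/sitemap.html'.format(os.getcwd()))
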
@@ -54,21 +59,25 @@ def main(args=None):

    # create a crawler
    async_crawler = AsyncCrawler(baseurl=baseurl, robots=robots, concurrency=args.concurrency)
-    # async_crawler.run()
+    # run the crawler

-    crawler = asyncio.Task(async_crawler.run())
+    task = asyncio.Task(async_crawler.run_loop())
    loop = asyncio.get_event_loop()
-    loop.run_until_complete(crawler)
+    loop.run_until_complete(task)
    loop.close()
-    result = crawler.result()
-    print(len(result))
+    results = task.result()
+    print(results)
+    print(len(results))
    runtime = int((datetime.now() - starttime).total_seconds())
    print(runtime)

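The new run pattern wraps the run_loop() coroutine in a Task, drives it with run_until_complete(), and reads task.result() once the loop has finished. A self-contained toy of the same pattern, with a stand-in coroutine instead of AsyncCrawler.run_loop():

# Toy version of the Task / run_until_complete pattern above, using a fake
# coroutine in place of AsyncCrawler.run_loop().
import asyncio

async def fake_run_loop():
    await asyncio.sleep(0.1)                       # pretend to crawl something
    return ['http://example.com/', 'http://example.com/about']

loop = asyncio.new_event_loop()
task = loop.create_task(fake_run_loop())
loop.run_until_complete(task)
loop.close()
print(task.result())                               # the list returned by the coroutine

On Python 3.7 and later the same result can be had with results = asyncio.run(async_crawler.run_loop()), which creates and closes the loop itself.
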
if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='Recursive web crawler')
    parser.add_argument("-u", "--url", required=True, help="Base url to crawl")
-    parser.add_argument("-s", "--concurrency", required=False, type=int, default=50, help="Max number of pages to crawl concurrently")
+    parser.add_argument("-c", "--concurrency", required=False, type=int,
+                        default=50, help="Max number of pages to crawl concurrently")
    args = parser.parse_args()

-    main(args)
+    main()
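AsyncCrawler itself, including run_loop() and how it honours the concurrency argument, lives in utils/helpers.py and is not part of this commit. Purely as an illustration of the general shape such a class might take, a queue-plus-semaphore crawl loop could look like this (everything below is assumed, including the placeholder fetch_links()):

# Hypothetical sketch of an AsyncCrawler-style run loop; not the project's code.
import asyncio

class ToyCrawler(object):
    def __init__(self, baseurl, concurrency=50):
        self.baseurl = baseurl
        self.concurrency = concurrency
        self.seen = set()

    async def fetch_links(self, url, semaphore):
        async with semaphore:                      # at most `concurrency` fetches in flight
            await asyncio.sleep(0)                 # a real crawler would fetch the page here
            return []                              # ...and return the links it found

    async def run_loop(self):
        semaphore = asyncio.Semaphore(self.concurrency)
        pending = [self.baseurl]
        while pending:
            batch = [url for url in pending if url not in self.seen]
            self.seen.update(batch)
            results = await asyncio.gather(*(self.fetch_links(url, semaphore) for url in batch))
            pending = [link for links in results for link in links]
        return sorted(self.seen)

# usage: print(asyncio.run(ToyCrawler('http://example.com', concurrency=5).run_loop()))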