From 524f6a45cd962e80048d7a17d13e9fdbc7c07280 Mon Sep 17 00:00:00 2001
From: Simon Weald
Date: Sun, 16 Sep 2018 15:53:47 +0100
Subject: [PATCH] improve documentation

---
 README.md        |  6 +++---
 async_crawler.py | 12 ++++++++++--
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index e8e228e..0a2eef1 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 ## Requirements
 
-This crawler was written in 3.7.0 to take advantage of the latest `asyncio` features.
+This crawler requires at least Python 3.5 in order to utilise the async/await keywords from `asyncio`.
 
 Install required modules:
 
@@ -13,9 +13,9 @@ pip install -r requirements.txt
 Run:
 
 ```bash
-python crawler.py -u https://urltocrawl.com
+python crawler.py -u https://urltocrawl.com [-c 100]
 ```
 
 ## Results
 
-The resulting sitemap will be output in the root of this directory as `sitemap.html`
+The resulting sitemap will be output to the root of this directory as `sitemap.html`
diff --git a/async_crawler.py b/async_crawler.py
index ff66adf..8716bd0 100644
--- a/async_crawler.py
+++ b/async_crawler.py
@@ -1,7 +1,15 @@
 #!/usr/bin/env python
 
 '''
-Need a docstring.
+Asynchronous web crawler written in Python 3.5+.
+
+This script will respect the site's `robots.txt`, if one exists. If not, all
+URLs discovered will be crawled.
+
+The crawler takes a total of two arguments (concurrency is optional):
+
+    url: the base URL to begin the crawl from.
+    concurrency: the maximum number of pages which may be crawled concurrently.
 '''
 
 import argparse
@@ -76,7 +84,7 @@ def main():
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Recursive web crawler')
-    parser.add_argument("-u", "--url", required=True, help="Base url to crawl")
+    parser.add_argument("-u", "--url", required=True, help="Initial url to crawl")
     parser.add_argument("-c", "--concurrency", required=False, type=int, default=100, help="Max number of pages to crawl concurrently")
     args = parser.parse_args()
 
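
As a rough illustration of the concurrency behaviour described in the new docstring and the `[-c 100]` flag added to the README, the sketch below shows one common way an asyncio crawler can cap concurrent page fetches with a semaphore. It is not the repository's `async_crawler.py`: the use of `aiohttp`, the `fetch`/`crawl` names and the single-URL body are assumptions made purely for the example, and it sticks to Python 3.5 compatible syntax to match the stated requirement.

```python
#!/usr/bin/env python
'''
Illustrative sketch only -- not the async_crawler.py shipped in this patch.

Shows how an asyncio.Semaphore can enforce a --concurrency limit on page
fetches. aiohttp and the function names here are assumptions.
'''

import asyncio

import aiohttp  # assumption: any asyncio-compatible HTTP client would do


async def fetch(session, url, semaphore):
    # The semaphore caps how many requests are in flight at any one time.
    async with semaphore:
        async with session.get(url) as response:
            return await response.text()


async def crawl(base_url, concurrency):
    semaphore = asyncio.Semaphore(concurrency)
    async with aiohttp.ClientSession() as session:
        # A real crawler would parse links out of each page, check them
        # against robots.txt and queue further fetch() calls; this sketch
        # only fetches the base URL.
        html = await fetch(session, base_url, semaphore)
        print('fetched {} bytes from {}'.format(len(html), base_url))


if __name__ == '__main__':
    # Mirrors the CLI defaults: -u is required, -c defaults to 100.
    loop = asyncio.get_event_loop()
    loop.run_until_complete(crawl('https://urltocrawl.com', 100))
```

With this pattern, passing `-c 100` simply sizes the semaphore, so at most 100 `fetch()` coroutines hold a slot at once while any remaining discovered URLs wait their turn.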