improve documentation

2018-09-16 15:53:47 +01:00
parent a926090bed
commit 524f6a45cd
2 changed files with 13 additions and 5 deletions

View File

@@ -2,7 +2,7 @@
 ## Requirements
-This crawler was written in 3.7.0 to take advantage of the latest `asyncio` features.
+This crawler requires at least Python 3.5 in order to utilise the async/await keywords from `asyncio`.
 Install required modules:
@@ -13,9 +13,9 @@ pip install -r requirements.txt
 Run:
 ```bash
-python crawler.py -u https://urltocrawl.com
+python crawler.py -u https://urltocrawl.com [-c 100]
 ```
 ## Results
-The resulting sitemap will be output in the root of this directory as `sitemap.html`
+The resulting sitemap will be output to the root of this directory as `sitemap.html`
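The README change above drops the Python 3.7 claim in favour of a 3.5 minimum and documents the optional `-c` concurrency flag. A minimal sketch of the bounded-concurrency async/await pattern this implies, assuming `aiohttp` as the HTTP client; the helper names are illustrative, not the crawler's actual code:

```python
# Sketch only: aiohttp and the helper names below are assumptions,
# not the crawler's real implementation.
import asyncio

import aiohttp


async def fetch(session, sem, url):
    # The semaphore caps how many requests are in flight at once,
    # which is what a -c/--concurrency option would control.
    async with sem:
        async with session.get(url) as response:
            return await response.text()


async def crawl(urls, concurrency=100):
    sem = asyncio.Semaphore(concurrency)
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch(session, sem, url) for url in urls))
```

The `async def`/`async with` syntax here is exactly what requires Python 3.5, which is what the revised requirements line reflects.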

View File

@@ -1,7 +1,15 @@
 #!/usr/bin/env python
 '''
-Need a docstring.
+Asynchronous web crawler written in Python 3.5+.
+
+This script will respect the site's `robots.txt`, if one exists. If not, all
+URLs discovered will be crawled.
+
+The crawler takes a total of two arguments (concurrency is optional):
+
+    url: the base URL to begin the crawl from.
+    concurrency: the maximum number of pages which may be crawled concurrently.
 '''
 import argparse
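The new docstring says the crawler honours `robots.txt` when one exists and otherwise crawls every discovered URL. A minimal sketch of how such a check can be expressed with the standard library's `urllib.robotparser`; the helper name and the allow-on-failure fallback are assumptions, not the script's actual implementation:

```python
# Illustrative only: one way to honour robots.txt with the standard library.
from urllib import robotparser
from urllib.parse import urljoin


def allowed_by_robots(base_url, target_url, user_agent='*'):
    """Return True when robots.txt permits crawling target_url."""
    rp = robotparser.RobotFileParser()
    rp.set_url(urljoin(base_url, '/robots.txt'))
    try:
        rp.read()
    except Exception:
        # No reachable robots.txt: fall back to crawling everything,
        # matching the behaviour the docstring describes.
        return True
    return rp.can_fetch(user_agent, target_url)


# e.g. allowed_by_robots('https://urltocrawl.com', 'https://urltocrawl.com/page')
```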
@@ -76,7 +84,7 @@ def main():
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Recursive web crawler')
-    parser.add_argument("-u", "--url", required=True, help="Base url to crawl")
+    parser.add_argument("-u", "--url", required=True, help="Initial url to crawl")
     parser.add_argument("-c", "--concurrency", required=False, type=int,
                         default=100, help="Max number of pages to crawl concurrently")
     args = parser.parse_args()
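For completeness, a hedged sketch of how these parsed arguments might be handed to an asyncio event loop; `crawl()` below is a placeholder, not the script's real coroutine, and `run_until_complete` is used rather than `asyncio.run` to stay compatible with Python 3.5:

```python
# Illustrative wiring of the parsed arguments into an asyncio event loop.
import argparse
import asyncio


async def crawl(url, concurrency):
    # Placeholder: the real crawler would fetch pages starting from `url`,
    # typically bounding in-flight requests with asyncio.Semaphore(concurrency).
    print('would crawl', url, 'with concurrency', concurrency)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Recursive web crawler')
    parser.add_argument("-u", "--url", required=True, help="Initial url to crawl")
    parser.add_argument("-c", "--concurrency", required=False, type=int,
                        default=100, help="Max number of pages to crawl concurrently")
    args = parser.parse_args()
    # asyncio.run only exists from Python 3.7; get_event_loop keeps 3.5 support.
    loop = asyncio.get_event_loop()
    loop.run_until_complete(crawl(args.url, args.concurrency))
```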