improve documentation
@@ -2,7 +2,7 @@
 
 ## Requirements
 
-This crawler was written in 3.7.0 to take advantage of the latest `asyncio` features.
+This crawler requires at least Python 3.5 in order to utilise the async/await keywords from `asyncio`.
 
 Install required modules:
 
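The new Requirements wording points at the async/await keywords rather than a specific 3.7 feature. For readers unfamiliar with that style, here is a minimal sketch of an async fetch; it is illustrative only and assumes an aiohttp client, which may or may not match what the project's requirements.txt actually installs.

```python
# Illustrative only -- not taken from crawler.py. Assumes aiohttp is installed.
import asyncio

import aiohttp


async def fetch(session: aiohttp.ClientSession, url: str) -> str:
    """Fetch one page and return its body as text."""
    async with session.get(url) as response:
        return await response.text()


async def main() -> None:
    async with aiohttp.ClientSession() as session:
        html = await fetch(session, "https://example.com")
        print(len(html), "characters received")


if __name__ == "__main__":
    # asyncio.run() needs Python 3.7+; on 3.5/3.6 use
    # asyncio.get_event_loop().run_until_complete(main()) instead.
    asyncio.run(main())
```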
@@ -13,9 +13,9 @@ pip install -r requirements.txt
 Run:
 
 ```bash
-python crawler.py -u https://urltocrawl.com
+python crawler.py -u https://urltocrawl.com [-c 100]
 ```
 
 ## Results
 
-The resulting sitemap will be output in the root of this directory as `sitemap.html`
+The resulting sitemap will be output to the root of this directory as `sitemap.html`
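The Results section says the finished sitemap is written to the repository root as `sitemap.html`. The rendering code itself is not part of this diff; a minimal sketch of that kind of output step, using a hypothetical `write_sitemap` helper, could look like:

```python
# Hypothetical helper -- the real sitemap rendering in crawler.py may differ.
from typing import Iterable


def write_sitemap(urls: Iterable[str], path: str = "sitemap.html") -> None:
    """Write the discovered URLs to a simple HTML list."""
    items = "\n".join(
        '  <li><a href="{0}">{0}</a></li>'.format(u) for u in sorted(set(urls))
    )
    html = "<html>\n<body>\n<ul>\n{}\n</ul>\n</body>\n</html>\n".format(items)
    with open(path, "w", encoding="utf-8") as fh:
        fh.write(html)


if __name__ == "__main__":
    write_sitemap(["https://urltocrawl.com", "https://urltocrawl.com/about"])
```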
@@ -1,7 +1,15 @@
 #!/usr/bin/env python
 '''
-Need a docstring.
+Asynchronous web crawler written in Python 3.5+.
+
+This script will respect the site's `robots.txt`, if one exists. If not, all
+URLs discovered will be crawled.
+
+The crawler takes a total of two arguments (concurrency is optional):
+
+    url: the base URL to begin the crawl from.
+    concurrency: the maximum number of pages which may be crawled concurrently.
 '''
 
 import argparse
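The expanded docstring promises that `robots.txt` is respected when one exists and that everything discovered is crawled otherwise. The diff does not show how that is implemented; one stdlib-only way to express the same rule, shown purely as an illustration rather than as the crawler's actual mechanism, is `urllib.robotparser`:

```python
# Illustrative robots.txt check -- crawler.py's own implementation is not shown in this diff.
from urllib import robotparser
from urllib.parse import urljoin


def allowed_to_crawl(base_url: str, target_url: str, user_agent: str = "*") -> bool:
    """Return True if robots.txt permits crawling target_url.

    A missing robots.txt (HTTP 404) is treated as allow-all by
    RobotFileParser, matching the behaviour described in the docstring.
    """
    parser = robotparser.RobotFileParser()
    parser.set_url(urljoin(base_url, "/robots.txt"))
    parser.read()
    return parser.can_fetch(user_agent, target_url)


if __name__ == "__main__":
    print(allowed_to_crawl("https://example.com", "https://example.com/private/page"))
```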
@@ -76,7 +84,7 @@ def main():
 if __name__ == '__main__':
 
     parser = argparse.ArgumentParser(description='Recursive web crawler')
-    parser.add_argument("-u", "--url", required=True, help="Base url to crawl")
+    parser.add_argument("-u", "--url", required=True, help="Initial url to crawl")
     parser.add_argument("-c", "--concurrency", required=False, type=int,
                         default=100, help="Max number of pages to crawl concurrently")
     args = parser.parse_args()
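The `--concurrency` flag caps how many pages are fetched at once (default 100). The enforcement code sits outside this hunk; a common pattern for that kind of cap, sketched here with a stand-in `fetch` coroutine rather than the project's real one, is an `asyncio.Semaphore`:

```python
# Sketch of a concurrency cap -- crawler.py's own implementation is not shown in this diff.
import asyncio


async def fetch(url: str) -> str:
    """Stand-in for a real page download; replace with an actual HTTP call."""
    await asyncio.sleep(0.1)
    return "<html>stub for {}</html>".format(url)


async def bounded_fetch(semaphore: asyncio.Semaphore, url: str) -> str:
    # The semaphore only lets `concurrency` coroutines past this point at once.
    async with semaphore:
        return await fetch(url)


async def crawl(urls, concurrency: int = 100):
    semaphore = asyncio.Semaphore(concurrency)
    tasks = [bounded_fetch(semaphore, url) for url in urls]
    return await asyncio.gather(*tasks)


if __name__ == "__main__":
    pages = asyncio.run(
        crawl(["https://example.com/{}".format(i) for i in range(5)], concurrency=2)
    )
    print(len(pages), "pages fetched")
```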