Compare commits

4 Commits

f1855f5add...5262c23281

| SHA1 | Author | Date |
|---|---|---|
| 5262c23281 | | |
| 524f6a45cd | | |
| a926090bed | | |
| 91cd988f52 | | |
README.md (13 lines changed)

@@ -2,7 +2,7 @@
 
 ## Requirements
 
-This crawler was written in 3.7.0 to take advantage of the latest `asyncio` features.
+This crawler requires at least Python 3.5 in order to utilise the async/await keywords from `asyncio`.
 
 Install required modules:
 
@@ -13,9 +13,16 @@ pip install -r requirements.txt
 Run:
 
 ```bash
-python crawler.py -u https://urltocrawl.com
+python crawler.py -u https://urltocrawl.com [-c 100]
 ```
 
+Flags:
+
+- -u/--url https://url.com
+- The base URL is required.
+- -c/--concurrency 100
+- Specifying concurrency value is optional (defaults to 100).
+
 ## Results
 
-The resulting sitemap will be output in the root of this directory as `sitemap.html`
+The resulting sitemap will be output to the root of this directory as `sitemap.html`
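For readers unfamiliar with the Python 3.5 floor mentioned above: it comes from the `async`/`await` syntax that `asyncio` and `aiohttp` rely on. A minimal illustrative fetch in that style (the URL and function name are placeholders, not part of this repository):

```python
import asyncio
import aiohttp  # pinned in requirements.txt

async def fetch(url):
    # async def / await is the Python 3.5+ syntax the README refers to
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            return await response.text()

# drive the coroutine with an explicit event loop (asyncio.run() is the 3.7+ equivalent)
loop = asyncio.get_event_loop()
html = loop.run_until_complete(fetch('https://example.com'))
print(len(html))
```

Anything older than 3.5 fails at the `async def` line with a SyntaxError.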
crawler.py

@@ -1,7 +1,15 @@
 #!/usr/bin/env python
 '''
-Need a docstring.
+Asynchronous web crawler written in Python 3.5+.
+
+This script will respect the site's `robots.txt`, if one exists. If not, all
+URLs discovered will be crawled.
+
+The crawler takes a total of two arguments (concurrency is optional):
+
+url: the base URL to begin the crawl from.
+concurrency: the maximum number of pages which may be crawled concurrently.
 '''
 
 import argparse
@@ -51,7 +59,8 @@ def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
 
 def main():
     '''
-    docstring
+    Main function, responsible for prepping and running the crawler and
+    rendering the sitemap.
     '''
     starttime = datetime.now()
 
@@ -60,11 +69,13 @@ def main():
     # create a crawler
     async_crawler = AsyncCrawler(baseurl=baseurl, robots=robots, concurrency=args.concurrency)
 
+    # create a task to run the crawler, run the loop and then gather the results.
     task = asyncio.Task(async_crawler.main())
     loop = asyncio.get_event_loop()
     loop.run_until_complete(task)
     loop.close()
-    results = task.result()
+    results = sorted(task.result())
 
     runtime = int((datetime.now() - starttime).total_seconds())
 
     render_sitemap(base_url=baseurl, crawled_urls=results, runtime=runtime)
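For context on the hunk above: `asyncio.Task(...)` plus `run_until_complete` is the pre-3.7 way of driving a coroutine to completion, which matches the README's 3.5 floor. A self-contained sketch of the same pattern, with a stub coroutine standing in for `AsyncCrawler.main()` (the stub and its URLs are made up for illustration):

```python
import asyncio

async def crawl_stub():
    # stand-in for AsyncCrawler.main(): pretend these URLs were discovered
    return {'https://example.com/b', 'https://example.com/a'}

loop = asyncio.get_event_loop()
task = loop.create_task(crawl_stub())   # equivalent to asyncio.Task(...) in the hunk above
loop.run_until_complete(task)
loop.close()

# sorting the returned set, as the change above does, gives the sitemap a stable order
print(sorted(task.result()))            # ['https://example.com/a', 'https://example.com/b']
```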
@@ -73,7 +84,7 @@ def main():
 if __name__ == '__main__':
 
     parser = argparse.ArgumentParser(description='Recursive web crawler')
-    parser.add_argument("-u", "--url", required=True, help="Base url to crawl")
+    parser.add_argument("-u", "--url", required=True, help="Initial url to crawl")
     parser.add_argument("-c", "--concurrency", required=False, type=int,
                         default=100, help="Max number of pages to crawl concurrently")
     args = parser.parse_args()
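The only change in this hunk is the `-u` help string. For anyone skimming, here is how the two options behave when parsed, reproduced as a standalone snippet from the arguments above (the sample URL is illustrative):

```python
import argparse

# mirrors the parser defined in the hunk above
parser = argparse.ArgumentParser(description='Recursive web crawler')
parser.add_argument("-u", "--url", required=True, help="Initial url to crawl")
parser.add_argument("-c", "--concurrency", required=False, type=int,
                    default=100, help="Max number of pages to crawl concurrently")

# -u is mandatory; -c falls back to 100 when omitted
args = parser.parse_args(['-u', 'https://urltocrawl.com'])
print(args.url, args.concurrency)  # https://urltocrawl.com 100
```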
requirements.txt

@@ -1,8 +1,12 @@
+aiohttp==3.4.4
+async-timeout==3.0.0
+attrs==18.2.0
 beautifulsoup4==4.6.3
 bs4==0.0.1
-certifi==2018.8.13
 chardet==3.0.4
 idna==2.7
 Jinja2==2.10
 lxml==4.2.4
 MarkupSafe==1.0
+multidict==4.4.0
+yarl==1.2.6
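The added pins appear to be aiohttp 3.4.x together with its runtime dependencies (`attrs`, `async-timeout`, `multidict`, `yarl`), while `certifi` is dropped. An optional sanity check after `pip install -r requirements.txt`; the import names used here are assumptions about the installed distributions:

```python
# confirm the newly pinned packages import, and report their versions
import importlib

pins = [('aiohttp', 'aiohttp'), ('async-timeout', 'async_timeout'),
        ('attrs', 'attr'), ('multidict', 'multidict'), ('yarl', 'yarl')]
for dist, module in pins:
    mod = importlib.import_module(module)
    print(dist, getattr(mod, '__version__', 'unknown'))
```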
@@ -119,6 +119,7 @@ class AsyncCrawler(object):
         # add the URLs to a set to be returned.
         if urls:
             for url in urls:
+                print('Found: {0}'.format(url))
                 all_urls.add(url)
 
         return all_urls
@@ -133,6 +134,7 @@ class AsyncCrawler(object):
         to_crawl = []
         to_crawl.append(self.baseurl)
 
+        print('Crawling: {0}'.format(self.baseurl))
         while len(to_crawl) > 0:
             discovered_urls = await self.run(urls=to_crawl)
             # empty the to_crawl list and then add all newly discovered URLs for
@@ -169,6 +171,7 @@ class RobotsTxt(object):
         try:
             response = urllib.request.urlopen(request, timeout=5)
         except urllib.error.HTTPError:
+            # if robots.txt doesn't exist then allow all URLs to be crawled.
             robots.allow_all = True
         else:
             data = response.read()
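The new comment documents the existing fallback: a failed `robots.txt` request means nothing is disallowed. A self-contained sketch of that pattern, assuming a hypothetical `fetch_robots_txt` helper rather than the repo's `RobotsTxt` class:

```python
import urllib.error
import urllib.request

def fetch_robots_txt(base_url):
    # returns the robots.txt body, or None when the site has none (crawl everything)
    request = urllib.request.Request(base_url.rstrip('/') + '/robots.txt')
    try:
        response = urllib.request.urlopen(request, timeout=5)
    except urllib.error.HTTPError:
        return None
    return response.read().decode('utf-8', errors='replace')

print(fetch_robots_txt('https://www.python.org') is not None)
```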