Compare commits: 679b1b7b53...asyncio
3 Commits

| Author | SHA1 | Date |
|---|---|---|
| | 5f7d66912f | |
| | d4cd93e3d4 | |
| | f5f6afd1a4 | |
@@ -13,7 +13,7 @@ pip install -r requirements.txt
 Run:
 
 ```bash
-python crawler.py -u https://urltocrawl.com [-c 100]
+python async_crawler.py -u https://urltocrawl.com [-c 100]
 ```
 
 Flags:
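The usage line documents two flags: `-u` for the URL to crawl and an optional `-c` for concurrency. The flags list itself falls outside this hunk, so the long option names and the default below are assumptions; a minimal argparse sketch of how the entry point might parse them:

```python
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description='Asynchronous web crawler')
    parser.add_argument('-u', '--url', required=True,
                        help='root URL to crawl, e.g. https://urltocrawl.com')
    # '[-c 100]' in the usage string suggests -c is optional; the default
    # of 10 is a guess taken from the removed test fixture below.
    parser.add_argument('-c', '--concurrency', type=int, default=10,
                        help='maximum number of concurrent requests')
    return parser.parse_args()

if __name__ == '__main__':
    args = parse_args()
    print('crawling {} with concurrency {}'.format(args.url, args.concurrency))
```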
test/files/find_all_urls.html (new file, 10 lines)
@@ -0,0 +1,10 @@
+<html>
+<body>
+<p>
+<ul>
+<li><a href="http://eu.httpbin.org/a/">http://eu.httpbin.org/a/</a></li>
+<li><a href="http://eu.httpbin.org/b/">http://eu.httpbin.org/b/</a></li>
+<li><a href="http://eu.httpbin.org/c/">http://eu.httpbin.org/c/</a></li>
+</ul>
+</body>
+</html>
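This fixture gives `find_all_urls` three anchor tags to extract; the removed test below expected only `/b/` and `/c/` back, since `/a/` was already in the crawled set. The real helper lives in `utils.helpers` and its implementation is not part of this diff; a self-contained sketch of href extraction with the standard-library `html.parser`, assuming already-crawled URLs are filtered separately:

```python
from html.parser import HTMLParser

class LinkExtractor(HTMLParser):
    """Collect the href attribute of every <a> tag in a document."""

    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        # attrs arrives as a list of (name, value) pairs
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.links.append(value)

def find_all_urls(source):
    parser = LinkExtractor()
    parser.feed(source)
    return parser.links

with open('test/files/find_all_urls.html') as f:
    print(find_all_urls(f.read()))
# ['http://eu.httpbin.org/a/', 'http://eu.httpbin.org/b/', 'http://eu.httpbin.org/c/']
```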
@@ -2,37 +2,18 @@
 
 import unittest
 from unittest import mock
-from utils.helpers import AsyncCrawler, RobotsTxt, standardise_url
+from utils.helpers import RobotsTxt, standardise_url
 
 
-class TestAsyncCrawler(unittest.TestCase):
-
-    base_url = 'http://eu.httpbin.org'
-    concurrency = 10
-    testcrawler = AsyncCrawler(baseurl=base_url, concurrency=concurrency)
-    expected_urls = ['http://eu.httpbin.org/b/', 'http://eu.httpbin.org/c/']
-    crawled = set()
-    crawled.add('https://eu.httpbin.org/a/')
-
-    @mock.patch('utils.helpers.AsyncCrawler.validate_url', response=True)
-    def test_find_all_urls(self, validate_url):
-
-        with open('test/files/find_all_urls.html', 'r') as f:
-            source = f.read()
-
-        urls = self.testcrawler.find_all_urls(source=source)
-        self.assertEqual(urls, self.expected_urls)
-
-
 class TestRobots(unittest.TestCase):
 
-    base_url = 'http://eu.httpbin.org'
+    rooturl = 'http://eu.httpbin.org'
     no_robots = 'https://www.simonweald.com'
 
     test_paths = (('/', True), ('/deny', False))
 
-    robots = RobotsTxt(base_url=base_url)
-    norobots = RobotsTxt(base_url=no_robots)
+    robots = RobotsTxt(rooturl=rooturl)
+    norobots = RobotsTxt(rooturl=no_robots)
 
     def test_robots_txt_deny(self):
         '''
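One detail in the removed test: `mock.patch(..., response=True)` does not set the patched method's return value. Extra keyword arguments to `patch` become attributes on the replacement mock, so the test actually relied on the MagicMock's default (truthy) return. The conventional way to pin the result is `return_value`; a small sketch, where `Crawler` and `keep_valid` are hypothetical stand-ins rather than the repo's classes:

```python
import unittest
from unittest import mock

class Crawler:
    def validate_url(self, url):
        raise NotImplementedError  # real network check elided

    def keep_valid(self, urls):
        return [u for u in urls if self.validate_url(u)]

class TestPatching(unittest.TestCase):
    # return_value=True makes the patched method actually return True,
    # whereas response=True would only set a .response attribute on the mock.
    @mock.patch.object(Crawler, 'validate_url', return_value=True)
    def test_keep_valid(self, validate_url):
        urls = ['http://eu.httpbin.org/a/']
        self.assertEqual(Crawler().keep_valid(urls), urls)
        validate_url.assert_called_once_with('http://eu.httpbin.org/a/')

if __name__ == '__main__':
    unittest.main()
```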
@@ -52,9 +33,9 @@ class TestRobots(unittest.TestCase):
 
 class TestUrls(unittest.TestCase):
 
-    base_url = 'http://eu.httpbin.org'
+    rooturl = 'http://eu.httpbin.org'
 
-    base_url_list = (('eu.httpbin.org', 'http://eu.httpbin.org'),
+    rooturl_list = (('eu.httpbin.org', 'http://eu.httpbin.org'),
                     ('http://eu.httpbin.org/', 'http://eu.httpbin.org'),
                     ('https://eu.httpbin.org', 'https://eu.httpbin.org'))
 
@@ -64,12 +45,12 @@ class TestUrls(unittest.TestCase):
                      ('http://eu.httpbin.org/index.html?foo=bar', 'http://eu.httpbin.org/index.html'),
                      ('http://eu.httpbin.org/index.html#anchor', 'http://eu.httpbin.org/index.html'))
 
-    def test_standardise_base_url(self):
+    def test_standardise_rooturl(self):
         '''
         Tests whether a base URL can be standardised to the format
         proto://[sub].domain.tld.
         '''
-        for url, target in self.base_url_list:
+        for url, target in self.rooturl_list:
             result = standardise_url(url)
             self.assertEqual(result, target)
 
@@ -78,7 +59,7 @@ class TestUrls(unittest.TestCase):
         Ensure that fragments/anchors etc are stripped.
         '''
         for url, target in self.urls_to_clean:
-            result = standardise_url(url, base_url=self.base_url)
+            result = standardise_url(url, rooturl=self.rooturl)
             self.assertEqual(result, target)
 
 
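Taken together, the `rooturl_list` and `urls_to_clean` tuples pin down the contract of `standardise_url`: bare hosts gain an `http://` scheme, and trailing slashes, query strings, and fragments are stripped. The real implementation in `utils.helpers` is not shown in this diff; one sketch that satisfies the visible test data, with the relative-link handling for `rooturl` being an assumption:

```python
from urllib.parse import urljoin, urlsplit

def standardise_url(url, rooturl=None):
    """Reduce a URL to proto://host[/path] with no query or fragment."""
    # Assumption: rooturl exists to resolve relative hrefs; only absolute
    # URLs appear in the visible test data.
    if rooturl:
        url = urljoin(rooturl + '/', url)
    # Bare hosts such as 'eu.httpbin.org' default to http://.
    if '//' not in url:
        url = 'http://' + url
    parts = urlsplit(url)
    return '{}://{}{}'.format(parts.scheme, parts.netloc, parts.path.rstrip('/'))

assert standardise_url('eu.httpbin.org') == 'http://eu.httpbin.org'
assert standardise_url('http://eu.httpbin.org/index.html#anchor') == 'http://eu.httpbin.org/index.html'
```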