Compare commits

2 Commits: eb2395d461 ... 51f988e1bc

| Author | SHA1 | Date |
|---|---|---|
|  | 51f988e1bc |  |
|  | 73c21e5bd3 |  |

```diff
@@ -24,6 +24,15 @@ from utils.helpers import RobotsTxt, AsyncCrawler, standardise_url
 def sanity_checks(url=None):
     '''
     Runs some basic sanity checks before the crawler is initialised.
+
+    Accepts:
+        url: the root URL to be crawled.
+
+    Returns:
+        baseurl: a validated and cleaned version of the initial URL.
+                 (type=string)
+        robots: an object which allows us to query whether a site may be crawled.
+                (type=RobotsTxt)
     '''
     # ensure we have a sensible URL to work with
     baseurl = standardise_url(url=url)
```

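Note: the hunk above only adds documentation; the rest of the function body is outside this diff. A minimal sketch of how the documented contract could be completed and exercised, assuming `sanity_checks` returns the `(baseurl, robots)` pair the docstring describes and that `RobotsTxt` is constructed and queried the way the new tests below do:

```python
from utils.helpers import RobotsTxt, standardise_url

def sanity_checks(url=None):
    # Sketch only: the real body continues beyond this hunk.
    baseurl = standardise_url(url=url)
    robots = RobotsTxt(base_url=baseurl)  # construction assumed, mirroring the new tests
    return baseurl, robots

# Hypothetical caller, following the docstring's Accepts/Returns contract.
baseurl, robots = sanity_checks(url='eu.httpbin.org')
print(baseurl)                # e.g. 'http://eu.httpbin.org'
print(robots.check(url='/'))  # True if the root path may be crawled
```
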
```diff
@@ -41,6 +50,11 @@ def sanity_checks(url=None):
 def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
     '''
     Renders the sitemap to an HTML file.
+
+    Accepts:
+        base_url:
+        crawled_urls:
+        runtime:
     '''
     urlcount = len(crawled_urls)
     sorted_urls = sorted(crawled_urls)
```

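Only the opening lines of `render_sitemap` are visible here, and the new docstring leaves the parameter descriptions blank. Purely as illustration, one possible completion that turns the sorted URL list into an HTML file; the real template and output filename are not shown in this diff and may well differ:

```python
def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
    # Hypothetical completion of the function shown in the hunk above.
    urlcount = len(crawled_urls)
    sorted_urls = sorted(crawled_urls)
    rows = '\n'.join('<li><a href="{0}">{0}</a></li>'.format(u) for u in sorted_urls)
    html = ('<html><body><h1>Sitemap for {0}</h1>'
            '<p>{1} URLs crawled in {2:.2f}s</p>'
            '<ul>\n{3}\n</ul></body></html>').format(base_url, urlcount, runtime, rows)
    with open('sitemap.html', 'w') as f:  # output filename is an assumption
        f.write(html)
```
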
```diff
@@ -1,34 +1,54 @@
 #!/usr/bin/env python
 
 import unittest
-from utils.helpers import (sanitise_url)
+from utils.helpers import (RobotsTxt, standardise_url)
 
 
+class TestRobots(unittest.TestCase):
+
+    base_url = 'http://eu.httpbin.org'
+
+    test_paths = (('/', True), ('/deny', False))
+
+    robots = RobotsTxt(base_url=base_url)
+
+    def test_robots_txt_deny(self):
+        '''
+        Asserts result is True or False.
+        '''
+        for path, allowed in self.test_paths:
+            result = self.robots.check(url=path)
+            self.assertIs(result, allowed)
+
+
 class TestUrls(unittest.TestCase):
 
+    base_url = 'http://eu.httpbin.org'
+
     base_url_list = (('eu.httpbin.org', 'http://eu.httpbin.org'),
-                     ('www.simonweald.com', 'http://www.simonweald.com'),
-                     ('http://www.github.com/', 'http://www.github.com'),
-                     ('https://www.github.com', 'https://www.github.com'))
+                     ('http://eu.httpbin.org/', 'http://eu.httpbin.org'),
+                     ('https://eu.httpbin.org', 'https://eu.httpbin.org'))
 
-    urls_to_clean = (('https://www.github.com/', 'https://www.github.com/'),
-                     ('https://github.com/?foo=bar', 'https://github.com/'),
-                     ('https://github.com/#anchor', 'https://github.com/'))
+    urls_to_clean = (('http://eu.httpbin.org', 'http://eu.httpbin.org'),
+                     ('http://eu.httpbin.org/some/path/', 'http://eu.httpbin.org/some/path/'),
+                     ('http://eu.httpbin.org/index.html', 'http://eu.httpbin.org/index.html'),
+                     ('http://eu.httpbin.org/index.html?foo=bar', 'http://eu.httpbin.org/index.html'),
+                     ('http://eu.httpbin.org/index.html#anchor', 'http://eu.httpbin.org/index.html'))
 
-    def test_sanitise_base_url(self):
+    def test_standardise_base_url(self):
         '''
-        Tests whether a URL's protocol can be discovered if not provided.
+        Tests whether a base URL can be standardised to the format
+        proto://[sub].domain.tld.
         '''
         for url, target in self.base_url_list:
-            result = sanitise_url(url, base_url=True)
+            result = standardise_url(url)
             self.assertEqual(result, target)
 
-    def test_sanitise_url(self):
+    def test_standardise_url(self):
         '''
-        Tests whether a URL's protocol can be discovered if not provided.
+        Ensure that fragments/anchors etc are stripped.
         '''
         for url, target in self.urls_to_clean:
-            result = sanitise_url(url)
+            result = standardise_url(url, base_url=self.base_url)
             self.assertEqual(result, target)
 
-
```

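The rewritten tests pin down the behaviour expected of `standardise_url`, whose implementation lives in `utils.helpers` and is not part of this diff. A minimal sketch that would satisfy the new cases, inferred from the test data only (with no base_url it adds a scheme and drops a trailing slash; with a base_url it strips query strings and fragments):

```python
from urllib.parse import urlparse, urlunparse

def standardise_url(url, base_url=None):
    # Sketch only: behaviour inferred from the test cases above, not the
    # project's actual implementation (which might also resolve relative links).
    if base_url is None:
        # Base-URL mode: ensure a scheme and drop any trailing slash.
        if '://' not in url:
            url = 'http://' + url
        return url.rstrip('/')
    # Crawl-URL mode: keep scheme, host and path; strip query and fragment.
    parts = urlparse(url)
    return urlunparse((parts.scheme, parts.netloc, parts.path, '', '', ''))
```
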
```diff
@@ -58,7 +58,7 @@ class AsyncCrawler(object):
         async with self.semaphore:
             async with self.client_session.head(url, timeout=5) as head:
                 try:
-                    data = await head.read()
+                    _ = await head.read()
                 except Exception as e:
                     print(e)
                 if 'text/html' in head.headers['Content-Type']:
```

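The only change in this hunk is that the unused body of the HEAD response is bound to `_` instead of `data`. For context, a standalone sketch of the same pattern (bounded concurrency, HEAD request, Content-Type check), assuming `client_session` is an `aiohttp.ClientSession`, which the `async with` / `head()` / `headers` usage suggests; the URL and semaphore size are illustrative:

```python
import asyncio
import aiohttp

async def is_html(url, session, semaphore):
    # Limit concurrency, issue a HEAD request and inspect the Content-Type
    # header. A HEAD response carries no body, so the result of read() is
    # deliberately discarded, as in the hunk above.
    async with semaphore:
        async with session.head(url, timeout=aiohttp.ClientTimeout(total=5)) as head:
            try:
                _ = await head.read()
            except Exception as e:
                print(e)
            return 'text/html' in head.headers.get('Content-Type', '')

async def main():
    semaphore = asyncio.Semaphore(5)
    async with aiohttp.ClientSession() as session:
        print(await is_html('http://eu.httpbin.org/', session, semaphore))

asyncio.run(main())
```
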
```diff
@@ -130,7 +130,7 @@ class AsyncCrawler(object):
         print('Crawling: {0}'.format(self.baseurl))
         while len(to_crawl) > 0:
             discovered_urls = await self.run(urls=to_crawl)
-            # empty toe crawl list and then add all newly discovered URLs for
+            # empty to_crawl list and then add all newly discovered URLs for
             # the next iteration.
             to_crawl.clear()
             to_crawl.extend(discovered_urls)
```

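This hunk only fixes a typo in the comment (`toe crawl` becomes `to_crawl`). The loop it documents crawls in waves: each pass fetches the current frontier, then replaces it with the URLs discovered on that pass until nothing new turns up. A synchronous stand-in for the same pattern, independent of the `AsyncCrawler` internals; the `fetch_links` callable is hypothetical:

```python
def crawl_in_waves(start_urls, fetch_links):
    # fetch_links(url) is assumed to return an iterable of URLs found on that page.
    seen = set(start_urls)
    to_crawl = list(start_urls)
    while len(to_crawl) > 0:
        discovered_urls = []
        for url in to_crawl:
            for link in fetch_links(url):
                if link not in seen:
                    seen.add(link)
                    discovered_urls.append(link)
        # empty to_crawl list and then add all newly discovered URLs for
        # the next iteration.
        to_crawl.clear()
        to_crawl.extend(discovered_urls)
    return seen
```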