Compare commits
3 Commits
a04de7f4de
...
8a1fd39dc4
| Author | SHA1 | Date | |
|---|---|---|---|
| 8a1fd39dc4 | |||
| 79b10798a3 | |||
| fb6b976391 |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -1,3 +1,4 @@
|
|||||||
*~
|
*~
|
||||||
venv/
|
venv/
|
||||||
.vscode/*
|
.vscode/*
|
||||||
|
__pycache__/
|
||||||
|
|||||||
35
test_helpers.py
Normal file
35
test_helpers.py
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
from utils.helpers import (url_validation, standardise_base_url)
|
||||||
|
|
||||||
|
class TestUrls(unittest.TestCase):
    '''
    Tests for the URL helper functions imported from utils.helpers.
    '''

    # Domain used as the crawl root in test_url_validation.
    base_url = "github.com"
    # (input, expected) pairs fed to standardise_base_url. Entries without
    # a scheme make standardise_base_url probe the host, so they need
    # network access to pass.
    base_url_list = (('eu.httpbin.org', 'http://eu.httpbin.org'),
                     ('www.simonweald.com', 'https://www.simonweald.com'),
                     ('http://www.github.com', 'http://www.github.com'))
    # URLs expected to be accepted as local to base_url.
    valid_urls = ["https://www.github.com", "http://www.github.com",
                  "github.com", "/some/url/", "index.html"]

    def test_url_standardisation(self):
        '''
        Tests whether a URL's protocol can be discovered if not provided.
        '''
        for url, target in self.base_url_list:
            # subTest reports every failing URL individually instead of
            # aborting the loop at the first failure.
            with self.subTest(url=url):
                result = standardise_base_url(url)
                self.assertEqual(result, target)

    def test_url_validation(self):
        '''
        Passes when given a valid URL. A valid URL is qualified
        by being local to the domain to be crawled.
        '''
        for url in self.valid_urls:
            # Without subTest a failure only says "False is not true",
            # which does not identify the offending URL.
            with self.subTest(url=url):
                result = url_validation(self.base_url, url)
                self.assertTrue(result)
|
||||||
|
|
||||||
|
|
||||||
|
# Run the unittest test runner when this module is executed as a script.
if __name__ == '__main__':
|
||||||
|
unittest.main()
|
||||||
44
utils/helpers.py
Normal file
44
utils/helpers.py
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
import re
|
||||||
|
import requests
|
||||||
|
|
||||||
|
|
||||||
|
def standardise_base_url(url, timeout=10):
    '''
    Standardise the URL to be scraped to ensure it
    is added to relative URLs in a consistent manner.

    If ``url`` already starts with an http:// or https:// scheme (matched
    case-insensitively) it is kept as given. Otherwise the host is probed
    with a GET request over plain HTTP: if the final URL after redirects
    is HTTPS the https:// form is used, else the http:// form. If the
    request fails for any reason the https:// form is assumed.

    A single trailing '/' is removed from the result.

    url     -- absolute URL, or a bare host such as 'example.com'.
    timeout -- seconds to wait for the probe request before giving up.

    Returns the standardised base URL as a string.
    '''
    # URL schemes are case-insensitive, so 'HTTP://host' must not be
    # mistaken for a scheme-less host and prefixed a second time.
    match_protocol = r'https?://'

    if re.match(match_protocol, url, re.IGNORECASE):
        base_url = url
    else:
        http_url = 'http://{0}'.format(url)
        https_url = 'https://{0}'.format(url)
        # attempt to discover which protocol is being used.
        try:
            # requests never times out unless told to; bound the wait so
            # an unresponsive host cannot hang the caller indefinitely.
            result = requests.get(http_url, timeout=timeout)
        except requests.exceptions.RequestException:
            base_url = https_url
        else:
            # result.url is the final URL after any redirects.
            if result.url.startswith('https'):
                base_url = https_url
            else:
                base_url = http_url

    if base_url.endswith('/'):
        base_url = base_url[:-1]

    return base_url
|
||||||
|
|
||||||
|
|
||||||
|
def get_url_validation(base_url=None, url=None):
|
||||||
|
'''
|
||||||
|
Checks if a URL is valid. Can be absolute or relative.
|
||||||
|
'''
|
||||||
|
|
||||||
|
if url.startswith('/'):
|
||||||
|
full_url = '{0}{1}'.format(base_url, url)
|
||||||
|
if url.startswith(ffbase_url):
|
||||||
|
full_url = url
|
||||||
|
elif url.startswith('/'):
|
||||||
Reference in New Issue
Block a user