Compare commits

...

79 Commits

Author SHA1 Message Date
5f7d66912f add test files 2018-09-19 08:39:05 +01:00
d4cd93e3d4 update docs 2018-09-19 08:38:49 +01:00
f5f6afd1a4 correct tests with new arg names 2018-09-19 08:37:55 +01:00
679b1b7b53 rename all instances of base_url to rooturl, add more documentation 2018-09-18 18:24:15 +01:00
32d7f1e54b add talking points 2018-09-18 18:23:12 +01:00
f6265f18a7 initial test for AsyncCrawler 2018-09-18 18:22:55 +01:00
9a4e9ddfc7 add test for missing robots.txt 2018-09-18 10:53:13 +01:00
51f988e1bc added more tests 2018-09-17 21:44:20 +01:00
73c21e5bd3 small improvements to docs and variables 2018-09-17 21:44:04 +01:00
eb2395d461 minor change to README 2018-09-17 08:11:26 +01:00
c53f62b55d add most changes suggested by pycodestyle 2018-09-16 16:10:38 +01:00
75d3756bbc fix errors discovered by pycyodestyle 2018-09-16 16:04:07 +01:00
5262c23281 add flags to README 2018-09-16 15:58:17 +01:00
524f6a45cd improve documentation 2018-09-16 15:53:47 +01:00
a926090bed update requirements 2018-09-16 15:44:30 +01:00
91cd988f52 more comments and progress output 2018-09-16 15:26:49 +01:00
f1855f5add re-order imports because I'm fussy 2018-09-16 09:06:30 +01:00
336517e84a more documentation and add back some required imports 2018-09-16 09:00:43 +01:00
7bc9fe0679 improved documentation and remove unneeded set 2018-09-16 08:56:44 +01:00
6548f55416 improve documentation 2018-09-15 21:48:50 +01:00
0244435fea remove unecessary imports 2018-09-15 21:38:51 +01:00
d6964672b6 commit of working async crawler 2018-09-15 21:30:02 +01:00
3808f72f73 correct semaphore usage 2018-09-14 16:06:17 +01:00
7ebe4855b8 remove unecessary classes2 2018-09-14 16:02:20 +01:00
db986b0eba async crawler in a mostly-working state 2018-09-14 16:01:12 +01:00
36e1f7693f initial foray into asynchronous crawling 2018-09-12 22:54:12 +01:00
8698c21fda return from WebPage to indicate whether a link was actually crawlable and only actually crawl it if it was 2018-09-12 08:03:08 +01:00
273cf56a3b add some basic tests 2018-09-11 13:42:15 +01:00
1af26f50f2 added a docstring 2018-09-11 13:42:02 +01:00
c40c5cea50 add async info 2018-09-10 21:29:46 +01:00
a6224f9b6a updated readme 2018-09-10 20:56:12 +01:00
b64711973f add new thoughts 2018-09-10 11:58:58 +01:00
9e125dfae0 added comments and docstrings 2018-09-09 22:49:55 +01:00
f16f82fdfb improved completion message 2018-09-09 22:40:42 +01:00
a523154848 display count of crawled/uncrawled URLs whilst running 2018-09-09 22:35:55 +01:00
9e754a5584 improve handling of gzip/deflated data detection 2018-09-09 11:21:46 +01:00
1b005570ee implement gzip compression requests and handling 2018-09-09 10:53:09 +01:00
17fa9f93f9 tick off gzip encoding 2018-09-09 10:52:37 +01:00
1e51e10db2 update with changes 2018-09-09 10:22:18 +01:00
225fd8b3ea update with changes 2018-09-09 10:22:03 +01:00
d686ae0bc4 update with changes 2018-09-09 10:21:45 +01:00
69f5788745 update notes 2018-09-09 10:16:22 +01:00
b5d644a223 various minor improvements to exception handling 2018-09-09 10:16:03 +01:00
6508156aa4 use lxml as the parser and only find links on a page if we've got the source 2018-09-09 10:06:25 +01:00
738ab8e441 adjust robots handling to deal with 404s and enforce a user agent which allows us to initially obtain the user agent 2018-09-09 09:57:16 +01:00
fdd84a8786 manually retrieve robots.txt to ensure we can set the user-agent 2018-09-07 12:40:12 +01:00
ab0ab0a010 add more thoughts 2018-09-07 11:50:53 +01:00
6a1259aa7d update plans to add gzip encoding 2018-09-06 17:33:10 +01:00
164239b343 more thoughts 2018-09-06 17:31:12 +01:00
ce1f2745c9 update thoughts 2018-09-06 17:30:28 +01:00
e70bdc9ca1 update requirements.txt 2018-09-06 17:25:30 +01:00
d1c1e17f4f report runtime of script in generated sitemap 2018-09-06 17:20:59 +01:00
816a727d79 ignore generated file 2018-09-06 17:08:56 +01:00
84ab27a75e render results as HTML 2018-09-06 17:08:26 +01:00
6d9103c154 improved content-type detection 2018-09-06 17:08:12 +01:00
e57a86c60a only attempt to read html 2018-09-06 16:30:11 +01:00
a3ec9451e3 implement parsing of robots.txt 2018-09-05 18:56:20 +01:00
f2c294ebdb added new ideas to implement 2018-09-04 15:40:11 +01:00
1b9b207a28 attempt to remove base url with trailing slash (if discovered) 2018-09-04 13:57:52 +01:00
6abe7d68e0 updated notes 2018-09-04 12:51:59 +01:00
7d919039b6 removed unecessary modules 2018-09-04 10:14:27 +01:00
0726bcccb0 removed original file 2018-09-04 09:21:55 +01:00
05e907ecec too many changes to make a sensible commit message 2018-09-04 09:21:26 +01:00
abc628106d added a docstring to the WebPage object 2018-08-31 19:18:00 +01:00
c436016e0c remove unecessary function 2018-08-31 19:16:08 +01:00
03554fde80 add docstrings 2018-08-31 19:15:35 +01:00
759f965e95 use more explicit names, use urljoin to combine urls 2018-08-31 19:12:58 +01:00
0517e5bc56 crawler now initialises and populates crawled pool with urls it finds 2018-08-31 19:02:21 +01:00
1b18aa83eb corrected some small errors and added runner function 2018-08-31 19:01:35 +01:00
5e0d9fd568 initial commit of crawler skeleton 2018-08-31 18:26:49 +01:00
915def3a5d rework url sanitiser to use urllib modules, move WebPage object to helpers 2018-08-31 18:26:25 +01:00
453331d69d simplified url qualifier 2018-08-29 22:27:26 +01:00
2b812da26a simplify UrlPoolManager to use a set instead of a dict 2018-08-29 21:49:15 +01:00
fb096b4468 add scratchpad for notes 2018-08-28 22:34:05 +01:00
5d94991167 start making the scraper an object 2018-08-28 22:29:36 +01:00
482d23dd4f blank __init__.py 2018-08-28 22:29:11 +01:00
452de87f35 change name of pool management object to be more clear 2018-08-28 22:28:49 +01:00
73cb883151 add a list manager object 2018-08-28 22:28:16 +01:00
5c933fc5c9 initial commit of single-page scraper 2018-08-28 18:29:34 +01:00
12 changed files with 596 additions and 55 deletions

.gitignore (1 line added)

@@ -2,3 +2,4 @@
venv/
.vscode/*
__pycache__/
sitemap.html

README.md

@@ -1 +1,28 @@
# Concurrent web scraper
## Requirements
This crawler requires Python 3.5 or newer in order to use the `async`/`await` syntax with `asyncio`.
Install required modules:
```bash
pip install -r requirements.txt
```
Run:
```bash
python async_crawler.py -u https://urltocrawl.com [-c 100]
```
Flags:
- `-u/--url https://url.com`
- The base URL is required.
- `-c/--concurrency 100`
- Specifying concurrency value is optional (defaults to 100).
## Results
The resulting sitemap will be output to the root of this directory as `sitemap.html`.

async_crawler.py (new file, 108 lines)

@@ -0,0 +1,108 @@
#!/usr/bin/env python
'''
Asynchronous web crawler written in Python 3.5+.
This script will respect the site's `robots.txt`, if one exists. If not, all
URLs discovered will be crawled.
The crawler takes a total of two arguments (concurrency is optional):
url: the root URL to begin the crawl from.
concurrency: the maximum number of pages which may be crawled concurrently.
'''
import argparse
import asyncio
from datetime import datetime
import jinja2
import os
import sys
from utils.helpers import RobotsTxt, AsyncCrawler, standardise_url
def sanity_checks(url=None):
'''
Runs some basic sanity checks before the crawler is initialised.
Accepts:
url: the root URL to be crawled.
Returns:
rooturl: a string containing a validated and cleaned version of the
initial URL.
robots: an object which allows us to query whether a site may be crawled.
'''
# ensure we have a sensible URL to work with
rooturl = standardise_url(url=url)
# get robots.txt
robots = RobotsTxt(rooturl=rooturl)
# fail early if robots denies all crawling
if not robots.check(url=rooturl):
sys.exit("{0} cannot be crawled (denied by robots.txt)".format(
rooturl))
return(rooturl, robots)
def render_sitemap(rooturl=None, crawled_urls=None, runtime=None):
'''
Renders the sitemap to an HTML file.
Accepts:
rooturl: string containing the root URL
crawled_urls: set containing discovered URLs
runtime: int representing run time of AsyncCrawler
'''
urlcount = len(crawled_urls)
sorted_urls = sorted(crawled_urls)
template = jinja2.Environment(
loader=jinja2.FileSystemLoader('templates')
).get_template('sitemap.html.j2')
rendered_html = template.render(rooturl=rooturl, urlcount=urlcount,
urls=sorted_urls, runtime=runtime)
with open('sitemap.html', 'w') as outfile:
outfile.write(rendered_html)
print('Sitemap available at {0}/sitemap.html'.format(os.getcwd()))
def main():
'''
Main function, responsible for prepping and running the crawler and
rendering the sitemap.
'''
starttime = datetime.now()
rooturl, robots = sanity_checks(url=args.url)
# create a crawler
async_crawler = AsyncCrawler(rooturl=rooturl, robots=robots,
concurrency=args.concurrency)
# create a task to run the crawler, run the loop and then gather the
# results.
task = asyncio.Task(async_crawler.main())
loop = asyncio.get_event_loop()
loop.run_until_complete(task)
loop.close()
results = sorted(task.result())
runtime = int((datetime.now() - starttime).total_seconds())
render_sitemap(rooturl=rooturl, crawled_urls=results, runtime=runtime)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Recursive web crawler')
parser.add_argument("-u", "--url", required=True, help="Initial url")
parser.add_argument("-c", "--concurrency", required=False, type=int,
default=100, help="Max pages to crawl concurrently")
args = parser.parse_args()
main()
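The `asyncio.Task` / `run_until_complete` pattern in `main()` targets Python 3.5. For reference, the same flow can be driven with `asyncio.run()` on Python 3.7+; a minimal sketch, assuming only the `AsyncCrawler` interface shown in this diff:
```python
# Sketch only (not part of this diff): drive AsyncCrawler.main() via
# asyncio.run(), which creates the event loop, runs the coroutine to
# completion and closes the loop. Requires Python 3.7+.
import asyncio

from utils.helpers import AsyncCrawler


async def crawl(rooturl, robots, concurrency=100):
    # construct the crawler inside the running loop so its internal
    # BoundedSemaphore binds to the correct loop on older Python versions
    crawler = AsyncCrawler(rooturl=rooturl, robots=robots,
                           concurrency=concurrency)
    # AsyncCrawler.main() returns the set of crawled URLs
    return sorted(await crawler.main())


# results = asyncio.run(crawl(rooturl, robots))
```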

crawler.py (new file, 120 lines)

@@ -0,0 +1,120 @@
#!/usr/bin/env python
'''
Need a docstring.
'''
import argparse
import jinja2
import os
import asyncio
from datetime import datetime
from utils.helpers import (UrlPool, WebPage, RobotsTxt, sanitise_url)
def init_crawler(base_url=None, robots=None):
'''
Initialises the crawler by running the initial URL.
'''
uncrawled_urls, crawled_urls = UrlPool(), UrlPool()
initial_page = WebPage(url=base_url, base_url=base_url, robots=robots)
try:
initial_page.run()
except Exception as e:
print(e)
initial_urls = initial_page.list_urls()
# ensure the base URL isn't crawled again
try:
initial_urls.remove(base_url)
except KeyError:
pass
# also ensure base URL wasn't discovered with a trailing slash on the
# initial page scrape
try:
initial_urls.remove("".join([base_url, '/']))
except KeyError:
pass
# Add the base URL to the crawled pool
crawled_urls.add_to_pool(base_url)
for url in initial_urls:
sanitised_url = sanitise_url(url=url)
if sanitised_url not in crawled_urls.pool:
uncrawled_urls.add_to_pool(sanitised_url)
return(uncrawled_urls, crawled_urls)
def process_pool(base_url=None, uncrawled_urls=None, crawled_urls=None, robots=None):
'''
Iterates over the pool of URLs and adds any discovered URLs.
'''
while uncrawled_urls.pool:
# pop url from pool
new_url = uncrawled_urls.remove_from_pool()
# create a WebPage object for the URL
current_page = WebPage(url=new_url, base_url=base_url, robots=robots)
# default to False so an exception in run() doesn't leave this undefined
succeeded = False
try:
succeeded = current_page.run()
except Exception as e:
print(e)
if succeeded:
_urls = current_page.list_urls()
crawled_urls.add_to_pool(new_url)
for url in _urls:
sanitised_url = sanitise_url(url=url)
if sanitised_url not in crawled_urls.pool:
uncrawled_urls.add_to_pool(url)
print('{0} URLs crawled, {1} remaining'.format(len(crawled_urls.pool),
len(uncrawled_urls.pool)))
def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
'''
Renders the sitemap as an HTML file.
'''
urlcount = len(crawled_urls)
sorted_urls = sorted(crawled_urls)
tmpl = jinja2.Environment(
loader=jinja2.FileSystemLoader('templates')
).get_template('sitemap.html.j2')
rendered_html = tmpl.render(base_url=base_url, urlcount=urlcount, urls=sorted_urls, runtime=runtime)
with open('sitemap.html', 'w') as outfile:
outfile.write(rendered_html)
print('Sitemap available at {0}/sitemap.html'.format(os.getcwd()))
def run(args=None):
'''
needs a docstring.
'''
starttime = datetime.now()
base_url = sanitise_url(args.url, base_url=True)
robots = RobotsTxt(base_url=base_url)
uncrawled_urls, crawled_urls = init_crawler(base_url, robots)
process_pool(base_url, uncrawled_urls, crawled_urls, robots)
runtime = int((datetime.now() - starttime).total_seconds())
render_sitemap(base_url=base_url, crawled_urls=crawled_urls.pool, runtime=runtime)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Recursive web crawler')
parser.add_argument("-u", "--url", required=True, help="Base url to crawl")
args = parser.parse_args()
run(args)
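`crawler.py` leans on a `UrlPool` helper that does not appear in this diff. A hypothetical sketch, consistent with how it is called above and with the earlier "simplify UrlPoolManager to use a set instead of a dict" commit; the real class in the helpers module may differ:
```python
class UrlPool(object):
    '''Minimal set-backed URL pool (illustrative sketch only).'''

    def __init__(self):
        self.pool = set()

    def add_to_pool(self, url):
        # sets ignore duplicates, so re-adding a known URL is a no-op
        self.pool.add(url)

    def remove_from_pool(self):
        # pop() removes and returns an arbitrary URL from the pool
        return self.pool.pop()
```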

notes.md (new file, 26 lines)

@@ -0,0 +1,26 @@
## Thoughts
* ~~strip hashes and everything following (as they're in-page anchors)~~
* ~~strip args~~
* ~~use `pop()` on the set instead of `.remove()`~~
* ~~return false once the set is empty~~
* ~~`WebPage.parse_urls()` needs to compare startswith to base url~~
* ~~ignore any links which aren't to pages~~
* ~~better url checking to get bare domain~~ #wontfix
* ~~remove trailing slash from any discovered url~~
* ~~investigate lxml parser~~
* ~~remove base url from initial urls with and without trailing slash~~
* ~~investigate using [tldextract](https://github.com/john-kurkowski/tldextract) to match urls~~ #wontfix
* ~~implement parsing of [robots.txt](http://docs.w3cub.com/python~3.6/library/urllib.robotparser/)~~
* ~~investigate [gzip encoding](https://stackoverflow.com/questions/36383227/avoid-downloading-images-using-beautifulsoup-and-urllib-request)~~
* ~~implement some kind of progress display~~
* async
* better exception handling
* randomise output filename
### talking points
- token bucket algo to enforce n requests per second (see the sketch after this list)
- read up on bucket algo types
- re-structuring AsyncCrawler to be more testable
- use exponential backoff algo?
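None of these talking points are implemented in this compare. Purely as an illustration of the first one, a rough asyncio token bucket that would cap a crawler at roughly `rate` requests per second:
```python
# Illustrative sketch only: a token bucket rate limiter for asyncio code.
import asyncio
import time


class TokenBucket(object):
    '''Allows roughly rate acquisitions per second, bursts up to capacity.'''

    def __init__(self, rate, capacity):
        self.rate = rate          # tokens refilled per second
        self.capacity = capacity  # maximum burst size
        self.tokens = capacity
        self.updated = time.monotonic()

    async def acquire(self):
        while True:
            now = time.monotonic()
            # top up with tokens earned since the last check, capped at capacity
            self.tokens = min(self.capacity,
                              self.tokens + (now - self.updated) * self.rate)
            self.updated = now
            if self.tokens >= 1:
                self.tokens -= 1
                return
            # not enough tokens yet; wait roughly long enough for one
            await asyncio.sleep(1 / self.rate)
```
A crawler would `await bucket.acquire()` before each request, for example at the top of `AsyncCrawler.get_source()`.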

requirements.txt

@@ -1,7 +1,12 @@
aiohttp==3.4.4
async-timeout==3.0.0
attrs==18.2.0
beautifulsoup4==4.6.3
bs4==0.0.1
certifi==2018.8.13
chardet==3.0.4
idna==2.7
requests==2.19.1
urllib3==1.23
Jinja2==2.10
lxml==4.2.4
MarkupSafe==1.0
multidict==4.4.0
yarl==1.2.6


@@ -1 +0,0 @@
#!/usr/bin/env python

templates/sitemap.html.j2 (new file, 14 lines)

@@ -0,0 +1,14 @@
<html>
<head>
<title>Sitemap for {{ base_url }}</title>
</head>
<body>
<p>
Crawled {{ urlcount }} URLs on {{ rooturl }} in ~{{ runtime }} seconds.
<ul>
{% for url in urls %}
<li><a href="{{ url }}">{{ url }}</a></li>
{% endfor %}
</ul>
</body>
</html>


@@ -0,0 +1,10 @@
<html>
<body>
<p>
<ul>
<li><a href="http://eu.httpbin.org/a/">http://eu.httpbin.org/a/</a></li>
<li><a href="http://eu.httpbin.org/b/">http://eu.httpbin.org/b/</a></li>
<li><a href="http://eu.httpbin.org/c/">http://eu.httpbin.org/c/</a></li>
</ul>
</body>
</html>


@@ -1,37 +1,66 @@
#!/usr/bin/env python
import unittest
from utils.helpers import (clean_base_url)
from unittest import mock
from utils.helpers import RobotsTxt, standardise_url
class TestRobots(unittest.TestCase):
rooturl = 'http://eu.httpbin.org'
no_robots = 'https://www.simonweald.com'
test_paths = (('/', True), ('/deny', False))
robots = RobotsTxt(rooturl=rooturl)
norobots = RobotsTxt(rooturl=no_robots)
def test_robots_txt_deny(self):
'''
Asserts result is True or False.
'''
for path, allowed in self.test_paths:
result = self.robots.check(url=path)
self.assertIs(result, allowed)
def test_no_robots_txt(self):
'''
Ensure we can crawl if robots.txt isn't present.
'''
result = self.norobots.check(url='/')
self.assertTrue(result)
class TestUrls(unittest.TestCase):
base_url = "github.com"
rooturl = 'http://eu.httpbin.org'
base_url_list = (('eu.httpbin.org', 'http://eu.httpbin.org'),
('www.simonweald.com', 'http://www.simonweald.com'),
('http://www.github.com/', 'http://www.github.com'),
('https://www.github.com', 'https://www.github.com'))
rooturl_list = (('eu.httpbin.org', 'http://eu.httpbin.org'),
('http://eu.httpbin.org/', 'http://eu.httpbin.org'),
('https://eu.httpbin.org', 'https://eu.httpbin.org'))
valid_urls = ["https://www.github.com", "http://www.github.com",
"github.com", "/some/url/", "index.html"]
urls_to_clean = (('http://eu.httpbin.org', 'http://eu.httpbin.org'),
('http://eu.httpbin.org/some/path/', 'http://eu.httpbin.org/some/path/'),
('http://eu.httpbin.org/index.html','http://eu.httpbin.org/index.html'),
('http://eu.httpbin.org/index.html?foo=bar', 'http://eu.httpbin.org/index.html'),
('http://eu.httpbin.org/index.html#anchor', 'http://eu.httpbin.org/index.html'))
def test_clean_base_url(self):
def test_standardise_rooturl(self):
'''
Tests whether a URL's protocol can be discovered if not provided.
Tests whether a base URL can be standardised to the format
proto://[sub].domain.tld.
'''
for url, target in self.base_url_list:
result = clean_base_url(url)
for url, target in self.rooturl_list:
result = standardise_url(url)
self.assertEqual(result, target)
# def test_url_validation(self):
# '''
# Passes when given a valid URL. A valid URL is qualified
# by being local to the domain to be crawled.
# '''
# for url in self.valid_urls:
# result = url_validation(self.base_url, url)
# self.assertTrue(result)
def test_standardise_url(self):
'''
Ensure that fragments/anchors etc are stripped.
'''
for url, target in self.urls_to_clean:
result = standardise_url(url, rooturl=self.rooturl)
self.assertEqual(result, target)
if __name__ == '__main__':
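The test module above imports `unittest.mock` but does not use it yet, so the `RobotsTxt` tests hit eu.httpbin.org and www.simonweald.com over the network (hence the "re-structuring AsyncCrawler to be more testable" talking point). A hedged sketch of how the unused import could remove that dependency, assuming `RobotsTxt` fetches robots.txt via `urllib.request.urlopen` as shown in the helpers diff further down:
```python
# Sketch only: patch urlopen so the robots.txt tests run without network
# access. Assumes RobotsTxt fetches robots.txt via urllib.request.urlopen.
import io
import unittest
from unittest import mock

from utils.helpers import RobotsTxt

FAKE_ROBOTS = b"User-agent: *\nDisallow: /deny\n"


class TestRobotsOffline(unittest.TestCase):
    @mock.patch('urllib.request.urlopen')
    def test_deny_path(self, mock_urlopen):
        # the fake response only needs a read() method returning bytes
        mock_urlopen.return_value = io.BytesIO(FAKE_ROBOTS)
        robots = RobotsTxt(rooturl='http://eu.httpbin.org')
        self.assertFalse(robots.check(url='/deny'))
        self.assertTrue(robots.check(url='/'))
```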

utils/__init__.py (new file, empty)

utils/helpers.py

@@ -3,36 +3,238 @@
Utilities to provide various misc functions.
'''
import aiohttp
import asyncio
from bs4 import BeautifulSoup
import urllib.error
from urllib.parse import urljoin, urlsplit
import urllib.request
import urllib.robotparser
def clean_base_url(url):
'''
Standardise the URL to be scraped to ensure it
is added to relative URLs in a consistent manner.
'''
protocol = 'http://'
if url.startswith('http'):
base_url = url
class AsyncCrawler(object):
'''
A concurrent recursive web crawler.
A recursive web crawler which finds all URLs local to the domain specified
in the `rooturl` argument.
Arguments:
rooturl: Root domain to begin crawling.
robots: RobotsTxt object for the rooturl.
concurrency: number of concurrent pages to crawl.
Returns:
All discovered pages in a set.
'''
def __init__(self, rooturl=None, robots=None, concurrency=None):
self.rooturl = rooturl
self.robots = robots
self.crawled = set()
self.headers = {'Accept-Encoding': 'gzip, deflate',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
self.client_session = None
self.semaphore = asyncio.BoundedSemaphore(concurrency)
async def crawl_url(self, url=None):
'''
Crawls the given URL and finds all new URLs in the given page.
'''
urls = []
source = await self.get_source(url)
if source:
urls = self.find_all_urls(source)
return urls
def validate_url(self, url=None):
'''
Ensures we have a valid URL to crawl and that the site's robots.txt
allows it.
'''
# ensure the URL is in a sane format
url = standardise_url(url=url, rooturl=self.rooturl)
if url and self.robots.check(url=url):
return url
else:
return False
async def get_source(self, url=None):
'''
Obtains the URL's source, provided it is HTML. Usage of semaphores
ensures only a certain number of coroutines can run at any given
time.
'''
async with self.semaphore:
async with self.client_session.head(url, timeout=5) as head:
try:
_ = await head.read()
except Exception:
pass
if 'text/html' in head.headers['Content-Type']:
async with self.client_session.get(url, timeout=5) as resp:
try:
source = await resp.read()
return source
except Exception:
return None
else:
return None
def find_all_urls(self, source=None):
'''
Find all URLs in a page's source. Returns a list of URLs which have
been validated as local to the starting URL.
'''
urls = []
html = BeautifulSoup(source, 'lxml')
hrefs = html.find_all('a', href=True)
# build a list of URLs which are valid and haven't been crawled yet
for href in hrefs:
url = self.validate_url(url=href['href'])
if url and url not in self.crawled:
urls.append(url)
return urls
async def run(self, urls=None):
'''
Crawls a batch of URLs of any size (resource usage is bounded by a
semaphore of size `concurrency`). Returns a set of URLs to be added
to the list of URLs which need to be crawled (find_all_urls only
returns unseen URLs).
'''
tasks = []
all_urls = set()
for url in urls:
# mark the URL as seen.
self.crawled.add(url)
# create a task to crawl the URL.
tasks.append(self.crawl_url(url))
# wait for all tasks to complete.
for task in asyncio.as_completed(tasks):
urls = None
try:
# try getting all tasks as completed.
urls = await task
except Exception:
# skip until all tasks have completed.
pass
# add the URLs to a set to be returned.
if urls:
for url in urls:
all_urls.add(url)
return all_urls
async def main(self):
'''
Runs the crawl in batches of URLs. Once complete, returns the set of all
crawled URLs.
'''
self.client_session = aiohttp.ClientSession(headers=self.headers)
to_crawl = []
# add the root URL to initialise the crawler.
to_crawl.append(self.rooturl)
print('Crawling: {0}'.format(self.rooturl))
while len(to_crawl) > 0:
discovered_urls = await self.run(urls=to_crawl)
# empty to_crawl list and then add all newly discovered URLs for
# the next iteration.
to_crawl.clear()
to_crawl.extend(discovered_urls)
print('{0} URLs crawled'.format(len(self.crawled)))
# close the session once all URLs have been crawled.
await self.client_session.close()
return self.crawled
class RobotsTxt(object):
'''
Retrieve and query robots.txt for a given domain.
Retrieves and parses robots.txt for the given domain. Calling the check()
method returns True or False depending on whether crawling of the given
URL is allowed.
'''
def __init__(self, rooturl=None):
'''
Manually retrieve robots.txt to allow us to set the user-agent (works
around sites which disallow access to robots.txt without a sane
user-agent).
'''
self.rooturl = rooturl
self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
robots_url = urljoin(self.rooturl, 'robots.txt')
request = urllib.request.Request(robots_url, headers=self.headers)
robots = urllib.robotparser.RobotFileParser()
robots.set_url(robots_url)
try:
response = urllib.request.urlopen(request, timeout=5)
except urllib.error.HTTPError:
# if robots.txt doesn't exist then allow all URLs to be crawled.
robots.allow_all = True
else:
data = response.read()
decoded_data = data.decode("utf-8").splitlines()
robots.parse(decoded_data)
self.robots = robots
def check(self, url):
'''
Test if robots allows us to crawl that URL.
'''
return self.robots.can_fetch("*", url)
def standardise_url(url=None, rooturl=None):
'''
If `rooturl` is None then we attempt to standardise the URL to ensure it
can be prepended to relative URLs. If no scheme has been provided then we
default to http as any sane https-only site should 301 redirect http to
https.
If `rooturl` is set, we standardise URLs to strip queries and fragments
(we don't want to scrape in-page anchors etc). Any relative URLs will be
appended to the root url.
Returns a standardised URL as a string.
'''
default_proto = 'http'
delim = '://'
file_extensions = ('/', 'htm', 'html', 'xhtml', 'php', 'asp', 'aspx',
'cfm')
split_url = urlsplit(url)
if not rooturl:
# This will sanitise the initial url provided by the user.
if split_url.scheme and split_url.scheme.startswith('http'):
return "".join([split_url.scheme, delim, split_url.netloc])
elif (split_url.path and not split_url.scheme and not split_url.netloc):
return "".join([default_proto, delim, split_url.path])
else:
# otherwise assume HTTP as any sane site should upgrade
# to HTTPS via a 301 redirect.
base_url = "".join([protocol, url])
# if url.endswith(file_extensions):
# Sanitise discovered URLs. We already expect them in the format
# protocol://rooturl/path
if url.startswith('/'):
return urljoin(rooturl, split_url.path)
elif url.startswith(rooturl):
return "".join([split_url.scheme, delim, split_url.netloc,
split_url.path])
# strip the trailing slash to allow us to append
# relative URLs.
if base_url.endswith('/'):
base_url = base_url[:-1]
return base_url
# def get_url_validation(base_url=None, url=None):
# '''
# Checks if a URL is valid. Can be absolute or relative.
# '''
# if url.startswith('/'):
# full_url = '{0}{1}'.format(base_url, url)
# if url.startswith(base_url):
# full_url = url
# elif url.startswith('/'):
return None
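Pulling the helpers together, a rough usage sketch; the inputs and expected values are lifted from the test cases earlier in this compare, and the `utils.helpers` import path from the two scripts above:
```python
from utils.helpers import RobotsTxt, standardise_url

# Root-URL mode (no rooturl kwarg): a missing scheme defaults to http.
standardise_url(url='eu.httpbin.org')
# -> 'http://eu.httpbin.org'

# Discovered-URL mode: queries and fragments are stripped.
standardise_url(url='http://eu.httpbin.org/index.html#anchor',
                rooturl='http://eu.httpbin.org')
# -> 'http://eu.httpbin.org/index.html'

# Constructing RobotsTxt fetches http://eu.httpbin.org/robots.txt, which
# disallows /deny (as asserted in TestRobots above).
robots = RobotsTxt(rooturl='http://eu.httpbin.org')
robots.check(url='/deny')
# -> False
```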