Compare commits

...

45 Commits

Author SHA1 Message Date
8698c21fda return from WebPage to indicate whether a link was actually crawlable and only actually crawl it if it was 2018-09-12 08:03:08 +01:00
273cf56a3b add some basic tests 2018-09-11 13:42:15 +01:00
1af26f50f2 added a docstring 2018-09-11 13:42:02 +01:00
c40c5cea50 add async info 2018-09-10 21:29:46 +01:00
a6224f9b6a updated readme 2018-09-10 20:56:12 +01:00
b64711973f add new thoughts 2018-09-10 11:58:58 +01:00
9e125dfae0 added comments and docstrings 2018-09-09 22:49:55 +01:00
f16f82fdfb improved completion message 2018-09-09 22:40:42 +01:00
a523154848 display count of crawled/uncrawled URLs whilst running 2018-09-09 22:35:55 +01:00
9e754a5584 improve handling of gzip/deflated data detection 2018-09-09 11:21:46 +01:00
1b005570ee implement gzip compression requests and handling 2018-09-09 10:53:09 +01:00
17fa9f93f9 tick off gzip encoding 2018-09-09 10:52:37 +01:00
1e51e10db2 update with changes 2018-09-09 10:22:18 +01:00
225fd8b3ea update with changes 2018-09-09 10:22:03 +01:00
d686ae0bc4 update with changes 2018-09-09 10:21:45 +01:00
69f5788745 update notes 2018-09-09 10:16:22 +01:00
b5d644a223 various minor improvements to exception handling 2018-09-09 10:16:03 +01:00
6508156aa4 use lxml as the parser and only find links on a page if we've got the source 2018-09-09 10:06:25 +01:00
738ab8e441 adjust robots handling to deal with 404s and enforce a user agent which allows us to initially obtain the user agent 2018-09-09 09:57:16 +01:00
fdd84a8786 manually retrieve robots.txt to ensure we can set the user-agent 2018-09-07 12:40:12 +01:00
ab0ab0a010 add more thoughts 2018-09-07 11:50:53 +01:00
6a1259aa7d update plans to add gzip encoding 2018-09-06 17:33:10 +01:00
164239b343 more thoughts 2018-09-06 17:31:12 +01:00
ce1f2745c9 update thoughts 2018-09-06 17:30:28 +01:00
e70bdc9ca1 update requirements.txt 2018-09-06 17:25:30 +01:00
d1c1e17f4f report runtime of script in generated sitemap 2018-09-06 17:20:59 +01:00
816a727d79 ignore generated file 2018-09-06 17:08:56 +01:00
84ab27a75e render results as HTML 2018-09-06 17:08:26 +01:00
6d9103c154 improved content-type detection 2018-09-06 17:08:12 +01:00
e57a86c60a only attempt to read html 2018-09-06 16:30:11 +01:00
a3ec9451e3 implement parsing of robots.txt 2018-09-05 18:56:20 +01:00
f2c294ebdb added new ideas to implement 2018-09-04 15:40:11 +01:00
1b9b207a28 attempt to remove base url with trailing slash (if discovered) 2018-09-04 13:57:52 +01:00
6abe7d68e0 updated notes 2018-09-04 12:51:59 +01:00
7d919039b6 removed unecessary modules 2018-09-04 10:14:27 +01:00
0726bcccb0 removed original file 2018-09-04 09:21:55 +01:00
05e907ecec too many changes to make a sensible commit message 2018-09-04 09:21:26 +01:00
abc628106d added a docstring to the WebPage object 2018-08-31 19:18:00 +01:00
c436016e0c remove unecessary function 2018-08-31 19:16:08 +01:00
03554fde80 add docstrings 2018-08-31 19:15:35 +01:00
759f965e95 use more explicit names, use urljoin to combine urls 2018-08-31 19:12:58 +01:00
0517e5bc56 crawler now initialises and populates crawled pool with urls it finds 2018-08-31 19:02:21 +01:00
1b18aa83eb corrected some small errors and added runner function 2018-08-31 19:01:35 +01:00
5e0d9fd568 initial commit of crawler skeleton 2018-08-31 18:26:49 +01:00
915def3a5d rework url sanitiser to use urllib modules, move WebPage object to helpers 2018-08-31 18:26:25 +01:00
9 changed files with 403 additions and 144 deletions

.gitignore (vendored): 1 line changed

@@ -2,3 +2,4 @@
venv/
.vscode/*
__pycache__/
sitemap.html

README.md

@@ -1 +1,21 @@
# Concurrent web scraper
## Requirements
This crawler was written in Python 3.7.0 to take advantage of the latest `asyncio` features.
Install required modules:
```bash
pip install -r requirements.txt
```
Run:
```bash
python crawler.py -u https://urltocrawl.com
```
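
The crawler can also be driven from Python rather than the shell; a minimal sketch, assuming the `crawler.py` module added in this compare (the `Namespace` stand-in for parsed CLI arguments is illustrative):
```python
# Hypothetical programmatic invocation: crawler.run() only reads `args.url`,
# so an argparse Namespace can stand in for the parsed CLI arguments.
from argparse import Namespace

from crawler import run

run(Namespace(url='https://urltocrawl.com'))
```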
## Results
The resulting sitemap will be written to the root of this directory as `sitemap.html`.

crawler.py (new file): 120 lines added

@@ -0,0 +1,120 @@
#!/usr/bin/env python
'''
Recursive web crawler which crawls a base URL and renders the discovered
pages as an HTML sitemap.
'''
import argparse
import jinja2
import os
import asyncio
from datetime import datetime

from utils.helpers import (UrlPool, WebPage, RobotsTxt, sanitise_url)


def init_crawler(base_url=None, robots=None):
    '''
    Initialises the crawler by running the initial URL.
    '''
    uncrawled_urls, crawled_urls = UrlPool(), UrlPool()
    initial_page = WebPage(url=base_url, base_url=base_url, robots=robots)
    try:
        initial_page.run()
    except Exception as e:
        print(e)
    initial_urls = initial_page.list_urls()
    # ensure the base URL isn't crawled again
    try:
        initial_urls.remove(base_url)
    except KeyError:
        pass
    # also ensure base URL wasn't discovered with a trailing slash on the
    # initial page scrape
    try:
        initial_urls.remove("".join([base_url, '/']))
    except KeyError:
        pass
    # Add the base URL to the crawled pool
    crawled_urls.add_to_pool(base_url)
    for url in initial_urls:
        sanitised_url = sanitise_url(url=url)
        if sanitised_url not in crawled_urls.pool:
            uncrawled_urls.add_to_pool(sanitised_url)
    return(uncrawled_urls, crawled_urls)


def process_pool(base_url=None, uncrawled_urls=None, crawled_urls=None, robots=None):
    '''
    Iterates over the pool of URLs and adds any discovered URLs.
    '''
    while uncrawled_urls.pool:
        # pop url from pool
        new_url = uncrawled_urls.remove_from_pool()
        # create a WebPage object for the URL
        current_page = WebPage(url=new_url, base_url=base_url, robots=robots)
        succeeded = False
        try:
            succeeded = current_page.run()
        except Exception as e:
            print(e)
        if succeeded:
            _urls = current_page.list_urls()
            crawled_urls.add_to_pool(new_url)
            for url in _urls:
                sanitised_url = sanitise_url(url=url)
                if sanitised_url not in crawled_urls.pool:
                    uncrawled_urls.add_to_pool(sanitised_url)
        print('{0} URLs crawled, {1} remaining'.format(len(crawled_urls.pool),
                                                       len(uncrawled_urls.pool)))


def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
    '''
    Renders the sitemap as an HTML file.
    '''
    urlcount = len(crawled_urls)
    sorted_urls = sorted(crawled_urls)
    tmpl = jinja2.Environment(
        loader=jinja2.FileSystemLoader('templates')
    ).get_template('sitemap.html.j2')
    rendered_html = tmpl.render(base_url=base_url, urlcount=urlcount,
                                urls=sorted_urls, runtime=runtime)
    with open('sitemap.html', 'w') as outfile:
        outfile.write(rendered_html)
    print('Sitemap available at {0}/sitemap.html'.format(os.getcwd()))


def run(args=None):
    '''
    Times the crawl, builds the URL pools and renders the resulting sitemap.
    '''
    starttime = datetime.now()
    base_url = sanitise_url(args.url, base_url=True)
    robots = RobotsTxt(base_url=base_url)
    uncrawled_urls, crawled_urls = init_crawler(base_url, robots)
    process_pool(base_url, uncrawled_urls, crawled_urls, robots)
    runtime = int((datetime.now() - starttime).total_seconds())
    render_sitemap(base_url=base_url, crawled_urls=crawled_urls.pool, runtime=runtime)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Recursive web crawler')
    parser.add_argument("-u", "--url", required=True, help="Base url to crawl")
    args = parser.parse_args()
    run(args)

(project notes)

@@ -1,9 +1,56 @@
## Thoughts
###### for each URL, do the following (see the sketch after this list):
* mark it as crawled
* get page content
* if that fails, mark the link as invalid
* find all links in the content
* check each link for dupes
* add to pool or discard
* ~~strip hashes and everything following (as they're in-page anchors)~~
* ~~strip args~~
* ~~use `pop()` on the set instead of `.remove()`~~
* ~~return false once the set is empty~~
* ~~`WebPage.parse_urls()` needs to compare startswith to base url~~
* ~~ignore any links which aren't to pages~~
* ~~better url checking to get bare domain~~ #wontfix
* ~~remove trailing slash from any discovered url~~
* ~~investigate lxml parser~~
* ~~remove base url from initial urls with and without trailing slash~~
* ~~investigate using [tldextract](https://github.com/john-kurkowski/tldextract) to match urls~~ #wontfix
* ~~implement parsing of [robots.txt](http://docs.w3cub.com/python~3.6/library/urllib.robotparser/)~~
* ~~investigate [gzip encoding](https://stackoverflow.com/questions/36383227/avoid-downloading-images-using-beautifulsoup-and-urllib-request)~~
* ~~implement some kind of progress display~~
* async
* better exception handling
* randomise output filename
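
A compact sketch of the per-URL loop described at the top of this list (placeholder names only; `fetch_links` stands in for whatever fetches a page and extracts its links, and is not part of this changeset):
```python
def crawl(pool, crawled, fetch_links):
    '''Drain the uncrawled pool, marking URLs as crawled and queueing new ones.'''
    while pool:
        url = pool.pop()
        crawled.add(url)              # mark it as crawled
        try:
            links = fetch_links(url)  # get page content and find all links in it
        except Exception:
            continue                  # fetch failed: treat the link as invalid
        for link in links:
            if link not in crawled:   # check each link for dupes
                pool.add(link)        # add to pool or discard
    return crawled
```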
### Async bits
in `__main__`:
```python
loop = asyncio.get_event_loop()
try:
loop.run_until_complete(main())
finally:
loop.close()
```
* initialises loop and runs it to completion
* needs to handle errors (try/except/finally)
```python
async def run(args=None):
tasks = []
for url in pool:
tasks.append(url)
# for i in range(10):
# tasks.append(asyncio.ensure_future(myCoroutine(i)))
# gather completed tasks
await asyncio.gather(*tasks)
```
Getting the contents of the page needs to be async too
```python
async def get_source():
blah
blah
await urlopen(url)
```
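
As a reference point, here is a minimal sketch of how the fetch layer might become asynchronous with `aiohttp` (already imported in `utils/helpers.py`); `fetch_source` and `crawl_all` are hypothetical names, not part of this changeset:
```python
import asyncio

import aiohttp

HEADERS = {'Accept-Encoding': 'gzip, deflate',
           'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}


async def fetch_source(session, url):
    # aiohttp transparently decompresses gzip/deflate responses.
    async with session.get(url) as response:
        if 'text/html' in response.headers.get('Content-Type', ''):
            return await response.text()
        return None


async def crawl_all(urls):
    timeout = aiohttp.ClientTimeout(total=5)
    async with aiohttp.ClientSession(headers=HEADERS, timeout=timeout) as session:
        tasks = [fetch_source(session, url) for url in urls]
        # gather page sources concurrently; exceptions are returned, not raised
        return await asyncio.gather(*tasks, return_exceptions=True)


# e.g. sources = asyncio.get_event_loop().run_until_complete(crawl_all(urls))
```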

requirements.txt

@@ -3,5 +3,6 @@ bs4==0.0.1
certifi==2018.8.13
chardet==3.0.4
idna==2.7
requests==2.19.1
urllib3==1.23
Jinja2==2.10
lxml==4.2.4
MarkupSafe==1.0

(original scraper file, deleted)

@@ -1,88 +0,0 @@
#!/usr/bin/env python
import re
import argparse
import urllib.request

from bs4 import BeautifulSoup


class WebPage(object):
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}

    def __init__(self, args):
        self.url = args['url']

    def get_source(self, args=None):
        request = urllib.request.Request(self.url, headers=headers)
        page = urllib.request.urlopen(request)
        self.source = page.read()

    def find_links(self, args=None, source=None):
        soup = BeautifulSoup(self.source, 'html.parser')
        links = soup.find_all('a')
        hrefs = []
        for link in links:
            if link['href'].startswith('/'):
                hrefs.append("".join([self.url, link['href']]))
            else:
                hrefs.append(link['href'])
        return hrefs


def run(args=None):
    source = get_source(args)
    urls = find_links(args, source)
    local_urls = parse_urls(args, urls)
    print(local_urls)


def get_source(args=None):
    url = args.url
    useragent = 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'
    headers = {'User-Agent': useragent}
    request = urllib.request.Request(url, headers=headers)
    page = urllib.request.urlopen(request)
    source = page.read()
    return source


def find_links(args=None, source=None):
    soup = BeautifulSoup(source, 'html.parser')
    links = soup.find_all('a')
    hrefs = []
    for link in links:
        if link['href'].startswith('/'):
            hrefs.append("".join([args.url, link['href']]))
        else:
            hrefs.append(link['href'])
    return hrefs


def parse_urls(args=None, urls=None):
    local_urls = []
    for url in urls:
        if url.startswith(args.url):
            local_urls.append(url)
    return local_urls


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Recursive web scraper')
    parser.add_argument("-u", "--url", required=True, help="Base url to scrape")
    args = parser.parse_args()
    if not args.url.startswith('http'):
        raise SystemExit('URL must start with a protocol (http(s)).')
    run(args)

templates/sitemap.html.j2 (new file): 14 lines added

@@ -0,0 +1,14 @@
<html>
  <head>
    <title>Sitemap for {{ base_url }}</title>
  </head>
  <body>
    <p>
      Crawled {{ urlcount }} URLs on {{ base_url }} in ~{{ runtime }} seconds.
    <ul>
      {% for url in urls %}
      <li><a href="{{ url }}">{{ url }}</a></li>
      {% endfor %}
    </ul>
  </body>
</html>

(unit tests)

@@ -1,37 +1,35 @@
#!/usr/bin/env python
import unittest

from utils.helpers import (clean_base_url)
from utils.helpers import (sanitise_url)


class TestUrls(unittest.TestCase):
    base_url = "github.com"
    base_url_list = (('eu.httpbin.org', 'http://eu.httpbin.org'),
                     ('www.simonweald.com', 'http://www.simonweald.com'),
                     ('http://www.github.com/', 'http://www.github.com'),
                     ('https://www.github.com', 'https://www.github.com'))
    valid_urls = ["https://www.github.com", "http://www.github.com",
                  "github.com", "/some/url/", "index.html"]
    urls_to_clean = (('https://www.github.com/', 'https://www.github.com/'),
                     ('https://github.com/?foo=bar', 'https://github.com/'),
                     ('https://github.com/#anchor', 'https://github.com/'))

    def test_clean_base_url(self):
    def test_sanitise_base_url(self):
        '''
        Tests whether a URL's protocol can be discovered if not provided.
        '''
        for url, target in self.base_url_list:
            result = clean_base_url(url)
            result = sanitise_url(url, base_url=True)
            self.assertEqual(result, target)

    # def test_url_validation(self):
    #     '''
    #     Passes when given a valid URL. A valid URL is qualified
    #     by being local to the domain to be crawled.
    #     '''
    #     for url in self.valid_urls:
    #         result = url_validation(self.base_url, url)
    #         self.assertTrue(result)

    def test_sanitise_url(self):
        '''
        Tests that queries and fragments are stripped from discovered URLs.
        '''
        for url, target in self.urls_to_clean:
            result = sanitise_url(url)
            self.assertEqual(result, target)


if __name__ == '__main__':

utils/helpers.py

@@ -3,56 +3,202 @@
Utilities to provide various misc functions.
'''
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import aiohttp
import urllib.request
import urllib.robotparser
import urllib.error
import gzip
from urllib.parse import (urljoin, urlsplit)


class UrlPoolManager(object):
class UrlPool(object):
    '''
    Object to manage the lifecycle of a pool of URLs.
    Object to manage a pool of URLs.
    '''
    def __init__(self):
        self.url_pool = set()
        self.pool = set()

    def check_duplicate(self, new_url):
        '''
        Checks if a URL exists in the current pool.
        '''
        if new_url in self.url_pool:
        if new_url in self.pool:
            return True
        else:
            return False

    def invalidate_url(self, url):
        self.url_pool.remove(url)
    def remove_from_pool(self):
        '''
        Remove a URL from the pool and return it to be crawled.
        '''
        return(self.pool.pop())

    def add_to_list(self, url):
        self.url_pool.add(url)
    def add_to_pool(self, url):
        self.pool.add(url)

    def list_pool(self):
        pool = self.pool
        return pool


def clean_base_url(url):
class WebPage(object):
    '''
    Standardise the URL to be scraped to ensure it
    is added to relative URLs in a consistent manner.
    Object to manage common operations required to return
    the data from each individual page.
    '''
    protocol = 'http://'
    if url.startswith('http'):
        base_url = url
    # set a sane user-agent and request compression if available.
    headers = {'Accept-Encoding': 'gzip, deflate',
               'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}

    def __init__(self, url=None, base_url=None, robots=None):
        self.url = url
        self.base_url = base_url
        self.robots = robots
        self.source = None
        self.urls_to_crawl = set()

    def get_source(self):
        '''
        Retrieve a page's source.
        '''
        request = urllib.request.Request(self.url, headers=self.headers)
        page = urllib.request.urlopen(request, timeout=5)
        # handle the content encoding in case it needs decompressing.
        if 'text/html' in page.info().get('Content-Type'):
            if page.info().get('Content-Encoding'):
                if page.info().get('Content-Encoding') == 'gzip':
                    self.source = gzip.decompress(page.read())
                elif page.info().get('Content-Encoding') == 'deflate':
                    self.source = page.read()
            else:
                self.source = page.read()

    def find_links(self):
        '''
        Find all URLs on a page and ensure they are absolute. If they are
        relative then they will be appended to the base URL.
        '''
        hrefs = set()
        soup = BeautifulSoup(self.source, 'lxml')
        links = soup.find_all('a', href=True)
        for link in links:
            if link['href'].startswith('/'):
                hrefs.add(urljoin(self.url, link['href']))
            else:
                hrefs.add(link['href'])
        self.discovered_hrefs = hrefs

    def parse_urls(self):
        '''
        Iterate through the list of discovered URLs and add them to the
        pool if they start with the base URL.
        '''
        for url in self.discovered_hrefs:
            if url.startswith(self.base_url) and self.robots.check(url):
                sanitised_url = sanitise_url(url=url)
                self.urls_to_crawl.add(sanitised_url)

    def list_urls(self):
        '''
        Returns all valid discovered URLs.
        '''
        return self.urls_to_crawl

    def run(self):
        '''
        Attempt to get the page's source and if successful, iterate through it
        to find any links we can crawl.
        '''
        try:
            self.get_source()
        except Exception:
            # skip if we didn't retrieve the source.
            pass
        if self.source:
            self.find_links()
            self.parse_urls()
            return True
        else:
            return False


class RobotsTxt(object):
    '''
    Fetches and parses a site's robots.txt so URLs can be checked before crawling.
    '''
    def __init__(self, base_url=None):
        '''
        Manually retrieve robots.txt to allow us to set the user-agent.
        '''
        self.base_url = base_url
        self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
        robots_url = urljoin(self.base_url, 'robots.txt')
        request = urllib.request.Request(robots_url, headers=self.headers)
        robots = urllib.robotparser.RobotFileParser()
        robots.set_url(robots_url)
        try:
            response = urllib.request.urlopen(request, timeout=5)
        except urllib.error.HTTPError:
            robots.allow_all = True
        else:
            data = response.read()
            decoded_data = data.decode("utf-8").splitlines()
            robots.parse(decoded_data)
        self.robots = robots

    def check(self, url):
        '''
        Test if robots allows us to crawl that URL.
        '''
        return self.robots.can_fetch("*", url)


def sanitise_url(url, base_url=False):
    '''
    If `base_url` is True, we attempt to standardise `url` to ensure it can be
    prepended to relative URLs. If no scheme has been provided then we default
    to http as any sane https-only site should 301 redirect http > https.
    If `base_url` is False, we sanitise URLs to strip queries and fragments (we
    don't want to scrape in-page anchors etc).
    Returns a sanitised URL as a string.
    '''
    default_proto = 'http'
    delim = '://'
    split_url = urlsplit(url)
    if base_url:
        # This will sanitise the initial url for the initial page crawl.
        if split_url.scheme and split_url.scheme.startswith('http'):
            sanitised_url = "".join([split_url.scheme, delim, split_url.netloc])
        elif (split_url.path and not split_url.scheme and not split_url.netloc):
            sanitised_url = "".join([default_proto, delim, split_url.path])
    else:
        # otherwise assume HTTP as any sane site should upgrade
        # to HTTPS via a 301 redirect.
        base_url = "".join([protocol, url])
        # Sanitise discovered URLs. We already expect them in the format
        # protocol://base_url/path
        sanitised_url = "".join([split_url.scheme, delim, split_url.netloc, split_url.path])
    return base_url


def get_url_validation(base_url=None, url=None):
    '''
    Ensure any URLs discovered are absolute. If relative,
    they will be appended to the base URL.
    '''
    if url.startswith('/'):
        return urljoin(base_url, url)
    if url.startswith(base_url):
        return url
    return sanitised_url
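
For illustration, `sanitise_url` behaves as follows on the same input/expected pairs the tests above exercise (a sketch, not part of this changeset):
```python
from utils.helpers import sanitise_url

# base_url=True normalises the starting URL, defaulting to http:// when no
# scheme is given and dropping any trailing slash.
assert sanitise_url('eu.httpbin.org', base_url=True) == 'http://eu.httpbin.org'
assert sanitise_url('http://www.github.com/', base_url=True) == 'http://www.github.com'

# base_url=False (the default) strips queries and fragments from discovered URLs.
assert sanitise_url('https://github.com/?foo=bar') == 'https://github.com/'
assert sanitise_url('https://github.com/#anchor') == 'https://github.com/'
```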