Compare commits

...

43 Commits

SHA1 Message Date
8698c21fda return a flag from WebPage indicating whether a link was actually crawlable, and only crawl it if it was 2018-09-12 08:03:08 +01:00
273cf56a3b add some basic tests 2018-09-11 13:42:15 +01:00
1af26f50f2 added a docstring 2018-09-11 13:42:02 +01:00
c40c5cea50 add async info 2018-09-10 21:29:46 +01:00
a6224f9b6a updated readme 2018-09-10 20:56:12 +01:00
b64711973f add new thoughts 2018-09-10 11:58:58 +01:00
9e125dfae0 added comments and docstrings 2018-09-09 22:49:55 +01:00
f16f82fdfb improved completion message 2018-09-09 22:40:42 +01:00
a523154848 display count of crawled/uncrawled URLs whilst running 2018-09-09 22:35:55 +01:00
9e754a5584 improve handling of gzip/deflated data detection 2018-09-09 11:21:46 +01:00
1b005570ee implement gzip compression requests and handling 2018-09-09 10:53:09 +01:00
17fa9f93f9 tick off gzip encoding 2018-09-09 10:52:37 +01:00
1e51e10db2 update with changes 2018-09-09 10:22:18 +01:00
225fd8b3ea update with changes 2018-09-09 10:22:03 +01:00
d686ae0bc4 update with changes 2018-09-09 10:21:45 +01:00
69f5788745 update notes 2018-09-09 10:16:22 +01:00
b5d644a223 various minor improvements to exception handling 2018-09-09 10:16:03 +01:00
6508156aa4 use lxml as the parser and only find links on a page if we've got the source 2018-09-09 10:06:25 +01:00
738ab8e441 adjust robots handling to deal with 404s and enforce a user agent, which allows us to obtain robots.txt in the first place 2018-09-09 09:57:16 +01:00
fdd84a8786 manually retrieve robots.txt to ensure we can set the user-agent 2018-09-07 12:40:12 +01:00
ab0ab0a010 add more thoughts 2018-09-07 11:50:53 +01:00
6a1259aa7d update plans to add gzip encoding 2018-09-06 17:33:10 +01:00
164239b343 more thoughts 2018-09-06 17:31:12 +01:00
ce1f2745c9 update thoughts 2018-09-06 17:30:28 +01:00
e70bdc9ca1 update requirements.txt 2018-09-06 17:25:30 +01:00
d1c1e17f4f report runtime of script in generated sitemap 2018-09-06 17:20:59 +01:00
816a727d79 ignore generated file 2018-09-06 17:08:56 +01:00
84ab27a75e render results as HTML 2018-09-06 17:08:26 +01:00
6d9103c154 improved content-type detection 2018-09-06 17:08:12 +01:00
e57a86c60a only attempt to read html 2018-09-06 16:30:11 +01:00
a3ec9451e3 implement parsing of robots.txt 2018-09-05 18:56:20 +01:00
f2c294ebdb added new ideas to implement 2018-09-04 15:40:11 +01:00
1b9b207a28 attempt to remove base url with trailing slash (if discovered) 2018-09-04 13:57:52 +01:00
6abe7d68e0 updated notes 2018-09-04 12:51:59 +01:00
7d919039b6 removed unnecessary modules 2018-09-04 10:14:27 +01:00
0726bcccb0 removed original file 2018-09-04 09:21:55 +01:00
05e907ecec too many changes to make a sensible commit message 2018-09-04 09:21:26 +01:00
abc628106d added a docstring to the WebPage object 2018-08-31 19:18:00 +01:00
c436016e0c remove unnecessary function 2018-08-31 19:16:08 +01:00
03554fde80 add docstrings 2018-08-31 19:15:35 +01:00
759f965e95 use more explicit names, use urljoin to combine urls 2018-08-31 19:12:58 +01:00
0517e5bc56 crawler now initialises and populates crawled pool with urls it finds 2018-08-31 19:02:21 +01:00
1b18aa83eb corrected some small errors and added runner function 2018-08-31 19:01:35 +01:00
9 changed files with 353 additions and 163 deletions

.gitignore

@@ -2,3 +2,4 @@
venv/
.vscode/*
__pycache__/
sitemap.html


@@ -1 +1,21 @@
# Concurrent web scraper
## Requirements
This crawler was written in Python 3.7.0 to take advantage of the latest `asyncio` features.
Install required modules:
```bash
pip install -r requirements.txt
```
Run:
```bash
python crawler.py -u https://urltocrawl.com
```
## Results
The resulting sitemap will be written to the root of this directory as `sitemap.html`.
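The crawler can also be driven without the CLI. A minimal sketch, assuming only that `crawler.run()` needs an object exposing a `url` attribute (as the argparse namespace it normally receives does):

```python
from argparse import Namespace

import crawler

# Equivalent to: python crawler.py -u https://urltocrawl.com
# Namespace stands in for the parsed command-line arguments.
crawler.run(Namespace(url='https://urltocrawl.com'))
```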


@@ -4,20 +4,112 @@ Need a docstring.
'''
import argparse
from utils.helpers import (UrlPool, WebPage, sanitise_url, qualify_url)
import jinja2
import os
import asyncio
from datetime import datetime
from utils.helpers import (UrlPool, WebPage, RobotsTxt, sanitise_url)
def init_crawler(base_url=None):
def init_crawler(base_url=None, robots=None):
'''
needs a docstring
Initialises the crawler by running the initial URL.
'''
uncrawled_urls, crawled_urls = UrlPool(), UrlPool()
initial_page = WebPage(url=base_url, base_url=base_url, robots=robots)
try:
initial_page.run()
except Exception as e:
print(e)
initial_urls = initial_page.list_urls()
# ensure the base URL isn't crawled again
try:
initial_urls.remove(base_url)
except KeyError:
pass
# also ensure base URL wasn't discovered with a trailing slash on the
# initial page scrape
try:
initial_urls.remove("".join([base_url, '/']))
except KeyError:
pass
# Add the base URL to the crawled pool
crawled_urls.add_to_pool(base_url)
for url in initial_urls:
sanitised_url = sanitise_url(url=url)
if sanitised_url not in crawled_urls.pool:
uncrawled_urls.add_to_pool(sanitised_url)
return(uncrawled_urls, crawled_urls)
def process_pool(base_url=None, uncrawled_urls=None, crawled_urls=None, robots=None):
'''
Iterates over the pool of URLs and adds any discovered URLs.
'''
while uncrawled_urls.pool:
# pop url from pool
new_url = uncrawled_urls.remove_from_pool()
# create a WebPage object for the URL
current_page = WebPage(url=new_url, base_url=base_url, robots=robots)
succeeded = False
try:
succeeded = current_page.run()
except Exception as e:
print(e)
if succeeded:
_urls = current_page.list_urls()
crawled_urls.add_to_pool(new_url)
for url in _urls:
sanitised_url = sanitise_url(url=url)
if sanitised_url not in crawled_urls.pool:
uncrawled_urls.add_to_pool(sanitised_url)
print('{0} URLs crawled, {1} remaining'.format(len(crawled_urls.pool),
len(uncrawled_urls.pool)))
def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
'''
Renders the sitemap as an HTML file.
'''
urlcount = len(crawled_urls)
sorted_urls = sorted(crawled_urls)
tmpl = jinja2.Environment(
loader=jinja2.FileSystemLoader('templates')
).get_template('sitemap.html.j2')
rendered_html = tmpl.render(base_url=base_url, urlcount=urlcount, urls=sorted_urls, runtime=runtime)
with open('sitemap.html', 'w') as outfile:
outfile.write(rendered_html)
print('Sitemap available at {0}/sitemap.html'.format(os.getcwd()))
def run(args=None):
'''
needs a docstring.
'''
base_url = sanitise_url(args.url)
print(base_url)
starttime = datetime.now()
base_url = sanitise_url(args.url, base_url=True)
robots = RobotsTxt(base_url=base_url)
uncrawled_urls, crawled_urls = init_crawler(base_url, robots)
process_pool(base_url, uncrawled_urls, crawled_urls, robots)
runtime = int((datetime.now() - starttime).total_seconds())
render_sitemap(base_url=base_url, crawled_urls=crawled_urls.pool, runtime=runtime)
if __name__ == '__main__':
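The hunk is truncated at the `__main__` guard; the argument parsing that follows is not shown here, but judging from the README usage and the removed original file further down, it would look roughly like this (a sketch, not the actual committed code):

```python
if __name__ == '__main__':
    # parse the base URL from the command line and kick off the crawl
    parser = argparse.ArgumentParser(description='Recursive web scraper')
    parser.add_argument('-u', '--url', required=True, help='Base url to scrape')
    args = parser.parse_args()
    run(args)
```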


@@ -1,9 +1,56 @@
## Thoughts
###### for each URL, do the following:
* mark it as crawled
* get page content
* if that fails, mark the link as invalid
* find all links in the content
* check each link for dupes
* add to pool or discard
* ~~strip hashes and everything following (as they're in-page anchors)~~
* ~~strip args~~
* ~~use `pop()` on the set instead of `.remove()`~~
* ~~return false once the set is empty~~
* ~~`WebPage.parse_urls()` needs to compare startswith to base url~~
* ~~ignore any links which aren't to pages~~
* ~~better url checking to get bare domain~~ #wontfix
* ~~remove trailing slash from any discovered url~~
* ~~investigate lxml parser~~
* ~~remove base url from initial urls with and without trailing slash~~
* ~~investigate using [tldextract](https://github.com/john-kurkowski/tldextract) to match urls~~ #wontfix
* ~~implement parsing of [robots.txt](http://docs.w3cub.com/python~3.6/library/urllib.robotparser/)~~
* ~~investigate [gzip encoding](https://stackoverflow.com/questions/36383227/avoid-downloading-images-using-beautifulsoup-and-urllib-request)~~
* ~~implement some kind of progress display~~
* async
* better exception handling
* randomise output filename
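For the "randomise output filename" item above, one possible approach (purely illustrative; `random_sitemap_name` is not part of the project) is to suffix the filename with a timestamp and a short random token:

```python
import uuid
from datetime import datetime


def random_sitemap_name(prefix='sitemap'):
    # e.g. sitemap-20180912-080308-3f2c.html
    stamp = datetime.now().strftime('%Y%m%d-%H%M%S')
    return '{0}-{1}-{2}.html'.format(prefix, stamp, uuid.uuid4().hex[:4])
```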
### Async bits
in `__main__`:
```python
loop = asyncio.get_event_loop()
try:
    loop.run_until_complete(main())
finally:
    loop.close()
```
* initialises loop and runs it to completion
* needs to handle errors (try/except/finally)
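A minimal sketch of that error handling, assuming a top-level coroutine named `main` as above:

```python
import asyncio

loop = asyncio.get_event_loop()
try:
    loop.run_until_complete(main())
except Exception as exc:
    # surface the failure rather than letting it disappear silently
    print('crawl failed: {0}'.format(exc))
finally:
    loop.close()
```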
```python
async def run(args=None):
    tasks = []
    for url in pool:
        tasks.append(url)
    # for i in range(10):
    #     tasks.append(asyncio.ensure_future(myCoroutine(i)))
    # gather completed tasks
    await asyncio.gather(*tasks)
```
Getting the contents of the page needs to be async too
```python
async def get_source():
    blah
    blah
    await urlopen(url)
```
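Since `utils/helpers.py` already imports `aiohttp`, the async fetch could plausibly be built on it. A rough sketch under that assumption (the function name and signature are illustrative, not the project's API):

```python
import aiohttp


async def get_source(url, headers=None):
    '''
    Fetch a page's HTML asynchronously; returns the text or None.
    aiohttp transparently handles gzip/deflate response bodies.
    '''
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=5)) as response:
            if 'text/html' in response.headers.get('Content-Type', ''):
                return await response.text()
            return None
```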


@@ -3,5 +3,6 @@ bs4==0.0.1
certifi==2018.8.13
chardet==3.0.4
idna==2.7
requests==2.19.1
urllib3==1.23
Jinja2==2.10
lxml==4.2.4
MarkupSafe==1.0


@@ -1,88 +0,0 @@
#!/usr/bin/env python
import re
import argparse
import urllib.request
from bs4 import BeautifulSoup


class WebPage(object):
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}

    def __init__(self, args):
        self.url = args['url']

    def get_source(self, args=None):
        request = urllib.request.Request(self.url, headers=headers)
        page = urllib.request.urlopen(request)
        self.source = page.read()

    def find_links(self, args=None, source=None):
        soup = BeautifulSoup(self.source, 'html.parser')
        links = soup.find_all('a')
        hrefs = []
        for link in links:
            if link['href'].startswith('/'):
                hrefs.append("".join([self.url, link['href']]))
            else:
                hrefs.append(link['href'])
        return hrefs


def run(args=None):
    source = get_source(args)
    urls = find_links(args, source)
    local_urls = parse_urls(args, urls)
    print(local_urls)


def get_source(args=None):
    url = args.url
    useragent = 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'
    headers = {'User-Agent': useragent}
    request = urllib.request.Request(url, headers=headers)
    page = urllib.request.urlopen(request)
    source = page.read()
    return source


def find_links(args=None, source=None):
    soup = BeautifulSoup(source, 'html.parser')
    links = soup.find_all('a')
    hrefs = []
    for link in links:
        if link['href'].startswith('/'):
            hrefs.append("".join([args.url, link['href']]))
        else:
            hrefs.append(link['href'])
    return hrefs


def parse_urls(args=None, urls=None):
    local_urls = []
    for url in urls:
        if url.startswith(args.url):
            local_urls.append(url)
    return local_urls


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Recursive web scraper')
    parser.add_argument("-u", "--url", required=True, help="Base url to scrape")
    args = parser.parse_args()
    if not args.url.startswith('http'):
        raise SystemExit('URL must start with a protocol (http(s)).')
    run(args)

templates/sitemap.html.j2

@@ -0,0 +1,14 @@
<html>
  <head>
    <title>Sitemap for {{ base_url }}</title>
  </head>
  <body>
    <p>
      Crawled {{ urlcount }} URLs on {{ base_url }} in ~{{ runtime }} seconds.
    <ul>
      {% for url in urls %}
      <li><a href="{{ url }}">{{ url }}</a></li>
      {% endfor %}
    </ul>
  </body>
</html>


@@ -1,37 +1,35 @@
#!/usr/bin/env python
import unittest
from utils.helpers import (clean_base_url)
from utils.helpers import (sanitise_url)
class TestUrls(unittest.TestCase):
base_url = "github.com"
base_url_list = (('eu.httpbin.org', 'http://eu.httpbin.org'),
('www.simonweald.com', 'http://www.simonweald.com'),
('http://www.github.com/', 'http://www.github.com'),
('https://www.github.com', 'https://www.github.com'))
valid_urls = ["https://www.github.com", "http://www.github.com",
"github.com", "/some/url/", "index.html"]
urls_to_clean = (('https://www.github.com/', 'https://www.github.com/'),
('https://github.com/?foo=bar', 'https://github.com/'),
('https://github.com/#anchor', 'https://github.com/'))
def test_clean_base_url(self):
def test_sanitise_base_url(self):
'''
Tests whether a URL's protocol can be discovered if not provided.
'''
for url, target in self.base_url_list:
result = clean_base_url(url)
result = sanitise_url(url, base_url=True)
self.assertEqual(result, target)
# def test_url_validation(self):
# '''
# Passes when given a valid URL. A valid URL is qualified
# by being local to the domain to be crawled.
# '''
# for url in self.valid_urls:
# result = url_validation(self.base_url, url)
# self.assertTrue(result)
def test_sanitise_url(self):
'''
Tests that queries and fragments are stripped from sanitised URLs.
'''
for url, target in self.urls_to_clean:
result = sanitise_url(url)
self.assertEqual(result, target)
if __name__ == '__main__':
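One optional refinement (not in the repository) for the looped assertions above is `unittest`'s `subTest`, so each failing case is reported individually rather than stopping the test at the first mismatch. A self-contained sketch using a subset of the same test data:

```python
import unittest

from utils.helpers import sanitise_url


class TestBaseUrls(unittest.TestCase):
    # same shape of data as TestUrls.base_url_list above
    base_url_list = (('eu.httpbin.org', 'http://eu.httpbin.org'),
                     ('http://www.github.com/', 'http://www.github.com'))

    def test_sanitise_base_url(self):
        for url, target in self.base_url_list:
            with self.subTest(url=url):
                self.assertEqual(sanitise_url(url, base_url=True), target)


if __name__ == '__main__':
    unittest.main()
```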


@@ -3,8 +3,12 @@
Utilities to provide various misc functions.
'''
import urllib.request
from bs4 import BeautifulSoup
import aiohttp
import urllib.request
import urllib.robotparser
import urllib.error
import gzip
from urllib.parse import (urljoin, urlsplit)
@@ -14,86 +18,187 @@ class UrlPool(object):
'''
def __init__(self):
self.url_pool = set()
self.pool = set()
def check_duplicate(self, new_url):
'''
Checks if a URL exists in the current pool.
'''
if new_url in self.url_pool:
if new_url in self.pool:
return True
else:
return False
def invalidate_url(self, url):
self.url_pool.remove(url)
def remove_from_pool(self):
'''
Remove a URL from the pool and return it to be crawled.
'''
return(self.pool.pop())
def add_to_list(self, url):
self.url_pool.add(url)
def add_to_pool(self, url):
self.pool.add(url)
def list_pool(self):
pool = self.pool
return pool
class WebPage(object):
'''
Object to manage common operations required to return
the data from each individual page.
'''
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
# set a sane user-agent and request compression if available.
headers = {'Accept-Encoding': 'gzip, deflate',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
def __init__(self, url):
def __init__(self, url=None, base_url=None, robots=None):
self.url = url
self.base_url = base_url
self.robots = robots
self.source = None
self.urls_to_crawl = set()
def get_source(self):
'''
Retrieve a page's source.
'''
request = urllib.request.Request(self.url, headers=self.headers)
page = urllib.request.urlopen(request)
self.source = page.read()
page = urllib.request.urlopen(request, timeout=5)
# handle the content encoding in case it needs decompressing.
if 'text/html' in page.info().get('Content-Type'):
if page.info().get('Content-Encoding'):
if page.info().get('Content-Encoding') == 'gzip':
self.source = gzip.decompress(page.read())
elif page.info().get('Content-Encoding') == 'deflate':
self.source = page.read()
else:
self.source = page.read()
def find_links(self):
soup = BeautifulSoup(self.source, 'html.parser')
links = soup.find_all('a')
hrefs = []
'''
Find all URLs on a page and ensure they are absolute. If they are
relative then they will be appended to the base URL.
'''
hrefs = set()
soup = BeautifulSoup(self.source, 'lxml')
links = soup.find_all('a', href=True)
for link in links:
if link['href'].startswith('/'):
hrefs.append("".join([self.url, link['href']]))
hrefs.add(urljoin(self.url, link['href']))
else:
hrefs.append(link['href'])
hrefs.add(link['href'])
self.discovered_hrefs = hrefs
self.hrefs = hrefs
def parse_urls(self):
local_urls = []
for url in self.hrefs:
if url.startswith(self.url):
local_urls.append(url)
return local_urls
'''
Iterate through the list of discovered URLs and add them to the
pool if they start with the base URL.
'''
for url in self.discovered_hrefs:
if url.startswith(self.base_url) and self.robots.check(url):
sanitised_url = sanitise_url(url=url)
self.urls_to_crawl.add(sanitised_url)
def sanitise_url(url):
def list_urls(self):
'''
Returns all valid discovered URLs.
'''
return self.urls_to_crawl
def run(self):
'''
Attempt to get the page's source and if successful, iterate through it
to find any links we can crawl.
'''
try:
self.get_source()
except Exception:
# skip if we didn't retrieve the source.
pass
if self.source:
self.find_links()
self.parse_urls()
return True
else:
return False
class RobotsTxt(object):
'''
needs a docstring
'''
Attempt to standardise the base url to ensure it can be prepended to
relative URLs. If no scheme provided then we default to http as any
sane https-only site should 301 redirect http > https.
Returns a corrected base URL as a string.
def __init__(self, base_url=None):
'''
Manually retrieve robots.txt to allow us to set the user-agent.
'''
self.base_url = base_url
self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
robots_url = urljoin(self.base_url, 'robots.txt')
request = urllib.request.Request(robots_url, headers=self.headers)
robots = urllib.robotparser.RobotFileParser()
robots.set_url(robots_url)
try:
response = urllib.request.urlopen(request, timeout=5)
except urllib.error.HTTPError:
robots.allow_all = True
else:
data = response.read()
decoded_data = data.decode("utf-8").splitlines()
robots.parse(decoded_data)
self.robots = robots
def check(self, url):
'''
Test if robots allows us to crawl that URL.
'''
return self.robots.can_fetch("*", url)
def sanitise_url(url, base_url=False):
'''
If `base_url` is True, we attempt to standardise `url` to ensure it can be
prepended to relative URLs. If no scheme has been provided then we default
to http as any sane https-only site should 301 redirect http > https.
If `base_url` is False, we sanitise URLs to strip queries and fragments (we
don't want to scrape in-page anchors etc).
Returns a sanitised URL as a string.
'''
default_proto = 'http'
delim = '://'
split_url = urlsplit(url)
if split_url.scheme and split_url.scheme.startswith('http'):
base_url = "".join([split_url.scheme, delim, split_url.netloc])
elif (split_url.path and not split_url.scheme and not split_url.netloc):
base_url = "".join([default_proto, delim, split_url.path])
return base_url
if base_url:
# This will sanitise the initial url for the initial page crawl.
if split_url.scheme and split_url.scheme.startswith('http'):
sanitised_url = "".join([split_url.scheme, delim, split_url.netloc])
elif (split_url.path and not split_url.scheme and not split_url.netloc):
sanitised_url = "".join([default_proto, delim, split_url.path])
else:
# Sanitise discovered URLs. We already expect them in the format
# protocol://base_url/path
sanitised_url = "".join([split_url.scheme, delim, split_url.netloc, split_url.path])
def qualify_url(base_url=None, url=None):
'''
Ensure any URLs discovered are absolute. If relative,
they will be appended to the base URL. Returns an
absolute URL as a string.
'''
if url.startswith('/'):
return urljoin(base_url, url)
if url.startswith(base_url):
return url
return sanitised_url
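For reference, the behaviour `sanitise_url` implements (and the tests above encode) can be summarised with a few illustrative calls taken from the test data:

```python
from utils.helpers import sanitise_url

# base URLs gain a default http scheme if none was supplied
assert sanitise_url('eu.httpbin.org', base_url=True) == 'http://eu.httpbin.org'
assert sanitise_url('http://www.github.com/', base_url=True) == 'http://www.github.com'

# discovered URLs keep their scheme/host/path but lose queries and fragments
assert sanitise_url('https://github.com/?foo=bar') == 'https://github.com/'
assert sanitise_url('https://github.com/#anchor') == 'https://github.com/'
```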