Compare commits

..

22 Commits

SHA1 Message Date
8698c21fda return from WebPage to indicate whether a link was actually crawlable and only actually crawl it if it was 2018-09-12 08:03:08 +01:00
273cf56a3b add some basic tests 2018-09-11 13:42:15 +01:00
1af26f50f2 added a docstring 2018-09-11 13:42:02 +01:00
c40c5cea50 add async info 2018-09-10 21:29:46 +01:00
a6224f9b6a updated readme 2018-09-10 20:56:12 +01:00
b64711973f add new thoughts 2018-09-10 11:58:58 +01:00
9e125dfae0 added comments and docstrings 2018-09-09 22:49:55 +01:00
f16f82fdfb improved completion message 2018-09-09 22:40:42 +01:00
a523154848 display count of crawled/uncrawled URLs whilst running 2018-09-09 22:35:55 +01:00
9e754a5584 improve handling of gzip/deflated data detection 2018-09-09 11:21:46 +01:00
1b005570ee implement gzip compression requests and handling 2018-09-09 10:53:09 +01:00
17fa9f93f9 tick off gzip encoding 2018-09-09 10:52:37 +01:00
1e51e10db2 update with changes 2018-09-09 10:22:18 +01:00
225fd8b3ea update with changes 2018-09-09 10:22:03 +01:00
d686ae0bc4 update with changes 2018-09-09 10:21:45 +01:00
69f5788745 update notes 2018-09-09 10:16:22 +01:00
b5d644a223 various minor improvements to exception handling 2018-09-09 10:16:03 +01:00
6508156aa4 use lxml as the parser and only find links on a page if we've got the source 2018-09-09 10:06:25 +01:00
738ab8e441 adjust robots handling to deal with 404s and enforce a user agent which allows us to initially obtain the robots.txt 2018-09-09 09:57:16 +01:00
fdd84a8786 manually retrieve robots.txt to ensure we can set the user-agent 2018-09-07 12:40:12 +01:00
ab0ab0a010 add more thoughts 2018-09-07 11:50:53 +01:00
6a1259aa7d update plans to add gzip encoding 2018-09-06 17:33:10 +01:00
6 changed files with 150 additions and 60 deletions

View File

@@ -1 +1,21 @@
 # Concurrent web scraper
+
+## Requirements
+
+This crawler was written in Python 3.7.0 to take advantage of the latest `asyncio` features.
+
+Install the required modules:
+
+```bash
+pip install -r requirements.txt
+```
+
+Run:
+
+```bash
+python crawler.py -u https://urltocrawl.com
+```
+
+## Results
+
+The resulting sitemap will be output in the root of this directory as `sitemap.html`.
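Editor's note: the `-u` flag above is presumably wired up with `argparse` (imported in the crawler diff below). A minimal sketch of such an entry point; only the flag name comes from the README, everything else is assumed:

```python
import argparse

# assumption: crawler.py parses its command line roughly like this.
parser = argparse.ArgumentParser(description='Concurrent web scraper')
parser.add_argument('-u', '--url', required=True, help='base URL to crawl')
args = parser.parse_args()
print(args.url)
```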

View File

@@ -5,13 +5,15 @@ Need a docstring.
 import argparse
 import jinja2
+import os
+import asyncio
 from datetime import datetime
 
 from utils.helpers import (UrlPool, WebPage, RobotsTxt, sanitise_url)
 
 
 def init_crawler(base_url=None, robots=None):
     '''
-    needs a docstring
+    Initialises the crawler by running the initial URL.
     '''
     uncrawled_urls, crawled_urls = UrlPool(), UrlPool()
     initial_page = WebPage(url=base_url, base_url=base_url, robots=robots)
@@ -48,7 +50,7 @@ def init_crawler(base_url=None, robots=None):
 
 def process_pool(base_url=None, uncrawled_urls=None, crawled_urls=None, robots=None):
     '''
-    Needs a docstring
+    Iterates over the pool of URLs and adds any discovered URLs.
     '''
     while uncrawled_urls.pool:
         # pop url from pool
@@ -56,17 +58,22 @@ def process_pool(base_url=None, uncrawled_urls=None, crawled_urls=None, robots=N
         # create a WebPage object for the URL
         current_page = WebPage(url=new_url, base_url=base_url, robots=robots)
         try:
-            current_page.run()
-            _urls = current_page.list_urls()
-            crawled_urls.add_to_pool(new_url)
+            succeeded = current_page.run()
         except Exception as e:
             print(e)
+
+        if succeeded:
+            _urls = current_page.list_urls()
+            crawled_urls.add_to_pool(new_url)
 
         for url in _urls:
             sanitised_url = sanitise_url(url=url)
             if sanitised_url not in crawled_urls.pool:
                 uncrawled_urls.add_to_pool(url)
+
+        print('{0} URLs crawled, {1} remaining'.format(len(crawled_urls.pool),
+                                                       len(uncrawled_urls.pool)))
 
 
 def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
     '''
@@ -84,7 +91,7 @@ def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
     with open('sitemap.html', 'w') as outfile:
         outfile.write(rendered_html)
 
-    print('Sitemap available at sitemap.html')
+    print('Sitemap available at {0}/sitemap.html'.format(os.getcwd()))
 
 
 def run(args=None):
@@ -103,9 +110,6 @@
     render_sitemap(base_url=base_url, crawled_urls=crawled_urls.pool, runtime=runtime)
 
-    # pprint(crawled_urls.pool)
-    # print('{0} URLs crawled'.format(len(crawled_urls.pool)))
-
 
 
 if __name__ == '__main__':
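Editor's note: `render_sitemap` is only partially visible in the hunks above; it evidently renders a Jinja2 template over the crawled URL set and writes `sitemap.html`. A hedged sketch of that shape, with the template filename and context variable names assumed rather than taken from the repo:

```python
import os
from datetime import datetime
import jinja2

def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
    # assumption: a 'sitemap_template.html' next to crawler.py that loops over `urls`.
    loader = jinja2.FileSystemLoader(searchpath=os.path.dirname(os.path.abspath(__file__)))
    template = jinja2.Environment(loader=loader).get_template('sitemap_template.html')
    rendered_html = template.render(base_url=base_url,
                                    urls=sorted(crawled_urls),
                                    runtime=runtime,
                                    generated=datetime.now())
    with open('sitemap.html', 'w') as outfile:
        outfile.write(rendered_html)
    print('Sitemap available at {0}/sitemap.html'.format(os.getcwd()))
```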

View File

@@ -1,14 +1,56 @@
 ## Thoughts
 
 * ~~strip hashes and everything following (as they're in-page anchors)~~
-* strip args
+* ~~strip args~~
 * ~~use `pop()` on the set instead of `.remove()`~~
 * ~~return false once the set is empty~~
 * ~~`WebPage.parse_urls()` needs to compare startswith to base url~~
 * ~~ignore any links which aren't to pages~~
-* better url checking to get bare domain
-* remove trailing slash from any discovered url
-* investigate lxml parser
+* ~~better url checking to get bare domain~~ #wontfix
+* ~~remove trailing slash from any discovered url~~
+* ~~investigate lxml parser~~
 * ~~remove base url from initial urls with and without trailing slash~~
-* investigate using [tldextract](https://github.com/john-kurkowski/tldextract) to match urls
+* ~~investigate using [tldextract](https://github.com/john-kurkowski/tldextract) to match urls~~ #wontfix
 * ~~implement parsing of [robots.txt](http://docs.w3cub.com/python~3.6/library/urllib.robotparser/)~~
+* ~~investigate [gzip encoding](https://stackoverflow.com/questions/36383227/avoid-downloading-images-using-beautifulsoup-and-urllib-request)~~
+* ~~implement some kind of progress display~~
+* async
+* better exception handling
+* randomise output filename
+
+### Async bits
+
+in `__main__`:
+
+```python
+loop = asyncio.get_event_loop()
+try:
+    loop.run_until_complete(main())
+finally:
+    loop.close()
+```
+
+* initialises loop and runs it to completion
+* needs to handle errors (try/except/finally)
+
+```python
+async def run(args=None):
+    tasks = []
+    for url in pool:
+        tasks.append(url)
+        # for i in range(10):
+        #     tasks.append(asyncio.ensure_future(myCoroutine(i)))
+    # gather completed tasks
+    await asyncio.gather(*tasks)
+```
+
+Getting the contents of the page needs to be async too:
+
+```python
+async def get_source():
+    blah
+    blah
+    await urlopen(url)
+```
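Editor's note: the `get_source` note above is only a stub. A minimal sketch of what an `aiohttp`-based fetch might look like (the `utils.helpers` diff below adds `import aiohttp`, but none of the names here exist in the repo yet):

```python
import asyncio
import aiohttp

async def fetch(session, url):
    # fetch a single page body; aiohttp transparently handles gzip/deflate
    async with session.get(url) as response:
        if 'text/html' in response.headers.get('Content-Type', ''):
            return await response.text()
        return None

async def crawl_all(urls):
    # fan out one request per URL and gather the results concurrently
    timeout = aiohttp.ClientTimeout(total=5)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        tasks = [fetch(session, url) for url in urls]
        return await asyncio.gather(*tasks, return_exceptions=True)

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    try:
        pages = loop.run_until_complete(crawl_all(['https://urltocrawl.com']))
    finally:
        loop.close()
```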

View File

@@ -4,4 +4,5 @@ certifi==2018.8.13
 chardet==3.0.4
 idna==2.7
 Jinja2==2.10
+lxml==4.2.4
 MarkupSafe==1.0

View File

@@ -1,37 +1,35 @@
 #!/usr/bin/env python
 import unittest
 
-from utils.helpers import (clean_base_url)
+from utils.helpers import (sanitise_url)
 
 
 class TestUrls(unittest.TestCase):
-    base_url = "github.com"
     base_url_list = (('eu.httpbin.org', 'http://eu.httpbin.org'),
                      ('www.simonweald.com', 'http://www.simonweald.com'),
                      ('http://www.github.com/', 'http://www.github.com'),
                      ('https://www.github.com', 'https://www.github.com'))
-    valid_urls = ["https://www.github.com", "http://www.github.com",
-                  "github.com", "/some/url/", "index.html"]
+    urls_to_clean = (('https://www.github.com/', 'https://www.github.com/'),
+                     ('https://github.com/?foo=bar', 'https://github.com/'),
+                     ('https://github.com/#anchor', 'https://github.com/'))
 
-    def test_clean_base_url(self):
+    def test_sanitise_base_url(self):
         '''
         Tests whether a URL's protocol can be discovered if not provided.
         '''
         for url, target in self.base_url_list:
-            result = clean_base_url(url)
+            result = sanitise_url(url, base_url=True)
             self.assertEqual(result, target)
 
-    # def test_url_validation(self):
-    #     '''
-    #     Passes when given a valid URL. A valid URL is qualified
-    #     by being local to the domain to be crawled.
-    #     '''
-    #     for url in self.valid_urls:
-    #         result = url_validation(self.base_url, url)
-    #         self.assertTrue(result)
+    def test_sanitise_url(self):
+        '''
+        Tests that query strings and in-page anchors are stripped from URLs.
+        '''
+        for url, target in self.urls_to_clean:
+            result = sanitise_url(url)
+            self.assertEqual(result, target)
 
 
 if __name__ == '__main__':

View File

@@ -4,8 +4,11 @@ Utilities to provide various misc functions.
 '''
 from bs4 import BeautifulSoup
+import aiohttp
 import urllib.request
 import urllib.robotparser
+import urllib.error
+import gzip
 from urllib.parse import (urljoin, urlsplit)
 
 
@@ -46,12 +49,16 @@ class WebPage(object):
     the data from each individual page.
     '''
-    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
+    # set a sane user-agent and request compression if available.
+    headers = {'Accept-Encoding': 'gzip, deflate',
+               'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
 
     def __init__(self, url=None, base_url=None, robots=None):
         self.url = url
         self.base_url = base_url
         self.robots = robots
+        self.source = None
+        self.urls_to_crawl = set()
 
     def get_source(self):
@@ -61,8 +68,15 @@
         request = urllib.request.Request(self.url, headers=self.headers)
         page = urllib.request.urlopen(request, timeout=5)
 
-        headers = page.info()
-        if "text/html" in headers['content-type']:
-            self.source = page.read()
+        # handle the content encoding in case it needs decompressing.
+        if 'text/html' in page.info().get('Content-Type'):
+            if page.info().get('Content-Encoding'):
+                if page.info().get('Content-Encoding') == 'gzip':
+                    self.source = gzip.decompress(page.read())
+                elif page.info().get('Content-Encoding') == 'deflate':
+                    self.source = page.read()
+            else:
+                self.source = page.read()
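Editor's note: the Content-Encoding handling added above can be exercised on its own. A rough standalone sketch, using httpbin's `/gzip` endpoint purely as an example (urllib does not decompress for you, hence the manual `gzip.decompress()` call):

```python
import gzip
import urllib.request

# request compressed content and decompress it manually, mirroring the
# Accept-Encoding header and gzip handling shown in the diff above.
headers = {'Accept-Encoding': 'gzip, deflate',
           'User-Agent': 'Mozilla/5.0'}
request = urllib.request.Request('https://httpbin.org/gzip', headers=headers)
with urllib.request.urlopen(request, timeout=5) as page:
    body = page.read()
    if page.info().get('Content-Encoding') == 'gzip':
        body = gzip.decompress(body)
print(body[:100])
```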
@@ -73,7 +87,7 @@ class WebPage(object):
         '''
         hrefs = set()
-        soup = BeautifulSoup(self.source, 'html.parser')
+        soup = BeautifulSoup(self.source, 'lxml')
         links = soup.find_all('a', href=True)
 
         for link in links:
@@ -90,38 +104,37 @@ class WebPage(object):
         Iterate through the list of discovered URLs and add them to the
         pool if they start with the base URL.
         '''
-        self.urls_to_crawl = set()
         for url in self.discovered_hrefs:
-            if url.startswith(self.url):
-                if self.robots.check(url):
-                    sanitised_url = sanitise_url(url=url)
-                    self.urls_to_crawl.add(sanitised_url)
+            if url.startswith(self.base_url) and self.robots.check(url):
+                sanitised_url = sanitise_url(url=url)
+                self.urls_to_crawl.add(sanitised_url)
 
     def list_urls(self):
         '''
-        Returns the contents of the
+        Returns all valid discovered URLs.
         '''
         return self.urls_to_crawl
 
     def run(self):
+        '''
+        Attempt to get the page's source and if successful, iterate through it
+        to find any links we can crawl.
+        '''
         try:
             self.get_source()
-        except Exception as e:
-            print(e)
+        except Exception:
+            # skip if we didn't retrieve the source.
+            pass
 
-        try:
+        if self.source:
             self.find_links()
-        except Exception as e:
-            print(e)
-
-        try:
             self.parse_urls()
-        except Exception as e:
-            print(e)
+            return True
+        else:
+            return False
 
 
 class RobotsTxt(object):
@@ -130,21 +143,33 @@ class RobotsTxt(object):
     '''
     def __init__(self, base_url=None):
+        '''
+        Manually retrieve robots.txt to allow us to set the user-agent.
+        '''
        self.base_url = base_url
+        self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
+
+        robots_url = urljoin(self.base_url, 'robots.txt')
+        request = urllib.request.Request(robots_url, headers=self.headers)
+
         robots = urllib.robotparser.RobotFileParser()
-        robots.set_url(urljoin(self.base_url, 'robots.txt'))
+        robots.set_url(robots_url)
 
         try:
-            robots.read()
-        except Exception as e:
-            print(e)
+            response = urllib.request.urlopen(request, timeout=5)
+        except urllib.error.HTTPError:
+            robots.allow_all = True
+        else:
+            data = response.read()
+            decoded_data = data.decode("utf-8").splitlines()
+            robots.parse(decoded_data)
 
         self.robots = robots
 
     def check(self, url):
         '''
-        needs a docstring
+        Test if robots allows us to crawl that URL.
         '''
         return self.robots.can_fetch("*", url)
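Editor's note: the manual robots.txt retrieval added above can be seen in isolation. A sketch under the same assumption as the diff (an HTTP error falls back to allowing everything); `load_robots` and the example URL are illustrative, not part of the repo:

```python
import urllib.error
import urllib.request
import urllib.robotparser
from urllib.parse import urljoin

def load_robots(base_url, user_agent='Mozilla/5.0'):
    # fetch robots.txt ourselves so we control the User-Agent header,
    # then hand the decoded lines to RobotFileParser.parse().
    robots = urllib.robotparser.RobotFileParser()
    robots_url = urljoin(base_url, 'robots.txt')
    robots.set_url(robots_url)
    request = urllib.request.Request(robots_url, headers={'User-Agent': user_agent})
    try:
        response = urllib.request.urlopen(request, timeout=5)
    except urllib.error.HTTPError:
        # no (or blocked) robots.txt: treat everything as crawlable
        robots.allow_all = True
    else:
        robots.parse(response.read().decode('utf-8').splitlines())
    return robots

robots = load_robots('https://www.simonweald.com/')
print(robots.can_fetch('*', 'https://www.simonweald.com/some/page'))
```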