Compare commits

...

15 Commits

5 changed files with 130 additions and 49 deletions

View File

@@ -1 +1,21 @@
 # Concurrent web scraper
+## Requirements
+This crawler was written in Python 3.7.0 to take advantage of the latest `asyncio` features.
+Install required modules:
+```bash
+pip install -r requirements.txt
+```
+Run:
+```bash
+python crawler.py -u https://urltocrawl.com
+```
+## Results
+The resulting sitemap will be output in the root of this directory as `sitemap.html`.
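For reference, the install step above implies a `requirements.txt` in the repository root; based on the third-party imports visible in this diff (`jinja2`, `bs4`, `aiohttp`), it presumably lists something like:
```
# assumed contents; the actual file and any version pins are not shown in this diff
jinja2
beautifulsoup4
aiohttp
```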

View File

@@ -5,13 +5,15 @@ Need a docstring.
 import argparse
 import jinja2
+import os
+import asyncio
 from datetime import datetime
 from utils.helpers import (UrlPool, WebPage, RobotsTxt, sanitise_url)


 def init_crawler(base_url=None, robots=None):
     '''
-    needs a docstring
+    Initialises the crawler by crawling the initial URL.
     '''
     uncrawled_urls, crawled_urls = UrlPool(), UrlPool()
     initial_page = WebPage(url=base_url, base_url=base_url, robots=robots)
@@ -48,7 +50,7 @@ def init_crawler(base_url=None, robots=None):
 def process_pool(base_url=None, uncrawled_urls=None, crawled_urls=None, robots=None):
     '''
-    Needs a docstring
+    Iterates over the pool of uncrawled URLs and adds any newly discovered URLs to it.
     '''
     while uncrawled_urls.pool:
         # pop url from pool
@@ -56,16 +58,21 @@ def process_pool(base_url=None, uncrawled_urls=None, crawled_urls=None, robots=N
         # create a WebPage object for the URL
         current_page = WebPage(url=new_url, base_url=base_url, robots=robots)
+        # ensure `succeeded` is defined even if run() raises
+        succeeded = False
         try:
-            current_page.run()
-            _urls = current_page.list_urls()
-            crawled_urls.add_to_pool(new_url)
+            succeeded = current_page.run()
         except Exception as e:
             print(e)
-        for url in _urls:
-            sanitised_url = sanitise_url(url=url)
-            if sanitised_url not in crawled_urls.pool:
-                uncrawled_urls.add_to_pool(url)
+        if succeeded:
+            _urls = current_page.list_urls()
+            crawled_urls.add_to_pool(new_url)
+            for url in _urls:
+                sanitised_url = sanitise_url(url=url)
+                if sanitised_url not in crawled_urls.pool:
+                    uncrawled_urls.add_to_pool(url)
+        print('{0} URLs crawled, {1} remaining'.format(len(crawled_urls.pool),
+                                                       len(uncrawled_urls.pool)))


 def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
@@ -84,7 +91,7 @@ def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
     with open('sitemap.html', 'w') as outfile:
         outfile.write(rendered_html)
-    print('Sitemap available at sitemap.html')
+    print('Sitemap available at {0}/sitemap.html'.format(os.getcwd()))


 def run(args=None):
@@ -103,9 +110,6 @@ def run(args=None):
     render_sitemap(base_url=base_url, crawled_urls=crawled_urls.pool, runtime=runtime)
-    # pprint(crawled_urls.pool)
-    # print('{0} URLs crawled'.format(len(crawled_urls.pool)))


 if __name__ == '__main__':
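For context, the `-u` flag in the README suggests the entry point wires up `argparse` roughly as follows; this is a hypothetical sketch, as the actual argument parsing is not part of this diff:
```python
# hypothetical CLI wiring implied by `python crawler.py -u <url>`
import argparse

parser = argparse.ArgumentParser(description='Concurrent web scraper')
parser.add_argument('-u', '--url', required=True, help='base URL to crawl')
args = parser.parse_args()
run(args=args)  # run() as defined in crawler.py above
```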

View File

@@ -1,16 +1,56 @@
 ## Thoughts
 * ~~strip hashes and everything following (as they're in-page anchors)~~
-* strip args
+* ~~strip args~~
 * ~~use `pop()` on the set instead of `.remove()`~~
 * ~~return false once the set is empty~~
 * ~~`WebPage.parse_urls()` needs to compare startswith to base url~~
 * ~~ignore any links which aren't to pages~~
-* better url checking to get bare domain
-* remove trailing slash from any discovered url
+* ~~better url checking to get bare domain~~ #wontfix
+* ~~remove trailing slash from any discovered url~~
 * ~~investigate lxml parser~~
 * ~~remove base url from initial urls with and without trailing slash~~
-* investigate using [tldextract](https://github.com/john-kurkowski/tldextract) to match urls
+* ~~investigate using [tldextract](https://github.com/john-kurkowski/tldextract) to match urls~~ #wontfix
 * ~~implement parsing of [robots.txt](http://docs.w3cub.com/python~3.6/library/urllib.robotparser/)~~
-* investigate [gzip encoding](https://stackoverflow.com/questions/36383227/avoid-downloading-images-using-beautifulsoup-and-urllib-request)
-* implement some kind of progress display
+* ~~investigate [gzip encoding](https://stackoverflow.com/questions/36383227/avoid-downloading-images-using-beautifulsoup-and-urllib-request)~~
+* ~~implement some kind of progress display~~
+* async
+* better exception handling
+* randomise output filename
+### Async bits
+in `__main__`:
+```python
+loop = asyncio.get_event_loop()
+try:
+    loop.run_until_complete(main())
+finally:
+    loop.close()
+```
+* initialises the loop and runs it to completion
+* needs to handle errors (try/except/finally) (see the sketch at the end of this section)
+```python
+async def run(args=None):
+    tasks = []
+    for url in pool:
+        # wrap each URL's crawl in a task (the crawl coroutine is still a placeholder)
+        tasks.append(asyncio.ensure_future(crawl(url)))
+    # gather completed tasks
+    await asyncio.gather(*tasks)
+```
+Getting the contents of the page needs to be async too, e.g. with `aiohttp` (now imported in `utils/helpers.py`):
+```python
+async def get_source(self):
+    # sketch only: fetch the page body without blocking the event loop
+    async with aiohttp.ClientSession(headers=self.headers) as session:
+        async with session.get(self.url) as response:
+            self.source = await response.text()
+```
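Since the README targets Python 3.7.0, the manual loop management above could also be replaced by `asyncio.run()`, which creates, runs and closes the loop; a minimal sketch with the error handling the notes call for (`main()` is the coroutine name already used in the notes):
```python
import asyncio


async def main():
    # placeholder for the top-level crawl coroutine sketched above
    ...


if __name__ == '__main__':
    try:
        # asyncio.run() (new in Python 3.7) sets up and tears down the event loop
        asyncio.run(main())
    except KeyboardInterrupt:
        print('Crawl cancelled by user')
    except Exception as e:
        # surface unexpected errors rather than failing silently
        print('Crawl failed: {0}'.format(e))
```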

View File

@@ -1,37 +1,35 @@
 #!/usr/bin/env python
 import unittest
-from utils.helpers import (clean_base_url)
+from utils.helpers import (sanitise_url)


 class TestUrls(unittest.TestCase):
-    base_url = "github.com"
     base_url_list = (('eu.httpbin.org', 'http://eu.httpbin.org'),
                      ('www.simonweald.com', 'http://www.simonweald.com'),
                      ('http://www.github.com/', 'http://www.github.com'),
                      ('https://www.github.com', 'https://www.github.com'))
-    valid_urls = ["https://www.github.com", "http://www.github.com",
-                  "github.com", "/some/url/", "index.html"]
+    urls_to_clean = (('https://www.github.com/', 'https://www.github.com/'),
+                     ('https://github.com/?foo=bar', 'https://github.com/'),
+                     ('https://github.com/#anchor', 'https://github.com/'))

-    def test_clean_base_url(self):
+    def test_sanitise_base_url(self):
         '''
         Tests whether a URL's protocol can be discovered if not provided.
         '''
         for url, target in self.base_url_list:
-            result = clean_base_url(url)
+            result = sanitise_url(url, base_url=True)
             self.assertEqual(result, target)

-    # def test_url_validation(self):
-    #     '''
-    #     Passes when given a valid URL. A valid URL is qualified
-    #     by being local to the domain to be crawled.
-    #     '''
-    #     for url in self.valid_urls:
-    #         result = url_validation(self.base_url, url)
-    #         self.assertTrue(result)
+    def test_sanitise_url(self):
+        '''
+        Tests whether query strings and in-page anchors are stripped from a URL.
+        '''
+        for url, target in self.urls_to_clean:
+            result = sanitise_url(url)
+            self.assertEqual(result, target)


 if __name__ == '__main__':
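For readers without `utils/helpers.py` to hand, one possible shape of `sanitise_url` that is consistent with these test cases (an illustrative sketch, not the project's actual implementation):
```python
from urllib.parse import urlsplit, urlunsplit


def sanitise_url(url, base_url=False):
    '''
    Illustrative sketch: strip query strings and fragments, and default to
    http:// when a bare domain is given and base_url is True.
    '''
    if base_url and '//' not in url:
        url = 'http://' + url
    scheme, netloc, path, _query, _fragment = urlsplit(url)
    if base_url:
        # drop any trailing slash on the base URL
        path = path.rstrip('/')
    return urlunsplit((scheme, netloc, path, '', ''))
```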

View File

@@ -4,9 +4,11 @@ Utilities to provide various misc functions.
 '''
 from bs4 import BeautifulSoup
+import aiohttp
 import urllib.request
 import urllib.robotparser
 import urllib.error
+import gzip
 from urllib.parse import (urljoin, urlsplit)
@@ -47,7 +49,9 @@ class WebPage(object):
     the data from each individual page.
     '''
-    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
+    # set a sane user-agent and request compression if available.
+    headers = {'Accept-Encoding': 'gzip, deflate',
+               'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}

     def __init__(self, url=None, base_url=None, robots=None):
         self.url = url
@@ -63,11 +67,17 @@ class WebPage(object):
         '''
         request = urllib.request.Request(self.url, headers=self.headers)
-        page = urllib.request.urlopen(request, timeout=5) # handle
-        headers = page.info()
-        print(headers['content-type'])
-        if "text/html" in headers['content-type']:
-            self.source = page.read()
+        page = urllib.request.urlopen(request, timeout=5)
+
+        # handle the content encoding in case it needs decompressing.
+        if 'text/html' in page.info().get('Content-Type'):
+            if page.info().get('Content-Encoding'):
+                if page.info().get('Content-Encoding') == 'gzip':
+                    self.source = gzip.decompress(page.read())
+                elif page.info().get('Content-Encoding') == 'deflate':
+                    self.source = page.read()
+            else:
+                self.source = page.read()

     def find_links(self):
@@ -94,31 +104,37 @@ class WebPage(object):
         Iterate through the list of discovered URLs and add them to the
         pool if they start with the base URL.
         '''
-        for url in self.discovered_hrefs: #handle no hrefs found
-            if url.startswith(self.url):
-                if self.robots.check(url):
-                    sanitised_url = sanitise_url(url=url)
-                    self.urls_to_crawl.add(sanitised_url)
+        for url in self.discovered_hrefs:
+            if url.startswith(self.base_url) and self.robots.check(url):
+                sanitised_url = sanitise_url(url=url)
+                self.urls_to_crawl.add(sanitised_url)

     def list_urls(self):
         '''
-        Returns the contents of the
+        Returns all valid discovered URLs.
         '''
         return self.urls_to_crawl

     def run(self):
+        '''
+        Attempt to get the page's source and if successful, iterate through it
+        to find any links we can crawl.
+        '''
         try:
             self.get_source()
         except Exception:
+            # skip if we didn't retrieve the source.
             pass
         if self.source:
             self.find_links()
             self.parse_urls()
+            return True
+        else:
+            return False


 class RobotsTxt(object):
@@ -127,6 +143,9 @@ class RobotsTxt(object):
     '''
     def __init__(self, base_url=None):
+        '''
+        Manually retrieve robots.txt to allow us to set the user-agent.
+        '''
         self.base_url = base_url
         self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
@@ -150,7 +169,7 @@ class RobotsTxt(object):
     def check(self, url):
         '''
-        needs a docstring
+        Test whether robots.txt allows us to crawl the given URL.
         '''
         return self.robots.can_fetch("*", url)
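One note on the `get_source` change above: the `deflate` branch stores the response body without decompressing it. A sketch of decoding both encodings with the standard library (`decode_body` is a hypothetical helper, and the raw-deflate fallback is an assumption about server behaviour):
```python
import gzip
import zlib


def decode_body(raw, encoding):
    # decompress a response body according to its Content-Encoding header
    if encoding == 'gzip':
        return gzip.decompress(raw)
    if encoding == 'deflate':
        try:
            # most servers send zlib-wrapped deflate
            return zlib.decompress(raw)
        except zlib.error:
            # some send a raw deflate stream instead
            return zlib.decompress(raw, -zlib.MAX_WBITS)
    return raw
```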