implement parsing of robots.txt

2018-09-05 18:56:20 +01:00
parent f2c294ebdb
commit a3ec9451e3
3 changed files with 41 additions and 12 deletions

View File

@@ -4,16 +4,16 @@ Need a docstring.
 '''
 import argparse
-from utils.helpers import (UrlPool, WebPage, sanitise_url)
+from utils.helpers import (UrlPool, WebPage, RobotsTxt, sanitise_url)
 from pprint import pprint


-def init_crawler(base_url=None):
+def init_crawler(base_url=None, robots=None):
     '''
     needs a docstring
     '''
     uncrawled_urls, crawled_urls = UrlPool(), UrlPool()
-    initial_page = WebPage(url=base_url, base_url=base_url)
+    initial_page = WebPage(url=base_url, base_url=base_url, robots=robots)
     try:
         initial_page.run()
@@ -45,7 +45,7 @@ def init_crawler(base_url=None):
     return(uncrawled_urls, crawled_urls)


-def process_pool(base_url=None, uncrawled_urls=None, crawled_urls=None):
+def process_pool(base_url=None, uncrawled_urls=None, crawled_urls=None, robots=None):
     '''
     Needs a docstring
     '''
@@ -53,7 +53,7 @@ def process_pool(base_url=None, uncrawled_urls=None, crawled_urls=None):
         # pop url from pool
         new_url = uncrawled_urls.remove_from_pool()
         # create a WebPage object for the URL
-        current_page = WebPage(url=new_url, base_url=base_url)
+        current_page = WebPage(url=new_url, base_url=base_url, robots=robots)
         try:
             current_page.run()
             _urls = current_page.list_urls()
@@ -72,9 +72,10 @@ def run(args=None):
     needs a docstring.
     '''
     base_url = sanitise_url(args.url, base_url=True)
-    uncrawled_urls, crawled_urls = init_crawler(base_url)
-    process_pool(base_url, uncrawled_urls, crawled_urls)
+    robots = RobotsTxt(base_url=base_url)
+    uncrawled_urls, crawled_urls = init_crawler(base_url, robots)
+    process_pool(base_url, uncrawled_urls, crawled_urls, robots)
     pprint(crawled_urls.pool)
     print('{0} URLs crawled'.format(len(crawled_urls.pool)))

View File

@@ -9,4 +9,4 @@
 * better url checking to get bare domain
 * ~~remove base url from initial urls with and without trailing slash~~
 * investigate using [tldextract](https://github.com/john-kurkowski/tldextract) to match urls
-* implement parsing of [robots.txt](http://docs.w3cub.com/python~3.6/library/urllib.robotparser/)
+* ~~implement parsing of [robots.txt](http://docs.w3cub.com/python~3.6/library/urllib.robotparser/)~~

View File

@@ -3,8 +3,9 @@
 Utilities to provide various misc functions.
 '''
-import urllib.request
 from bs4 import BeautifulSoup
+import urllib.request
+import urllib.robotparser
 from urllib.parse import (urljoin, urlsplit)
@@ -47,9 +48,10 @@ class WebPage(object):
     headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}

-    def __init__(self, url=None, base_url=None):
+    def __init__(self, url=None, base_url=None, robots=None):
         self.url = url
         self.base_url = base_url
+        self.robots = robots

     def get_source(self):
@@ -90,8 +92,9 @@ class WebPage(object):
         for url in self.discovered_hrefs:
             if url.startswith(self.url):
-                sanitised_url = sanitise_url(url=url)
-                self.urls_to_crawl.add(sanitised_url)
+                if self.robots.check(url):
+                    sanitised_url = sanitise_url(url=url)
+                    self.urls_to_crawl.add(sanitised_url)

     def list_urls(self):
@@ -119,6 +122,31 @@ class WebPage(object):
             print(e)


+class RobotsTxt(object):
+    '''
+    needs a docstring
+    '''
+    def __init__(self, base_url=None):
+        self.base_url = base_url
+        robots = urllib.robotparser.RobotFileParser()
+        robots.set_url(urljoin(self.base_url, 'robots.txt'))
+        try:
+            robots.read()
+        except Exception as e:
+            print(e)
+        self.robots = robots
+
+    def check(self, url):
+        '''
+        needs a docstring
+        '''
+        return self.robots.can_fetch("*", url)
+
+
 def sanitise_url(url, base_url=False):
     '''
     If `base_url` is True, we attempt to standardise `url` to ensure it can be
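
The new RobotsTxt class added above is a thin wrapper around the standard library's urllib.robotparser.RobotFileParser. For reference, a minimal standalone sketch of the same pattern outside the crawler; https://example.com/ and the two paths are placeholders, and the real helper builds the URL from the sanitised base_url instead:

# Minimal sketch of the stdlib API that RobotsTxt wraps (placeholder URLs only).
import urllib.robotparser
from urllib.parse import urljoin

base_url = 'https://example.com/'

parser = urllib.robotparser.RobotFileParser()
parser.set_url(urljoin(base_url, 'robots.txt'))
try:
    parser.read()  # fetch and parse <base_url>robots.txt
except Exception as e:
    # mirrors the broad try/except in the helper above
    print(e)

# can_fetch() is what RobotsTxt.check() delegates to: True when the given
# user agent may crawl the URL under the parsed rules.
for path in ('allowed/page', 'private/page'):
    url = urljoin(base_url, path)
    print(url, parser.can_fetch('*', url))

Because run() constructs the RobotsTxt once and passes the same instance down to every WebPage, robots.txt is fetched a single time per crawl rather than once per page.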