From 8698c21fda84d39b23e69c9381dab8f64c36e2d7 Mon Sep 17 00:00:00 2001
From: Simon Weald
Date: Wed, 12 Sep 2018 08:00:08 +0100
Subject: [PATCH] return from WebPage to indicate whether a link was actually
 crawlable and only actually crawl it if it was

---
 crawler.py       | 17 ++++++++++-------
 utils/helpers.py | 11 +++++++----
 2 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/crawler.py b/crawler.py
index 5f349d2..32fb0c5 100644
--- a/crawler.py
+++ b/crawler.py
@@ -6,6 +6,7 @@ Need a docstring.
 import argparse
 import jinja2
 import os
+import asyncio
 from datetime import datetime

 from utils.helpers import (UrlPool, WebPage, RobotsTxt, sanitise_url)
@@ -57,16 +58,18 @@ def process_pool(base_url=None, uncrawled_urls=None, crawled_urls=None, robots=N
         # create a WebPage object for the URL
         current_page = WebPage(url=new_url, base_url=base_url, robots=robots)
         try:
-            current_page.run()
-            _urls = current_page.list_urls()
-            crawled_urls.add_to_pool(new_url)
+            succeeded = current_page.run()
         except Exception as e:
             print(e)

-        for url in _urls:
-            sanitised_url = sanitise_url(url=url)
-            if sanitised_url not in crawled_urls.pool:
-                uncrawled_urls.add_to_pool(url)
+        if succeeded:
+            _urls = current_page.list_urls()
+            crawled_urls.add_to_pool(new_url)
+
+            for url in _urls:
+                sanitised_url = sanitise_url(url=url)
+                if sanitised_url not in crawled_urls.pool:
+                    uncrawled_urls.add_to_pool(url)

         print('{0} URLs crawled, {1} remaining'.format(len(crawled_urls.pool),
                                                        len(uncrawled_urls.pool)))
diff --git a/utils/helpers.py b/utils/helpers.py
index 3f9f1a3..8da18f0 100644
--- a/utils/helpers.py
+++ b/utils/helpers.py
@@ -4,6 +4,7 @@ Utilities to provide various misc functions.
 '''

 from bs4 import BeautifulSoup
+import aiohttp
 import urllib.request
 import urllib.robotparser
 import urllib.error
@@ -104,10 +105,9 @@ class WebPage(object):
         pool if they start with the base URL.
         '''
         for url in self.discovered_hrefs:
-            if url.startswith(self.url):
-                if self.robots.check(url):
-                    sanitised_url = sanitise_url(url=url)
-                    self.urls_to_crawl.add(sanitised_url)
+            if url.startswith(self.base_url) and self.robots.check(url):
+                sanitised_url = sanitise_url(url=url)
+                self.urls_to_crawl.add(sanitised_url)


     def list_urls(self):
@@ -132,6 +132,9 @@ class WebPage(object):
         if self.source:
             self.find_links()
             self.parse_urls()
+            return True
+        else:
+            return False


 class RobotsTxt(object):
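
For context, here is a minimal, runnable sketch of the pattern this patch introduces: run() reports whether the page was actually crawlable, and the caller only records the page and queues its links when that succeeded. FakePage and process_one are hypothetical stand-ins for the repository's WebPage class and the loop body of process_pool(), not the real code; the sketch also initialises succeeded to False before the try block so the flag is always defined, even if run() raises.

class FakePage(object):
    '''Hypothetical stand-in for WebPage: run() returns True only when page source was fetched.'''

    def __init__(self, url, source=None):
        self.url = url
        self.source = source
        self.urls_to_crawl = set()

    def run(self):
        # succeed only if there is source to parse; the real class would
        # call find_links() and parse_urls() here
        if self.source:
            self.urls_to_crawl.add(self.url + '/child')
            return True
        else:
            return False

    def list_urls(self):
        return self.urls_to_crawl


def process_one(page, crawled_urls, uncrawled_urls):
    '''Hypothetical stand-in for the patched loop body in process_pool().'''
    succeeded = False  # defined up front so an exception cannot leave it unset
    try:
        succeeded = page.run()
    except Exception as e:
        print(e)

    # only harvest links from pages that were actually crawlable
    if succeeded:
        crawled_urls.add(page.url)
        for url in page.list_urls():
            if url not in crawled_urls:
                uncrawled_urls.add(url)


crawled, uncrawled = set(), set()
process_one(FakePage('http://example.com', source='<html></html>'), crawled, uncrawled)
process_one(FakePage('http://example.com/missing'), crawled, uncrawled)
print(crawled)    # {'http://example.com'}
print(uncrawled)  # {'http://example.com/child'}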