return a flag from WebPage.run() to indicate whether a link was actually crawlable, and only crawl it if it was

2018-09-12 08:00:08 +01:00
parent 273cf56a3b
commit 8698c21fda
2 changed files with 17 additions and 11 deletions

View File

@@ -6,6 +6,7 @@ Need a docstring.
 import argparse
 import jinja2
 import os
+import asyncio
 from datetime import datetime
 from utils.helpers import (UrlPool, WebPage, RobotsTxt, sanitise_url)
@@ -57,12 +58,14 @@ def process_pool(base_url=None, uncrawled_urls=None, crawled_urls=None, robots=N
         # create a WebPage object for the URL
         current_page = WebPage(url=new_url, base_url=base_url, robots=robots)
         try:
-            current_page.run()
-            _urls = current_page.list_urls()
-            crawled_urls.add_to_pool(new_url)
+            succeeded = current_page.run()
         except Exception as e:
             print(e)
+        if succeeded:
+            _urls = current_page.list_urls()
+            crawled_urls.add_to_pool(new_url)
         for url in _urls:
             sanitised_url = sanitise_url(url=url)
             if sanitised_url not in crawled_urls.pool:
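
The caller-side pattern in the hunk above can be summarised in a short, self-contained sketch. Everything here is illustrative: FakePage and process_one are made-up stand-ins for the project's WebPage and process_pool, and plain sets stand in for the UrlPool objects; the point is only the shape of the flow, in which succeeded starts out False so an exception inside run() leaves the page untouched.

class FakePage:
    '''Stand-in for WebPage: run() reports whether the page was crawlable.'''
    def __init__(self, url, ok=True):
        self.url = url
        self._ok = ok

    def run(self):
        # the real run() fetches and parses the page; here we just report a flag
        return self._ok

    def list_urls(self):
        # the real list_urls() returns links discovered on the page
        return {self.url + '/child'} if self._ok else set()


def process_one(page, crawled, uncrawled):
    succeeded = False               # default, so a raised exception means "not crawlable"
    try:
        succeeded = page.run()
    except Exception as e:
        print(e)                    # mirror the diff: log and carry on
    if succeeded:
        crawled.add(page.url)       # only successful pages are recorded...
        for url in page.list_urls():
            if url not in crawled:
                uncrawled.add(url)  # ...and only their links are queued


crawled, uncrawled = set(), set()
process_one(FakePage('https://example.com'), crawled, uncrawled)
process_one(FakePage('https://example.com/missing', ok=False), crawled, uncrawled)
print(crawled)    # {'https://example.com'}
print(uncrawled)  # {'https://example.com/child'}
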

View File

@@ -4,6 +4,7 @@ Utilities to provide various misc functions.
 '''
 from bs4 import BeautifulSoup
+import aiohttp
 import urllib.request
 import urllib.robotparser
 import urllib.error
@@ -104,8 +105,7 @@ class WebPage(object):
         pool if they start with the base URL.
         '''
         for url in self.discovered_hrefs:
-            if url.startswith(self.url):
-                if self.robots.check(url):
+            if url.startswith(self.base_url) and self.robots.check(url):
                 sanitised_url = sanitise_url(url=url)
                 self.urls_to_crawl.add(sanitised_url)
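
The collapsed condition above keeps a link only if it sits under the crawl's base URL and robots.txt permits it. A tiny sketch of that filter, using the standard library's urllib.robotparser directly rather than the project's RobotsTxt wrapper (the crawlable() helper and its arguments are hypothetical):

import urllib.robotparser

def crawlable(url, base_url, robots):
    # same combined check as in list_urls(): stay under base_url AND be allowed by robots.txt
    return url.startswith(base_url) and robots.can_fetch('*', url)

robots = urllib.robotparser.RobotFileParser()
robots.parse(['User-agent: *', 'Disallow: /private/'])

print(crawlable('https://example.com/a', 'https://example.com', robots))          # True
print(crawlable('https://example.com/private/x', 'https://example.com', robots))  # False (disallowed)
print(crawlable('https://other.org/a', 'https://example.com', robots))            # False (off-site)
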
@@ -132,6 +132,9 @@ class WebPage(object):
         if self.source:
             self.find_links()
             self.parse_urls()
+            return True
+        else:
+            return False


 class RobotsTxt(object):
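
Taken together, the hunks make run() the single signal of whether a page was worth crawling: it returns True only when the source was fetched and the links were extracted. A compact, self-contained sketch of that shape, using urllib.request in place of whatever fetch the real WebPage performs and a hypothetical MiniPage class:

import urllib.error
import urllib.request

class MiniPage:
    '''Hypothetical, stripped-down stand-in for WebPage.'''
    def __init__(self, url):
        self.url = url
        self.source = None

    def run(self):
        try:
            with urllib.request.urlopen(self.url) as response:
                self.source = response.read()
        except (urllib.error.URLError, ValueError):
            self.source = None
        if self.source:
            # the real class calls self.find_links() and self.parse_urls() here
            return True
        else:
            return False

page = MiniPage('https://example.com')
if page.run():
    print('crawlable:', page.url)
else:
    print('skipped:', page.url)
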