Return a boolean from WebPage to indicate whether a link was actually crawlable, and only crawl the link if it was

This commit is contained in:
2018-09-12 08:00:08 +01:00
parent 273cf56a3b
commit 8698c21fda
2 changed files with 17 additions and 11 deletions

View File

@@ -4,6 +4,7 @@ Utilities to provide various misc functions.
'''
from bs4 import BeautifulSoup
import aiohttp
import urllib.request
import urllib.robotparser
import urllib.error
@@ -104,10 +105,9 @@ class WebPage(object):
pool if they start with the base URL.
'''
for url in self.discovered_hrefs:
if url.startswith(self.url):
if self.robots.check(url):
sanitised_url = sanitise_url(url=url)
self.urls_to_crawl.add(sanitised_url)
if url.startswith(self.base_url) and self.robots.check(url):
sanitised_url = sanitise_url(url=url)
self.urls_to_crawl.add(sanitised_url)
def list_urls(self):
@@ -132,6 +132,9 @@ class WebPage(object):
if self.source:
self.find_links()
self.parse_urls()
return True
else:
return False
class RobotsTxt(object):