more comments and progress output

This commit is contained in:
2018-09-16 15:26:49 +01:00
parent f1855f5add
commit 91cd988f52
2 changed files with 8 additions and 2 deletions

View File

@@ -119,6 +119,7 @@ class AsyncCrawler(object):
# add the URLs to a set to be returned.
if urls:
for url in urls:
print('Found: {0}'.format(url))
all_urls.add(url)
return all_urls
@@ -133,6 +134,7 @@ class AsyncCrawler(object):
to_crawl = []
to_crawl.append(self.baseurl)
print('Crawling: {0}'.format(self.baseurl))
while len(to_crawl) > 0:
discovered_urls = await self.run(urls=to_crawl)
# empty the to_crawl list and then add all newly discovered URLs for
@@ -169,6 +171,7 @@ class RobotsTxt(object):
try:
response = urllib.request.urlopen(request, timeout=5)
except urllib.error.HTTPError:
# if robots.txt doesn't exist then allow all URLs to be crawled.
robots.allow_all = True
else:
data = response.read()