more comments and progress output
This commit is contained in:
@@ -119,6 +119,7 @@ class AsyncCrawler(object):
|
||||
# add the URLs to a set to be returned.
|
||||
if urls:
|
||||
for url in urls:
|
||||
print('Found: {0}'.format(url))
|
||||
all_urls.add(url)
|
||||
|
||||
return all_urls
|
||||
@@ -133,6 +134,7 @@ class AsyncCrawler(object):
|
||||
to_crawl = []
|
||||
to_crawl.append(self.baseurl)
|
||||
|
||||
print('Crawling: {0}'.format(self.baseurl))
|
||||
while len(to_crawl) > 0:
|
||||
discovered_urls = await self.run(urls=to_crawl)
|
||||
# empty the crawl list and then add all newly discovered URLs for
|
||||
@@ -169,6 +171,7 @@ class RobotsTxt(object):
|
||||
try:
|
||||
response = urllib.request.urlopen(request, timeout=5)
|
||||
except urllib.error.HTTPError:
|
||||
# if robots.txt doesn't exist then allow all URLs to be crawled.
|
||||
robots.allow_all = True
|
||||
else:
|
||||
data = response.read()
|
||||
|
||||
Reference in New Issue
Block a user