more comments and progress output

commit 91cd988f52
parent f1855f5add
Date:   2018-09-16 15:26:49 +01:00

2 changed files with 8 additions and 2 deletions


@@ -51,7 +51,8 @@ def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
 def main():
     '''
-    docstring
+    Main function, responsible for prepping and running the crawler and
+    rendering the sitemap.
     '''
     starttime = datetime.now()
@@ -60,11 +61,13 @@ def main():
     # create a crawler
     async_crawler = AsyncCrawler(baseurl=baseurl, robots=robots, concurrency=args.concurrency)
+    # create a task to run the crawler, run the loop and then gather the results.
     task = asyncio.Task(async_crawler.main())
     loop = asyncio.get_event_loop()
     loop.run_until_complete(task)
     loop.close()
-    results = task.result()
+    results = sorted(task.result())
     runtime = int((datetime.now() - starttime).total_seconds())
     render_sitemap(base_url=baseurl, crawled_urls=results, runtime=runtime)
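
The event-loop plumbing above follows the pre-asyncio.run() pattern: wrap the coroutine in a Task, run the loop until it completes, then read the Task's result. A minimal self-contained sketch of that pattern, where crawl() is a made-up stand-in for AsyncCrawler.main():

import asyncio

async def crawl():
    # stand-in for AsyncCrawler.main(); pretend a couple of pages were discovered
    await asyncio.sleep(0.1)
    return {'http://example.com/b', 'http://example.com/a'}

def main():
    loop = asyncio.get_event_loop()    # pre-asyncio.run() style, as in the commit
    task = loop.create_task(crawl())   # schedule the coroutine as a Task on the loop
    loop.run_until_complete(task)      # drive it to completion
    loop.close()
    results = sorted(task.result())    # sorted, as the commit does, for stable sitemap output
    print(results)

if __name__ == '__main__':
    main()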


@@ -119,6 +119,7 @@ class AsyncCrawler(object):
         # add the URLs to a set to be returned.
         if urls:
             for url in urls:
+                print('Found: {0}'.format(url))
                 all_urls.add(url)
         return all_urls
@@ -133,6 +134,7 @@ class AsyncCrawler(object):
         to_crawl = []
         to_crawl.append(self.baseurl)
+        print('Crawling: {0}'.format(self.baseurl))
         while len(to_crawl) > 0:
             discovered_urls = await self.run(urls=to_crawl)
             # empty the to_crawl list and then add all newly discovered URLs for
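
Only part of this loop fits in the hunk; the overall shape is a crawl frontier: crawl the current batch, collect newly discovered URLs, then refill to_crawl with anything not yet visited. A rough synchronous sketch of that pattern follows, where fetch_links is a hypothetical stand-in for the async fetching done by self.run and is not part of this project:

def crawl_all(baseurl, fetch_links):
    # fetch_links(url) is a hypothetical callable returning the URLs linked from a page
    seen = set()
    to_crawl = [baseurl]
    print('Crawling: {0}'.format(baseurl))
    while len(to_crawl) > 0:
        discovered = set()
        for url in to_crawl:
            seen.add(url)
            for found in fetch_links(url):
                print('Found: {0}'.format(found))
                discovered.add(found)
        # empty the to_crawl list, then queue only URLs we have not crawled yet
        to_crawl = [url for url in discovered if url not in seen]
    return seen
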
@@ -169,6 +171,7 @@ class RobotsTxt(object):
         try:
             response = urllib.request.urlopen(request, timeout=5)
         except urllib.error.HTTPError:
+            # if robots.txt doesn't exist then allow all URLs to be crawled.
             robots.allow_all = True
         else:
             data = response.read()
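
The fallback added in this hunk relies on urllib raising HTTPError when robots.txt is missing (typically a 404), which the crawler treats as "allow everything". A standalone sketch of that fetch-or-allow-all logic, independent of the project's RobotsTxt class (the function name and return convention here are made up for illustration):

import urllib.error
import urllib.request

def fetch_robots_txt(baseurl):
    # returns the robots.txt body, or None to signal "nothing to restrict, crawl everything"
    request = urllib.request.Request('{0}/robots.txt'.format(baseurl.rstrip('/')))
    try:
        response = urllib.request.urlopen(request, timeout=5)
    except urllib.error.HTTPError:
        # no robots.txt (e.g. a 404), so allow all URLs to be crawled
        return None
    return response.read().decode('utf-8', errors='replace')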