From 75d3756bbcd01a5ce98e5b5d4f69fa8cd4577ca6 Mon Sep 17 00:00:00 2001
From: Simon Weald
Date: Sun, 16 Sep 2018 16:04:07 +0100
Subject: [PATCH] fix errors discovered by pycodestyle

---
 async_crawler.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/async_crawler.py b/async_crawler.py
index 8716bd0..f8285ad 100644
--- a/async_crawler.py
+++ b/async_crawler.py
@@ -1,5 +1,5 @@
-
 #!/usr/bin/env python
+
 '''
 Asynchronous web crawler written in Python 3.5+.
 
@@ -32,7 +32,8 @@ def sanity_checks(url=None):
 
     # fail early if robots denies all crawling
     if not robots.check(url=baseurl):
-        sys.exit("{baseurl} cannot be crawled (denied by robots.txt)".format(baseurl=baseurl))
+        sys.exit("{baseurl} cannot be crawled (denied by robots.txt)".format(
+            baseurl=baseurl))
 
     return(baseurl, robots)
 
@@ -48,8 +49,8 @@ def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
         loader=jinja2.FileSystemLoader('templates')
     ).get_template('sitemap.html.j2')
 
-    rendered_html = template.render(
-        base_url=base_url, urlcount=urlcount, urls=sorted_urls, runtime=runtime)
+    rendered_html = template.render(base_url=base_url, urlcount=urlcount,
+                                    urls=sorted_urls, runtime=runtime)
 
     with open('sitemap.html', 'w') as outfile:
         outfile.write(rendered_html)
@@ -67,9 +68,11 @@ def main():
     baseurl, robots = sanity_checks(url=args.url)
 
     # create a crawler
-    async_crawler = AsyncCrawler(baseurl=baseurl, robots=robots, concurrency=args.concurrency)
+    async_crawler = AsyncCrawler(baseurl=baseurl, robots=robots,
+                                 concurrency=args.concurrency)
 
-    # create a task to run the crawler, run the loop and then gather the results.
+    # create a task to run the crawler, run the loop and then gather the
+    # results.
     task = asyncio.Task(async_crawler.main())
     loop = asyncio.get_event_loop()
     loop.run_until_complete(task)
@@ -84,9 +87,9 @@ def main():
 
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Recursive web crawler')
-    parser.add_argument("-u", "--url", required=True, help="Initial url to crawl")
+    parser.add_argument("-u", "--url", required=True, help="Initial url")
     parser.add_argument("-c", "--concurrency", required=False, type=int,
-                        default=100, help="Max number of pages to crawl concurrently")
+                        default=100, help="Max pages to crawl concurrently")
     args = parser.parse_args()
     main()
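
Note (not part of the patch): the loop-driving pattern that main() uses above can be seen in isolation below. This is a minimal sketch targeting Python 3.5+; the crawl() coroutine and the example URL are hypothetical stand-ins for AsyncCrawler.main(), not code from this repository.

#!/usr/bin/env python
'''
Minimal sketch of the asyncio pattern used in main(): create a Task from a
coroutine, run the event loop until it completes, then collect the result.
crawl() is a hypothetical placeholder, not the real AsyncCrawler.
'''
import asyncio


async def crawl(urls):
    # stand-in for AsyncCrawler.main(); a real crawler would fetch pages here
    await asyncio.sleep(0)
    return {url: 'crawled' for url in urls}


if __name__ == '__main__':
    # schedule the coroutine as a task, run the loop until it finishes,
    # then read the coroutine's return value off the completed task
    task = asyncio.Task(crawl(['https://example.com/']))
    loop = asyncio.get_event_loop()
    loop.run_until_complete(task)
    print(task.result())

On Python 3.7+ the same flow can be written as asyncio.run(crawl([...])), but the Task/run_until_complete form above matches the Python 3.5+ target stated in the module docstring.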