Compare commits


2 Commits

SHA1        Message                                       Date
c53f62b55d  add most changes suggested by pycodestyle     2018-09-16 16:10:38 +01:00
75d3756bbc  fix errors discovered by pycodestyle          2018-09-16 16:04:07 +01:00
2 changed files with 24 additions and 26 deletions
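
Both commits apply fixes reported by pycodestyle (the PEP 8 checker, formerly pep8); most of the diff below is long lines being wrapped to fit the default 79-character limit. As a minimal sketch of re-running the same check from Python, assuming pycodestyle is installed and using placeholder file names rather than the repository's actual paths:

# Hypothetical re-check of the touched files; 'crawl.py' and 'crawler.py'
# are placeholder names, not necessarily the repository's real file names.
import pycodestyle

style = pycodestyle.StyleGuide(max_line_length=79)  # PEP 8 default limit
report = style.check_files(['crawl.py', 'crawler.py'])
print(report.total_errors, "style problem(s) remaining")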


@@ -1,5 +1,5 @@
#!/usr/bin/env python
'''
Asynchronous web crawler written in Python 3.5+.
@@ -32,7 +32,8 @@ def sanity_checks(url=None):
# fail early if robots denies all crawling
if not robots.check(url=baseurl):
sys.exit("{baseurl} cannot be crawled (denied by robots.txt)".format(baseurl=baseurl))
sys.exit("{baseurl} cannot be crawled (denied by robots.txt)".format(
baseurl=baseurl))
return(baseurl, robots)
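
The hunk above fails early when robots.txt forbids crawling the base URL; `robots` here is the project's own RobotsTxt helper (defined in the second changed file below). Purely as an illustration of the same pre-flight check using only the standard library, with a placeholder URL:

# Standard-library version of the robots.txt pre-flight check; the project
# itself uses its RobotsTxt class, not urllib.robotparser.
import sys
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser

baseurl = "http://example.com/"  # placeholder
robots = RobotFileParser()
robots.set_url(urljoin(baseurl, "/robots.txt"))
robots.read()
if not robots.can_fetch("*", baseurl):
    sys.exit("{baseurl} cannot be crawled (denied by robots.txt)".format(
        baseurl=baseurl))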
@@ -48,8 +49,8 @@ def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
loader=jinja2.FileSystemLoader('templates')
).get_template('sitemap.html.j2')
rendered_html = template.render(
base_url=base_url, urlcount=urlcount, urls=sorted_urls, runtime=runtime)
rendered_html = template.render(base_url=base_url, urlcount=urlcount,
urls=sorted_urls, runtime=runtime)
with open('sitemap.html', 'w') as outfile:
outfile.write(rendered_html)
@@ -67,9 +68,11 @@ def main():
baseurl, robots = sanity_checks(url=args.url)
# create a crawler
async_crawler = AsyncCrawler(baseurl=baseurl, robots=robots, concurrency=args.concurrency)
async_crawler = AsyncCrawler(baseurl=baseurl, robots=robots,
concurrency=args.concurrency)
# create a task to run the crawler, run the loop and then gather the results.
# create a task to run the crawler, run the loop and then gather the
# results.
task = asyncio.Task(async_crawler.main())
loop = asyncio.get_event_loop()
loop.run_until_complete(task)
@@ -84,9 +87,9 @@ def main():
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Recursive web crawler')
parser.add_argument("-u", "--url", required=True, help="Initial url to crawl")
parser.add_argument("-u", "--url", required=True, help="Initial url")
parser.add_argument("-c", "--concurrency", required=False, type=int,
default=100, help="Max number of pages to crawl concurrently")
default=100, help="Max pages to crawl concurrently")
args = parser.parse_args()
main()
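
The main() hunks above follow the pre-Python-3.7 asyncio driver pattern: wrap the crawler coroutine in a Task, fetch the event loop, and block on run_until_complete. A self-contained sketch of that pattern, with a stand-in coroutine instead of the real AsyncCrawler:

# Sketch of the Task / event-loop driver used in main(); fake_crawl stands in
# for AsyncCrawler.main() and is not part of the project.
import argparse
import asyncio


async def fake_crawl(concurrency):
    await asyncio.sleep(0)           # pretend to crawl something
    return ["http://example.com/"]   # pretend result


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Driver pattern sketch')
    parser.add_argument("-c", "--concurrency", required=False, type=int,
                        default=100, help="Max pages to crawl concurrently")
    args = parser.parse_args()
    # Legacy pattern mirrored from the diff; asyncio.run() replaces it on 3.7+.
    task = asyncio.Task(fake_crawl(args.concurrency))
    loop = asyncio.get_event_loop()
    loop.run_until_complete(task)
    print(task.result())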


@@ -26,7 +26,6 @@ class AsyncCrawler(object):
self.client_session = None
self.semaphore = asyncio.BoundedSemaphore(concurrency)
async def crawl_url(self, url=None):
'''
Crawls the given URL and finds all new URLs in the initial page.
@@ -38,7 +37,6 @@ class AsyncCrawler(object):
return urls
def validate_url(self, url=None):
'''
Ensures we have a valid URL to crawl and that the site's robots.txt
@@ -52,7 +50,6 @@ class AsyncCrawler(object):
else:
return False
async def get_source(self, url=None):
'''
Obtains the URL's source, provided it is HTML. Usage of semaphores
@@ -72,7 +69,6 @@ class AsyncCrawler(object):
except Exception:
return None
def find_all_urls(self, source=None):
'''
Find all URLs in a page's source. Returns a list of URLs which have
@@ -91,13 +87,12 @@ class AsyncCrawler(object):
return urls
async def run(self, urls=None):
'''
Crawls a batch of URLs of any size (resource usage is bounded by n
semaphores (where n = concurrency). Returns a set of URLs to be added
to the list of URLs which need to be crawled (find_all_urls only returns
unseen URLs).
to the list of URLs which need to be crawled (find_all_urls only
returns unseen URLs).
'''
tasks = []
all_urls = set()
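
The run() docstring above spells out the concurrency model: a batch of any size may be submitted, but actual work is capped by the BoundedSemaphore of size `concurrency` created in the constructor (first hunk of this file). A stripped-down sketch of that bounding technique, with a dummy fetch in place of the real get_source():

# Semaphore-bounded fan-out as described by run()'s docstring; fake_fetch
# stands in for AsyncCrawler.get_source() and performs no real I/O.
import asyncio


async def fake_fetch(semaphore, url):
    async with semaphore:          # at most `concurrency` fetches run at once
        await asyncio.sleep(0.01)  # pretend network round trip
        return url


async def run_batch(urls, concurrency=100):
    semaphore = asyncio.BoundedSemaphore(concurrency)
    tasks = [fake_fetch(semaphore, url) for url in urls]
    results = set()
    for task in asyncio.as_completed(tasks):
        try:
            results.add(await task)
        except Exception as e:
            print(e)
    return results


print(asyncio.run(run_batch(["http://example.com/%d" % i for i in range(5)])))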
@@ -111,7 +106,6 @@ class AsyncCrawler(object):
for task in asyncio.as_completed(tasks):
urls = None
try:
# completed.append((await task))
urls = await task
except Exception as e:
print(e)
@@ -124,7 +118,6 @@ class AsyncCrawler(object):
return all_urls
async def main(self):
'''
Runs a crawl with batches of URLs. Once complete returns a list of all
@@ -180,7 +173,6 @@ class RobotsTxt(object):
self.robots = robots
def check(self, url):
'''
Test if robots allows us to crawl that URL.
@@ -190,19 +182,21 @@ class RobotsTxt(object):
def standardise_url(url=None, base_url=None):
'''
If `base_url` is None then we attempt to standardise the URL to ensure it can
be prepended to relative URLs. If no scheme has been provided then we default
to http as any sane https-only site should 301 redirect http > https.
If `base_url` is None then we attempt to standardise the URL to ensure it
can be prepended to relative URLs. If no scheme has been provided then we
default to http as any sane https-only site should 301 redirect http to
https.
If `base_url` is set, we standardise URLs to strip queries and fragments (we
don't want to scrape in-page anchors etc). Any relative URLs will be appended
to the base url.
If `base_url` is set, we standardise URLs to strip queries and fragments
(we don't want to scrape in-page anchors etc). Any relative URLs will be
appended to the base url.
Returns a standardised URL as a string.
'''
default_proto = 'http'
delim = '://'
file_extensions = ('/', 'htm', 'html', 'xhtml', 'php', 'asp', 'aspx', 'cfm')
file_extensions = ('/', 'htm', 'html', 'xhtml', 'php', 'asp', 'aspx',
'cfm')
split_url = urlsplit(url)
@@ -219,6 +213,7 @@ def standardise_url(url=None, base_url=None):
if url.startswith('/'):
return urljoin(base_url, split_url.path)
elif url.startswith(base_url):
return "".join([split_url.scheme, delim, split_url.netloc, split_url.path])
return "".join([split_url.scheme, delim, split_url.netloc,
split_url.path])
return None
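
The final two hunks cover standardise_url()'s second mode, where base_url is set: queries and fragments are stripped and relative links are resolved against the base. A runnable condensation of that behaviour, using a placeholder URL:

# Condensed version of the query/fragment stripping and relative-URL joining
# shown in the last hunk; the example URL is a placeholder.
from urllib.parse import urljoin, urlsplit


def strip_and_join(url, base_url):
    split_url = urlsplit(url)
    if url.startswith('/'):
        # relative link: resolve against the base URL
        return urljoin(base_url, split_url.path)
    if url.startswith(base_url):
        # same-site absolute link: keep scheme, host and path only
        return "".join([split_url.scheme, '://', split_url.netloc,
                        split_url.path])
    return None


print(strip_and_join('/about?ref=nav#team', 'http://example.com'))
# -> http://example.com/about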