add most changes suggested by pycodestyle
@@ -26,7 +26,6 @@ class AsyncCrawler(object):
         self.client_session = None
         self.semaphore = asyncio.BoundedSemaphore(concurrency)
 
-
     async def crawl_url(self, url=None):
         '''
         Crawls the given URL and finds all new URLs in the initial page.
@@ -38,7 +37,6 @@ class AsyncCrawler(object):
 
         return urls
 
-
     def validate_url(self, url=None):
         '''
         Ensures we have a valid URL to crawl and that the site's robots.txt
@@ -52,7 +50,6 @@ class AsyncCrawler(object):
         else:
             return False
 
-
     async def get_source(self, url=None):
         '''
         Obtains the URL's source, provided it is HTML. Usage of semaphores
@@ -72,7 +69,6 @@ class AsyncCrawler(object):
         except Exception:
             return None
 
-
     def find_all_urls(self, source=None):
         '''
         Find all URLs in a page's source. Returns a list of URLs which have
@@ -91,13 +87,12 @@ class AsyncCrawler(object):
 
         return urls
 
-
     async def run(self, urls=None):
         '''
         Crawls a batch of URLs of any size (resource usage is bounded by n
         semaphores (where n = concurrency). Returns a set of URLs to be added
-        to the list of URLs which need to be crawled (find_all_urls only returns
-        unseen URLs).
+        to the list of URLs which need to be crawled (find_all_urls only
+        returns unseen URLs).
         '''
         tasks = []
         all_urls = set()
@@ -111,7 +106,6 @@ class AsyncCrawler(object):
         for task in asyncio.as_completed(tasks):
            urls = None
            try:
-                # completed.append((await task))
                urls = await task
            except Exception as e:
                print(e)
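
Note for readers skimming the diff: the run() docstring above describes bounding resource usage with n semaphores (n = concurrency) and consuming results through asyncio.as_completed, as the loop in the previous hunk shows. A minimal sketch of that pattern, separate from this codebase (fetch_page, run_batch and the aiohttp client are illustrative assumptions, not code from this commit):

import asyncio

import aiohttp  # assumption: an aiohttp-style async HTTP client


async def fetch_page(session, semaphore, url):
    # The bounded semaphore caps how many requests are in flight at once.
    async with semaphore:
        async with session.get(url) as response:
            return await response.text()


async def run_batch(urls, concurrency=5):
    semaphore = asyncio.BoundedSemaphore(concurrency)
    results = []
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_page(session, semaphore, url) for url in urls]
        # as_completed yields awaitables in completion order, like the
        # for-loop shown in the hunk above.
        for task in asyncio.as_completed(tasks):
            try:
                results.append(await task)
            except Exception as e:
                print(e)
    return results

# Usage: asyncio.run(run_batch(['http://example.com/'], concurrency=5))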
@@ -124,7 +118,6 @@ class AsyncCrawler(object):
 
         return all_urls
 
-
     async def main(self):
         '''
         Runs a crawl with batches of URLs. Once complete returns a list of all
@@ -180,7 +173,6 @@ class RobotsTxt(object):
 
         self.robots = robots
 
-
     def check(self, url):
         '''
         Test if robots allows us to crawl that URL.
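
The commit does not show how RobotsTxt.check decides whether a URL may be crawled. As a point of reference only, the same check can be sketched with the standard library's urllib.robotparser (robots_allows and the '*' user agent are illustrative assumptions, not this project's implementation):

from urllib.parse import urlsplit, urlunsplit
from urllib.robotparser import RobotFileParser


def robots_allows(url, user_agent='*'):
    # Build the robots.txt URL for the target host, fetch and parse it,
    # then ask whether this user agent may crawl the given URL.
    parts = urlsplit(url)
    robots_url = urlunsplit((parts.scheme, parts.netloc, '/robots.txt', '', ''))
    parser = RobotFileParser()
    parser.set_url(robots_url)
    parser.read()
    return parser.can_fetch(user_agent, url)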
@@ -190,19 +182,21 @@ class RobotsTxt(object):
 
 def standardise_url(url=None, base_url=None):
     '''
-    If `base_url` is None then we attempt to standarise the URL to ensure it can
-    be prepended to relative URLs. If no scheme has been provided then we default
-    to http as any sane https-only site should 301 redirect http > https.
+    If `base_url` is None then we attempt to standarise the URL to ensure it
+    can be prepended to relative URLs. If no scheme has been provided then we
+    default to http as any sane https-only site should 301 redirect http to
+    https.
 
-    If `base_url` is set, we standardise URLs to strip queries and fragments (we
-    don't want to scrape in-page anchors etc). Any relative URLs will be appended
-    to the base url.
+    If `base_url` is set, we standardise URLs to strip queries and fragments
+    (we don't want to scrape in-page anchors etc). Any relative URLs will be
+    appended to the base url.
 
     Returns a standardised URL as a string.
     '''
     default_proto = 'http'
     delim = '://'
-    file_extensions = ('/', 'htm', 'html', 'xhtml', 'php', 'asp', 'aspx', 'cfm')
+    file_extensions = ('/', 'htm', 'html', 'xhtml', 'php', 'asp', 'aspx',
+                       'cfm')
 
     split_url = urlsplit(url)
 
@@ -219,6 +213,7 @@ def standardise_url(url=None, base_url=None):
         if url.startswith('/'):
             return urljoin(base_url, split_url.path)
         elif url.startswith(base_url):
-            return "".join([split_url.scheme, delim, split_url.netloc, split_url.path])
+            return "".join([split_url.scheme, delim, split_url.netloc,
+                            split_url.path])
 
     return None
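
The reflowed standardise_url docstring above describes two modes: default the scheme to http when none is given, and strip queries and fragments while joining relative URLs onto a base. A short stdlib illustration of the urlsplit/urljoin behaviour it relies on (the example URLs are made up):

from urllib.parse import urljoin, urlsplit

# No scheme given: default to http, as the docstring describes.
url = 'example.com/page.html'
if '://' not in url:
    url = 'http://' + url  # -> 'http://example.com/page.html'

# Queries and fragments are dropped; only scheme, host and path are kept.
parts = urlsplit('http://example.com/page.html?q=1#anchor')
print(''.join([parts.scheme, '://', parts.netloc, parts.path]))
# -> http://example.com/page.html

# Relative URLs are joined onto the base URL.
print(urljoin('http://example.com/', '/about.html'))
# -> http://example.com/about.html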