improve documentation

2018-09-15 21:48:50 +01:00
parent 0244435fea
commit 6548f55416


@@ -54,7 +54,7 @@ class AsyncCrawler(object):
async def get_source(self, url=None):
'''
Obtains the page's source.
Obtains the URL's source, provided it is HTML.
'''
async with self.semaphore:
async with self.client_session.head(url, timeout=5) as head:
@@ -70,13 +70,12 @@ class AsyncCrawler(object):
return source
except Exception:
return None
else:
print('{0} - {1}'.format(head.headers['Content-Type'], url))
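
As the updated docstring says, get_source only fetches a body when the URL turns out to be HTML: a HEAD request is made under the semaphore first, and the Content-Type decides whether a GET follows. The GET half of the method is not visible in this diff, so the following is a minimal sketch of the whole pattern (shown outside the class for brevity), assuming self.semaphore is an asyncio.Semaphore and self.client_session is the aiohttp.ClientSession created in main below:

    import aiohttp

    async def get_source(self, url=None):
        '''
        Obtains the URL's source, provided it is HTML.
        '''
        async with self.semaphore:
            # cheap HEAD request first, to inspect the Content-Type without
            # downloading the body.
            async with self.client_session.head(url, timeout=5) as head:
                if 'text/html' in head.headers.get('Content-Type', ''):
                    try:
                        # the page is HTML, so fetch the full source with a GET.
                        async with self.client_session.get(url, timeout=5) as response:
                            source = await response.text()
                            return source
                    except Exception:
                        return None
                else:
                    # non-HTML resources are reported and skipped.
                    print('{0} - {1}'.format(head.headers['Content-Type'], url))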
def find_all_urls(self, source=None):
'''
Find all URLs in a page's source.
Find all URLs in a page's source. Returns a list of URLs which have
been validated as local to the starting URL.
'''
urls = []
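
The new docstring promises that only URLs local to the starting URL are returned. The body of find_all_urls is not part of this diff; a minimal sketch of that filtering, assuming self.baseurl and the self.crawled set used elsewhere in the class, and using a simple href regex in place of whatever parsing the real code does, might look like:

    import re
    from urllib.parse import urljoin, urlparse

    def find_all_urls(self, source=None):
        '''
        Find all URLs in a page's source. Returns a list of URLs which have
        been validated as local to the starting URL.
        '''
        urls = []
        if not source:
            return urls
        base_host = urlparse(self.baseurl).netloc
        for href in re.findall(r'href=[\'"]([^\'"]+)[\'"]', source):
            # resolve relative links against the starting URL.
            url = urljoin(self.baseurl, href)
            # keep only same-host URLs that haven't already been crawled.
            if urlparse(url).netloc == base_host and url not in self.crawled:
                urls.append(url)
        return urls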
@@ -93,12 +92,21 @@ class AsyncCrawler(object):
async def run(self, urls=None):
'''
Crawls a batch of URLs of any size (resource usage is bounded by n
semaphores, where n = concurrency). Returns a set of URLs to be added
to the list of URLs which need to be crawled (find_all_urls only returns
unseen URLs).
'''
tasks = []
all_urls = set()
for url in urls:
# mark the URL as seen.
self.crawled.add(url)
# create a coroutine to crawl the URL.
tasks.append(self.crawl_url(url))
# wait for all tasks to complete.
for task in asyncio.as_completed(tasks):
urls = None
try:
@@ -107,6 +115,7 @@ class AsyncCrawler(object):
except Exception as e:
print(e)
# add the URLs to a set to be returned.
if urls:
for url in urls:
all_urls.add(url)
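
run awaits self.crawl_url(url) for every URL in the batch. That helper is not touched by this commit, but a plausible shape for it, simply chaining the two methods above, would be:

    async def crawl_url(self, url=None):
        # fetch the page source; None means the URL was not HTML or the
        # request failed.
        source = await self.get_source(url=url)
        if source is None:
            return None
        # return the local, unseen URLs found in the source.
        return self.find_all_urls(source=source)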
@@ -115,15 +124,22 @@ class AsyncCrawler(object):
async def main(self):
'''
Runs a crawl with batches of URLs. Once complete returns a list of all
crawled URLs.
'''
self.client_session = aiohttp.ClientSession(headers=self.headers)
to_crawl = []
to_crawl.append(self.baseurl)
while len(to_crawl) > 0:
discovered_urls = await self.run(urls=to_crawl)
# empty the to_crawl list and then add all newly discovered URLs for
# the next iteration.
to_crawl.clear()
to_crawl.extend(discovered_urls)
# close the session once all URLs have been crawled.
await self.client_session.close()
return self.crawled
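
For completeness, a hypothetical way to drive the crawler end to end; the constructor arguments below are assumptions, since the class's __init__ is not part of this diff:

    import asyncio

    crawler = AsyncCrawler(baseurl='https://example.com', concurrency=5)
    loop = asyncio.get_event_loop()
    crawled = loop.run_until_complete(crawler.main())
    print('{0} URLs crawled'.format(len(crawled)))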