remove unnecessary classes2
utils/helpers.py
@@ -32,6 +32,7 @@ class AsyncCrawler(object):
        self.client_session = None
        self.semaphore = asyncio.BoundedSemaphore(concurrency)


    async def crawl_url(self, url=None):
        '''
        docstring
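The `asyncio.BoundedSemaphore(concurrency)` attribute caps how many coroutines fetch at once: `crawl_url` in the next hunk enters `async with self.semaphore:` to take a slot and releases it on exit. A minimal sketch of the same pattern outside the class, with hypothetical `fetch_one`/`fetch_all` names:

```python
import asyncio

import aiohttp


async def fetch_one(session, semaphore, url):
    # Blocks here once `concurrency` requests are already in flight.
    async with semaphore:
        async with session.get(url) as response:
            return await response.text()


async def fetch_all(urls, concurrency=10):
    semaphore = asyncio.BoundedSemaphore(concurrency)
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(
            *(fetch_one(session, semaphore, url) for url in urls))

# pages = asyncio.run(fetch_all(['https://example.com/']))
```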
@@ -40,15 +41,13 @@ class AsyncCrawler(object):
        async with self.semaphore:
            source = await self.get_source(url)
            if source:
                # add the URL we've just crawled
                self.crawled.add(url)
                # for new_url in self.find_all_urls(source):
                #     urls.add(new_url)
                urls_to_crawl = self.find_all_urls(source)
                # print('discovered {0} new URLs'.format(len(urls_to_crawl)))
                for new_url in urls_to_crawl:
                for new_url in self.find_all_urls(source):
                    urls.add(new_url)
                # add the url we just crawled to the crawled pool.

                # urls_to_crawl = self.find_all_urls(source)
                # for new_url in urls_to_crawl:
                #     urls.add(new_url)

            return urls

@@ -62,7 +61,6 @@ class AsyncCrawler(object):
        url = standardise_url(url=url, base_url=self.baseurl)

        if url and self.robots.check(url=url):
            # print('validated url: {0}'.format(url))
            return url
        else:
            return False
@@ -103,165 +101,21 @@ class AsyncCrawler(object):
        '''
        function which runs the crawler
        '''
        print('Crawling: {}'.format(self.baseurl))
        # print('Crawling: {}'.format(self.baseurl))
        self.client_session = aiohttp.ClientSession(headers=self.headers)
        # provide the starting URL to the crawler
        self.uncrawled.add(self.baseurl)

        while len(self.uncrawled) > 0:
            # print('################################ there are {0} uncrawled urls in the pool'.format(
            #     len(self.uncrawled)))
            url = self.uncrawled.pop()
            # print('################ url popped, there are now {0} uncrawled urls in the pool'.format(
            #     len(self.uncrawled)))
            new_urls = await self.crawl_url(url=url)
            for url in new_urls:
                # print('adding: {0}'.format(url))
                self.uncrawled.add(url)

        await self.client_session.close()
        return self.crawled

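The run loop drives the crawl with two plain sets: pop a URL from `uncrawled`, crawl it, and queue whatever `crawl_url` returns. As written, nothing in this loop stops an already-crawled URL from being re-queued, so that de-duplication has to happen in `crawl_url`. A minimal sketch of the frontier pattern with the guard made explicit, assuming a hypothetical `fetch_links` coroutine:

```python
import asyncio


async def crawl(start_url, fetch_links):
    # fetch_links is a hypothetical coroutine: URL -> iterable of URLs found.
    uncrawled, crawled = {start_url}, set()
    while uncrawled:
        url = uncrawled.pop()
        crawled.add(url)
        for new_url in await fetch_links(url):
            # Skip anything already crawled; the set de-duplicates queued URLs.
            if new_url not in crawled:
                uncrawled.add(new_url)
    return crawled
```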
class UrlPool(object):
    '''
    Object to manage a pool of URLs.
    '''

    def __init__(self):
        self.pool = set()

    def check_duplicate(self, new_url):
        '''
        Checks if a URL exists in the current pool.
        '''
        if new_url in self.pool:
            return True
        else:
            return False

    def remove_from_pool(self):
        '''
        Remove a URL from the pool and return it to be crawled.
        '''
        return(self.pool.pop())

    def add_to_pool(self, url):
        self.pool.add(url)

    def list_pool(self):
        pool = self.pool
        return pool
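Every `UrlPool` method is a one-line wrapper over a built-in `set`, which is presumably why the class is being removed: the crawler's `crawled`/`uncrawled` sets above cover the same ground directly. Plain-set equivalents for illustration:

```python
# Plain-set equivalents of the removed UrlPool methods.
pool = set()
pool.add('https://example.com/')        # add_to_pool(url)
seen = 'https://example.com/' in pool   # check_duplicate(new_url)
next_url = pool.pop()                   # remove_from_pool()
contents = pool                         # list_pool()
```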
class WebPage(object):
    '''
    Object to manage common operations required to return
    the data from each individual page.
    '''

    # set a sane user-agent and request compression if available.
    headers = {'Accept-Encoding': 'gzip, deflate',
               'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}

    def __init__(self, url=None, base_url=None, robots=None):
        self.url = url
        self.base_url = base_url
        self.robots = robots
        self.source = None
        self.urls_to_crawl = set()

    def get_source(self):
        '''
        Retrieve a page's source.
        '''
        request = urllib.request.Request(self.url, headers=self.headers)
        page = urllib.request.urlopen(request, timeout=5)

        # handle the content encoding in case it needs decompressing.
        if 'text/html' in page.info().get('Content-Type'):
            if page.info().get('Content-Encoding'):
                if page.info().get('Content-Encoding') == 'gzip':
                    self.source = gzip.decompress(page.read())
                elif page.info().get('Content-Encoding') == 'deflate':
                    self.source = page.read()
            else:
                self.source = page.read()
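Note that the removed `get_source` stores a `deflate` body without decompressing it, even though the request advertises `Accept-Encoding: gzip, deflate`. A hedged sketch of the decompression step using only the standard library, assuming the same `page` response object:

```python
import gzip
import zlib


def decode_body(page):
    # Decompress the response body according to its Content-Encoding header.
    body = page.read()
    encoding = page.info().get('Content-Encoding')
    if encoding == 'gzip':
        return gzip.decompress(body)
    if encoding == 'deflate':
        # A negative wbits value handles raw deflate streams; some servers
        # send zlib-wrapped data instead, hence the fallback.
        try:
            return zlib.decompress(body, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(body)
    return body
```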
    def find_links(self):
        '''
        Find all URLs on a page and ensure they are absolute. If they are
        relative then they will be appended to the base URL.
        '''
        hrefs = set()

        soup = BeautifulSoup(self.source, 'lxml')
        links = soup.find_all('a', href=True)

        for link in links:
            if link['href'].startswith('/'):
                hrefs.add(urljoin(self.url, link['href']))
            else:
                hrefs.add(link['href'])

        self.discovered_hrefs = hrefs
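The `startswith('/')` test only catches root-relative links; hrefs like `page.html` or `../index.html` would be stored unresolved. `urljoin` leaves absolute URLs untouched and resolves every relative form, so it could be applied unconditionally. A short illustration with a hypothetical base URL:

```python
from urllib.parse import urljoin

base = 'https://example.com/docs/'
# urljoin leaves absolute URLs alone and resolves every relative form.
urljoin(base, 'https://other.org/a')  # 'https://other.org/a'
urljoin(base, '/top')                 # 'https://example.com/top'
urljoin(base, 'page.html')            # 'https://example.com/docs/page.html'
urljoin(base, '../index.html')        # 'https://example.com/index.html'
```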
    def parse_urls(self):
        '''
        Iterate through the list of discovered URLs and add them to the
        pool if they start with the base URL.
        '''
        for url in self.discovered_hrefs:
            if url.startswith(self.base_url) and self.robots.check(url):
                standardised_url = sanitise_url(url=url)
                self.urls_to_crawl.add(standardised_url)

    def list_urls(self):
        '''
        Returns all valid discovered URLs.
        '''
        return self.urls_to_crawl

    def run(self):
        '''
        Attempt to get the page's source and if successful, iterate through it
        to find any links we can crawl.
        '''
        try:
            self.get_source()
        except Exception:
            # skip if we didn't retrieve the source.
            pass

        if self.source:
            self.find_links()
            self.parse_urls()
            return True
        else:
            return False
class RobotsTxt(object):
    '''
    needs a docstring