remove unnecessary classes2

2018-09-14 16:02:20 +01:00
parent db986b0eba
commit 7ebe4855b8
1 changed files with 7 additions and 153 deletions
@@ -32,6 +32,7 @@ class AsyncCrawler(object):
        self.client_session = None
        self.semaphore = asyncio.BoundedSemaphore(concurrency)
    async def crawl_url(self, url=None):
        '''
        docstring
@@ -40,15 +41,13 @@ class AsyncCrawler(object):
        async with self.semaphore:
            source = await self.get_source(url)
            if source:
                # add the URL we've just crawled
                self.crawled.add(url)
-                # for new_url in self.find_all_urls(source):
+                for new_url in self.find_all_urls(source):
                #     urls.add(new_url)
                urls_to_crawl = self.find_all_urls(source)
                # print('discovered {0} new URLs'.format(len(urls_to_crawl)))
                for new_url in urls_to_crawl:
                    urls.add(new_url)
-                # add the url we just crawled to the crawled pool.
+                # urls_to_crawl = self.find_all_urls(source)
-
+                # for new_url in urls_to_crawl:
                #     urls.add(new_url)
        return urls
@@ -62,7 +61,6 @@ class AsyncCrawler(object):
        url = standardise_url(url=url, base_url=self.baseurl)
        if url and self.robots.check(url=url):
            # print('validated url: {0}'.format(url))
            return url
        else:
            return False
@@ -103,165 +101,21 @@ class AsyncCrawler(object):
        '''
        function which runs the crawler
        '''
-        print('Crawling: {}'.format(self.baseurl))
+        # print('Crawling: {}'.format(self.baseurl))
        self.client_session = aiohttp.ClientSession(headers=self.headers)
        # provide the starting URL to the crawler
        self.uncrawled.add(self.baseurl)
        while len(self.uncrawled) > 0:
            # print('################################ there are {0} uncrawled urls in the pool'.format(
            #     len(self.uncrawled)))
            url = self.uncrawled.pop()
            # print('################ url popped, there are now {0} uncrawled urls in the pool'.format(
                # len(self.uncrawled)))
            new_urls = await self.crawl_url(url=url)
            for url in new_urls:
                # print('adding: {0}'.format(url))
                self.uncrawled.add(url)
        await self.client_session.close()
        return self.crawled
 class UrlPool(object):
    '''
    Thin wrapper around a set of URLs that are waiting to be crawled.
    '''

    def __init__(self):
        # the backing set; exposed directly through list_pool().
        self.pool = set()

    def check_duplicate(self, new_url):
        '''
        Return True when new_url is already in the pool, False otherwise.
        '''
        return new_url in self.pool

    def remove_from_pool(self):
        '''
        Pop an arbitrary URL out of the pool and hand it back to the caller.
        Raises KeyError when the pool is empty (set.pop behaviour).
        '''
        next_url = self.pool.pop()
        return next_url

    def add_to_pool(self, url):
        '''
        Insert url into the pool; adding an existing URL is a no-op.
        '''
        self.pool.update((url,))

    def list_pool(self):
        '''
        Return the underlying set itself (not a copy).
        '''
        return self.pool
 class WebPage(object):
    '''
    Object to manage common operations required to return
    the data from each individual page.

    Typical use is to construct it with the page URL, the site's base URL
    and a robots checker (an object exposing check(url)), call run(), and
    then read the crawlable links back with list_urls().
    '''

    # set a sane user-agent and request compression if available.
    headers = {'Accept-Encoding': 'gzip, deflate',
               'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}

    def __init__(self, url=None, base_url=None, robots=None):
        self.url = url
        self.base_url = base_url
        self.robots = robots
        self.source = None
        # initialised here so parse_urls()/list_urls() are safe to call even
        # when find_links() has not run (previously an AttributeError).
        self.discovered_hrefs = set()
        self.urls_to_crawl = set()

    @staticmethod
    def _decode_body(body, encoding):
        '''
        Undo the HTTP Content-Encoding applied to body.

        Returns the decoded bytes, or None when the encoding is not one we
        asked for (anything other than gzip, deflate or no encoding).
        '''
        # local import: zlib is only needed here and the module's import
        # block is not guaranteed to provide it.
        import zlib

        if not encoding or encoding == 'identity':
            return body
        if encoding == 'gzip':
            return gzip.decompress(body)
        if encoding == 'deflate':
            try:
                # the standard says "deflate" is a zlib-wrapped stream...
                return zlib.decompress(body)
            except zlib.error:
                # ...but some servers send a raw deflate stream instead.
                return zlib.decompress(body, -zlib.MAX_WBITS)
        return None

    def get_source(self):
        '''
        Retrieve a page's source.

        Only text/html responses are stored; the raw (decompressed) bytes
        end up in self.source. Network and decoding errors propagate to the
        caller.
        '''
        request = urllib.request.Request(self.url, headers=self.headers)
        # the context manager makes sure the connection is always released.
        with urllib.request.urlopen(request, timeout=5) as page:
            info = page.info()
            # a missing Content-Type header is treated as "not HTML".
            content_type = info.get('Content-Type') or ''
            if 'text/html' not in content_type:
                return
            # handle the content encoding in case it needs decompressing.
            encoding = (info.get('Content-Encoding') or '').strip().lower()
            decoded = self._decode_body(page.read(), encoding)
        if decoded is not None:
            self.source = decoded

    def find_links(self):
        '''
        Find all URLs on a page and ensure they are absolute. Relative
        links (with or without a leading slash) are resolved against the
        URL of the page itself; absolute links pass through urljoin as is.
        '''
        soup = BeautifulSoup(self.source, 'lxml')
        self.discovered_hrefs = {
            urljoin(self.url, link['href'])
            for link in soup.find_all('a', href=True)
        }

    def parse_urls(self):
        '''
        Iterate through the list of discovered URLs and add them to the
        pool if they start with the base URL and robots allows them.
        '''
        for url in self.discovered_hrefs:
            if url.startswith(self.base_url) and self.robots.check(url):
                standardised_url = sanitise_url(url=url)
                self.urls_to_crawl.add(standardised_url)

    def list_urls(self):
        '''
        Returns all valid discovered URLs.
        '''
        return self.urls_to_crawl

    def run(self):
        '''
        Attempt to get the page's source and if successful, iterate through it
        to find any links we can crawl.

        Returns True when source was retrieved and parsed, False otherwise.
        '''
        try:
            self.get_source()
        except Exception:
            # deliberate best-effort: a page that cannot be fetched or
            # decoded is simply skipped, it must not abort the crawl.
            pass

        if not self.source:
            return False

        self.find_links()
        self.parse_urls()
        return True
 class RobotsTxt(object):
    '''
    needs a docstring