From 7ebe4855b8c9b3fe11f4cf515e7b7179b3be0ce6 Mon Sep 17 00:00:00 2001
From: Simon Weald
Date: Fri, 14 Sep 2018 16:02:20 +0100
Subject: [PATCH] remove unecessary classes2

---
 utils/helpers.py | 160 +++--------------------------------------------
 1 file changed, 7 insertions(+), 153 deletions(-)

diff --git a/utils/helpers.py b/utils/helpers.py
index f0791a4..be1bda3 100644
--- a/utils/helpers.py
+++ b/utils/helpers.py
@@ -32,6 +32,7 @@ class AsyncCrawler(object):
         self.client_session = None
         self.semaphore = asyncio.BoundedSemaphore(concurrency)
 
+
     async def crawl_url(self, url=None):
         '''
         docstring
@@ -40,15 +41,13 @@ class AsyncCrawler(object):
         async with self.semaphore:
             source = await self.get_source(url)
             if source:
+                # add the URL we've just crawled
                 self.crawled.add(url)
-                # for new_url in self.find_all_urls(source):
-                #     urls.add(new_url)
-                urls_to_crawl = self.find_all_urls(source)
-                # print('discovered {0} new URLs'.format(len(urls_to_crawl)))
-                for new_url in urls_to_crawl:
+                for new_url in self.find_all_urls(source):
                     urls.add(new_url)
-                # add the url we just crawled to the crawled pool.
-
+                # urls_to_crawl = self.find_all_urls(source)
+                # for new_url in urls_to_crawl:
+                #     urls.add(new_url)
 
         return urls
 
@@ -62,7 +61,6 @@ class AsyncCrawler(object):
         url = standardise_url(url=url, base_url=self.baseurl)
 
         if url and self.robots.check(url=url):
-            # print('validated url: {0}'.format(url))
             return url
         else:
             return False
@@ -103,165 +101,21 @@ class AsyncCrawler(object):
         '''
         function which runs the crawler
         '''
-        print('Crawling: {}'.format(self.baseurl))
+        # print('Crawling: {}'.format(self.baseurl))
         self.client_session = aiohttp.ClientSession(headers=self.headers)
         # provide the starting URL to the crawler
         self.uncrawled.add(self.baseurl)
 
         while len(self.uncrawled) > 0:
-            # print('################################ there are {0} uncrawled urls in the pool'.format(
-            #     len(self.uncrawled)))
             url = self.uncrawled.pop()
-            # print('################ url popped, there are now {0} uncrawled urls in the pool'.format(
-            #     len(self.uncrawled)))
             new_urls = await self.crawl_url(url=url)
             for url in new_urls:
-                # print('adding: {0}'.format(url))
                 self.uncrawled.add(url)
 
         await self.client_session.close()
         return self.crawled
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-class UrlPool(object):
-    '''
-    Object to manage a pool of URLs.
-    '''
-
-    def __init__(self):
-        self.pool = set()
-
-    def check_duplicate(self, new_url):
-        '''
-        Checks if a URL exists in the current pool.
-        '''
-        if new_url in self.pool:
-            return True
-        else:
-            return False
-
-    def remove_from_pool(self):
-        '''
-        Remove a URL from the pool and return it to be crawled.
-        '''
-        return(self.pool.pop())
-
-    def add_to_pool(self, url):
-        self.pool.add(url)
-
-    def list_pool(self):
-        pool = self.pool
-        return pool
-
-
-class WebPage(object):
-    '''
-    Object to manage common operations required to return
-    the data from each individual page.
-    '''
-
-    # set a sane user-agent and request compression if available.
-    headers = {'Accept-Encoding': 'gzip, deflate',
-               'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
-
-    def __init__(self, url=None, base_url=None, robots=None):
-        self.url = url
-        self.base_url = base_url
-        self.robots = robots
-        self.source = None
-        self.urls_to_crawl = set()
-
-
-    def get_source(self):
-        '''
-        Retrieve a page's source.
-        '''
-
-        request = urllib.request.Request(self.url, headers=self.headers)
-        page = urllib.request.urlopen(request, timeout=5)
-
-        # handle the content encoding in case it needs decompressing.
-        if 'text/html' in page.info().get('Content-Type'):
-            if page.info().get('Content-Encoding'):
-                if page.info().get('Content-Encoding') == 'gzip':
-                    self.source = gzip.decompress(page.read())
-                elif page.info().get('Content-Encoding') == 'deflate':
-                    self.source = page.read()
-            else:
-                self.source = page.read()
-
-
-    def find_links(self):
-        '''
-        Find all URLs on a page and ensure they are absolute. If they are
-        relative then they will be appended to the base URL.
-        '''
-        hrefs = set()
-
-        soup = BeautifulSoup(self.source, 'lxml')
-        links = soup.find_all('a', href=True)
-
-        for link in links:
-            if link['href'].startswith('/'):
-                hrefs.add(urljoin(self.url, link['href']))
-            else:
-                hrefs.add(link['href'])
-
-        self.discovered_hrefs = hrefs
-
-
-    def parse_urls(self):
-        '''
-        Iterate through the list of discovered URLs and add them to the
-        pool if they start with the base URL.
-        '''
-        for url in self.discovered_hrefs:
-            if url.startswith(self.base_url) and self.robots.check(url):
-                standardised_url = sanitise_url(url=url)
-                self.urls_to_crawl.add(standardised_url)
-
-
-    def list_urls(self):
-        '''
-        Returns all valid discovered URLs.
-        '''
-
-        return self.urls_to_crawl
-
-
-    def run(self):
-        '''
-        Attempt to get the page's source and if successful, iterate through it
-        to find any links we can crawl.
-        '''
-        try:
-            self.get_source()
-        except Exception:
-            # skip if we didn't retrieve the source.
-            pass
-
-        if self.source:
-            self.find_links()
-            self.parse_urls()
-            return True
-        else:
-            return False
-
-
 
 
 class RobotsTxt(object):
     '''
     needs a docstring