remove unnecessary classes

2018-09-14 16:02:20 +01:00
parent db986b0eba
commit 7ebe4855b8


@@ -32,6 +32,7 @@ class AsyncCrawler(object):
         self.client_session = None
         self.semaphore = asyncio.BoundedSemaphore(concurrency)
 
     async def crawl_url(self, url=None):
         '''
         docstring
@@ -40,15 +41,13 @@ class AsyncCrawler(object):
         async with self.semaphore:
             source = await self.get_source(url)
             if source:
-                # add the URL we've just crawled
                 self.crawled.add(url)
-                # for new_url in self.find_all_urls(source):
-                #     urls.add(new_url)
-                urls_to_crawl = self.find_all_urls(source)
-                # print('discovered {0} new URLs'.format(len(urls_to_crawl)))
-                for new_url in urls_to_crawl:
+                for new_url in self.find_all_urls(source):
                     urls.add(new_url)
-                # add the url we just crawled to the crawled pool.
+                # urls_to_crawl = self.find_all_urls(source)
+                # for new_url in urls_to_crawl:
+                #     urls.add(new_url)
 
         return urls
@@ -62,7 +61,6 @@ class AsyncCrawler(object):
         url = standardise_url(url=url, base_url=self.baseurl)
         if url and self.robots.check(url=url):
-            # print('validated url: {0}'.format(url))
             return url
         else:
             return False
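
The validation hunk above relies on standardise_url() and a robots.check(url=...) call; neither implementation appears in this commit. As a rough illustration only, a robots.txt check along these lines can be built on the standard library's urllib.robotparser (this is an assumption about the approach, not the repository's actual RobotsTxt class):

from urllib import robotparser
from urllib.parse import urljoin

class RobotsCheck(object):
    '''
    Hypothetical stand-in for the RobotsTxt class referenced in the diff.
    '''

    def __init__(self, base_url, user_agent='*'):
        self.user_agent = user_agent
        self.parser = robotparser.RobotFileParser()
        self.parser.set_url(urljoin(base_url, '/robots.txt'))
        self.parser.read()                  # fetch and parse robots.txt

    def check(self, url=None):
        # keyword-argument style mirrors the check(url=url) call in the diff
        return self.parser.can_fetch(self.user_agent, url)

RobotFileParser.can_fetch() returns True when robots.txt allows the given user agent to fetch the URL, which matches how the boolean result is used in the if branch above.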
@@ -103,165 +101,21 @@ class AsyncCrawler(object):
         '''
         function which runs the crawler
         '''
-        print('Crawling: {}'.format(self.baseurl))
+        # print('Crawling: {}'.format(self.baseurl))
         self.client_session = aiohttp.ClientSession(headers=self.headers)
 
         # provide the starting URL to the crawler
         self.uncrawled.add(self.baseurl)
 
         while len(self.uncrawled) > 0:
-            # print('################################ there are {0} uncrawled urls in the pool'.format(
-            #     len(self.uncrawled)))
             url = self.uncrawled.pop()
-            # print('################ url popped, there are now {0} uncrawled urls in the pool'.format(
-            #     len(self.uncrawled)))
             new_urls = await self.crawl_url(url=url)
             for url in new_urls:
-                # print('adding: {0}'.format(url))
                 self.uncrawled.add(url)
 
         await self.client_session.close()
         return self.crawled
 
 
-class UrlPool(object):
-    '''
-    Object to manage a pool of URLs.
-    '''
-
-    def __init__(self):
-        self.pool = set()
-
-    def check_duplicate(self, new_url):
-        '''
-        Checks if a URL exists in the current pool.
-        '''
-        if new_url in self.pool:
-            return True
-        else:
-            return False
-
-    def remove_from_pool(self):
-        '''
-        Remove a URL from the pool and return it to be crawled.
-        '''
-        return(self.pool.pop())
-
-    def add_to_pool(self, url):
-        self.pool.add(url)
-
-    def list_pool(self):
-        pool = self.pool
-        return pool
-
-
-class WebPage(object):
-    '''
-    Object to manage common operations required to return
-    the data from each individual page.
-    '''
-
-    # set a sane user-agent and request compression if available.
-    headers = {'Accept-Encoding': 'gzip, deflate',
-               'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
-
-    def __init__(self, url=None, base_url=None, robots=None):
-        self.url = url
-        self.base_url = base_url
-        self.robots = robots
-        self.source = None
-        self.urls_to_crawl = set()
-
-    def get_source(self):
-        '''
-        Retrieve a page's source.
-        '''
-        request = urllib.request.Request(self.url, headers=self.headers)
-        page = urllib.request.urlopen(request, timeout=5)
-
-        # handle the content encoding in case it needs decompressing.
-        if 'text/html' in page.info().get('Content-Type'):
-            if page.info().get('Content-Encoding'):
-                if page.info().get('Content-Encoding') == 'gzip':
-                    self.source = gzip.decompress(page.read())
-                elif page.info().get('Content-Encoding') == 'deflate':
-                    self.source = page.read()
-            else:
-                self.source = page.read()
-
-    def find_links(self):
-        '''
-        Find all URLs on a page and ensure they are absolute. If they are
-        relative then they will be appended to the base URL.
-        '''
-        hrefs = set()
-        soup = BeautifulSoup(self.source, 'lxml')
-        links = soup.find_all('a', href=True)
-        for link in links:
-            if link['href'].startswith('/'):
-                hrefs.add(urljoin(self.url, link['href']))
-            else:
-                hrefs.add(link['href'])
-        self.discovered_hrefs = hrefs
-
-    def parse_urls(self):
-        '''
-        Iterate through the list of discovered URLs and add them to the
-        pool if they start with the base URL.
-        '''
-        for url in self.discovered_hrefs:
-            if url.startswith(self.base_url) and self.robots.check(url):
-                standardised_url = sanitise_url(url=url)
-                self.urls_to_crawl.add(standardised_url)
-
-    def list_urls(self):
-        '''
-        Returns all valid discovered URLs.
-        '''
-        return self.urls_to_crawl
-
-    def run(self):
-        '''
-        Attempt to get the page's source and if successful, iterate through it
-        to find any links we can crawl.
-        '''
-        try:
-            self.get_source()
-        except Exception:
-            # skip if we didn't retrieve the source.
-            pass
-
-        if self.source:
-            self.find_links()
-            self.parse_urls()
-            return True
-        else:
-            return False
-
-
 class RobotsTxt(object):
     '''
     needs a docstring
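
As a usage note for the slimmed-down AsyncCrawler, driving it from a synchronous entry point might look roughly like the sketch below. The constructor arguments and the run() method name are assumptions, since neither signature is visible in this commit; the keyword names are guessed from the attributes used in the diff (self.baseurl, BoundedSemaphore(concurrency)).

import asyncio

async def main():
    # baseurl/concurrency are assumed keyword names, not confirmed by the diff
    crawler = AsyncCrawler(baseurl='https://example.com', concurrency=5)
    crawled = await crawler.run()
    print('crawled {0} pages'.format(len(crawled)))

if __name__ == '__main__':
    asyncio.run(main())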