Corrected some small errors and added a runner function

This commit is contained in:
2018-08-31 19:01:35 +01:00
parent 5e0d9fd568
commit 1b18aa83eb

View File

@@ -25,10 +25,10 @@ class UrlPool(object):
else: else:
return False return False
def invalidate_url(self, url): def remove_from_pool(self, url):
self.url_pool.remove(url) self.url_pool.remove(url)
def add_to_list(self, url): def add_to_pool(self, url):
self.url_pool.add(url) self.url_pool.add(url)
@@ -39,31 +39,51 @@ class WebPage(object):
def __init__(self, url): def __init__(self, url):
self.url = url self.url = url
def get_source(self): def get_source(self):
request = urllib.request.Request(self.url, headers=self.headers) request = urllib.request.Request(self.url, headers=self.headers)
page = urllib.request.urlopen(request) page = urllib.request.urlopen(request)
self.source = page.read() self.source = page.read()
def find_links(self): def find_links(self):
soup = BeautifulSoup(self.source, 'html.parser') soup = BeautifulSoup(self.source, 'html.parser')
links = soup.find_all('a') links = soup.find_all('a')
hrefs = [] hrefs = set()
for link in links: for link in links:
if link['href'].startswith('/'): if link['href'].startswith('/'):
hrefs.append("".join([self.url, link['href']])) hrefs.add("".join([self.url, link['href']]))
else: else:
hrefs.append(link['href']) hrefs.add(link['href'])
self.hrefs = hrefs self.hrefs = hrefs
def parse_urls(self): def parse_urls(self):
local_urls = [] self.urls_to_crawl = set()
for url in self.hrefs: for url in self.hrefs:
if url.startswith(self.url): if url.startswith(self.url):
local_urls.append(url) self.urls_to_crawl.add(url)
return local_urls
def run(self):
try:
self.get_source()
except Exception as e:
print(e)
try:
self.find_links()
except Exception as e:
print(e)
try:
self.parse_urls()
except Exception as e:
print(e)
return self.urls_to_crawl
def sanitise_url(url): def sanitise_url(url):