Corrected some small errors and added a runner function

This commit is contained in:
2018-08-31 19:01:35 +01:00
parent 5e0d9fd568
commit 1b18aa83eb

View File

@@ -25,10 +25,10 @@ class UrlPool(object):
else:
return False
def remove_from_pool(self, url):
    """Remove *url* from the shared pool.

    Raises KeyError if the URL is not present (set.remove semantics).
    NOTE(review): the diff overlay left a bodiless ``def invalidate_url``
    header here (the method's pre-rename name); only the renamed method
    survives in the committed version.
    """
    self.url_pool.remove(url)
def add_to_pool(self, url):
    """Add *url* to the shared pool (no-op if already present).

    NOTE(review): the diff overlay left a bodiless ``def add_to_list``
    header here (the method's pre-rename name); only the renamed method
    survives in the committed version.
    """
    self.url_pool.add(url)
@@ -39,31 +39,51 @@ class WebPage(object):
def __init__(self, url):
    """Remember the target URL; fetching/parsing happen in later calls."""
    self.url = url
def get_source(self):
    """Fetch the page at ``self.url`` and store the raw body in ``self.source``.

    Uses ``self.headers`` for the request (assumed set elsewhere on the
    class — not visible here; TODO confirm). Raises ``urllib.error.URLError``
    / ``HTTPError`` on network failure.
    """
    request = urllib.request.Request(self.url, headers=self.headers)
    # Close the response deterministically; the original never closed it,
    # leaking the underlying socket.
    with urllib.request.urlopen(request) as response:
        self.source = response.read()
def find_links(self):
    """Collect every <a href> on the fetched page into ``self.hrefs`` (a set).

    Hrefs starting with '/' are treated as site-relative and prefixed with
    ``self.url``; everything else is kept as-is. Requires ``self.source``
    to have been populated by ``get_source`` first.

    NOTE(review): the diff overlay kept both the old list-based lines
    (``hrefs = []`` / ``.append``) and the new set-based ones; as rendered,
    calling ``.append`` on a set would raise AttributeError. This is the
    deduplicated (post-commit) set version.
    """
    soup = BeautifulSoup(self.source, 'html.parser')
    hrefs = set()
    for link in soup.find_all('a'):
        href = link['href']
        if href.startswith('/'):
            # Site-relative link: resolve against the page's base URL.
            hrefs.add("".join([self.url, href]))
        else:
            hrefs.add(href)
    self.hrefs = hrefs
def parse_urls(self):
    """Filter ``self.hrefs`` down to same-site URLs.

    Stores the result in ``self.urls_to_crawl`` (a set) and also returns
    it as a list for callers of the pre-commit API.

    NOTE(review): the diff overlay left both the old ``local_urls`` list
    and the new ``urls_to_crawl`` set tracking the same items; this keeps
    the set side effect and the list return without the duplication.
    """
    self.urls_to_crawl = {
        url for url in self.hrefs if url.startswith(self.url)
    }
    return list(self.urls_to_crawl)
def run(self):
    """Fetch, parse, and return the set of same-site URLs found on the page.

    Best-effort: any failure is printed (matching the original behaviour)
    and an empty set is returned instead of raising.
    """
    try:
        # Each step depends on the previous one's side effects, so a single
        # try is correct: the original wrapped each step separately, letting
        # later steps run (and fail confusingly) after an earlier failure.
        self.get_source()
        self.find_links()
        self.parse_urls()
    except Exception as e:
        print(e)
    # urls_to_crawl is only set by parse_urls; if the pipeline died before
    # that, return an empty set rather than raising AttributeError.
    return getattr(self, 'urls_to_crawl', set())
def sanitise_url(url):