Compare commits


2 Commits

2 changed files with 46 additions and 9 deletions


@@ -10,6 +10,21 @@ def init_crawler(base_url=None):
     '''
     needs a docstring
     '''
+    uncrawled_urls, crawled_urls = UrlPool(), UrlPool()
+    initial_page = WebPage(base_url)
+    try:
+        initial_urls = initial_page.run()
+    except Exception as e:
+        print(e)
+    for url in initial_urls:
+        try:
+            uncrawled_urls.add_to_pool(url)
+        except Exception as e:
+            print(e)
+    print(uncrawled_urls.url_pool)
 
 
 def run(args=None):
@@ -17,7 +32,9 @@ def run(args=None):
     needs a docstring.
     '''
     base_url = sanitise_url(args.url)
+    print(base_url)
+    init_crawler(base_url)
 
 
 if __name__ == '__main__':


@@ -25,10 +25,10 @@ class UrlPool(object):
         else:
             return False
 
-    def invalidate_url(self, url):
+    def remove_from_pool(self, url):
         self.url_pool.remove(url)
 
-    def add_to_list(self, url):
+    def add_to_pool(self, url):
         self.url_pool.add(url)
@@ -39,31 +39,51 @@ class WebPage(object):
     def __init__(self, url):
         self.url = url
 
     def get_source(self):
         request = urllib.request.Request(self.url, headers=self.headers)
         page = urllib.request.urlopen(request)
         self.source = page.read()
 
     def find_links(self):
         soup = BeautifulSoup(self.source, 'html.parser')
         links = soup.find_all('a')
-        hrefs = []
+        hrefs = set()
        for link in links:
             if link['href'].startswith('/'):
-                hrefs.append("".join([self.url, link['href']]))
+                hrefs.add("".join([self.url, link['href']]))
             else:
-                hrefs.append(link['href'])
+                hrefs.add(link['href'])
         self.hrefs = hrefs
 
     def parse_urls(self):
-        local_urls = []
+        self.urls_to_crawl = set()
         for url in self.hrefs:
             if url.startswith(self.url):
-                local_urls.append(url)
+                self.urls_to_crawl.add(url)
-        return local_urls
 
+    def run(self):
+        try:
+            self.get_source()
+        except Exception as e:
+            print(e)
+        try:
+            self.find_links()
+        except Exception as e:
+            print(e)
+        try:
+            self.parse_urls()
+        except Exception as e:
+            print(e)
+        return self.urls_to_crawl
 
 
 def sanitise_url(url):
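
Taken together, the commits rename the UrlPool helpers (add_to_list becomes add_to_pool, invalidate_url becomes remove_from_pool), switch the link collections from lists to sets, add a WebPage.run() wrapper, and call it from init_crawler(). Below is a minimal usage sketch of the refactored pieces; the module name `crawler` and the example URL are assumptions, everything else comes from the diff above.

# Illustrative sketch only; `crawler` as the module name and the URL are assumptions.
from crawler import UrlPool, WebPage

base_url = 'https://example.com'
uncrawled_urls = UrlPool()

# WebPage.run() fetches the page, extracts its links, and returns the set of
# same-site URLs it collected in urls_to_crawl.
initial_page = WebPage(base_url)
for url in initial_page.run():
    uncrawled_urls.add_to_pool(url)      # renamed from add_to_list()

print(uncrawled_urls.url_pool)

# Once a URL has been crawled, drop it from the pool (renamed from invalidate_url()).
next_url = next(iter(uncrawled_urls.url_pool), None)
if next_url is not None:
    uncrawled_urls.remove_from_pool(next_url)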