Compare commits

...

2 Commits

2 changed files with 46 additions and 9 deletions

View File

@@ -10,6 +10,21 @@ def init_crawler(base_url=None):
     '''
     needs a docstring
     '''
+    uncrawled_urls, crawled_urls = UrlPool(), UrlPool()
+    initial_page = WebPage(base_url)
+    try:
+        initial_urls = initial_page.run()
+    except Exception as e:
+        print(e)
+    for url in initial_urls:
+        try:
+            uncrawled_urls.add_to_pool(url)
+        except Exception as e:
+            print(e)
+    print(uncrawled_urls.url_pool)

 def run(args=None):
@@ -17,7 +32,9 @@ def run(args=None):
     needs a docstring.
     '''
     base_url = sanitise_url(args.url)
+    print(base_url)
+    init_crawler(base_url)

 if __name__ == '__main__':
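
For context, the new init_crawler() bootstrap is driven from run(), which only needs an object carrying a url attribute. A minimal sketch of how this entry point might be invoked from the command line, assuming an argparse-based parser (the parser setup below is illustrative and not part of this diff):

import argparse

def main():
    # Hypothetical CLI wiring: run() expects an object with a .url attribute.
    parser = argparse.ArgumentParser(description='Simple crawler entry point')
    parser.add_argument('url', help='base URL to start crawling from')
    args = parser.parse_args()
    run(args)  # sanitises args.url, prints it, then calls init_crawler()

if __name__ == '__main__':
    main()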

View File

@@ -25,10 +25,10 @@ class UrlPool(object):
         else:
             return False

-    def invalidate_url(self, url):
+    def remove_from_pool(self, url):
         self.url_pool.remove(url)

-    def add_to_list(self, url):
+    def add_to_pool(self, url):
         self.url_pool.add(url)
@@ -39,31 +39,51 @@ class WebPage(object):
     def __init__(self, url):
         self.url = url

     def get_source(self):
         request = urllib.request.Request(self.url, headers=self.headers)
         page = urllib.request.urlopen(request)
         self.source = page.read()

     def find_links(self):
         soup = BeautifulSoup(self.source, 'html.parser')
         links = soup.find_all('a')
-        hrefs = []
+        hrefs = set()
         for link in links:
             if link['href'].startswith('/'):
-                hrefs.append("".join([self.url, link['href']]))
+                hrefs.add("".join([self.url, link['href']]))
             else:
-                hrefs.append(link['href'])
+                hrefs.add(link['href'])
         self.hrefs = hrefs

     def parse_urls(self):
-        local_urls = []
+        self.urls_to_crawl = set()
         for url in self.hrefs:
             if url.startswith(self.url):
-                local_urls.append(url)
-        return local_urls
+                self.urls_to_crawl.add(url)
+
+    def run(self):
+        try:
+            self.get_source()
+        except Exception as e:
+            print(e)
+        try:
+            self.find_links()
+        except Exception as e:
+            print(e)
+        try:
+            self.parse_urls()
+        except Exception as e:
+            print(e)
+        return self.urls_to_crawl

 def sanitise_url(url):
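
Taken together, the renamed UrlPool methods and the new WebPage.run() pipeline can be exercised as below. This is a minimal usage sketch, assuming both classes are importable from the crawler module (the module name and example URL are placeholders, not taken from this diff):

from crawler import UrlPool, WebPage

pool = UrlPool()
page = WebPage('https://example.com')

# run() chains get_source() -> find_links() -> parse_urls() and returns
# urls_to_crawl, the set of links that start with the page's own URL.
for url in page.run():
    pool.add_to_pool(url)          # renamed from add_to_list()

# Once a URL has been visited it can be dropped from the pool.
for url in set(pool.url_pool):     # copy first, since remove_from_pool() mutates the set
    pool.remove_from_pool(url)     # renamed from invalidate_url()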