From 03554fde80a6ee59af023bf8547331529a795b7c Mon Sep 17 00:00:00 2001 From: Simon Weald Date: Fri, 31 Aug 2018 19:15:35 +0100 Subject: [PATCH] add docstrings --- utils/helpers.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/utils/helpers.py b/utils/helpers.py index e4ed38b..4c15603 100644 --- a/utils/helpers.py +++ b/utils/helpers.py @@ -41,15 +41,24 @@ class WebPage(object): def get_source(self): + ''' + Retrieve a page's source. + ''' + request = urllib.request.Request(self.url, headers=self.headers) page = urllib.request.urlopen(request) self.source = page.read() def find_links(self): + ''' + Find all URLs on a page and ensure they are absolute. If they are + relative then they will be appended to the base URL. + ''' + hrefs = set() + soup = BeautifulSoup(self.source, 'html.parser') links = soup.find_all('a') - hrefs = set() for link in links: if link['href'].startswith('/'): @@ -61,6 +70,11 @@ class WebPage(object): def parse_urls(self): + ''' + Iterate through the list of discovered URLs and add them to the + pool if they start with the base URL. + ''' + self.urls_to_crawl = set() for url in self.discovered_hrefs: if url.startswith(self.url):