From 5d949911672657b92f9e6d73940b68ffdff96161 Mon Sep 17 00:00:00 2001
From: Simon Weald
Date: Tue, 28 Aug 2018 22:29:36 +0100
Subject: [PATCH] start making the scraper an object

---
 scraper.py | 36 ++++++++++++++++++++++++++++++------
 1 file changed, 30 insertions(+), 6 deletions(-)

diff --git a/scraper.py b/scraper.py
index 9705fd5..d86e192 100644
--- a/scraper.py
+++ b/scraper.py
@@ -1,15 +1,36 @@
 #!/usr/bin/env python
 
+import re
 import argparse
 import urllib.request
 from bs4 import BeautifulSoup
 
-# class WebScraper(object):
+class WebPage(object):
 
-#     def __init__(self, url):
-#         parser = argparse.ArgumentParser(description='Recursive web scraper')
-#         parser.add_argument("-s", "--site", required=True, help="Base url to scrape")
-#         self.args = parser.parse_args()
+    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
+
+    def __init__(self, args):
+        self.url = args['url']
+
+
+    def get_source(self, args=None):
+        request = urllib.request.Request(self.url, headers=self.headers)
+        page = urllib.request.urlopen(request)
+        self.source = page.read()
+
+
+    def find_links(self, args=None, source=None):
+        soup = BeautifulSoup(self.source, 'html.parser')
+        links = soup.find_all('a')
+        hrefs = []
+
+        for link in links:
+            if link['href'].startswith('/'):
+                hrefs.append("".join([self.url, link['href']]))
+            else:
+                hrefs.append(link['href'])
+
+        return hrefs
 
 
 def run(args=None):
@@ -60,5 +81,8 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='Recursive web scraper')
     parser.add_argument("-u", "--url", required=True, help="Base url to scrape")
     args = parser.parse_args()
-    print(args)
+
+    if not args.url.startswith('http'):
+        raise SystemExit('URL must start with a protocol (http(s)).')
+    run(args)
 
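
Note: the body of run() is not included in this patch, so the following is only a minimal sketch of how the new WebPage class might be wired up from run(). The use of vars(args) to turn the argparse Namespace into the dict that WebPage.__init__ expects, and the loop that prints the discovered links, are assumptions for illustration, not the patch's actual implementation.

    def run(args=None):
        # WebPage.__init__ expects a mapping with a 'url' key,
        # so convert the argparse Namespace to a dict (assumed wiring).
        page = WebPage(vars(args))
        # Download the page body into page.source.
        page.get_source()
        # find_links() returns absolute URLs for root-relative hrefs
        # and passes other hrefs through unchanged.
        for href in page.find_links():
            print(href)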