From 5c933fc5c93dcff73a1f2edee9027d24a89fb16e Mon Sep 17 00:00:00 2001
From: Simon Weald
Date: Tue, 28 Aug 2018 18:29:34 +0100
Subject: [PATCH] initial commit of single-page scraper

---
 scraper.py | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 74 insertions(+), 1 deletion(-)

diff --git a/scraper.py b/scraper.py
index fef66b5..9705fd5 100644
--- a/scraper.py
+++ b/scraper.py
@@ -1 +1,74 @@
-#!/usr/bin/env python
\ No newline at end of file
+#!/usr/bin/env python3
+
+import argparse
+import urllib.request
+
+from bs4 import BeautifulSoup
+
+# class WebScraper(object):
+#
+#     def __init__(self, url):
+#         parser = argparse.ArgumentParser(description='Recursive web scraper')
+#         parser.add_argument("-s", "--site", required=True, help="Base url to scrape")
+#         self.args = parser.parse_args()
+
+
+def run(args=None):
+    # Fetch the page, pull out its links, then keep only the on-site ones.
+    source = get_source(args)
+    urls = find_links(args, source)
+    local_urls = parse_urls(args, urls)
+
+    print(local_urls)
+
+
+def get_source(args=None):
+    # Download the raw HTML, sending a browser-like User-Agent since some
+    # sites reject the default urllib one.
+    useragent = 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'
+    headers = {'User-Agent': useragent}
+    request = urllib.request.Request(args.url, headers=headers)
+
+    with urllib.request.urlopen(request) as page:
+        source = page.read()
+
+    return source
+
+
+def find_links(args=None, source=None):
+    # Collect the href of every anchor, expanding root-relative links
+    # against the base url. Anchors without an href are skipped rather
+    # than raising a KeyError.
+    soup = BeautifulSoup(source, 'html.parser')
+    links = soup.find_all('a')
+    hrefs = []
+
+    for link in links:
+        href = link.get('href')
+        if href is None:
+            continue
+        if href.startswith('/'):
+            hrefs.append("".join([args.url, href]))
+        else:
+            hrefs.append(href)
+
+    return hrefs
+
+
+def parse_urls(args=None, urls=None):
+    # Keep only the urls that live under the base url.
+    local_urls = []
+
+    for url in urls:
+        if url.startswith(args.url):
+            local_urls.append(url)
+
+    return local_urls
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Recursive web scraper')
+    parser.add_argument("-u", "--url", required=True, help="Base url to scrape")
+    args = parser.parse_args()
+
+    run(args)
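
As a quick offline check of find_links and parse_urls: a minimal sketch, assuming
the patch is applied and scraper.py is on the import path; the inline HTML and the
example.com base url are made up for illustration.

    from argparse import Namespace
    from scraper import find_links, parse_urls

    # Fake page: a root-relative link, an absolute on-site link,
    # an off-site link, and an anchor with no href at all.
    source = """
    <a href="/about">About</a>
    <a href="https://example.com/blog">Blog</a>
    <a href="https://other.org/">Elsewhere</a>
    <a name="top">Top</a>
    """

    args = Namespace(url='https://example.com')
    hrefs = find_links(args, source)
    # ['https://example.com/about', 'https://example.com/blog', 'https://other.org/']

    print(parse_urls(args, hrefs))
    # ['https://example.com/about', 'https://example.com/blog']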
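
One caveat with the startswith('/') branch in find_links: page-relative hrefs such
as about.html or ../index.html pass through unresolved, and a base url ending in
'/' produces a double slash when concatenated. A sketch of an alternative using
urllib.parse.urljoin from the standard library; find_links_urljoin is a
hypothetical name, not something in this patch.

    from urllib.parse import urljoin
    from bs4 import BeautifulSoup

    def find_links_urljoin(base_url, source):
        # href=True matches only anchors that actually carry an href.
        # urljoin leaves absolute urls untouched and resolves root-relative,
        # page-relative and protocol-relative hrefs against base_url.
        soup = BeautifulSoup(source, 'html.parser')
        return [urljoin(base_url, a['href']) for a in soup.find_all('a', href=True)]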
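
The argparse description already reads "Recursive web scraper" even though this
commit stops after a single page. One possible shape for the follow-up, assuming
get_source, find_links and parse_urls as committed above; crawl and its
breadth-first queue are illustrative, not part of the patch.

    from argparse import Namespace
    from scraper import get_source, find_links, parse_urls

    def crawl(base_url):
        # Breadth-first walk over on-site links, with a visited set to
        # avoid fetching any page twice or looping on circular links.
        base = Namespace(url=base_url)
        seen = set()
        queue = [base_url]

        while queue:
            url = queue.pop(0)
            if url in seen:
                continue
            seen.add(url)
            source = get_source(Namespace(url=url))
            # Resolve and filter against the original base so the
            # crawl never leaves the site.
            for local in parse_urls(base, find_links(base, source)):
                if local not in seen:
                    queue.append(local)

        return seen

A collections.deque would make the front pop O(1); a plain list just keeps the
sketch close to the committed style.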