#!/usr/bin/env python
"""Fetch one page and print the links on it that start with the base URL."""

import re
import argparse
import urllib.parse
import urllib.request

# Firefox-style User-Agent string sent with every request.
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'


def _url_of(args):
    """Return the base URL from a mapping (args['url']) or a namespace (args.url)."""
    try:
        return args['url']
    except TypeError:
        # argparse.Namespace (and None) are not subscriptable; use the attribute.
        return args.url


def _fetch(url, headers):
    """Return the raw response body for *url*, sending *headers* with the request."""
    request = urllib.request.Request(url, headers=headers)
    # The context manager closes the response even if read() raises.
    with urllib.request.urlopen(request) as page:
        return page.read()


def _extract_links(base_url, source):
    """Return the href of every anchor in *source*.

    Hrefs starting with '/' are resolved against *base_url* (when one is
    given); every other href is returned unchanged.
    """
    # Imported here rather than at module level so the module can be imported
    # without BeautifulSoup installed; only link extraction needs it.
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(source, 'html.parser')
    hrefs = []

    # href=True skips anchors that have no href attribute at all, which would
    # otherwise raise KeyError on link['href'].
    for link in soup.find_all('a', href=True):
        href = link['href']
        if base_url and href.startswith('/'):
            # urljoin resolves against the site root, so a base URL with a path
            # or trailing slash does not produce "host/path//about"-style URLs.
            hrefs.append(urllib.parse.urljoin(base_url, href))
        else:
            hrefs.append(href)

    return hrefs


class WebPage(object):
    """A page identified by its URL, with helpers to fetch it and list its links."""

    headers = {'User-Agent': USER_AGENT}

    def __init__(self, args):
        """Store the page URL taken from *args* (mapping or namespace with 'url')."""
        self.url = _url_of(args)

    def get_source(self, args=None):
        """Download the page and store the raw body in ``self.source``.

        *args* is accepted for signature compatibility and is not used.
        """
        self.source = _fetch(self.url, self.headers)

    def find_links(self, args=None, source=None):
        """Return the hrefs found in *source*, or in ``self.source`` if omitted.

        *args* is accepted for signature compatibility and is not used.
        """
        html = self.source if source is None else source
        return _extract_links(self.url, html)


def run(args=None):
    """Fetch the page at the URL in *args* and print its links under that URL."""
    source = get_source(args)
    urls = find_links(args, source)
    local_urls = parse_urls(args, urls)

    print(local_urls)


def get_source(args=None):
    """Return the raw body of the page at the URL in *args*."""
    return _fetch(_url_of(args), {'User-Agent': USER_AGENT})


def find_links(args=None, source=None):
    """Return the hrefs of all anchors in *source*.

    Hrefs starting with '/' are resolved against the URL in *args*; when
    *args* is None they are returned unresolved.
    """
    base_url = None if args is None else _url_of(args)
    return _extract_links(base_url, source)


def parse_urls(args=None, urls=None):
    """Return the entries of *urls* that start with the URL in *args*.

    An empty or missing *urls* yields an empty list.
    """
    if not urls:
        return []

    base_url = _url_of(args)
    return [url for url in urls if url.startswith(base_url)]


def main():
    """Parse the command line, validate the URL and run the scraper."""
    parser = argparse.ArgumentParser(description='Recursive web scraper')
    parser.add_argument("-u", "--url", required=True, help="Base url to scrape")
    args = parser.parse_args()

    # Require a real scheme prefix; a bare 'http' prefix would accept 'httpfoo'.
    if not args.url.startswith(('http://', 'https://')):
        raise SystemExit('URL must start with a protocol (http(s)).')

    run(args)


if __name__ == "__main__":
    main()