#!/usr/bin/env python
import argparse
import urllib.request

from bs4 import BeautifulSoup

# class WebScraper(object):
#     def __init__(self, url):
#         parser = argparse.ArgumentParser(description='Recursive web scraper')
#         parser.add_argument("-s", "--site", required=True, help="Base url to scrape")
#         self.args = parser.parse_args()


def run(args=None):
    """Fetch the base page, collect its links, and print the same-site ones."""
    source = get_source(args)
    urls = find_links(args, source)
    local_urls = parse_urls(args, urls)
    print(local_urls)


def get_source(args=None):
    """Download the page at args.url and return its raw HTML."""
    url = args.url
    # Send a browser-like User-Agent so simple bot filters don't reject the request.
    useragent = 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'
    headers = {'User-Agent': useragent}
    request = urllib.request.Request(url, headers=headers)
    page = urllib.request.urlopen(request)
    source = page.read()
    return source


def find_links(args=None, source=None):
    """Return the href of every anchor tag, expanding root-relative paths."""
    soup = BeautifulSoup(source, 'html.parser')
    links = soup.find_all('a')
    hrefs = []
    for link in links:
        href = link.get('href')
        # Skip anchors without an href attribute instead of raising a KeyError.
        if not href:
            continue
        if href.startswith('/'):
            # Root-relative link: prefix it with the base url.
            hrefs.append("".join([args.url, href]))
        else:
            hrefs.append(href)
    return hrefs


def parse_urls(args=None, urls=None):
    """Keep only the urls that live under the base url."""
    local_urls = []
    for url in urls:
        if url.startswith(args.url):
            local_urls.append(url)
    return local_urls


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Recursive web scraper')
    parser.add_argument("-u", "--url", required=True, help="Base url to scrape")
    args = parser.parse_args()
    run(args)
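
# --- Usage sketch ---
# A minimal example of running the script, assuming it is saved as
# scraper.py and that https://example.com stands in for a site you are
# allowed to scrape (both names are placeholders, not part of the script):
#
#   python scraper.py --url https://example.com
#
# The same pipeline can also be exercised programmatically by building the
# argparse Namespace by hand, mirroring what the __main__ block does:
#
#   from argparse import Namespace
#   demo_args = Namespace(url="https://example.com")  # placeholder base url
#   html = get_source(demo_args)
#   all_links = find_links(demo_args, html)
#   print(parse_urls(demo_args, all_links))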