initial commit of single-page scraper

This commit is contained in:
2018-08-28 18:29:34 +01:00
parent 25f8c4c686
commit 5c933fc5c9

View File

@@ -1 +1,64 @@
#!/usr/bin/env python
#!/usr/bin/env python
import argparse
import urllib.request
from bs4 import BeautifulSoup
# class WebScraper(object):
# def __init__(self, url):
# parser = argparse.ArgumentParser(description='Recursive web scraper')
# parser.add_argument("-s", "--site", required=True, help="Base url to scrape")
# self.args = parser.parse_args()
def run(args=None):
    """Scrape a single page and print the links that start with its base URL.

    Downloads the page named by ``args.url``, extracts the anchor targets
    from it, filters them through ``parse_urls`` and prints the resulting
    list to standard output. Nothing is returned.

    Args:
        args: Parsed command-line namespace; it is forwarded unchanged to
            ``get_source``, ``find_links`` and ``parse_urls``.
    """
    # Fetch and extract in one step; the intermediate HTML is not needed again.
    page_links = find_links(args, get_source(args))
    print(parse_urls(args, page_links))
def get_source(args=None):
    """Download the document at ``args.url`` and return its raw body.

    The request carries a desktop Firefox ``User-Agent`` header instead of
    urllib's default one.

    Args:
        args: Object exposing a ``url`` attribute, e.g. the namespace
            produced by ``argparse``.

    Returns:
        The response body exactly as returned by ``read()`` (undecoded).

    Raises:
        AttributeError: If ``args`` is ``None`` or has no ``url`` attribute.
        Exception: Anything ``urllib.request.urlopen`` raises for an
            unreachable or invalid URL is propagated unchanged.
    """
    useragent = 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'
    request = urllib.request.Request(args.url, headers={'User-Agent': useragent})
    # The context manager closes the underlying connection even if read()
    # fails; previously the response object was never closed.
    with urllib.request.urlopen(request) as page:
        return page.read()
def find_links(args=None, source=None):
    """Return the target of every ``<a href=...>`` found in an HTML document.

    Root-relative targets (those starting with a single ``/``) are turned
    into absolute URLs by prefixing them with ``args.url``; every other
    target is returned exactly as written in the document.

    Args:
        args: Object exposing a ``url`` attribute holding the base URL.
            Only read when a root-relative link is encountered.
        source: HTML markup to parse. ``None`` or empty markup yields an
            empty list.

    Returns:
        A list of link targets in document order (duplicates are kept).
    """
    if not source:
        return []

    soup = BeautifulSoup(source, 'html.parser')
    hrefs = []
    # href=True skips anchors that have no href attribute (for example
    # <a name="top">), which used to raise KeyError on link['href'].
    for link in soup.find_all('a', href=True):
        href = link['href']
        # A leading "//" is a protocol-relative URL naming another host, so
        # it must not be glued onto the base URL.
        if href.startswith('/') and not href.startswith('//'):
            # Drop trailing slashes from the base so that a base of
            # "http://host/" does not produce "http://host//path".
            hrefs.append(args.url.rstrip('/') + href)
        else:
            hrefs.append(href)
    return hrefs
def parse_urls(args=None, urls=None):
    """Keep only the URLs that live under the base URL ``args.url``.

    A URL is kept when it equals the base URL (ignoring trailing slashes on
    the base) or continues it at a URL boundary: ``/``, ``?`` or ``#``. A
    plain prefix test is not enough, because ``http://example.com.evil.org``
    also starts with ``http://example.com``.

    Args:
        args: Object exposing a ``url`` attribute holding the base URL.
            Only read when ``urls`` is non-empty.
        urls: Iterable of URL strings. ``None`` or an empty collection
            yields an empty list.

    Returns:
        A new list with the matching URLs in their original order.
    """
    if not urls:
        return []

    base = args.url.rstrip('/')
    local_urls = []
    for url in urls:
        if not url.startswith(base):
            continue
        # Whatever follows the base must start a path, query or fragment;
        # otherwise the match is just a longer host or path segment.
        remainder = url[len(base):]
        if not remainder or remainder[0] in '/?#':
            local_urls.append(url)
    return local_urls
if __name__ == "__main__":
    # Command-line entry point: the only option is the mandatory base URL.
    parser = argparse.ArgumentParser(description='Recursive web scraper')
    parser.add_argument(
        "-u",
        "--url",
        required=True,
        help="Base url to scrape",
    )
    args = parser.parse_args()

    # Echo the parsed namespace before scraping the page.
    print(args)
    run(args)