start making the scraper an object

This commit is contained in:
2018-08-28 22:29:36 +01:00
parent 482d23dd4f
commit 5d94991167

View File

@@ -1,15 +1,36 @@
#!/usr/bin/env python
import re
import argparse
import urllib.request
from bs4 import BeautifulSoup
# class WebScraper(object):
class WebPage(object):
    """A single web page that can be downloaded and scanned for links.

    Attributes:
        headers: HTTP request headers sent with every download (class-wide).
        url: Address of the page, taken from the constructor argument.
        source: Raw response body as bytes; only set once ``get_source``
            has been called.
    """

    # Browser-like User-Agent sent with every request.
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}

    def __init__(self, args):
        """Store the page URL.

        Args:
            args: Either a mapping with a ``'url'`` key or an object with a
                ``url`` attribute (such as an ``argparse.Namespace``).
        """
        try:
            self.url = args['url']
        except TypeError:
            # argparse.Namespace is not subscriptable; read the attribute.
            self.url = args.url

    def get_source(self, args=None):
        """Download ``self.url`` and keep the response body in ``self.source``.

        Args:
            args: Unused; kept so existing callers keep working.
        """
        # ``headers`` is a class attribute, so it must be reached through
        # ``self``; a bare ``headers`` is not in scope inside a method.
        request = urllib.request.Request(self.url, headers=self.headers)
        # The context manager closes the response even if read() fails.
        with urllib.request.urlopen(request) as page:
            self.source = page.read()

    def find_links(self, args=None, source=None):
        """Return the targets of all ``<a href=...>`` tags in ``self.source``.

        Links starting with ``/`` are resolved against ``self.url``; every
        other href is returned exactly as written in the document. Anchors
        without an ``href`` attribute are skipped.

        Args:
            args: Unused; kept so existing callers keep working.
            source: Unused; kept so existing callers keep working.

        Returns:
            A list of href strings in document order.
        """
        from urllib.parse import urljoin

        soup = BeautifulSoup(self.source, 'html.parser')
        hrefs = []
        for link in soup.find_all('a'):
            href = link.get('href')
            if href is None:
                # e.g. <a name="top">: nothing to follow.
                continue
            if href.startswith('/'):
                # urljoin resolves root-relative and protocol-relative
                # links correctly even when self.url has a path or a
                # trailing slash, unlike plain string concatenation.
                hrefs.append(urljoin(self.url, href))
            else:
                hrefs.append(href)
        return hrefs
def run(args=None):
@@ -60,5 +81,8 @@ if __name__ == "__main__":
# Command-line entry point: read the base URL, validate it, start scraping.
parser = argparse.ArgumentParser(description='Recursive web scraper')
parser.add_argument("-u", "--url", required=True, help="Base url to scrape")
args = parser.parse_args()
print(args)
# Require a real scheme prefix; a bare 'http' check would also let through
# values such as 'httpfoo' that the error message says are invalid.
if not args.url.startswith(('http://', 'https://')):
    raise SystemExit('URL must start with a protocol (http(s)).')
run(args)