start making the scraper an object
scraper.py
@@ -1,15 +1,36 @@
#!/usr/bin/env python

import re
import argparse
import urllib.request
from bs4 import BeautifulSoup


# class WebScraper(object):
class WebPage(object):

    # def __init__(self, url):
    #     parser = argparse.ArgumentParser(description='Recursive web scraper')
    #     parser.add_argument("-s", "--site", required=True, help="Base url to scrape")
    #     self.args = parser.parse_args()

    # Browser-like User-Agent so servers are less likely to reject the request.
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}

    def __init__(self, args):
        self.url = args['url']

    def get_source(self, args=None):
        # headers is a class attribute, so it must be reached through self;
        # a bare `headers` here would raise NameError.
        request = urllib.request.Request(self.url, headers=self.headers)
        page = urllib.request.urlopen(request)
        self.source = page.read()

    def find_links(self, args=None, source=None):
        soup = BeautifulSoup(self.source, 'html.parser')
        links = soup.find_all('a')
        hrefs = []

        for link in links:
            # Anchors without an href would raise KeyError on link['href'].
            href = link.get('href')
            if href is None:
                continue
            # Join root-relative links onto the base url; pass others through.
            if href.startswith('/'):
                hrefs.append("".join([self.url, href]))
            else:
                hrefs.append(href)

        return hrefs


def run(args=None):
@@ -60,5 +81,8 @@ if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Recursive web scraper')
    parser.add_argument("-u", "--url", required=True, help="Base url to scrape")
    args = parser.parse_args()
    print(args)

    if not args.url.startswith('http'):
        raise SystemExit('URL must start with a protocol (http(s)).')

    run(args)
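
A minimal sketch of how the new WebPage object could be driven once run() is fleshed out. This is illustrative, not part of the commit: it only assumes the class as defined in the diff above, and uses vars() to bridge the argparse Namespace to the dict that __init__ expects via args['url'].

    # Hypothetical driver, not from the commit: exercises the WebPage class above.
    page = WebPage(vars(args))      # vars() turns the Namespace into a plain dict
    page.get_source()               # fetch the HTML using the class-level headers
    for href in page.find_links():  # root-relative links come back joined to the base url
        print(href)

Passing a plain dict keeps the class decoupled from argparse, which the commented-out constructor was still tied to.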