start making the scraper an object
This commit is contained in:
36
scraper.py
36
scraper.py
@@ -1,15 +1,36 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
import re
|
||||||
import argparse
|
import argparse
|
||||||
import urllib.request
|
import urllib.request
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
# class WebScraper(object):
|
class WebPage(object):
    """A single web page that can be downloaded and scanned for links.

    Attributes:
        headers: Request headers shared by all instances. They are sent
            with every download and carry a desktop Firefox User-Agent.
        url: Address of the page, read from the ``url`` entry of the
            constructor arguments.
        source: Raw bytes of the page body, or ``None`` until the page
            has been downloaded with :meth:`get_source`.
    """

    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}

    def __init__(self, args):
        """Remember the URL of the page to work with.

        Args:
            args: Either a mapping with a ``'url'`` key or an
                ``argparse.Namespace`` with a ``url`` attribute.

        Raises:
            KeyError: If no ``url`` entry is present.
        """
        # The command line hands over a Namespace, which cannot be
        # subscripted; convert it so both calling styles work.
        if isinstance(args, argparse.Namespace):
            args = vars(args)
        self.url = args['url']
        self.source = None

    def get_source(self, args=None):
        """Download the page at ``self.url`` into ``self.source``.

        Args:
            args: Unused; kept so existing callers keep working.
        """
        # ``headers`` lives on the class, so it must be reached through
        # ``self``; a bare ``headers`` raises NameError inside a method.
        request = urllib.request.Request(self.url, headers=self.headers)
        # The context manager closes the response even if read() fails.
        with urllib.request.urlopen(request) as page:
            self.source = page.read()

    def find_links(self, args=None, source=None):
        """Return the ``href`` of every anchor tag in the page.

        Links starting with ``/`` are resolved against ``self.url``;
        every other link is returned exactly as written in the markup.

        Args:
            args: Unused; kept so existing callers keep working.
            source: Optional markup to scan instead of ``self.source``.
                When omitted and nothing has been downloaded yet, the
                page is fetched first.

        Returns:
            A list of link strings in document order.
        """
        # Imported locally so this block does not depend on changes to
        # the file-level import list.
        from urllib.parse import urljoin

        if source is None:
            if getattr(self, 'source', None) is None:
                self.get_source()
            source = self.source

        soup = BeautifulSoup(source, 'html.parser')
        hrefs = []

        # href=True skips anchors without an href attribute, which would
        # otherwise raise KeyError on link['href'].
        for link in soup.find_all('a', href=True):
            href = link['href']
            if href.startswith('/'):
                # urljoin resolves against the site root and handles
                # base URLs that carry a path or a trailing slash.
                href = urljoin(self.url, href)
            hrefs.append(href)

        return hrefs
|
||||||
|
|
||||||
|
|
||||||
def run(args=None):
|
def run(args=None):
|
||||||
@@ -60,5 +81,8 @@ if __name__ == "__main__":
|
|||||||
parser = argparse.ArgumentParser(description='Recursive web scraper')
|
parser = argparse.ArgumentParser(description='Recursive web scraper')
|
||||||
parser.add_argument("-u", "--url", required=True, help="Base url to scrape")
|
parser.add_argument("-u", "--url", required=True, help="Base url to scrape")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
print(args)
|
|
||||||
|
if not args.url.startswith('http'):
|
||||||
|
raise SystemExit('URL must start with a protocol (http(s)).')
|
||||||
|
|
||||||
run(args)
|
run(args)
|
||||||
|
|||||||
Reference in New Issue
Block a user