Compare commits

...

5 Commits

3 changed files with 117 additions and 1 deletion


@@ -1 +1,88 @@
#!/usr/bin/env python
import re
import argparse
import urllib.request

from bs4 import BeautifulSoup


class WebPage(object):
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}

    def __init__(self, args):
        self.url = args['url']

    def get_source(self, args=None):
        # Fetch the page with a browser-like User-Agent and store the raw HTML.
        request = urllib.request.Request(self.url, headers=self.headers)
        page = urllib.request.urlopen(request)
        self.source = page.read()

    def find_links(self, args=None, source=None):
        # Collect every <a href>, turning relative links into absolute ones.
        soup = BeautifulSoup(self.source, 'html.parser')
        links = soup.find_all('a')
        hrefs = []
        for link in links:
            if link['href'].startswith('/'):
                hrefs.append("".join([self.url, link['href']]))
            else:
                hrefs.append(link['href'])
        return hrefs


def run(args=None):
    source = get_source(args)
    urls = find_links(args, source)
    local_urls = parse_urls(args, urls)
    print(local_urls)


def get_source(args=None):
    # Fetch the raw HTML of the base URL with a browser-like User-Agent.
    url = args.url
    useragent = 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'
    headers = {'User-Agent': useragent}
    request = urllib.request.Request(url, headers=headers)
    page = urllib.request.urlopen(request)
    source = page.read()
    return source


def find_links(args=None, source=None):
    # Collect every <a href>, turning relative links into absolute ones.
    soup = BeautifulSoup(source, 'html.parser')
    links = soup.find_all('a')
    hrefs = []
    for link in links:
        if link['href'].startswith('/'):
            hrefs.append("".join([args.url, link['href']]))
        else:
            hrefs.append(link['href'])
    return hrefs


def parse_urls(args=None, urls=None):
    # Keep only URLs that live under the base URL.
    local_urls = []
    for url in urls:
        if url.startswith(args.url):
            local_urls.append(url)
    return local_urls


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Recursive web scraper')
    parser.add_argument("-u", "--url", required=True, help="Base url to scrape")
    args = parser.parse_args()
    if not args.url.startswith('http'):
        raise SystemExit('URL must start with a protocol (http(s)).')
    run(args)
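
# --- Not part of the diff: a minimal usage sketch of the helpers above.
# argparse.Namespace stands in for the parsed command-line arguments and
# https://example.com is a placeholder base URL.
def example_usage():
    from argparse import Namespace
    example_args = Namespace(url='https://example.com')
    source = get_source(example_args)        # raw HTML bytes of the base page
    urls = find_links(example_args, source)  # every <a href>, relative ones made absolute
    print(parse_urls(example_args, urls))    # only the URLs under the base URL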

utils/__init__.py Normal file

@@ -3,6 +3,35 @@
Utilities to provide various misc functions.
'''

class UrlPoolManager(object):
    '''
    Object to manage the lifecycle of a pool of URLs.
    '''
    def __init__(self):
        self.url_pool = dict()
        # Status values stored against each URL in url_pool.
        self.not_crawled = 0
        self.crawled = 1
        self.invalid = 2

    def check_duplicate(self, new_url):
        # True if the URL is already tracked in the pool.
        for url, status in self.url_pool.items():
            if url == new_url:
                return True
        return False

    def invalidate_url(self, url):
        self.url_pool[url] = self.invalid

    def add_to_list(self, url):
        self.url_pool[url] = self.not_crawled
        # calculate depth
        # add link, crawled status to url_pool

    def mark_as_crawled(self, url):
        self.url_pool[url] = self.crawled
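
# --- Not part of the diff: a minimal sketch of the intended lifecycle, assuming
# the crawler checks the pool before queueing a URL, marks it after a successful
# fetch, and invalidates it on errors. The URLs below are placeholders.
def example_pool_usage():
    pool = UrlPoolManager()
    if not pool.check_duplicate('https://example.com/about'):
        pool.add_to_list('https://example.com/about')    # status: not_crawled (0)
    pool.mark_as_crawled('https://example.com/about')    # status: crawled (1)
    pool.invalidate_url('https://example.com/broken')    # status: invalid (2)
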
def clean_base_url(url):
    '''