Compare commits


2 Commits

Author SHA1 Message Date
0726bcccb0 removed original file 2018-09-04 09:21:55 +01:00
05e907ecec too many changes to make a sensible commit message 2018-09-04 09:21:26 +01:00
3 changed files with 88 additions and 116 deletions

View File

@@ -5,35 +5,70 @@ Need a docstring.
import argparse
from utils.helpers import (UrlPool, WebPage, sanitise_url)
from pprint import pprint


def init_crawler(base_url=None):
    '''
    needs a docstring
    '''
    uncrawled_urls, crawled_urls = UrlPool(), UrlPool()
    initial_page = WebPage(base_url)
    initial_page = WebPage(url=base_url, base_url=base_url)
    try:
        initial_urls = initial_page.run()
        initial_page.run()
    except Exception as e:
        print(e)
    initial_urls = initial_page.list_urls()
    # ensure the base URL isn't crawled again
    try:
        initial_urls.remove(base_url)
    except KeyError:
        pass
    # Add the base URL to the crawled pool
    crawled_urls.add_to_pool(base_url)
    for url in initial_urls:
        sanitised_url = sanitise_url(url=url)
        if sanitised_url not in crawled_urls.pool:
            uncrawled_urls.add_to_pool(sanitised_url)
    return(uncrawled_urls, crawled_urls)


def process_pool(base_url=None, uncrawled_urls=None, crawled_urls=None):
    '''
    Needs a docstring
    '''
    while uncrawled_urls.pool:
        # pop url from pool
        new_url = uncrawled_urls.remove_from_pool()
        # create a WebPage object for the URL
        current_page = WebPage(url=new_url, base_url=base_url)
        try:
            uncrawled_urls.add_to_pool(url)
            current_page.run()
            _urls = current_page.list_urls()
            crawled_urls.add_to_pool(new_url)
        except Exception as e:
            print(e)
            print(uncrawled_urls.url_pool)
        for url in _urls:
            sanitised_url = sanitise_url(url=url)
            if sanitised_url not in crawled_urls.pool:
                uncrawled_urls.add_to_pool(url)


def run(args=None):
    '''
    needs a docstring.
    '''
    base_url = sanitise_url(args.url)
    base_url = sanitise_url(args.url, base_url=True)
    init_crawler(base_url)
    uncrawled_urls, crawled_urls = init_crawler(base_url)
    process_pool(base_url, uncrawled_urls, crawled_urls)
    pprint(crawled_urls.pool)
    print('{0} URLs crawled'.format(len(crawled_urls.pool)))


if __name__ == '__main__':
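
Taken together, the changes in this file turn the entry point into a pool-based crawl: init_crawler() seeds the pools from the base page, then process_pool() repeatedly pops a URL, crawls it and queues any newly discovered on-site links. A minimal, self-contained sketch of that loop, using plain sets and the standard library in place of the project's UrlPool and WebPage helpers (all names and details below are illustrative, not the project's API):

import urllib.request
from urllib.parse import urljoin, urlsplit, urlunsplit

from bs4 import BeautifulSoup


def crawl(base_url):
    # Two pools: URLs waiting to be fetched and URLs already fetched.
    uncrawled, crawled = {base_url}, set()
    while uncrawled:
        url = uncrawled.pop()
        crawled.add(url)
        try:
            request = urllib.request.Request(url, headers={'User-Agent': 'sketch/0.1'})
            source = urllib.request.urlopen(request, timeout=5).read()
        except Exception as e:
            print(e)
            continue
        soup = BeautifulSoup(source, 'html.parser')
        for link in soup.find_all('a', href=True):
            # Resolve relative hrefs, drop queries/fragments, stay on the same site.
            absolute = urljoin(url, link['href'])
            parts = urlsplit(absolute)
            clean = urlunsplit((parts.scheme, parts.netloc, parts.path, '', ''))
            if clean.startswith(base_url) and clean not in crawled:
                uncrawled.add(clean)
    return crawled

Using sets for both pools means each URL is fetched at most once, at the cost of an unpredictable crawl order.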

View File

@@ -1,88 +0,0 @@
#!/usr/bin/env python
import re
import argparse
import urllib.request

from bs4 import BeautifulSoup


class WebPage(object):

    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}

    def __init__(self, args):
        self.url = args['url']

    def get_source(self, args=None):
        request = urllib.request.Request(self.url, headers=headers)
        page = urllib.request.urlopen(request)
        self.source = page.read()

    def find_links(self, args=None, source=None):
        soup = BeautifulSoup(self.source, 'html.parser')
        links = soup.find_all('a')
        hrefs = []
        for link in links:
            if link['href'].startswith('/'):
                hrefs.append("".join([self.url, link['href']]))
            else:
                hrefs.append(link['href'])
        return hrefs


def run(args=None):
    source = get_source(args)
    urls = find_links(args, source)
    local_urls = parse_urls(args, urls)
    print(local_urls)


def get_source(args=None):
    url = args.url
    useragent = 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'
    headers = {'User-Agent': useragent}
    request = urllib.request.Request(url, headers=headers)
    page = urllib.request.urlopen(request)
    source = page.read()
    return source


def find_links(args=None, source=None):
    soup = BeautifulSoup(source, 'html.parser')
    links = soup.find_all('a')
    hrefs = []
    for link in links:
        if link['href'].startswith('/'):
            hrefs.append("".join([args.url, link['href']]))
        else:
            hrefs.append(link['href'])
    return hrefs


def parse_urls(args=None, urls=None):
    local_urls = []
    for url in urls:
        if url.startswith(args.url):
            local_urls.append(url)
    return local_urls


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Recursive web scraper')
    parser.add_argument("-u", "--url", required=True, help="Base url to scrape")
    args = parser.parse_args()
    if not args.url.startswith('http'):
        raise SystemExit('URL must start with a protocol (http(s)).')
    run(args)
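
The deleted script resolved relative links by string-concatenating the page URL and the href (see find_links() above). For illustration only, with made-up URLs, here is how that concatenation compares with urllib.parse.urljoin from the standard library:

from urllib.parse import urljoin

base = 'http://example.com/blog/'
href = '/about'

# Plain concatenation, as find_links() does for hrefs starting with '/':
print(''.join([base, href]))    # http://example.com/blog//about
# urljoin() resolves the href against the base instead:
print(urljoin(base, href))      # http://example.com/about

With a bare scheme://host as the base URL the two approaches agree; urljoin() simply covers the cases where the base carries a path.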

View File

@@ -14,22 +14,29 @@ class UrlPool(object):
    '''
    def __init__(self):
        self.url_pool = set()
        self.pool = set()

    def check_duplicate(self, new_url):
        '''
        Checks if a URL exists in the current pool.
        '''
        if new_url in self.url_pool:
        if new_url in self.pool:
            return True
        else:
            return False

    def remove_from_pool(self, url):
        self.url_pool.remove(url)
    def remove_from_pool(self):
        '''
        Remove a URL from the pool and return it to be crawled.
        '''
        return(self.pool.pop())

    def add_to_pool(self, url):
        self.url_pool.add(url)
        self.pool.add(url)

    def list_pool(self):
        pool = self.pool
        return pool

class WebPage(object):
@@ -40,8 +47,9 @@ class WebPage(object):
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}

    def __init__(self, url):
    def __init__(self, url=None, base_url=None):
        self.url = url
        self.base_url = base_url

    def get_source(self):
@@ -50,7 +58,7 @@ class WebPage(object):
        '''
        request = urllib.request.Request(self.url, headers=self.headers)
        page = urllib.request.urlopen(request)
        page = urllib.request.urlopen(request, timeout=5)
        self.source = page.read()
@@ -62,7 +70,7 @@ class WebPage(object):
        hrefs = set()
        soup = BeautifulSoup(self.source, 'html.parser')
        links = soup.find_all('a')
        links = soup.find_all('a', href=True)
        for link in links:
            if link['href'].startswith('/'):
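
The switch to find_all('a', href=True) in the hunk above restricts the search to anchor tags that actually carry an href attribute; a bare anchor would otherwise raise a KeyError when link['href'] is read. A quick illustration with a made-up snippet of HTML:

from bs4 import BeautifulSoup

html = '<a name="top"></a><a href="/about">About</a>'
soup = BeautifulSoup(html, 'html.parser')

print(len(soup.find_all('a')))              # 2 - includes the bare named anchor
print(len(soup.find_all('a', href=True)))   # 1 - only tags with an href attribute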
@@ -78,11 +86,20 @@ class WebPage(object):
        Iterate through the list of discovered URLs and add them to the
        pool if they start with the base URL.
        '''
        self.urls_to_crawl = set()
        for url in self.discovered_hrefs:
            if url.startswith(self.url):
                self.urls_to_crawl.add(url)
                sanitised_url = sanitise_url(url=url)
                self.urls_to_crawl.add(sanitised_url)

    def list_urls(self):
        '''
        Returns the contents of the
        '''
        return self.urls_to_crawl

    def run(self):
@@ -101,24 +118,32 @@ class WebPage(object):
        except Exception as e:
            print(e)
        return self.urls_to_crawl


def sanitise_url(url):
def sanitise_url(url, base_url=False):
    '''
    Attempt to standardise the base url to ensure it can be prepended to
    relative URLs. If no scheme provided then we default to http as any
    sane https-only site should 301 redirect http > https.
    If `base_url` is True, we attempt to standardise `url` to ensure it can be
    prepended to relative URLs. If no scheme has been provided then we default
    to http as any sane https-only site should 301 redirect http > https.
    Returns a corrected base URL as a string.
    If `base_url` is False, we sanitise URLs to strip queries and fragments (we
    don't want to scrape in-page anchors etc).
    Returns a sanitised URL as a string.
    '''
    default_proto = 'http'
    delim = '://'
    split_url = urlsplit(url)
    if split_url.scheme and split_url.scheme.startswith('http'):
        base_url = "".join([split_url.scheme, delim, split_url.netloc])
    elif (split_url.path and not split_url.scheme and not split_url.netloc):
        base_url = "".join([default_proto, delim, split_url.path])
    return base_url
    if base_url:
        # This will sanitise the initial url for the initial page crawl.
        if split_url.scheme and split_url.scheme.startswith('http'):
            sanitised_url = "".join([split_url.scheme, delim, split_url.netloc])
        elif (split_url.path and not split_url.scheme and not split_url.netloc):
            sanitised_url = "".join([default_proto, delim, split_url.path])
    else:
        # Sanitise discovered URLs. We already expect them in the format
        # protocol://base_url/path
        sanitised_url = "".join([split_url.scheme, delim, split_url.netloc, split_url.path])
    return sanitised_url
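
The reworked sanitise_url() now behaves differently depending on the base_url flag. A few example calls, assuming the utils.helpers layout shown in the first file's import line (the URLs are made up):

from utils.helpers import sanitise_url

# base_url=True: reduce the starting URL to scheme://host, defaulting to http
print(sanitise_url('example.com', base_url=True))              # http://example.com
print(sanitise_url('https://example.com/a/b', base_url=True))  # https://example.com

# base_url=False (the default): keep scheme, host and path, drop query and fragment
print(sanitise_url('http://example.com/page?id=1#top'))        # http://example.com/page

As the inline comment in the function notes, the base_url=False branch assumes discovered URLs are already absolute (protocol://host/path).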