Compare commits

...

3 Commits

2 changed files with 10 additions and 7 deletions

View File

@@ -5,11 +5,11 @@ Need a docstring.
 '''
 import argparse
+import asyncio
+from datetime import datetime
 import jinja2
 import os
 import sys
-import asyncio
-from datetime import datetime
 from utils.helpers import RobotsTxt, AsyncCrawler, standardise_url

View File

@@ -6,7 +6,9 @@ Utilities to provide various misc functions.
 import aiohttp
 import asyncio
 from bs4 import BeautifulSoup
+import urllib.error
 from urllib.parse import urljoin, urlsplit
+import urllib.request
 import urllib.robotparser
@@ -18,7 +20,6 @@ class AsyncCrawler(object):
     def __init__(self, baseurl=None, robots=None, concurrency=None):
         self.baseurl = baseurl
         self.robots = robots
-        self.uncrawled = set()
         self.crawled = set()
         self.headers = {'Accept-Encoding': 'gzip, deflate',
                         'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
@@ -28,7 +29,7 @@ class AsyncCrawler(object):
     async def crawl_url(self, url=None):
         '''
-        docstring
+        Crawls the given URL and finds all new URLs in the initial page.
         '''
         urls = []
         source = await self.get_source(url)
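The new crawl_url docstring describes finding every new URL in the fetched page. As a rough sketch of that idea, not the repository's implementation (extract_new_urls, base_url and crawled are illustrative names), link extraction with BeautifulSoup and urljoin could look like this:

from urllib.parse import urljoin
from bs4 import BeautifulSoup

def extract_new_urls(source, base_url, crawled):
    # Parse the page, resolve each href against the base URL and keep
    # only links that have not been crawled yet.
    soup = BeautifulSoup(source, 'html.parser')
    found = set()
    for anchor in soup.find_all('a', href=True):
        absolute = urljoin(base_url, anchor['href'])
        if absolute not in crawled:
            found.add(absolute)
    return found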
@@ -54,7 +55,8 @@ class AsyncCrawler(object):
     async def get_source(self, url=None):
         '''
-        Obtains the URL's source, provided it is HTML.
+        Obtains the URL's source, provided it is HTML. Usage of semaphores
+        ensures only a certain number of coroutines can run at once.
         '''
         async with self.semaphore:
             async with self.client_session.head(url, timeout=5) as head:
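The expanded get_source docstring explains the concurrency cap: a semaphore limits how many coroutines may fetch at once. A minimal sketch of that pattern with asyncio and aiohttp, using illustrative names (fetch, main, MAX_CONCURRENCY) rather than this repository's code:

import asyncio
import aiohttp

MAX_CONCURRENCY = 5

async def fetch(session, semaphore, url):
    # Only MAX_CONCURRENCY coroutines can hold the semaphore at a time,
    # so at most that many requests are in flight at once.
    async with semaphore:
        async with session.get(url, timeout=5) as resp:
            return await resp.read()

async def main(urls):
    semaphore = asyncio.Semaphore(MAX_CONCURRENCY)
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch(session, semaphore, u) for u in urls))

Running it with asyncio.run(main(urls)) downloads the pages while never exceeding the configured concurrency.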
@@ -66,7 +68,6 @@ class AsyncCrawler(object):
             async with self.client_session.get(url, timeout=5) as resp:
                 try:
                     source = await resp.read()
-                    print('crawled {0}'.format(url))
                     return source
                 except Exception:
                     return None
@@ -152,7 +153,9 @@ class RobotsTxt(object):
     def __init__(self, base_url=None):
         '''
-        Manually retrieve robots.txt to allow us to set the user-agent.
+        Manually retrieve robots.txt to allow us to set the user-agent (works
+        around sites which disallow access to robots.txt without a sane
+        user-agent).
         '''
         self.base_url = base_url
         self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
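The reworded RobotsTxt docstring covers fetching robots.txt manually so the request carries a browser-like user-agent, which is presumably why urllib.request and urllib.error are now imported. A minimal sketch of that approach (fetch_robots is an illustrative name, not the repository's):

import urllib.error
import urllib.request
import urllib.robotparser
from urllib.parse import urljoin

def fetch_robots(base_url, user_agent):
    # Request robots.txt ourselves so we control the User-Agent header,
    # then hand the body to the standard-library parser.
    request = urllib.request.Request(urljoin(base_url, '/robots.txt'),
                                     headers={'User-Agent': user_agent})
    parser = urllib.robotparser.RobotFileParser()
    try:
        with urllib.request.urlopen(request, timeout=5) as resp:
            parser.parse(resp.read().decode('utf-8', errors='ignore').splitlines())
    except urllib.error.URLError:
        # robots.txt could not be fetched; parse an empty ruleset so later
        # can_fetch() calls are permissive rather than blocking every URL.
        parser.parse([])
    return parser

parser.can_fetch(user_agent, url) then answers whether a given URL may be crawled.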