Compare commits

...

3 Commits

2 changed files with 10 additions and 7 deletions


@@ -5,11 +5,11 @@ Need a docstring.
 '''
 import argparse
+import asyncio
+from datetime import datetime
 import jinja2
 import os
 import sys
-import asyncio
-from datetime import datetime
 from utils.helpers import RobotsTxt, AsyncCrawler, standardise_url

utils/helpers.py

@@ -6,7 +6,9 @@ Utilities to provide various misc functions.
 import aiohttp
 import asyncio
 from bs4 import BeautifulSoup
+import urllib.error
+from urllib.parse import urljoin, urlsplit
 import urllib.request
 import urllib.robotparser
@@ -18,7 +20,6 @@ class AsyncCrawler(object):
     def __init__(self, baseurl=None, robots=None, concurrency=None):
         self.baseurl = baseurl
         self.robots = robots
-        self.uncrawled = set()
         self.crawled = set()
         self.headers = {'Accept-Encoding': 'gzip, deflate',
                         'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
@@ -28,7 +29,7 @@ class AsyncCrawler(object):
     async def crawl_url(self, url=None):
         '''
-        docstring
+        Crawls the given URL and finds all new URLs in the initial page.
        '''
         urls = []
         source = await self.get_source(url)
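The rest of crawl_url falls outside this hunk. A minimal sketch of the link extraction the new docstring describes, using BeautifulSoup and urljoin from this repo's imports (the extract_links helper and its exact logic are illustrative, not taken from the repo):

```python
from urllib.parse import urljoin

from bs4 import BeautifulSoup


def extract_links(source, base_url, crawled):
    '''Pull every <a href> out of a page and resolve it to an absolute URL.'''
    urls = []
    soup = BeautifulSoup(source, 'html.parser')
    for anchor in soup.find_all('a', href=True):
        absolute = urljoin(base_url, anchor['href'])
        if absolute not in crawled:  # skip pages we have already visited
            urls.append(absolute)
    return urls


page = b'<a href="/about">About</a><a href="https://other.example/">Other</a>'
print(extract_links(page, 'https://example.com/', crawled=set()))
```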
@@ -54,7 +55,8 @@ class AsyncCrawler(object):
     async def get_source(self, url=None):
         '''
-        Obtains the URL's source, provided it is HTML.
+        Obtains the URL's source, provided it is HTML. Usage of semaphores
+        ensures only a certain number of coroutines can run at once.
        '''
         async with self.semaphore:
             async with self.client_session.head(url, timeout=5) as head:
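As background for the expanded docstring, a self-contained example of the semaphore pattern it describes (names and the sleep stand-in are illustrative, not from the repo):

```python
import asyncio


async def fetch(semaphore, url):
    # Only a fixed number of coroutines may pass this point at once;
    # the rest wait here until a slot is released.
    async with semaphore:
        await asyncio.sleep(1)  # stand-in for the real HTTP request
        print('fetched', url)


async def main():
    semaphore = asyncio.Semaphore(3)  # at most 3 concurrent fetches
    urls = ['https://example.com/{0}'.format(n) for n in range(10)]
    await asyncio.gather(*(fetch(semaphore, u) for u in urls))


asyncio.run(main())
```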
@@ -66,7 +68,6 @@ class AsyncCrawler(object):
             async with self.client_session.get(url, timeout=5) as resp:
                 try:
                     source = await resp.read()
-                    print('crawled {0}'.format(url))
                     return source
                 except Exception:
                     return None
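The lines between the HEAD and GET requests fall outside this hunk. One plausible shape for the "provided it is HTML" check, assuming the Content-Type header is inspected on the HEAD response (a guess at the elided code, not the repo's implementation):

```python
import asyncio

import aiohttp


async def get_source(url):
    '''Fetch a page body, but only after a cheap HEAD confirms it is HTML.'''
    async with aiohttp.ClientSession() as session:
        async with session.head(url, timeout=5) as head:
            content_type = head.headers.get('Content-Type', '')
            if 'text/html' not in content_type:
                return None  # skip binaries, images, PDFs, etc.
        async with session.get(url, timeout=5) as resp:
            try:
                return await resp.read()
            except Exception:
                return None


print(asyncio.run(get_source('https://example.com/')))
```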
@@ -152,7 +153,9 @@ class RobotsTxt(object):
     def __init__(self, base_url=None):
         '''
-        Manually retrieve robots.txt to allow us to set the user-agent.
+        Manually retrieve robots.txt to allow us to set the user-agent (works
+        around sites which disallow access to robots.txt without a sane
+        user-agent).
        '''
         self.base_url = base_url
         self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
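The fetch itself is not shown in this hunk. A rough sketch of how a manual robots.txt request with a browser-like user-agent might feed urllib.robotparser, using only imports already present in utils/helpers.py (structure is illustrative):

```python
import urllib.error
import urllib.request
import urllib.robotparser
from urllib.parse import urljoin

base_url = 'https://example.com/'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}

# Fetch robots.txt ourselves so the request carries a browser-like
# user-agent; RobotFileParser's own read() would use Python's default.
request = urllib.request.Request(urljoin(base_url, '/robots.txt'), headers=headers)
parser = urllib.robotparser.RobotFileParser()
try:
    with urllib.request.urlopen(request, timeout=5) as resp:
        parser.parse(resp.read().decode('utf-8', errors='ignore').splitlines())
except urllib.error.URLError:
    # No readable robots.txt; treat everything as allowed.
    parser.parse([])

print(parser.can_fetch(headers['User-Agent'], base_url))
```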