Compare commits
3 Commits
6548f55416
...
f1855f5add
| Author | SHA1 | Date | |
|---|---|---|---|
| | f1855f5add | | |
| | 336517e84a | | |
| | 7bc9fe0679 | | |
@@ -5,11 +5,11 @@ Need a docstring.
|
|||||||
'''
|
'''
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
import asyncio
|
||||||
|
from datetime import datetime
|
||||||
import jinja2
|
import jinja2
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import asyncio
|
|
||||||
from datetime import datetime
|
|
||||||
from utils.helpers import RobotsTxt, AsyncCrawler, standardise_url
|
from utils.helpers import RobotsTxt, AsyncCrawler, standardise_url
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -6,7 +6,9 @@ Utilities to provide various misc functions.
|
|||||||
import aiohttp
|
import aiohttp
|
||||||
import asyncio
|
import asyncio
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
import urllib.error
|
||||||
from urllib.parse import urljoin, urlsplit
|
from urllib.parse import urljoin, urlsplit
|
||||||
|
import urllib.request
|
||||||
import urllib.robotparser
|
import urllib.robotparser
|
||||||
|
|
||||||
|
|
||||||
@@ -18,7 +20,6 @@ class AsyncCrawler(object):
|
|||||||
def __init__(self, baseurl=None, robots=None, concurrency=None):
|
def __init__(self, baseurl=None, robots=None, concurrency=None):
|
||||||
self.baseurl = baseurl
|
self.baseurl = baseurl
|
||||||
self.robots = robots
|
self.robots = robots
|
||||||
self.uncrawled = set()
|
|
||||||
self.crawled = set()
|
self.crawled = set()
|
||||||
self.headers = {'Accept-Encoding': 'gzip, deflate',
|
self.headers = {'Accept-Encoding': 'gzip, deflate',
|
||||||
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
|
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
|
||||||
@@ -28,7 +29,7 @@ class AsyncCrawler(object):
|
|||||||
|
|
||||||
async def crawl_url(self, url=None):
|
async def crawl_url(self, url=None):
|
||||||
'''
|
'''
|
||||||
docstring
|
Crawls the given URL and finds all new URLs in the initial page.
|
||||||
'''
|
'''
|
||||||
urls = []
|
urls = []
|
||||||
source = await self.get_source(url)
|
source = await self.get_source(url)
|
||||||
@@ -54,7 +55,8 @@ class AsyncCrawler(object):
|
|||||||
|
|
||||||
async def get_source(self, url=None):
|
async def get_source(self, url=None):
|
||||||
'''
|
'''
|
||||||
Obtains the URL's source, provided it is HTML.
|
Obtains the URL's source, provided it is HTML. Usage of semaphores
|
||||||
|
ensures only a certain number of coroutines can run at once.
|
||||||
'''
|
'''
|
||||||
async with self.semaphore:
|
async with self.semaphore:
|
||||||
async with self.client_session.head(url, timeout=5) as head:
|
async with self.client_session.head(url, timeout=5) as head:
|
||||||
@@ -66,7 +68,6 @@ class AsyncCrawler(object):
|
|||||||
async with self.client_session.get(url, timeout=5) as resp:
|
async with self.client_session.get(url, timeout=5) as resp:
|
||||||
try:
|
try:
|
||||||
source = await resp.read()
|
source = await resp.read()
|
||||||
print('crawled {0}'.format(url))
|
|
||||||
return source
|
return source
|
||||||
except Exception:
|
except Exception:
|
||||||
return None
|
return None
|
||||||
@@ -152,7 +153,9 @@ class RobotsTxt(object):
|
|||||||
|
|
||||||
def __init__(self, base_url=None):
|
def __init__(self, base_url=None):
|
||||||
'''
|
'''
|
||||||
Manually retrieve robots.txt to allow us to set the user-agent.
|
Manually retrieve robots.txt to allow us to set the user-agent (works
|
||||||
|
around sites which disallow access to robots.txt without a sane
|
||||||
|
user-agent).
|
||||||
'''
|
'''
|
||||||
self.base_url = base_url
|
self.base_url = base_url
|
||||||
self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
|
self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
|
||||||
|
|||||||
Reference in New Issue
Block a user