Compare commits

...

3 Commits

2 changed files with 10 additions and 7 deletions


@@ -5,11 +5,11 @@ Need a docstring.
 '''
 import argparse
+import asyncio
+from datetime import datetime
 import jinja2
 import os
 import sys
-import asyncio
-from datetime import datetime
 from utils.helpers import RobotsTxt, AsyncCrawler, standardise_url

utils/helpers.py

@@ -6,7 +6,9 @@ Utilities to provide various misc functions.
 import aiohttp
 import asyncio
 from bs4 import BeautifulSoup
+import urllib.error
+from urllib.parse import urljoin, urlsplit
 import urllib.request
 import urllib.robotparser
@@ -18,7 +20,6 @@ class AsyncCrawler(object):
     def __init__(self, baseurl=None, robots=None, concurrency=None):
         self.baseurl = baseurl
         self.robots = robots
-        self.uncrawled = set()
         self.crawled = set()
         self.headers = {'Accept-Encoding': 'gzip, deflate',
                         'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
@@ -28,7 +29,7 @@ class AsyncCrawler(object):
     async def crawl_url(self, url=None):
         '''
-        docstring
+        Crawls the given URL and finds all new URLs in the initial page.
        '''
         urls = []
         source = await self.get_source(url)
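The rest of crawl_url falls outside this hunk. A minimal sketch of the link extraction the new docstring describes, using BeautifulSoup and urljoin from this repo's imports (the extract_links helper and its exact logic are illustrative, not taken from the repo):

```python
from urllib.parse import urljoin

from bs4 import BeautifulSoup


def extract_links(source, base_url, crawled):
    '''Pull every <a href> out of a page and resolve it to an absolute URL.'''
    urls = []
    soup = BeautifulSoup(source, 'html.parser')
    for anchor in soup.find_all('a', href=True):
        absolute = urljoin(base_url, anchor['href'])
        if absolute not in crawled:  # skip pages we have already visited
            urls.append(absolute)
    return urls


page = b'<a href="/about">About</a><a href="https://other.example/">Other</a>'
print(extract_links(page, 'https://example.com/', crawled=set()))
```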
@@ -54,7 +55,8 @@ class AsyncCrawler(object):
     async def get_source(self, url=None):
         '''
-        Obtains the URL's source, provided it is HTML.
+        Obtains the URL's source, provided it is HTML. Usage of semaphores
+        ensures only a certain number of coroutines can run at once.
        '''
         async with self.semaphore:
             async with self.client_session.head(url, timeout=5) as head:
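As background for the expanded docstring, a self-contained example of the semaphore pattern it describes (names and the sleep stand-in are illustrative, not from the repo):

```python
import asyncio


async def fetch(semaphore, url):
    # Only a fixed number of coroutines may pass this point at once;
    # the rest wait here until a slot is released.
    async with semaphore:
        await asyncio.sleep(1)  # stand-in for the real HTTP request
        print('fetched', url)


async def main():
    semaphore = asyncio.Semaphore(3)  # at most 3 concurrent fetches
    urls = ['https://example.com/{0}'.format(n) for n in range(10)]
    await asyncio.gather(*(fetch(semaphore, u) for u in urls))


asyncio.run(main())
```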
@@ -66,7 +68,6 @@ class AsyncCrawler(object):
             async with self.client_session.get(url, timeout=5) as resp:
                 try:
                     source = await resp.read()
-                    print('crawled {0}'.format(url))
                     return source
                 except Exception:
                     return None
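The lines between the HEAD and GET requests fall outside this hunk. One plausible shape for the "provided it is HTML" check, assuming the Content-Type header is inspected on the HEAD response (a guess at the elided code, not the repo's implementation):

```python
import asyncio

import aiohttp


async def get_source(url):
    '''Fetch a page body, but only after a cheap HEAD confirms it is HTML.'''
    async with aiohttp.ClientSession() as session:
        async with session.head(url, timeout=5) as head:
            content_type = head.headers.get('Content-Type', '')
            if 'text/html' not in content_type:
                return None  # skip binaries, images, PDFs, etc.
        async with session.get(url, timeout=5) as resp:
            try:
                return await resp.read()
            except Exception:
                return None


print(asyncio.run(get_source('https://example.com/')))
```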
@@ -152,7 +153,9 @@ class RobotsTxt(object):
     def __init__(self, base_url=None):
         '''
-        Manually retrieve robots.txt to allow us to set the user-agent.
+        Manually retrieve robots.txt to allow us to set the user-agent (works
+        around sites which disallow access to robots.txt without a sane
+        user-agent).
        '''
         self.base_url = base_url
         self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
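The fetch itself is not shown in this hunk. A rough sketch of how a manual robots.txt request with a browser-like user-agent might feed urllib.robotparser, using only imports already present in utils/helpers.py (structure is illustrative):

```python
import urllib.error
import urllib.request
import urllib.robotparser
from urllib.parse import urljoin

base_url = 'https://example.com/'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}

# Fetch robots.txt ourselves so the request carries a browser-like
# user-agent; RobotFileParser's own read() would use Python's default.
request = urllib.request.Request(urljoin(base_url, '/robots.txt'), headers=headers)
parser = urllib.robotparser.RobotFileParser()
try:
    with urllib.request.urlopen(request, timeout=5) as resp:
        parser.parse(resp.read().decode('utf-8', errors='ignore').splitlines())
except urllib.error.URLError:
    # No readable robots.txt; treat everything as allowed.
    parser.parse([])

print(parser.can_fetch(headers['User-Agent'], base_url))
```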