add flags to README

improve documentation
update requirements
2018-09-16 15:58:17 +01:00 · 2018-09-16 15:53:47 +01:00 · 2018-09-16 15:44:30 +01:00 · 2018-09-16 15:26:49 +01:00
4 changed files with 33 additions and 8 deletions
@@ -2,7 +2,7 @@

 ## Requirements

-This crawler was written in 3.7.0 to take advantage of the latest `asyncio` features.
+This crawler requires at least Python 3.5, which introduced the `async`/`await` syntax that its `asyncio`-based code relies on.

 Install required modules:

@@ -13,9 +13,16 @@ pip install -r requirements.txt
 Run:

 ```bash
-python crawler.py -u https://urltocrawl.com
+python crawler.py -u https://urltocrawl.com [-c 100]
 ```

+Flags:
+
+  - -u/--url https://url.com
+    - The base URL is required.
+  - -c/--concurrency 100
+    - Specifying a concurrency value is optional (defaults to 100).
+
 ## Results

-The resulting sitemap will be output in the root of this directory as `sitemap.html`
+The resulting sitemap will be output to the root of this directory as `sitemap.html`
@@ -1,7 +1,15 @@

 #!/usr/bin/env python
 '''
-Need a docstring.
+Asynchronous web crawler written in Python 3.5+.
+
+This script will respect the site's `robots.txt`, if one exists. If not, all
+URLs discovered will be crawled.
+
+The crawler takes a total of two arguments (concurrency is optional):
+
+    url: the base URL to begin the crawl from.
+    concurrency: the maximum number of pages which may be crawled concurrently.
 '''

 import argparse
@@ -51,7 +59,8 @@ def render_sitemap(base_url=None, crawled_urls=None, runtime=None):

 def main():
    '''
-    docstring
+    Main function, responsible for prepping and running the crawler and
+    rendering the sitemap.
    '''
    starttime = datetime.now()

@@ -60,11 +69,13 @@ def main():
    # create a crawler
    async_crawler = AsyncCrawler(baseurl=baseurl, robots=robots, concurrency=args.concurrency)

+    # create a task to run the crawler, run the loop and then gather the results.
    task = asyncio.Task(async_crawler.main())
    loop = asyncio.get_event_loop()
    loop.run_until_complete(task)
    loop.close()
-    results = task.result()
+    results = sorted(task.result())
+
    runtime = int((datetime.now() - starttime).total_seconds())

    render_sitemap(base_url=baseurl, crawled_urls=results, runtime=runtime)
@@ -73,7 +84,7 @@ def main():
 if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='Recursive web crawler')
-    parser.add_argument("-u", "--url", required=True, help="Base url to crawl")
+    parser.add_argument("-u", "--url", required=True, help="Initial url to crawl")
    parser.add_argument("-c", "--concurrency", required=False, type=int,
                        default=100, help="Max number of pages to crawl concurrently")
    args = parser.parse_args()
@@ -1,8 +1,12 @@
+aiohttp==3.4.4
+async-timeout==3.0.0
+attrs==18.2.0
 beautifulsoup4==4.6.3
 bs4==0.0.1
-certifi==2018.8.13
 chardet==3.0.4
 idna==2.7
 Jinja2==2.10
 lxml==4.2.4
 MarkupSafe==1.0
+multidict==4.4.0
+yarl==1.2.6
@@ -119,6 +119,7 @@ class AsyncCrawler(object):
            # add the URLs to a set to be returned.
            if urls:
                for url in urls:
+                    print('Found: {0}'.format(url))
                    all_urls.add(url)

        return all_urls
@@ -133,6 +134,7 @@ class AsyncCrawler(object):
        to_crawl = []
        to_crawl.append(self.baseurl)

+        print('Crawling: {0}'.format(self.baseurl))
        while len(to_crawl) > 0:
            discovered_urls = await self.run(urls=to_crawl)
            # empty the to_crawl list and then add all newly discovered URLs for
@@ -169,6 +171,7 @@ class RobotsTxt(object):
        try:
            response = urllib.request.urlopen(request, timeout=5)
        except urllib.error.HTTPError:
+            # if robots.txt doesn't exist then allow all URLs to be crawled.
            robots.allow_all = True
        else:
            data = response.read()
Author	SHA1	Message	Date
simon	5262c23281	add flags to README	2018-09-16 15:58:17 +01:00
simon	524f6a45cd	improve documentation	2018-09-16 15:53:47 +01:00
simon	a926090bed	update requirements	2018-09-16 15:44:30 +01:00
simon	91cd988f52	more comments and progress output	2018-09-16 15:26:49 +01:00