Compare commits


2 Commits

SHA1        Message                                       Date
c53f62b55d  add most changes suggested by pycodestyle     2018-09-16 16:10:38 +01:00
75d3756bbc  fix errors discovered by pycodestyle          2018-09-16 16:04:07 +01:00
2 changed files with 24 additions and 26 deletions
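
Both commits apply fixes reported by pycodestyle (the PEP 8 checker, formerly pep8); most of the diff below is long lines being wrapped to fit the default 79-character limit. As a minimal sketch of re-running the same check from Python, assuming pycodestyle is installed and using placeholder file names rather than the repository's actual paths:

# Hypothetical re-check of the touched files; 'crawl.py' and 'crawler.py'
# are placeholder names, not necessarily the repository's real file names.
import pycodestyle

style = pycodestyle.StyleGuide(max_line_length=79)  # PEP 8 default limit
report = style.check_files(['crawl.py', 'crawler.py'])
print(report.total_errors, "style problem(s) remaining")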


@@ -1,5 +1,5 @@
#!/usr/bin/env python
'''
Asynchronous web crawler written in Python 3.5+.
@@ -32,7 +32,8 @@ def sanity_checks(url=None):
# fail early if robots denies all crawling
if not robots.check(url=baseurl):
sys.exit("{baseurl} cannot be crawled (denied by robots.txt)".format(baseurl=baseurl))
sys.exit("{baseurl} cannot be crawled (denied by robots.txt)".format(
baseurl=baseurl))
return(baseurl, robots)
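
The hunk above fails early when robots.txt forbids crawling the base URL; `robots` here is the project's own RobotsTxt helper (defined in the second changed file below). Purely as an illustration of the same pre-flight check using only the standard library, with a placeholder URL:

# Standard-library version of the robots.txt pre-flight check; the project
# itself uses its RobotsTxt class, not urllib.robotparser.
import sys
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser

baseurl = "http://example.com/"  # placeholder
robots = RobotFileParser()
robots.set_url(urljoin(baseurl, "/robots.txt"))
robots.read()
if not robots.can_fetch("*", baseurl):
    sys.exit("{baseurl} cannot be crawled (denied by robots.txt)".format(
        baseurl=baseurl))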
@@ -48,8 +49,8 @@ def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
loader=jinja2.FileSystemLoader('templates')
).get_template('sitemap.html.j2')
rendered_html = template.render(
base_url=base_url, urlcount=urlcount, urls=sorted_urls, runtime=runtime)
rendered_html = template.render(base_url=base_url, urlcount=urlcount,
urls=sorted_urls, runtime=runtime)
with open('sitemap.html', 'w') as outfile:
outfile.write(rendered_html)
@@ -67,9 +68,11 @@ def main():
baseurl, robots = sanity_checks(url=args.url)
# create a crawler
async_crawler = AsyncCrawler(baseurl=baseurl, robots=robots, concurrency=args.concurrency)
async_crawler = AsyncCrawler(baseurl=baseurl, robots=robots,
concurrency=args.concurrency)
# create a task to run the crawler, run the loop and then gather the results.
# create a task to run the crawler, run the loop and then gather the
# results.
task = asyncio.Task(async_crawler.main())
loop = asyncio.get_event_loop()
loop.run_until_complete(task)
@@ -84,9 +87,9 @@ def main():
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Recursive web crawler')
parser.add_argument("-u", "--url", required=True, help="Initial url to crawl")
parser.add_argument("-u", "--url", required=True, help="Initial url")
parser.add_argument("-c", "--concurrency", required=False, type=int,
default=100, help="Max number of pages to crawl concurrently")
default=100, help="Max pages to crawl concurrently")
args = parser.parse_args()
main()
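
The main() hunks above follow the pre-Python-3.7 asyncio driver pattern: wrap the crawler coroutine in a Task, fetch the event loop, and block on run_until_complete. A self-contained sketch of that pattern, with a stand-in coroutine instead of the real AsyncCrawler:

# Sketch of the Task / event-loop driver used in main(); fake_crawl stands in
# for AsyncCrawler.main() and is not part of the project.
import argparse
import asyncio


async def fake_crawl(concurrency):
    await asyncio.sleep(0)           # pretend to crawl something
    return ["http://example.com/"]   # pretend result


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Driver pattern sketch')
    parser.add_argument("-c", "--concurrency", required=False, type=int,
                        default=100, help="Max pages to crawl concurrently")
    args = parser.parse_args()
    # Legacy pattern mirrored from the diff; asyncio.run() replaces it on 3.7+.
    task = asyncio.Task(fake_crawl(args.concurrency))
    loop = asyncio.get_event_loop()
    loop.run_until_complete(task)
    print(task.result())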


@@ -26,7 +26,6 @@ class AsyncCrawler(object):
self.client_session = None
self.semaphore = asyncio.BoundedSemaphore(concurrency)
async def crawl_url(self, url=None):
'''
Crawls the given URL and finds all new URLs in the initial page.
@@ -38,7 +37,6 @@ class AsyncCrawler(object):
return urls
def validate_url(self, url=None):
'''
Ensures we have a valid URL to crawl and that the site's robots.txt
@@ -52,7 +50,6 @@ class AsyncCrawler(object):
else:
return False
async def get_source(self, url=None):
'''
Obtains the URL's source, provided it is HTML. Usage of semaphores
@@ -72,7 +69,6 @@ class AsyncCrawler(object):
except Exception:
return None
def find_all_urls(self, source=None):
'''
Find all URLs in a page's source. Returns a list of URLs which have
@@ -91,13 +87,12 @@ class AsyncCrawler(object):
return urls
async def run(self, urls=None):
'''
Crawls a batch of URLs of any size (resource usage is bounded by n
semaphores (where n = concurrency). Returns a set of URLs to be added
to the list of URLs which need to be crawled (find_all_urls only returns
unseen URLs).
to the list of URLs which need to be crawled (find_all_urls only
returns unseen URLs).
'''
tasks = []
all_urls = set()
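
The run() docstring above spells out the concurrency model: a batch of any size may be submitted, but actual work is capped by the BoundedSemaphore of size `concurrency` created in the constructor (first hunk of this file). A stripped-down sketch of that bounding technique, with a dummy fetch in place of the real get_source():

# Semaphore-bounded fan-out as described by run()'s docstring; fake_fetch
# stands in for AsyncCrawler.get_source() and performs no real I/O.
import asyncio


async def fake_fetch(semaphore, url):
    async with semaphore:          # at most `concurrency` fetches run at once
        await asyncio.sleep(0.01)  # pretend network round trip
        return url


async def run_batch(urls, concurrency=100):
    semaphore = asyncio.BoundedSemaphore(concurrency)
    tasks = [fake_fetch(semaphore, url) for url in urls]
    results = set()
    for task in asyncio.as_completed(tasks):
        try:
            results.add(await task)
        except Exception as e:
            print(e)
    return results


print(asyncio.run(run_batch(["http://example.com/%d" % i for i in range(5)])))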
@@ -111,7 +106,6 @@ class AsyncCrawler(object):
for task in asyncio.as_completed(tasks):
urls = None
try:
# completed.append((await task))
urls = await task
except Exception as e:
print(e)
@@ -124,7 +118,6 @@ class AsyncCrawler(object):
return all_urls
async def main(self):
'''
Runs a crawl with batches of URLs. Once complete returns a list of all
@@ -180,7 +173,6 @@ class RobotsTxt(object):
self.robots = robots
def check(self, url):
'''
Test if robots allows us to crawl that URL.
@@ -190,19 +182,21 @@ class RobotsTxt(object):
def standardise_url(url=None, base_url=None):
'''
If `base_url` is None then we attempt to standardise the URL to ensure it can
be prepended to relative URLs. If no scheme has been provided then we default
to http as any sane https-only site should 301 redirect http > https.
If `base_url` is None then we attempt to standardise the URL to ensure it
can be prepended to relative URLs. If no scheme has been provided then we
default to http as any sane https-only site should 301 redirect http to
https.
If `base_url` is set, we standardise URLs to strip queries and fragments (we
don't want to scrape in-page anchors etc). Any relative URLs will be appended
to the base url.
If `base_url` is set, we standardise URLs to strip queries and fragments
(we don't want to scrape in-page anchors etc). Any relative URLs will be
appended to the base url.
Returns a standardised URL as a string.
'''
default_proto = 'http'
delim = '://'
file_extensions = ('/', 'htm', 'html', 'xhtml', 'php', 'asp', 'aspx', 'cfm')
file_extensions = ('/', 'htm', 'html', 'xhtml', 'php', 'asp', 'aspx',
'cfm')
split_url = urlsplit(url)
@@ -219,6 +213,7 @@ def standardise_url(url=None, base_url=None):
if url.startswith('/'):
return urljoin(base_url, split_url.path)
elif url.startswith(base_url):
return "".join([split_url.scheme, delim, split_url.netloc, split_url.path])
return "".join([split_url.scheme, delim, split_url.netloc,
split_url.path])
return None
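
The final two hunks cover standardise_url()'s second mode, where base_url is set: queries and fragments are stripped and relative links are resolved against the base. A runnable condensation of that behaviour, using a placeholder URL:

# Condensed version of the query/fragment stripping and relative-URL joining
# shown in the last hunk; the example URL is a placeholder.
from urllib.parse import urljoin, urlsplit


def strip_and_join(url, base_url):
    split_url = urlsplit(url)
    if url.startswith('/'):
        # relative link: resolve against the base URL
        return urljoin(base_url, split_url.path)
    if url.startswith(base_url):
        # same-site absolute link: keep scheme, host and path only
        return "".join([split_url.scheme, '://', split_url.netloc,
                        split_url.path])
    return None


print(strip_and_join('/about?ref=nav#team', 'http://example.com'))
# -> http://example.com/about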