Compare commits

...

2 Commits

SHA1 Message Date
c53f62b55d add most changes suggested by pycodestyle 2018-09-16 16:10:38 +01:00
75d3756bbc fix errors discovered by pycyodestyle 2018-09-16 16:04:07 +01:00
2 changed files with 24 additions and 26 deletions
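
Both commits are pycodestyle cleanups, so the changes below are almost entirely line-wrapping and blank-line fixes. As a rough, hypothetical illustration of how such findings are produced (the repository's real file names are not shown in this view), pycodestyle's documented Python API can be pointed at any file, here at the script itself:

    import pycodestyle

    # Check this very file so the sketch is self-contained; in the repository
    # the two changed files would be passed instead.
    style = pycodestyle.StyleGuide()
    report = style.check_files([__file__])
    print("violations found:", report.total_errors)
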

Changed file 1 of 2

@@ -1,5 +1,5 @@
 #!/usr/bin/env python
 '''
 Asynchronous web crawler written in Python 3.5+.
@@ -32,7 +32,8 @@ def sanity_checks(url=None):
     # fail early if robots denies all crawling
     if not robots.check(url=baseurl):
-        sys.exit("{baseurl} cannot be crawled (denied by robots.txt)".format(baseurl=baseurl))
+        sys.exit("{baseurl} cannot be crawled (denied by robots.txt)".format(
+            baseurl=baseurl))
     return(baseurl, robots)
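
The `robots.check()` call above comes from the repository's own RobotsTxt helper, which this diff does not show. For orientation only, the standard library's urllib.robotparser performs an equivalent fail-early check, roughly like this (example.com is illustrative):

    import sys
    from urllib.robotparser import RobotFileParser

    baseurl = 'http://example.com'
    robots = RobotFileParser(baseurl + '/robots.txt')
    robots.read()

    # Bail out before crawling anything if the site denies all access.
    if not robots.can_fetch('*', baseurl):
        sys.exit("{baseurl} cannot be crawled (denied by robots.txt)".format(
            baseurl=baseurl))
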
@@ -48,8 +49,8 @@ def render_sitemap(base_url=None, crawled_urls=None, runtime=None):
         loader=jinja2.FileSystemLoader('templates')
     ).get_template('sitemap.html.j2')
-    rendered_html = template.render(
-        base_url=base_url, urlcount=urlcount, urls=sorted_urls, runtime=runtime)
+    rendered_html = template.render(base_url=base_url, urlcount=urlcount,
+                                    urls=sorted_urls, runtime=runtime)
     with open('sitemap.html', 'w') as outfile:
         outfile.write(rendered_html)
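
render_sitemap loads templates/sitemap.html.j2, which is not part of this diff. A self-contained sketch of the same Jinja2 rendering pattern, with an inline stand-in template and made-up values for the variables passed above:

    import jinja2

    # DictLoader stands in for FileSystemLoader('templates') so this runs
    # without the repository's template file.
    env = jinja2.Environment(loader=jinja2.DictLoader({
        'sitemap.html.j2': '<h1>{{ base_url }} ({{ urlcount }} URLs, '
                           '{{ runtime }}s)</h1>\n'
                           '<ul>{% for url in urls %}<li>{{ url }}</li>'
                           '{% endfor %}</ul>\n',
    }))
    template = env.get_template('sitemap.html.j2')
    rendered_html = template.render(base_url='http://example.com', urlcount=2,
                                    urls=['http://example.com/',
                                          'http://example.com/about'],
                                    runtime=1.23)
    print(rendered_html)
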
@@ -67,9 +68,11 @@ def main():
     baseurl, robots = sanity_checks(url=args.url)
     # create a crawler
-    async_crawler = AsyncCrawler(baseurl=baseurl, robots=robots, concurrency=args.concurrency)
-    # create a task to run the crawler, run the loop and then gather the results.
+    async_crawler = AsyncCrawler(baseurl=baseurl, robots=robots,
+                                 concurrency=args.concurrency)
+    # create a task to run the crawler, run the loop and then gather the
+    # results.
     task = asyncio.Task(async_crawler.main())
     loop = asyncio.get_event_loop()
     loop.run_until_complete(task)
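
The driver above uses the pre-asyncio.run style that matches the script's Python 3.5+ target: wrap the crawler's main coroutine in a task, then block on the event loop until it finishes. A minimal sketch with a dummy coroutine standing in for AsyncCrawler.main:

    import asyncio

    async def crawl():
        # Stand-in for AsyncCrawler.main(): pretend to do some async work.
        await asyncio.sleep(0.1)
        return {'http://example.com/'}

    # Create a loop, schedule the coroutine as a task, run until it completes.
    loop = asyncio.new_event_loop()
    try:
        task = loop.create_task(crawl())
        loop.run_until_complete(task)
        print(task.result())
    finally:
        loop.close()
    # On Python 3.7+ the same flow is usually just: asyncio.run(crawl())
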
@@ -84,9 +87,9 @@ def main():
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Recursive web crawler')
-    parser.add_argument("-u", "--url", required=True, help="Initial url to crawl")
+    parser.add_argument("-u", "--url", required=True, help="Initial url")
     parser.add_argument("-c", "--concurrency", required=False, type=int,
-                        default=100, help="Max number of pages to crawl concurrently")
+                        default=100, help="Max pages to crawl concurrently")
     args = parser.parse_args()
     main()

Changed file 2 of 2

@@ -26,7 +26,6 @@ class AsyncCrawler(object):
         self.client_session = None
         self.semaphore = asyncio.BoundedSemaphore(concurrency)
 
-
     async def crawl_url(self, url=None):
         '''
         Crawls the given URL and finds all new URLs in the initial page.
@@ -38,7 +37,6 @@ class AsyncCrawler(object):
         return urls
 
-
     def validate_url(self, url=None):
         '''
         Ensures we have a valid URL to crawl and that the site's robots.txt
@@ -52,7 +50,6 @@ class AsyncCrawler(object):
         else:
             return False
 
-
     async def get_source(self, url=None):
         '''
         Obtains the URL's source, provided it is HTML. Usage of semaphores
@@ -72,7 +69,6 @@ class AsyncCrawler(object):
         except Exception:
             return None
 
-
     def find_all_urls(self, source=None):
         '''
         Find all URLs in a page's source. Returns a list of URLs which have
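
find_all_urls is only touched by the blank-line fix here and its parsing logic is not shown. As one illustrative way to extract href values from a page's source (a sketch, not the repository's implementation), the standard library's html.parser is enough:

    from html.parser import HTMLParser

    class LinkExtractor(HTMLParser):
        '''Collects the href attribute of every <a> tag encountered.'''
        def __init__(self):
            super().__init__()
            self.urls = []

        def handle_starttag(self, tag, attrs):
            if tag == 'a':
                for name, value in attrs:
                    if name == 'href' and value:
                        self.urls.append(value)

    extractor = LinkExtractor()
    extractor.feed('<a href="/about">About</a><a href="http://example.com/">home</a>')
    print(extractor.urls)  # ['/about', 'http://example.com/']
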
@@ -91,13 +87,12 @@ class AsyncCrawler(object):
         return urls
 
-
     async def run(self, urls=None):
         '''
         Crawls a batch of URLs of any size (resource usage is bounded by n
         semaphores (where n = concurrency). Returns a set of URLs to be added
-        to the list of URLs which need to be crawled (find_all_urls only returns
-        unseen URLs).
+        to the list of URLs which need to be crawled (find_all_urls only
+        returns unseen URLs).
         '''
         tasks = []
         all_urls = set()
 
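
The run docstring above describes the concurrency control: every fetch must acquire the BoundedSemaphore created in __init__, so at most `concurrency` requests are in flight at once. A minimal sketch of that pattern with a dummy fetch coroutine (a real HTTP client such as aiohttp would replace asyncio.sleep):

    import asyncio

    async def fetch(url, semaphore):
        # Only `concurrency` coroutines can be inside this block at a time.
        async with semaphore:
            await asyncio.sleep(0.1)      # stand-in for an HTTP request
            return {url + '/next'}        # pretend one new URL was found

    async def run(urls, concurrency=2):
        semaphore = asyncio.BoundedSemaphore(concurrency)
        tasks = [fetch(url, semaphore) for url in urls]
        all_urls = set()
        for result in await asyncio.gather(*tasks, return_exceptions=True):
            if isinstance(result, Exception):
                print(result)             # mirrors the error handling below
            else:
                all_urls |= result
        return all_urls

    print(asyncio.run(run(['http://example.com/a', 'http://example.com/b'])))
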
@@ -111,7 +106,6 @@ class AsyncCrawler(object):
         for task in asyncio.as_completed(tasks):
             urls = None
             try:
-                # completed.append((await task))
                 urls = await task
             except Exception as e:
                 print(e)
@@ -124,7 +118,6 @@ class AsyncCrawler(object):
         return all_urls
 
-
     async def main(self):
         '''
         Runs a crawl with batches of URLs. Once complete returns a list of all
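
main's docstring summarises the overall control flow: crawl the current batch, keep only URLs that have not been seen before, and stop when a batch yields nothing new. A schematic version of that loop (crawl_batch is a hypothetical stand-in for AsyncCrawler.run):

    import asyncio

    async def crawl_batch(urls):
        # Hypothetical stand-in for AsyncCrawler.run(): every page links to
        # one child page, two levels deep.
        await asyncio.sleep(0)
        return {url + '/child' for url in urls if url.count('/child') < 2}

    async def main(start_url):
        crawled = set()
        to_crawl = {start_url}
        while to_crawl:
            found = await crawl_batch(to_crawl)
            crawled |= to_crawl
            to_crawl = found - crawled    # only URLs we have not seen yet
        return crawled

    print(sorted(asyncio.run(main('http://example.com'))))
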
@@ -180,7 +173,6 @@ class RobotsTxt(object):
         self.robots = robots
 
-
     def check(self, url):
         '''
         Test if robots allows us to crawl that URL.
@@ -190,19 +182,21 @@ class RobotsTxt(object):
 def standardise_url(url=None, base_url=None):
     '''
-    If `base_url` is None then we attempt to standarise the URL to ensure it can
-    be prepended to relative URLs. If no scheme has been provided then we default
-    to http as any sane https-only site should 301 redirect http > https.
+    If `base_url` is None then we attempt to standarise the URL to ensure it
+    can be prepended to relative URLs. If no scheme has been provided then we
+    default to http as any sane https-only site should 301 redirect http to
+    https.
 
-    If `base_url` is set, we standardise URLs to strip queries and fragments (we
-    don't want to scrape in-page anchors etc). Any relative URLs will be appended
-    to the base url.
+    If `base_url` is set, we standardise URLs to strip queries and fragments
+    (we don't want to scrape in-page anchors etc). Any relative URLs will be
+    appended to the base url.
 
     Returns a standardised URL as a string.
     '''
     default_proto = 'http'
     delim = '://'
-    file_extensions = ('/', 'htm', 'html', 'xhtml', 'php', 'asp', 'aspx', 'cfm')
+    file_extensions = ('/', 'htm', 'html', 'xhtml', 'php', 'asp', 'aspx',
+                       'cfm')
 
     split_url = urlsplit(url)
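
The reflowed docstring above describes two normalisation modes, both built on urllib.parse. A short illustration of the behaviour it promises, using example.com inputs rather than output from the repository's function:

    from urllib.parse import urljoin, urlsplit

    # No base_url: make sure a scheme is present so the result can later be
    # prepended to relative URLs (defaulting to http, as the docstring says).
    url = 'example.com/index.html'
    if not urlsplit(url).scheme:
        url = 'http://' + url
    print(url)                            # http://example.com/index.html

    # With a base_url: strip query/fragment and resolve relative paths.
    base_url = 'http://example.com'
    split_url = urlsplit('/about.html?utm=x#top')
    print(urljoin(base_url, split_url.path))  # http://example.com/about.html
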
@@ -219,6 +213,7 @@ def standardise_url(url=None, base_url=None):
     if url.startswith('/'):
         return urljoin(base_url, split_url.path)
     elif url.startswith(base_url):
-        return "".join([split_url.scheme, delim, split_url.netloc, split_url.path])
+        return "".join([split_url.scheme, delim, split_url.netloc,
+                        split_url.path])
 
     return None