add most changes suggested by pycodestyle

This commit is contained in:
2018-09-16 16:10:38 +01:00
parent 75d3756bbc
commit c53f62b55d

View File

@@ -26,7 +26,6 @@ class AsyncCrawler(object):
self.client_session = None
self.semaphore = asyncio.BoundedSemaphore(concurrency)
async def crawl_url(self, url=None):
'''
Crawls the given URL and finds all new URLs in the initial page.
@@ -38,7 +37,6 @@ class AsyncCrawler(object):
return urls
def validate_url(self, url=None):
'''
Ensures we have a valid URL to crawl and that the site's robots.txt
@@ -52,7 +50,6 @@ class AsyncCrawler(object):
else:
return False
async def get_source(self, url=None):
'''
Obtains the URL's source, provided it is HTML. Usage of semaphores
@@ -72,7 +69,6 @@ class AsyncCrawler(object):
except Exception:
return None
def find_all_urls(self, source=None):
'''
Find all URLs in a page's source. Returns a list of URLs which have
@@ -91,13 +87,12 @@ class AsyncCrawler(object):
return urls
async def run(self, urls=None):
'''
Crawls a batch of URLs of any size (resource usage is bounded by n
semaphores (where n = concurrency). Returns a set of URLs to be added
to the list of URLs which need to be crawled (find_all_urls only returns
unseen URLs).
to the list of URLs which need to be crawled (find_all_urls only
returns unseen URLs).
'''
tasks = []
all_urls = set()
@@ -111,7 +106,6 @@ class AsyncCrawler(object):
for task in asyncio.as_completed(tasks):
urls = None
try:
# completed.append((await task))
urls = await task
except Exception as e:
print(e)
@@ -124,7 +118,6 @@ class AsyncCrawler(object):
return all_urls
async def main(self):
'''
Runs a crawl with batches of URLs. Once complete returns a list of all
@@ -180,7 +173,6 @@ class RobotsTxt(object):
self.robots = robots
def check(self, url):
'''
Test if robots allows us to crawl that URL.
@@ -190,19 +182,21 @@ class RobotsTxt(object):
def standardise_url(url=None, base_url=None):
'''
If `base_url` is None then we attempt to standardise the URL to ensure it can
be prepended to relative URLs. If no scheme has been provided then we default
to http as any sane https-only site should 301 redirect http > https.
If `base_url` is None then we attempt to standardise the URL to ensure it
can be prepended to relative URLs. If no scheme has been provided then we
default to http as any sane https-only site should 301 redirect http to
https.
If `base_url` is set, we standardise URLs to strip queries and fragments (we
don't want to scrape in-page anchors etc). Any relative URLs will be appended
to the base url.
If `base_url` is set, we standardise URLs to strip queries and fragments
(we don't want to scrape in-page anchors etc). Any relative URLs will be
appended to the base url.
Returns a standardised URL as a string.
'''
default_proto = 'http'
delim = '://'
file_extensions = ('/', 'htm', 'html', 'xhtml', 'php', 'asp', 'aspx', 'cfm')
file_extensions = ('/', 'htm', 'html', 'xhtml', 'php', 'asp', 'aspx',
'cfm')
split_url = urlsplit(url)
@@ -219,6 +213,7 @@ def standardise_url(url=None, base_url=None):
if url.startswith('/'):
return urljoin(base_url, split_url.path)
elif url.startswith(base_url):
return "".join([split_url.scheme, delim, split_url.netloc, split_url.path])
return "".join([split_url.scheme, delim, split_url.netloc,
split_url.path])
return None