add most changes suggested by pycodestyle
@@ -26,7 +26,6 @@ class AsyncCrawler(object):
         self.client_session = None
         self.semaphore = asyncio.BoundedSemaphore(concurrency)
 
-
     async def crawl_url(self, url=None):
         '''
         Crawls the given URL and finds all new URLs in the initial page.
@@ -38,7 +37,6 @@ class AsyncCrawler(object):
 
         return urls
 
-
     def validate_url(self, url=None):
         '''
         Ensures we have a valid URL to crawl and that the site's robots.txt
@@ -52,7 +50,6 @@ class AsyncCrawler(object):
         else:
             return False
 
-
     async def get_source(self, url=None):
         '''
         Obtains the URL's source, provided it is HTML. Usage of semaphores
@@ -72,7 +69,6 @@ class AsyncCrawler(object):
         except Exception:
             return None
 
-
     def find_all_urls(self, source=None):
         '''
         Find all URLs in a page's source. Returns a list of URLs which have
@@ -91,13 +87,12 @@ class AsyncCrawler(object):
 
         return urls
 
-
     async def run(self, urls=None):
         '''
         Crawls a batch of URLs of any size (resource usage is bounded by n
         semaphores (where n = concurrency). Returns a set of URLs to be added
-        to the list of URLs which need to be crawled (find_all_urls only returns
-        unseen URLs).
+        to the list of URLs which need to be crawled (find_all_urls only
+        returns unseen URLs).
         '''
         tasks = []
         all_urls = set()
@@ -111,7 +106,6 @@ class AsyncCrawler(object):
         for task in asyncio.as_completed(tasks):
            urls = None
            try:
-                # completed.append((await task))
                urls = await task
            except Exception as e:
                print(e)
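
Note for readers skimming the diff: the run() docstring above describes bounding resource usage with n semaphores (n = concurrency) and consuming results through asyncio.as_completed, as the loop in the previous hunk shows. A minimal sketch of that pattern, separate from this codebase (fetch_page, run_batch and the aiohttp client are illustrative assumptions, not code from this commit):

import asyncio

import aiohttp  # assumption: an aiohttp-style async HTTP client


async def fetch_page(session, semaphore, url):
    # The bounded semaphore caps how many requests are in flight at once.
    async with semaphore:
        async with session.get(url) as response:
            return await response.text()


async def run_batch(urls, concurrency=5):
    semaphore = asyncio.BoundedSemaphore(concurrency)
    results = []
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_page(session, semaphore, url) for url in urls]
        # as_completed yields awaitables in completion order, like the
        # for-loop shown in the hunk above.
        for task in asyncio.as_completed(tasks):
            try:
                results.append(await task)
            except Exception as e:
                print(e)
    return results

# Usage: asyncio.run(run_batch(['http://example.com/'], concurrency=5))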
@@ -124,7 +118,6 @@ class AsyncCrawler(object):
 
         return all_urls
 
-
     async def main(self):
         '''
         Runs a crawl with batches of URLs. Once complete returns a list of all
@@ -180,7 +173,6 @@ class RobotsTxt(object):
 
         self.robots = robots
 
-
     def check(self, url):
         '''
         Test if robots allows us to crawl that URL.
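
The commit does not show how RobotsTxt.check decides whether a URL may be crawled. As a point of reference only, the same check can be sketched with the standard library's urllib.robotparser (robots_allows and the '*' user agent are illustrative assumptions, not this project's implementation):

from urllib.parse import urlsplit, urlunsplit
from urllib.robotparser import RobotFileParser


def robots_allows(url, user_agent='*'):
    # Build the robots.txt URL for the target host, fetch and parse it,
    # then ask whether this user agent may crawl the given URL.
    parts = urlsplit(url)
    robots_url = urlunsplit((parts.scheme, parts.netloc, '/robots.txt', '', ''))
    parser = RobotFileParser()
    parser.set_url(robots_url)
    parser.read()
    return parser.can_fetch(user_agent, url)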
@@ -190,19 +182,21 @@ class RobotsTxt(object):
 
 def standardise_url(url=None, base_url=None):
     '''
-    If `base_url` is None then we attempt to standarise the URL to ensure it can
-    be prepended to relative URLs. If no scheme has been provided then we default
-    to http as any sane https-only site should 301 redirect http > https.
+    If `base_url` is None then we attempt to standarise the URL to ensure it
+    can be prepended to relative URLs. If no scheme has been provided then we
+    default to http as any sane https-only site should 301 redirect http to
+    https.
 
-    If `base_url` is set, we standardise URLs to strip queries and fragments (we
-    don't want to scrape in-page anchors etc). Any relative URLs will be appended
-    to the base url.
+    If `base_url` is set, we standardise URLs to strip queries and fragments
+    (we don't want to scrape in-page anchors etc). Any relative URLs will be
+    appended to the base url.
 
     Returns a standardised URL as a string.
     '''
     default_proto = 'http'
     delim = '://'
-    file_extensions = ('/', 'htm', 'html', 'xhtml', 'php', 'asp', 'aspx', 'cfm')
+    file_extensions = ('/', 'htm', 'html', 'xhtml', 'php', 'asp', 'aspx',
+                       'cfm')
 
     split_url = urlsplit(url)
 
@@ -219,6 +213,7 @@ def standardise_url(url=None, base_url=None):
         if url.startswith('/'):
             return urljoin(base_url, split_url.path)
         elif url.startswith(base_url):
-            return "".join([split_url.scheme, delim, split_url.netloc, split_url.path])
+            return "".join([split_url.scheme, delim, split_url.netloc,
+                            split_url.path])
 
     return None
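
The reflowed standardise_url docstring above describes two modes: default the scheme to http when none is given, and strip queries and fragments while joining relative URLs onto a base. A short stdlib illustration of the urlsplit/urljoin behaviour it relies on (the example URLs are made up):

from urllib.parse import urljoin, urlsplit

# No scheme given: default to http, as the docstring describes.
url = 'example.com/page.html'
if '://' not in url:
    url = 'http://' + url  # -> 'http://example.com/page.html'

# Queries and fragments are dropped; only scheme, host and path are kept.
parts = urlsplit('http://example.com/page.html?q=1#anchor')
print(''.join([parts.scheme, '://', parts.netloc, parts.path]))
# -> http://example.com/page.html

# Relative URLs are joined onto the base URL.
print(urljoin('http://example.com/', '/about.html'))
# -> http://example.com/about.html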