add most changes suggested by pycodestyle
@@ -26,7 +26,6 @@ class AsyncCrawler(object):
         self.client_session = None
         self.semaphore = asyncio.BoundedSemaphore(concurrency)
 
-
     async def crawl_url(self, url=None):
         '''
         Crawls the given URL and finds all new URLs in the initial page.
@@ -38,7 +37,6 @@ class AsyncCrawler(object):
 
         return urls
 
-
     def validate_url(self, url=None):
         '''
         Ensures we have a valid URL to crawl and that the site's robots.txt
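
The validate_url docstring above describes checking that a URL is worth crawling at all before the robots.txt test. A minimal sketch of that kind of well-formedness check using only the standard library; looks_crawlable is a hypothetical name, not taken from this diff:

from urllib.parse import urlsplit

def looks_crawlable(url):
    # Hypothetical helper: accept only absolute http(s) URLs with a host.
    parts = urlsplit(url)
    return parts.scheme in ('http', 'https') and bool(parts.netloc)

print(looks_crawlable('https://example.com/'))   # True
print(looks_crawlable('mailto:me@example.com'))  # False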
@@ -52,7 +50,6 @@ class AsyncCrawler(object):
         else:
             return False
 
-
     async def get_source(self, url=None):
         '''
         Obtains the URL's source, provided it is HTML. Usage of semaphores
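
The get_source docstring above mentions gating fetches with semaphores and only accepting HTML. A sketch of that pattern, assuming an aiohttp ClientSession (aiohttp is implied by client_session but is not shown in this diff); fetch_html and demo are illustrative names:

import asyncio
import aiohttp

async def fetch_html(session, semaphore, url):
    # Hold a semaphore slot for the duration of the request so at most
    # `concurrency` fetches are in flight at once.
    async with semaphore:
        async with session.get(url) as response:
            # Skip anything that is not HTML (images, PDFs, etc.).
            if 'text/html' not in response.headers.get('Content-Type', ''):
                return None
            return await response.text()

async def demo():
    semaphore = asyncio.BoundedSemaphore(5)
    async with aiohttp.ClientSession() as session:
        return await fetch_html(session, semaphore, 'https://example.com/')

print(asyncio.run(demo()))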
@@ -72,7 +69,6 @@ class AsyncCrawler(object):
         except Exception:
             return None
 
-
     def find_all_urls(self, source=None):
         '''
         Find all URLs in a page's source. Returns a list of URLs which have
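
find_all_urls is described as returning the URLs found in a page's source, but the extraction itself is not visible in this hunk. One standard-library way to do that job, as a sketch (HrefCollector is a hypothetical name):

from html.parser import HTMLParser

class HrefCollector(HTMLParser):
    # Collects the href attribute of every <a> tag encountered.
    def __init__(self):
        super().__init__()
        self.hrefs = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.hrefs.append(value)

parser = HrefCollector()
parser.feed('<p><a href="/about">About</a> <a href="https://example.com/">Home</a></p>')
print(parser.hrefs)  # ['/about', 'https://example.com/']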
@@ -91,13 +87,12 @@ class AsyncCrawler(object):
 
         return urls
 
-
     async def run(self, urls=None):
         '''
         Crawls a batch of URLs of any size (resource usage is bounded by n
         semaphores (where n = concurrency). Returns a set of URLs to be added
-        to the list of URLs which need to be crawled (find_all_urls only returns
-        unseen URLs).
+        to the list of URLs which need to be crawled (find_all_urls only
+        returns unseen URLs).
         '''
         tasks = []
         all_urls = set()
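
The run docstring describes launching a batch of tasks whose concurrency is capped by the semaphore and merging the discovered URLs into a set as each task finishes (the as_completed loop appears in the next hunk). A self-contained sketch of that pattern with a stand-in crawl coroutine, not the project's actual fetch logic:

import asyncio

async def crawl(url, semaphore):
    # Stand-in for a real fetch-and-parse coroutine; the semaphore caps
    # how many of these run concurrently.
    async with semaphore:
        await asyncio.sleep(0.1)
        return {url + '/child'}

async def run_batch(urls, concurrency=3):
    semaphore = asyncio.BoundedSemaphore(concurrency)
    tasks = [crawl(url, semaphore) for url in urls]
    all_urls = set()
    # Consume results as each task finishes, ignoring individual failures.
    for task in asyncio.as_completed(tasks):
        try:
            all_urls |= await task
        except Exception as e:
            print(e)
    return all_urls

print(asyncio.run(run_batch(['a', 'b', 'c', 'd'])))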
@@ -111,7 +106,6 @@ class AsyncCrawler(object):
         for task in asyncio.as_completed(tasks):
             urls = None
             try:
-                # completed.append((await task))
                 urls = await task
             except Exception as e:
                 print(e)
@@ -124,7 +118,6 @@ class AsyncCrawler(object):
 
         return all_urls
 
-
     async def main(self):
         '''
         Runs a crawl with batches of URLs. Once complete returns a list of all
@@ -180,7 +173,6 @@ class RobotsTxt(object):
 
         self.robots = robots
 
-
     def check(self, url):
         '''
         Test if robots allows us to crawl that URL.
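
RobotsTxt.check is documented as testing whether robots.txt permits crawling a URL. The class's own parsing is not shown in this hunk; the standard library's urllib.robotparser answers the same question, sketched here (requires network access to fetch the live robots.txt):

from urllib.robotparser import RobotFileParser

robots = RobotFileParser('https://example.com/robots.txt')
robots.read()  # fetch and parse the live robots.txt

# can_fetch() answers the same question RobotsTxt.check() describes:
# may this user agent crawl this URL?
print(robots.can_fetch('*', 'https://example.com/private/page.html'))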
@@ -190,19 +182,21 @@ class RobotsTxt(object):
 
 def standardise_url(url=None, base_url=None):
     '''
-    If `base_url` is None then we attempt to standarise the URL to ensure it can
-    be prepended to relative URLs. If no scheme has been provided then we default
-    to http as any sane https-only site should 301 redirect http > https.
+    If `base_url` is None then we attempt to standarise the URL to ensure it
+    can be prepended to relative URLs. If no scheme has been provided then we
+    default to http as any sane https-only site should 301 redirect http to
+    https.
 
-    If `base_url` is set, we standardise URLs to strip queries and fragments (we
-    don't want to scrape in-page anchors etc). Any relative URLs will be appended
-    to the base url.
+    If `base_url` is set, we standardise URLs to strip queries and fragments
+    (we don't want to scrape in-page anchors etc). Any relative URLs will be
+    appended to the base url.
 
     Returns a standardised URL as a string.
     '''
     default_proto = 'http'
     delim = '://'
-    file_extensions = ('/', 'htm', 'html', 'xhtml', 'php', 'asp', 'aspx', 'cfm')
+    file_extensions = ('/', 'htm', 'html', 'xhtml', 'php', 'asp', 'aspx',
+                       'cfm')
 
     split_url = urlsplit(url)
 
@@ -219,6 +213,7 @@ def standardise_url(url=None, base_url=None):
     if url.startswith('/'):
         return urljoin(base_url, split_url.path)
     elif url.startswith(base_url):
-        return "".join([split_url.scheme, delim, split_url.netloc, split_url.path])
+        return "".join([split_url.scheme, delim, split_url.netloc,
+                        split_url.path])
 
     return None
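
The standardise_url docstring rewrapped above describes two behaviours: default scheme-less URLs to http, and resolve links against base_url while dropping queries and fragments. A rough stand-in (normalise is a hypothetical name, not the project's function) illustrating both cases:

from urllib.parse import urljoin, urlsplit

def normalise(url, base_url=None):
    # Hypothetical stand-in for standardise_url(), illustrating the
    # behaviour its docstring describes.
    if base_url is None:
        # Default to http; an https-only site should redirect us anyway.
        return url if '://' in url else 'http://' + url
    parts = urlsplit(url)
    if url.startswith('/'):
        # Relative link: resolve against the base, keeping only the path.
        return urljoin(base_url, parts.path)
    if url.startswith(base_url):
        # Same-site absolute link: drop any query string or fragment.
        return ''.join([parts.scheme, '://', parts.netloc, parts.path])
    return None

print(normalise('example.com'))                                           # http://example.com
print(normalise('/about?x=1#top', 'http://example.com'))                  # http://example.com/about
print(normalise('http://example.com/a.html#frag', 'http://example.com'))  # http://example.com/a.html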