diff --git a/utils/helpers.py b/utils/helpers.py
index f18d78a..5739b7f 100644
--- a/utils/helpers.py
+++ b/utils/helpers.py
@@ -26,7 +26,6 @@ class AsyncCrawler(object):
         self.client_session = None
         self.semaphore = asyncio.BoundedSemaphore(concurrency)
 
-
     async def crawl_url(self, url=None):
         '''
         Crawls the given URL and finds all new URLs in the initial page.
@@ -38,7 +37,6 @@ class AsyncCrawler(object):
 
         return urls
 
-
     def validate_url(self, url=None):
         '''
         Ensures we have a valid URL to crawl and that the site's robots.txt
@@ -52,7 +50,6 @@ class AsyncCrawler(object):
         else:
             return False
 
-
     async def get_source(self, url=None):
         '''
         Obtains the URL's source, provided it is HTML. Usage of semaphores
@@ -72,7 +69,6 @@ class AsyncCrawler(object):
         except Exception:
             return None
 
-
     def find_all_urls(self, source=None):
         '''
         Find all URLs in a page's source. Returns a list of URLs which have
@@ -91,13 +87,12 @@ class AsyncCrawler(object):
 
         return urls
 
-
     async def run(self, urls=None):
         '''
         Crawls a batch of URLs of any size (resource usage is bounded by n
         semaphores (where n = concurrency). Returns a set of URLs to be added
-        to the list of URLs which need to be crawled (find_all_urls only returns
-        unseen URLs).
+        to the list of URLs which need to be crawled (find_all_urls only
+        returns unseen URLs).
         '''
         tasks = []
         all_urls = set()
@@ -111,7 +106,6 @@ class AsyncCrawler(object):
         for task in asyncio.as_completed(tasks):
             urls = None
             try:
-                # completed.append((await task))
                 urls = await task
             except Exception as e:
                 print(e)
@@ -124,7 +118,6 @@ class AsyncCrawler(object):
 
         return all_urls
 
-
     async def main(self):
         '''
         Runs a crawl with batches of URLs. Once complete returns a list of all
@@ -180,7 +173,6 @@ class RobotsTxt(object):
 
         self.robots = robots
 
-
    def check(self, url):
        '''
        Test if robots allows us to crawl that URL.
@@ -190,19 +182,21 @@
 
 def standardise_url(url=None, base_url=None):
     '''
-    If `base_url` is None then we attempt to standarise the URL to ensure it can
-    be prepended to relative URLs. If no scheme has been provided then we default
-    to http as any sane https-only site should 301 redirect http > https.
+    If `base_url` is None then we attempt to standardise the URL to ensure it
+    can be prepended to relative URLs. If no scheme has been provided then we
+    default to http as any sane https-only site should 301 redirect http to
+    https.
 
-    If `base_url` is set, we standardise URLs to strip queries and fragments (we
-    don't want to scrape in-page anchors etc). Any relative URLs will be appended
-    to the base url.
+    If `base_url` is set, we standardise URLs to strip queries and fragments
+    (we don't want to scrape in-page anchors etc). Any relative URLs will be
+    appended to the base url.
 
     Returns a standardised URL as a string.
     '''
     default_proto = 'http'
     delim = '://'
-    file_extensions = ('/', 'htm', 'html', 'xhtml', 'php', 'asp', 'aspx', 'cfm')
+    file_extensions = ('/', 'htm', 'html', 'xhtml', 'php', 'asp', 'aspx',
+                       'cfm')
 
     split_url = urlsplit(url)
 
@@ -219,6 +213,7 @@ def standardise_url(url=None, base_url=None):
     if url.startswith('/'):
         return urljoin(base_url, split_url.path)
     elif url.startswith(base_url):
-        return "".join([split_url.scheme, delim, split_url.netloc, split_url.path])
+        return "".join([split_url.scheme, delim, split_url.netloc,
+                        split_url.path])
 
     return None
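
A quick usage sketch of standardise_url, based only on the docstring above
(the import path utils.helpers and the exact return values are assumptions;
the full implementation is only partially visible in this diff):

    from utils.helpers import standardise_url

    # No base_url: a bare domain should gain the default http:// scheme.
    print(standardise_url(url='example.com'))
    # expected: something like 'http://example.com'

    # With base_url: relative paths are joined onto the base URL, and
    # queries/fragments are stripped before the result is returned.
    print(standardise_url(url='/about.html', base_url='http://example.com'))
    # expected: something like 'http://example.com/about.html'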