summaryrefslogtreecommitdiffstats
path: root/roles/openshift_health_checker/openshift_checks/logging
diff options
context:
space:
mode:
Diffstat (limited to 'roles/openshift_health_checker/openshift_checks/logging')
-rw-r--r--roles/openshift_health_checker/openshift_checks/logging/__init__.py0
-rw-r--r--roles/openshift_health_checker/openshift_checks/logging/curator.py61
-rw-r--r--roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py217
-rw-r--r--roles/openshift_health_checker/openshift_checks/logging/fluentd.py170
-rw-r--r--roles/openshift_health_checker/openshift_checks/logging/kibana.py229
-rw-r--r--roles/openshift_health_checker/openshift_checks/logging/logging.py96
6 files changed, 773 insertions, 0 deletions
diff --git a/roles/openshift_health_checker/openshift_checks/logging/__init__.py b/roles/openshift_health_checker/openshift_checks/logging/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/__init__.py
diff --git a/roles/openshift_health_checker/openshift_checks/logging/curator.py b/roles/openshift_health_checker/openshift_checks/logging/curator.py
new file mode 100644
index 000000000..c9fc59896
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/curator.py
@@ -0,0 +1,61 @@
+"""
+Module for performing checks on an Curator logging deployment
+"""
+
+from openshift_checks import get_var
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class Curator(LoggingCheck):
+ """Module that checks an integrated logging Curator deployment"""
+
+ name = "curator"
+ tags = ["health", "logging"]
+
+ logging_namespace = None
+
+ def run(self, tmp, task_vars):
+ """Check various things and gather errors. Returns: result as hash"""
+
+ self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging")
+ curator_pods, error = super(Curator, self).get_pods_for_component(
+ self.module_executor,
+ self.logging_namespace,
+ "curator",
+ task_vars
+ )
+ if error:
+ return {"failed": True, "changed": False, "msg": error}
+ check_error = self.check_curator(curator_pods)
+
+ if check_error:
+ msg = ("The following Curator deployment issue was found:"
+ "\n-------\n"
+ "{}".format(check_error))
+ return {"failed": True, "changed": False, "msg": msg}
+
+ # TODO(lmeyer): run it all again for the ops cluster
+ return {"failed": False, "changed": False, "msg": 'No problems found with Curator deployment.'}
+
+ def check_curator(self, pods):
+ """Check to see if curator is up and working. Returns: error string"""
+ if not pods:
+ return (
+ "There are no Curator pods for the logging stack,\n"
+ "so nothing will prune Elasticsearch indexes.\n"
+ "Is Curator correctly deployed?"
+ )
+
+ not_running = super(Curator, self).not_running_pods(pods)
+ if len(not_running) == len(pods):
+ return (
+ "The Curator pod is not currently in a running state,\n"
+ "so Elasticsearch indexes may increase without bound."
+ )
+ if len(pods) - len(not_running) > 1:
+ return (
+ "There is more than one Curator pod running. This should not normally happen.\n"
+ "Although this doesn't cause any problems, you may want to investigate."
+ )
+
+ return None
diff --git a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
new file mode 100644
index 000000000..01cb35b81
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
@@ -0,0 +1,217 @@
+"""
+Module for performing checks on an Elasticsearch logging deployment
+"""
+
+import json
+import re
+
+from openshift_checks import get_var
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class Elasticsearch(LoggingCheck):
+ """Module that checks an integrated logging Elasticsearch deployment"""
+
+ name = "elasticsearch"
+ tags = ["health", "logging"]
+
+ logging_namespace = None
+
+ def run(self, tmp, task_vars):
+ """Check various things and gather errors. Returns: result as hash"""
+
+ self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging")
+ es_pods, error = super(Elasticsearch, self).get_pods_for_component(
+ self.execute_module,
+ self.logging_namespace,
+ "es",
+ task_vars,
+ )
+ if error:
+ return {"failed": True, "changed": False, "msg": error}
+ check_error = self.check_elasticsearch(es_pods, task_vars)
+
+ if check_error:
+ msg = ("The following Elasticsearch deployment issue was found:"
+ "\n-------\n"
+ "{}".format(check_error))
+ return {"failed": True, "changed": False, "msg": msg}
+
+ # TODO(lmeyer): run it all again for the ops cluster
+ return {"failed": False, "changed": False, "msg": 'No problems found with Elasticsearch deployment.'}
+
+ def _not_running_elasticsearch_pods(self, es_pods):
+ """Returns: list of running pods, list of errors about non-running pods"""
+ not_running = super(Elasticsearch, self).not_running_pods(es_pods)
+ if not_running:
+ return not_running, [(
+ 'The following Elasticsearch pods are not running:\n'
+ '{pods}'
+ 'These pods will not aggregate logs from their nodes.'
+ ).format(pods=''.join(
+ " {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None'))
+ for pod in not_running
+ ))]
+ return not_running, []
+
+ def check_elasticsearch(self, es_pods, task_vars):
+ """Various checks for elasticsearch. Returns: error string"""
+ not_running_pods, error_msgs = self._not_running_elasticsearch_pods(es_pods)
+ running_pods = [pod for pod in es_pods if pod not in not_running_pods]
+ pods_by_name = {
+ pod['metadata']['name']: pod for pod in running_pods
+ # Filter out pods that are not members of a DC
+ if pod['metadata'].get('labels', {}).get('deploymentconfig')
+ }
+ if not pods_by_name:
+ return 'No logging Elasticsearch pods were found. Is logging deployed?'
+ error_msgs += self._check_elasticsearch_masters(pods_by_name, task_vars)
+ error_msgs += self._check_elasticsearch_node_list(pods_by_name, task_vars)
+ error_msgs += self._check_es_cluster_health(pods_by_name, task_vars)
+ error_msgs += self._check_elasticsearch_diskspace(pods_by_name, task_vars)
+ return '\n'.join(error_msgs)
+
+ @staticmethod
+ def _build_es_curl_cmd(pod_name, url):
+ base = "exec {name} -- curl -s --cert {base}cert --key {base}key --cacert {base}ca -XGET '{url}'"
+ return base.format(base="/etc/elasticsearch/secret/admin-", name=pod_name, url=url)
+
+ def _check_elasticsearch_masters(self, pods_by_name, task_vars):
+ """Check that Elasticsearch masters are sane. Returns: list of error strings"""
+ es_master_names = set()
+ error_msgs = []
+ for pod_name in pods_by_name.keys():
+ # Compare what each ES node reports as master and compare for split brain
+ get_master_cmd = self._build_es_curl_cmd(pod_name, "https://localhost:9200/_cat/master")
+ master_name_str = self._exec_oc(get_master_cmd, [], task_vars)
+ master_names = (master_name_str or '').split(' ')
+ if len(master_names) > 1:
+ es_master_names.add(master_names[1])
+ else:
+ error_msgs.append(
+ 'No master? Elasticsearch {pod} returned bad string when asked master name:\n'
+ ' {response}'.format(pod=pod_name, response=master_name_str)
+ )
+
+ if not es_master_names:
+ error_msgs.append('No logging Elasticsearch masters were found. Is logging deployed?')
+ return '\n'.join(error_msgs)
+
+ if len(es_master_names) > 1:
+ error_msgs.append(
+ 'Found multiple Elasticsearch masters according to the pods:\n'
+ '{master_list}\n'
+ 'This implies that the masters have "split brain" and are not correctly\n'
+ 'replicating data for the logging cluster. Log loss is likely to occur.'
+ .format(master_list='\n'.join(' ' + master for master in es_master_names))
+ )
+
+ return error_msgs
+
+ def _check_elasticsearch_node_list(self, pods_by_name, task_vars):
+ """Check that reported ES masters are accounted for by pods. Returns: list of error strings"""
+
+ if not pods_by_name:
+ return ['No logging Elasticsearch masters were found. Is logging deployed?']
+
+ # get ES cluster nodes
+ node_cmd = self._build_es_curl_cmd(list(pods_by_name.keys())[0], 'https://localhost:9200/_nodes')
+ cluster_node_data = self._exec_oc(node_cmd, [], task_vars)
+ try:
+ cluster_nodes = json.loads(cluster_node_data)['nodes']
+ except (ValueError, KeyError):
+ return [
+ 'Failed to query Elasticsearch for the list of ES nodes. The output was:\n' +
+ cluster_node_data
+ ]
+
+ # Try to match all ES-reported node hosts to known pods.
+ error_msgs = []
+ for node in cluster_nodes.values():
+ # Note that with 1.4/3.4 the pod IP may be used as the master name
+ if not any(node['host'] in (pod_name, pod['status'].get('podIP'))
+ for pod_name, pod in pods_by_name.items()):
+ error_msgs.append(
+ 'The Elasticsearch cluster reports a member node "{node}"\n'
+ 'that does not correspond to any known ES pod.'.format(node=node['host'])
+ )
+
+ return error_msgs
+
+ def _check_es_cluster_health(self, pods_by_name, task_vars):
+ """Exec into the elasticsearch pods and check the cluster health. Returns: list of errors"""
+ error_msgs = []
+ for pod_name in pods_by_name.keys():
+ cluster_health_cmd = self._build_es_curl_cmd(pod_name, 'https://localhost:9200/_cluster/health?pretty=true')
+ cluster_health_data = self._exec_oc(cluster_health_cmd, [], task_vars)
+ try:
+ health_res = json.loads(cluster_health_data)
+ if not health_res or not health_res.get('status'):
+ raise ValueError()
+ except ValueError:
+ error_msgs.append(
+ 'Could not retrieve cluster health status from logging ES pod "{pod}".\n'
+ 'Response was:\n{output}'.format(pod=pod_name, output=cluster_health_data)
+ )
+ continue
+
+ if health_res['status'] not in ['green', 'yellow']:
+ error_msgs.append(
+ 'Elasticsearch cluster health status is RED according to pod "{}"'.format(pod_name)
+ )
+
+ return error_msgs
+
+ def _check_elasticsearch_diskspace(self, pods_by_name, task_vars):
+ """
+ Exec into an ES pod and query the diskspace on the persistent volume.
+ Returns: list of errors
+ """
+ error_msgs = []
+ for pod_name in pods_by_name.keys():
+ df_cmd = 'exec {} -- df --output=ipcent,pcent /elasticsearch/persistent'.format(pod_name)
+ disk_output = self._exec_oc(df_cmd, [], task_vars)
+ lines = disk_output.splitlines()
+ # expecting one header looking like 'IUse% Use%' and one body line
+ body_re = r'\s*(\d+)%?\s+(\d+)%?\s*$'
+ if len(lines) != 2 or len(lines[0].split()) != 2 or not re.match(body_re, lines[1]):
+ error_msgs.append(
+ 'Could not retrieve storage usage from logging ES pod "{pod}".\n'
+ 'Response to `df` command was:\n{output}'.format(pod=pod_name, output=disk_output)
+ )
+ continue
+ inode_pct, disk_pct = re.match(body_re, lines[1]).groups()
+
+ inode_pct_thresh = get_var(task_vars, 'openshift_check_efk_es_inode_pct', default='90')
+ if int(inode_pct) >= int(inode_pct_thresh):
+ error_msgs.append(
+ 'Inode percent usage on the storage volume for logging ES pod "{pod}"\n'
+ ' is {pct}, greater than threshold {limit}.\n'
+ ' Note: threshold can be specified in inventory with {param}'.format(
+ pod=pod_name,
+ pct=str(inode_pct),
+ limit=str(inode_pct_thresh),
+ param='openshift_check_efk_es_inode_pct',
+ ))
+ disk_pct_thresh = get_var(task_vars, 'openshift_check_efk_es_storage_pct', default='80')
+ if int(disk_pct) >= int(disk_pct_thresh):
+ error_msgs.append(
+ 'Disk percent usage on the storage volume for logging ES pod "{pod}"\n'
+ ' is {pct}, greater than threshold {limit}.\n'
+ ' Note: threshold can be specified in inventory with {param}'.format(
+ pod=pod_name,
+ pct=str(disk_pct),
+ limit=str(disk_pct_thresh),
+ param='openshift_check_efk_es_storage_pct',
+ ))
+
+ return error_msgs
+
+ def _exec_oc(self, cmd_str, extra_args, task_vars):
+ return super(Elasticsearch, self).exec_oc(
+ self.execute_module,
+ self.logging_namespace,
+ cmd_str,
+ extra_args,
+ task_vars,
+ )
diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
new file mode 100644
index 000000000..627567293
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
@@ -0,0 +1,170 @@
+"""
+Module for performing checks on an Fluentd logging deployment
+"""
+
+import json
+
+from openshift_checks import get_var
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class Fluentd(LoggingCheck):
+ """Module that checks an integrated logging Fluentd deployment"""
+ name = "fluentd"
+ tags = ["health", "logging"]
+
+ logging_namespace = None
+
+ def run(self, tmp, task_vars):
+ """Check various things and gather errors. Returns: result as hash"""
+
+ self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging")
+ fluentd_pods, error = super(Fluentd, self).get_pods_for_component(
+ self.execute_module,
+ self.logging_namespace,
+ "fluentd",
+ task_vars,
+ )
+ if error:
+ return {"failed": True, "changed": False, "msg": error}
+ check_error = self.check_fluentd(fluentd_pods, task_vars)
+
+ if check_error:
+ msg = ("The following Fluentd deployment issue was found:"
+ "\n-------\n"
+ "{}".format(check_error))
+ return {"failed": True, "changed": False, "msg": msg}
+
+ # TODO(lmeyer): run it all again for the ops cluster
+ return {"failed": False, "changed": False, "msg": 'No problems found with Fluentd deployment.'}
+
+ @staticmethod
+ def _filter_fluentd_labeled_nodes(nodes_by_name, node_selector):
+ """Filter to all nodes with fluentd label. Returns dict(name: node), error string"""
+ label, value = node_selector.split('=', 1)
+ fluentd_nodes = {
+ name: node for name, node in nodes_by_name.items()
+ if node['metadata']['labels'].get(label) == value
+ }
+ if not fluentd_nodes:
+ return None, (
+ 'There are no nodes with the fluentd label {label}.\n'
+ 'This means no logs will be aggregated from the nodes.'
+ ).format(label=node_selector)
+ return fluentd_nodes, None
+
+ @staticmethod
+ def _check_node_labeling(nodes_by_name, fluentd_nodes, node_selector, task_vars):
+ """Note if nodes are not labeled as expected. Returns: error string"""
+ intended_nodes = get_var(task_vars, 'openshift_logging_fluentd_hosts', default=['--all'])
+ if not intended_nodes or '--all' in intended_nodes:
+ intended_nodes = nodes_by_name.keys()
+ nodes_missing_labels = set(intended_nodes) - set(fluentd_nodes.keys())
+ if nodes_missing_labels:
+ return (
+ 'The following nodes are supposed to be labeled with {label} but are not:\n'
+ ' {nodes}\n'
+ 'Fluentd will not aggregate logs from these nodes.'
+ ).format(label=node_selector, nodes=', '.join(nodes_missing_labels))
+ return None
+
+ @staticmethod
+ def _check_nodes_have_fluentd(pods, fluentd_nodes):
+ """Make sure fluentd is on all the labeled nodes. Returns: error string"""
+ unmatched_nodes = fluentd_nodes.copy()
+ node_names_by_label = {
+ node['metadata']['labels']['kubernetes.io/hostname']: name
+ for name, node in fluentd_nodes.items()
+ }
+ node_names_by_internal_ip = {
+ address['address']: name
+ for name, node in fluentd_nodes.items()
+ for address in node['status']['addresses']
+ if address['type'] == "InternalIP"
+ }
+ for pod in pods:
+ for name in [
+ pod['spec']['nodeName'],
+ node_names_by_internal_ip.get(pod['spec']['nodeName']),
+ node_names_by_label.get(pod.get('spec', {}).get('host')),
+ ]:
+ unmatched_nodes.pop(name, None)
+ if unmatched_nodes:
+ return (
+ 'The following nodes are supposed to have a Fluentd pod but do not:\n'
+ '{nodes}'
+ 'These nodes will not have their logs aggregated.'
+ ).format(nodes=''.join(
+ " {}\n".format(name)
+ for name in unmatched_nodes.keys()
+ ))
+ return None
+
+ def _check_fluentd_pods_running(self, pods):
+ """Make sure all fluentd pods are running. Returns: error string"""
+ not_running = super(Fluentd, self).not_running_pods(pods)
+ if not_running:
+ return (
+ 'The following Fluentd pods are supposed to be running but are not:\n'
+ '{pods}'
+ 'These pods will not aggregate logs from their nodes.'
+ ).format(pods=''.join(
+ " {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None'))
+ for pod in not_running
+ ))
+ return None
+
+ def check_fluentd(self, pods, task_vars):
+ """Verify fluentd is running everywhere. Returns: error string"""
+
+ node_selector = get_var(task_vars, 'openshift_logging_fluentd_nodeselector',
+ default='logging-infra-fluentd=true')
+
+ nodes_by_name, error = self.get_nodes_by_name(task_vars)
+
+ if error:
+ return error
+ fluentd_nodes, error = self._filter_fluentd_labeled_nodes(nodes_by_name, node_selector)
+ if error:
+ return error
+
+ error_msgs = []
+ error = self._check_node_labeling(nodes_by_name, fluentd_nodes, node_selector, task_vars)
+ if error:
+ error_msgs.append(error)
+ error = self._check_nodes_have_fluentd(pods, fluentd_nodes)
+ if error:
+ error_msgs.append(error)
+ error = self._check_fluentd_pods_running(pods)
+ if error:
+ error_msgs.append(error)
+
+ # Make sure there are no extra fluentd pods
+ if len(pods) > len(fluentd_nodes):
+ error_msgs.append(
+ 'There are more Fluentd pods running than nodes labeled.\n'
+ 'This may not cause problems with logging but it likely indicates something wrong.'
+ )
+
+ return '\n'.join(error_msgs)
+
+ def get_nodes_by_name(self, task_vars):
+ """Retrieve all the node definitions. Returns: dict(name: node), error string"""
+ nodes_json = self._exec_oc("get nodes -o json", [], task_vars)
+ try:
+ nodes = json.loads(nodes_json)
+ except ValueError: # no valid json - should not happen
+ return None, "Could not obtain a list of nodes to validate fluentd. Output from oc get:\n" + nodes_json
+ if not nodes or not nodes.get('items'): # also should not happen
+ return None, "No nodes appear to be defined according to the API."
+ return {
+ node['metadata']['name']: node
+ for node in nodes['items']
+ }, None
+
+ def _exec_oc(self, cmd_str, extra_args, task_vars):
+ return super(Fluentd, self).exec_oc(self.execute_module,
+ self.logging_namespace,
+ cmd_str,
+ extra_args,
+ task_vars)
diff --git a/roles/openshift_health_checker/openshift_checks/logging/kibana.py b/roles/openshift_health_checker/openshift_checks/logging/kibana.py
new file mode 100644
index 000000000..442f407b1
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/kibana.py
@@ -0,0 +1,229 @@
+"""
+Module for performing checks on a Kibana logging deployment
+"""
+
+import json
+import ssl
+
+try:
+ from urllib2 import HTTPError, URLError
+ import urllib2
+except ImportError:
+ from urllib.error import HTTPError, URLError
+ import urllib.request as urllib2
+
+from openshift_checks import get_var
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class Kibana(LoggingCheck):
+ """Module that checks an integrated logging Kibana deployment"""
+
+ name = "kibana"
+ tags = ["health", "logging"]
+
+ logging_namespace = None
+
+ def run(self, tmp, task_vars):
+ """Check various things and gather errors. Returns: result as hash"""
+
+ self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging")
+ kibana_pods, error = super(Kibana, self).get_pods_for_component(
+ self.execute_module,
+ self.logging_namespace,
+ "kibana",
+ task_vars,
+ )
+ if error:
+ return {"failed": True, "changed": False, "msg": error}
+ check_error = self.check_kibana(kibana_pods)
+
+ if not check_error:
+ check_error = self._check_kibana_route(task_vars)
+
+ if check_error:
+ msg = ("The following Kibana deployment issue was found:"
+ "\n-------\n"
+ "{}".format(check_error))
+ return {"failed": True, "changed": False, "msg": msg}
+
+ # TODO(lmeyer): run it all again for the ops cluster
+ return {"failed": False, "changed": False, "msg": 'No problems found with Kibana deployment.'}
+
+ def _verify_url_internal(self, url, task_vars):
+ """
+ Try to reach a URL from the host.
+ Returns: success (bool), reason (for failure)
+ """
+ args = dict(
+ url=url,
+ follow_redirects='none',
+ validate_certs='no', # likely to be signed with internal CA
+ # TODO(lmeyer): give users option to validate certs
+ status_code=302,
+ )
+ result = self.execute_module('uri', args, task_vars)
+ if result.get('failed'):
+ return result['msg']
+ return None
+
+ @staticmethod
+ def _verify_url_external(url):
+ """
+ Try to reach a URL from ansible control host.
+ Returns: success (bool), reason (for failure)
+ """
+ # This actually checks from the ansible control host, which may or may not
+ # really be "external" to the cluster.
+
+ # Disable SSL cert validation to work around internally signed certs
+ ctx = ssl.create_default_context()
+ ctx.check_hostname = False # or setting CERT_NONE is refused
+ ctx.verify_mode = ssl.CERT_NONE
+
+ # Verify that the url is returning a valid response
+ try:
+ # We only care if the url connects and responds
+ return_code = urllib2.urlopen(url, context=ctx).getcode()
+ except HTTPError as httperr:
+ return httperr.reason
+ except URLError as urlerr:
+ return str(urlerr)
+
+ # there appears to be no way to prevent urlopen from following redirects
+ if return_code != 200:
+ return 'Expected success (200) but got return code {}'.format(int(return_code))
+
+ return None
+
+ def check_kibana(self, pods):
+ """Check to see if Kibana is up and working. Returns: error string."""
+
+ if not pods:
+ return "There are no Kibana pods deployed, so no access to the logging UI."
+
+ not_running = self.not_running_pods(pods)
+ if len(not_running) == len(pods):
+ return "No Kibana pod is in a running state, so there is no access to the logging UI."
+ elif not_running:
+ return (
+ "The following Kibana pods are not currently in a running state:\n"
+ "{pods}"
+ "However at least one is, so service may not be impacted."
+ ).format(pods="".join(" " + pod['metadata']['name'] + "\n" for pod in not_running))
+
+ return None
+
+ def _get_kibana_url(self, task_vars):
+ """
+ Get kibana route or report error.
+ Returns: url (or empty), reason for failure
+ """
+
+ # Get logging url
+ get_route = self._exec_oc("get route logging-kibana -o json", [], task_vars)
+ if not get_route:
+ return None, 'no_route_exists'
+
+ route = json.loads(get_route)
+
+ # check that the route has been accepted by a router
+ ingress = route["status"]["ingress"]
+ # ingress can be null if there is no router, or empty if not routed
+ if not ingress or not ingress[0]:
+ return None, 'route_not_accepted'
+
+ host = route.get("spec", {}).get("host")
+ if not host:
+ return None, 'route_missing_host'
+
+ return 'https://{}/'.format(host), None
+
+ def _check_kibana_route(self, task_vars):
+ """
+ Check to see if kibana route is up and working.
+ Returns: error string
+ """
+ known_errors = dict(
+ no_route_exists=(
+ 'No route is defined for Kibana in the logging namespace,\n'
+ 'so the logging stack is not accessible. Is logging deployed?\n'
+ 'Did something remove the logging-kibana route?'
+ ),
+ route_not_accepted=(
+ 'The logging-kibana route is not being routed by any router.\n'
+ 'Is the router deployed and working?'
+ ),
+ route_missing_host=(
+ 'The logging-kibana route has no hostname defined,\n'
+ 'which should never happen. Did something alter its definition?'
+ ),
+ )
+
+ kibana_url, error = self._get_kibana_url(task_vars)
+ if not kibana_url:
+ return known_errors.get(error, error)
+
+ # first, check that kibana is reachable from the master.
+ error = self._verify_url_internal(kibana_url, task_vars)
+ if error:
+ if 'urlopen error [Errno 111] Connection refused' in error:
+ error = (
+ 'Failed to connect from this master to Kibana URL {url}\n'
+ 'Is kibana running, and is at least one router routing to it?'
+ ).format(url=kibana_url)
+ elif 'urlopen error [Errno -2] Name or service not known' in error:
+ error = (
+ 'Failed to connect from this master to Kibana URL {url}\n'
+ 'because the hostname does not resolve.\n'
+ 'Is DNS configured for the Kibana hostname?'
+ ).format(url=kibana_url)
+ elif 'Status code was not' in error:
+ error = (
+ 'A request from this master to the Kibana URL {url}\n'
+ 'did not return the correct status code (302).\n'
+ 'This could mean that Kibana is malfunctioning, the hostname is\n'
+ 'resolving incorrectly, or other network issues. The output was:\n'
+ ' {error}'
+ ).format(url=kibana_url, error=error)
+ return 'Error validating the logging Kibana route:\n' + error
+
+ # in production we would like the kibana route to work from outside the
+ # cluster too; but that may not be the case, so allow disabling just this part.
+ if not get_var(task_vars, "openshift_check_efk_kibana_external", default=True):
+ return None
+ error = self._verify_url_external(kibana_url)
+ if error:
+ if 'urlopen error [Errno 111] Connection refused' in error:
+ error = (
+ 'Failed to connect from the Ansible control host to Kibana URL {url}\n'
+ 'Is the router for the Kibana hostname exposed externally?'
+ ).format(url=kibana_url)
+ elif 'urlopen error [Errno -2] Name or service not known' in error:
+ error = (
+ 'Failed to resolve the Kibana hostname in {url}\n'
+ 'from the Ansible control host.\n'
+ 'Is DNS configured to resolve this Kibana hostname externally?'
+ ).format(url=kibana_url)
+ elif 'Expected success (200)' in error:
+ error = (
+ 'A request to Kibana at {url}\n'
+ 'returned the wrong error code:\n'
+ ' {error}\n'
+ 'This could mean that Kibana is malfunctioning, the hostname is\n'
+ 'resolving incorrectly, or other network issues.'
+ ).format(url=kibana_url, error=error)
+ error = (
+ 'Error validating the logging Kibana route:\n{error}\n'
+ 'To disable external Kibana route validation, set in your inventory:\n'
+ ' openshift_check_efk_kibana_external=False'
+ ).format(error=error)
+ return error
+ return None
+
+ def _exec_oc(self, cmd_str, extra_args, task_vars):
+ return super(Kibana, self).exec_oc(self.execute_module,
+ self.logging_namespace,
+ cmd_str,
+ extra_args,
+ task_vars)
diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging.py b/roles/openshift_health_checker/openshift_checks/logging/logging.py
new file mode 100644
index 000000000..05b4d300c
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/logging.py
@@ -0,0 +1,96 @@
+"""
+Util functions for performing checks on an Elasticsearch, Fluentd, and Kibana stack
+"""
+
+import json
+import os
+
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
+
+
+class LoggingCheck(OpenShiftCheck):
+ """Base class for logging component checks"""
+
+ name = "logging"
+
+ @classmethod
+ def is_active(cls, task_vars):
+ return super(LoggingCheck, cls).is_active(task_vars) and cls.is_first_master(task_vars)
+
+ @staticmethod
+ def is_first_master(task_vars):
+ """Run only on first master and only when logging is configured. Returns: bool"""
+ logging_deployed = get_var(task_vars, "openshift_hosted_logging_deploy", default=True)
+ # Note: It would be nice to use membership in oo_first_master group, however for now it
+ # seems best to avoid requiring that setup and just check this is the first master.
+ hostname = get_var(task_vars, "ansible_ssh_host") or [None]
+ masters = get_var(task_vars, "groups", "masters", default=None) or [None]
+ return logging_deployed and masters[0] == hostname
+
+ def run(self, tmp, task_vars):
+ pass
+
+ def get_pods_for_component(self, execute_module, namespace, logging_component, task_vars):
+ """Get all pods for a given component. Returns: list of pods for component, error string"""
+ pod_output = self.exec_oc(
+ execute_module,
+ namespace,
+ "get pods -l component={} -o json".format(logging_component),
+ [],
+ task_vars
+ )
+ try:
+ pods = json.loads(pod_output)
+ if not pods or not pods.get('items'):
+ raise ValueError()
+ except ValueError:
+ # successful run but non-parsing data generally means there were no pods in the namespace
+ return None, 'There are no pods in the {} namespace. Is logging deployed?'.format(namespace)
+
+ return pods['items'], None
+
+ @staticmethod
+ def not_running_pods(pods):
+ """Returns: list of pods not in a ready and running state"""
+ return [
+ pod for pod in pods
+ if any(
+ container['ready'] is False
+ for container in pod['status']['containerStatuses']
+ ) or not any(
+ condition['type'] == 'Ready' and condition['status'] == 'True'
+ for condition in pod['status']['conditions']
+ )
+ ]
+
+ @staticmethod
+ def exec_oc(execute_module=None, namespace="logging", cmd_str="", extra_args=None, task_vars=None):
+ """
+ Execute an 'oc' command in the remote host.
+ Returns: output of command and namespace,
+ or raises OpenShiftCheckException on error
+ """
+ config_base = get_var(task_vars, "openshift", "common", "config_base")
+ args = {
+ "namespace": namespace,
+ "config_file": os.path.join(config_base, "master", "admin.kubeconfig"),
+ "cmd": cmd_str,
+ "extra_args": list(extra_args) if extra_args else [],
+ }
+
+ result = execute_module("ocutil", args, task_vars)
+ if result.get("failed"):
+ msg = (
+ 'Unexpected error using `oc` to validate the logging stack components.\n'
+ 'Error executing `oc {cmd}`:\n'
+ '{error}'
+ ).format(cmd=args['cmd'], error=result['result'])
+
+ if result['result'] == '[Errno 2] No such file or directory':
+ msg = (
+ "This host is supposed to be a master but does not have the `oc` command where expected.\n"
+ "Has an installation been run on this host yet?"
+ )
+ raise OpenShiftCheckException(msg)
+
+ return result.get("result", "")