summaryrefslogtreecommitdiffstats
path: root/roles/openshift_health_checker/openshift_checks
diff options
context:
space:
mode:
authorRodolfo Carvalho <rhcarvalho@gmail.com>2017-08-08 18:53:44 +0200
committerGitHub <noreply@github.com>2017-08-08 18:53:44 +0200
commit7121e065b54f9642e6f69ca768b57c3eec542bf7 (patch)
tree4f849a9a7625cf97ad886c4513606121a0b20497 /roles/openshift_health_checker/openshift_checks
parent0569c5069dabeea9e2fe94cd097cb6f2b1540867 (diff)
parent06a6fb9642a2cc70b1ca65f403b853fe8ce9d4b2 (diff)
downloadopenshift-7121e065b54f9642e6f69ca768b57c3eec542bf7.tar.gz
openshift-7121e065b54f9642e6f69ca768b57c3eec542bf7.tar.bz2
openshift-7121e065b54f9642e6f69ca768b57c3eec542bf7.tar.xz
openshift-7121e065b54f9642e6f69ca768b57c3eec542bf7.zip
Merge pull request #4913 from sosiouxme/20170720-refactor-check-results
openshift_checks: refactor check results
Diffstat (limited to 'roles/openshift_health_checker/openshift_checks')
-rw-r--r--roles/openshift_health_checker/openshift_checks/__init__.py29
-rw-r--r--roles/openshift_health_checker/openshift_checks/docker_image_availability.py10
-rw-r--r--roles/openshift_health_checker/openshift_checks/docker_storage.py10
-rw-r--r--roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py4
-rw-r--r--roles/openshift_health_checker/openshift_checks/etcd_volume.py2
-rw-r--r--roles/openshift_health_checker/openshift_checks/logging/curator.py32
-rw-r--r--roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py166
-rw-r--r--roles/openshift_health_checker/openshift_checks/logging/fluentd.py191
-rw-r--r--roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py17
-rw-r--r--roles/openshift_health_checker/openshift_checks/logging/kibana.py208
-rw-r--r--roles/openshift_health_checker/openshift_checks/logging/logging.py54
-rw-r--r--roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py75
-rw-r--r--roles/openshift_health_checker/openshift_checks/mixins.py8
13 files changed, 417 insertions, 389 deletions
diff --git a/roles/openshift_health_checker/openshift_checks/__init__.py b/roles/openshift_health_checker/openshift_checks/__init__.py
index 85cbc6224..f26008c9f 100644
--- a/roles/openshift_health_checker/openshift_checks/__init__.py
+++ b/roles/openshift_health_checker/openshift_checks/__init__.py
@@ -13,8 +13,30 @@ from ansible.module_utils.six.moves import reduce # pylint: disable=import-erro
class OpenShiftCheckException(Exception):
- """Raised when a check cannot proceed."""
- pass
+ """Raised when a check encounters a failure condition."""
+
+ def __init__(self, name, msg=None):
+ # msg is for the message the user will see when this is raised.
+ # name is for test code to identify the error without looking at msg text.
+ if msg is None: # for parameter backward compatibility
+ msg = name
+ name = self.__class__.__name__
+ self.name = name
+ super(OpenShiftCheckException, self).__init__(msg)
+
+
+class OpenShiftCheckExceptionList(OpenShiftCheckException):
+ """A container for multiple logging errors that may be detected in one check."""
+ def __init__(self, errors):
+ self.errors = errors
+ super(OpenShiftCheckExceptionList, self).__init__(
+ 'OpenShiftCheckExceptionList',
+ '\n'.join(str(msg) for msg in errors)
+ )
+
+ # make iterable
+ def __getitem__(self, index):
+ return self.errors[index]
@six.add_metaclass(ABCMeta)
@@ -35,6 +57,9 @@ class OpenShiftCheck(object):
self.task_vars = task_vars or {}
self.tmp = tmp
+ # set to True when the check changes the host, for accurate total "changed" count
+ self.changed = False
+
@abstractproperty
def name(self):
"""The name of this check, usually derived from the class name."""
diff --git a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
index 77180223e..85a922f86 100644
--- a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
@@ -41,11 +41,10 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
return super(DockerImageAvailability, self).is_active() and has_valid_deployment_type
def run(self):
- msg, failed, changed = self.ensure_dependencies()
+ msg, failed = self.ensure_dependencies()
if failed:
return {
"failed": True,
- "changed": changed,
"msg": "Some dependencies are required in order to check Docker image availability.\n" + msg
}
@@ -54,11 +53,11 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
# exit early if all images were found locally
if not missing_images:
- return {"changed": changed}
+ return {}
registries = self.known_docker_registries()
if not registries:
- return {"failed": True, "msg": "Unable to retrieve any docker registries.", "changed": changed}
+ return {"failed": True, "msg": "Unable to retrieve any docker registries."}
available_images = self.available_images(missing_images, registries)
unavailable_images = set(missing_images) - set(available_images)
@@ -70,10 +69,9 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
"One or more required Docker images are not available:\n {}\n"
"Configured registries: {}"
).format(",\n ".join(sorted(unavailable_images)), ", ".join(registries)),
- "changed": changed,
}
- return {"changed": changed}
+ return {}
def required_images(self):
"""
diff --git a/roles/openshift_health_checker/openshift_checks/docker_storage.py b/roles/openshift_health_checker/openshift_checks/docker_storage.py
index dea15a56e..7ae384bd7 100644
--- a/roles/openshift_health_checker/openshift_checks/docker_storage.py
+++ b/roles/openshift_health_checker/openshift_checks/docker_storage.py
@@ -43,21 +43,20 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
]
def run(self):
- msg, failed, changed = self.ensure_dependencies()
+ msg, failed = self.ensure_dependencies()
if failed:
return {
"failed": True,
- "changed": changed,
"msg": "Some dependencies are required in order to query docker storage on host:\n" + msg
}
# attempt to get the docker info hash from the API
docker_info = self.execute_module("docker_info", {})
if docker_info.get("failed"):
- return {"failed": True, "changed": changed,
+ return {"failed": True,
"msg": "Failed to query Docker API. Is docker running on this host?"}
if not docker_info.get("info"): # this would be very strange
- return {"failed": True, "changed": changed,
+ return {"failed": True,
"msg": "Docker API query missing info:\n{}".format(json.dumps(docker_info))}
docker_info = docker_info["info"]
@@ -68,7 +67,7 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
"Detected unsupported Docker storage driver '{driver}'.\n"
"Supported storage drivers are: {drivers}"
).format(driver=driver, drivers=', '.join(self.storage_drivers))
- return {"failed": True, "changed": changed, "msg": msg}
+ return {"failed": True, "msg": msg}
# driver status info is a list of tuples; convert to dict and validate based on driver
driver_status = {item[0]: item[1] for item in docker_info.get("DriverStatus", [])}
@@ -81,7 +80,6 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
if driver in ['overlay', 'overlay2']:
result = self.check_overlay_support(docker_info, driver_status)
- result['changed'] = result.get('changed', False) or changed
return result
def check_devicemapper_support(self, driver_status):
diff --git a/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py b/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py
index 28c38504d..ae8460b7e 100644
--- a/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py
+++ b/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py
@@ -56,7 +56,7 @@ class EtcdImageDataSize(OpenShiftCheck):
reason = etcdkeysize["module_stderr"]
msg = msg.format(host=etcd_host, reason=reason)
- return {"failed": True, "changed": False, "msg": msg}
+ return {"failed": True, "msg": msg}
if etcdkeysize["size_limit_exceeded"]:
limit = self._to_gigabytes(etcd_imagedata_size_limit)
@@ -65,7 +65,7 @@ class EtcdImageDataSize(OpenShiftCheck):
"Use the `oadm prune images` command to cleanup unused Docker images.")
return {"failed": True, "msg": msg.format(host=etcd_host, limit=limit)}
- return {"changed": False}
+ return {}
@staticmethod
def _get_etcd_mountpath(ansible_mounts):
diff --git a/roles/openshift_health_checker/openshift_checks/etcd_volume.py b/roles/openshift_health_checker/openshift_checks/etcd_volume.py
index da7d0364a..e55d55e91 100644
--- a/roles/openshift_health_checker/openshift_checks/etcd_volume.py
+++ b/roles/openshift_health_checker/openshift_checks/etcd_volume.py
@@ -40,7 +40,7 @@ class EtcdVolume(OpenShiftCheck):
)
return {"failed": True, "msg": msg}
- return {"changed": False}
+ return {}
def _etcd_mount_info(self):
ansible_mounts = self.get_var("ansible_mounts")
diff --git a/roles/openshift_health_checker/openshift_checks/logging/curator.py b/roles/openshift_health_checker/openshift_checks/logging/curator.py
index 32d853d57..b27f97172 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/curator.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/curator.py
@@ -1,6 +1,6 @@
"""Check for an aggregated logging Curator deployment"""
-from openshift_checks.logging.logging import LoggingCheck
+from openshift_checks.logging.logging import OpenShiftCheckException, LoggingCheck
class Curator(LoggingCheck):
@@ -12,27 +12,17 @@ class Curator(LoggingCheck):
def run(self):
"""Check various things and gather errors. Returns: result as hash"""
- self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging")
- curator_pods, error = self.get_pods_for_component(
- self.logging_namespace,
- "curator",
- )
- if error:
- return {"failed": True, "changed": False, "msg": error}
- check_error = self.check_curator(curator_pods)
-
- if check_error:
- msg = ("The following Curator deployment issue was found:"
- "{}".format(check_error))
- return {"failed": True, "changed": False, "msg": msg}
-
+ curator_pods = self.get_pods_for_component("curator")
+ self.check_curator(curator_pods)
# TODO(lmeyer): run it all again for the ops cluster
- return {"failed": False, "changed": False, "msg": 'No problems found with Curator deployment.'}
+
+ return {}
def check_curator(self, pods):
"""Check to see if curator is up and working. Returns: error string"""
if not pods:
- return (
+ raise OpenShiftCheckException(
+ "MissingComponentPods",
"There are no Curator pods for the logging stack,\n"
"so nothing will prune Elasticsearch indexes.\n"
"Is Curator correctly deployed?"
@@ -40,14 +30,14 @@ class Curator(LoggingCheck):
not_running = self.not_running_pods(pods)
if len(not_running) == len(pods):
- return (
+ raise OpenShiftCheckException(
+ "CuratorNotRunning",
"The Curator pod is not currently in a running state,\n"
"so Elasticsearch indexes may increase without bound."
)
if len(pods) - len(not_running) > 1:
- return (
+ raise OpenShiftCheckException(
+ "TooManyCurators",
"There is more than one Curator pod running. This should not normally happen.\n"
"Although this doesn't cause any problems, you may want to investigate."
)
-
- return None
diff --git a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
index 8bdda1f32..7fc843fd7 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
@@ -3,6 +3,7 @@
import json
import re
+from openshift_checks import OpenShiftCheckException, OpenShiftCheckExceptionList
from openshift_checks.logging.logging import LoggingCheck
@@ -15,168 +16,178 @@ class Elasticsearch(LoggingCheck):
def run(self):
"""Check various things and gather errors. Returns: result as hash"""
- self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging")
- es_pods, error = self.get_pods_for_component(
- self.logging_namespace,
- "es",
- )
- if error:
- return {"failed": True, "changed": False, "msg": error}
- check_error = self.check_elasticsearch(es_pods)
-
- if check_error:
- msg = ("The following Elasticsearch deployment issue was found:"
- "{}".format(check_error))
- return {"failed": True, "changed": False, "msg": msg}
-
+ es_pods = self.get_pods_for_component("es")
+ self.check_elasticsearch(es_pods)
# TODO(lmeyer): run it all again for the ops cluster
- return {"failed": False, "changed": False, "msg": 'No problems found with Elasticsearch deployment.'}
- def _not_running_elasticsearch_pods(self, es_pods):
- """Returns: list of pods that are not running, list of errors about non-running pods"""
- not_running = self.not_running_pods(es_pods)
- if not_running:
- return not_running, [(
- 'The following Elasticsearch pods are not running:\n'
- '{pods}'
- 'These pods will not aggregate logs from their nodes.'
- ).format(pods=''.join(
- " {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None'))
- for pod in not_running
- ))]
- return not_running, []
+ return {}
def check_elasticsearch(self, es_pods):
- """Various checks for elasticsearch. Returns: error string"""
- not_running_pods, error_msgs = self._not_running_elasticsearch_pods(es_pods)
- running_pods = [pod for pod in es_pods if pod not in not_running_pods]
+ """Perform checks for Elasticsearch. Raises OpenShiftCheckExceptionList on any errors."""
+ running_pods, errors = self.running_elasticsearch_pods(es_pods)
pods_by_name = {
pod['metadata']['name']: pod for pod in running_pods
# Filter out pods that are not members of a DC
if pod['metadata'].get('labels', {}).get('deploymentconfig')
}
if not pods_by_name:
- return 'No logging Elasticsearch pods were found. Is logging deployed?'
- error_msgs += self._check_elasticsearch_masters(pods_by_name)
- error_msgs += self._check_elasticsearch_node_list(pods_by_name)
- error_msgs += self._check_es_cluster_health(pods_by_name)
- error_msgs += self._check_elasticsearch_diskspace(pods_by_name)
- return '\n'.join(error_msgs)
+ # nothing running, cannot run the rest of the check
+ errors.append(OpenShiftCheckException(
+ 'NoRunningPods',
+ 'No logging Elasticsearch pods were found running, so no logs are being aggregated.'
+ ))
+ raise OpenShiftCheckExceptionList(errors)
+
+ errors += self.check_elasticsearch_masters(pods_by_name)
+ errors += self.check_elasticsearch_node_list(pods_by_name)
+ errors += self.check_es_cluster_health(pods_by_name)
+ errors += self.check_elasticsearch_diskspace(pods_by_name)
+ if errors:
+ raise OpenShiftCheckExceptionList(errors)
+
+ def running_elasticsearch_pods(self, es_pods):
+ """Returns: list of running pods, list of errors about non-running pods"""
+ not_running = self.not_running_pods(es_pods)
+ running_pods = [pod for pod in es_pods if pod not in not_running]
+ if not_running:
+ return running_pods, [OpenShiftCheckException(
+ 'PodNotRunning',
+ 'The following Elasticsearch pods are defined but not running:\n'
+ '{pods}'.format(pods=''.join(
+ " {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None'))
+ for pod in not_running
+ ))
+ )]
+ return running_pods, []
@staticmethod
def _build_es_curl_cmd(pod_name, url):
base = "exec {name} -- curl -s --cert {base}cert --key {base}key --cacert {base}ca -XGET '{url}'"
return base.format(base="/etc/elasticsearch/secret/admin-", name=pod_name, url=url)
- def _check_elasticsearch_masters(self, pods_by_name):
- """Check that Elasticsearch masters are sane. Returns: list of error strings"""
+ def check_elasticsearch_masters(self, pods_by_name):
+ """Check that Elasticsearch masters are sane. Returns: list of errors"""
es_master_names = set()
- error_msgs = []
+ errors = []
for pod_name in pods_by_name.keys():
# Compare what each ES node reports as master and compare for split brain
get_master_cmd = self._build_es_curl_cmd(pod_name, "https://localhost:9200/_cat/master")
- master_name_str = self.exec_oc(self.logging_namespace, get_master_cmd, [])
+ master_name_str = self.exec_oc(get_master_cmd, [])
master_names = (master_name_str or '').split(' ')
if len(master_names) > 1:
es_master_names.add(master_names[1])
else:
- error_msgs.append(
- 'No master? Elasticsearch {pod} returned bad string when asked master name:\n'
+ errors.append(OpenShiftCheckException(
+ 'NoMasterName',
+ 'Elasticsearch {pod} gave unexpected response when asked master name:\n'
' {response}'.format(pod=pod_name, response=master_name_str)
- )
+ ))
if not es_master_names:
- error_msgs.append('No logging Elasticsearch masters were found. Is logging deployed?')
- return '\n'.join(error_msgs)
+ errors.append(OpenShiftCheckException(
+ 'NoMasterFound',
+ 'No logging Elasticsearch masters were found.'
+ ))
+ return errors
if len(es_master_names) > 1:
- error_msgs.append(
+ errors.append(OpenShiftCheckException(
+ 'SplitBrainMasters',
'Found multiple Elasticsearch masters according to the pods:\n'
'{master_list}\n'
'This implies that the masters have "split brain" and are not correctly\n'
'replicating data for the logging cluster. Log loss is likely to occur.'
.format(master_list='\n'.join(' ' + master for master in es_master_names))
- )
+ ))
- return error_msgs
+ return errors
- def _check_elasticsearch_node_list(self, pods_by_name):
- """Check that reported ES masters are accounted for by pods. Returns: list of error strings"""
+ def check_elasticsearch_node_list(self, pods_by_name):
+ """Check that reported ES masters are accounted for by pods. Returns: list of errors"""
if not pods_by_name:
- return ['No logging Elasticsearch masters were found. Is logging deployed?']
+ return [OpenShiftCheckException(
+ 'MissingComponentPods',
+ 'No logging Elasticsearch pods were found.'
+ )]
# get ES cluster nodes
node_cmd = self._build_es_curl_cmd(list(pods_by_name.keys())[0], 'https://localhost:9200/_nodes')
- cluster_node_data = self.exec_oc(self.logging_namespace, node_cmd, [])
+ cluster_node_data = self.exec_oc(node_cmd, [])
try:
cluster_nodes = json.loads(cluster_node_data)['nodes']
except (ValueError, KeyError):
- return [
+ return [OpenShiftCheckException(
+ 'MissingNodeList',
'Failed to query Elasticsearch for the list of ES nodes. The output was:\n' +
cluster_node_data
- ]
+ )]
# Try to match all ES-reported node hosts to known pods.
- error_msgs = []
+ errors = []
for node in cluster_nodes.values():
# Note that with 1.4/3.4 the pod IP may be used as the master name
if not any(node['host'] in (pod_name, pod['status'].get('podIP'))
for pod_name, pod in pods_by_name.items()):
- error_msgs.append(
+ errors.append(OpenShiftCheckException(
+ 'EsPodNodeMismatch',
'The Elasticsearch cluster reports a member node "{node}"\n'
'that does not correspond to any known ES pod.'.format(node=node['host'])
- )
+ ))
- return error_msgs
+ return errors
- def _check_es_cluster_health(self, pods_by_name):
+ def check_es_cluster_health(self, pods_by_name):
"""Exec into the elasticsearch pods and check the cluster health. Returns: list of errors"""
- error_msgs = []
+ errors = []
for pod_name in pods_by_name.keys():
cluster_health_cmd = self._build_es_curl_cmd(pod_name, 'https://localhost:9200/_cluster/health?pretty=true')
- cluster_health_data = self.exec_oc(self.logging_namespace, cluster_health_cmd, [])
+ cluster_health_data = self.exec_oc(cluster_health_cmd, [])
try:
health_res = json.loads(cluster_health_data)
if not health_res or not health_res.get('status'):
raise ValueError()
except ValueError:
- error_msgs.append(
+ errors.append(OpenShiftCheckException(
+ 'BadEsResponse',
'Could not retrieve cluster health status from logging ES pod "{pod}".\n'
'Response was:\n{output}'.format(pod=pod_name, output=cluster_health_data)
- )
+ ))
continue
if health_res['status'] not in ['green', 'yellow']:
- error_msgs.append(
+ errors.append(OpenShiftCheckException(
+ 'EsClusterHealthRed',
'Elasticsearch cluster health status is RED according to pod "{}"'.format(pod_name)
- )
+ ))
- return error_msgs
+ return errors
- def _check_elasticsearch_diskspace(self, pods_by_name):
+ def check_elasticsearch_diskspace(self, pods_by_name):
"""
Exec into an ES pod and query the diskspace on the persistent volume.
Returns: list of errors
"""
- error_msgs = []
+ errors = []
for pod_name in pods_by_name.keys():
df_cmd = 'exec {} -- df --output=ipcent,pcent /elasticsearch/persistent'.format(pod_name)
- disk_output = self.exec_oc(self.logging_namespace, df_cmd, [])
+ disk_output = self.exec_oc(df_cmd, [])
lines = disk_output.splitlines()
# expecting one header looking like 'IUse% Use%' and one body line
body_re = r'\s*(\d+)%?\s+(\d+)%?\s*$'
if len(lines) != 2 or len(lines[0].split()) != 2 or not re.match(body_re, lines[1]):
- error_msgs.append(
+ errors.append(OpenShiftCheckException(
+ 'BadDfResponse',
'Could not retrieve storage usage from logging ES pod "{pod}".\n'
'Response to `df` command was:\n{output}'.format(pod=pod_name, output=disk_output)
- )
+ ))
continue
inode_pct, disk_pct = re.match(body_re, lines[1]).groups()
inode_pct_thresh = self.get_var('openshift_check_efk_es_inode_pct', default='90')
if int(inode_pct) >= int(inode_pct_thresh):
- error_msgs.append(
+ errors.append(OpenShiftCheckException(
+ 'InodeUsageTooHigh',
'Inode percent usage on the storage volume for logging ES pod "{pod}"\n'
' is {pct}, greater than threshold {limit}.\n'
' Note: threshold can be specified in inventory with {param}'.format(
@@ -184,10 +195,11 @@ class Elasticsearch(LoggingCheck):
pct=str(inode_pct),
limit=str(inode_pct_thresh),
param='openshift_check_efk_es_inode_pct',
- ))
+ )))
disk_pct_thresh = self.get_var('openshift_check_efk_es_storage_pct', default='80')
if int(disk_pct) >= int(disk_pct_thresh):
- error_msgs.append(
+ errors.append(OpenShiftCheckException(
+ 'DiskUsageTooHigh',
'Disk percent usage on the storage volume for logging ES pod "{pod}"\n'
' is {pct}, greater than threshold {limit}.\n'
' Note: threshold can be specified in inventory with {param}'.format(
@@ -195,6 +207,6 @@ class Elasticsearch(LoggingCheck):
pct=str(disk_pct),
limit=str(disk_pct_thresh),
param='openshift_check_efk_es_storage_pct',
- ))
+ )))
- return error_msgs
+ return errors
diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
index b3485bf44..3b192a281 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
@@ -2,6 +2,8 @@
import json
+
+from openshift_checks import OpenShiftCheckException, OpenShiftCheckExceptionList
from openshift_checks.logging.logging import LoggingCheck
@@ -12,57 +14,96 @@ class Fluentd(LoggingCheck):
tags = ["health", "logging"]
def run(self):
- """Check various things and gather errors. Returns: result as hash"""
+ """Check the Fluentd deployment and raise an error if any problems are found."""
+
+ fluentd_pods = self.get_pods_for_component("fluentd")
+ self.check_fluentd(fluentd_pods)
+ return {}
+
+ def check_fluentd(self, pods):
+ """Verify fluentd is running everywhere. Raises OpenShiftCheckExceptionList if error(s) found."""
- self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging")
- fluentd_pods, error = super(Fluentd, self).get_pods_for_component(
- self.logging_namespace,
- "fluentd",
+ node_selector = self.get_var(
+ 'openshift_logging_fluentd_nodeselector',
+ default='logging-infra-fluentd=true'
)
- if error:
- return {"failed": True, "changed": False, "msg": error}
- check_error = self.check_fluentd(fluentd_pods)
- if check_error:
- msg = ("The following Fluentd deployment issue was found:"
- "{}".format(check_error))
- return {"failed": True, "changed": False, "msg": msg}
+ nodes_by_name = self.get_nodes_by_name()
+ fluentd_nodes = self.filter_fluentd_labeled_nodes(nodes_by_name, node_selector)
- # TODO(lmeyer): run it all again for the ops cluster
- return {"failed": False, "changed": False, "msg": 'No problems found with Fluentd deployment.'}
+ errors = []
+ errors += self.check_node_labeling(nodes_by_name, fluentd_nodes, node_selector)
+ errors += self.check_nodes_have_fluentd(pods, fluentd_nodes)
+ errors += self.check_fluentd_pods_running(pods)
+
+ # Make sure there are no extra fluentd pods
+ if len(pods) > len(fluentd_nodes):
+ errors.append(OpenShiftCheckException(
+ 'TooManyFluentdPods',
+ 'There are more Fluentd pods running than nodes labeled.\n'
+ 'This may not cause problems with logging but it likely indicates something wrong.'
+ ))
+
+ if errors:
+ raise OpenShiftCheckExceptionList(errors)
+
+ def get_nodes_by_name(self):
+ """Retrieve all the node definitions. Returns: dict(name: node)"""
+ nodes_json = self.exec_oc("get nodes -o json", [])
+ try:
+ nodes = json.loads(nodes_json)
+ except ValueError: # no valid json - should not happen
+ raise OpenShiftCheckException(
+ "BadOcNodeList",
+ "Could not obtain a list of nodes to validate fluentd.\n"
+ "Output from oc get:\n" + nodes_json
+ )
+ if not nodes or not nodes.get('items'): # also should not happen
+ raise OpenShiftCheckException(
+ "NoNodesDefined",
+ "No nodes appear to be defined according to the API."
+ )
+ return {
+ node['metadata']['name']: node
+ for node in nodes['items']
+ }
@staticmethod
- def _filter_fluentd_labeled_nodes(nodes_by_name, node_selector):
- """Filter to all nodes with fluentd label. Returns dict(name: node), error string"""
+ def filter_fluentd_labeled_nodes(nodes_by_name, node_selector):
+ """Filter to all nodes with fluentd label. Returns dict(name: node)"""
label, value = node_selector.split('=', 1)
fluentd_nodes = {
name: node for name, node in nodes_by_name.items()
if node['metadata']['labels'].get(label) == value
}
if not fluentd_nodes:
- return None, (
+ raise OpenShiftCheckException(
+ 'NoNodesLabeled',
'There are no nodes with the fluentd label {label}.\n'
- 'This means no logs will be aggregated from the nodes.'
- ).format(label=node_selector)
- return fluentd_nodes, None
+ 'This means no logs will be aggregated from the nodes.'.format(label=node_selector)
+ )
+ return fluentd_nodes
- def _check_node_labeling(self, nodes_by_name, fluentd_nodes, node_selector):
- """Note if nodes are not labeled as expected. Returns: error string"""
+ def check_node_labeling(self, nodes_by_name, fluentd_nodes, node_selector):
+ """Note if nodes are not labeled as expected. Returns: error list"""
intended_nodes = self.get_var('openshift_logging_fluentd_hosts', default=['--all'])
if not intended_nodes or '--all' in intended_nodes:
intended_nodes = nodes_by_name.keys()
nodes_missing_labels = set(intended_nodes) - set(fluentd_nodes.keys())
if nodes_missing_labels:
- return (
+ return [OpenShiftCheckException(
+ 'NodesUnlabeled',
'The following nodes are supposed to be labeled with {label} but are not:\n'
' {nodes}\n'
- 'Fluentd will not aggregate logs from these nodes.'
- ).format(label=node_selector, nodes=', '.join(nodes_missing_labels))
- return None
+ 'Fluentd will not aggregate logs from these nodes.'.format(
+ label=node_selector, nodes=', '.join(nodes_missing_labels)
+ ))]
+
+ return []
@staticmethod
- def _check_nodes_have_fluentd(pods, fluentd_nodes):
- """Make sure fluentd is on all the labeled nodes. Returns: error string"""
+ def check_nodes_have_fluentd(pods, fluentd_nodes):
+ """Make sure fluentd is on all the labeled nodes. Returns: error list"""
unmatched_nodes = fluentd_nodes.copy()
node_names_by_label = {
node['metadata']['labels']['kubernetes.io/hostname']: name
@@ -82,80 +123,32 @@ class Fluentd(LoggingCheck):
]:
unmatched_nodes.pop(name, None)
if unmatched_nodes:
- return (
+ return [OpenShiftCheckException(
+ 'MissingFluentdPod',
'The following nodes are supposed to have a Fluentd pod but do not:\n'
- '{nodes}'
- 'These nodes will not have their logs aggregated.'
- ).format(nodes=''.join(
- " {}\n".format(name)
- for name in unmatched_nodes.keys()
- ))
- return None
+ ' {nodes}\n'
+ 'These nodes will not have their logs aggregated.'.format(
+ nodes='\n '.join(unmatched_nodes.keys())
+ ))]
+
+ return []
- def _check_fluentd_pods_running(self, pods):
+ def check_fluentd_pods_running(self, pods):
"""Make sure all fluentd pods are running. Returns: error string"""
not_running = super(Fluentd, self).not_running_pods(pods)
if not_running:
- return (
+ return [OpenShiftCheckException(
+ 'FluentdNotRunning',
'The following Fluentd pods are supposed to be running but are not:\n'
- '{pods}'
- 'These pods will not aggregate logs from their nodes.'
- ).format(pods=''.join(
- " {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None'))
- for pod in not_running
- ))
- return None
-
- def check_fluentd(self, pods):
- """Verify fluentd is running everywhere. Returns: error string"""
-
- node_selector = self.get_var(
- 'openshift_logging_fluentd_nodeselector',
- default='logging-infra-fluentd=true'
- )
-
- nodes_by_name, error = self.get_nodes_by_name()
-
- if error:
- return error
- fluentd_nodes, error = self._filter_fluentd_labeled_nodes(nodes_by_name, node_selector)
- if error:
- return error
-
- error_msgs = []
- error = self._check_node_labeling(nodes_by_name, fluentd_nodes, node_selector)
- if error:
- error_msgs.append(error)
- error = self._check_nodes_have_fluentd(pods, fluentd_nodes)
- if error:
- error_msgs.append(error)
- error = self._check_fluentd_pods_running(pods)
- if error:
- error_msgs.append(error)
-
- # Make sure there are no extra fluentd pods
- if len(pods) > len(fluentd_nodes):
- error_msgs.append(
- 'There are more Fluentd pods running than nodes labeled.\n'
- 'This may not cause problems with logging but it likely indicates something wrong.'
- )
-
- return '\n'.join(error_msgs)
-
- def get_nodes_by_name(self):
- """Retrieve all the node definitions. Returns: dict(name: node), error string"""
- nodes_json = self.exec_oc(
- self.logging_namespace,
- "get nodes -o json",
- []
- )
- try:
- nodes = json.loads(nodes_json)
- except ValueError: # no valid json - should not happen
- return None, "Could not obtain a list of nodes to validate fluentd. Output from oc get:\n" + nodes_json
- if not nodes or not nodes.get('items'): # also should not happen
- return None, "No nodes appear to be defined according to the API."
- return {
- node['metadata']['name']: node
- for node in nodes['items']
- }, None
+ ' {pods}\n'
+ 'These pods will not aggregate logs from their nodes.'.format(
+ pods='\n'.join(
+ " {name} ({host})".format(
+ name=pod['metadata']['name'],
+ host=pod['spec'].get('host', 'None')
+ )
+ for pod in not_running
+ )
+ ))]
+
+ return []
diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py
index 0970f0a63..d783e6760 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py
@@ -24,7 +24,6 @@ class FluentdConfig(LoggingCheck):
def run(self):
"""Check that Fluentd has running pods, and that its logging config matches Docker's logging config."""
- self.logging_namespace = self.get_var("openshift_logging_namespace", default=self.logging_namespace)
config_error = self.check_logging_config()
if config_error:
msg = ("The following Fluentd logging configuration problem was found:"
@@ -120,19 +119,13 @@ class FluentdConfig(LoggingCheck):
def running_fluentd_pods(self):
"""Return a list of running fluentd pods."""
- fluentd_pods, error = self.get_pods_for_component(
- self.logging_namespace,
- "fluentd",
- )
- if error:
- msg = 'Unable to retrieve any pods for the "fluentd" logging component: {}'.format(error)
- raise OpenShiftCheckException(msg)
+ fluentd_pods = self.get_pods_for_component("fluentd")
running_fluentd_pods = [pod for pod in fluentd_pods if pod['status']['phase'] == 'Running']
if not running_fluentd_pods:
- msg = ('No Fluentd pods were found to be in the "Running" state. '
- 'At least one Fluentd pod is required in order to perform this check.')
-
- raise OpenShiftCheckException(msg)
+ raise OpenShiftCheckException(
+ 'No Fluentd pods were found to be in the "Running" state. '
+ 'At least one Fluentd pod is required in order to perform this check.'
+ )
return running_fluentd_pods
diff --git a/roles/openshift_health_checker/openshift_checks/logging/kibana.py b/roles/openshift_health_checker/openshift_checks/logging/kibana.py
index efb14ab42..3b1cf8baa 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/kibana.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/kibana.py
@@ -12,7 +12,7 @@ except ImportError:
from urllib.error import HTTPError, URLError
import urllib.request as urllib2
-from openshift_checks.logging.logging import LoggingCheck
+from openshift_checks.logging.logging import LoggingCheck, OpenShiftCheckException
class Kibana(LoggingCheck):
@@ -24,25 +24,12 @@ class Kibana(LoggingCheck):
def run(self):
"""Check various things and gather errors. Returns: result as hash"""
- self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging")
- kibana_pods, error = self.get_pods_for_component(
- self.logging_namespace,
- "kibana",
- )
- if error:
- return {"failed": True, "changed": False, "msg": error}
- check_error = self.check_kibana(kibana_pods)
-
- if not check_error:
- check_error = self._check_kibana_route()
-
- if check_error:
- msg = ("The following Kibana deployment issue was found:"
- "{}".format(check_error))
- return {"failed": True, "changed": False, "msg": msg}
-
+ kibana_pods = self.get_pods_for_component("kibana")
+ self.check_kibana(kibana_pods)
+ self.check_kibana_route()
# TODO(lmeyer): run it all again for the ops cluster
- return {"failed": False, "changed": False, "msg": 'No problems found with Kibana deployment.'}
+
+ return {}
def _verify_url_internal(self, url):
"""
@@ -65,7 +52,7 @@ class Kibana(LoggingCheck):
def _verify_url_external(url):
"""
Try to reach a URL from ansible control host.
- Returns: success (bool), reason (for failure)
+ Raise an OpenShiftCheckException if anything goes wrong.
"""
# This actually checks from the ansible control host, which may or may not
# really be "external" to the cluster.
@@ -91,130 +78,149 @@ class Kibana(LoggingCheck):
return None
def check_kibana(self, pods):
- """Check to see if Kibana is up and working. Returns: error string."""
+ """Check to see if Kibana is up and working. Raises OpenShiftCheckException if not."""
if not pods:
- return "There are no Kibana pods deployed, so no access to the logging UI."
+ raise OpenShiftCheckException(
+ "MissingComponentPods",
+ "There are no Kibana pods deployed, so no access to the logging UI."
+ )
not_running = self.not_running_pods(pods)
if len(not_running) == len(pods):
- return "No Kibana pod is in a running state, so there is no access to the logging UI."
+ raise OpenShiftCheckException(
+ "NoRunningPods",
+ "No Kibana pod is in a running state, so there is no access to the logging UI."
+ )
elif not_running:
- return (
+ raise OpenShiftCheckException(
+ "PodNotRunning",
"The following Kibana pods are not currently in a running state:\n"
- "{pods}"
- "However at least one is, so service may not be impacted."
- ).format(pods="".join(" " + pod['metadata']['name'] + "\n" for pod in not_running))
-
- return None
+ " {pods}\n"
+ "However at least one is, so service may not be impacted.".format(
+ pods="\n ".join(pod['metadata']['name'] for pod in not_running)
+ )
+ )
def _get_kibana_url(self):
"""
Get kibana route or report error.
- Returns: url (or empty), reason for failure
+ Returns: url
"""
# Get logging url
- get_route = self.exec_oc(
- self.logging_namespace,
- "get route logging-kibana -o json",
- [],
- )
+ get_route = self.exec_oc("get route logging-kibana -o json", [])
if not get_route:
- return None, 'no_route_exists'
+ raise OpenShiftCheckException(
+ 'no_route_exists',
+ 'No route is defined for Kibana in the logging namespace,\n'
+ 'so the logging stack is not accessible. Is logging deployed?\n'
+ 'Did something remove the logging-kibana route?'
+ )
- route = json.loads(get_route)
+ try:
+ route = json.loads(get_route)
+ # check that the route has been accepted by a router
+ ingress = route["status"]["ingress"]
+ except (ValueError, KeyError):
+ raise OpenShiftCheckException(
+ 'get_route_failed',
+ '"oc get route" returned an unexpected response:\n' + get_route
+ )
- # check that the route has been accepted by a router
- ingress = route["status"]["ingress"]
# ingress can be null if there is no router, or empty if not routed
if not ingress or not ingress[0]:
- return None, 'route_not_accepted'
+ raise OpenShiftCheckException(
+ 'route_not_accepted',
+ 'The logging-kibana route is not being routed by any router.\n'
+ 'Is the router deployed and working?'
+ )
host = route.get("spec", {}).get("host")
if not host:
- return None, 'route_missing_host'
+ raise OpenShiftCheckException(
+ 'route_missing_host',
+ 'The logging-kibana route has no hostname defined,\n'
+ 'which should never happen. Did something alter its definition?'
+ )
- return 'https://{}/'.format(host), None
+ return 'https://{}/'.format(host)
- def _check_kibana_route(self):
+ def check_kibana_route(self):
"""
Check to see if kibana route is up and working.
- Returns: error string
+ Raises exception if not.
"""
- known_errors = dict(
- no_route_exists=(
- 'No route is defined for Kibana in the logging namespace,\n'
- 'so the logging stack is not accessible. Is logging deployed?\n'
- 'Did something remove the logging-kibana route?'
- ),
- route_not_accepted=(
- 'The logging-kibana route is not being routed by any router.\n'
- 'Is the router deployed and working?'
- ),
- route_missing_host=(
- 'The logging-kibana route has no hostname defined,\n'
- 'which should never happen. Did something alter its definition?'
- ),
- )
- kibana_url, error = self._get_kibana_url()
- if not kibana_url:
- return known_errors.get(error, error)
+ kibana_url = self._get_kibana_url()
# first, check that kibana is reachable from the master.
error = self._verify_url_internal(kibana_url)
if error:
if 'urlopen error [Errno 111] Connection refused' in error:
- error = (
+ raise OpenShiftCheckException(
+ 'FailedToConnectInternal',
'Failed to connect from this master to Kibana URL {url}\n'
- 'Is kibana running, and is at least one router routing to it?'
- ).format(url=kibana_url)
+ 'Is kibana running, and is at least one router routing to it?'.format(url=kibana_url)
+ )
elif 'urlopen error [Errno -2] Name or service not known' in error:
- error = (
+ raise OpenShiftCheckException(
+ 'FailedToResolveInternal',
'Failed to connect from this master to Kibana URL {url}\n'
'because the hostname does not resolve.\n'
- 'Is DNS configured for the Kibana hostname?'
- ).format(url=kibana_url)
+ 'Is DNS configured for the Kibana hostname?'.format(url=kibana_url)
+ )
elif 'Status code was not' in error:
- error = (
+ raise OpenShiftCheckException(
+ 'WrongReturnCodeInternal',
'A request from this master to the Kibana URL {url}\n'
'did not return the correct status code (302).\n'
'This could mean that Kibana is malfunctioning, the hostname is\n'
'resolving incorrectly, or other network issues. The output was:\n'
- ' {error}'
- ).format(url=kibana_url, error=error)
- return 'Error validating the logging Kibana route:\n' + error
+ ' {error}'.format(url=kibana_url, error=error)
+ )
+ raise OpenShiftCheckException(
+ 'MiscRouteErrorInternal',
+ 'Error validating the logging Kibana route internally:\n' + error
+ )
# in production we would like the kibana route to work from outside the
# cluster too; but that may not be the case, so allow disabling just this part.
- if not self.get_var("openshift_check_efk_kibana_external", default=True):
- return None
+ if self.get_var("openshift_check_efk_kibana_external", default="True").lower() != "true":
+ return
error = self._verify_url_external(kibana_url)
- if error:
- if 'urlopen error [Errno 111] Connection refused' in error:
- error = (
- 'Failed to connect from the Ansible control host to Kibana URL {url}\n'
- 'Is the router for the Kibana hostname exposed externally?'
- ).format(url=kibana_url)
- elif 'urlopen error [Errno -2] Name or service not known' in error:
- error = (
- 'Failed to resolve the Kibana hostname in {url}\n'
- 'from the Ansible control host.\n'
- 'Is DNS configured to resolve this Kibana hostname externally?'
- ).format(url=kibana_url)
- elif 'Expected success (200)' in error:
- error = (
- 'A request to Kibana at {url}\n'
- 'returned the wrong error code:\n'
- ' {error}\n'
- 'This could mean that Kibana is malfunctioning, the hostname is\n'
- 'resolving incorrectly, or other network issues.'
- ).format(url=kibana_url, error=error)
- error = (
- 'Error validating the logging Kibana route:\n{error}\n'
- 'To disable external Kibana route validation, set in your inventory:\n'
- ' openshift_check_efk_kibana_external=False'
- ).format(error=error)
- return error
- return None
+
+ if not error:
+ return
+
+ error_fmt = (
+ 'Error validating the logging Kibana route:\n{error}\n'
+ 'To disable external Kibana route validation, set the variable:\n'
+ ' openshift_check_efk_kibana_external=False'
+ )
+ if 'urlopen error [Errno 111] Connection refused' in error:
+ msg = (
+ 'Failed to connect from the Ansible control host to Kibana URL {url}\n'
+ 'Is the router for the Kibana hostname exposed externally?'
+ ).format(url=kibana_url)
+ raise OpenShiftCheckException('FailedToConnect', error_fmt.format(error=msg))
+ elif 'urlopen error [Errno -2] Name or service not known' in error:
+ msg = (
+ 'Failed to resolve the Kibana hostname in {url}\n'
+ 'from the Ansible control host.\n'
+ 'Is DNS configured to resolve this Kibana hostname externally?'
+ ).format(url=kibana_url)
+ raise OpenShiftCheckException('FailedToResolve', error_fmt.format(error=msg))
+ elif 'Expected success (200)' in error:
+ msg = (
+ 'A request to Kibana at {url}\n'
+ 'returned the wrong error code:\n'
+ ' {error}\n'
+ 'This could mean that Kibana is malfunctioning, the hostname is\n'
+ 'resolving incorrectly, or other network issues.'
+ ).format(url=kibana_url, error=error)
+ raise OpenShiftCheckException('WrongReturnCode', error_fmt.format(error=msg))
+ raise OpenShiftCheckException(
+ 'MiscRouteError',
+ 'Error validating the logging Kibana route externally:\n' + error
+ )
diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging.py b/roles/openshift_health_checker/openshift_checks/logging/logging.py
index 43ba6c406..3b7c39760 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/logging.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/logging.py
@@ -8,6 +8,16 @@ import os
from openshift_checks import OpenShiftCheck, OpenShiftCheckException
+class MissingComponentPods(OpenShiftCheckException):
+ """Raised when a component has no pods in the namespace."""
+ pass
+
+
+class CouldNotUseOc(OpenShiftCheckException):
+ """Raised when ocutil has a failure running oc."""
+ pass
+
+
class LoggingCheck(OpenShiftCheck):
"""Base class for OpenShift aggregated logging component checks"""
@@ -15,7 +25,6 @@ class LoggingCheck(OpenShiftCheck):
# run by itself.
name = "logging"
- logging_namespace = "logging"
def is_active(self):
logging_deployed = self.get_var("openshift_hosted_logging_deploy", default=False)
@@ -32,22 +41,24 @@ class LoggingCheck(OpenShiftCheck):
def run(self):
return {}
- def get_pods_for_component(self, namespace, logging_component):
- """Get all pods for a given component. Returns: list of pods for component, error string"""
+ def get_pods_for_component(self, logging_component):
+ """Get all pods for a given component. Returns: list of pods."""
pod_output = self.exec_oc(
- namespace,
"get pods -l component={} -o json".format(logging_component),
[],
)
try:
- pods = json.loads(pod_output)
- if not pods or not pods.get('items'):
+ pods = json.loads(pod_output) # raises ValueError if deserialize fails
+ if not pods or not pods.get('items'): # also a broken response, treat the same
raise ValueError()
except ValueError:
- # successful run but non-parsing data generally means there were no pods in the namespace
- return None, 'No pods were found for the "{}" logging component.'.format(logging_component)
+ # successful run but non-parsing data generally means there were no pods to be found
+ raise MissingComponentPods(
+ 'There are no "{}" component pods in the "{}" namespace.\n'
+ 'Is logging deployed?'.format(logging_component, self.logging_namespace())
+ )
- return pods['items'], None
+ return pods['items']
@staticmethod
def not_running_pods(pods):
@@ -63,15 +74,19 @@ class LoggingCheck(OpenShiftCheck):
)
]
- def exec_oc(self, namespace="logging", cmd_str="", extra_args=None):
+ def logging_namespace(self):
+ """Returns the namespace in which logging is configured to deploy."""
+ return self.get_var("openshift_logging_namespace", default="logging")
+
+ def exec_oc(self, cmd_str="", extra_args=None):
"""
Execute an 'oc' command in the remote host.
Returns: output of command and namespace,
- or raises OpenShiftCheckException on error
+ or raises CouldNotUseOc on error
"""
config_base = self.get_var("openshift", "common", "config_base")
args = {
- "namespace": namespace,
+ "namespace": self.logging_namespace(),
"config_file": os.path.join(config_base, "master", "admin.kubeconfig"),
"cmd": cmd_str,
"extra_args": list(extra_args) if extra_args else [],
@@ -79,17 +94,16 @@ class LoggingCheck(OpenShiftCheck):
result = self.execute_module("ocutil", args)
if result.get("failed"):
- msg = (
- 'Unexpected error using `oc` to validate the logging stack components.\n'
- 'Error executing `oc {cmd}`:\n'
- '{error}'
- ).format(cmd=args['cmd'], error=result['result'])
-
if result['result'] == '[Errno 2] No such file or directory':
- msg = (
+ raise CouldNotUseOc(
"This host is supposed to be a master but does not have the `oc` command where expected.\n"
"Has an installation been run on this host yet?"
)
- raise OpenShiftCheckException(msg)
+
+ raise CouldNotUseOc(
+ 'Unexpected error using `oc` to validate the logging stack components.\n'
+ 'Error executing `oc {cmd}`:\n'
+ '{error}'.format(cmd=args['cmd'], error=result['result'])
+ )
return result.get("result", "")
diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py
index b24e88e05..d781db649 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py
@@ -19,8 +19,6 @@ class LoggingIndexTime(LoggingCheck):
name = "logging_index_time"
tags = ["health", "logging"]
- logging_namespace = "logging"
-
def run(self):
"""Add log entry by making unique request to Kibana. Check for unique entry in the ElasticSearch pod logs."""
try:
@@ -28,29 +26,25 @@ class LoggingIndexTime(LoggingCheck):
self.get_var("openshift_check_logging_index_timeout_seconds", default=ES_CMD_TIMEOUT_SECONDS)
)
except ValueError:
- return {
- "failed": True,
- "msg": ('Invalid value provided for "openshift_check_logging_index_timeout_seconds". '
- 'Value must be an integer representing an amount in seconds.'),
- }
+ raise OpenShiftCheckException(
+ 'InvalidTimeout',
+ 'Invalid value provided for "openshift_check_logging_index_timeout_seconds". '
+ 'Value must be an integer representing an amount in seconds.'
+ )
running_component_pods = dict()
# get all component pods
- self.logging_namespace = self.get_var("openshift_logging_namespace", default=self.logging_namespace)
for component, name in (['kibana', 'Kibana'], ['es', 'Elasticsearch']):
- pods, error = self.get_pods_for_component(self.logging_namespace, component)
-
- if error:
- msg = 'Unable to retrieve pods for the {} logging component: {}'
- return {"failed": True, "changed": False, "msg": msg.format(name, error)}
-
+ pods = self.get_pods_for_component(component)
running_pods = self.running_pods(pods)
if not running_pods:
- msg = ('No {} pods in the "Running" state were found.'
- 'At least one pod is required in order to perform this check.')
- return {"failed": True, "changed": False, "msg": msg.format(name)}
+ raise OpenShiftCheckException(
+ component + 'NoRunningPods',
+ 'No {} pods in the "Running" state were found.'
+ 'At least one pod is required in order to perform this check.'.format(name)
+ )
running_component_pods[component] = running_pods
@@ -65,8 +59,11 @@ class LoggingIndexTime(LoggingCheck):
interval = 1
while not self.query_es_from_es(es_pod, uuid):
if time.time() + interval > deadline:
- msg = "expecting match in Elasticsearch for message with uuid {}, but no matches were found after {}s."
- raise OpenShiftCheckException(msg.format(uuid, timeout_secs))
+ raise OpenShiftCheckException(
+ "NoMatchFound",
+ "expecting match in Elasticsearch for message with uuid {}, "
+ "but no matches were found after {}s.".format(uuid, timeout_secs)
+ )
time.sleep(interval)
def curl_kibana_with_uuid(self, kibana_pod):
@@ -76,22 +73,23 @@ class LoggingIndexTime(LoggingCheck):
exec_cmd = "exec {pod_name} -c kibana -- curl --max-time 30 -s http://localhost:5601/{uuid}"
exec_cmd = exec_cmd.format(pod_name=pod_name, uuid=uuid)
- error_str = self.exec_oc(self.logging_namespace, exec_cmd, [])
+ error_str = self.exec_oc(exec_cmd, [])
try:
error_code = json.loads(error_str)["statusCode"]
- except KeyError:
- msg = ('invalid response returned from Kibana request (Missing "statusCode" key):\n'
- 'Command: {}\nResponse: {}').format(exec_cmd, error_str)
- raise OpenShiftCheckException(msg)
- except ValueError:
- msg = ('invalid response returned from Kibana request (Non-JSON output):\n'
- 'Command: {}\nResponse: {}').format(exec_cmd, error_str)
- raise OpenShiftCheckException(msg)
+ except (KeyError, ValueError):
+ raise OpenShiftCheckException(
+ 'kibanaInvalidResponse',
+ 'invalid response returned from Kibana request:\n'
+ 'Command: {}\nResponse: {}'.format(exec_cmd, error_str)
+ )
if error_code != 404:
- msg = 'invalid error code returned from Kibana request. Expecting error code "404", but got "{}" instead.'
- raise OpenShiftCheckException(msg.format(error_code))
+ raise OpenShiftCheckException(
+ 'kibanaInvalidReturnCode',
+ 'invalid error code returned from Kibana request.\n'
+ 'Expecting error code "404", but got "{}" instead.'.format(error_code)
+ )
return uuid
@@ -105,17 +103,18 @@ class LoggingIndexTime(LoggingCheck):
"--key /etc/elasticsearch/secret/admin-key "
"https://logging-es:9200/project.{namespace}*/_count?q=message:{uuid}"
)
- exec_cmd = exec_cmd.format(pod_name=pod_name, namespace=self.logging_namespace, uuid=uuid)
- result = self.exec_oc(self.logging_namespace, exec_cmd, [])
+ exec_cmd = exec_cmd.format(pod_name=pod_name, namespace=self.logging_namespace(), uuid=uuid)
+ result = self.exec_oc(exec_cmd, [])
try:
count = json.loads(result)["count"]
- except KeyError:
- msg = 'invalid response from Elasticsearch query:\n"{}"\nMissing "count" key:\n{}'
- raise OpenShiftCheckException(msg.format(exec_cmd, result))
- except ValueError:
- msg = 'invalid response from Elasticsearch query:\n"{}"\nNon-JSON output:\n{}'
- raise OpenShiftCheckException(msg.format(exec_cmd, result))
+ except (KeyError, ValueError):
+ raise OpenShiftCheckException(
+ 'esInvalidResponse',
+ 'Invalid response from Elasticsearch query:\n'
+ ' {}\n'
+ 'Response was:\n{}'.format(exec_cmd, result)
+ )
return count
diff --git a/roles/openshift_health_checker/openshift_checks/mixins.py b/roles/openshift_health_checker/openshift_checks/mixins.py
index 3b2c64e6a..e9bae60a3 100644
--- a/roles/openshift_health_checker/openshift_checks/mixins.py
+++ b/roles/openshift_health_checker/openshift_checks/mixins.py
@@ -29,10 +29,10 @@ class DockerHostMixin(object):
"""
Ensure that docker-related packages exist, but not on atomic hosts
(which would not be able to install but should already have them).
- Returns: msg, failed, changed
+ Returns: msg, failed
"""
if self.get_var("openshift", "common", "is_atomic"):
- return "", False, False
+ return "", False
# NOTE: we would use the "package" module but it's actually an action plugin
# and it's not clear how to invoke one of those. This is about the same anyway:
@@ -49,5 +49,5 @@ class DockerHostMixin(object):
" {deps}\n{msg}"
).format(deps=',\n '.join(self.dependencies), msg=msg)
failed = result.get("failed", False) or result.get("rc", 0) != 0
- changed = result.get("changed", False)
- return msg, failed, changed
+ self.changed = result.get("changed", False)
+ return msg, failed