Diffstat (limited to 'roles/openshift_health_checker/openshift_checks/logging/fluentd.py')
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/fluentd.py  191
1 file changed, 92 insertions(+), 99 deletions(-)
diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
index 69c7b4392..3b192a281 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
@@ -2,6 +2,8 @@
 import json
+
+from openshift_checks import OpenShiftCheckException, OpenShiftCheckExceptionList
 from openshift_checks.logging.logging import LoggingCheck
@@ -12,57 +14,96 @@ class Fluentd(LoggingCheck):
     tags = ["health", "logging"]
 
     def run(self):
-        """Check various things and gather errors. Returns: result as hash"""
+        """Check the Fluentd deployment and raise an error if any problems are found."""
+
+        fluentd_pods = self.get_pods_for_component("fluentd")
+        self.check_fluentd(fluentd_pods)
+        return {}
+
+    def check_fluentd(self, pods):
+        """Verify fluentd is running everywhere. Raises OpenShiftCheckExceptionList if error(s) found."""
-        self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging")
-        fluentd_pods, error = super(Fluentd, self).get_pods_for_component(
-            self.logging_namespace,
-            "fluentd",
+        node_selector = self.get_var(
+            'openshift_logging_fluentd_nodeselector',
+            default='logging-infra-fluentd=true'
         )
-        if error:
-            return {"failed": True, "msg": error}
-        check_error = self.check_fluentd(fluentd_pods)
-        if check_error:
-            msg = ("The following Fluentd deployment issue was found:"
-                   "{}".format(check_error))
-            return {"failed": True, "msg": msg}
+        nodes_by_name = self.get_nodes_by_name()
+        fluentd_nodes = self.filter_fluentd_labeled_nodes(nodes_by_name, node_selector)
-        # TODO(lmeyer): run it all again for the ops cluster
-        return {"failed": False, "msg": 'No problems found with Fluentd deployment.'}
+        errors = []
+        errors += self.check_node_labeling(nodes_by_name, fluentd_nodes, node_selector)
+        errors += self.check_nodes_have_fluentd(pods, fluentd_nodes)
+        errors += self.check_fluentd_pods_running(pods)
+
+        # Make sure there are no extra fluentd pods
+        if len(pods) > len(fluentd_nodes):
+            errors.append(OpenShiftCheckException(
+                'TooManyFluentdPods',
+                'There are more Fluentd pods running than nodes labeled.\n'
+                'This may not cause problems with logging but it likely indicates something wrong.'
+            ))
+
+        if errors:
+            raise OpenShiftCheckExceptionList(errors)
+
+    def get_nodes_by_name(self):
+        """Retrieve all the node definitions. Returns: dict(name: node)"""
+        nodes_json = self.exec_oc("get nodes -o json", [])
+        try:
+            nodes = json.loads(nodes_json)
+        except ValueError:  # no valid json - should not happen
+            raise OpenShiftCheckException(
+                "BadOcNodeList",
+                "Could not obtain a list of nodes to validate fluentd.\n"
+                "Output from oc get:\n" + nodes_json
+            )
+        if not nodes or not nodes.get('items'):  # also should not happen
+            raise OpenShiftCheckException(
+                "NoNodesDefined",
+                "No nodes appear to be defined according to the API."
+            )
+        return {
+            node['metadata']['name']: node
+            for node in nodes['items']
+        }
 
     @staticmethod
-    def _filter_fluentd_labeled_nodes(nodes_by_name, node_selector):
-        """Filter to all nodes with fluentd label. Returns dict(name: node), error string"""
+    def filter_fluentd_labeled_nodes(nodes_by_name, node_selector):
+        """Filter to all nodes with fluentd label. Returns dict(name: node)"""
         label, value = node_selector.split('=', 1)
         fluentd_nodes = {
             name: node for name, node in nodes_by_name.items()
             if node['metadata']['labels'].get(label) == value
         }
         if not fluentd_nodes:
-            return None, (
+            raise OpenShiftCheckException(
+                'NoNodesLabeled',
                 'There are no nodes with the fluentd label {label}.\n'
-                'This means no logs will be aggregated from the nodes.'
-            ).format(label=node_selector)
-        return fluentd_nodes, None
+                'This means no logs will be aggregated from the nodes.'.format(label=node_selector)
+            )
+        return fluentd_nodes
 
-    def _check_node_labeling(self, nodes_by_name, fluentd_nodes, node_selector):
-        """Note if nodes are not labeled as expected. Returns: error string"""
+    def check_node_labeling(self, nodes_by_name, fluentd_nodes, node_selector):
+        """Note if nodes are not labeled as expected. Returns: error list"""
         intended_nodes = self.get_var('openshift_logging_fluentd_hosts', default=['--all'])
         if not intended_nodes or '--all' in intended_nodes:
            intended_nodes = nodes_by_name.keys()
         nodes_missing_labels = set(intended_nodes) - set(fluentd_nodes.keys())
         if nodes_missing_labels:
-            return (
+            return [OpenShiftCheckException(
+                'NodesUnlabeled',
                 'The following nodes are supposed to be labeled with {label} but are not:\n'
                 '  {nodes}\n'
-                'Fluentd will not aggregate logs from these nodes.'
-            ).format(label=node_selector, nodes=', '.join(nodes_missing_labels))
-        return None
+                'Fluentd will not aggregate logs from these nodes.'.format(
+                    label=node_selector, nodes=', '.join(nodes_missing_labels)
+                ))]
+
+        return []
 
     @staticmethod
-    def _check_nodes_have_fluentd(pods, fluentd_nodes):
-        """Make sure fluentd is on all the labeled nodes. Returns: error string"""
+    def check_nodes_have_fluentd(pods, fluentd_nodes):
+        """Make sure fluentd is on all the labeled nodes. Returns: error list"""
         unmatched_nodes = fluentd_nodes.copy()
         node_names_by_label = {
             node['metadata']['labels']['kubernetes.io/hostname']: name
@@ -82,80 +123,32 @@ class Fluentd(LoggingCheck):
         ]:
             unmatched_nodes.pop(name, None)
         if unmatched_nodes:
-            return (
+            return [OpenShiftCheckException(
+                'MissingFluentdPod',
                 'The following nodes are supposed to have a Fluentd pod but do not:\n'
-                '{nodes}'
-                'These nodes will not have their logs aggregated.'
-            ).format(nodes=''.join(
-                "  {}\n".format(name)
-                for name in unmatched_nodes.keys()
-            ))
-        return None
+                '  {nodes}\n'
+                'These nodes will not have their logs aggregated.'.format(
+                    nodes='\n  '.join(unmatched_nodes.keys())
+                ))]
+
+        return []
 
-    def _check_fluentd_pods_running(self, pods):
-        """Make sure all fluentd pods are running. Returns: error string"""
+    def check_fluentd_pods_running(self, pods):
+        """Make sure all fluentd pods are running. Returns: error list"""
         not_running = super(Fluentd, self).not_running_pods(pods)
         if not_running:
-            return (
+            return [OpenShiftCheckException(
+                'FluentdNotRunning',
                 'The following Fluentd pods are supposed to be running but are not:\n'
-                '{pods}'
-                'These pods will not aggregate logs from their nodes.'
-            ).format(pods=''.join(
-                "  {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None'))
-                for pod in not_running
-            ))
-        return None
-
-    def check_fluentd(self, pods):
-        """Verify fluentd is running everywhere. Returns: error string"""
-
-        node_selector = self.get_var(
-            'openshift_logging_fluentd_nodeselector',
-            default='logging-infra-fluentd=true'
-        )
-
-        nodes_by_name, error = self.get_nodes_by_name()
-
-        if error:
-            return error
-        fluentd_nodes, error = self._filter_fluentd_labeled_nodes(nodes_by_name, node_selector)
-        if error:
-            return error
-
-        error_msgs = []
-        error = self._check_node_labeling(nodes_by_name, fluentd_nodes, node_selector)
-        if error:
-            error_msgs.append(error)
-        error = self._check_nodes_have_fluentd(pods, fluentd_nodes)
-        if error:
-            error_msgs.append(error)
-        error = self._check_fluentd_pods_running(pods)
-        if error:
-            error_msgs.append(error)
-
-        # Make sure there are no extra fluentd pods
-        if len(pods) > len(fluentd_nodes):
-            error_msgs.append(
-                'There are more Fluentd pods running than nodes labeled.\n'
-                'This may not cause problems with logging but it likely indicates something wrong.'
-            )
-
-        return '\n'.join(error_msgs)
-
-    def get_nodes_by_name(self):
-        """Retrieve all the node definitions. Returns: dict(name: node), error string"""
-        nodes_json = self.exec_oc(
-            self.logging_namespace,
-            "get nodes -o json",
-            []
-        )
-        try:
-            nodes = json.loads(nodes_json)
-        except ValueError:  # no valid json - should not happen
-            return None, "Could not obtain a list of nodes to validate fluentd. Output from oc get:\n" + nodes_json
-        if not nodes or not nodes.get('items'):  # also should not happen
-            return None, "No nodes appear to be defined according to the API."
-        return {
-            node['metadata']['name']: node
-            for node in nodes['items']
-        }, None
+                '  {pods}\n'
+                'These pods will not aggregate logs from their nodes.'.format(
+                    pods='\n'.join(
+                        "  {name} ({host})".format(
+                            name=pod['metadata']['name'],
+                            host=pod['spec'].get('host', 'None')
+                        )
+                        for pod in not_running
+                    )
+                ))]
+
+        return []
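
For readers outside the openshift-ansible tree, the error-handling pattern this commit adopts can be sketched in isolation: each sub-check now returns a list of named exceptions instead of an error string, and check_fluentd raises them all at once. The stand-in classes below are a minimal sketch only; the real OpenShiftCheckException and OpenShiftCheckExceptionList are imported from the openshift_checks package, and their actual definitions may differ.

# Minimal stand-ins for the classes imported from openshift_checks in the diff above.
# These are illustrative assumptions, not the project's actual implementations.
class OpenShiftCheckException(Exception):
    """A named failure from a health check, mirroring the (name, msg) usage in the diff."""

    def __init__(self, name, msg=None):
        if msg is None:  # single-argument form: treat the argument as the message
            name, msg = self.__class__.__name__, name
        self.name = name
        self.msg = msg
        super(OpenShiftCheckException, self).__init__(msg)


class OpenShiftCheckExceptionList(OpenShiftCheckException):
    """Bundles several check failures so they can be raised and reported together."""

    def __init__(self, errors):
        self.errors = errors
        super(OpenShiftCheckExceptionList, self).__init__(
            'CheckExceptionList',
            '\n'.join(str(error) for error in errors)
        )


def check_fluentd(pods, fluentd_nodes):
    """Toy version of the refactored check: accumulate errors, then raise once."""
    errors = []
    if len(pods) > len(fluentd_nodes):
        errors.append(OpenShiftCheckException(
            'TooManyFluentdPods',
            'There are more Fluentd pods running than nodes labeled.'
        ))
    if errors:
        raise OpenShiftCheckExceptionList(errors)


try:
    check_fluentd(pods=['pod-a', 'pod-b'], fluentd_nodes=['node-1'])
except OpenShiftCheckExceptionList as exc_list:
    for error in exc_list.errors:
        print('{0}: {1}'.format(error.name, error.msg))

The practical gain over the old '\n'.join(error_msgs) approach is that every failure keeps a stable, machine-readable name (for example TooManyFluentdPods) alongside its human-readable message, while the caller still sees all problems from a single run.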