6 files changed, 117 insertions, 24 deletions
diff --git a/roles/openshift_health_checker/openshift_checks/__init__.py b/roles/openshift_health_checker/openshift_checks/__init__.py
index 987c955b6..28cb53cc5 100644
--- a/roles/openshift_health_checker/openshift_checks/__init__.py
+++ b/roles/openshift_health_checker/openshift_checks/__init__.py
@@ -2,9 +2,11 @@
 Health checks for OpenShift clusters.
 """
 
+import json
 import operator
 import os
 import time
+import collections
 
 from abc import ABCMeta, abstractmethod, abstractproperty
 from importlib import import_module
@@ -28,7 +30,7 @@ class OpenShiftCheckException(Exception):
 
 
 class OpenShiftCheckExceptionList(OpenShiftCheckException):
-    """A container for multiple logging errors that may be detected in one check."""
+    """A container for multiple errors that may be detected in one check."""
     def __init__(self, errors):
         self.errors = errors
         super(OpenShiftCheckExceptionList, self).__init__(
@@ -41,29 +43,53 @@ class OpenShiftCheckExceptionList(OpenShiftCheckException):
         return self.errors[index]
 
 
+FileToSave = collections.namedtuple("FileToSave", "filename contents remote_filename")
+
+
+# pylint: disable=too-many-instance-attributes; all represent significantly different state.
+# Arguably they could be separated into two hashes, one for storing parameters, and one for
+# storing result state; but that smells more like clutter than clarity.
 @six.add_metaclass(ABCMeta)
 class OpenShiftCheck(object):
-    """
-    A base class for defining checks for an OpenShift cluster environment.
+    """A base class for defining checks for an OpenShift cluster environment.
 
-    Expect optional params: method execute_module, dict task_vars, and string tmp.
+    Optional init params: method execute_module, dict task_vars, and string tmp.
     execute_module is expected to have a signature compatible with _execute_module
     from ansible plugins/action/__init__.py, e.g.:
     def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None, *args):
     This is stored so that it can be invoked in subclasses via check.execute_module("name", args)
     which provides the check's stored task_vars and tmp.
+
+    Optional init param: want_full_results
+    If the check can gather logs, tarballs, etc., do so when True; but no need to spend
+    the time if they're not wanted (won't be written to output directory).
     """
 
-    def __init__(self, execute_module=None, task_vars=None, tmp=None):
+    def __init__(self, execute_module=None, task_vars=None, tmp=None, want_full_results=False):
+        # store a method for executing ansible modules from the check
         self._execute_module = execute_module
+        # the task variables and tmpdir passed into the health checker task
         self.task_vars = task_vars or {}
         self.tmp = tmp
+        # a boolean for disabling the gathering of results (files, computations) that won't
+        # actually be recorded/used
+        self.want_full_results = want_full_results
+
         # mainly for testing purposes; see execute_module_with_retries
         self._module_retries = 3
         self._module_retry_interval = 5  # seconds
 
+        # state to be recorded for inspection after the check runs:
+        #
         # set to True when the check changes the host, for accurate total "changed" count
         self.changed = False
+        # list of OpenShiftCheckException for check to report (alternative to returning a failed result)
+        self.failures = []
+        # list of FileToSave - files the check specifies to be written locally if so configured
+        self.files_to_save = []
+        # log messages for the check - [context, msg] pairs where msg is serializable.
+        # These are intended to be a sequential record of what the check observed and determined.
+        self.logs = []
 
     @abstractproperty
     def name(self):
@@ -86,7 +112,13 @@ class OpenShiftCheck(object):
 
     @abstractmethod
     def run(self):
-        """Executes a check, normally implemented as a module."""
+        """Executes a check against a host and returns a result hash similar to Ansible modules.
+
+        Actually the direction ahead is to record state in the attributes and
+        not bother building a result hash. Instead, return an empty hash and let
+        the action plugin fill it in. Or raise an OpenShiftCheckException.
+        Returning a hash may become deprecated if it does not prove necessary.
+        """
         return {}
 
     @classmethod
@@ -98,7 +130,43 @@ class OpenShiftCheck(object):
             for subclass in subclass.subclasses():
                 yield subclass
 
-    def execute_module(self, module_name=None, module_args=None):
+    def register_failure(self, error):
+        """Record in the check that a failure occurred.
+
+        Recorded failures are merged into the result hash for now. They are also saved to the output directory
+        (if provided) as <check>.failures.json and registered as a log entry, for context, in <check>.log.json.
+        """
+        # It should be an exception; make it one if not
+        if not isinstance(error, OpenShiftCheckException):
+            error = OpenShiftCheckException(str(error))
+        self.failures.append(error)
+        # duplicate it in the logs so it can be seen in the context of any
+        # information that led to the failure
+        self.register_log("failure: " + error.name, str(error))
+
+    def register_log(self, context, msg):
+        """Record an entry for the check log.
+
+        Log entries are intended to serve as context for the whole sequence of what the check observed.
+        They are saved as an ordered list in a local check log file.
+        They are not included in the result or in the ansible log; they are just for the record.
+        """
+        self.logs.append([context, msg])
+
+    def register_file(self, filename, contents=None, remote_filename=""):
+        """Record a file that a check makes available to be saved individually to output directory.
+
+        Either file contents should be passed in, or a file to be copied from the remote host
+        should be specified. Contents that are not a string are to be serialized as JSON.
+
+        NOTE: When copying a file from remote host, it is slurped into memory as base64, meaning
+        you should avoid using this on huge files (more than say 10M).
+        """
+        if contents is None and not remote_filename:
+            raise OpenShiftCheckException("File data/source not specified; this is a bug in the check.")
+        self.files_to_save.append(FileToSave(filename, contents, remote_filename))
+
+    def execute_module(self, module_name=None, module_args=None, save_as_name=None, register=True):
         """Invoke an Ansible module from a check.
 
         Invoke stored _execute_module, normally copied from the action
@@ -110,6 +178,12 @@ class OpenShiftCheck(object):
         Ansible version).
 
         So e.g. check.execute_module("foo", dict(arg1=...))
+
+        save_as_name specifies a file name for saving the result to an output directory,
+        if needed, and is intended to uniquely identify the result of invoking execute_module.
+        If not provided, the module name plus a .json suffix will be used.
+        If register is set to False, then the result won't be registered in logs or files to save.
+
         Return: result hash from module execution.
         """
         if self._execute_module is None:
@@ -117,7 +191,20 @@ class OpenShiftCheck(object):
                 self.__class__.__name__ +
                 " invoked execute_module without providing the method at initialization."
             )
-        return self._execute_module(module_name, module_args, self.tmp, self.task_vars)
+        result = self._execute_module(module_name, module_args, self.tmp, self.task_vars)
+        if result.get("changed"):
+            self.changed = True
+        for output in ["result", "stdout"]:
+            # output is often JSON; attempt to decode
+            try:
+                result[output + "_json"] = json.loads(result[output])
+            except (KeyError, ValueError):
+                pass
+
+        if register:
+            self.register_log("execute_module: " + module_name, result)
+            self.register_file(save_as_name or module_name + ".json", result)
+        return result
 
     def execute_module_with_retries(self, module_name, module_args):
         """Run execute_module and retry on failure."""
@@ -188,8 +275,12 @@ class OpenShiftCheck(object):
                 'There is a bug in this check. While trying to convert variable \n'
                 '  "{var}={value}"\n'
                 'the given converter cannot be used or failed unexpectedly:\n'
-                '{error}'.format(var=".".join(keys), value=value, error=error)
-            )
+                '{type}: {error}'.format(
+                    var=".".join(keys),
+                    value=value,
+                    type=error.__class__.__name__,
+                    error=error
+                ))
 
     @staticmethod
     def get_major_minor_version(openshift_image_tag):
@@ -231,7 +322,9 @@ class OpenShiftCheck(object):
             mount_point = os.path.dirname(mount_point)
 
         try:
-            return mount_for_path[mount_point]
+            mount = mount_for_path[mount_point]
+            self.register_log("mount point for " + path, mount)
+            return mount
         except KeyError:
             known_mounts = ', '.join('"{}"'.format(mount) for mount in sorted(mount_for_path))
             raise OpenShiftCheckException(
@@ -259,7 +352,7 @@ def load_checks(path=None, subpkg=""):
             modules = modules + load_checks(os.path.join(path, name), subpkg + "." + name)
             continue
 
-        if name.endswith(".py") and not name.startswith(".") and name not in LOADER_EXCLUDES:
+        if name.endswith(".py") and name not in LOADER_EXCLUDES:
             modules.append(import_module(__package__ + subpkg + "." + name[:-3]))
 
     return modules
diff --git a/roles/openshift_health_checker/openshift_checks/disk_availability.py b/roles/openshift_health_checker/openshift_checks/disk_availability.py
index f302fd14b..cdf56e959 100644
--- a/roles/openshift_health_checker/openshift_checks/disk_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/disk_availability.py
@@ -70,6 +70,10 @@ class DiskAvailability(OpenShiftCheck):
             # If it is not a number, then it should be a nested dict.
             pass
 
+        self.register_log("recommended thresholds", self.recommended_disk_space_bytes)
+        if user_config:
+            self.register_log("user-configured thresholds", user_config)
+
         # TODO: as suggested in
         # https://github.com/openshift/openshift-ansible/pull/4436#discussion_r122180021,
         # maybe we could support checking disk availability in paths that are
@@ -113,10 +117,7 @@ class DiskAvailability(OpenShiftCheck):
                             'in your Ansible inventory, and lower the recommended disk space availability\n'
                             'if necessary for this upgrade.').format(config_bytes)
 
-                return {
-                    'failed': True,
-                    'msg': msg,
-                }
+                self.register_failure(msg)
 
         return {}
 
diff --git a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
index 7fc843fd7..986a01f38 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
@@ -72,7 +72,7 @@ class Elasticsearch(LoggingCheck):
         for pod_name in pods_by_name.keys():
             # Compare what each ES node reports as master and compare for split brain
             get_master_cmd = self._build_es_curl_cmd(pod_name, "https://localhost:9200/_cat/master")
-            master_name_str = self.exec_oc(get_master_cmd, [])
+            master_name_str = self.exec_oc(get_master_cmd, [], save_as_name="get_master_names.json")
             master_names = (master_name_str or '').split(' ')
             if len(master_names) > 1:
                 es_master_names.add(master_names[1])
@@ -113,7 +113,7 @@ class Elasticsearch(LoggingCheck):
 
         # get ES cluster nodes
         node_cmd = self._build_es_curl_cmd(list(pods_by_name.keys())[0], 'https://localhost:9200/_nodes')
-        cluster_node_data = self.exec_oc(node_cmd, [])
+        cluster_node_data = self.exec_oc(node_cmd, [], save_as_name="get_es_nodes.json")
         try:
             cluster_nodes = json.loads(cluster_node_data)['nodes']
         except (ValueError, KeyError):
@@ -142,7 +142,7 @@ class Elasticsearch(LoggingCheck):
         errors = []
         for pod_name in pods_by_name.keys():
             cluster_health_cmd = self._build_es_curl_cmd(pod_name, 'https://localhost:9200/_cluster/health?pretty=true')
-            cluster_health_data = self.exec_oc(cluster_health_cmd, [])
+            cluster_health_data = self.exec_oc(cluster_health_cmd, [], save_as_name='get_es_health.json')
             try:
                 health_res = json.loads(cluster_health_data)
                 if not health_res or not health_res.get('status'):
@@ -171,7 +171,7 @@ class Elasticsearch(LoggingCheck):
         errors = []
         for pod_name in pods_by_name.keys():
             df_cmd = 'exec {} -- df --output=ipcent,pcent /elasticsearch/persistent'.format(pod_name)
-            disk_output = self.exec_oc(df_cmd, [])
+            disk_output = self.exec_oc(df_cmd, [], save_as_name='get_pv_diskspace.json')
             lines = disk_output.splitlines()
             # expecting one header looking like 'IUse% Use%' and one body line
             body_re = r'\s*(\d+)%?\s+(\d+)%?\s*$'
diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging.py b/roles/openshift_health_checker/openshift_checks/logging/logging.py
index ecd8adb64..06bdfebf6 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/logging.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/logging.py
@@ -78,7 +78,7 @@ class LoggingCheck(OpenShiftCheck):
         """Returns the namespace in which logging is configured to deploy."""
         return self.get_var("openshift_logging_namespace", default="logging")
 
-    def exec_oc(self, cmd_str="", extra_args=None):
+    def exec_oc(self, cmd_str="", extra_args=None, save_as_name=None):
         """
         Execute an 'oc' command in the remote host.
         Returns: output of command and namespace,
@@ -92,7 +92,7 @@ class LoggingCheck(OpenShiftCheck):
             "extra_args": list(extra_args) if extra_args else [],
         }
 
-        result = self.execute_module("ocutil", args)
+        result = self.execute_module("ocutil", args, save_as_name=save_as_name)
         if result.get("failed"):
             if result['result'] == '[Errno 2] No such file or directory':
                 raise CouldNotUseOc(
diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py
index d781db649..cacdf4213 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py
@@ -104,7 +104,7 @@ class LoggingIndexTime(LoggingCheck):
             "https://logging-es:9200/project.{namespace}*/_count?q=message:{uuid}"
         )
         exec_cmd = exec_cmd.format(pod_name=pod_name, namespace=self.logging_namespace(), uuid=uuid)
-        result = self.exec_oc(exec_cmd, [])
+        result = self.exec_oc(exec_cmd, [], save_as_name="query_for_uuid.json")
 
         try:
             count = json.loads(result)["count"]
diff --git a/roles/openshift_health_checker/openshift_checks/mixins.py b/roles/openshift_health_checker/openshift_checks/mixins.py
index 24f1d938a..b90ebf6dd 100644
--- a/roles/openshift_health_checker/openshift_checks/mixins.py
+++ b/roles/openshift_health_checker/openshift_checks/mixins.py
@@ -49,5 +49,4 @@ class DockerHostMixin(object):
                 "    {deps}\n{msg}"
             ).format(deps=',\n    '.join(self.dependencies), msg=msg)
         failed = result.get("failed", False) or result.get("rc", 0) != 0
-        self.changed = result.get("changed", False)
         return msg, failed