summaryrefslogtreecommitdiffstats
path: root/roles/openshift_health_checker/openshift_checks
diff options
context:
space:
mode:
Diffstat (limited to 'roles/openshift_health_checker/openshift_checks')
-rw-r--r--roles/openshift_health_checker/openshift_checks/__init__.py134
-rw-r--r--roles/openshift_health_checker/openshift_checks/disk_availability.py9
-rw-r--r--roles/openshift_health_checker/openshift_checks/docker_image_availability.py87
-rw-r--r--roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py8
-rw-r--r--roles/openshift_health_checker/openshift_checks/logging/logging.py4
-rw-r--r--roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py2
-rw-r--r--roles/openshift_health_checker/openshift_checks/mixins.py3
-rw-r--r--roles/openshift_health_checker/openshift_checks/package_availability.py2
-rw-r--r--roles/openshift_health_checker/openshift_checks/package_update.py2
-rw-r--r--roles/openshift_health_checker/openshift_checks/package_version.py3
10 files changed, 198 insertions, 56 deletions
diff --git a/roles/openshift_health_checker/openshift_checks/__init__.py b/roles/openshift_health_checker/openshift_checks/__init__.py
index 02ee1d0f9..28cb53cc5 100644
--- a/roles/openshift_health_checker/openshift_checks/__init__.py
+++ b/roles/openshift_health_checker/openshift_checks/__init__.py
@@ -2,8 +2,11 @@
Health checks for OpenShift clusters.
"""
+import json
import operator
import os
+import time
+import collections
from abc import ABCMeta, abstractmethod, abstractproperty
from importlib import import_module
@@ -27,7 +30,7 @@ class OpenShiftCheckException(Exception):
class OpenShiftCheckExceptionList(OpenShiftCheckException):
- """A container for multiple logging errors that may be detected in one check."""
+ """A container for multiple errors that may be detected in one check."""
def __init__(self, errors):
self.errors = errors
super(OpenShiftCheckExceptionList, self).__init__(
@@ -40,26 +43,53 @@ class OpenShiftCheckExceptionList(OpenShiftCheckException):
return self.errors[index]
+FileToSave = collections.namedtuple("FileToSave", "filename contents remote_filename")
+
+
+# pylint: disable=too-many-instance-attributes; all represent significantly different state.
+# Arguably they could be separated into two hashes, one for storing parameters, and one for
+# storing result state; but that smells more like clutter than clarity.
@six.add_metaclass(ABCMeta)
class OpenShiftCheck(object):
- """
- A base class for defining checks for an OpenShift cluster environment.
+ """A base class for defining checks for an OpenShift cluster environment.
- Expect optional params: method execute_module, dict task_vars, and string tmp.
+ Optional init params: method execute_module, dict task_vars, and string tmp
execute_module is expected to have a signature compatible with _execute_module
from ansible plugins/action/__init__.py, e.g.:
def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None, *args):
This is stored so that it can be invoked in subclasses via check.execute_module("name", args)
which provides the check's stored task_vars and tmp.
+
+ Optional init param: want_full_results
+ If the check can gather logs, tarballs, etc., do so when True; but no need to spend
+ the time if they're not wanted (won't be written to output directory).
"""
- def __init__(self, execute_module=None, task_vars=None, tmp=None):
+ def __init__(self, execute_module=None, task_vars=None, tmp=None, want_full_results=False):
+ # store a method for executing ansible modules from the check
self._execute_module = execute_module
+ # the task variables and tmpdir passed into the health checker task
self.task_vars = task_vars or {}
self.tmp = tmp
+ # a boolean for disabling the gathering of results (files, computations) that won't
+ # actually be recorded/used
+ self.want_full_results = want_full_results
+
+ # mainly for testing purposes; see execute_module_with_retries
+ self._module_retries = 3
+ self._module_retry_interval = 5 # seconds
+ # state to be recorded for inspection after the check runs:
+ #
# set to True when the check changes the host, for accurate total "changed" count
self.changed = False
+ # list of OpenShiftCheckException for check to report (alternative to returning a failed result)
+ self.failures = []
+ # list of FileToSave - files the check specifies to be written locally if so configured
+ self.files_to_save = []
+ # log messages for the check - tuples of (description, msg) where msg is serializable.
+ # These are intended to be a sequential record of what the check observed and determined.
+ self.logs = []
@abstractproperty
def name(self):
@@ -82,7 +112,13 @@ class OpenShiftCheck(object):
@abstractmethod
def run(self):
- """Executes a check, normally implemented as a module."""
+ """Executes a check against a host and returns a result hash similar to Ansible modules.
+
+ Actually the direction ahead is to record state in the attributes and
+ not bother building a result hash. Instead, return an empty hash and let
+ the action plugin fill it in. Or raise an OpenShiftCheckException.
+ Returning a hash may become deprecated if it does not prove necessary.
+ """
return {}
@classmethod
@@ -94,7 +130,43 @@ class OpenShiftCheck(object):
for subclass in subclass.subclasses():
yield subclass
- def execute_module(self, module_name=None, module_args=None):
+ def register_failure(self, error):
+ """Record in the check that a failure occurred.
+
+ Recorded failures are merged into the result hash for now. They are also saved to output directory
+ (if provided) <check>.failures.json and registered as a log entry for context <check>.log.json.
+ """
+ # It should be an exception; make it one if not
+ if not isinstance(error, OpenShiftCheckException):
+ error = OpenShiftCheckException(str(error))
+ self.failures.append(error)
+ # duplicate it in the logs so it can be seen in the context of any
+ # information that led to the failure
+ self.register_log("failure: " + error.name, str(error))
+
+ def register_log(self, context, msg):
+ """Record an entry for the check log.
+
+ Notes are intended to serve as context of the whole sequence of what the check observed.
+ They are be saved as an ordered list in a local check log file.
+ They are not to included in the result or in the ansible log; it's just for the record.
+ """
+ self.logs.append([context, msg])
+
+ def register_file(self, filename, contents=None, remote_filename=""):
+ """Record a file that a check makes available to be saved individually to output directory.
+
+ Either file contents should be passed in, or a file to be copied from the remote host
+ should be specified. Contents that are not a string are to be serialized as JSON.
+
+ NOTE: When copying a file from remote host, it is slurped into memory as base64, meaning
+ you should avoid using this on huge files (more than say 10M).
+ """
+ if contents is None and not remote_filename:
+ raise OpenShiftCheckException("File data/source not specified; this is a bug in the check.")
+ self.files_to_save.append(FileToSave(filename, contents, remote_filename))
+
+ def execute_module(self, module_name=None, module_args=None, save_as_name=None, register=True):
"""Invoke an Ansible module from a check.
Invoke stored _execute_module, normally copied from the action
@@ -106,6 +178,12 @@ class OpenShiftCheck(object):
Ansible version).
So e.g. check.execute_module("foo", dict(arg1=...))
+
+ save_as_name specifies a file name for saving the result to an output directory,
+ if needed, and is intended to uniquely identify the result of invoking execute_module.
+ If not provided, the module name will be used.
+ If register is set False, then the result won't be registered in logs or files to save.
+
Return: result hash from module execution.
"""
if self._execute_module is None:
@@ -113,7 +191,33 @@ class OpenShiftCheck(object):
self.__class__.__name__ +
" invoked execute_module without providing the method at initialization."
)
- return self._execute_module(module_name, module_args, self.tmp, self.task_vars)
+ result = self._execute_module(module_name, module_args, self.tmp, self.task_vars)
+ if result.get("changed"):
+ self.changed = True
+ for output in ["result", "stdout"]:
+ # output is often JSON; attempt to decode
+ try:
+ result[output + "_json"] = json.loads(result[output])
+ except (KeyError, ValueError):
+ pass
+
+ if register:
+ self.register_log("execute_module: " + module_name, result)
+ self.register_file(save_as_name or module_name + ".json", result)
+ return result
+
+ def execute_module_with_retries(self, module_name, module_args):
+ """Run execute_module and retry on failure."""
+ result = {}
+ tries = 0
+ while True:
+ res = self.execute_module(module_name, module_args)
+ if tries > self._module_retries or not res.get("failed"):
+ result.update(res)
+ return result
+ result["last_failed"] = res
+ tries += 1
+ time.sleep(self._module_retry_interval)
def get_var(self, *keys, **kwargs):
"""Get deeply nested values from task_vars.
@@ -171,8 +275,12 @@ class OpenShiftCheck(object):
'There is a bug in this check. While trying to convert variable \n'
' "{var}={value}"\n'
'the given converter cannot be used or failed unexpectedly:\n'
- '{error}'.format(var=".".join(keys), value=value, error=error)
- )
+ '{type}: {error}'.format(
+ var=".".join(keys),
+ value=value,
+ type=error.__class__.__name__,
+ error=error
+ ))
@staticmethod
def get_major_minor_version(openshift_image_tag):
@@ -214,7 +322,9 @@ class OpenShiftCheck(object):
mount_point = os.path.dirname(mount_point)
try:
- return mount_for_path[mount_point]
+ mount = mount_for_path[mount_point]
+ self.register_log("mount point for " + path, mount)
+ return mount
except KeyError:
known_mounts = ', '.join('"{}"'.format(mount) for mount in sorted(mount_for_path))
raise OpenShiftCheckException(
@@ -242,7 +352,7 @@ def load_checks(path=None, subpkg=""):
modules = modules + load_checks(os.path.join(path, name), subpkg + "." + name)
continue
- if name.endswith(".py") and not name.startswith(".") and name not in LOADER_EXCLUDES:
+ if name.endswith(".py") and name not in LOADER_EXCLUDES:
modules.append(import_module(__package__ + subpkg + "." + name[:-3]))
return modules
diff --git a/roles/openshift_health_checker/openshift_checks/disk_availability.py b/roles/openshift_health_checker/openshift_checks/disk_availability.py
index f302fd14b..cdf56e959 100644
--- a/roles/openshift_health_checker/openshift_checks/disk_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/disk_availability.py
@@ -70,6 +70,10 @@ class DiskAvailability(OpenShiftCheck):
# If it is not a number, then it should be a nested dict.
pass
+ self.register_log("recommended thresholds", self.recommended_disk_space_bytes)
+ if user_config:
+ self.register_log("user-configured thresholds", user_config)
+
# TODO: as suggested in
# https://github.com/openshift/openshift-ansible/pull/4436#discussion_r122180021,
# maybe we could support checking disk availability in paths that are
@@ -113,10 +117,7 @@ class DiskAvailability(OpenShiftCheck):
'in your Ansible inventory, and lower the recommended disk space availability\n'
'if necessary for this upgrade.').format(config_bytes)
- return {
- 'failed': True,
- 'msg': msg,
- }
+ self.register_failure(msg)
return {}
diff --git a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
index 857a80c74..9c35f0f92 100644
--- a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
@@ -32,6 +32,12 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
# we use python-docker-py to check local docker for images, and skopeo
# to look for images available remotely without waiting to pull them.
dependencies = ["python-docker-py", "skopeo"]
+ skopeo_img_check_command = "timeout 10 skopeo inspect --tls-verify=false docker://{registry}/{image}"
+
+ def __init__(self, *args, **kwargs):
+ super(DockerImageAvailability, self).__init__(*args, **kwargs)
+ # record whether we could reach a registry or not (and remember results)
+ self.reachable_registries = {}
def is_active(self):
"""Skip hosts with unsupported deployment types."""
@@ -63,13 +69,21 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
unavailable_images = set(missing_images) - set(available_images)
if unavailable_images:
- return {
- "failed": True,
- "msg": (
- "One or more required Docker images are not available:\n {}\n"
- "Configured registries: {}"
- ).format(",\n ".join(sorted(unavailable_images)), ", ".join(registries)),
- }
+ registries = [
+ reg if self.reachable_registries.get(reg, True) else reg + " (unreachable)"
+ for reg in registries
+ ]
+ msg = (
+ "One or more required Docker images are not available:\n {}\n"
+ "Configured registries: {}\n"
+ "Checked by: {}"
+ ).format(
+ ",\n ".join(sorted(unavailable_images)),
+ ", ".join(registries),
+ self.skopeo_img_check_command
+ )
+
+ return dict(failed=True, msg=msg)
return {}
@@ -125,31 +139,31 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
def local_images(self, images):
"""Filter a list of images and return those available locally."""
- return [
- image for image in images
- if self.is_image_local(image)
- ]
+ registries = self.known_docker_registries()
+ found_images = []
+ for image in images:
+ # docker could have the image name as-is or prefixed with any registry
+ imglist = [image] + [reg + "/" + image for reg in registries]
+ if self.is_image_local(imglist):
+ found_images.append(image)
+ return found_images
def is_image_local(self, image):
"""Check if image is already in local docker index."""
result = self.execute_module("docker_image_facts", {"name": image})
- if result.get("failed", False):
- return False
-
- return bool(result.get("images", []))
+ return bool(result.get("images")) and not result.get("failed")
def known_docker_registries(self):
"""Build a list of docker registries available according to inventory vars."""
- docker_facts = self.get_var("openshift", "docker")
- regs = set(docker_facts["additional_registries"])
+ regs = list(self.get_var("openshift.docker.additional_registries", default=[]))
deployment_type = self.get_var("openshift_deployment_type")
- if deployment_type == "origin":
- regs.update(["docker.io"])
- elif "enterprise" in deployment_type:
- regs.update(["registry.access.redhat.com"])
+ if deployment_type == "origin" and "docker.io" not in regs:
+ regs.append("docker.io")
+ elif "enterprise" in deployment_type and "registry.access.redhat.com" not in regs:
+ regs.append("registry.access.redhat.com")
- return list(regs)
+ return regs
def available_images(self, images, default_registries):
"""Search remotely for images. Returns: list of images found."""
@@ -162,18 +176,35 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
"""Use Skopeo to determine if required image exists in known registry(s)."""
registries = default_registries
- # if image already includes a registry, only use that
+ # If image already includes a registry, only use that.
+ # NOTE: This logic would incorrectly identify images that do not use a namespace, e.g.
+ # registry.access.redhat.com/rhel7 as if the registry were a namespace.
+ # It's not clear that there's any way to distinguish them, but fortunately
+ # the current set of images all look like [registry/]namespace/name[:version].
if image.count("/") > 1:
registry, image = image.split("/", 1)
registries = [registry]
for registry in registries:
- args = {
- "_raw_params": "timeout 10 skopeo inspect --tls-verify=false "
- "docker://{}/{}".format(registry, image)
- }
- result = self.execute_module("command", args)
+ if registry not in self.reachable_registries:
+ self.reachable_registries[registry] = self.connect_to_registry(registry)
+ if not self.reachable_registries[registry]:
+ continue
+
+ args = {"_raw_params": self.skopeo_img_check_command.format(registry=registry, image=image)}
+ result = self.execute_module_with_retries("command", args)
if result.get("rc", 0) == 0 and not result.get("failed"):
return True
+ if result.get("rc") == 124: # RC 124 == timed out; mark unreachable
+ self.reachable_registries[registry] = False
return False
+
+ def connect_to_registry(self, registry):
+ """Use ansible wait_for module to test connectivity from host to registry. Returns bool."""
+ # test a simple TCP connection
+ host, _, port = registry.partition(":")
+ port = port or 443
+ args = dict(host=host, port=port, state="started", timeout=30)
+ result = self.execute_module("wait_for", args)
+ return result.get("rc", 0) == 0 and not result.get("failed")
diff --git a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
index 7fc843fd7..986a01f38 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
@@ -72,7 +72,7 @@ class Elasticsearch(LoggingCheck):
for pod_name in pods_by_name.keys():
# Compare what each ES node reports as master and compare for split brain
get_master_cmd = self._build_es_curl_cmd(pod_name, "https://localhost:9200/_cat/master")
- master_name_str = self.exec_oc(get_master_cmd, [])
+ master_name_str = self.exec_oc(get_master_cmd, [], save_as_name="get_master_names.json")
master_names = (master_name_str or '').split(' ')
if len(master_names) > 1:
es_master_names.add(master_names[1])
@@ -113,7 +113,7 @@ class Elasticsearch(LoggingCheck):
# get ES cluster nodes
node_cmd = self._build_es_curl_cmd(list(pods_by_name.keys())[0], 'https://localhost:9200/_nodes')
- cluster_node_data = self.exec_oc(node_cmd, [])
+ cluster_node_data = self.exec_oc(node_cmd, [], save_as_name="get_es_nodes.json")
try:
cluster_nodes = json.loads(cluster_node_data)['nodes']
except (ValueError, KeyError):
@@ -142,7 +142,7 @@ class Elasticsearch(LoggingCheck):
errors = []
for pod_name in pods_by_name.keys():
cluster_health_cmd = self._build_es_curl_cmd(pod_name, 'https://localhost:9200/_cluster/health?pretty=true')
- cluster_health_data = self.exec_oc(cluster_health_cmd, [])
+ cluster_health_data = self.exec_oc(cluster_health_cmd, [], save_as_name='get_es_health.json')
try:
health_res = json.loads(cluster_health_data)
if not health_res or not health_res.get('status'):
@@ -171,7 +171,7 @@ class Elasticsearch(LoggingCheck):
errors = []
for pod_name in pods_by_name.keys():
df_cmd = 'exec {} -- df --output=ipcent,pcent /elasticsearch/persistent'.format(pod_name)
- disk_output = self.exec_oc(df_cmd, [])
+ disk_output = self.exec_oc(df_cmd, [], save_as_name='get_pv_diskspace.json')
lines = disk_output.splitlines()
# expecting one header looking like 'IUse% Use%' and one body line
body_re = r'\s*(\d+)%?\s+(\d+)%?\s*$'
diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging.py b/roles/openshift_health_checker/openshift_checks/logging/logging.py
index ecd8adb64..06bdfebf6 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/logging.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/logging.py
@@ -78,7 +78,7 @@ class LoggingCheck(OpenShiftCheck):
"""Returns the namespace in which logging is configured to deploy."""
return self.get_var("openshift_logging_namespace", default="logging")
- def exec_oc(self, cmd_str="", extra_args=None):
+ def exec_oc(self, cmd_str="", extra_args=None, save_as_name=None):
"""
Execute an 'oc' command in the remote host.
Returns: output of command and namespace,
@@ -92,7 +92,7 @@ class LoggingCheck(OpenShiftCheck):
"extra_args": list(extra_args) if extra_args else [],
}
- result = self.execute_module("ocutil", args)
+ result = self.execute_module("ocutil", args, save_as_name=save_as_name)
if result.get("failed"):
if result['result'] == '[Errno 2] No such file or directory':
raise CouldNotUseOc(
diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py
index d781db649..cacdf4213 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py
@@ -104,7 +104,7 @@ class LoggingIndexTime(LoggingCheck):
"https://logging-es:9200/project.{namespace}*/_count?q=message:{uuid}"
)
exec_cmd = exec_cmd.format(pod_name=pod_name, namespace=self.logging_namespace(), uuid=uuid)
- result = self.exec_oc(exec_cmd, [])
+ result = self.exec_oc(exec_cmd, [], save_as_name="query_for_uuid.json")
try:
count = json.loads(result)["count"]
diff --git a/roles/openshift_health_checker/openshift_checks/mixins.py b/roles/openshift_health_checker/openshift_checks/mixins.py
index e9bae60a3..b90ebf6dd 100644
--- a/roles/openshift_health_checker/openshift_checks/mixins.py
+++ b/roles/openshift_health_checker/openshift_checks/mixins.py
@@ -36,7 +36,7 @@ class DockerHostMixin(object):
# NOTE: we would use the "package" module but it's actually an action plugin
# and it's not clear how to invoke one of those. This is about the same anyway:
- result = self.execute_module(
+ result = self.execute_module_with_retries(
self.get_var("ansible_pkg_mgr", default="yum"),
{"name": self.dependencies, "state": "present"},
)
@@ -49,5 +49,4 @@ class DockerHostMixin(object):
" {deps}\n{msg}"
).format(deps=',\n '.join(self.dependencies), msg=msg)
failed = result.get("failed", False) or result.get("rc", 0) != 0
- self.changed = result.get("changed", False)
return msg, failed
diff --git a/roles/openshift_health_checker/openshift_checks/package_availability.py b/roles/openshift_health_checker/openshift_checks/package_availability.py
index a86180b00..21355c2f0 100644
--- a/roles/openshift_health_checker/openshift_checks/package_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/package_availability.py
@@ -26,7 +26,7 @@ class PackageAvailability(NotContainerizedMixin, OpenShiftCheck):
packages.update(self.node_packages(rpm_prefix))
args = {"packages": sorted(set(packages))}
- return self.execute_module("check_yum_update", args)
+ return self.execute_module_with_retries("check_yum_update", args)
@staticmethod
def master_packages(rpm_prefix):
diff --git a/roles/openshift_health_checker/openshift_checks/package_update.py b/roles/openshift_health_checker/openshift_checks/package_update.py
index 1e9aecbe0..8464e8a5e 100644
--- a/roles/openshift_health_checker/openshift_checks/package_update.py
+++ b/roles/openshift_health_checker/openshift_checks/package_update.py
@@ -11,4 +11,4 @@ class PackageUpdate(NotContainerizedMixin, OpenShiftCheck):
def run(self):
args = {"packages": []}
- return self.execute_module("check_yum_update", args)
+ return self.execute_module_with_retries("check_yum_update", args)
diff --git a/roles/openshift_health_checker/openshift_checks/package_version.py b/roles/openshift_health_checker/openshift_checks/package_version.py
index 8b780114f..d4aec3ed8 100644
--- a/roles/openshift_health_checker/openshift_checks/package_version.py
+++ b/roles/openshift_health_checker/openshift_checks/package_version.py
@@ -46,6 +46,7 @@ class PackageVersion(NotContainerizedMixin, OpenShiftCheck):
check_multi_minor_release = deployment_type in ['openshift-enterprise']
args = {
+ "package_mgr": self.get_var("ansible_pkg_mgr"),
"package_list": [
{
"name": "openvswitch",
@@ -75,7 +76,7 @@ class PackageVersion(NotContainerizedMixin, OpenShiftCheck):
],
}
- return self.execute_module("aos_version", args)
+ return self.execute_module_with_retries("aos_version", args)
def get_required_ovs_version(self):
"""Return the correct Open vSwitch version(s) for the current OpenShift version."""