summaryrefslogtreecommitdiffstats
path: root/roles/openshift_health_checker
diff options
context:
space:
mode:
Diffstat (limited to 'roles/openshift_health_checker')
-rw-r--r--roles/openshift_health_checker/action_plugins/openshift_health_check.py89
-rw-r--r--roles/openshift_health_checker/callback_plugins/zz_failure_summary.py88
-rwxr-xr-xroles/openshift_health_checker/library/aos_version.py228
-rwxr-xr-xroles/openshift_health_checker/library/check_yum_update.py1
-rw-r--r--roles/openshift_health_checker/library/etcdkeysize.py122
-rw-r--r--roles/openshift_health_checker/library/ocutil.py74
-rw-r--r--roles/openshift_health_checker/library/rpm_version.py127
-rw-r--r--roles/openshift_health_checker/meta/main.yml2
-rw-r--r--roles/openshift_health_checker/openshift_checks/__init__.py22
-rw-r--r--roles/openshift_health_checker/openshift_checks/disk_availability.py115
-rw-r--r--roles/openshift_health_checker/openshift_checks/docker_image_availability.py232
-rw-r--r--roles/openshift_health_checker/openshift_checks/docker_storage.py185
-rw-r--r--roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py84
-rw-r--r--roles/openshift_health_checker/openshift_checks/etcd_volume.py58
-rw-r--r--roles/openshift_health_checker/openshift_checks/logging/__init__.py0
-rw-r--r--roles/openshift_health_checker/openshift_checks/logging/curator.py61
-rw-r--r--roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py217
-rw-r--r--roles/openshift_health_checker/openshift_checks/logging/fluentd.py170
-rw-r--r--roles/openshift_health_checker/openshift_checks/logging/kibana.py229
-rw-r--r--roles/openshift_health_checker/openshift_checks/logging/logging.py96
-rw-r--r--roles/openshift_health_checker/openshift_checks/memory_availability.py51
-rw-r--r--roles/openshift_health_checker/openshift_checks/mixins.py58
-rw-r--r--roles/openshift_health_checker/openshift_checks/ovs_version.py78
-rw-r--r--roles/openshift_health_checker/openshift_checks/package_availability.py9
-rw-r--r--roles/openshift_health_checker/openshift_checks/package_update.py2
-rw-r--r--roles/openshift_health_checker/openshift_checks/package_version.py115
-rw-r--r--roles/openshift_health_checker/test/action_plugin_test.py253
-rw-r--r--roles/openshift_health_checker/test/aos_version_test.py149
-rw-r--r--roles/openshift_health_checker/test/conftest.py10
-rw-r--r--roles/openshift_health_checker/test/curator_test.py68
-rw-r--r--roles/openshift_health_checker/test/disk_availability_test.py180
-rw-r--r--roles/openshift_health_checker/test/docker_image_availability_test.py265
-rw-r--r--roles/openshift_health_checker/test/docker_storage_test.py224
-rw-r--r--roles/openshift_health_checker/test/elasticsearch_test.py180
-rw-r--r--roles/openshift_health_checker/test/etcd_imagedata_size_test.py328
-rw-r--r--roles/openshift_health_checker/test/etcd_volume_test.py149
-rw-r--r--roles/openshift_health_checker/test/fluentd_test.py109
-rw-r--r--roles/openshift_health_checker/test/kibana_test.py218
-rw-r--r--roles/openshift_health_checker/test/logging_check_test.py137
-rw-r--r--roles/openshift_health_checker/test/memory_availability_test.py129
-rw-r--r--roles/openshift_health_checker/test/ovs_version_test.py89
-rw-r--r--roles/openshift_health_checker/test/package_availability_test.py14
-rw-r--r--roles/openshift_health_checker/test/package_version_test.py147
-rw-r--r--roles/openshift_health_checker/test/rpm_version_test.py82
44 files changed, 4960 insertions, 284 deletions
diff --git a/roles/openshift_health_checker/action_plugins/openshift_health_check.py b/roles/openshift_health_checker/action_plugins/openshift_health_check.py
index cf0fe19f1..0390dc82e 100644
--- a/roles/openshift_health_checker/action_plugins/openshift_health_check.py
+++ b/roles/openshift_health_checker/action_plugins/openshift_health_check.py
@@ -4,6 +4,7 @@ Ansible action plugin to execute health checks in OpenShift clusters.
# pylint: disable=wrong-import-position,missing-docstring,invalid-name
import sys
import os
+from collections import defaultdict
try:
from __main__ import display
@@ -24,9 +25,11 @@ class ActionModule(ActionBase):
def run(self, tmp=None, task_vars=None):
result = super(ActionModule, self).run(tmp, task_vars)
+ task_vars = task_vars or {}
- if task_vars is None:
- task_vars = {}
+ # vars are not supportably available in the callback plugin,
+ # so record any it will need in the result.
+ result['playbook_context'] = task_vars.get('r_openshift_health_checker_playbook_context')
if "openshift" not in task_vars:
result["failed"] = True
@@ -35,38 +38,36 @@ class ActionModule(ActionBase):
try:
known_checks = self.load_known_checks()
+ args = self._task.args
+ resolved_checks = resolve_checks(args.get("checks", []), known_checks.values())
except OpenShiftCheckException as e:
result["failed"] = True
result["msg"] = str(e)
return result
- args = self._task.args
- requested_checks = resolve_checks(args.get("checks", []), known_checks.values())
-
- unknown_checks = requested_checks - set(known_checks)
- if unknown_checks:
- result["failed"] = True
- result["msg"] = (
- "One or more checks are unknown: {}. "
- "Make sure there is no typo in the playbook and no files are missing."
- ).format(", ".join(unknown_checks))
- return result
-
result["checks"] = check_results = {}
- for check_name in requested_checks & set(known_checks):
+ user_disabled_checks = [
+ check.strip()
+ for check in task_vars.get("openshift_disable_check", "").split(",")
+ ]
+
+ for check_name in resolved_checks:
display.banner("CHECK [{} : {}]".format(check_name, task_vars["ansible_host"]))
check = known_checks[check_name]
- if check.is_active(task_vars):
+ if not check.is_active(task_vars):
+ r = dict(skipped=True, skipped_reason="Not active for this host")
+ elif check_name in user_disabled_checks:
+ r = dict(skipped=True, skipped_reason="Disabled by user request")
+ else:
try:
r = check.run(tmp, task_vars)
except OpenShiftCheckException as e:
- r = {}
- r["failed"] = True
- r["msg"] = str(e)
- else:
- r = {"skipped": True}
+ r = dict(
+ failed=True,
+ msg=str(e),
+ )
check_results[check_name] = r
@@ -81,10 +82,7 @@ class ActionModule(ActionBase):
load_checks()
known_checks = {}
-
- known_check_classes = set(cls for cls in OpenShiftCheck.subclasses())
-
- for cls in known_check_classes:
+ for cls in OpenShiftCheck.subclasses():
check_name = cls.name
if check_name in known_checks:
other_cls = known_checks[check_name].__class__
@@ -94,26 +92,45 @@ class ActionModule(ActionBase):
cls.__module__, cls.__name__,
other_cls.__module__, other_cls.__name__))
known_checks[check_name] = cls(execute_module=self._execute_module)
-
return known_checks
def resolve_checks(names, all_checks):
"""Returns a set of resolved check names.
- Resolving a check name involves expanding tag references (e.g., '@tag') with
- all the checks that contain the given tag.
+ Resolving a check name expands tag references (e.g., "@tag") to all the
+ checks that contain the given tag. OpenShiftCheckException is raised if
+ names contains an unknown check or tag name.
names should be a sequence of strings.
all_checks should be a sequence of check classes/instances.
"""
- resolved = set()
- for name in names:
- if name.startswith("@"):
- for check in all_checks:
- if name[1:] in check.tags:
- resolved.add(check.name)
- else:
- resolved.add(name)
+ known_check_names = set(check.name for check in all_checks)
+ known_tag_names = set(name for check in all_checks for name in check.tags)
+
+ check_names = set(name for name in names if not name.startswith('@'))
+ tag_names = set(name[1:] for name in names if name.startswith('@'))
+
+ unknown_check_names = check_names - known_check_names
+ unknown_tag_names = tag_names - known_tag_names
+
+ if unknown_check_names or unknown_tag_names:
+ msg = []
+ if unknown_check_names:
+ msg.append('Unknown check names: {}.'.format(', '.join(sorted(unknown_check_names))))
+ if unknown_tag_names:
+ msg.append('Unknown tag names: {}.'.format(', '.join(sorted(unknown_tag_names))))
+ msg.append('Make sure there is no typo in the playbook and no files are missing.')
+ raise OpenShiftCheckException('\n'.join(msg))
+
+ tag_to_checks = defaultdict(set)
+ for check in all_checks:
+ for tag in check.tags:
+ tag_to_checks[tag].add(check.name)
+
+ resolved = check_names.copy()
+ for tag in tag_names:
+ resolved.update(tag_to_checks[tag])
+
return resolved
diff --git a/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py b/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py
index 208e81048..64c29a8d9 100644
--- a/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py
+++ b/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py
@@ -1,8 +1,13 @@
-# vim: expandtab:tabstop=4:shiftwidth=4
'''
Ansible callback plugin.
'''
+# Reason: In several locations below we disable pylint protected-access
+# for Ansible objects that do not give us any public way
+# to access the full details we need to report check failures.
+# Status: disabled permanently or until Ansible object has a public API.
+# This does leave the code more likely to be broken by future Ansible changes.
+
from pprint import pformat
from ansible.plugins.callback import CallbackBase
@@ -21,38 +26,37 @@ class CallbackModule(CallbackBase):
CALLBACK_TYPE = 'aggregate'
CALLBACK_NAME = 'failure_summary'
CALLBACK_NEEDS_WHITELIST = False
+ _playbook_file = None
def __init__(self):
super(CallbackModule, self).__init__()
self.__failures = []
+ def v2_playbook_on_start(self, playbook):
+ super(CallbackModule, self).v2_playbook_on_start(playbook)
+ # re: playbook attrs see top comment # pylint: disable=protected-access
+ self._playbook_file = playbook._file_name
+
def v2_runner_on_failed(self, result, ignore_errors=False):
super(CallbackModule, self).v2_runner_on_failed(result, ignore_errors)
self.__failures.append(dict(result=result, ignore_errors=ignore_errors))
def v2_playbook_on_stats(self, stats):
super(CallbackModule, self).v2_playbook_on_stats(stats)
- # TODO: update condition to consider a host var or env var to
- # enable/disable the summary, so that we can control the output from a
- # play.
if self.__failures:
- self._print_failure_summary()
+ self._print_failure_details(self.__failures)
- def _print_failure_summary(self):
- '''Print a summary of failed tasks (including ignored failures).'''
+ def _print_failure_details(self, failures):
+ '''Print a summary of failed tasks or checks.'''
self._display.display(u'\nFailure summary:\n')
- # TODO: group failures by host or by task. If grouped by host, it is
- # easy to see all problems of a given host. If grouped by task, it is
- # easy to see what hosts needs the same fix.
-
- width = len(str(len(self.__failures)))
+ width = len(str(len(failures)))
initial_indent_format = u' {{:>{width}}}. '.format(width=width)
initial_indent_len = len(initial_indent_format.format(0))
subsequent_indent = u' ' * initial_indent_len
subsequent_extra_indent = u' ' * (initial_indent_len + 10)
- for i, failure in enumerate(self.__failures, 1):
+ for i, failure in enumerate(failures, 1):
entries = _format_failure(failure)
self._display.display(u'\n{}{}'.format(initial_indent_format.format(i), entries[0]))
for entry in entries[1:]:
@@ -60,11 +64,52 @@ class CallbackModule(CallbackBase):
indented = u'{}{}'.format(subsequent_indent, entry)
self._display.display(indented)
-
-# Reason: disable pylint protected-access because we need to access _*
-# attributes of a task result to implement this method.
-# Status: permanently disabled unless Ansible's API changes.
-# pylint: disable=protected-access
+ failed_checks = set()
+ playbook_context = None
+ # re: result attrs see top comment # pylint: disable=protected-access
+ for failure in failures:
+ # get context from check task result since callback plugins cannot access task vars
+ playbook_context = playbook_context or failure['result']._result.get('playbook_context')
+ failed_checks.update(
+ name
+ for name, result in failure['result']._result.get('checks', {}).items()
+ if result.get('failed')
+ )
+ if failed_checks:
+ self._print_check_failure_summary(failed_checks, playbook_context)
+
+ def _print_check_failure_summary(self, failed_checks, context):
+ checks = ','.join(sorted(failed_checks))
+ # NOTE: context is not set if all failures occurred prior to checks task
+ summary = (
+ '\n'
+ 'The execution of "{playbook}"\n'
+ 'includes checks designed to fail early if the requirements\n'
+ 'of the playbook are not met. One or more of these checks\n'
+ 'failed. To disregard these results, you may choose to\n'
+ 'disable failing checks by setting an Ansible variable:\n\n'
+ ' openshift_disable_check={checks}\n\n'
+ 'Failing check names are shown in the failure details above.\n'
+ 'Some checks may be configurable by variables if your requirements\n'
+ 'are different from the defaults; consult check documentation.\n'
+ 'Variables can be set in the inventory or passed on the\n'
+ 'command line using the -e flag to ansible-playbook.\n'
+ ).format(playbook=self._playbook_file, checks=checks)
+ if context in ['pre-install', 'health']:
+ summary = (
+ '\n'
+ 'You may choose to configure or disable failing checks by\n'
+ 'setting Ansible variables. To disable those above:\n\n'
+ ' openshift_disable_check={checks}\n\n'
+ 'Consult check documentation for configurable variables.\n'
+ 'Variables can be set in the inventory or passed on the\n'
+ 'command line using the -e flag to ansible-playbook.\n'
+ ).format(checks=checks)
+ # other expected contexts: install, upgrade
+ self._display.display(summary)
+
+
+# re: result attrs see top comment # pylint: disable=protected-access
def _format_failure(failure):
'''Return a list of pretty-formatted text entries describing a failure, including
relevant information about it. Expect that the list of text entries will be joined
@@ -101,11 +146,8 @@ def _format_failed_checks(checks):
return stringc(pformat(checks), C.COLOR_ERROR)
-# Reason: disable pylint protected-access because we need to access _*
-# attributes of obj to implement this function.
-# This is inspired by ansible.playbook.base.Base.dump_me.
-# Status: permanently disabled unless Ansible's API changes.
-# pylint: disable=protected-access
+# This is inspired by ansible.playbook.base.Base.dump_me.
+# re: play/task/block attrs see top comment # pylint: disable=protected-access
def _get_play(obj):
'''Given a task or block, recursively tries to find its parent play.'''
if hasattr(obj, '_play'):
diff --git a/roles/openshift_health_checker/library/aos_version.py b/roles/openshift_health_checker/library/aos_version.py
index 191a4b107..4c205e48c 100755
--- a/roles/openshift_health_checker/library/aos_version.py
+++ b/roles/openshift_health_checker/library/aos_version.py
@@ -1,91 +1,203 @@
#!/usr/bin/python
-# vim: expandtab:tabstop=4:shiftwidth=4
'''
-Ansible module for determining if multiple versions of an OpenShift package are
-available, and if the version requested is available down to the given
-precision.
-
-Multiple versions available suggest that multiple repos are enabled for the
-different versions, which may cause installation problems.
+Ansible module for yum-based systems determining if multiple releases
+of an OpenShift package are available, and if the release requested
+(if any) is available down to the given precision.
+
+For Enterprise, multiple releases available suggest that multiple repos
+are enabled for the different releases, which may cause installation
+problems. With Origin, however, this is a normal state of affairs as
+all the releases are provided in a single repo with the expectation that
+only the latest can be installed.
+
+Code in the openshift_version role contains a lot of logic to pin down
+the exact package and image version to use and so does some validation
+of release availability already. Without duplicating all that, we would
+like the user to have a helpful error message if we detect things will
+not work out right. Note that if openshift_release is not specified in
+the inventory, the version comparison checks just pass.
'''
-import yum # pylint: disable=import-error
-
from ansible.module_utils.basic import AnsibleModule
+IMPORT_EXCEPTION = None
+try:
+ import yum # pylint: disable=import-error
+except ImportError as err:
+ IMPORT_EXCEPTION = err
+
+
+class AosVersionException(Exception):
+ '''Base exception class for package version problems'''
+ def __init__(self, message, problem_pkgs=None):
+ Exception.__init__(self, message)
+ self.problem_pkgs = problem_pkgs
+
-def main(): # pylint: disable=missing-docstring,too-many-branches
+def main():
+ """Entrypoint for this Ansible module"""
module = AnsibleModule(
argument_spec=dict(
- prefix=dict(required=True), # atomic-openshift, origin, ...
- version=dict(required=True),
+ package_list=dict(type="list", required=True),
),
supports_check_mode=True
)
- def bail(error): # pylint: disable=missing-docstring
- module.fail_json(msg=error)
+ if IMPORT_EXCEPTION:
+ module.fail_json(msg="aos_version module could not import yum: %s" % IMPORT_EXCEPTION)
- rpm_prefix = module.params['prefix']
+ # determine the packages we will look for
+ package_list = module.params['package_list']
+ if not package_list:
+ module.fail_json(msg="package_list must not be empty")
- if not rpm_prefix:
- bail("prefix must not be empty")
+ # generate set with only the names of expected packages
+ expected_pkg_names = [p["name"] for p in package_list]
+
+ # gather packages that require a multi_minor_release check
+ multi_minor_pkgs = [p for p in package_list if p["check_multi"]]
+
+ # generate list of packages with a specified (non-empty) version
+ # should look like a version string with possibly many segments e.g. "3.4.1"
+ versioned_pkgs = [p for p in package_list if p["version"]]
+
+ # get the list of packages available and complain if anything is wrong
+ try:
+ pkgs = _retrieve_available_packages(expected_pkg_names)
+ if versioned_pkgs:
+ _check_precise_version_found(pkgs, _to_dict(versioned_pkgs))
+ _check_higher_version_found(pkgs, _to_dict(versioned_pkgs))
+ if multi_minor_pkgs:
+ _check_multi_minor_release(pkgs, _to_dict(multi_minor_pkgs))
+ except AosVersionException as excinfo:
+ module.fail_json(msg=str(excinfo))
+ module.exit_json(changed=False)
+
+def _to_dict(pkg_list):
+ return {pkg["name"]: pkg for pkg in pkg_list}
+
+
+def _retrieve_available_packages(expected_pkgs):
+ # search for package versions available for openshift pkgs
yb = yum.YumBase() # pylint: disable=invalid-name
- yb.conf.disable_excludes = ["all"] # assume the openshift excluder will be managed, ignore current state
-
- # search for package versions available for aos pkgs
- expected_pkgs = [
- rpm_prefix,
- rpm_prefix + '-master',
- rpm_prefix + '-node',
- ]
+
+ # The openshift excluder prevents unintended updates to openshift
+ # packages by setting yum excludes on those packages. See:
+ # https://wiki.centos.org/SpecialInterestGroup/PaaS/OpenShift-Origin-Control-Updates
+ # Excludes are then disabled during an install or upgrade, but
+ # this check will most likely be running outside either. When we
+ # attempt to determine what packages are available via yum they may
+ # be excluded. So, for our purposes here, disable excludes to see
+ # what will really be available during an install or upgrade.
+ yb.conf.disable_excludes = ['all']
+
try:
pkgs = yb.pkgSack.returnPackages(patterns=expected_pkgs)
- except yum.Errors.PackageSackError as e: # pylint: disable=invalid-name
+ except yum.Errors.PackageSackError as excinfo:
# you only hit this if *none* of the packages are available
- bail('Unable to find any OpenShift packages.\nCheck your subscription and repo settings.\n%s' % e)
+ raise AosVersionException('\n'.join([
+ 'Unable to find any OpenShift packages.',
+ 'Check your subscription and repo settings.',
+ str(excinfo),
+ ]))
+ return pkgs
+
+
+class PreciseVersionNotFound(AosVersionException):
+ """Exception for reporting packages not available at given version"""
+ def __init__(self, not_found):
+ msg = ['Not all of the required packages are available at their requested version']
+ msg += ['{}:{} '.format(pkg["name"], pkg["version"]) for pkg in not_found]
+ msg += ['Please check your subscriptions and enabled repositories.']
+ AosVersionException.__init__(self, '\n'.join(msg), not_found)
+
+
+def _check_precise_version_found(pkgs, expected_pkgs_dict):
+ # see if any packages couldn't be found at requested release version
+ # we would like to verify that the latest available pkgs have however specific a version is given.
+ # so e.g. if there is a package version 3.4.1.5 the check passes; if only 3.4.0, it fails.
- # determine what level of precision we're expecting for the version
- expected_version = module.params['version']
- if expected_version.startswith('v'): # v3.3 => 3.3
- expected_version = expected_version[1:]
- num_dots = expected_version.count('.')
+ pkgs_precise_version_found = set()
+ for pkg in pkgs:
+ if pkg.name not in expected_pkgs_dict:
+ continue
+ # does the version match, to the precision requested?
+ # and, is it strictly greater, at the precision requested?
+ expected_pkg_version = expected_pkgs_dict[pkg.name]["version"]
+ match_version = '.'.join(pkg.version.split('.')[:expected_pkg_version.count('.') + 1])
+ if match_version == expected_pkg_version:
+ pkgs_precise_version_found.add(pkg.name)
+
+ not_found = []
+ for name, pkg in expected_pkgs_dict.items():
+ if name not in pkgs_precise_version_found:
+ not_found.append(pkg)
+
+ if not_found:
+ raise PreciseVersionNotFound(not_found)
+
+
+class FoundHigherVersion(AosVersionException):
+ """Exception for reporting that a higher version than requested is available"""
+ def __init__(self, higher_found):
+ msg = ['Some required package(s) are available at a version',
+ 'that is higher than requested']
+ msg += [' ' + name for name in higher_found]
+ msg += ['This will prevent installing the version you requested.']
+ msg += ['Please check your enabled repositories or adjust openshift_release.']
+ AosVersionException.__init__(self, '\n'.join(msg), higher_found)
+
+
+def _check_higher_version_found(pkgs, expected_pkgs_dict):
+ expected_pkg_names = list(expected_pkgs_dict)
+ # see if any packages are available in a version higher than requested
+ higher_version_for_pkg = {}
+ for pkg in pkgs:
+ if pkg.name not in expected_pkg_names:
+ continue
+ expected_pkg_version = expected_pkgs_dict[pkg.name]["version"]
+ req_release_arr = [int(segment) for segment in expected_pkg_version.split(".")]
+ version = [int(segment) for segment in pkg.version.split(".")]
+ too_high = version[:len(req_release_arr)] > req_release_arr
+ higher_than_seen = version > higher_version_for_pkg.get(pkg.name, [])
+ if too_high and higher_than_seen:
+ higher_version_for_pkg[pkg.name] = version
+
+ if higher_version_for_pkg:
+ higher_found = []
+ for name, version in higher_version_for_pkg.items():
+ higher_found.append(name + '-' + '.'.join(str(segment) for segment in version))
+ raise FoundHigherVersion(higher_found)
+
+
+class FoundMultiRelease(AosVersionException):
+ """Exception for reporting multiple minor releases found for same package"""
+ def __init__(self, multi_found):
+ msg = ['Multiple minor versions of these packages are available']
+ msg += [' ' + name for name in multi_found]
+ msg += ["There should only be one OpenShift release repository enabled at a time."]
+ AosVersionException.__init__(self, '\n'.join(msg), multi_found)
+
+
+def _check_multi_minor_release(pkgs, expected_pkgs_dict):
+ # see if any packages are available in more than one minor version
pkgs_by_name_version = {}
- pkgs_precise_version_found = {}
for pkg in pkgs:
- # get expected version precision
- match_version = '.'.join(pkg.version.split('.')[:num_dots + 1])
- if match_version == expected_version:
- pkgs_precise_version_found[pkg.name] = True
- # get x.y version precision
- minor_version = '.'.join(pkg.version.split('.')[:2])
+ # keep track of x.y (minor release) versions seen
+ minor_release = '.'.join(pkg.version.split('.')[:2])
if pkg.name not in pkgs_by_name_version:
- pkgs_by_name_version[pkg.name] = {}
- pkgs_by_name_version[pkg.name][minor_version] = True
+ pkgs_by_name_version[pkg.name] = set()
+ pkgs_by_name_version[pkg.name].add(minor_release)
- # see if any packages couldn't be found at requested version
- # see if any packages are available in more than one minor version
- not_found = []
multi_found = []
- for name in expected_pkgs:
- if name not in pkgs_precise_version_found:
- not_found.append(name)
+ for name in expected_pkgs_dict:
if name in pkgs_by_name_version and len(pkgs_by_name_version[name]) > 1:
multi_found.append(name)
- if not_found:
- msg = 'Not all of the required packages are available at requested version %s:\n' % expected_version
- for name in not_found:
- msg += ' %s\n' % name
- bail(msg + 'Please check your subscriptions and enabled repositories.')
- if multi_found:
- msg = 'Multiple minor versions of these packages are available\n'
- for name in multi_found:
- msg += ' %s\n' % name
- bail(msg + "There should only be one OpenShift version's repository enabled at a time.")
- module.exit_json(changed=False)
+ if multi_found:
+ raise FoundMultiRelease(multi_found)
if __name__ == '__main__':
diff --git a/roles/openshift_health_checker/library/check_yum_update.py b/roles/openshift_health_checker/library/check_yum_update.py
index 630ebc848..433795b67 100755
--- a/roles/openshift_health_checker/library/check_yum_update.py
+++ b/roles/openshift_health_checker/library/check_yum_update.py
@@ -1,5 +1,4 @@
#!/usr/bin/python
-# vim: expandtab:tabstop=4:shiftwidth=4
'''
Ansible module to test whether a yum update or install will succeed,
without actually performing it or running yum.
diff --git a/roles/openshift_health_checker/library/etcdkeysize.py b/roles/openshift_health_checker/library/etcdkeysize.py
new file mode 100644
index 000000000..620e82d87
--- /dev/null
+++ b/roles/openshift_health_checker/library/etcdkeysize.py
@@ -0,0 +1,122 @@
+#!/usr/bin/python
+"""Ansible module that recursively determines if the size of a key in an etcd cluster exceeds a given limit."""
+
+from ansible.module_utils.basic import AnsibleModule
+
+
+try:
+ import etcd
+
+ IMPORT_EXCEPTION_MSG = None
+except ImportError as err:
+ IMPORT_EXCEPTION_MSG = str(err)
+
+ from collections import namedtuple
+ EtcdMock = namedtuple("etcd", ["EtcdKeyNotFound"])
+ etcd = EtcdMock(KeyError)
+
+
+# pylint: disable=too-many-arguments
+def check_etcd_key_size(client, key, size_limit, total_size=0, depth=0, depth_limit=1000, visited=None):
+ """Check size of an etcd path starting at given key. Returns tuple (string, bool)"""
+ if visited is None:
+ visited = set()
+
+ if key in visited:
+ return 0, False
+
+ visited.add(key)
+
+ try:
+ result = client.read(key, recursive=False)
+ except etcd.EtcdKeyNotFound:
+ return 0, False
+
+ size = 0
+ limit_exceeded = False
+
+ for node in result.leaves:
+ if depth >= depth_limit:
+ raise Exception("Maximum recursive stack depth ({}) exceeded.".format(depth_limit))
+
+ if size_limit and total_size + size > size_limit:
+ return size, True
+
+ if not node.dir:
+ size += len(node.value)
+ continue
+
+ key_size, limit_exceeded = check_etcd_key_size(client, node.key,
+ size_limit,
+ total_size + size,
+ depth + 1,
+ depth_limit, visited)
+ size += key_size
+
+ max_limit_exceeded = limit_exceeded or (total_size + size > size_limit)
+ return size, max_limit_exceeded
+
+
+def main(): # pylint: disable=missing-docstring,too-many-branches
+ module = AnsibleModule(
+ argument_spec=dict(
+ size_limit_bytes=dict(type="int", default=0),
+ paths=dict(type="list", default=["/openshift.io/images"]),
+ host=dict(type="str", default="127.0.0.1"),
+ port=dict(type="int", default=4001),
+ protocol=dict(type="str", default="http"),
+ version_prefix=dict(type="str", default=""),
+ allow_redirect=dict(type="bool", default=False),
+ cert=dict(type="dict", default=""),
+ ca_cert=dict(type="str", default=None),
+ ),
+ supports_check_mode=True
+ )
+
+ module.params["cert"] = (
+ module.params["cert"]["cert"],
+ module.params["cert"]["key"],
+ )
+
+ size_limit = module.params.pop("size_limit_bytes")
+ paths = module.params.pop("paths")
+
+ limit_exceeded = False
+
+ try:
+ # pylint: disable=no-member
+ client = etcd.Client(**module.params)
+ except AttributeError as attrerr:
+ msg = str(attrerr)
+ if IMPORT_EXCEPTION_MSG:
+ msg = IMPORT_EXCEPTION_MSG
+ if "No module named etcd" in IMPORT_EXCEPTION_MSG:
+ # pylint: disable=redefined-variable-type
+ msg = ('Unable to import the python "etcd" dependency. '
+ 'Make sure python-etcd is installed on the host.')
+
+ module.exit_json(
+ failed=True,
+ changed=False,
+ size_limit_exceeded=limit_exceeded,
+ msg=msg,
+ )
+
+ return
+
+ size = 0
+ for path in paths:
+ path_size, limit_exceeded = check_etcd_key_size(client, path, size_limit - size)
+ size += path_size
+
+ if limit_exceeded:
+ break
+
+ module.exit_json(
+ changed=False,
+ size_limit_exceeded=limit_exceeded,
+ )
+
+
+if __name__ == '__main__':
+ main()
diff --git a/roles/openshift_health_checker/library/ocutil.py b/roles/openshift_health_checker/library/ocutil.py
new file mode 100644
index 000000000..2e60735d6
--- /dev/null
+++ b/roles/openshift_health_checker/library/ocutil.py
@@ -0,0 +1,74 @@
+#!/usr/bin/python
+"""Interface to OpenShift oc command"""
+
+import os
+import shlex
+import shutil
+import subprocess
+
+from ansible.module_utils.basic import AnsibleModule
+
+
+ADDITIONAL_PATH_LOOKUPS = ['/usr/local/bin', os.path.expanduser('~/bin')]
+
+
+def locate_oc_binary():
+ """Find and return oc binary file"""
+ # https://github.com/openshift/openshift-ansible/issues/3410
+ # oc can be in /usr/local/bin in some cases, but that may not
+ # be in $PATH due to ansible/sudo
+ paths = os.environ.get("PATH", os.defpath).split(os.pathsep) + ADDITIONAL_PATH_LOOKUPS
+
+ oc_binary = 'oc'
+
+ # Use shutil.which if it is available, otherwise fallback to a naive path search
+ try:
+ which_result = shutil.which(oc_binary, path=os.pathsep.join(paths))
+ if which_result is not None:
+ oc_binary = which_result
+ except AttributeError:
+ for path in paths:
+ if os.path.exists(os.path.join(path, oc_binary)):
+ oc_binary = os.path.join(path, oc_binary)
+ break
+
+ return oc_binary
+
+
+def main():
+ """Module that executes commands on a remote OpenShift cluster"""
+
+ module = AnsibleModule(
+ argument_spec=dict(
+ namespace=dict(type="str", required=True),
+ config_file=dict(type="str", required=True),
+ cmd=dict(type="str", required=True),
+ extra_args=dict(type="list", default=[]),
+ ),
+ )
+
+ cmd = [
+ locate_oc_binary(),
+ '--config', module.params["config_file"],
+ '-n', module.params["namespace"],
+ ] + shlex.split(module.params["cmd"])
+
+ failed = True
+ try:
+ cmd_result = subprocess.check_output(list(cmd), stderr=subprocess.STDOUT)
+ failed = False
+ except subprocess.CalledProcessError as exc:
+ cmd_result = '[rc {}] {}\n{}'.format(exc.returncode, ' '.join(exc.cmd), exc.output)
+ except OSError as exc:
+ # we get this when 'oc' is not there
+ cmd_result = str(exc)
+
+ module.exit_json(
+ changed=False,
+ failed=failed,
+ result=cmd_result,
+ )
+
+
+if __name__ == '__main__':
+ main()
diff --git a/roles/openshift_health_checker/library/rpm_version.py b/roles/openshift_health_checker/library/rpm_version.py
new file mode 100644
index 000000000..8ea223055
--- /dev/null
+++ b/roles/openshift_health_checker/library/rpm_version.py
@@ -0,0 +1,127 @@
+#!/usr/bin/python
+"""
+Ansible module for rpm-based systems determining existing package version information in a host.
+"""
+
+from ansible.module_utils.basic import AnsibleModule
+
+IMPORT_EXCEPTION = None
+try:
+ import rpm # pylint: disable=import-error
+except ImportError as err:
+ IMPORT_EXCEPTION = err # in tox test env, rpm import fails
+
+
+class RpmVersionException(Exception):
+ """Base exception class for package version problems"""
+ def __init__(self, message, problem_pkgs=None):
+ Exception.__init__(self, message)
+ self.problem_pkgs = problem_pkgs
+
+
+def main():
+ """Entrypoint for this Ansible module"""
+ module = AnsibleModule(
+ argument_spec=dict(
+ package_list=dict(type="list", required=True),
+ ),
+ supports_check_mode=True
+ )
+
+ if IMPORT_EXCEPTION:
+ module.fail_json(msg="rpm_version module could not import rpm: %s" % IMPORT_EXCEPTION)
+
+ # determine the packages we will look for
+ pkg_list = module.params['package_list']
+ if not pkg_list:
+ module.fail_json(msg="package_list must not be empty")
+
+ # get list of packages available and complain if any
+ # of them are missing or if any errors occur
+ try:
+ pkg_versions = _retrieve_expected_pkg_versions(_to_dict(pkg_list))
+ _check_pkg_versions(pkg_versions, _to_dict(pkg_list))
+ except RpmVersionException as excinfo:
+ module.fail_json(msg=str(excinfo))
+ module.exit_json(changed=False)
+
+
+def _to_dict(pkg_list):
+ return {pkg["name"]: pkg for pkg in pkg_list}
+
+
+def _retrieve_expected_pkg_versions(expected_pkgs_dict):
+ """Search for installed packages matching given pkg names
+ and versions. Returns a dictionary: {pkg_name: [versions]}"""
+
+ transaction = rpm.TransactionSet()
+ pkgs = {}
+
+ for pkg_name in expected_pkgs_dict:
+ matched_pkgs = transaction.dbMatch("name", pkg_name)
+ if not matched_pkgs:
+ continue
+
+ for header in matched_pkgs:
+ if header['name'] == pkg_name:
+ if pkg_name not in pkgs:
+ pkgs[pkg_name] = []
+
+ pkgs[pkg_name].append(header['version'])
+
+ return pkgs
+
+
+def _check_pkg_versions(found_pkgs_dict, expected_pkgs_dict):
+ invalid_pkg_versions = {}
+ not_found_pkgs = []
+
+ for pkg_name, pkg in expected_pkgs_dict.items():
+ if not found_pkgs_dict.get(pkg_name):
+ not_found_pkgs.append(pkg_name)
+ continue
+
+ found_versions = [_parse_version(version) for version in found_pkgs_dict[pkg_name]]
+ expected_version = _parse_version(pkg["version"])
+ if expected_version not in found_versions:
+ invalid_pkg_versions[pkg_name] = {
+ "found_versions": found_versions,
+ "required_version": expected_version,
+ }
+
+ if not_found_pkgs:
+ raise RpmVersionException(
+ '\n'.join([
+ "The following packages were not found to be installed: {}".format('\n '.join([
+ "{}".format(pkg)
+ for pkg in not_found_pkgs
+ ]))
+ ]),
+ not_found_pkgs,
+ )
+
+ if invalid_pkg_versions:
+ raise RpmVersionException(
+ '\n '.join([
+ "The following packages were found to be installed with an incorrect version: {}".format('\n'.join([
+ " \n{}\n Required version: {}\n Found versions: {}".format(
+ pkg_name,
+ pkg["required_version"],
+ ', '.join([version for version in pkg["found_versions"]]))
+ for pkg_name, pkg in invalid_pkg_versions.items()
+ ]))
+ ]),
+ invalid_pkg_versions,
+ )
+
+
+def _parse_version(version_str):
+ segs = version_str.split('.')
+ if not segs or len(segs) <= 2:
+ return version_str
+
+ return '.'.join(segs[0:2])
+
+
+if __name__ == '__main__':
+ main()
diff --git a/roles/openshift_health_checker/meta/main.yml b/roles/openshift_health_checker/meta/main.yml
index 0bbeadd34..4d141974c 100644
--- a/roles/openshift_health_checker/meta/main.yml
+++ b/roles/openshift_health_checker/meta/main.yml
@@ -1,3 +1,5 @@
---
dependencies:
- role: openshift_facts
+ - role: openshift_repos
+ - role: openshift_version
diff --git a/roles/openshift_health_checker/openshift_checks/__init__.py b/roles/openshift_health_checker/openshift_checks/__init__.py
index be63d864a..5c9949ced 100644
--- a/roles/openshift_health_checker/openshift_checks/__init__.py
+++ b/roles/openshift_health_checker/openshift_checks/__init__.py
@@ -66,16 +66,26 @@ class OpenShiftCheck(object):
LOADER_EXCLUDES = (
"__init__.py",
"mixins.py",
+ "logging.py",
)
-def load_checks():
+def load_checks(path=None, subpkg=""):
"""Dynamically import all check modules for the side effect of registering checks."""
- return [
- import_module(__package__ + "." + name[:-3])
- for name in os.listdir(os.path.dirname(__file__))
- if name.endswith(".py") and name not in LOADER_EXCLUDES
- ]
+ if path is None:
+ path = os.path.dirname(__file__)
+
+ modules = []
+
+ for name in os.listdir(path):
+ if os.path.isdir(os.path.join(path, name)):
+ modules = modules + load_checks(os.path.join(path, name), subpkg + "." + name)
+ continue
+
+ if name.endswith(".py") and name not in LOADER_EXCLUDES:
+ modules.append(import_module(__package__ + subpkg + "." + name[:-3]))
+
+ return modules
def get_var(task_vars, *keys, **kwargs):
diff --git a/roles/openshift_health_checker/openshift_checks/disk_availability.py b/roles/openshift_health_checker/openshift_checks/disk_availability.py
new file mode 100644
index 000000000..e93e81efa
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/disk_availability.py
@@ -0,0 +1,115 @@
+"""Check that there is enough disk space in predefined paths."""
+
+import os.path
+import tempfile
+
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
+
+
+class DiskAvailability(OpenShiftCheck):
+ """Check that recommended disk space is available before a first-time install."""
+
+ name = "disk_availability"
+ tags = ["preflight"]
+
+ # Values taken from the official installation documentation:
+ # https://docs.openshift.org/latest/install_config/install/prerequisites.html#system-requirements
+ recommended_disk_space_bytes = {
+ '/var': {
+ 'masters': 40 * 10**9,
+ 'nodes': 15 * 10**9,
+ 'etcd': 20 * 10**9,
+ },
+ # Used to copy client binaries into,
+ # see roles/openshift_cli/library/openshift_container_binary_sync.py.
+ '/usr/local/bin': {
+ 'masters': 1 * 10**9,
+ 'nodes': 1 * 10**9,
+ 'etcd': 1 * 10**9,
+ },
+ # Used as temporary storage in several cases.
+ tempfile.gettempdir(): {
+ 'masters': 1 * 10**9,
+ 'nodes': 1 * 10**9,
+ 'etcd': 1 * 10**9,
+ },
+ }
+
+ @classmethod
+ def is_active(cls, task_vars):
+ """Skip hosts that do not have recommended disk space requirements."""
+ group_names = get_var(task_vars, "group_names", default=[])
+ active_groups = set()
+ for recommendation in cls.recommended_disk_space_bytes.values():
+ active_groups.update(recommendation.keys())
+ has_disk_space_recommendation = bool(active_groups.intersection(group_names))
+ return super(DiskAvailability, cls).is_active(task_vars) and has_disk_space_recommendation
+
+ def run(self, tmp, task_vars):
+ group_names = get_var(task_vars, "group_names")
+ ansible_mounts = get_var(task_vars, "ansible_mounts")
+ ansible_mounts = {mount['mount']: mount for mount in ansible_mounts}
+
+ user_config = get_var(task_vars, "openshift_check_min_host_disk_gb", default={})
+ try:
+ # For backwards-compatibility, if openshift_check_min_host_disk_gb
+ # is a number, then it overrides the required config for '/var'.
+ number = float(user_config)
+ user_config = {
+ '/var': {
+ 'masters': number,
+ 'nodes': number,
+ 'etcd': number,
+ },
+ }
+ except TypeError:
+ # If it is not a number, then it should be a nested dict.
+ pass
+
+ # TODO: as suggested in
+ # https://github.com/openshift/openshift-ansible/pull/4436#discussion_r122180021,
+ # maybe we could support checking disk availability in paths that are
+ # not part of the official recommendation but present in the user
+ # configuration.
+ for path, recommendation in self.recommended_disk_space_bytes.items():
+ free_bytes = self.free_bytes(path, ansible_mounts)
+ recommended_bytes = max(recommendation.get(name, 0) for name in group_names)
+
+ config = user_config.get(path, {})
+ # NOTE: the user config is in GB, but we compare bytes, thus the
+ # conversion.
+ config_bytes = max(config.get(name, 0) for name in group_names) * 10**9
+ recommended_bytes = config_bytes or recommended_bytes
+
+ if free_bytes < recommended_bytes:
+ free_gb = float(free_bytes) / 10**9
+ recommended_gb = float(recommended_bytes) / 10**9
+ return {
+ 'failed': True,
+ 'msg': (
+ 'Available disk space in "{}" ({:.1f} GB) '
+ 'is below minimum recommended ({:.1f} GB)'
+ ).format(path, free_gb, recommended_gb)
+ }
+
+ return {}
+
+ @staticmethod
+ def free_bytes(path, ansible_mounts):
+ """Return the size available in path based on ansible_mounts."""
+ mount_point = path
+ # arbitry value to prevent an infinite loop, in the unlike case that '/'
+ # is not in ansible_mounts.
+ max_depth = 32
+ while mount_point not in ansible_mounts and max_depth > 0:
+ mount_point = os.path.dirname(mount_point)
+ max_depth -= 1
+
+ try:
+ free_bytes = ansible_mounts[mount_point]['size_available']
+ except KeyError:
+ known_mounts = ', '.join('"{}"'.format(mount) for mount in sorted(ansible_mounts)) or 'none'
+ msg = 'Unable to determine disk availability for "{}". Known mount points: {}.'
+ raise OpenShiftCheckException(msg.format(path, known_mounts))
+
+ return free_bytes
diff --git a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
index cce289b95..26bf4c09b 100644
--- a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
@@ -1,8 +1,25 @@
-# pylint: disable=missing-docstring
+"""Check that required Docker images are available."""
+
from openshift_checks import OpenShiftCheck, get_var
+from openshift_checks.mixins import DockerHostMixin
+
+
+NODE_IMAGE_SUFFIXES = ["haproxy-router", "docker-registry", "deployer", "pod"]
+DEPLOYMENT_IMAGE_INFO = {
+ "origin": {
+ "namespace": "openshift",
+ "name": "origin",
+ "registry_console_image": "cockpit/kubernetes",
+ },
+ "openshift-enterprise": {
+ "namespace": "openshift3",
+ "name": "ose",
+ "registry_console_image": "registry.access.redhat.com/openshift3/registry-console",
+ },
+}
-class DockerImageAvailability(OpenShiftCheck):
+class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
"""Check that required Docker images are available.
This check attempts to ensure that required docker images are
@@ -12,42 +29,36 @@ class DockerImageAvailability(OpenShiftCheck):
name = "docker_image_availability"
tags = ["preflight"]
+ dependencies = ["skopeo", "python-docker-py"]
- skopeo_image = "openshift/openshift-ansible"
-
- # FIXME(juanvallejo): we should consider other possible values of
- # `deployment_type` (the key here). See
- # https://github.com/openshift/openshift-ansible/blob/8e26f8c/roles/openshift_repos/vars/main.yml#L7
- docker_image_base = {
- "origin": {
- "repo": "openshift",
- "image": "origin",
- },
- "openshift-enterprise": {
- "repo": "openshift3",
- "image": "ose",
- },
- }
+ @classmethod
+ def is_active(cls, task_vars):
+ """Skip hosts with unsupported deployment types."""
+ deployment_type = get_var(task_vars, "openshift_deployment_type")
+ has_valid_deployment_type = deployment_type in DEPLOYMENT_IMAGE_INFO
- def run(self, tmp, task_vars):
- required_images = self.required_images(task_vars)
- missing_images = set(required_images) - set(self.local_images(required_images, task_vars))
+ return super(DockerImageAvailability, cls).is_active(task_vars) and has_valid_deployment_type
- # exit early if all images were found locally
- if not missing_images:
- return {"changed": False}
-
- msg, failed, changed = self.update_skopeo_image(task_vars)
-
- # exit early if Skopeo update fails
+ def run(self, tmp, task_vars):
+ msg, failed, changed = self.ensure_dependencies(task_vars)
if failed:
return {
"failed": True,
"changed": changed,
- "msg": "Failed to update Skopeo image ({img_name}). {msg}".format(img_name=self.skopeo_image, msg=msg),
+ "msg": "Some dependencies are required in order to check Docker image availability.\n" + msg
}
+ required_images = self.required_images(task_vars)
+ missing_images = set(required_images) - set(self.local_images(required_images, task_vars))
+
+ # exit early if all images were found locally
+ if not missing_images:
+ return {"changed": changed}
+
registries = self.known_docker_registries(task_vars)
+ if not registries:
+ return {"failed": True, "msg": "Unable to retrieve any docker registries.", "changed": changed}
+
available_images = self.available_images(missing_images, registries, task_vars)
unavailable_images = set(missing_images) - set(available_images)
@@ -55,43 +66,63 @@ class DockerImageAvailability(OpenShiftCheck):
return {
"failed": True,
"msg": (
- "One or more required images are not available: {}.\n"
+ "One or more required Docker images are not available:\n {}\n"
"Configured registries: {}"
- ).format(", ".join(sorted(unavailable_images)), ", ".join(registries)),
+ ).format(",\n ".join(sorted(unavailable_images)), ", ".join(registries)),
"changed": changed,
}
return {"changed": changed}
- def required_images(self, task_vars):
- deployment_type = get_var(task_vars, "deployment_type")
- # FIXME(juanvallejo): we should handle gracefully with a proper error
- # message when given an unexpected value for `deployment_type`.
- image_base_name = self.docker_image_base[deployment_type]
-
- openshift_release = get_var(task_vars, "openshift_release")
- # FIXME(juanvallejo): this variable is not required when the
- # installation is non-containerized. The example inventories have it
- # commented out. We should handle gracefully and with a proper error
- # message when this variable is required and not set.
- openshift_image_tag = get_var(task_vars, "openshift_image_tag")
-
- is_containerized = get_var(task_vars, "openshift", "common", "is_containerized")
-
- if is_containerized:
- images = set(self.containerized_docker_images(image_base_name, openshift_release))
- else:
- images = set(self.rpm_docker_images(image_base_name, openshift_release))
-
- # append images with qualified image tags to our list of required images.
- # these are images with a (v0.0.0.0) tag, rather than a standard release
- # format tag (v0.0). We want to check this set in both containerized and
- # non-containerized installations.
- images.update(
- self.qualified_docker_images(self.image_from_base_name(image_base_name), "v" + openshift_image_tag)
- )
-
- return images
+ @staticmethod
+ def required_images(task_vars):
+ """
+ Determine which images we expect to need for this host.
+ Returns: a set of required images like 'openshift/origin:v3.6'
+
+ The thorny issue of determining the image names from the variables is under consideration
+ via https://github.com/openshift/openshift-ansible/issues/4415
+
+ For now we operate as follows:
+ * For containerized components (master, node, ...) we look at the deployment type and
+ use openshift/origin or openshift3/ose as the base for those component images. The
+ version is openshift_image_tag as determined by the openshift_version role.
+ * For OpenShift-managed infrastructure (router, registry...) we use oreg_url if
+ it is defined; otherwise we again use the base that depends on the deployment type.
+ Registry is not included in constructed images. It may be in oreg_url or etcd image.
+ """
+ required = set()
+ deployment_type = get_var(task_vars, "openshift_deployment_type")
+ host_groups = get_var(task_vars, "group_names")
+ image_tag = get_var(task_vars, "openshift_image_tag")
+ image_info = DEPLOYMENT_IMAGE_INFO[deployment_type]
+ if not image_info:
+ return required
+
+ # template for images that run on top of OpenShift
+ image_url = "{}/{}-{}:{}".format(image_info["namespace"], image_info["name"], "${component}", "${version}")
+ image_url = get_var(task_vars, "oreg_url", default="") or image_url
+ if 'nodes' in host_groups:
+ for suffix in NODE_IMAGE_SUFFIXES:
+ required.add(image_url.replace("${component}", suffix).replace("${version}", image_tag))
+ # The registry-console is for some reason not prefixed with ose- like the other components.
+ # Nor is it versioned the same, so just look for latest.
+ # Also a completely different name is used for Origin.
+ required.add(image_info["registry_console_image"])
+
+ # images for containerized components
+ if get_var(task_vars, "openshift", "common", "is_containerized"):
+ components = set()
+ if 'nodes' in host_groups:
+ components.update(["node", "openvswitch"])
+ if 'masters' in host_groups: # name is "origin" or "ose"
+ components.add(image_info["name"])
+ for component in components:
+ required.add("{}/{}:{}".format(image_info["namespace"], component, image_tag))
+ if 'etcd' in host_groups: # special case, note it is the same for origin/enterprise
+ required.add("registry.access.redhat.com/rhel7/etcd") # and no image tag
+
+ return required
def local_images(self, images, task_vars):
"""Filter a list of images and return those available locally."""
@@ -101,79 +132,46 @@ class DockerImageAvailability(OpenShiftCheck):
]
def is_image_local(self, image, task_vars):
- result = self.module_executor("docker_image_facts", {"name": image}, task_vars)
+ """Check if image is already in local docker index."""
+ result = self.execute_module("docker_image_facts", {"name": image}, task_vars=task_vars)
if result.get("failed", False):
return False
return bool(result.get("images", []))
- def known_docker_registries(self, task_vars):
- result = self.module_executor("docker_info", {}, task_vars)
+ @staticmethod
+ def known_docker_registries(task_vars):
+ """Build a list of docker registries available according to inventory vars."""
+ docker_facts = get_var(task_vars, "openshift", "docker")
+ regs = set(docker_facts["additional_registries"])
- if result.get("failed", False):
- return []
+ deployment_type = get_var(task_vars, "openshift_deployment_type")
+ if deployment_type == "origin":
+ regs.update(["docker.io"])
+ elif "enterprise" in deployment_type:
+ regs.update(["registry.access.redhat.com"])
- # FIXME(juanvallejo): wrong default type, result["info"] is expected to
- # contain a dictionary (see how we call `docker_info.get` below).
- docker_info = result.get("info", "")
- return [registry.get("Name", "") for registry in docker_info.get("Registries", {})]
+ return list(regs)
def available_images(self, images, registries, task_vars):
"""Inspect existing images using Skopeo and return all images successfully inspected."""
return [
image for image in images
- if self.is_image_available(image, registries, task_vars)
+ if self.is_available_skopeo_image(image, registries, task_vars)
]
- def is_image_available(self, image, registries, task_vars):
+ def is_available_skopeo_image(self, image, registries, task_vars):
+ """Use Skopeo to determine if required image exists in known registry(s)."""
+
+ # if image does already includes a registry, just use that
+ if image.count("/") > 1:
+ registry, image = image.split("/", 1)
+ registries = [registry]
+
for registry in registries:
- if self.is_available_skopeo_image(image, registry, task_vars):
+ args = {"_raw_params": "skopeo inspect --tls-verify=false docker://{}/{}".format(registry, image)}
+ result = self.execute_module("command", args, task_vars=task_vars)
+ if result.get("rc", 0) == 0 and not result.get("failed"):
return True
return False
-
- def is_available_skopeo_image(self, image, registry, task_vars):
- """Uses Skopeo to determine if required image exists in a given registry."""
-
- cmd_str = "skopeo inspect docker://{registry}/{image}".format(
- registry=registry,
- image=image,
- )
-
- args = {
- "name": "skopeo_inspect",
- "image": self.skopeo_image,
- "command": cmd_str,
- "detach": False,
- "cleanup": True,
- }
- result = self.module_executor("docker_container", args, task_vars)
- return result.get("failed", False)
-
- def containerized_docker_images(self, base_name, version):
- return [
- "{image}:{version}".format(image=self.image_from_base_name(base_name), version=version)
- ]
-
- @staticmethod
- def rpm_docker_images(base, version):
- return [
- "{image_repo}/registry-console:{version}".format(image_repo=base["repo"], version=version)
- ]
-
- @staticmethod
- def qualified_docker_images(image_name, version):
- return [
- "{}-{}:{}".format(image_name, component, version)
- for component in "haproxy-router docker-registry deployer pod".split()
- ]
-
- @staticmethod
- def image_from_base_name(base):
- return "".join([base["repo"], "/", base["image"]])
-
- # ensures that the skopeo docker image exists, and updates it
- # with latest if image was already present locally.
- def update_skopeo_image(self, task_vars):
- result = self.module_executor("docker_image", {"name": self.skopeo_image}, task_vars)
- return result.get("msg", ""), result.get("failed", False), result.get("changed", False)
diff --git a/roles/openshift_health_checker/openshift_checks/docker_storage.py b/roles/openshift_health_checker/openshift_checks/docker_storage.py
new file mode 100644
index 000000000..8d0fbcc9c
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/docker_storage.py
@@ -0,0 +1,185 @@
+"""Check Docker storage driver and usage."""
+import json
+import re
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
+from openshift_checks.mixins import DockerHostMixin
+
+
+class DockerStorage(DockerHostMixin, OpenShiftCheck):
+ """Check Docker storage driver compatibility.
+
+ This check ensures that Docker is using a supported storage driver,
+ and that loopback is not being used (if using devicemapper).
+ Also that storage usage is not above threshold.
+ """
+
+ name = "docker_storage"
+ tags = ["pre-install", "health", "preflight"]
+
+ dependencies = ["python-docker-py"]
+ storage_drivers = ["devicemapper", "overlay2"]
+ max_thinpool_data_usage_percent = 90.0
+ max_thinpool_meta_usage_percent = 90.0
+
+ # pylint: disable=too-many-return-statements
+ # Reason: permanent stylistic exception;
+ # it is clearer to return on failures and there are just many ways to fail here.
+ def run(self, tmp, task_vars):
+ msg, failed, changed = self.ensure_dependencies(task_vars)
+ if failed:
+ return {
+ "failed": True,
+ "changed": changed,
+ "msg": "Some dependencies are required in order to query docker storage on host:\n" + msg
+ }
+
+ # attempt to get the docker info hash from the API
+ info = self.execute_module("docker_info", {}, task_vars=task_vars)
+ if info.get("failed"):
+ return {"failed": True, "changed": changed,
+ "msg": "Failed to query Docker API. Is docker running on this host?"}
+ if not info.get("info"): # this would be very strange
+ return {"failed": True, "changed": changed,
+ "msg": "Docker API query missing info:\n{}".format(json.dumps(info))}
+ info = info["info"]
+
+ # check if the storage driver we saw is valid
+ driver = info.get("Driver", "[NONE]")
+ if driver not in self.storage_drivers:
+ msg = (
+ "Detected unsupported Docker storage driver '{driver}'.\n"
+ "Supported storage drivers are: {drivers}"
+ ).format(driver=driver, drivers=', '.join(self.storage_drivers))
+ return {"failed": True, "changed": changed, "msg": msg}
+
+ # driver status info is a list of tuples; convert to dict and validate based on driver
+ driver_status = {item[0]: item[1] for item in info.get("DriverStatus", [])}
+ if driver == "devicemapper":
+ if driver_status.get("Data loop file"):
+ msg = (
+ "Use of loopback devices with the Docker devicemapper storage driver\n"
+ "(the default storage configuration) is unsupported in production.\n"
+ "Please use docker-storage-setup to configure a backing storage volume.\n"
+ "See http://red.ht/2rNperO for further information."
+ )
+ return {"failed": True, "changed": changed, "msg": msg}
+ result = self._check_dm_usage(driver_status, task_vars)
+ result['changed'] = result.get('changed', False) or changed
+ return result
+
+ # TODO(lmeyer): determine how to check usage for overlay2
+
+ return {"changed": changed}
+
+ def _check_dm_usage(self, driver_status, task_vars):
+ """
+ Backing assumptions: We expect devicemapper to be backed by an auto-expanding thin pool
+ implemented as an LV in an LVM2 VG. This is how docker-storage-setup currently configures
+ devicemapper storage. The LV is "thin" because it does not use all available storage
+ from its VG, instead expanding as needed; so to determine available space, we gather
+ current usage as the Docker API reports for the driver as well as space available for
+ expansion in the pool's VG.
+ Usage within the LV is divided into pools allocated to data and metadata, either of which
+ could run out of space first; so we check both.
+ """
+ vals = dict(
+ vg_free=self._get_vg_free(driver_status.get("Pool Name"), task_vars),
+ data_used=driver_status.get("Data Space Used"),
+ data_total=driver_status.get("Data Space Total"),
+ metadata_used=driver_status.get("Metadata Space Used"),
+ metadata_total=driver_status.get("Metadata Space Total"),
+ )
+
+ # convert all human-readable strings to bytes
+ for key, value in vals.copy().items():
+ try:
+ vals[key + "_bytes"] = self._convert_to_bytes(value)
+ except ValueError as err: # unlikely to hit this from API info, but just to be safe
+ return {
+ "failed": True,
+ "values": vals,
+ "msg": "Could not interpret {} value '{}' as bytes: {}".format(key, value, str(err))
+ }
+
+ # determine the threshold percentages which usage should not exceed
+ for name, default in [("data", self.max_thinpool_data_usage_percent),
+ ("metadata", self.max_thinpool_meta_usage_percent)]:
+ percent = get_var(task_vars, "max_thinpool_" + name + "_usage_percent", default=default)
+ try:
+ vals[name + "_threshold"] = float(percent)
+ except ValueError:
+ return {
+ "failed": True,
+ "msg": "Specified thinpool {} usage limit '{}' is not a percentage".format(name, percent)
+ }
+
+ # test whether the thresholds are exceeded
+ messages = []
+ for name in ["data", "metadata"]:
+ vals[name + "_pct_used"] = 100 * vals[name + "_used_bytes"] / (
+ vals[name + "_total_bytes"] + vals["vg_free_bytes"])
+ if vals[name + "_pct_used"] > vals[name + "_threshold"]:
+ messages.append(
+ "Docker thinpool {name} usage percentage {pct:.1f} "
+ "is higher than threshold {thresh:.1f}.".format(
+ name=name,
+ pct=vals[name + "_pct_used"],
+ thresh=vals[name + "_threshold"],
+ ))
+ vals["failed"] = True
+
+ vals["msg"] = "\n".join(messages or ["Thinpool usage is within thresholds."])
+ return vals
+
+ def _get_vg_free(self, pool, task_vars):
+ # Determine which VG to examine according to the pool name, the only indicator currently
+ # available from the Docker API driver info. We assume a name that looks like
+ # "vg--name-docker--pool"; vg and lv names with inner hyphens doubled, joined by a hyphen.
+ match = re.match(r'((?:[^-]|--)+)-(?!-)', pool) # matches up to the first single hyphen
+ if not match: # unlikely, but... be clear if we assumed wrong
+ raise OpenShiftCheckException(
+ "This host's Docker reports it is using a storage pool named '{}'.\n"
+ "However this name does not have the expected format of 'vgname-lvname'\n"
+ "so the available storage in the VG cannot be determined.".format(pool)
+ )
+ vg_name = match.groups()[0].replace("--", "-")
+ vgs_cmd = "/sbin/vgs --noheadings -o vg_free --units g --select vg_name=" + vg_name
+ # should return free space like " 12.00g" if the VG exists; empty if it does not
+
+ ret = self.execute_module("command", {"_raw_params": vgs_cmd}, task_vars=task_vars)
+ if ret.get("failed") or ret.get("rc", 0) != 0:
+ raise OpenShiftCheckException(
+ "Is LVM installed? Failed to run /sbin/vgs "
+ "to determine docker storage usage:\n" + ret.get("msg", "")
+ )
+ size = ret.get("stdout", "").strip()
+ if not size:
+ raise OpenShiftCheckException(
+ "This host's Docker reports it is using a storage pool named '{pool}'.\n"
+ "which we expect to come from local VG '{vg}'.\n"
+ "However, /sbin/vgs did not find this VG. Is Docker for this host"
+ "running and using the storage on the host?".format(pool=pool, vg=vg_name)
+ )
+ return size
+
+ @staticmethod
+ def _convert_to_bytes(string):
+ units = dict(
+ b=1,
+ k=1024,
+ m=1024**2,
+ g=1024**3,
+ t=1024**4,
+ p=1024**5,
+ )
+ string = string or ""
+ match = re.match(r'(\d+(?:\.\d+)?)\s*(\w)?', string) # float followed by optional unit
+ if not match:
+ raise ValueError("Cannot convert to a byte size: " + string)
+
+ number, unit = match.groups()
+ multiplier = 1 if not unit else units.get(unit.lower())
+ if not multiplier:
+ raise ValueError("Cannot convert to a byte size: " + string)
+
+ return float(number) * multiplier
diff --git a/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py b/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py
new file mode 100644
index 000000000..c04a69765
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py
@@ -0,0 +1,84 @@
+"""
+Ansible module for determining if the size of OpenShift image data exceeds a specified limit in an etcd cluster.
+"""
+
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
+
+
+class EtcdImageDataSize(OpenShiftCheck):
+ """Check that total size of OpenShift image data does not exceed the recommended limit in an etcd cluster"""
+
+ name = "etcd_imagedata_size"
+ tags = ["etcd"]
+
+ def run(self, tmp, task_vars):
+ etcd_mountpath = self._get_etcd_mountpath(get_var(task_vars, "ansible_mounts"))
+ etcd_avail_diskspace = etcd_mountpath["size_available"]
+ etcd_total_diskspace = etcd_mountpath["size_total"]
+
+ etcd_imagedata_size_limit = get_var(task_vars,
+ "etcd_max_image_data_size_bytes",
+ default=int(0.5 * float(etcd_total_diskspace - etcd_avail_diskspace)))
+
+ etcd_is_ssl = get_var(task_vars, "openshift", "master", "etcd_use_ssl", default=False)
+ etcd_port = get_var(task_vars, "openshift", "master", "etcd_port", default=2379)
+ etcd_hosts = get_var(task_vars, "openshift", "master", "etcd_hosts")
+
+ config_base = get_var(task_vars, "openshift", "common", "config_base")
+
+ cert = task_vars.get("etcd_client_cert", config_base + "/master/master.etcd-client.crt")
+ key = task_vars.get("etcd_client_key", config_base + "/master/master.etcd-client.key")
+ ca_cert = task_vars.get("etcd_client_ca_cert", config_base + "/master/master.etcd-ca.crt")
+
+ for etcd_host in list(etcd_hosts):
+ args = {
+ "size_limit_bytes": etcd_imagedata_size_limit,
+ "paths": ["/openshift.io/images", "/openshift.io/imagestreams"],
+ "host": etcd_host,
+ "port": etcd_port,
+ "protocol": "https" if etcd_is_ssl else "http",
+ "version_prefix": "/v2",
+ "allow_redirect": True,
+ "ca_cert": ca_cert,
+ "cert": {
+ "cert": cert,
+ "key": key,
+ },
+ }
+
+ etcdkeysize = self.module_executor("etcdkeysize", args, task_vars)
+
+ if etcdkeysize.get("rc", 0) != 0 or etcdkeysize.get("failed"):
+ msg = 'Failed to retrieve stats for etcd host "{host}": {reason}'
+ reason = etcdkeysize.get("msg")
+ if etcdkeysize.get("module_stderr"):
+ reason = etcdkeysize["module_stderr"]
+
+ msg = msg.format(host=etcd_host, reason=reason)
+ return {"failed": True, "changed": False, "msg": msg}
+
+ if etcdkeysize["size_limit_exceeded"]:
+ limit = self._to_gigabytes(etcd_imagedata_size_limit)
+ msg = ("The size of OpenShift image data stored in etcd host "
+ "\"{host}\" exceeds the maximum recommended limit of {limit:.2f} GB. "
+ "Use the `oadm prune images` command to cleanup unused Docker images.")
+ return {"failed": True, "msg": msg.format(host=etcd_host, limit=limit)}
+
+ return {"changed": False}
+
+ @staticmethod
+ def _get_etcd_mountpath(ansible_mounts):
+ valid_etcd_mount_paths = ["/var/lib/etcd", "/var/lib", "/var", "/"]
+
+ mount_for_path = {mnt.get("mount"): mnt for mnt in ansible_mounts}
+ for path in valid_etcd_mount_paths:
+ if path in mount_for_path:
+ return mount_for_path[path]
+
+ paths = ', '.join(sorted(mount_for_path)) or 'none'
+ msg = "Unable to determine a valid etcd mountpath. Paths mounted: {}.".format(paths)
+ raise OpenShiftCheckException(msg)
+
+ @staticmethod
+ def _to_gigabytes(byte_size):
+ return float(byte_size) / 10.0**9
diff --git a/roles/openshift_health_checker/openshift_checks/etcd_volume.py b/roles/openshift_health_checker/openshift_checks/etcd_volume.py
new file mode 100644
index 000000000..7452c9cc1
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/etcd_volume.py
@@ -0,0 +1,58 @@
+"""A health check for OpenShift clusters."""
+
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
+
+
+class EtcdVolume(OpenShiftCheck):
+ """Ensures etcd storage usage does not exceed a given threshold."""
+
+ name = "etcd_volume"
+ tags = ["etcd", "health"]
+
+ # Default device usage threshold. Value should be in the range [0, 100].
+ default_threshold_percent = 90
+ # Where to find ectd data, higher priority first.
+ supported_mount_paths = ["/var/lib/etcd", "/var/lib", "/var", "/"]
+
+ @classmethod
+ def is_active(cls, task_vars):
+ etcd_hosts = get_var(task_vars, "groups", "etcd", default=[]) or get_var(task_vars, "groups", "masters",
+ default=[]) or []
+ is_etcd_host = get_var(task_vars, "ansible_ssh_host") in etcd_hosts
+ return super(EtcdVolume, cls).is_active(task_vars) and is_etcd_host
+
+ def run(self, tmp, task_vars):
+ mount_info = self._etcd_mount_info(task_vars)
+ available = mount_info["size_available"]
+ total = mount_info["size_total"]
+ used = total - available
+
+ threshold = get_var(
+ task_vars,
+ "etcd_device_usage_threshold_percent",
+ default=self.default_threshold_percent
+ )
+
+ used_percent = 100.0 * used / total
+
+ if used_percent > threshold:
+ device = mount_info.get("device", "unknown")
+ mount = mount_info.get("mount", "unknown")
+ msg = "etcd storage usage ({:.1f}%) is above threshold ({:.1f}%). Device: {}, mount: {}.".format(
+ used_percent, threshold, device, mount
+ )
+ return {"failed": True, "msg": msg}
+
+ return {"changed": False}
+
+ def _etcd_mount_info(self, task_vars):
+ ansible_mounts = get_var(task_vars, "ansible_mounts")
+ mounts = {mnt.get("mount"): mnt for mnt in ansible_mounts}
+
+ for path in self.supported_mount_paths:
+ if path in mounts:
+ return mounts[path]
+
+ paths = ', '.join(sorted(mounts)) or 'none'
+ msg = "Unable to find etcd storage mount point. Paths mounted: {}.".format(paths)
+ raise OpenShiftCheckException(msg)
diff --git a/roles/openshift_health_checker/openshift_checks/logging/__init__.py b/roles/openshift_health_checker/openshift_checks/logging/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/__init__.py
diff --git a/roles/openshift_health_checker/openshift_checks/logging/curator.py b/roles/openshift_health_checker/openshift_checks/logging/curator.py
new file mode 100644
index 000000000..c9fc59896
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/curator.py
@@ -0,0 +1,61 @@
+"""
+Module for performing checks on an Curator logging deployment
+"""
+
+from openshift_checks import get_var
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class Curator(LoggingCheck):
+ """Module that checks an integrated logging Curator deployment"""
+
+ name = "curator"
+ tags = ["health", "logging"]
+
+ logging_namespace = None
+
+ def run(self, tmp, task_vars):
+ """Check various things and gather errors. Returns: result as hash"""
+
+ self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging")
+ curator_pods, error = super(Curator, self).get_pods_for_component(
+ self.module_executor,
+ self.logging_namespace,
+ "curator",
+ task_vars
+ )
+ if error:
+ return {"failed": True, "changed": False, "msg": error}
+ check_error = self.check_curator(curator_pods)
+
+ if check_error:
+ msg = ("The following Curator deployment issue was found:"
+ "\n-------\n"
+ "{}".format(check_error))
+ return {"failed": True, "changed": False, "msg": msg}
+
+ # TODO(lmeyer): run it all again for the ops cluster
+ return {"failed": False, "changed": False, "msg": 'No problems found with Curator deployment.'}
+
+ def check_curator(self, pods):
+ """Check to see if curator is up and working. Returns: error string"""
+ if not pods:
+ return (
+ "There are no Curator pods for the logging stack,\n"
+ "so nothing will prune Elasticsearch indexes.\n"
+ "Is Curator correctly deployed?"
+ )
+
+ not_running = super(Curator, self).not_running_pods(pods)
+ if len(not_running) == len(pods):
+ return (
+ "The Curator pod is not currently in a running state,\n"
+ "so Elasticsearch indexes may increase without bound."
+ )
+ if len(pods) - len(not_running) > 1:
+ return (
+ "There is more than one Curator pod running. This should not normally happen.\n"
+ "Although this doesn't cause any problems, you may want to investigate."
+ )
+
+ return None
diff --git a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
new file mode 100644
index 000000000..01cb35b81
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
@@ -0,0 +1,217 @@
+"""
+Module for performing checks on an Elasticsearch logging deployment
+"""
+
+import json
+import re
+
+from openshift_checks import get_var
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class Elasticsearch(LoggingCheck):
+ """Module that checks an integrated logging Elasticsearch deployment"""
+
+ name = "elasticsearch"
+ tags = ["health", "logging"]
+
+ logging_namespace = None
+
+ def run(self, tmp, task_vars):
+ """Check various things and gather errors. Returns: result as hash"""
+
+ self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging")
+ es_pods, error = super(Elasticsearch, self).get_pods_for_component(
+ self.execute_module,
+ self.logging_namespace,
+ "es",
+ task_vars,
+ )
+ if error:
+ return {"failed": True, "changed": False, "msg": error}
+ check_error = self.check_elasticsearch(es_pods, task_vars)
+
+ if check_error:
+ msg = ("The following Elasticsearch deployment issue was found:"
+ "\n-------\n"
+ "{}".format(check_error))
+ return {"failed": True, "changed": False, "msg": msg}
+
+ # TODO(lmeyer): run it all again for the ops cluster
+ return {"failed": False, "changed": False, "msg": 'No problems found with Elasticsearch deployment.'}
+
+ def _not_running_elasticsearch_pods(self, es_pods):
+ """Returns: list of running pods, list of errors about non-running pods"""
+ not_running = super(Elasticsearch, self).not_running_pods(es_pods)
+ if not_running:
+ return not_running, [(
+ 'The following Elasticsearch pods are not running:\n'
+ '{pods}'
+ 'These pods will not aggregate logs from their nodes.'
+ ).format(pods=''.join(
+ " {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None'))
+ for pod in not_running
+ ))]
+ return not_running, []
+
+ def check_elasticsearch(self, es_pods, task_vars):
+ """Various checks for elasticsearch. Returns: error string"""
+ not_running_pods, error_msgs = self._not_running_elasticsearch_pods(es_pods)
+ running_pods = [pod for pod in es_pods if pod not in not_running_pods]
+ pods_by_name = {
+ pod['metadata']['name']: pod for pod in running_pods
+ # Filter out pods that are not members of a DC
+ if pod['metadata'].get('labels', {}).get('deploymentconfig')
+ }
+ if not pods_by_name:
+ return 'No logging Elasticsearch pods were found. Is logging deployed?'
+ error_msgs += self._check_elasticsearch_masters(pods_by_name, task_vars)
+ error_msgs += self._check_elasticsearch_node_list(pods_by_name, task_vars)
+ error_msgs += self._check_es_cluster_health(pods_by_name, task_vars)
+ error_msgs += self._check_elasticsearch_diskspace(pods_by_name, task_vars)
+ return '\n'.join(error_msgs)
+
+ @staticmethod
+ def _build_es_curl_cmd(pod_name, url):
+ base = "exec {name} -- curl -s --cert {base}cert --key {base}key --cacert {base}ca -XGET '{url}'"
+ return base.format(base="/etc/elasticsearch/secret/admin-", name=pod_name, url=url)
+
+ def _check_elasticsearch_masters(self, pods_by_name, task_vars):
+ """Check that Elasticsearch masters are sane. Returns: list of error strings"""
+ es_master_names = set()
+ error_msgs = []
+ for pod_name in pods_by_name.keys():
+ # Compare what each ES node reports as master and compare for split brain
+ get_master_cmd = self._build_es_curl_cmd(pod_name, "https://localhost:9200/_cat/master")
+ master_name_str = self._exec_oc(get_master_cmd, [], task_vars)
+ master_names = (master_name_str or '').split(' ')
+ if len(master_names) > 1:
+ es_master_names.add(master_names[1])
+ else:
+ error_msgs.append(
+ 'No master? Elasticsearch {pod} returned bad string when asked master name:\n'
+ ' {response}'.format(pod=pod_name, response=master_name_str)
+ )
+
+ if not es_master_names:
+ error_msgs.append('No logging Elasticsearch masters were found. Is logging deployed?')
+ return '\n'.join(error_msgs)
+
+ if len(es_master_names) > 1:
+ error_msgs.append(
+ 'Found multiple Elasticsearch masters according to the pods:\n'
+ '{master_list}\n'
+ 'This implies that the masters have "split brain" and are not correctly\n'
+ 'replicating data for the logging cluster. Log loss is likely to occur.'
+ .format(master_list='\n'.join(' ' + master for master in es_master_names))
+ )
+
+ return error_msgs
+
+ def _check_elasticsearch_node_list(self, pods_by_name, task_vars):
+ """Check that reported ES masters are accounted for by pods. Returns: list of error strings"""
+
+ if not pods_by_name:
+ return ['No logging Elasticsearch masters were found. Is logging deployed?']
+
+ # get ES cluster nodes
+ node_cmd = self._build_es_curl_cmd(list(pods_by_name.keys())[0], 'https://localhost:9200/_nodes')
+ cluster_node_data = self._exec_oc(node_cmd, [], task_vars)
+ try:
+ cluster_nodes = json.loads(cluster_node_data)['nodes']
+ except (ValueError, KeyError):
+ return [
+ 'Failed to query Elasticsearch for the list of ES nodes. The output was:\n' +
+ cluster_node_data
+ ]
+
+ # Try to match all ES-reported node hosts to known pods.
+ error_msgs = []
+ for node in cluster_nodes.values():
+ # Note that with 1.4/3.4 the pod IP may be used as the master name
+ if not any(node['host'] in (pod_name, pod['status'].get('podIP'))
+ for pod_name, pod in pods_by_name.items()):
+ error_msgs.append(
+ 'The Elasticsearch cluster reports a member node "{node}"\n'
+ 'that does not correspond to any known ES pod.'.format(node=node['host'])
+ )
+
+ return error_msgs
+
+ def _check_es_cluster_health(self, pods_by_name, task_vars):
+ """Exec into the elasticsearch pods and check the cluster health. Returns: list of errors"""
+ error_msgs = []
+ for pod_name in pods_by_name.keys():
+ cluster_health_cmd = self._build_es_curl_cmd(pod_name, 'https://localhost:9200/_cluster/health?pretty=true')
+ cluster_health_data = self._exec_oc(cluster_health_cmd, [], task_vars)
+ try:
+ health_res = json.loads(cluster_health_data)
+ if not health_res or not health_res.get('status'):
+ raise ValueError()
+ except ValueError:
+ error_msgs.append(
+ 'Could not retrieve cluster health status from logging ES pod "{pod}".\n'
+ 'Response was:\n{output}'.format(pod=pod_name, output=cluster_health_data)
+ )
+ continue
+
+ if health_res['status'] not in ['green', 'yellow']:
+ error_msgs.append(
+ 'Elasticsearch cluster health status is RED according to pod "{}"'.format(pod_name)
+ )
+
+ return error_msgs
+
+ def _check_elasticsearch_diskspace(self, pods_by_name, task_vars):
+ """
+ Exec into an ES pod and query the diskspace on the persistent volume.
+ Returns: list of errors
+ """
+ error_msgs = []
+ for pod_name in pods_by_name.keys():
+ df_cmd = 'exec {} -- df --output=ipcent,pcent /elasticsearch/persistent'.format(pod_name)
+ disk_output = self._exec_oc(df_cmd, [], task_vars)
+ lines = disk_output.splitlines()
+ # expecting one header looking like 'IUse% Use%' and one body line
+ body_re = r'\s*(\d+)%?\s+(\d+)%?\s*$'
+ if len(lines) != 2 or len(lines[0].split()) != 2 or not re.match(body_re, lines[1]):
+ error_msgs.append(
+ 'Could not retrieve storage usage from logging ES pod "{pod}".\n'
+ 'Response to `df` command was:\n{output}'.format(pod=pod_name, output=disk_output)
+ )
+ continue
+ inode_pct, disk_pct = re.match(body_re, lines[1]).groups()
+
+ inode_pct_thresh = get_var(task_vars, 'openshift_check_efk_es_inode_pct', default='90')
+ if int(inode_pct) >= int(inode_pct_thresh):
+ error_msgs.append(
+ 'Inode percent usage on the storage volume for logging ES pod "{pod}"\n'
+ ' is {pct}, greater than threshold {limit}.\n'
+ ' Note: threshold can be specified in inventory with {param}'.format(
+ pod=pod_name,
+ pct=str(inode_pct),
+ limit=str(inode_pct_thresh),
+ param='openshift_check_efk_es_inode_pct',
+ ))
+ disk_pct_thresh = get_var(task_vars, 'openshift_check_efk_es_storage_pct', default='80')
+ if int(disk_pct) >= int(disk_pct_thresh):
+ error_msgs.append(
+ 'Disk percent usage on the storage volume for logging ES pod "{pod}"\n'
+ ' is {pct}, greater than threshold {limit}.\n'
+ ' Note: threshold can be specified in inventory with {param}'.format(
+ pod=pod_name,
+ pct=str(disk_pct),
+ limit=str(disk_pct_thresh),
+ param='openshift_check_efk_es_storage_pct',
+ ))
+
+ return error_msgs
+
+ def _exec_oc(self, cmd_str, extra_args, task_vars):
+ return super(Elasticsearch, self).exec_oc(
+ self.execute_module,
+ self.logging_namespace,
+ cmd_str,
+ extra_args,
+ task_vars,
+ )
diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
new file mode 100644
index 000000000..627567293
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
@@ -0,0 +1,170 @@
+"""
+Module for performing checks on an Fluentd logging deployment
+"""
+
+import json
+
+from openshift_checks import get_var
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class Fluentd(LoggingCheck):
+ """Module that checks an integrated logging Fluentd deployment"""
+ name = "fluentd"
+ tags = ["health", "logging"]
+
+ logging_namespace = None
+
+ def run(self, tmp, task_vars):
+ """Check various things and gather errors. Returns: result as hash"""
+
+ self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging")
+ fluentd_pods, error = super(Fluentd, self).get_pods_for_component(
+ self.execute_module,
+ self.logging_namespace,
+ "fluentd",
+ task_vars,
+ )
+ if error:
+ return {"failed": True, "changed": False, "msg": error}
+ check_error = self.check_fluentd(fluentd_pods, task_vars)
+
+ if check_error:
+ msg = ("The following Fluentd deployment issue was found:"
+ "\n-------\n"
+ "{}".format(check_error))
+ return {"failed": True, "changed": False, "msg": msg}
+
+ # TODO(lmeyer): run it all again for the ops cluster
+ return {"failed": False, "changed": False, "msg": 'No problems found with Fluentd deployment.'}
+
+ @staticmethod
+ def _filter_fluentd_labeled_nodes(nodes_by_name, node_selector):
+ """Filter to all nodes with fluentd label. Returns dict(name: node), error string"""
+ label, value = node_selector.split('=', 1)
+ fluentd_nodes = {
+ name: node for name, node in nodes_by_name.items()
+ if node['metadata']['labels'].get(label) == value
+ }
+ if not fluentd_nodes:
+ return None, (
+ 'There are no nodes with the fluentd label {label}.\n'
+ 'This means no logs will be aggregated from the nodes.'
+ ).format(label=node_selector)
+ return fluentd_nodes, None
+
+ @staticmethod
+ def _check_node_labeling(nodes_by_name, fluentd_nodes, node_selector, task_vars):
+ """Note if nodes are not labeled as expected. Returns: error string"""
+ intended_nodes = get_var(task_vars, 'openshift_logging_fluentd_hosts', default=['--all'])
+ if not intended_nodes or '--all' in intended_nodes:
+ intended_nodes = nodes_by_name.keys()
+ nodes_missing_labels = set(intended_nodes) - set(fluentd_nodes.keys())
+ if nodes_missing_labels:
+ return (
+ 'The following nodes are supposed to be labeled with {label} but are not:\n'
+ ' {nodes}\n'
+ 'Fluentd will not aggregate logs from these nodes.'
+ ).format(label=node_selector, nodes=', '.join(nodes_missing_labels))
+ return None
+
+ @staticmethod
+ def _check_nodes_have_fluentd(pods, fluentd_nodes):
+ """Make sure fluentd is on all the labeled nodes. Returns: error string"""
+ unmatched_nodes = fluentd_nodes.copy()
+ node_names_by_label = {
+ node['metadata']['labels']['kubernetes.io/hostname']: name
+ for name, node in fluentd_nodes.items()
+ }
+ node_names_by_internal_ip = {
+ address['address']: name
+ for name, node in fluentd_nodes.items()
+ for address in node['status']['addresses']
+ if address['type'] == "InternalIP"
+ }
+ for pod in pods:
+ for name in [
+ pod['spec']['nodeName'],
+ node_names_by_internal_ip.get(pod['spec']['nodeName']),
+ node_names_by_label.get(pod.get('spec', {}).get('host')),
+ ]:
+ unmatched_nodes.pop(name, None)
+ if unmatched_nodes:
+ return (
+ 'The following nodes are supposed to have a Fluentd pod but do not:\n'
+ '{nodes}'
+ 'These nodes will not have their logs aggregated.'
+ ).format(nodes=''.join(
+ " {}\n".format(name)
+ for name in unmatched_nodes.keys()
+ ))
+ return None
+
+ def _check_fluentd_pods_running(self, pods):
+ """Make sure all fluentd pods are running. Returns: error string"""
+ not_running = super(Fluentd, self).not_running_pods(pods)
+ if not_running:
+ return (
+ 'The following Fluentd pods are supposed to be running but are not:\n'
+ '{pods}'
+ 'These pods will not aggregate logs from their nodes.'
+ ).format(pods=''.join(
+ " {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None'))
+ for pod in not_running
+ ))
+ return None
+
+ def check_fluentd(self, pods, task_vars):
+ """Verify fluentd is running everywhere. Returns: error string"""
+
+ node_selector = get_var(task_vars, 'openshift_logging_fluentd_nodeselector',
+ default='logging-infra-fluentd=true')
+
+ nodes_by_name, error = self.get_nodes_by_name(task_vars)
+
+ if error:
+ return error
+ fluentd_nodes, error = self._filter_fluentd_labeled_nodes(nodes_by_name, node_selector)
+ if error:
+ return error
+
+ error_msgs = []
+ error = self._check_node_labeling(nodes_by_name, fluentd_nodes, node_selector, task_vars)
+ if error:
+ error_msgs.append(error)
+ error = self._check_nodes_have_fluentd(pods, fluentd_nodes)
+ if error:
+ error_msgs.append(error)
+ error = self._check_fluentd_pods_running(pods)
+ if error:
+ error_msgs.append(error)
+
+ # Make sure there are no extra fluentd pods
+ if len(pods) > len(fluentd_nodes):
+ error_msgs.append(
+ 'There are more Fluentd pods running than nodes labeled.\n'
+ 'This may not cause problems with logging but it likely indicates something wrong.'
+ )
+
+ return '\n'.join(error_msgs)
+
+ def get_nodes_by_name(self, task_vars):
+ """Retrieve all the node definitions. Returns: dict(name: node), error string"""
+ nodes_json = self._exec_oc("get nodes -o json", [], task_vars)
+ try:
+ nodes = json.loads(nodes_json)
+ except ValueError: # no valid json - should not happen
+ return None, "Could not obtain a list of nodes to validate fluentd. Output from oc get:\n" + nodes_json
+ if not nodes or not nodes.get('items'): # also should not happen
+ return None, "No nodes appear to be defined according to the API."
+ return {
+ node['metadata']['name']: node
+ for node in nodes['items']
+ }, None
+
+ def _exec_oc(self, cmd_str, extra_args, task_vars):
+ return super(Fluentd, self).exec_oc(self.execute_module,
+ self.logging_namespace,
+ cmd_str,
+ extra_args,
+ task_vars)
diff --git a/roles/openshift_health_checker/openshift_checks/logging/kibana.py b/roles/openshift_health_checker/openshift_checks/logging/kibana.py
new file mode 100644
index 000000000..442f407b1
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/kibana.py
@@ -0,0 +1,229 @@
+"""
+Module for performing checks on a Kibana logging deployment
+"""
+
+import json
+import ssl
+
+try:
+ from urllib2 import HTTPError, URLError
+ import urllib2
+except ImportError:
+ from urllib.error import HTTPError, URLError
+ import urllib.request as urllib2
+
+from openshift_checks import get_var
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class Kibana(LoggingCheck):
+ """Module that checks an integrated logging Kibana deployment"""
+
+ name = "kibana"
+ tags = ["health", "logging"]
+
+ logging_namespace = None
+
+ def run(self, tmp, task_vars):
+ """Check various things and gather errors. Returns: result as hash"""
+
+ self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging")
+ kibana_pods, error = super(Kibana, self).get_pods_for_component(
+ self.execute_module,
+ self.logging_namespace,
+ "kibana",
+ task_vars,
+ )
+ if error:
+ return {"failed": True, "changed": False, "msg": error}
+ check_error = self.check_kibana(kibana_pods)
+
+ if not check_error:
+ check_error = self._check_kibana_route(task_vars)
+
+ if check_error:
+ msg = ("The following Kibana deployment issue was found:"
+ "\n-------\n"
+ "{}".format(check_error))
+ return {"failed": True, "changed": False, "msg": msg}
+
+ # TODO(lmeyer): run it all again for the ops cluster
+ return {"failed": False, "changed": False, "msg": 'No problems found with Kibana deployment.'}
+
+ def _verify_url_internal(self, url, task_vars):
+ """
+ Try to reach a URL from the host.
+ Returns: success (bool), reason (for failure)
+ """
+ args = dict(
+ url=url,
+ follow_redirects='none',
+ validate_certs='no', # likely to be signed with internal CA
+ # TODO(lmeyer): give users option to validate certs
+ status_code=302,
+ )
+ result = self.execute_module('uri', args, task_vars)
+ if result.get('failed'):
+ return result['msg']
+ return None
+
+ @staticmethod
+ def _verify_url_external(url):
+ """
+ Try to reach a URL from ansible control host.
+ Returns: success (bool), reason (for failure)
+ """
+ # This actually checks from the ansible control host, which may or may not
+ # really be "external" to the cluster.
+
+ # Disable SSL cert validation to work around internally signed certs
+ ctx = ssl.create_default_context()
+ ctx.check_hostname = False # or setting CERT_NONE is refused
+ ctx.verify_mode = ssl.CERT_NONE
+
+ # Verify that the url is returning a valid response
+ try:
+ # We only care if the url connects and responds
+ return_code = urllib2.urlopen(url, context=ctx).getcode()
+ except HTTPError as httperr:
+ return httperr.reason
+ except URLError as urlerr:
+ return str(urlerr)
+
+ # there appears to be no way to prevent urlopen from following redirects
+ if return_code != 200:
+ return 'Expected success (200) but got return code {}'.format(int(return_code))
+
+ return None
+
+ def check_kibana(self, pods):
+ """Check to see if Kibana is up and working. Returns: error string."""
+
+ if not pods:
+ return "There are no Kibana pods deployed, so no access to the logging UI."
+
+ not_running = self.not_running_pods(pods)
+ if len(not_running) == len(pods):
+ return "No Kibana pod is in a running state, so there is no access to the logging UI."
+ elif not_running:
+ return (
+ "The following Kibana pods are not currently in a running state:\n"
+ "{pods}"
+ "However at least one is, so service may not be impacted."
+ ).format(pods="".join(" " + pod['metadata']['name'] + "\n" for pod in not_running))
+
+ return None
+
+ def _get_kibana_url(self, task_vars):
+ """
+ Get kibana route or report error.
+ Returns: url (or empty), reason for failure
+ """
+
+ # Get logging url
+ get_route = self._exec_oc("get route logging-kibana -o json", [], task_vars)
+ if not get_route:
+ return None, 'no_route_exists'
+
+ route = json.loads(get_route)
+
+ # check that the route has been accepted by a router
+ ingress = route["status"]["ingress"]
+ # ingress can be null if there is no router, or empty if not routed
+ if not ingress or not ingress[0]:
+ return None, 'route_not_accepted'
+
+ host = route.get("spec", {}).get("host")
+ if not host:
+ return None, 'route_missing_host'
+
+ return 'https://{}/'.format(host), None
+
+ def _check_kibana_route(self, task_vars):
+ """
+ Check to see if kibana route is up and working.
+ Returns: error string
+ """
+ known_errors = dict(
+ no_route_exists=(
+ 'No route is defined for Kibana in the logging namespace,\n'
+ 'so the logging stack is not accessible. Is logging deployed?\n'
+ 'Did something remove the logging-kibana route?'
+ ),
+ route_not_accepted=(
+ 'The logging-kibana route is not being routed by any router.\n'
+ 'Is the router deployed and working?'
+ ),
+ route_missing_host=(
+ 'The logging-kibana route has no hostname defined,\n'
+ 'which should never happen. Did something alter its definition?'
+ ),
+ )
+
+ kibana_url, error = self._get_kibana_url(task_vars)
+ if not kibana_url:
+ return known_errors.get(error, error)
+
+ # first, check that kibana is reachable from the master.
+ error = self._verify_url_internal(kibana_url, task_vars)
+ if error:
+ if 'urlopen error [Errno 111] Connection refused' in error:
+ error = (
+ 'Failed to connect from this master to Kibana URL {url}\n'
+ 'Is kibana running, and is at least one router routing to it?'
+ ).format(url=kibana_url)
+ elif 'urlopen error [Errno -2] Name or service not known' in error:
+ error = (
+ 'Failed to connect from this master to Kibana URL {url}\n'
+ 'because the hostname does not resolve.\n'
+ 'Is DNS configured for the Kibana hostname?'
+ ).format(url=kibana_url)
+ elif 'Status code was not' in error:
+ error = (
+ 'A request from this master to the Kibana URL {url}\n'
+ 'did not return the correct status code (302).\n'
+ 'This could mean that Kibana is malfunctioning, the hostname is\n'
+ 'resolving incorrectly, or other network issues. The output was:\n'
+ ' {error}'
+ ).format(url=kibana_url, error=error)
+ return 'Error validating the logging Kibana route:\n' + error
+
+ # in production we would like the kibana route to work from outside the
+ # cluster too; but that may not be the case, so allow disabling just this part.
+ if not get_var(task_vars, "openshift_check_efk_kibana_external", default=True):
+ return None
+ error = self._verify_url_external(kibana_url)
+ if error:
+ if 'urlopen error [Errno 111] Connection refused' in error:
+ error = (
+ 'Failed to connect from the Ansible control host to Kibana URL {url}\n'
+ 'Is the router for the Kibana hostname exposed externally?'
+ ).format(url=kibana_url)
+ elif 'urlopen error [Errno -2] Name or service not known' in error:
+ error = (
+ 'Failed to resolve the Kibana hostname in {url}\n'
+ 'from the Ansible control host.\n'
+ 'Is DNS configured to resolve this Kibana hostname externally?'
+ ).format(url=kibana_url)
+ elif 'Expected success (200)' in error:
+ error = (
+ 'A request to Kibana at {url}\n'
+ 'returned the wrong error code:\n'
+ ' {error}\n'
+ 'This could mean that Kibana is malfunctioning, the hostname is\n'
+ 'resolving incorrectly, or other network issues.'
+ ).format(url=kibana_url, error=error)
+ error = (
+ 'Error validating the logging Kibana route:\n{error}\n'
+ 'To disable external Kibana route validation, set in your inventory:\n'
+ ' openshift_check_efk_kibana_external=False'
+ ).format(error=error)
+ return error
+ return None
+
+ def _exec_oc(self, cmd_str, extra_args, task_vars):
+ return super(Kibana, self).exec_oc(self.execute_module,
+ self.logging_namespace,
+ cmd_str,
+ extra_args,
+ task_vars)
diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging.py b/roles/openshift_health_checker/openshift_checks/logging/logging.py
new file mode 100644
index 000000000..05b4d300c
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/logging.py
@@ -0,0 +1,96 @@
+"""
+Util functions for performing checks on an Elasticsearch, Fluentd, and Kibana stack
+"""
+
+import json
+import os
+
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
+
+
+class LoggingCheck(OpenShiftCheck):
+ """Base class for logging component checks"""
+
+ name = "logging"
+
+ @classmethod
+ def is_active(cls, task_vars):
+ return super(LoggingCheck, cls).is_active(task_vars) and cls.is_first_master(task_vars)
+
+ @staticmethod
+ def is_first_master(task_vars):
+ """Run only on first master and only when logging is configured. Returns: bool"""
+ logging_deployed = get_var(task_vars, "openshift_hosted_logging_deploy", default=True)
+ # Note: It would be nice to use membership in oo_first_master group, however for now it
+ # seems best to avoid requiring that setup and just check this is the first master.
+ hostname = get_var(task_vars, "ansible_ssh_host") or [None]
+ masters = get_var(task_vars, "groups", "masters", default=None) or [None]
+ return logging_deployed and masters[0] == hostname
+
+ def run(self, tmp, task_vars):
+ pass
+
+ def get_pods_for_component(self, execute_module, namespace, logging_component, task_vars):
+ """Get all pods for a given component. Returns: list of pods for component, error string"""
+ pod_output = self.exec_oc(
+ execute_module,
+ namespace,
+ "get pods -l component={} -o json".format(logging_component),
+ [],
+ task_vars
+ )
+ try:
+ pods = json.loads(pod_output)
+ if not pods or not pods.get('items'):
+ raise ValueError()
+ except ValueError:
+ # successful run but non-parsing data generally means there were no pods in the namespace
+ return None, 'There are no pods in the {} namespace. Is logging deployed?'.format(namespace)
+
+ return pods['items'], None
+
+ @staticmethod
+ def not_running_pods(pods):
+ """Returns: list of pods not in a ready and running state"""
+ return [
+ pod for pod in pods
+ if any(
+ container['ready'] is False
+ for container in pod['status']['containerStatuses']
+ ) or not any(
+ condition['type'] == 'Ready' and condition['status'] == 'True'
+ for condition in pod['status']['conditions']
+ )
+ ]
+
+ @staticmethod
+ def exec_oc(execute_module=None, namespace="logging", cmd_str="", extra_args=None, task_vars=None):
+ """
+ Execute an 'oc' command in the remote host.
+ Returns: output of command and namespace,
+ or raises OpenShiftCheckException on error
+ """
+ config_base = get_var(task_vars, "openshift", "common", "config_base")
+ args = {
+ "namespace": namespace,
+ "config_file": os.path.join(config_base, "master", "admin.kubeconfig"),
+ "cmd": cmd_str,
+ "extra_args": list(extra_args) if extra_args else [],
+ }
+
+ result = execute_module("ocutil", args, task_vars)
+ if result.get("failed"):
+ msg = (
+ 'Unexpected error using `oc` to validate the logging stack components.\n'
+ 'Error executing `oc {cmd}`:\n'
+ '{error}'
+ ).format(cmd=args['cmd'], error=result['result'])
+
+ if result['result'] == '[Errno 2] No such file or directory':
+ msg = (
+ "This host is supposed to be a master but does not have the `oc` command where expected.\n"
+ "Has an installation been run on this host yet?"
+ )
+ raise OpenShiftCheckException(msg)
+
+ return result.get("result", "")
diff --git a/roles/openshift_health_checker/openshift_checks/memory_availability.py b/roles/openshift_health_checker/openshift_checks/memory_availability.py
new file mode 100644
index 000000000..f4e31065f
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/memory_availability.py
@@ -0,0 +1,51 @@
+# pylint: disable=missing-docstring
+from openshift_checks import OpenShiftCheck, get_var
+
+MIB = 2**20
+GIB = 2**30
+
+
+class MemoryAvailability(OpenShiftCheck):
+ """Check that recommended memory is available."""
+
+ name = "memory_availability"
+ tags = ["preflight"]
+
+ # Values taken from the official installation documentation:
+ # https://docs.openshift.org/latest/install_config/install/prerequisites.html#system-requirements
+ recommended_memory_bytes = {
+ "masters": 16 * GIB,
+ "nodes": 8 * GIB,
+ "etcd": 8 * GIB,
+ }
+ # https://access.redhat.com/solutions/3006511 physical RAM is partly reserved from memtotal
+ memtotal_adjustment = 1 * GIB
+
+ @classmethod
+ def is_active(cls, task_vars):
+ """Skip hosts that do not have recommended memory requirements."""
+ group_names = get_var(task_vars, "group_names", default=[])
+ has_memory_recommendation = bool(set(group_names).intersection(cls.recommended_memory_bytes))
+ return super(MemoryAvailability, cls).is_active(task_vars) and has_memory_recommendation
+
+ def run(self, tmp, task_vars):
+ group_names = get_var(task_vars, "group_names")
+ total_memory_bytes = get_var(task_vars, "ansible_memtotal_mb") * MIB
+
+ recommended_min = max(self.recommended_memory_bytes.get(name, 0) for name in group_names)
+ configured_min = float(get_var(task_vars, "openshift_check_min_host_memory_gb", default=0)) * GIB
+ min_memory_bytes = configured_min or recommended_min
+
+ if total_memory_bytes + self.memtotal_adjustment < min_memory_bytes:
+ return {
+ 'failed': True,
+ 'msg': (
+ 'Available memory ({available:.1f} GiB) is too far '
+ 'below recommended value ({recommended:.1f} GiB)'
+ ).format(
+ available=float(total_memory_bytes) / GIB,
+ recommended=float(min_memory_bytes) / GIB,
+ ),
+ }
+
+ return {}
diff --git a/roles/openshift_health_checker/openshift_checks/mixins.py b/roles/openshift_health_checker/openshift_checks/mixins.py
index 657e15160..2cb2e21aa 100644
--- a/roles/openshift_health_checker/openshift_checks/mixins.py
+++ b/roles/openshift_health_checker/openshift_checks/mixins.py
@@ -1,18 +1,58 @@
-# pylint: disable=missing-docstring
+"""
+Mixin classes meant to be used with subclasses of OpenShiftCheck.
+"""
+
from openshift_checks import get_var
class NotContainerizedMixin(object):
"""Mixin for checks that are only active when not in containerized mode."""
+ # permanent # pylint: disable=too-few-public-methods
+ # Reason: The mixin is not intended to stand on its own as a class.
@classmethod
def is_active(cls, task_vars):
- return (
- # This mixin is meant to be used with subclasses of OpenShiftCheck.
- super(NotContainerizedMixin, cls).is_active(task_vars) and
- not cls.is_containerized(task_vars)
- )
+ """Only run on non-containerized hosts."""
+ is_containerized = get_var(task_vars, "openshift", "common", "is_containerized")
+ return super(NotContainerizedMixin, cls).is_active(task_vars) and not is_containerized
+
+
+class DockerHostMixin(object):
+ """Mixin for checks that are only active on hosts that require Docker."""
+
+ dependencies = []
- @staticmethod
- def is_containerized(task_vars):
- return get_var(task_vars, "openshift", "common", "is_containerized")
+ @classmethod
+ def is_active(cls, task_vars):
+ """Only run on hosts that depend on Docker."""
+ is_containerized = get_var(task_vars, "openshift", "common", "is_containerized")
+ is_node = "nodes" in get_var(task_vars, "group_names", default=[])
+ return super(DockerHostMixin, cls).is_active(task_vars) and (is_containerized or is_node)
+
+ def ensure_dependencies(self, task_vars):
+ """
+ Ensure that docker-related packages exist, but not on atomic hosts
+ (which would not be able to install but should already have them).
+ Returns: msg, failed, changed
+ """
+ if get_var(task_vars, "openshift", "common", "is_atomic"):
+ return "", False, False
+
+ # NOTE: we would use the "package" module but it's actually an action plugin
+ # and it's not clear how to invoke one of those. This is about the same anyway:
+ result = self.execute_module(
+ get_var(task_vars, "ansible_pkg_mgr", default="yum"),
+ {"name": self.dependencies, "state": "present"},
+ task_vars=task_vars,
+ )
+ msg = result.get("msg", "")
+ if result.get("failed"):
+ if "No package matching" in msg:
+ msg = "Ensure that all required dependencies can be installed via `yum`.\n"
+ msg = (
+ "Unable to install required packages on this host:\n"
+ " {deps}\n{msg}"
+ ).format(deps=',\n '.join(self.dependencies), msg=msg)
+ failed = result.get("failed", False) or result.get("rc", 0) != 0
+ changed = result.get("changed", False)
+ return msg, failed, changed
diff --git a/roles/openshift_health_checker/openshift_checks/ovs_version.py b/roles/openshift_health_checker/openshift_checks/ovs_version.py
new file mode 100644
index 000000000..2dd045f1f
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/ovs_version.py
@@ -0,0 +1,78 @@
+"""
+Ansible module for determining if an installed version of Open vSwitch is incompatible with the
+currently installed version of OpenShift.
+"""
+
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
+from openshift_checks.mixins import NotContainerizedMixin
+
+
+class OvsVersion(NotContainerizedMixin, OpenShiftCheck):
+ """Check that packages in a package_list are installed on the host
+ and are the correct version as determined by an OpenShift installation.
+ """
+
+ name = "ovs_version"
+ tags = ["health"]
+
+ openshift_to_ovs_version = {
+ "3.6": "2.6",
+ "3.5": "2.6",
+ "3.4": "2.4",
+ }
+
+ # map major release versions across releases
+ # to a common major version
+ openshift_major_release_version = {
+ "1": "3",
+ }
+
+ @classmethod
+ def is_active(cls, task_vars):
+ """Skip hosts that do not have package requirements."""
+ group_names = get_var(task_vars, "group_names", default=[])
+ master_or_node = 'masters' in group_names or 'nodes' in group_names
+ return super(OvsVersion, cls).is_active(task_vars) and master_or_node
+
+ def run(self, tmp, task_vars):
+ args = {
+ "package_list": [
+ {
+ "name": "openvswitch",
+ "version": self.get_required_ovs_version(task_vars),
+ },
+ ],
+ }
+ return self.execute_module("rpm_version", args, task_vars=task_vars)
+
+ def get_required_ovs_version(self, task_vars):
+ """Return the correct Open vSwitch version for the current OpenShift version"""
+ openshift_version = self._get_openshift_version(task_vars)
+
+ if float(openshift_version) < 3.5:
+ return self.openshift_to_ovs_version["3.4"]
+
+ ovs_version = self.openshift_to_ovs_version.get(str(openshift_version))
+ if ovs_version:
+ return self.openshift_to_ovs_version[str(openshift_version)]
+
+ msg = "There is no recommended version of Open vSwitch for the current version of OpenShift: {}"
+ raise OpenShiftCheckException(msg.format(openshift_version))
+
+ def _get_openshift_version(self, task_vars):
+ openshift_version = get_var(task_vars, "openshift_image_tag")
+ if openshift_version and openshift_version[0] == 'v':
+ openshift_version = openshift_version[1:]
+
+ return self._parse_version(openshift_version)
+
+ def _parse_version(self, version):
+ components = version.split(".")
+ if not components or len(components) < 2:
+ msg = "An invalid version of OpenShift was found for this host: {}"
+ raise OpenShiftCheckException(msg.format(version))
+
+ if components[0] in self.openshift_major_release_version:
+ components[0] = self.openshift_major_release_version[components[0]]
+
+ return '.'.join(components[:2])
diff --git a/roles/openshift_health_checker/openshift_checks/package_availability.py b/roles/openshift_health_checker/openshift_checks/package_availability.py
index 9891972a6..0dd2b1286 100644
--- a/roles/openshift_health_checker/openshift_checks/package_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/package_availability.py
@@ -9,6 +9,10 @@ class PackageAvailability(NotContainerizedMixin, OpenShiftCheck):
name = "package_availability"
tags = ["preflight"]
+ @classmethod
+ def is_active(cls, task_vars):
+ return super(PackageAvailability, cls).is_active(task_vars) and task_vars["ansible_pkg_mgr"] == "yum"
+
def run(self, tmp, task_vars):
rpm_prefix = get_var(task_vars, "openshift", "common", "service_type")
group_names = get_var(task_vars, "group_names", default=[])
@@ -21,7 +25,7 @@ class PackageAvailability(NotContainerizedMixin, OpenShiftCheck):
packages.update(self.node_packages(rpm_prefix))
args = {"packages": sorted(set(packages))}
- return self.execute_module("check_yum_update", args, tmp, task_vars)
+ return self.execute_module("check_yum_update", args, tmp=tmp, task_vars=task_vars)
@staticmethod
def master_packages(rpm_prefix):
@@ -32,8 +36,7 @@ class PackageAvailability(NotContainerizedMixin, OpenShiftCheck):
"bash-completion",
"cockpit-bridge",
"cockpit-docker",
- "cockpit-kubernetes",
- "cockpit-shell",
+ "cockpit-system",
"cockpit-ws",
"etcd",
"httpd-tools",
diff --git a/roles/openshift_health_checker/openshift_checks/package_update.py b/roles/openshift_health_checker/openshift_checks/package_update.py
index fd0c0a755..f432380c6 100644
--- a/roles/openshift_health_checker/openshift_checks/package_update.py
+++ b/roles/openshift_health_checker/openshift_checks/package_update.py
@@ -11,4 +11,4 @@ class PackageUpdate(NotContainerizedMixin, OpenShiftCheck):
def run(self, tmp, task_vars):
args = {"packages": []}
- return self.execute_module("check_yum_update", args, tmp, task_vars)
+ return self.execute_module("check_yum_update", args, tmp=tmp, task_vars=task_vars)
diff --git a/roles/openshift_health_checker/openshift_checks/package_version.py b/roles/openshift_health_checker/openshift_checks/package_version.py
index 42193a1c6..6a76bb93d 100644
--- a/roles/openshift_health_checker/openshift_checks/package_version.py
+++ b/roles/openshift_health_checker/openshift_checks/package_version.py
@@ -1,5 +1,5 @@
# pylint: disable=missing-docstring
-from openshift_checks import OpenShiftCheck, get_var
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
from openshift_checks.mixins import NotContainerizedMixin
@@ -9,12 +9,117 @@ class PackageVersion(NotContainerizedMixin, OpenShiftCheck):
name = "package_version"
tags = ["preflight"]
+ openshift_to_ovs_version = {
+ "3.6": "2.6",
+ "3.5": "2.6",
+ "3.4": "2.4",
+ }
+
+ openshift_to_docker_version = {
+ "3.1": "1.8",
+ "3.2": "1.10",
+ "3.3": "1.10",
+ "3.4": "1.12",
+ }
+
+ # map major release versions across releases
+ # to a common major version
+ openshift_major_release_version = {
+ "1": "3",
+ }
+
+ @classmethod
+ def is_active(cls, task_vars):
+ """Skip hosts that do not have package requirements."""
+ group_names = get_var(task_vars, "group_names", default=[])
+ master_or_node = 'masters' in group_names or 'nodes' in group_names
+ return super(PackageVersion, cls).is_active(task_vars) and master_or_node
+
def run(self, tmp, task_vars):
rpm_prefix = get_var(task_vars, "openshift", "common", "service_type")
- openshift_release = get_var(task_vars, "openshift_release")
+ openshift_release = get_var(task_vars, "openshift_release", default='')
+ deployment_type = get_var(task_vars, "openshift_deployment_type")
+ check_multi_minor_release = deployment_type in ['openshift-enterprise']
args = {
- "prefix": rpm_prefix,
- "version": openshift_release,
+ "package_list": [
+ {
+ "name": "openvswitch",
+ "version": self.get_required_ovs_version(task_vars),
+ "check_multi": False,
+ },
+ {
+ "name": "docker",
+ "version": self.get_required_docker_version(task_vars),
+ "check_multi": False,
+ },
+ {
+ "name": "{}".format(rpm_prefix),
+ "version": openshift_release,
+ "check_multi": check_multi_minor_release,
+ },
+ {
+ "name": "{}-master".format(rpm_prefix),
+ "version": openshift_release,
+ "check_multi": check_multi_minor_release,
+ },
+ {
+ "name": "{}-node".format(rpm_prefix),
+ "version": openshift_release,
+ "check_multi": check_multi_minor_release,
+ },
+ ],
}
- return self.execute_module("aos_version", args, tmp, task_vars)
+
+ return self.execute_module("aos_version", args, tmp=tmp, task_vars=task_vars)
+
+ def get_required_ovs_version(self, task_vars):
+ """Return the correct Open vSwitch version for the current OpenShift version.
+ If the current OpenShift version is >= 3.5, ensure Open vSwitch version 2.6,
+ Else ensure Open vSwitch version 2.4"""
+ openshift_version = self.get_openshift_version(task_vars)
+
+ if float(openshift_version) < 3.5:
+ return self.openshift_to_ovs_version["3.4"]
+
+ ovs_version = self.openshift_to_ovs_version.get(str(openshift_version))
+ if ovs_version:
+ return ovs_version
+
+ msg = "There is no recommended version of Open vSwitch for the current version of OpenShift: {}"
+ raise OpenShiftCheckException(msg.format(openshift_version))
+
+ def get_required_docker_version(self, task_vars):
+ """Return the correct Docker version for the current OpenShift version.
+ If the OpenShift version is 3.1, ensure Docker version 1.8.
+ If the OpenShift version is 3.2 or 3.3, ensure Docker version 1.10.
+ If the current OpenShift version is >= 3.4, ensure Docker version 1.12."""
+ openshift_version = self.get_openshift_version(task_vars)
+
+ if float(openshift_version) >= 3.4:
+ return self.openshift_to_docker_version["3.4"]
+
+ docker_version = self.openshift_to_docker_version.get(str(openshift_version))
+ if docker_version:
+ return docker_version
+
+ msg = "There is no recommended version of Docker for the current version of OpenShift: {}"
+ raise OpenShiftCheckException(msg.format(openshift_version))
+
+ def get_openshift_version(self, task_vars):
+ openshift_version = get_var(task_vars, "openshift_image_tag")
+ if openshift_version and openshift_version[0] == 'v':
+ openshift_version = openshift_version[1:]
+
+ return self.parse_version(openshift_version)
+
+ def parse_version(self, version):
+ components = version.split(".")
+ if not components or len(components) < 2:
+ msg = "An invalid version of OpenShift was found for this host: {}"
+ raise OpenShiftCheckException(msg.format(version))
+
+ if components[0] in self.openshift_major_release_version:
+ components[0] = self.openshift_major_release_version[components[0]]
+
+ return '.'.join(components[:2])
diff --git a/roles/openshift_health_checker/test/action_plugin_test.py b/roles/openshift_health_checker/test/action_plugin_test.py
new file mode 100644
index 000000000..9383b233c
--- /dev/null
+++ b/roles/openshift_health_checker/test/action_plugin_test.py
@@ -0,0 +1,253 @@
+import pytest
+
+from ansible.playbook.play_context import PlayContext
+
+from openshift_health_check import ActionModule, resolve_checks
+from openshift_checks import OpenShiftCheckException
+
+
+def fake_check(name='fake_check', tags=None, is_active=True, run_return=None, run_exception=None):
+ """Returns a new class that is compatible with OpenShiftCheck for testing."""
+
+ _name, _tags = name, tags
+
+ class FakeCheck(object):
+ name = _name
+ tags = _tags or []
+
+ def __init__(self, execute_module=None):
+ pass
+
+ @classmethod
+ def is_active(cls, task_vars):
+ return is_active
+
+ def run(self, tmp, task_vars):
+ if run_exception is not None:
+ raise run_exception
+ return run_return
+
+ return FakeCheck
+
+
+# Fixtures
+
+
+@pytest.fixture
+def plugin():
+ task = FakeTask('openshift_health_check', {'checks': ['fake_check']})
+ plugin = ActionModule(task, None, PlayContext(), None, None, None)
+ return plugin
+
+
+class FakeTask(object):
+ def __init__(self, action, args):
+ self.action = action
+ self.args = args
+ self.async = 0
+
+
+@pytest.fixture
+def task_vars():
+ return dict(openshift=dict(), ansible_host='unit-test-host')
+
+
+# Assertion helpers
+
+
+def failed(result, msg_has=None):
+ if msg_has is not None:
+ assert 'msg' in result
+ for term in msg_has:
+ assert term.lower() in result['msg'].lower()
+ return result.get('failed', False)
+
+
+def changed(result):
+ return result.get('changed', False)
+
+
+# tests whether task is skipped, not individual checks
+def skipped(result):
+ return result.get('skipped', False)
+
+
+# Tests
+
+
+@pytest.mark.parametrize('task_vars', [
+ None,
+ {},
+])
+def test_action_plugin_missing_openshift_facts(plugin, task_vars):
+ result = plugin.run(tmp=None, task_vars=task_vars)
+
+ assert failed(result, msg_has=['openshift_facts'])
+
+
+def test_action_plugin_cannot_load_checks_with_the_same_name(plugin, task_vars, monkeypatch):
+ FakeCheck1 = fake_check('duplicate_name')
+ FakeCheck2 = fake_check('duplicate_name')
+ checks = [FakeCheck1, FakeCheck2]
+ monkeypatch.setattr('openshift_checks.OpenShiftCheck.subclasses', classmethod(lambda cls: checks))
+
+ result = plugin.run(tmp=None, task_vars=task_vars)
+
+ assert failed(result, msg_has=['unique', 'duplicate_name', 'FakeCheck'])
+
+
+def test_action_plugin_skip_non_active_checks(plugin, task_vars, monkeypatch):
+ checks = [fake_check(is_active=False)]
+ monkeypatch.setattr('openshift_checks.OpenShiftCheck.subclasses', classmethod(lambda cls: checks))
+
+ result = plugin.run(tmp=None, task_vars=task_vars)
+
+ assert result['checks']['fake_check'] == dict(skipped=True, skipped_reason="Not active for this host")
+ assert not failed(result)
+ assert not changed(result)
+ assert not skipped(result)
+
+
+def test_action_plugin_skip_disabled_checks(plugin, task_vars, monkeypatch):
+ checks = [fake_check('fake_check', is_active=True)]
+ monkeypatch.setattr('openshift_checks.OpenShiftCheck.subclasses', classmethod(lambda cls: checks))
+
+ task_vars['openshift_disable_check'] = 'fake_check'
+ result = plugin.run(tmp=None, task_vars=task_vars)
+
+ assert result['checks']['fake_check'] == dict(skipped=True, skipped_reason="Disabled by user request")
+ assert not failed(result)
+ assert not changed(result)
+ assert not skipped(result)
+
+
+def test_action_plugin_run_check_ok(plugin, task_vars, monkeypatch):
+ check_return_value = {'ok': 'test'}
+ check_class = fake_check(run_return=check_return_value)
+ monkeypatch.setattr(plugin, 'load_known_checks', lambda: {'fake_check': check_class()})
+ monkeypatch.setattr('openshift_health_check.resolve_checks', lambda *args: ['fake_check'])
+
+ result = plugin.run(tmp=None, task_vars=task_vars)
+
+ assert result['checks']['fake_check'] == check_return_value
+ assert not failed(result)
+ assert not changed(result)
+ assert not skipped(result)
+
+
+def test_action_plugin_run_check_changed(plugin, task_vars, monkeypatch):
+ check_return_value = {'ok': 'test', 'changed': True}
+ check_class = fake_check(run_return=check_return_value)
+ monkeypatch.setattr(plugin, 'load_known_checks', lambda: {'fake_check': check_class()})
+ monkeypatch.setattr('openshift_health_check.resolve_checks', lambda *args: ['fake_check'])
+
+ result = plugin.run(tmp=None, task_vars=task_vars)
+
+ assert result['checks']['fake_check'] == check_return_value
+ assert not failed(result)
+ assert changed(result)
+ assert not skipped(result)
+
+
+def test_action_plugin_run_check_fail(plugin, task_vars, monkeypatch):
+ check_return_value = {'failed': True}
+ check_class = fake_check(run_return=check_return_value)
+ monkeypatch.setattr(plugin, 'load_known_checks', lambda: {'fake_check': check_class()})
+ monkeypatch.setattr('openshift_health_check.resolve_checks', lambda *args: ['fake_check'])
+
+ result = plugin.run(tmp=None, task_vars=task_vars)
+
+ assert result['checks']['fake_check'] == check_return_value
+ assert failed(result, msg_has=['failed'])
+ assert not changed(result)
+ assert not skipped(result)
+
+
+def test_action_plugin_run_check_exception(plugin, task_vars, monkeypatch):
+ exception_msg = 'fake check has an exception'
+ run_exception = OpenShiftCheckException(exception_msg)
+ check_class = fake_check(run_exception=run_exception)
+ monkeypatch.setattr(plugin, 'load_known_checks', lambda: {'fake_check': check_class()})
+ monkeypatch.setattr('openshift_health_check.resolve_checks', lambda *args: ['fake_check'])
+
+ result = plugin.run(tmp=None, task_vars=task_vars)
+
+ assert failed(result['checks']['fake_check'], msg_has=exception_msg)
+ assert failed(result, msg_has=['failed'])
+ assert not changed(result)
+ assert not skipped(result)
+
+
+def test_action_plugin_resolve_checks_exception(plugin, task_vars, monkeypatch):
+ monkeypatch.setattr(plugin, 'load_known_checks', lambda: {})
+
+ result = plugin.run(tmp=None, task_vars=task_vars)
+
+ assert failed(result, msg_has=['unknown', 'name'])
+ assert not changed(result)
+ assert not skipped(result)
+
+
+@pytest.mark.parametrize('names,all_checks,expected', [
+ ([], [], set()),
+ (
+ ['a', 'b'],
+ [
+ fake_check('a'),
+ fake_check('b'),
+ ],
+ set(['a', 'b']),
+ ),
+ (
+ ['a', 'b', '@group'],
+ [
+ fake_check('from_group_1', ['group', 'another_group']),
+ fake_check('not_in_group', ['another_group']),
+ fake_check('from_group_2', ['preflight', 'group']),
+ fake_check('a'),
+ fake_check('b'),
+ ],
+ set(['a', 'b', 'from_group_1', 'from_group_2']),
+ ),
+])
+def test_resolve_checks_ok(names, all_checks, expected):
+ assert resolve_checks(names, all_checks) == expected
+
+
+@pytest.mark.parametrize('names,all_checks,words_in_exception,words_not_in_exception', [
+ (
+ ['testA', 'testB'],
+ [],
+ ['check', 'name', 'testA', 'testB'],
+ ['tag', 'group', '@'],
+ ),
+ (
+ ['@group'],
+ [],
+ ['tag', 'name', 'group'],
+ ['check', '@'],
+ ),
+ (
+ ['testA', 'testB', '@group'],
+ [],
+ ['check', 'name', 'testA', 'testB', 'tag', 'group'],
+ ['@'],
+ ),
+ (
+ ['testA', 'testB', '@group'],
+ [
+ fake_check('from_group_1', ['group', 'another_group']),
+ fake_check('not_in_group', ['another_group']),
+ fake_check('from_group_2', ['preflight', 'group']),
+ ],
+ ['check', 'name', 'testA', 'testB'],
+ ['tag', 'group', '@'],
+ ),
+])
+def test_resolve_checks_failure(names, all_checks, words_in_exception, words_not_in_exception):
+ with pytest.raises(Exception) as excinfo:
+ resolve_checks(names, all_checks)
+ for word in words_in_exception:
+ assert word in str(excinfo.value)
+ for word in words_not_in_exception:
+ assert word not in str(excinfo.value)
diff --git a/roles/openshift_health_checker/test/aos_version_test.py b/roles/openshift_health_checker/test/aos_version_test.py
new file mode 100644
index 000000000..697805dd2
--- /dev/null
+++ b/roles/openshift_health_checker/test/aos_version_test.py
@@ -0,0 +1,149 @@
+import pytest
+import aos_version
+
+from collections import namedtuple
+Package = namedtuple('Package', ['name', 'version'])
+
+expected_pkgs = {
+ "spam": {
+ "name": "spam",
+ "version": "3.2.1",
+ "check_multi": False,
+ },
+ "eggs": {
+ "name": "eggs",
+ "version": "3.2.1",
+ "check_multi": False,
+ },
+}
+
+
+@pytest.mark.parametrize('pkgs, expect_not_found', [
+ (
+ [],
+ {
+ "spam": {
+ "name": "spam",
+ "version": "3.2.1",
+ "check_multi": False,
+ },
+ "eggs": {
+ "name": "eggs",
+ "version": "3.2.1",
+ "check_multi": False,
+ }
+ }, # none found
+ ),
+ (
+ [Package('spam', '3.2.1')],
+ {
+ "eggs": {
+ "name": "eggs",
+ "version": "3.2.1",
+ "check_multi": False,
+ }
+ }, # completely missing
+ ),
+ (
+ [Package('spam', '3.2.1'), Package('eggs', '3.3.2')],
+ {
+ "eggs": {
+ "name": "eggs",
+ "version": "3.2.1",
+ "check_multi": False,
+ }
+ }, # not the right version
+ ),
+ (
+ [Package('spam', '3.2.1'), Package('eggs', '3.2.1')],
+ {}, # all found
+ ),
+ (
+ [Package('spam', '3.2.1'), Package('eggs', '3.2.1.5')],
+ {}, # found with more specific version
+ ),
+ (
+ [Package('eggs', '1.2.3'), Package('eggs', '3.2.1.5')],
+ {
+ "spam": {
+ "name": "spam",
+ "version": "3.2.1",
+ "check_multi": False,
+ }
+ }, # eggs found with multiple versions
+ ),
+])
+def test_check_pkgs_for_precise_version(pkgs, expect_not_found):
+ if expect_not_found:
+ with pytest.raises(aos_version.PreciseVersionNotFound) as e:
+ aos_version._check_precise_version_found(pkgs, expected_pkgs)
+
+ assert list(expect_not_found.values()) == e.value.problem_pkgs
+ else:
+ aos_version._check_precise_version_found(pkgs, expected_pkgs)
+
+
+@pytest.mark.parametrize('pkgs, expect_higher', [
+ (
+ [],
+ [],
+ ),
+ (
+ [Package('spam', '3.2.1.9')],
+ [], # more precise but not strictly higher
+ ),
+ (
+ [Package('spam', '3.3')],
+ ['spam-3.3'], # lower precision, but higher
+ ),
+ (
+ [Package('spam', '3.2.1'), Package('eggs', '3.3.2')],
+ ['eggs-3.3.2'], # one too high
+ ),
+ (
+ [Package('eggs', '1.2.3'), Package('eggs', '3.2.1.5'), Package('eggs', '3.4')],
+ ['eggs-3.4'], # multiple versions, one is higher
+ ),
+ (
+ [Package('eggs', '3.2.1'), Package('eggs', '3.4'), Package('eggs', '3.3')],
+ ['eggs-3.4'], # multiple versions, two are higher
+ ),
+])
+def test_check_pkgs_for_greater_version(pkgs, expect_higher):
+ if expect_higher:
+ with pytest.raises(aos_version.FoundHigherVersion) as e:
+ aos_version._check_higher_version_found(pkgs, expected_pkgs)
+ assert set(expect_higher) == set(e.value.problem_pkgs)
+ else:
+ aos_version._check_higher_version_found(pkgs, expected_pkgs)
+
+
+@pytest.mark.parametrize('pkgs, expect_to_flag_pkgs', [
+ (
+ [],
+ [],
+ ),
+ (
+ [Package('spam', '3.2.1')],
+ [],
+ ),
+ (
+ [Package('spam', '3.2.1'), Package('eggs', '3.2.2')],
+ [],
+ ),
+ (
+ [Package('spam', '3.2.1'), Package('spam', '3.3.2')],
+ ['spam'],
+ ),
+ (
+ [Package('eggs', '1.2.3'), Package('eggs', '3.2.1.5'), Package('eggs', '3.4')],
+ ['eggs'],
+ ),
+])
+def test_check_pkgs_for_multi_release(pkgs, expect_to_flag_pkgs):
+ if expect_to_flag_pkgs:
+ with pytest.raises(aos_version.FoundMultiRelease) as e:
+ aos_version._check_multi_minor_release(pkgs, expected_pkgs)
+ assert set(expect_to_flag_pkgs) == set(e.value.problem_pkgs)
+ else:
+ aos_version._check_multi_minor_release(pkgs, expected_pkgs)
diff --git a/roles/openshift_health_checker/test/conftest.py b/roles/openshift_health_checker/test/conftest.py
index bf717ae85..3cbd65507 100644
--- a/roles/openshift_health_checker/test/conftest.py
+++ b/roles/openshift_health_checker/test/conftest.py
@@ -1,5 +1,11 @@
import os
import sys
-# extend sys.path so that tests can import openshift_checks
-sys.path.insert(1, os.path.dirname(os.path.dirname(__file__)))
+# extend sys.path so that tests can import openshift_checks and action plugins
+# from this role.
+openshift_health_checker_path = os.path.dirname(os.path.dirname(__file__))
+sys.path[1:1] = [
+ openshift_health_checker_path,
+ os.path.join(openshift_health_checker_path, 'action_plugins'),
+ os.path.join(openshift_health_checker_path, 'library'),
+]
diff --git a/roles/openshift_health_checker/test/curator_test.py b/roles/openshift_health_checker/test/curator_test.py
new file mode 100644
index 000000000..ae108c96e
--- /dev/null
+++ b/roles/openshift_health_checker/test/curator_test.py
@@ -0,0 +1,68 @@
+import pytest
+
+from openshift_checks.logging.curator import Curator
+
+
+def canned_curator(exec_oc=None):
+ """Create a Curator check object with canned exec_oc method"""
+ check = Curator("dummy") # fails if a module is actually invoked
+ if exec_oc:
+ check._exec_oc = exec_oc
+ return check
+
+
+def assert_error(error, expect_error):
+ if expect_error:
+ assert error
+ assert expect_error in error
+ else:
+ assert not error
+
+
+plain_curator_pod = {
+ "metadata": {
+ "labels": {"component": "curator", "deploymentconfig": "logging-curator"},
+ "name": "logging-curator-1",
+ },
+ "status": {
+ "containerStatuses": [{"ready": True}],
+ "conditions": [{"status": "True", "type": "Ready"}],
+ "podIP": "10.10.10.10",
+ }
+}
+
+not_running_curator_pod = {
+ "metadata": {
+ "labels": {"component": "curator", "deploymentconfig": "logging-curator"},
+ "name": "logging-curator-2",
+ },
+ "status": {
+ "containerStatuses": [{"ready": False}],
+ "conditions": [{"status": "False", "type": "Ready"}],
+ "podIP": "10.10.10.10",
+ }
+}
+
+
+@pytest.mark.parametrize('pods, expect_error', [
+ (
+ [],
+ "no Curator pods",
+ ),
+ (
+ [plain_curator_pod],
+ None,
+ ),
+ (
+ [not_running_curator_pod],
+ "not currently in a running state",
+ ),
+ (
+ [plain_curator_pod, plain_curator_pod],
+ "more than one Curator pod",
+ ),
+])
+def test_get_curator_pods(pods, expect_error):
+ check = canned_curator()
+ error = check.check_curator(pods)
+ assert_error(error, expect_error)
diff --git a/roles/openshift_health_checker/test/disk_availability_test.py b/roles/openshift_health_checker/test/disk_availability_test.py
new file mode 100644
index 000000000..945b9eafc
--- /dev/null
+++ b/roles/openshift_health_checker/test/disk_availability_test.py
@@ -0,0 +1,180 @@
+import pytest
+
+from openshift_checks.disk_availability import DiskAvailability, OpenShiftCheckException
+
+
+@pytest.mark.parametrize('group_names,is_active', [
+ (['masters'], True),
+ (['nodes'], True),
+ (['etcd'], True),
+ (['masters', 'nodes'], True),
+ (['masters', 'etcd'], True),
+ ([], False),
+ (['lb'], False),
+ (['nfs'], False),
+])
+def test_is_active(group_names, is_active):
+ task_vars = dict(
+ group_names=group_names,
+ )
+ assert DiskAvailability.is_active(task_vars=task_vars) == is_active
+
+
+@pytest.mark.parametrize('ansible_mounts,extra_words', [
+ ([], ['none']), # empty ansible_mounts
+ ([{'mount': '/mnt'}], ['/mnt']), # missing relevant mount paths
+ ([{'mount': '/var'}], ['/var']), # missing size_available
+])
+def test_cannot_determine_available_disk(ansible_mounts, extra_words):
+ task_vars = dict(
+ group_names=['masters'],
+ ansible_mounts=ansible_mounts,
+ )
+ check = DiskAvailability(execute_module=fake_execute_module)
+
+ with pytest.raises(OpenShiftCheckException) as excinfo:
+ check.run(tmp=None, task_vars=task_vars)
+
+ for word in 'determine disk availability'.split() + extra_words:
+ assert word in str(excinfo.value)
+
+
+@pytest.mark.parametrize('group_names,configured_min,ansible_mounts', [
+ (
+ ['masters'],
+ 0,
+ [{
+ 'mount': '/',
+ 'size_available': 40 * 10**9 + 1,
+ }],
+ ),
+ (
+ ['nodes'],
+ 0,
+ [{
+ 'mount': '/',
+ 'size_available': 15 * 10**9 + 1,
+ }],
+ ),
+ (
+ ['etcd'],
+ 0,
+ [{
+ 'mount': '/',
+ 'size_available': 20 * 10**9 + 1,
+ }],
+ ),
+ (
+ ['etcd'],
+ 1, # configure lower threshold
+ [{
+ 'mount': '/',
+ 'size_available': 1 * 10**9 + 1, # way smaller than recommended
+ }],
+ ),
+ (
+ ['etcd'],
+ 0,
+ [{
+ # not enough space on / ...
+ 'mount': '/',
+ 'size_available': 2 * 10**9,
+ }, {
+ # ... but enough on /var
+ 'mount': '/var',
+ 'size_available': 20 * 10**9 + 1,
+ }],
+ ),
+])
+def test_succeeds_with_recommended_disk_space(group_names, configured_min, ansible_mounts):
+ task_vars = dict(
+ group_names=group_names,
+ openshift_check_min_host_disk_gb=configured_min,
+ ansible_mounts=ansible_mounts,
+ )
+
+ check = DiskAvailability(execute_module=fake_execute_module)
+ result = check.run(tmp=None, task_vars=task_vars)
+
+ assert not result.get('failed', False)
+
+
+@pytest.mark.parametrize('group_names,configured_min,ansible_mounts,extra_words', [
+ (
+ ['masters'],
+ 0,
+ [{
+ 'mount': '/',
+ 'size_available': 1,
+ }],
+ ['0.0 GB'],
+ ),
+ (
+ ['masters'],
+ 100, # set a higher threshold
+ [{
+ 'mount': '/',
+ 'size_available': 50 * 10**9, # would normally be enough...
+ }],
+ ['100.0 GB'],
+ ),
+ (
+ ['nodes'],
+ 0,
+ [{
+ 'mount': '/',
+ 'size_available': 1 * 10**9,
+ }],
+ ['1.0 GB'],
+ ),
+ (
+ ['etcd'],
+ 0,
+ [{
+ 'mount': '/',
+ 'size_available': 1,
+ }],
+ ['0.0 GB'],
+ ),
+ (
+ ['nodes', 'masters'],
+ 0,
+ [{
+ 'mount': '/',
+ # enough space for a node, not enough for a master
+ 'size_available': 15 * 10**9 + 1,
+ }],
+ ['15.0 GB'],
+ ),
+ (
+ ['etcd'],
+ 0,
+ [{
+ # enough space on / ...
+ 'mount': '/',
+ 'size_available': 20 * 10**9 + 1,
+ }, {
+ # .. but not enough on /var
+ 'mount': '/var',
+ 'size_available': 0,
+ }],
+ ['0.0 GB'],
+ ),
+])
+def test_fails_with_insufficient_disk_space(group_names, configured_min, ansible_mounts, extra_words):
+ task_vars = dict(
+ group_names=group_names,
+ openshift_check_min_host_disk_gb=configured_min,
+ ansible_mounts=ansible_mounts,
+ )
+
+ check = DiskAvailability(execute_module=fake_execute_module)
+ result = check.run(tmp=None, task_vars=task_vars)
+
+ assert result['failed']
+ for word in 'below recommended'.split() + extra_words:
+ assert word in result['msg']
+
+
+def fake_execute_module(*args):
+ raise AssertionError('this function should not be called')
diff --git a/roles/openshift_health_checker/test/docker_image_availability_test.py b/roles/openshift_health_checker/test/docker_image_availability_test.py
index 2a9c32f77..0a7c0f8d3 100644
--- a/roles/openshift_health_checker/test/docker_image_availability_test.py
+++ b/roles/openshift_health_checker/test/docker_image_availability_test.py
@@ -3,26 +3,259 @@ import pytest
from openshift_checks.docker_image_availability import DockerImageAvailability
-@pytest.mark.xfail(strict=True) # TODO: remove this once this test is fully implemented.
-@pytest.mark.parametrize('task_vars,expected_result', [
- (
- dict(
- openshift=dict(common=dict(
+@pytest.mark.parametrize('deployment_type, is_containerized, group_names, expect_active', [
+ ("origin", True, [], True),
+ ("openshift-enterprise", True, [], True),
+ ("enterprise", True, [], False),
+ ("online", True, [], False),
+ ("invalid", True, [], False),
+ ("", True, [], False),
+ ("origin", False, [], False),
+ ("openshift-enterprise", False, [], False),
+ ("origin", False, ["nodes", "masters"], True),
+ ("openshift-enterprise", False, ["etcd"], False),
+])
+def test_is_active(deployment_type, is_containerized, group_names, expect_active):
+ task_vars = dict(
+ openshift=dict(common=dict(is_containerized=is_containerized)),
+ openshift_deployment_type=deployment_type,
+ group_names=group_names,
+ )
+ assert DockerImageAvailability.is_active(task_vars=task_vars) == expect_active
+
+
+@pytest.mark.parametrize("is_containerized,is_atomic", [
+ (True, True),
+ (False, False),
+ (True, False),
+ (False, True),
+])
+def test_all_images_available_locally(is_containerized, is_atomic):
+ def execute_module(module_name, module_args, task_vars):
+ if module_name == "yum":
+ return {"changed": True}
+
+ assert module_name == "docker_image_facts"
+ assert 'name' in module_args
+ assert module_args['name']
+ return {
+ 'images': [module_args['name']],
+ }
+
+ result = DockerImageAvailability(execute_module=execute_module).run(tmp=None, task_vars=dict(
+ openshift=dict(
+ common=dict(
+ service_type='origin',
+ is_containerized=is_containerized,
+ is_atomic=is_atomic,
+ ),
+ docker=dict(additional_registries=["docker.io"]),
+ ),
+ openshift_deployment_type='origin',
+ openshift_image_tag='3.4',
+ group_names=['nodes', 'masters'],
+ ))
+
+ assert not result.get('failed', False)
+
+
+@pytest.mark.parametrize("available_locally", [
+ False,
+ True,
+])
+def test_all_images_available_remotely(available_locally):
+ def execute_module(module_name, module_args, task_vars):
+ if module_name == 'docker_image_facts':
+ return {'images': [], 'failed': available_locally}
+ return {'changed': False}
+
+ result = DockerImageAvailability(execute_module=execute_module).run(tmp=None, task_vars=dict(
+ openshift=dict(
+ common=dict(
service_type='origin',
is_containerized=False,
- )),
- openshift_release='v3.5',
- deployment_type='origin',
- openshift_image_tag='', # FIXME: should not be required
+ is_atomic=False,
+ ),
+ docker=dict(additional_registries=["docker.io", "registry.access.redhat.com"]),
),
- {'changed': False},
+ openshift_deployment_type='origin',
+ openshift_image_tag='v3.4',
+ group_names=['nodes', 'masters'],
+ ))
+
+ assert not result.get('failed', False)
+
+
+def test_all_images_unavailable():
+ def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None):
+ if module_name == "command":
+ return {
+ 'failed': True,
+ }
+
+ return {
+ 'changed': False,
+ }
+
+ check = DockerImageAvailability(execute_module=execute_module)
+ actual = check.run(tmp=None, task_vars=dict(
+ openshift=dict(
+ common=dict(
+ service_type='origin',
+ is_containerized=False,
+ is_atomic=False,
+ ),
+ docker=dict(additional_registries=["docker.io"]),
+ ),
+ openshift_deployment_type="openshift-enterprise",
+ openshift_image_tag='latest',
+ group_names=['nodes', 'masters'],
+ ))
+
+ assert actual['failed']
+ assert "required Docker images are not available" in actual['msg']
+
+
+@pytest.mark.parametrize("message,extra_words", [
+ (
+ "docker image update failure",
+ ["docker image update failure"],
+ ),
+ (
+ "No package matching 'skopeo' found available, installed or updated",
+ ["dependencies can be installed via `yum`"]
),
- # TODO: add more parameters here to test the multiple possible inputs that affect behavior.
])
-def test_docker_image_availability(task_vars, expected_result):
+def test_skopeo_update_failure(message, extra_words):
def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None):
- return {'info': {}} # TODO: this will vary depending on input parameters.
+ if module_name == "yum":
+ return {
+ "failed": True,
+ "msg": message,
+ "changed": False,
+ }
- check = DockerImageAvailability(execute_module=execute_module)
- result = check.run(tmp=None, task_vars=task_vars)
- assert result == expected_result
+ return {'changed': False}
+
+ actual = DockerImageAvailability(execute_module=execute_module).run(tmp=None, task_vars=dict(
+ openshift=dict(
+ common=dict(
+ service_type='origin',
+ is_containerized=False,
+ is_atomic=False,
+ ),
+ docker=dict(additional_registries=["unknown.io"]),
+ ),
+ openshift_deployment_type="openshift-enterprise",
+ openshift_image_tag='',
+ group_names=['nodes', 'masters'],
+ ))
+
+ assert actual["failed"]
+ for word in extra_words:
+ assert word in actual["msg"]
+
+
+@pytest.mark.parametrize("deployment_type,registries", [
+ ("origin", ["unknown.io"]),
+ ("openshift-enterprise", ["registry.access.redhat.com"]),
+ ("openshift-enterprise", []),
+])
+def test_registry_availability(deployment_type, registries):
+ def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None):
+ return {
+ 'changed': False,
+ }
+
+ actual = DockerImageAvailability(execute_module=execute_module).run(tmp=None, task_vars=dict(
+ openshift=dict(
+ common=dict(
+ service_type='origin',
+ is_containerized=False,
+ is_atomic=False,
+ ),
+ docker=dict(additional_registries=registries),
+ ),
+ openshift_deployment_type=deployment_type,
+ openshift_image_tag='',
+ group_names=['nodes', 'masters'],
+ ))
+
+ assert not actual.get("failed", False)
+
+
+@pytest.mark.parametrize("deployment_type, is_containerized, groups, oreg_url, expected", [
+ ( # standard set of stuff required on nodes
+ "origin", False, ['nodes'], None,
+ set([
+ 'openshift/origin-pod:vtest',
+ 'openshift/origin-deployer:vtest',
+ 'openshift/origin-docker-registry:vtest',
+ 'openshift/origin-haproxy-router:vtest',
+ 'cockpit/kubernetes', # origin version of registry-console
+ ])
+ ),
+ ( # set a different URL for images
+ "origin", False, ['nodes'], 'foo.io/openshift/origin-${component}:${version}',
+ set([
+ 'foo.io/openshift/origin-pod:vtest',
+ 'foo.io/openshift/origin-deployer:vtest',
+ 'foo.io/openshift/origin-docker-registry:vtest',
+ 'foo.io/openshift/origin-haproxy-router:vtest',
+ 'cockpit/kubernetes', # AFAICS this is not built from the URL
+ ])
+ ),
+ (
+ "origin", True, ['nodes', 'masters', 'etcd'], None,
+ set([
+ # images running on top of openshift
+ 'openshift/origin-pod:vtest',
+ 'openshift/origin-deployer:vtest',
+ 'openshift/origin-docker-registry:vtest',
+ 'openshift/origin-haproxy-router:vtest',
+ 'cockpit/kubernetes',
+ # containerized component images
+ 'openshift/origin:vtest',
+ 'openshift/node:vtest',
+ 'openshift/openvswitch:vtest',
+ 'registry.access.redhat.com/rhel7/etcd',
+ ])
+ ),
+ ( # enterprise images
+ "openshift-enterprise", True, ['nodes'], 'foo.io/openshift3/ose-${component}:f13ac45',
+ set([
+ 'foo.io/openshift3/ose-pod:f13ac45',
+ 'foo.io/openshift3/ose-deployer:f13ac45',
+ 'foo.io/openshift3/ose-docker-registry:f13ac45',
+ 'foo.io/openshift3/ose-haproxy-router:f13ac45',
+ # registry-console is not constructed/versioned the same as the others.
+ 'registry.access.redhat.com/openshift3/registry-console',
+ # containerized images aren't built from oreg_url
+ 'openshift3/node:vtest',
+ 'openshift3/openvswitch:vtest',
+ ])
+ ),
+ (
+ "openshift-enterprise", True, ['etcd', 'lb'], 'foo.io/openshift3/ose-${component}:f13ac45',
+ set([
+ 'registry.access.redhat.com/rhel7/etcd',
+ # lb does not yet come in a containerized version
+ ])
+ ),
+
+])
+def test_required_images(deployment_type, is_containerized, groups, oreg_url, expected):
+ task_vars = dict(
+ openshift=dict(
+ common=dict(
+ is_containerized=is_containerized,
+ is_atomic=False,
+ ),
+ ),
+ openshift_deployment_type=deployment_type,
+ group_names=groups,
+ oreg_url=oreg_url,
+ openshift_image_tag='vtest',
+ )
+
+ assert expected == DockerImageAvailability("DUMMY").required_images(task_vars)
diff --git a/roles/openshift_health_checker/test/docker_storage_test.py b/roles/openshift_health_checker/test/docker_storage_test.py
new file mode 100644
index 000000000..876614b1d
--- /dev/null
+++ b/roles/openshift_health_checker/test/docker_storage_test.py
@@ -0,0 +1,224 @@
+import pytest
+
+from openshift_checks import OpenShiftCheckException
+from openshift_checks.docker_storage import DockerStorage
+
+
+def dummy_check(execute_module=None):
+ def dummy_exec(self, status, task_vars):
+ raise Exception("dummy executor called")
+ return DockerStorage(execute_module=execute_module or dummy_exec)
+
+
+@pytest.mark.parametrize('is_containerized, group_names, is_active', [
+ (False, ["masters", "etcd"], False),
+ (False, ["masters", "nodes"], True),
+ (True, ["etcd"], True),
+])
+def test_is_active(is_containerized, group_names, is_active):
+ task_vars = dict(
+ openshift=dict(common=dict(is_containerized=is_containerized)),
+ group_names=group_names,
+ )
+ assert DockerStorage.is_active(task_vars=task_vars) == is_active
+
+
+non_atomic_task_vars = {"openshift": {"common": {"is_atomic": False}}}
+
+
+@pytest.mark.parametrize('docker_info, failed, expect_msg', [
+ (
+ dict(failed=True, msg="Error connecting: Error while fetching server API version"),
+ True,
+ ["Is docker running on this host?"],
+ ),
+ (
+ dict(msg="I have no info"),
+ True,
+ ["missing info"],
+ ),
+ (
+ dict(info={
+ "Driver": "devicemapper",
+ "DriverStatus": [("Pool Name", "docker-docker--pool")],
+ }),
+ False,
+ [],
+ ),
+ (
+ dict(info={
+ "Driver": "devicemapper",
+ "DriverStatus": [("Data loop file", "true")],
+ }),
+ True,
+ ["loopback devices with the Docker devicemapper storage driver"],
+ ),
+ (
+ dict(info={
+ "Driver": "overlay2",
+ "DriverStatus": []
+ }),
+ False,
+ [],
+ ),
+ (
+ dict(info={
+ "Driver": "overlay",
+ }),
+ True,
+ ["unsupported Docker storage driver"],
+ ),
+ (
+ dict(info={
+ "Driver": "unsupported",
+ }),
+ True,
+ ["unsupported Docker storage driver"],
+ ),
+])
+def test_check_storage_driver(docker_info, failed, expect_msg):
+ def execute_module(module_name, module_args, tmp=None, task_vars=None):
+ if module_name == "yum":
+ return {}
+ if module_name != "docker_info":
+ raise ValueError("not expecting module " + module_name)
+ return docker_info
+
+ check = dummy_check(execute_module=execute_module)
+ check._check_dm_usage = lambda status, task_vars: dict() # stub out for this test
+ result = check.run(tmp=None, task_vars=non_atomic_task_vars)
+
+ if failed:
+ assert result["failed"]
+ else:
+ assert not result.get("failed", False)
+
+ for word in expect_msg:
+ assert word in result["msg"]
+
+
+enough_space = {
+ "Pool Name": "docker--vg-docker--pool",
+ "Data Space Used": "19.92 MB",
+ "Data Space Total": "8.535 GB",
+ "Metadata Space Used": "40.96 kB",
+ "Metadata Space Total": "25.17 MB",
+}
+
+not_enough_space = {
+ "Pool Name": "docker--vg-docker--pool",
+ "Data Space Used": "10 GB",
+ "Data Space Total": "10 GB",
+ "Metadata Space Used": "42 kB",
+ "Metadata Space Total": "43 kB",
+}
+
+
+@pytest.mark.parametrize('task_vars, driver_status, vg_free, success, expect_msg', [
+ (
+ {"max_thinpool_data_usage_percent": "not a float"},
+ enough_space,
+ "12g",
+ False,
+ ["is not a percentage"],
+ ),
+ (
+ {},
+ {}, # empty values from driver status
+ "bogus", # also does not parse as bytes
+ False,
+ ["Could not interpret", "as bytes"],
+ ),
+ (
+ {},
+ enough_space,
+ "12.00g",
+ True,
+ [],
+ ),
+ (
+ {},
+ not_enough_space,
+ "0.00",
+ False,
+ ["data usage", "metadata usage", "higher than threshold"],
+ ),
+])
+def test_dm_usage(task_vars, driver_status, vg_free, success, expect_msg):
+ check = dummy_check()
+ check._get_vg_free = lambda pool, task_vars: vg_free
+ result = check._check_dm_usage(driver_status, task_vars)
+ result_success = not result.get("failed")
+
+ assert result_success is success
+ for msg in expect_msg:
+ assert msg in result["msg"]
+
+
+@pytest.mark.parametrize('pool, command_returns, raises, returns', [
+ (
+ "foo-bar",
+ { # vgs missing
+ "msg": "[Errno 2] No such file or directory",
+ "failed": True,
+ "cmd": "/sbin/vgs",
+ "rc": 2,
+ },
+ "Failed to run /sbin/vgs",
+ None,
+ ),
+ (
+ "foo", # no hyphen in name - should not happen
+ {},
+ "name does not have the expected format",
+ None,
+ ),
+ (
+ "foo-bar",
+ dict(stdout=" 4.00g\n"),
+ None,
+ "4.00g",
+ ),
+ (
+ "foo-bar",
+ dict(stdout="\n"), # no matching VG
+ "vgs did not find this VG",
+ None,
+ )
+])
+def test_vg_free(pool, command_returns, raises, returns):
+ def execute_module(module_name, module_args, tmp=None, task_vars=None):
+ if module_name != "command":
+ raise ValueError("not expecting module " + module_name)
+ return command_returns
+
+ check = dummy_check(execute_module=execute_module)
+ if raises:
+ with pytest.raises(OpenShiftCheckException) as err:
+ check._get_vg_free(pool, {})
+ assert raises in str(err.value)
+ else:
+ ret = check._get_vg_free(pool, {})
+ assert ret == returns
+
+
+@pytest.mark.parametrize('string, expect_bytes', [
+ ("12", 12.0),
+ ("12 k", 12.0 * 1024),
+ ("42.42 MB", 42.42 * 1024**2),
+ ("12g", 12.0 * 1024**3),
+])
+def test_convert_to_bytes(string, expect_bytes):
+ got = DockerStorage._convert_to_bytes(string)
+ assert got == expect_bytes
+
+
+@pytest.mark.parametrize('string', [
+ "bork",
+ "42 Qs",
+])
+def test_convert_to_bytes_error(string):
+ with pytest.raises(ValueError) as err:
+ DockerStorage._convert_to_bytes(string)
+ assert "Cannot convert" in str(err.value)
+ assert string in str(err.value)
diff --git a/roles/openshift_health_checker/test/elasticsearch_test.py b/roles/openshift_health_checker/test/elasticsearch_test.py
new file mode 100644
index 000000000..b9d375d8c
--- /dev/null
+++ b/roles/openshift_health_checker/test/elasticsearch_test.py
@@ -0,0 +1,180 @@
+import pytest
+import json
+
+from openshift_checks.logging.elasticsearch import Elasticsearch
+
+task_vars_config_base = dict(openshift=dict(common=dict(config_base='/etc/origin')))
+
+
+def canned_elasticsearch(exec_oc=None):
+ """Create an Elasticsearch check object with canned exec_oc method"""
+ check = Elasticsearch("dummy") # fails if a module is actually invoked
+ if exec_oc:
+ check._exec_oc = exec_oc
+ return check
+
+
+def assert_error(error, expect_error):
+ if expect_error:
+ assert error
+ assert expect_error in error
+ else:
+ assert not error
+
+
+plain_es_pod = {
+ "metadata": {
+ "labels": {"component": "es", "deploymentconfig": "logging-es"},
+ "name": "logging-es",
+ },
+ "status": {
+ "conditions": [{"status": "True", "type": "Ready"}],
+ "containerStatuses": [{"ready": True}],
+ "podIP": "10.10.10.10",
+ },
+ "_test_master_name_str": "name logging-es",
+}
+
+split_es_pod = {
+ "metadata": {
+ "labels": {"component": "es", "deploymentconfig": "logging-es-2"},
+ "name": "logging-es-2",
+ },
+ "status": {
+ "conditions": [{"status": "True", "type": "Ready"}],
+ "containerStatuses": [{"ready": True}],
+ "podIP": "10.10.10.10",
+ },
+ "_test_master_name_str": "name logging-es-2",
+}
+
+
+def test_check_elasticsearch():
+ assert 'No logging Elasticsearch pods' in canned_elasticsearch().check_elasticsearch([], {})
+
+ # canned oc responses to match so all the checks pass
+ def _exec_oc(cmd, args, task_vars):
+ if '_cat/master' in cmd:
+ return 'name logging-es'
+ elif '/_nodes' in cmd:
+ return json.dumps(es_node_list)
+ elif '_cluster/health' in cmd:
+ return '{"status": "green"}'
+ elif ' df ' in cmd:
+ return 'IUse% Use%\n 3% 4%\n'
+ else:
+ raise Exception(cmd)
+
+ assert not canned_elasticsearch(_exec_oc).check_elasticsearch([plain_es_pod], {})
+
+
+def pods_by_name(pods):
+ return {pod['metadata']['name']: pod for pod in pods}
+
+
+@pytest.mark.parametrize('pods, expect_error', [
+ (
+ [],
+ 'No logging Elasticsearch masters',
+ ),
+ (
+ [plain_es_pod],
+ None,
+ ),
+ (
+ [plain_es_pod, split_es_pod],
+ 'Found multiple Elasticsearch masters',
+ ),
+])
+def test_check_elasticsearch_masters(pods, expect_error):
+ test_pods = list(pods)
+ check = canned_elasticsearch(lambda cmd, args, task_vars: test_pods.pop(0)['_test_master_name_str'])
+
+ errors = check._check_elasticsearch_masters(pods_by_name(pods), task_vars_config_base)
+ assert_error(''.join(errors), expect_error)
+
+
+es_node_list = {
+ 'nodes': {
+ 'random-es-name': {
+ 'host': 'logging-es',
+ }}}
+
+
+@pytest.mark.parametrize('pods, node_list, expect_error', [
+ (
+ [],
+ {},
+ 'No logging Elasticsearch masters',
+ ),
+ (
+ [plain_es_pod],
+ es_node_list,
+ None,
+ ),
+ (
+ [plain_es_pod],
+ {}, # empty list of nodes triggers KeyError
+ "Failed to query",
+ ),
+ (
+ [split_es_pod],
+ es_node_list,
+ 'does not correspond to any known ES pod',
+ ),
+])
+def test_check_elasticsearch_node_list(pods, node_list, expect_error):
+ check = canned_elasticsearch(lambda cmd, args, task_vars: json.dumps(node_list))
+
+ errors = check._check_elasticsearch_node_list(pods_by_name(pods), task_vars_config_base)
+ assert_error(''.join(errors), expect_error)
+
+
+@pytest.mark.parametrize('pods, health_data, expect_error', [
+ (
+ [plain_es_pod],
+ [{"status": "green"}],
+ None,
+ ),
+ (
+ [plain_es_pod],
+ [{"no-status": "should bomb"}],
+ 'Could not retrieve cluster health status',
+ ),
+ (
+ [plain_es_pod, split_es_pod],
+ [{"status": "green"}, {"status": "red"}],
+ 'Elasticsearch cluster health status is RED',
+ ),
+])
+def test_check_elasticsearch_cluster_health(pods, health_data, expect_error):
+ test_health_data = list(health_data)
+ check = canned_elasticsearch(lambda cmd, args, task_vars: json.dumps(test_health_data.pop(0)))
+
+ errors = check._check_es_cluster_health(pods_by_name(pods), task_vars_config_base)
+ assert_error(''.join(errors), expect_error)
+
+
+@pytest.mark.parametrize('disk_data, expect_error', [
+ (
+ 'df: /elasticsearch/persistent: No such file or directory\n',
+ 'Could not retrieve storage usage',
+ ),
+ (
+ 'IUse% Use%\n 3% 4%\n',
+ None,
+ ),
+ (
+ 'IUse% Use%\n 95% 40%\n',
+ 'Inode percent usage on the storage volume',
+ ),
+ (
+ 'IUse% Use%\n 3% 94%\n',
+ 'Disk percent usage on the storage volume',
+ ),
+])
+def test_check_elasticsearch_diskspace(disk_data, expect_error):
+ check = canned_elasticsearch(lambda cmd, args, task_vars: disk_data)
+
+ errors = check._check_elasticsearch_diskspace(pods_by_name([plain_es_pod]), task_vars_config_base)
+ assert_error(''.join(errors), expect_error)
diff --git a/roles/openshift_health_checker/test/etcd_imagedata_size_test.py b/roles/openshift_health_checker/test/etcd_imagedata_size_test.py
new file mode 100644
index 000000000..df9d52d41
--- /dev/null
+++ b/roles/openshift_health_checker/test/etcd_imagedata_size_test.py
@@ -0,0 +1,328 @@
+import pytest
+
+from collections import namedtuple
+from openshift_checks.etcd_imagedata_size import EtcdImageDataSize, OpenShiftCheckException
+from etcdkeysize import check_etcd_key_size
+
+
+def fake_etcd_client(root):
+ fake_nodes = dict()
+ fake_etcd_node(root, fake_nodes)
+
+ clientclass = namedtuple("client", ["read"])
+ return clientclass(lambda key, recursive: fake_etcd_result(fake_nodes[key]))
+
+
+def fake_etcd_result(fake_node):
+ resultclass = namedtuple("result", ["leaves"])
+ if not fake_node.dir:
+ return resultclass([fake_node])
+
+ return resultclass(fake_node.leaves)
+
+
+def fake_etcd_node(node, visited):
+ min_req_fields = ["dir", "key"]
+ fields = list(node)
+ leaves = []
+
+ if node["dir"] and node.get("leaves"):
+ for leaf in node["leaves"]:
+ leaves.append(fake_etcd_node(leaf, visited))
+
+ if len(set(min_req_fields) - set(fields)) > 0:
+ raise ValueError("fake etcd nodes require at least {} fields.".format(min_req_fields))
+
+ if node.get("leaves"):
+ node["leaves"] = leaves
+
+ nodeclass = namedtuple("node", fields)
+ nodeinst = nodeclass(**node)
+ visited[nodeinst.key] = nodeinst
+
+ return nodeinst
+
+
+@pytest.mark.parametrize('ansible_mounts,extra_words', [
+ ([], ['none']), # empty ansible_mounts
+ ([{'mount': '/mnt'}], ['/mnt']), # missing relevant mount paths
+])
+def test_cannot_determine_available_mountpath(ansible_mounts, extra_words):
+ task_vars = dict(
+ ansible_mounts=ansible_mounts,
+ )
+ check = EtcdImageDataSize(execute_module=fake_execute_module)
+
+ with pytest.raises(OpenShiftCheckException) as excinfo:
+ check.run(tmp=None, task_vars=task_vars)
+
+ for word in 'determine valid etcd mountpath'.split() + extra_words:
+ assert word in str(excinfo.value)
+
+
+@pytest.mark.parametrize('ansible_mounts,tree,size_limit,should_fail,extra_words', [
+ (
+ # test that default image size limit evals to 1/2 * (total size in use)
+ [{
+ 'mount': '/',
+ 'size_available': 40 * 10**9,
+ 'size_total': 80 * 10**9,
+ }],
+ {"dir": False, "key": "/", "value": "1234"},
+ None,
+ False,
+ [],
+ ),
+ (
+ [{
+ 'mount': '/',
+ 'size_available': 40 * 10**9,
+ 'size_total': 48 * 10**9,
+ }],
+ {"dir": False, "key": "/", "value": "1234"},
+ None,
+ False,
+ [],
+ ),
+ (
+ # set max size limit for image data to be below total node value
+ # total node value is defined as the sum of the value field
+ # from every node
+ [{
+ 'mount': '/',
+ 'size_available': 40 * 10**9,
+ 'size_total': 48 * 10**9,
+ }],
+ {"dir": False, "key": "/", "value": "12345678"},
+ 7,
+ True,
+ ["exceeds the maximum recommended limit", "0.00 GB"],
+ ),
+ (
+ [{
+ 'mount': '/',
+ 'size_available': 48 * 10**9 - 1,
+ 'size_total': 48 * 10**9,
+ }],
+ {"dir": False, "key": "/", "value": "1234"},
+ None,
+ True,
+ ["exceeds the maximum recommended limit", "0.00 GB"],
+ )
+])
+def test_check_etcd_key_size_calculates_correct_limit(ansible_mounts, tree, size_limit, should_fail, extra_words):
+ def execute_module(module_name, args, tmp=None, task_vars=None):
+ if module_name != "etcdkeysize":
+ return {
+ "changed": False,
+ }
+
+ client = fake_etcd_client(tree)
+ s, limit_exceeded = check_etcd_key_size(client, tree["key"], args["size_limit_bytes"])
+
+ return {"size_limit_exceeded": limit_exceeded}
+
+ task_vars = dict(
+ etcd_max_image_data_size_bytes=size_limit,
+ ansible_mounts=ansible_mounts,
+ openshift=dict(
+ master=dict(etcd_hosts=["localhost"]),
+ common=dict(config_base="/var/lib/origin")
+ )
+ )
+ if size_limit is None:
+ task_vars.pop("etcd_max_image_data_size_bytes")
+
+ check = EtcdImageDataSize(execute_module=execute_module).run(tmp=None, task_vars=task_vars)
+
+ if should_fail:
+ assert check["failed"]
+
+ for word in extra_words:
+ assert word in check["msg"]
+ else:
+ assert not check.get("failed", False)
+
+
+@pytest.mark.parametrize('ansible_mounts,tree,root_path,expected_size,extra_words', [
+ (
+ [{
+ 'mount': '/',
+ 'size_available': 40 * 10**9,
+ 'size_total': 80 * 10**9,
+ }],
+ # test recursive size check on tree with height > 1
+ {
+ "dir": True,
+ "key": "/",
+ "leaves": [
+ {"dir": False, "key": "/foo1", "value": "1234"},
+ {"dir": False, "key": "/foo2", "value": "1234"},
+ {"dir": False, "key": "/foo3", "value": "1234"},
+ {"dir": False, "key": "/foo4", "value": "1234"},
+ {
+ "dir": True,
+ "key": "/foo5",
+ "leaves": [
+ {"dir": False, "key": "/foo/bar1", "value": "56789"},
+ {"dir": False, "key": "/foo/bar2", "value": "56789"},
+ {"dir": False, "key": "/foo/bar3", "value": "56789"},
+ {
+ "dir": True,
+ "key": "/foo/bar4",
+ "leaves": [
+ {"dir": False, "key": "/foo/bar/baz1", "value": "123"},
+ {"dir": False, "key": "/foo/bar/baz2", "value": "123"},
+ ]
+ },
+ ]
+ },
+ ]
+ },
+ "/",
+ 37,
+ [],
+ ),
+ (
+ [{
+ 'mount': '/',
+ 'size_available': 40 * 10**9,
+ 'size_total': 80 * 10**9,
+ }],
+ # test correct sub-tree size calculation
+ {
+ "dir": True,
+ "key": "/",
+ "leaves": [
+ {"dir": False, "key": "/foo1", "value": "1234"},
+ {"dir": False, "key": "/foo2", "value": "1234"},
+ {"dir": False, "key": "/foo3", "value": "1234"},
+ {"dir": False, "key": "/foo4", "value": "1234"},
+ {
+ "dir": True,
+ "key": "/foo5",
+ "leaves": [
+ {"dir": False, "key": "/foo/bar1", "value": "56789"},
+ {"dir": False, "key": "/foo/bar2", "value": "56789"},
+ {"dir": False, "key": "/foo/bar3", "value": "56789"},
+ {
+ "dir": True,
+ "key": "/foo/bar4",
+ "leaves": [
+ {"dir": False, "key": "/foo/bar/baz1", "value": "123"},
+ {"dir": False, "key": "/foo/bar/baz2", "value": "123"},
+ ]
+ },
+ ]
+ },
+ ]
+ },
+ "/foo5",
+ 21,
+ [],
+ ),
+ (
+ [{
+ 'mount': '/',
+ 'size_available': 40 * 10**9,
+ 'size_total': 80 * 10**9,
+ }],
+ # test that a non-existing key is handled correctly
+ {
+ "dir": False,
+ "key": "/",
+ "value": "1234",
+ },
+ "/missing",
+ 0,
+ [],
+ ),
+ (
+ [{
+ 'mount': '/',
+ 'size_available': 40 * 10**9,
+ 'size_total': 80 * 10**9,
+ }],
+ # test etcd cycle handling
+ {
+ "dir": True,
+ "key": "/",
+ "leaves": [
+ {"dir": False, "key": "/foo1", "value": "1234"},
+ {"dir": False, "key": "/foo2", "value": "1234"},
+ {"dir": False, "key": "/foo3", "value": "1234"},
+ {"dir": False, "key": "/foo4", "value": "1234"},
+ {
+ "dir": True,
+ "key": "/",
+ "leaves": [
+ {"dir": False, "key": "/foo1", "value": "1"},
+ ],
+ },
+ ]
+ },
+ "/",
+ 16,
+ [],
+ ),
+])
+def test_etcd_key_size_check_calculates_correct_size(ansible_mounts, tree, root_path, expected_size, extra_words):
+ def execute_module(module_name, args, tmp=None, task_vars=None):
+ if module_name != "etcdkeysize":
+ return {
+ "changed": False,
+ }
+
+ client = fake_etcd_client(tree)
+ size, limit_exceeded = check_etcd_key_size(client, root_path, args["size_limit_bytes"])
+
+ assert size == expected_size
+ return {
+ "size_limit_exceeded": limit_exceeded,
+ }
+
+ task_vars = dict(
+ ansible_mounts=ansible_mounts,
+ openshift=dict(
+ master=dict(etcd_hosts=["localhost"]),
+ common=dict(config_base="/var/lib/origin")
+ )
+ )
+
+ check = EtcdImageDataSize(execute_module=execute_module).run(tmp=None, task_vars=task_vars)
+ assert not check.get("failed", False)
+
+
+def test_etcdkeysize_module_failure():
+ def execute_module(module_name, tmp=None, task_vars=None):
+ if module_name != "etcdkeysize":
+ return {
+ "changed": False,
+ }
+
+ return {
+ "rc": 1,
+ "module_stderr": "failure",
+ }
+
+ task_vars = dict(
+ ansible_mounts=[{
+ 'mount': '/',
+ 'size_available': 40 * 10**9,
+ 'size_total': 80 * 10**9,
+ }],
+ openshift=dict(
+ master=dict(etcd_hosts=["localhost"]),
+ common=dict(config_base="/var/lib/origin")
+ )
+ )
+
+ check = EtcdImageDataSize(execute_module=execute_module).run(tmp=None, task_vars=task_vars)
+
+ assert check["failed"]
+ for word in "Failed to retrieve stats":
+ assert word in check["msg"]
+
+
+def fake_execute_module(*args):
+ raise AssertionError('this function should not be called')
diff --git a/roles/openshift_health_checker/test/etcd_volume_test.py b/roles/openshift_health_checker/test/etcd_volume_test.py
new file mode 100644
index 000000000..917045526
--- /dev/null
+++ b/roles/openshift_health_checker/test/etcd_volume_test.py
@@ -0,0 +1,149 @@
+import pytest
+
+from openshift_checks.etcd_volume import EtcdVolume, OpenShiftCheckException
+
+
+@pytest.mark.parametrize('ansible_mounts,extra_words', [
+ ([], ['none']), # empty ansible_mounts
+ ([{'mount': '/mnt'}], ['/mnt']), # missing relevant mount paths
+])
+def test_cannot_determine_available_disk(ansible_mounts, extra_words):
+ task_vars = dict(
+ ansible_mounts=ansible_mounts,
+ )
+ check = EtcdVolume(execute_module=fake_execute_module)
+
+ with pytest.raises(OpenShiftCheckException) as excinfo:
+ check.run(tmp=None, task_vars=task_vars)
+
+ for word in 'Unable to find etcd storage mount point'.split() + extra_words:
+ assert word in str(excinfo.value)
+
+
+@pytest.mark.parametrize('size_limit,ansible_mounts', [
+ (
+ # if no size limit is specified, expect max usage
+ # limit to default to 90% of size_total
+ None,
+ [{
+ 'mount': '/',
+ 'size_available': 40 * 10**9,
+ 'size_total': 80 * 10**9
+ }],
+ ),
+ (
+ 1,
+ [{
+ 'mount': '/',
+ 'size_available': 30 * 10**9,
+ 'size_total': 30 * 10**9,
+ }],
+ ),
+ (
+ 20000000000,
+ [{
+ 'mount': '/',
+ 'size_available': 20 * 10**9,
+ 'size_total': 40 * 10**9,
+ }],
+ ),
+ (
+ 5000000000,
+ [{
+ # not enough space on / ...
+ 'mount': '/',
+ 'size_available': 0,
+ 'size_total': 0,
+ }, {
+ # not enough space on /var/lib ...
+ 'mount': '/var/lib',
+ 'size_available': 2 * 10**9,
+ 'size_total': 21 * 10**9,
+ }, {
+ # ... but enough on /var/lib/etcd
+ 'mount': '/var/lib/etcd',
+ 'size_available': 36 * 10**9,
+ 'size_total': 40 * 10**9
+ }],
+ )
+])
+def test_succeeds_with_recommended_disk_space(size_limit, ansible_mounts):
+ task_vars = dict(
+ etcd_device_usage_threshold_percent=size_limit,
+ ansible_mounts=ansible_mounts,
+ )
+
+ if task_vars["etcd_device_usage_threshold_percent"] is None:
+ task_vars.pop("etcd_device_usage_threshold_percent")
+
+ check = EtcdVolume(execute_module=fake_execute_module)
+ result = check.run(tmp=None, task_vars=task_vars)
+
+ assert not result.get('failed', False)
+
+
+@pytest.mark.parametrize('size_limit_percent,ansible_mounts,extra_words', [
+ (
+ # if no size limit is specified, expect max usage
+ # limit to default to 90% of size_total
+ None,
+ [{
+ 'mount': '/',
+ 'size_available': 1 * 10**9,
+ 'size_total': 100 * 10**9,
+ }],
+ ['99.0%'],
+ ),
+ (
+ 70.0,
+ [{
+ 'mount': '/',
+ 'size_available': 1 * 10**6,
+ 'size_total': 5 * 10**9,
+ }],
+ ['100.0%'],
+ ),
+ (
+ 40.0,
+ [{
+ 'mount': '/',
+ 'size_available': 2 * 10**9,
+ 'size_total': 6 * 10**9,
+ }],
+ ['66.7%'],
+ ),
+ (
+ None,
+ [{
+ # enough space on /var ...
+ 'mount': '/var',
+ 'size_available': 20 * 10**9,
+ 'size_total': 20 * 10**9,
+ }, {
+ # .. but not enough on /var/lib
+ 'mount': '/var/lib',
+ 'size_available': 1 * 10**9,
+ 'size_total': 20 * 10**9,
+ }],
+ ['95.0%'],
+ ),
+])
+def test_fails_with_insufficient_disk_space(size_limit_percent, ansible_mounts, extra_words):
+ task_vars = dict(
+ etcd_device_usage_threshold_percent=size_limit_percent,
+ ansible_mounts=ansible_mounts,
+ )
+
+ if task_vars["etcd_device_usage_threshold_percent"] is None:
+ task_vars.pop("etcd_device_usage_threshold_percent")
+
+ check = EtcdVolume(execute_module=fake_execute_module)
+ result = check.run(tmp=None, task_vars=task_vars)
+
+ assert result['failed']
+ for word in extra_words:
+ assert word in result['msg']
+
+
+def fake_execute_module(*args):
+ raise AssertionError('this function should not be called')
diff --git a/roles/openshift_health_checker/test/fluentd_test.py b/roles/openshift_health_checker/test/fluentd_test.py
new file mode 100644
index 000000000..d151c0b19
--- /dev/null
+++ b/roles/openshift_health_checker/test/fluentd_test.py
@@ -0,0 +1,109 @@
+import pytest
+import json
+
+from openshift_checks.logging.fluentd import Fluentd
+
+
+def canned_fluentd(exec_oc=None):
+ """Create a Fluentd check object with canned exec_oc method"""
+ check = Fluentd("dummy") # fails if a module is actually invoked
+ if exec_oc:
+ check._exec_oc = exec_oc
+ return check
+
+
+def assert_error(error, expect_error):
+ if expect_error:
+ assert error
+ assert expect_error in error
+ else:
+ assert not error
+
+
+fluentd_pod_node1 = {
+ "metadata": {
+ "labels": {"component": "fluentd", "deploymentconfig": "logging-fluentd"},
+ "name": "logging-fluentd-1",
+ },
+ "spec": {"host": "node1", "nodeName": "node1"},
+ "status": {
+ "containerStatuses": [{"ready": True}],
+ "conditions": [{"status": "True", "type": "Ready"}],
+ }
+}
+fluentd_pod_node2_down = {
+ "metadata": {
+ "labels": {"component": "fluentd", "deploymentconfig": "logging-fluentd"},
+ "name": "logging-fluentd-2",
+ },
+ "spec": {"host": "node2", "nodeName": "node2"},
+ "status": {
+ "containerStatuses": [{"ready": False}],
+ "conditions": [{"status": "False", "type": "Ready"}],
+ }
+}
+fluentd_node1 = {
+ "metadata": {
+ "labels": {"logging-infra-fluentd": "true", "kubernetes.io/hostname": "node1"},
+ "name": "node1",
+ },
+ "status": {"addresses": [{"type": "InternalIP", "address": "10.10.1.1"}]},
+}
+fluentd_node2 = {
+ "metadata": {
+ "labels": {"logging-infra-fluentd": "true", "kubernetes.io/hostname": "hostname"},
+ "name": "node2",
+ },
+ "status": {"addresses": [{"type": "InternalIP", "address": "10.10.1.2"}]},
+}
+fluentd_node3_unlabeled = {
+ "metadata": {
+ "labels": {"kubernetes.io/hostname": "hostname"},
+ "name": "node3",
+ },
+ "status": {"addresses": [{"type": "InternalIP", "address": "10.10.1.3"}]},
+}
+
+
+@pytest.mark.parametrize('pods, nodes, expect_error', [
+ (
+ [],
+ [],
+ 'No nodes appear to be defined',
+ ),
+ (
+ [],
+ [fluentd_node3_unlabeled],
+ 'There are no nodes with the fluentd label',
+ ),
+ (
+ [],
+ [fluentd_node1, fluentd_node3_unlabeled],
+ 'Fluentd will not aggregate logs from these nodes.',
+ ),
+ (
+ [],
+ [fluentd_node2],
+ "nodes are supposed to have a Fluentd pod but do not",
+ ),
+ (
+ [fluentd_pod_node1, fluentd_pod_node1],
+ [fluentd_node1],
+ 'more Fluentd pods running than nodes labeled',
+ ),
+ (
+ [fluentd_pod_node2_down],
+ [fluentd_node2],
+ "Fluentd pods are supposed to be running",
+ ),
+ (
+ [fluentd_pod_node1],
+ [fluentd_node1],
+ None,
+ ),
+])
+def test_get_fluentd_pods(pods, nodes, expect_error):
+ check = canned_fluentd(lambda cmd, args, task_vars: json.dumps(dict(items=nodes)))
+
+ error = check.check_fluentd(pods, {})
+ assert_error(error, expect_error)
diff --git a/roles/openshift_health_checker/test/kibana_test.py b/roles/openshift_health_checker/test/kibana_test.py
new file mode 100644
index 000000000..19140a1b6
--- /dev/null
+++ b/roles/openshift_health_checker/test/kibana_test.py
@@ -0,0 +1,218 @@
+import pytest
+import json
+
+try:
+ import urllib2
+ from urllib2 import HTTPError, URLError
+except ImportError:
+ from urllib.error import HTTPError, URLError
+ import urllib.request as urllib2
+
+from openshift_checks.logging.kibana import Kibana
+
+
+def canned_kibana(exec_oc=None):
+ """Create a Kibana check object with canned exec_oc method"""
+ check = Kibana("dummy") # fails if a module is actually invoked
+ if exec_oc:
+ check._exec_oc = exec_oc
+ return check
+
+
+def assert_error(error, expect_error):
+ if expect_error:
+ assert error
+ assert expect_error in error
+ else:
+ assert not error
+
+
+plain_kibana_pod = {
+ "metadata": {
+ "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"},
+ "name": "logging-kibana-1",
+ },
+ "status": {
+ "containerStatuses": [{"ready": True}, {"ready": True}],
+ "conditions": [{"status": "True", "type": "Ready"}],
+ }
+}
+not_running_kibana_pod = {
+ "metadata": {
+ "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"},
+ "name": "logging-kibana-2",
+ },
+ "status": {
+ "containerStatuses": [{"ready": True}, {"ready": False}],
+ "conditions": [{"status": "True", "type": "Ready"}],
+ }
+}
+
+
+@pytest.mark.parametrize('pods, expect_error', [
+ (
+ [],
+ "There are no Kibana pods deployed",
+ ),
+ (
+ [plain_kibana_pod],
+ None,
+ ),
+ (
+ [not_running_kibana_pod],
+ "No Kibana pod is in a running state",
+ ),
+ (
+ [plain_kibana_pod, not_running_kibana_pod],
+ "The following Kibana pods are not currently in a running state",
+ ),
+])
+def test_check_kibana(pods, expect_error):
+ check = canned_kibana()
+ error = check.check_kibana(pods)
+ assert_error(error, expect_error)
+
+
+@pytest.mark.parametrize('route, expect_url, expect_error', [
+ (
+ None,
+ None,
+ 'no_route_exists',
+ ),
+
+ # test route with no ingress
+ (
+ {
+ "metadata": {
+ "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"},
+ "name": "logging-kibana",
+ },
+ "status": {
+ "ingress": [],
+ },
+ "spec": {
+ "host": "hostname",
+ }
+ },
+ None,
+ 'route_not_accepted',
+ ),
+
+ # test route with no host
+ (
+ {
+ "metadata": {
+ "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"},
+ "name": "logging-kibana",
+ },
+ "status": {
+ "ingress": [{
+ "status": True,
+ }],
+ },
+ "spec": {},
+ },
+ None,
+ 'route_missing_host',
+ ),
+
+ # test route that looks fine
+ (
+ {
+ "metadata": {
+ "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"},
+ "name": "logging-kibana",
+ },
+ "status": {
+ "ingress": [{
+ "status": True,
+ }],
+ },
+ "spec": {
+ "host": "hostname",
+ },
+ },
+ "https://hostname/",
+ None,
+ ),
+])
+def test_get_kibana_url(route, expect_url, expect_error):
+ check = canned_kibana(lambda cmd, args, task_vars: json.dumps(route) if route else "")
+
+ url, error = check._get_kibana_url({})
+ if expect_url:
+ assert url == expect_url
+ else:
+ assert not url
+ if expect_error:
+ assert error == expect_error
+ else:
+ assert not error
+
+
+@pytest.mark.parametrize('exec_result, expect', [
+ (
+ 'urlopen error [Errno 111] Connection refused',
+ 'at least one router routing to it?',
+ ),
+ (
+ 'urlopen error [Errno -2] Name or service not known',
+ 'DNS configured for the Kibana hostname?',
+ ),
+ (
+ 'Status code was not [302]: HTTP Error 500: Server error',
+ 'did not return the correct status code',
+ ),
+ (
+ 'bork bork bork',
+ 'bork bork bork', # should pass through
+ ),
+])
+def test_verify_url_internal_failure(exec_result, expect):
+ check = Kibana(execute_module=lambda module_name, args, task_vars: dict(failed=True, msg=exec_result))
+ check._get_kibana_url = lambda task_vars: ('url', None)
+
+ error = check._check_kibana_route({})
+ assert_error(error, expect)
+
+
+@pytest.mark.parametrize('lib_result, expect', [
+ (
+ HTTPError('url', 500, "it broke", hdrs=None, fp=None),
+ 'it broke',
+ ),
+ (
+ URLError('it broke'),
+ 'it broke',
+ ),
+ (
+ 302,
+ 'returned the wrong error code',
+ ),
+ (
+ 200,
+ None,
+ ),
+])
+def test_verify_url_external_failure(lib_result, expect, monkeypatch):
+
+ class _http_return:
+
+ def __init__(self, code):
+ self.code = code
+
+ def getcode(self):
+ return self.code
+
+ def urlopen(url, context):
+ if type(lib_result) is int:
+ return _http_return(lib_result)
+ raise lib_result
+ monkeypatch.setattr(urllib2, 'urlopen', urlopen)
+
+ check = canned_kibana()
+ check._get_kibana_url = lambda task_vars: ('url', None)
+ check._verify_url_internal = lambda url, task_vars: None
+
+ error = check._check_kibana_route({})
+ assert_error(error, expect)
diff --git a/roles/openshift_health_checker/test/logging_check_test.py b/roles/openshift_health_checker/test/logging_check_test.py
new file mode 100644
index 000000000..b6db34fe3
--- /dev/null
+++ b/roles/openshift_health_checker/test/logging_check_test.py
@@ -0,0 +1,137 @@
+import pytest
+import json
+
+from openshift_checks.logging.logging import LoggingCheck, OpenShiftCheckException
+
+task_vars_config_base = dict(openshift=dict(common=dict(config_base='/etc/origin')))
+
+
+logging_namespace = "logging"
+
+
+def canned_loggingcheck(exec_oc=None):
+ """Create a LoggingCheck object with canned exec_oc method"""
+ check = LoggingCheck("dummy") # fails if a module is actually invoked
+ check.logging_namespace = 'logging'
+ if exec_oc:
+ check.exec_oc = exec_oc
+ return check
+
+
+def assert_error(error, expect_error):
+ if expect_error:
+ assert error
+ assert expect_error in error
+ else:
+ assert not error
+
+
+plain_es_pod = {
+ "metadata": {
+ "labels": {"component": "es", "deploymentconfig": "logging-es"},
+ "name": "logging-es",
+ },
+ "status": {
+ "conditions": [{"status": "True", "type": "Ready"}],
+ "containerStatuses": [{"ready": True}],
+ "podIP": "10.10.10.10",
+ },
+ "_test_master_name_str": "name logging-es",
+}
+
+plain_kibana_pod = {
+ "metadata": {
+ "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"},
+ "name": "logging-kibana-1",
+ },
+ "status": {
+ "containerStatuses": [{"ready": True}, {"ready": True}],
+ "conditions": [{"status": "True", "type": "Ready"}],
+ }
+}
+
+fluentd_pod_node1 = {
+ "metadata": {
+ "labels": {"component": "fluentd", "deploymentconfig": "logging-fluentd"},
+ "name": "logging-fluentd-1",
+ },
+ "spec": {"host": "node1", "nodeName": "node1"},
+ "status": {
+ "containerStatuses": [{"ready": True}],
+ "conditions": [{"status": "True", "type": "Ready"}],
+ }
+}
+
+plain_curator_pod = {
+ "metadata": {
+ "labels": {"component": "curator", "deploymentconfig": "logging-curator"},
+ "name": "logging-curator-1",
+ },
+ "status": {
+ "containerStatuses": [{"ready": True}],
+ "conditions": [{"status": "True", "type": "Ready"}],
+ "podIP": "10.10.10.10",
+ }
+}
+
+
+@pytest.mark.parametrize('problem, expect', [
+ ("[Errno 2] No such file or directory", "supposed to be a master"),
+ ("Permission denied", "Unexpected error using `oc`"),
+])
+def test_oc_failure(problem, expect):
+ def execute_module(module_name, args, task_vars):
+ if module_name == "ocutil":
+ return dict(failed=True, result=problem)
+ return dict(changed=False)
+
+ check = LoggingCheck({})
+
+ with pytest.raises(OpenShiftCheckException) as excinfo:
+ check.exec_oc(execute_module, logging_namespace, 'get foo', [], task_vars=task_vars_config_base)
+ assert expect in str(excinfo)
+
+
+groups_with_first_master = dict(masters=['this-host', 'other-host'])
+groups_with_second_master = dict(masters=['other-host', 'this-host'])
+groups_not_a_master = dict(masters=['other-host'])
+
+
+@pytest.mark.parametrize('groups, logging_deployed, is_active', [
+ (groups_with_first_master, True, True),
+ (groups_with_first_master, False, False),
+ (groups_not_a_master, True, False),
+ (groups_with_second_master, True, False),
+ (groups_not_a_master, True, False),
+])
+def test_is_active(groups, logging_deployed, is_active):
+ task_vars = dict(
+ ansible_ssh_host='this-host',
+ groups=groups,
+ openshift_hosted_logging_deploy=logging_deployed,
+ )
+
+ assert LoggingCheck.is_active(task_vars=task_vars) == is_active
+
+
+@pytest.mark.parametrize('pod_output, expect_pods, expect_error', [
+ (
+ 'No resources found.',
+ None,
+ 'There are no pods in the logging namespace',
+ ),
+ (
+ json.dumps({'items': [plain_kibana_pod, plain_es_pod, plain_curator_pod, fluentd_pod_node1]}),
+ [plain_es_pod],
+ None,
+ ),
+])
+def test_get_pods_for_component(pod_output, expect_pods, expect_error):
+ check = canned_loggingcheck(lambda exec_module, namespace, cmd, args, task_vars: pod_output)
+ pods, error = check.get_pods_for_component(
+ lambda name, args, task_vars: {},
+ logging_namespace,
+ "es",
+ {}
+ )
+ assert_error(error, expect_error)
diff --git a/roles/openshift_health_checker/test/memory_availability_test.py b/roles/openshift_health_checker/test/memory_availability_test.py
new file mode 100644
index 000000000..4fbaea0a9
--- /dev/null
+++ b/roles/openshift_health_checker/test/memory_availability_test.py
@@ -0,0 +1,129 @@
+import pytest
+
+from openshift_checks.memory_availability import MemoryAvailability
+
+
+@pytest.mark.parametrize('group_names,is_active', [
+ (['masters'], True),
+ (['nodes'], True),
+ (['etcd'], True),
+ (['masters', 'nodes'], True),
+ (['masters', 'etcd'], True),
+ ([], False),
+ (['lb'], False),
+ (['nfs'], False),
+])
+def test_is_active(group_names, is_active):
+ task_vars = dict(
+ group_names=group_names,
+ )
+ assert MemoryAvailability.is_active(task_vars=task_vars) == is_active
+
+
+@pytest.mark.parametrize('group_names,configured_min,ansible_memtotal_mb', [
+ (
+ ['masters'],
+ 0,
+ 17200,
+ ),
+ (
+ ['nodes'],
+ 0,
+ 8200,
+ ),
+ (
+ ['nodes'],
+ 1, # configure lower threshold
+ 2000, # too low for recommended but not for configured
+ ),
+ (
+ ['nodes'],
+ 2, # configure threshold where adjustment pushes it over
+ 1900,
+ ),
+ (
+ ['etcd'],
+ 0,
+ 8200,
+ ),
+ (
+ ['masters', 'nodes'],
+ 0,
+ 17000,
+ ),
+])
+def test_succeeds_with_recommended_memory(group_names, configured_min, ansible_memtotal_mb):
+ task_vars = dict(
+ group_names=group_names,
+ openshift_check_min_host_memory_gb=configured_min,
+ ansible_memtotal_mb=ansible_memtotal_mb,
+ )
+
+ check = MemoryAvailability(execute_module=fake_execute_module)
+ result = check.run(tmp=None, task_vars=task_vars)
+
+ assert not result.get('failed', False)
+
+
+@pytest.mark.parametrize('group_names,configured_min,ansible_memtotal_mb,extra_words', [
+ (
+ ['masters'],
+ 0,
+ 0,
+ ['0.0 GiB'],
+ ),
+ (
+ ['nodes'],
+ 0,
+ 100,
+ ['0.1 GiB'],
+ ),
+ (
+ ['nodes'],
+ 24, # configure higher threshold
+ 20 * 1024, # enough to meet recommended but not configured
+ ['20.0 GiB'],
+ ),
+ (
+ ['nodes'],
+ 24, # configure higher threshold
+ 22 * 1024, # not enough for adjustment to push over threshold
+ ['22.0 GiB'],
+ ),
+ (
+ ['etcd'],
+ 0,
+ 6 * 1024,
+ ['6.0 GiB'],
+ ),
+ (
+ ['etcd', 'masters'],
+ 0,
+ 9 * 1024, # enough memory for etcd, not enough for a master
+ ['9.0 GiB'],
+ ),
+ (
+ ['nodes', 'masters'],
+ 0,
+ # enough memory for a node, not enough for a master
+ 11 * 1024,
+ ['11.0 GiB'],
+ ),
+])
+def test_fails_with_insufficient_memory(group_names, configured_min, ansible_memtotal_mb, extra_words):
+ task_vars = dict(
+ group_names=group_names,
+ openshift_check_min_host_memory_gb=configured_min,
+ ansible_memtotal_mb=ansible_memtotal_mb,
+ )
+
+ check = MemoryAvailability(execute_module=fake_execute_module)
+ result = check.run(tmp=None, task_vars=task_vars)
+
+ assert result.get('failed', False)
+ for word in 'below recommended'.split() + extra_words:
+ assert word in result['msg']
+
+
+def fake_execute_module(*args):
+ raise AssertionError('this function should not be called')
diff --git a/roles/openshift_health_checker/test/ovs_version_test.py b/roles/openshift_health_checker/test/ovs_version_test.py
new file mode 100644
index 000000000..6494e1c06
--- /dev/null
+++ b/roles/openshift_health_checker/test/ovs_version_test.py
@@ -0,0 +1,89 @@
+import pytest
+
+from openshift_checks.ovs_version import OvsVersion, OpenShiftCheckException
+
+
+def test_openshift_version_not_supported():
+ def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None):
+ return {}
+
+ openshift_release = '111.7.0'
+
+ task_vars = dict(
+ openshift=dict(common=dict(service_type='origin')),
+ openshift_release=openshift_release,
+ openshift_image_tag='v' + openshift_release,
+ openshift_deployment_type='origin',
+ )
+
+ check = OvsVersion(execute_module=execute_module)
+ with pytest.raises(OpenShiftCheckException) as excinfo:
+ check.run(tmp=None, task_vars=task_vars)
+
+ assert "no recommended version of Open vSwitch" in str(excinfo.value)
+
+
+def test_invalid_openshift_release_format():
+ def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None):
+ return {}
+
+ task_vars = dict(
+ openshift=dict(common=dict(service_type='origin')),
+ openshift_image_tag='v0',
+ openshift_deployment_type='origin',
+ )
+
+ check = OvsVersion(execute_module=execute_module)
+ with pytest.raises(OpenShiftCheckException) as excinfo:
+ check.run(tmp=None, task_vars=task_vars)
+ assert "invalid version" in str(excinfo.value)
+
+
+@pytest.mark.parametrize('openshift_release,expected_ovs_version', [
+ ("3.5", "2.6"),
+ ("3.6", "2.6"),
+ ("3.4", "2.4"),
+ ("3.3", "2.4"),
+ ("1.0", "2.4"),
+])
+def test_ovs_package_version(openshift_release, expected_ovs_version):
+ task_vars = dict(
+ openshift=dict(common=dict(service_type='origin')),
+ openshift_release=openshift_release,
+ openshift_image_tag='v' + openshift_release,
+ )
+ return_value = object()
+
+ def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None):
+ assert module_name == 'rpm_version'
+ assert "package_list" in module_args
+
+ for pkg in module_args["package_list"]:
+ if pkg["name"] == "openvswitch":
+ assert pkg["version"] == expected_ovs_version
+
+ return return_value
+
+ check = OvsVersion(execute_module=execute_module)
+ result = check.run(tmp=None, task_vars=task_vars)
+ assert result is return_value
+
+
+@pytest.mark.parametrize('group_names,is_containerized,is_active', [
+ (['masters'], False, True),
+ # ensure check is skipped on containerized installs
+ (['masters'], True, False),
+ (['nodes'], False, True),
+ (['masters', 'nodes'], False, True),
+ (['masters', 'etcd'], False, True),
+ ([], False, False),
+ (['etcd'], False, False),
+ (['lb'], False, False),
+ (['nfs'], False, False),
+])
+def test_ovs_version_skip_when_not_master_nor_node(group_names, is_containerized, is_active):
+ task_vars = dict(
+ group_names=group_names,
+ openshift=dict(common=dict(is_containerized=is_containerized)),
+ )
+ assert OvsVersion.is_active(task_vars=task_vars) == is_active
diff --git a/roles/openshift_health_checker/test/package_availability_test.py b/roles/openshift_health_checker/test/package_availability_test.py
index 25385339a..f7e916a46 100644
--- a/roles/openshift_health_checker/test/package_availability_test.py
+++ b/roles/openshift_health_checker/test/package_availability_test.py
@@ -3,6 +3,20 @@ import pytest
from openshift_checks.package_availability import PackageAvailability
+@pytest.mark.parametrize('pkg_mgr,is_containerized,is_active', [
+ ('yum', False, True),
+ ('yum', True, False),
+ ('dnf', True, False),
+ ('dnf', False, False),
+])
+def test_is_active(pkg_mgr, is_containerized, is_active):
+ task_vars = dict(
+ ansible_pkg_mgr=pkg_mgr,
+ openshift=dict(common=dict(is_containerized=is_containerized)),
+ )
+ assert PackageAvailability.is_active(task_vars=task_vars) == is_active
+
+
@pytest.mark.parametrize('task_vars,must_have_packages,must_not_have_packages', [
(
dict(openshift=dict(common=dict(service_type='openshift'))),
diff --git a/roles/openshift_health_checker/test/package_version_test.py b/roles/openshift_health_checker/test/package_version_test.py
index cc1d263bc..91eace512 100644
--- a/roles/openshift_health_checker/test/package_version_test.py
+++ b/roles/openshift_health_checker/test/package_version_test.py
@@ -1,21 +1,154 @@
-from openshift_checks.package_version import PackageVersion
+import pytest
+from openshift_checks.package_version import PackageVersion, OpenShiftCheckException
-def test_package_version():
+
+@pytest.mark.parametrize('openshift_release, extra_words', [
+ ('111.7.0', ["no recommended version of Open vSwitch"]),
+ ('0.0.0', ["no recommended version of Docker"]),
+])
+def test_openshift_version_not_supported(openshift_release, extra_words):
+ def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None):
+ return {}
+
+ task_vars = dict(
+ openshift=dict(common=dict(service_type='origin')),
+ openshift_release=openshift_release,
+ openshift_image_tag='v' + openshift_release,
+ openshift_deployment_type='origin',
+ )
+
+ check = PackageVersion(execute_module=execute_module)
+ with pytest.raises(OpenShiftCheckException) as excinfo:
+ check.run(tmp=None, task_vars=task_vars)
+
+ for word in extra_words:
+ assert word in str(excinfo.value)
+
+
+def test_invalid_openshift_release_format():
+ def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None):
+ return {}
+
+ task_vars = dict(
+ openshift=dict(common=dict(service_type='origin')),
+ openshift_image_tag='v0',
+ openshift_deployment_type='origin',
+ )
+
+ check = PackageVersion(execute_module=execute_module)
+ with pytest.raises(OpenShiftCheckException) as excinfo:
+ check.run(tmp=None, task_vars=task_vars)
+ assert "invalid version" in str(excinfo.value)
+
+
+@pytest.mark.parametrize('openshift_release', [
+ "3.5",
+ "3.6",
+ "3.4",
+ "3.3",
+])
+def test_package_version(openshift_release):
+ task_vars = dict(
+ openshift=dict(common=dict(service_type='origin')),
+ openshift_release=openshift_release,
+ openshift_image_tag='v' + openshift_release,
+ openshift_deployment_type='origin',
+ )
+ return_value = object()
+
+ def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None):
+ assert module_name == 'aos_version'
+ assert "package_list" in module_args
+
+ for pkg in module_args["package_list"]:
+ if "-master" in pkg["name"] or "-node" in pkg["name"]:
+ assert pkg["version"] == task_vars["openshift_release"]
+
+ return return_value
+
+ check = PackageVersion(execute_module=execute_module)
+ result = check.run(tmp=None, task_vars=task_vars)
+ assert result is return_value
+
+
+@pytest.mark.parametrize('deployment_type,openshift_release,expected_ovs_version', [
+ ("openshift-enterprise", "3.5", "2.6"),
+ ("origin", "3.6", "2.6"),
+ ("openshift-enterprise", "3.4", "2.4"),
+ ("origin", "3.3", "2.4"),
+])
+def test_ovs_package_version(deployment_type, openshift_release, expected_ovs_version):
task_vars = dict(
openshift=dict(common=dict(service_type='origin')),
- openshift_release='v3.5',
+ openshift_release=openshift_release,
+ openshift_image_tag='v' + openshift_release,
+ openshift_deployment_type=deployment_type,
)
return_value = object()
def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None):
assert module_name == 'aos_version'
- assert 'prefix' in module_args
- assert 'version' in module_args
- assert module_args['prefix'] == task_vars['openshift']['common']['service_type']
- assert module_args['version'] == task_vars['openshift_release']
+ assert "package_list" in module_args
+
+ for pkg in module_args["package_list"]:
+ if pkg["name"] == "openvswitch":
+ assert pkg["version"] == expected_ovs_version
+
return return_value
check = PackageVersion(execute_module=execute_module)
result = check.run(tmp=None, task_vars=task_vars)
assert result is return_value
+
+
+@pytest.mark.parametrize('deployment_type,openshift_release,expected_docker_version', [
+ ("origin", "3.5", "1.12"),
+ ("openshift-enterprise", "3.4", "1.12"),
+ ("origin", "3.3", "1.10"),
+ ("openshift-enterprise", "3.2", "1.10"),
+ ("origin", "3.1", "1.8"),
+ ("openshift-enterprise", "3.1", "1.8"),
+])
+def test_docker_package_version(deployment_type, openshift_release, expected_docker_version):
+ task_vars = dict(
+ openshift=dict(common=dict(service_type='origin')),
+ openshift_release=openshift_release,
+ openshift_image_tag='v' + openshift_release,
+ openshift_deployment_type=deployment_type,
+ )
+ return_value = object()
+
+ def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None):
+ assert module_name == 'aos_version'
+ assert "package_list" in module_args
+
+ for pkg in module_args["package_list"]:
+ if pkg["name"] == "docker":
+ assert pkg["version"] == expected_docker_version
+
+ return return_value
+
+ check = PackageVersion(execute_module=execute_module)
+ result = check.run(tmp=None, task_vars=task_vars)
+ assert result is return_value
+
+
+@pytest.mark.parametrize('group_names,is_containerized,is_active', [
+ (['masters'], False, True),
+ # ensure check is skipped on containerized installs
+ (['masters'], True, False),
+ (['nodes'], False, True),
+ (['masters', 'nodes'], False, True),
+ (['masters', 'etcd'], False, True),
+ ([], False, False),
+ (['etcd'], False, False),
+ (['lb'], False, False),
+ (['nfs'], False, False),
+])
+def test_package_version_skip_when_not_master_nor_node(group_names, is_containerized, is_active):
+ task_vars = dict(
+ group_names=group_names,
+ openshift=dict(common=dict(is_containerized=is_containerized)),
+ )
+ assert PackageVersion.is_active(task_vars=task_vars) == is_active
diff --git a/roles/openshift_health_checker/test/rpm_version_test.py b/roles/openshift_health_checker/test/rpm_version_test.py
new file mode 100644
index 000000000..2f09ef965
--- /dev/null
+++ b/roles/openshift_health_checker/test/rpm_version_test.py
@@ -0,0 +1,82 @@
+import pytest
+import rpm_version
+
+expected_pkgs = {
+ "spam": {
+ "name": "spam",
+ "version": "3.2.1",
+ },
+ "eggs": {
+ "name": "eggs",
+ "version": "3.2.1",
+ },
+}
+
+
+@pytest.mark.parametrize('pkgs, expect_not_found', [
+ (
+ {},
+ ["spam", "eggs"], # none found
+ ),
+ (
+ {"spam": ["3.2.1", "4.5.1"]},
+ ["eggs"], # completely missing
+ ),
+ (
+ {
+ "spam": ["3.2.1", "4.5.1"],
+ "eggs": ["3.2.1"],
+ },
+ [], # all found
+ ),
+])
+def test_check_pkg_found(pkgs, expect_not_found):
+ if expect_not_found:
+ with pytest.raises(rpm_version.RpmVersionException) as e:
+ rpm_version._check_pkg_versions(pkgs, expected_pkgs)
+
+ assert "not found to be installed" in str(e.value)
+ assert set(expect_not_found) == set(e.value.problem_pkgs)
+ else:
+ rpm_version._check_pkg_versions(pkgs, expected_pkgs)
+
+
+@pytest.mark.parametrize('pkgs, expect_not_found', [
+ (
+ {
+ 'spam': ['3.2.1'],
+ 'eggs': ['3.3.2'],
+ },
+ {
+ "eggs": {
+ "required_version": "3.2",
+ "found_versions": ["3.3"],
+ }
+ }, # not the right version
+ ),
+ (
+ {
+ 'spam': ['3.1.2', "3.3.2"],
+ 'eggs': ['3.3.2', "1.2.3"],
+ },
+ {
+ "eggs": {
+ "required_version": "3.2",
+ "found_versions": ["3.3", "1.2"],
+ },
+ "spam": {
+ "required_version": "3.2",
+ "found_versions": ["3.1", "3.3"],
+ }
+ }, # not the right version
+ ),
+])
+def test_check_pkg_version_found(pkgs, expect_not_found):
+ if expect_not_found:
+ with pytest.raises(rpm_version.RpmVersionException) as e:
+ rpm_version._check_pkg_versions(pkgs, expected_pkgs)
+
+ assert "found to be installed with an incorrect version" in str(e.value)
+ assert expect_not_found == e.value.problem_pkgs
+ else:
+ rpm_version._check_pkg_versions(pkgs, expected_pkgs)