summaryrefslogtreecommitdiffstats
path: root/roles/openshift_health_checker
diff options
context:
space:
mode:
Diffstat (limited to 'roles/openshift_health_checker')
-rw-r--r--roles/openshift_health_checker/action_plugins/openshift_health_check.py5
-rw-r--r--roles/openshift_health_checker/callback_plugins/zz_failure_summary.py3
-rwxr-xr-xroles/openshift_health_checker/library/aos_version.py28
-rw-r--r--roles/openshift_health_checker/library/search_journalctl.py150
-rw-r--r--roles/openshift_health_checker/openshift_checks/disk_availability.py120
-rw-r--r--roles/openshift_health_checker/openshift_checks/docker_image_availability.py3
-rw-r--r--roles/openshift_health_checker/openshift_checks/docker_storage.py181
-rw-r--r--roles/openshift_health_checker/openshift_checks/etcd_traffic.py47
-rw-r--r--roles/openshift_health_checker/openshift_checks/logging/kibana.py2
-rw-r--r--roles/openshift_health_checker/openshift_checks/logging/logging.py6
-rw-r--r--roles/openshift_health_checker/openshift_checks/package_version.py4
-rw-r--r--roles/openshift_health_checker/test/action_plugin_test.py12
-rw-r--r--roles/openshift_health_checker/test/aos_version_test.py137
-rw-r--r--roles/openshift_health_checker/test/disk_availability_test.py27
-rw-r--r--roles/openshift_health_checker/test/docker_image_availability_test.py14
-rw-r--r--roles/openshift_health_checker/test/docker_storage_test.py109
-rw-r--r--roles/openshift_health_checker/test/etcd_traffic_test.py80
-rw-r--r--roles/openshift_health_checker/test/kibana_test.py2
-rw-r--r--roles/openshift_health_checker/test/logging_check_test.py32
-rw-r--r--roles/openshift_health_checker/test/package_version_test.py30
-rw-r--r--roles/openshift_health_checker/test/search_journalctl_test.py157
21 files changed, 956 insertions, 193 deletions
diff --git a/roles/openshift_health_checker/action_plugins/openshift_health_check.py b/roles/openshift_health_checker/action_plugins/openshift_health_check.py
index a62e4331e..0390dc82e 100644
--- a/roles/openshift_health_checker/action_plugins/openshift_health_check.py
+++ b/roles/openshift_health_checker/action_plugins/openshift_health_check.py
@@ -38,14 +38,13 @@ class ActionModule(ActionBase):
try:
known_checks = self.load_known_checks()
+ args = self._task.args
+ resolved_checks = resolve_checks(args.get("checks", []), known_checks.values())
except OpenShiftCheckException as e:
result["failed"] = True
result["msg"] = str(e)
return result
- args = self._task.args
- resolved_checks = resolve_checks(args.get("checks", []), known_checks.values())
-
result["checks"] = check_results = {}
user_disabled_checks = [
diff --git a/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py b/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py
index 64c29a8d9..443b76ea1 100644
--- a/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py
+++ b/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py
@@ -39,7 +39,8 @@ class CallbackModule(CallbackBase):
def v2_runner_on_failed(self, result, ignore_errors=False):
super(CallbackModule, self).v2_runner_on_failed(result, ignore_errors)
- self.__failures.append(dict(result=result, ignore_errors=ignore_errors))
+ if not ignore_errors:
+ self.__failures.append(dict(result=result, ignore_errors=ignore_errors))
def v2_playbook_on_stats(self, stats):
super(CallbackModule, self).v2_playbook_on_stats(stats)
diff --git a/roles/openshift_health_checker/library/aos_version.py b/roles/openshift_health_checker/library/aos_version.py
index 4c205e48c..4f43ee751 100755
--- a/roles/openshift_health_checker/library/aos_version.py
+++ b/roles/openshift_health_checker/library/aos_version.py
@@ -19,6 +19,10 @@ the inventory, the version comparison checks just pass.
'''
from ansible.module_utils.basic import AnsibleModule
+# NOTE: because of the dependency on yum (Python 2-only), this module does not
+# work under Python 3. But since we run unit tests against both Python 2 and
+# Python 3, we use six for cross compatibility in this module alone:
+from ansible.module_utils.six import string_types
IMPORT_EXCEPTION = None
try:
@@ -122,12 +126,15 @@ def _check_precise_version_found(pkgs, expected_pkgs_dict):
for pkg in pkgs:
if pkg.name not in expected_pkgs_dict:
continue
- # does the version match, to the precision requested?
- # and, is it strictly greater, at the precision requested?
- expected_pkg_version = expected_pkgs_dict[pkg.name]["version"]
- match_version = '.'.join(pkg.version.split('.')[:expected_pkg_version.count('.') + 1])
- if match_version == expected_pkg_version:
- pkgs_precise_version_found.add(pkg.name)
+ expected_pkg_versions = expected_pkgs_dict[pkg.name]["version"]
+ if isinstance(expected_pkg_versions, string_types):
+ expected_pkg_versions = [expected_pkg_versions]
+ for expected_pkg_version in expected_pkg_versions:
+ # does the version match, to the precision requested?
+ # and, is it strictly greater, at the precision requested?
+ match_version = '.'.join(pkg.version.split('.')[:expected_pkg_version.count('.') + 1])
+ if match_version == expected_pkg_version:
+ pkgs_precise_version_found.add(pkg.name)
not_found = []
for name, pkg in expected_pkgs_dict.items():
@@ -157,8 +164,13 @@ def _check_higher_version_found(pkgs, expected_pkgs_dict):
for pkg in pkgs:
if pkg.name not in expected_pkg_names:
continue
- expected_pkg_version = expected_pkgs_dict[pkg.name]["version"]
- req_release_arr = [int(segment) for segment in expected_pkg_version.split(".")]
+ expected_pkg_versions = expected_pkgs_dict[pkg.name]["version"]
+ if isinstance(expected_pkg_versions, string_types):
+ expected_pkg_versions = [expected_pkg_versions]
+ # NOTE: the list of versions is assumed to be sorted so that the highest
+ # desirable version is the last.
+ highest_desirable_version = expected_pkg_versions[-1]
+ req_release_arr = [int(segment) for segment in highest_desirable_version.split(".")]
version = [int(segment) for segment in pkg.version.split(".")]
too_high = version[:len(req_release_arr)] > req_release_arr
higher_than_seen = version > higher_version_for_pkg.get(pkg.name, [])
diff --git a/roles/openshift_health_checker/library/search_journalctl.py b/roles/openshift_health_checker/library/search_journalctl.py
new file mode 100644
index 000000000..3631f71c8
--- /dev/null
+++ b/roles/openshift_health_checker/library/search_journalctl.py
@@ -0,0 +1,150 @@
+#!/usr/bin/python
+"""Interface to journalctl."""
+
+from time import time
+import json
+import re
+import subprocess
+
+from ansible.module_utils.basic import AnsibleModule
+
+
+class InvalidMatcherRegexp(Exception):
+ """Exception class for invalid matcher regexp."""
+ pass
+
+
+class InvalidLogEntry(Exception):
+ """Exception class for invalid / non-json log entries."""
+ pass
+
+
+class LogInputSubprocessError(Exception):
+ """Exception class for errors that occur while executing a subprocess."""
+ pass
+
+
+def main():
+ """Scan a given list of "log_matchers" for journalctl messages containing given patterns.
+ "log_matchers" is a list of dicts consisting of three keys that help fine-tune log searching:
+ 'start_regexp', 'regexp', and 'unit'.
+
+ Sample "log_matchers" list:
+
+ [
+ {
+ 'start_regexp': r'Beginning of systemd unit',
+ 'regexp': r'the specific log message to find',
+ 'unit': 'etcd',
+ }
+ ]
+ """
+ module = AnsibleModule(
+ argument_spec=dict(
+ log_count_limit=dict(type="int", default=500),
+ log_matchers=dict(type="list", required=True),
+ ),
+ )
+
+ timestamp_limit_seconds = time() - 60 * 60 # 1 hour
+
+ log_count_limit = module.params["log_count_limit"]
+ log_matchers = module.params["log_matchers"]
+
+ matched_regexp, errors = get_log_matches(log_matchers, log_count_limit, timestamp_limit_seconds)
+
+ module.exit_json(
+ changed=False,
+ failed=bool(errors),
+ errors=errors,
+ matched=matched_regexp,
+ )
+
+
+def get_log_matches(matchers, log_count_limit, timestamp_limit_seconds):
+ """Return a list of up to log_count_limit matches for each matcher.
+
+ Log entries are only considered if newer than timestamp_limit_seconds.
+ """
+ matched_regexp = []
+ errors = []
+
+ for matcher in matchers:
+ try:
+ log_output = get_log_output(matcher)
+ except LogInputSubprocessError as err:
+ errors.append(str(err))
+ continue
+
+ try:
+ matched = find_matches(log_output, matcher, log_count_limit, timestamp_limit_seconds)
+ if matched:
+ matched_regexp.append(matcher.get("regexp", ""))
+ except InvalidMatcherRegexp as err:
+ errors.append(str(err))
+ except InvalidLogEntry as err:
+ errors.append(str(err))
+
+ return matched_regexp, errors
+
+
+def get_log_output(matcher):
+ """Return an iterator on the logs of a given matcher."""
+ try:
+ cmd_output = subprocess.Popen(list([
+ '/bin/journalctl',
+ '-ru', matcher.get("unit", ""),
+ '--output', 'json',
+ ]), stdout=subprocess.PIPE)
+
+ return iter(cmd_output.stdout.readline, '')
+
+ except subprocess.CalledProcessError as exc:
+ msg = "Could not obtain journalctl logs for the specified systemd unit: {}: {}"
+ raise LogInputSubprocessError(msg.format(matcher.get("unit", "<missing>"), str(exc)))
+ except OSError as exc:
+ raise LogInputSubprocessError(str(exc))
+
+
+def find_matches(log_output, matcher, log_count_limit, timestamp_limit_seconds):
+ """Return log messages matched in iterable log_output by a given matcher.
+
+ Ignore any log_output items older than timestamp_limit_seconds.
+ """
+ try:
+ regexp = re.compile(matcher.get("regexp", ""))
+ start_regexp = re.compile(matcher.get("start_regexp", ""))
+ except re.error as err:
+ msg = "A log matcher object was provided with an invalid regular expression: {}"
+ raise InvalidMatcherRegexp(msg.format(str(err)))
+
+ matched = None
+
+ for log_count, line in enumerate(log_output):
+ if log_count >= log_count_limit:
+ break
+
+ try:
+ obj = json.loads(line)
+
+ # don't need to look past the most recent service restart
+ if start_regexp.match(obj["MESSAGE"]):
+ break
+
+ log_timestamp_seconds = float(obj["__REALTIME_TIMESTAMP"]) / 1000000
+ if log_timestamp_seconds < timestamp_limit_seconds:
+ break
+
+ if regexp.match(obj["MESSAGE"]):
+ matched = line
+ break
+
+ except ValueError:
+ msg = "Log entry for systemd unit {} contained invalid json syntax: {}"
+ raise InvalidLogEntry(msg.format(matcher.get("unit"), line))
+
+ return matched
+
+
+if __name__ == '__main__':
+ main()
diff --git a/roles/openshift_health_checker/openshift_checks/disk_availability.py b/roles/openshift_health_checker/openshift_checks/disk_availability.py
index 962148cb8..e93e81efa 100644
--- a/roles/openshift_health_checker/openshift_checks/disk_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/disk_availability.py
@@ -1,9 +1,12 @@
-# pylint: disable=missing-docstring
+"""Check that there is enough disk space in predefined paths."""
+
+import os.path
+import tempfile
+
from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
-from openshift_checks.mixins import NotContainerizedMixin
-class DiskAvailability(NotContainerizedMixin, OpenShiftCheck):
+class DiskAvailability(OpenShiftCheck):
"""Check that recommended disk space is available before a first-time install."""
name = "disk_availability"
@@ -12,56 +15,101 @@ class DiskAvailability(NotContainerizedMixin, OpenShiftCheck):
# Values taken from the official installation documentation:
# https://docs.openshift.org/latest/install_config/install/prerequisites.html#system-requirements
recommended_disk_space_bytes = {
- "masters": 40 * 10**9,
- "nodes": 15 * 10**9,
- "etcd": 20 * 10**9,
+ '/var': {
+ 'masters': 40 * 10**9,
+ 'nodes': 15 * 10**9,
+ 'etcd': 20 * 10**9,
+ },
+ # Used to copy client binaries into,
+ # see roles/openshift_cli/library/openshift_container_binary_sync.py.
+ '/usr/local/bin': {
+ 'masters': 1 * 10**9,
+ 'nodes': 1 * 10**9,
+ 'etcd': 1 * 10**9,
+ },
+ # Used as temporary storage in several cases.
+ tempfile.gettempdir(): {
+ 'masters': 1 * 10**9,
+ 'nodes': 1 * 10**9,
+ 'etcd': 1 * 10**9,
+ },
}
@classmethod
def is_active(cls, task_vars):
"""Skip hosts that do not have recommended disk space requirements."""
group_names = get_var(task_vars, "group_names", default=[])
- has_disk_space_recommendation = bool(set(group_names).intersection(cls.recommended_disk_space_bytes))
+ active_groups = set()
+ for recommendation in cls.recommended_disk_space_bytes.values():
+ active_groups.update(recommendation.keys())
+ has_disk_space_recommendation = bool(active_groups.intersection(group_names))
return super(DiskAvailability, cls).is_active(task_vars) and has_disk_space_recommendation
def run(self, tmp, task_vars):
group_names = get_var(task_vars, "group_names")
ansible_mounts = get_var(task_vars, "ansible_mounts")
- free_bytes = self.openshift_available_disk(ansible_mounts)
-
- recommended_min = max(self.recommended_disk_space_bytes.get(name, 0) for name in group_names)
- configured_min = int(get_var(task_vars, "openshift_check_min_host_disk_gb", default=0)) * 10**9
- min_free_bytes = configured_min or recommended_min
-
- if free_bytes < min_free_bytes:
- return {
- 'failed': True,
- 'msg': (
- 'Available disk space ({:.1f} GB) for the volume containing '
- '"/var" is below minimum recommended space ({:.1f} GB)'
- ).format(float(free_bytes) / 10**9, float(min_free_bytes) / 10**9)
+ ansible_mounts = {mount['mount']: mount for mount in ansible_mounts}
+
+ user_config = get_var(task_vars, "openshift_check_min_host_disk_gb", default={})
+ try:
+ # For backwards-compatibility, if openshift_check_min_host_disk_gb
+ # is a number, then it overrides the required config for '/var'.
+ number = float(user_config)
+ user_config = {
+ '/var': {
+ 'masters': number,
+ 'nodes': number,
+ 'etcd': number,
+ },
}
+ except TypeError:
+ # If it is not a number, then it should be a nested dict.
+ pass
+
+ # TODO: as suggested in
+ # https://github.com/openshift/openshift-ansible/pull/4436#discussion_r122180021,
+ # maybe we could support checking disk availability in paths that are
+ # not part of the official recommendation but present in the user
+ # configuration.
+ for path, recommendation in self.recommended_disk_space_bytes.items():
+ free_bytes = self.free_bytes(path, ansible_mounts)
+ recommended_bytes = max(recommendation.get(name, 0) for name in group_names)
+
+ config = user_config.get(path, {})
+ # NOTE: the user config is in GB, but we compare bytes, thus the
+ # conversion.
+ config_bytes = max(config.get(name, 0) for name in group_names) * 10**9
+ recommended_bytes = config_bytes or recommended_bytes
+
+ if free_bytes < recommended_bytes:
+ free_gb = float(free_bytes) / 10**9
+ recommended_gb = float(recommended_bytes) / 10**9
+ return {
+ 'failed': True,
+ 'msg': (
+ 'Available disk space in "{}" ({:.1f} GB) '
+ 'is below minimum recommended ({:.1f} GB)'
+ ).format(path, free_gb, recommended_gb)
+ }
return {}
@staticmethod
- def openshift_available_disk(ansible_mounts):
- """Determine the available disk space for an OpenShift installation.
-
- ansible_mounts should be a list of dicts like the 'setup' Ansible module
- returns.
- """
- # priority list in descending order
- supported_mnt_paths = ["/var", "/"]
- available_mnts = {mnt.get("mount"): mnt for mnt in ansible_mounts}
+ def free_bytes(path, ansible_mounts):
+ """Return the size available in path based on ansible_mounts."""
+ mount_point = path
+ # arbitry value to prevent an infinite loop, in the unlike case that '/'
+ # is not in ansible_mounts.
+ max_depth = 32
+ while mount_point not in ansible_mounts and max_depth > 0:
+ mount_point = os.path.dirname(mount_point)
+ max_depth -= 1
try:
- for path in supported_mnt_paths:
- if path in available_mnts:
- return available_mnts[path]["size_available"]
+ free_bytes = ansible_mounts[mount_point]['size_available']
except KeyError:
- pass
+ known_mounts = ', '.join('"{}"'.format(mount) for mount in sorted(ansible_mounts)) or 'none'
+ msg = 'Unable to determine disk availability for "{}". Known mount points: {}.'
+ raise OpenShiftCheckException(msg.format(path, known_mounts))
- paths = ''.join(sorted(available_mnts)) or 'none'
- msg = "Unable to determine available disk space. Paths mounted: {}.".format(paths)
- raise OpenShiftCheckException(msg)
+ return free_bytes
diff --git a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
index 26bf4c09b..bde81ad2c 100644
--- a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
@@ -94,7 +94,8 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
required = set()
deployment_type = get_var(task_vars, "openshift_deployment_type")
host_groups = get_var(task_vars, "group_names")
- image_tag = get_var(task_vars, "openshift_image_tag")
+ # containerized etcd may not have openshift_image_tag, see bz 1466622
+ image_tag = get_var(task_vars, "openshift_image_tag", default="latest")
image_info = DEPLOYMENT_IMAGE_INFO[deployment_type]
if not image_info:
return required
diff --git a/roles/openshift_health_checker/openshift_checks/docker_storage.py b/roles/openshift_health_checker/openshift_checks/docker_storage.py
index 2bd615457..d2227d244 100644
--- a/roles/openshift_health_checker/openshift_checks/docker_storage.py
+++ b/roles/openshift_health_checker/openshift_checks/docker_storage.py
@@ -1,5 +1,6 @@
"""Check Docker storage driver and usage."""
import json
+import os.path
import re
from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
from openshift_checks.mixins import DockerHostMixin
@@ -17,13 +18,30 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
tags = ["pre-install", "health", "preflight"]
dependencies = ["python-docker-py"]
- storage_drivers = ["devicemapper", "overlay2"]
+ storage_drivers = ["devicemapper", "overlay", "overlay2"]
max_thinpool_data_usage_percent = 90.0
max_thinpool_meta_usage_percent = 90.0
+ max_overlay_usage_percent = 90.0
+
+ # TODO(lmeyer): mention these in the output when check fails
+ configuration_variables = [
+ (
+ "max_thinpool_data_usage_percent",
+ "For 'devicemapper' storage driver, usage threshold percentage for data. "
+ "Format: float. Default: {:.1f}".format(max_thinpool_data_usage_percent),
+ ),
+ (
+ "max_thinpool_meta_usage_percent",
+ "For 'devicemapper' storage driver, usage threshold percentage for metadata. "
+ "Format: float. Default: {:.1f}".format(max_thinpool_meta_usage_percent),
+ ),
+ (
+ "max_overlay_usage_percent",
+ "For 'overlay' or 'overlay2' storage driver, usage threshold percentage. "
+ "Format: float. Default: {:.1f}".format(max_overlay_usage_percent),
+ ),
+ ]
- # pylint: disable=too-many-return-statements
- # Reason: permanent stylistic exception;
- # it is clearer to return on failures and there are just many ways to fail here.
def run(self, tmp, task_vars):
msg, failed, changed = self.ensure_dependencies(task_vars)
if failed:
@@ -34,17 +52,17 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
}
# attempt to get the docker info hash from the API
- info = self.execute_module("docker_info", {}, task_vars=task_vars)
- if info.get("failed"):
+ docker_info = self.execute_module("docker_info", {}, task_vars=task_vars)
+ if docker_info.get("failed"):
return {"failed": True, "changed": changed,
"msg": "Failed to query Docker API. Is docker running on this host?"}
- if not info.get("info"): # this would be very strange
+ if not docker_info.get("info"): # this would be very strange
return {"failed": True, "changed": changed,
- "msg": "Docker API query missing info:\n{}".format(json.dumps(info))}
- info = info["info"]
+ "msg": "Docker API query missing info:\n{}".format(json.dumps(docker_info))}
+ docker_info = docker_info["info"]
# check if the storage driver we saw is valid
- driver = info.get("Driver", "[NONE]")
+ driver = docker_info.get("Driver", "[NONE]")
if driver not in self.storage_drivers:
msg = (
"Detected unsupported Docker storage driver '{driver}'.\n"
@@ -53,26 +71,34 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
return {"failed": True, "changed": changed, "msg": msg}
# driver status info is a list of tuples; convert to dict and validate based on driver
- driver_status = {item[0]: item[1] for item in info.get("DriverStatus", [])}
+ driver_status = {item[0]: item[1] for item in docker_info.get("DriverStatus", [])}
+
+ result = {}
+
if driver == "devicemapper":
- if driver_status.get("Data loop file"):
- msg = (
- "Use of loopback devices with the Docker devicemapper storage driver\n"
- "(the default storage configuration) is unsupported in production.\n"
- "Please use docker-storage-setup to configure a backing storage volume.\n"
- "See http://red.ht/2rNperO for further information."
- )
- return {"failed": True, "changed": changed, "msg": msg}
- result = self._check_dm_usage(driver_status, task_vars)
- result['changed'] = result.get('changed', False) or changed
- return result
+ result = self.check_devicemapper_support(driver_status, task_vars)
- # TODO(lmeyer): determine how to check usage for overlay2
+ if driver in ['overlay', 'overlay2']:
+ result = self.check_overlay_support(docker_info, driver_status, task_vars)
- return {"changed": changed}
+ result['changed'] = result.get('changed', False) or changed
+ return result
- def _check_dm_usage(self, driver_status, task_vars):
- """
+ def check_devicemapper_support(self, driver_status, task_vars):
+ """Check if dm storage driver is supported as configured. Return: result dict."""
+ if driver_status.get("Data loop file"):
+ msg = (
+ "Use of loopback devices with the Docker devicemapper storage driver\n"
+ "(the default storage configuration) is unsupported in production.\n"
+ "Please use docker-storage-setup to configure a backing storage volume.\n"
+ "See http://red.ht/2rNperO for further information."
+ )
+ return {"failed": True, "msg": msg}
+ result = self.check_dm_usage(driver_status, task_vars)
+ return result
+
+ def check_dm_usage(self, driver_status, task_vars):
+ """Check usage thresholds for Docker dm storage driver. Return: result dict.
Backing assumptions: We expect devicemapper to be backed by an auto-expanding thin pool
implemented as an LV in an LVM2 VG. This is how docker-storage-setup currently configures
devicemapper storage. The LV is "thin" because it does not use all available storage
@@ -83,7 +109,7 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
could run out of space first; so we check both.
"""
vals = dict(
- vg_free=self._get_vg_free(driver_status.get("Pool Name"), task_vars),
+ vg_free=self.get_vg_free(driver_status.get("Pool Name"), task_vars),
data_used=driver_status.get("Data Space Used"),
data_total=driver_status.get("Data Space Total"),
metadata_used=driver_status.get("Metadata Space Used"),
@@ -93,7 +119,7 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
# convert all human-readable strings to bytes
for key, value in vals.copy().items():
try:
- vals[key + "_bytes"] = self._convert_to_bytes(value)
+ vals[key + "_bytes"] = self.convert_to_bytes(value)
except ValueError as err: # unlikely to hit this from API info, but just to be safe
return {
"failed": True,
@@ -131,10 +157,12 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
vals["msg"] = "\n".join(messages or ["Thinpool usage is within thresholds."])
return vals
- def _get_vg_free(self, pool, task_vars):
- # Determine which VG to examine according to the pool name, the only indicator currently
- # available from the Docker API driver info. We assume a name that looks like
- # "vg--name-docker--pool"; vg and lv names with inner hyphens doubled, joined by a hyphen.
+ def get_vg_free(self, pool, task_vars):
+ """Determine which VG to examine according to the pool name. Return: size vgs reports.
+ Pool name is the only indicator currently available from the Docker API driver info.
+ We assume a name that looks like "vg--name-docker--pool";
+ vg and lv names with inner hyphens doubled, joined by a hyphen.
+ """
match = re.match(r'((?:[^-]|--)+)-(?!-)', pool) # matches up to the first single hyphen
if not match: # unlikely, but... be clear if we assumed wrong
raise OpenShiftCheckException(
@@ -143,7 +171,7 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
"so the available storage in the VG cannot be determined.".format(pool)
)
vg_name = match.groups()[0].replace("--", "-")
- vgs_cmd = "/sbin/vgs --noheadings -o vg_free --select vg_name=" + vg_name
+ vgs_cmd = "/sbin/vgs --noheadings -o vg_free --units g --select vg_name=" + vg_name
# should return free space like " 12.00g" if the VG exists; empty if it does not
ret = self.execute_module("command", {"_raw_params": vgs_cmd}, task_vars=task_vars)
@@ -163,7 +191,8 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
return size
@staticmethod
- def _convert_to_bytes(string):
+ def convert_to_bytes(string):
+ """Convert string like "10.3 G" to bytes (binary units assumed). Return: float bytes."""
units = dict(
b=1,
k=1024,
@@ -183,3 +212,87 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
raise ValueError("Cannot convert to a byte size: " + string)
return float(number) * multiplier
+
+ def check_overlay_support(self, docker_info, driver_status, task_vars):
+ """Check if overlay storage driver is supported for this host. Return: result dict."""
+ # check for xfs as backing store
+ backing_fs = driver_status.get("Backing Filesystem", "[NONE]")
+ if backing_fs != "xfs":
+ msg = (
+ "Docker storage drivers 'overlay' and 'overlay2' are only supported with\n"
+ "'xfs' as the backing storage, but this host's storage is type '{fs}'."
+ ).format(fs=backing_fs)
+ return {"failed": True, "msg": msg}
+
+ # check support for OS and kernel version
+ o_s = docker_info.get("OperatingSystem", "[NONE]")
+ if "Red Hat Enterprise Linux" in o_s or "CentOS" in o_s:
+ # keep it simple, only check enterprise kernel versions; assume everyone else is good
+ kernel = docker_info.get("KernelVersion", "[NONE]")
+ kernel_arr = [int(num) for num in re.findall(r'\d+', kernel)]
+ if kernel_arr < [3, 10, 0, 514]: # rhel < 7.3
+ msg = (
+ "Docker storage drivers 'overlay' and 'overlay2' are only supported beginning with\n"
+ "kernel version 3.10.0-514; but Docker reports kernel version {version}."
+ ).format(version=kernel)
+ return {"failed": True, "msg": msg}
+ # NOTE: we could check for --selinux-enabled here but docker won't even start with
+ # that option until it's supported in the kernel so we don't need to.
+
+ return self.check_overlay_usage(docker_info, task_vars)
+
+ def check_overlay_usage(self, docker_info, task_vars):
+ """Check disk usage on OverlayFS backing store volume. Return: result dict."""
+ path = docker_info.get("DockerRootDir", "/var/lib/docker") + "/" + docker_info["Driver"]
+
+ threshold = get_var(task_vars, "max_overlay_usage_percent", default=self.max_overlay_usage_percent)
+ try:
+ threshold = float(threshold)
+ except ValueError:
+ return {
+ "failed": True,
+ "msg": "Specified 'max_overlay_usage_percent' is not a percentage: {}".format(threshold),
+ }
+
+ mount = self.find_ansible_mount(path, get_var(task_vars, "ansible_mounts"))
+ try:
+ free_bytes = mount['size_available']
+ total_bytes = mount['size_total']
+ usage = 100.0 * (total_bytes - free_bytes) / total_bytes
+ except (KeyError, ZeroDivisionError):
+ return {
+ "failed": True,
+ "msg": "The ansible_mount found for path {} is invalid.\n"
+ "This is likely to be an Ansible bug. The record was:\n"
+ "{}".format(path, json.dumps(mount, indent=2)),
+ }
+
+ if usage > threshold:
+ return {
+ "failed": True,
+ "msg": (
+ "For Docker OverlayFS mount point {path},\n"
+ "usage percentage {pct:.1f} is higher than threshold {thresh:.1f}."
+ ).format(path=mount["mount"], pct=usage, thresh=threshold)
+ }
+
+ return {}
+
+ # TODO(lmeyer): migrate to base class
+ @staticmethod
+ def find_ansible_mount(path, ansible_mounts):
+ """Return the mount point for path from ansible_mounts."""
+
+ mount_for_path = {mount['mount']: mount for mount in ansible_mounts}
+ mount_point = path
+ while mount_point not in mount_for_path:
+ if mount_point in ["/", ""]: # "/" not in ansible_mounts???
+ break
+ mount_point = os.path.dirname(mount_point)
+
+ try:
+ return mount_for_path[mount_point]
+ except KeyError:
+ known_mounts = ', '.join('"{}"'.format(mount) for mount in sorted(mount_for_path)) or 'none'
+ msg = 'Unable to determine mount point for path "{}". Known mount points: {}.'
+ raise OpenShiftCheckException(msg.format(path, known_mounts))
diff --git a/roles/openshift_health_checker/openshift_checks/etcd_traffic.py b/roles/openshift_health_checker/openshift_checks/etcd_traffic.py
new file mode 100644
index 000000000..40c87873d
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/etcd_traffic.py
@@ -0,0 +1,47 @@
+"""Check that scans journalctl for messages caused as a symptom of increased etcd traffic."""
+
+from openshift_checks import OpenShiftCheck, get_var
+
+
+class EtcdTraffic(OpenShiftCheck):
+ """Check if host is being affected by an increase in etcd traffic."""
+
+ name = "etcd_traffic"
+ tags = ["health", "etcd"]
+
+ @classmethod
+ def is_active(cls, task_vars):
+ """Skip hosts that do not have etcd in their group names."""
+ group_names = get_var(task_vars, "group_names", default=[])
+ valid_group_names = "etcd" in group_names
+
+ version = get_var(task_vars, "openshift", "common", "short_version")
+ valid_version = version in ("3.4", "3.5", "1.4", "1.5")
+
+ return super(EtcdTraffic, cls).is_active(task_vars) and valid_group_names and valid_version
+
+ def run(self, tmp, task_vars):
+ is_containerized = get_var(task_vars, "openshift", "common", "is_containerized")
+ unit = "etcd_container" if is_containerized else "etcd"
+
+ log_matchers = [{
+ "start_regexp": r"Starting Etcd Server",
+ "regexp": r"etcd: sync duration of [^,]+, expected less than 1s",
+ "unit": unit
+ }]
+
+ match = self.execute_module("search_journalctl", {
+ "log_matchers": log_matchers,
+ }, task_vars)
+
+ if match.get("matched"):
+ msg = ("Higher than normal etcd traffic detected.\n"
+ "OpenShift 3.4 introduced an increase in etcd traffic.\n"
+ "Upgrading to OpenShift 3.6 is recommended in order to fix this issue.\n"
+ "Please refer to https://access.redhat.com/solutions/2916381 for more information.")
+ return {"failed": True, "msg": msg}
+
+ if match.get("failed"):
+ return {"failed": True, "msg": "\n".join(match.get("errors"))}
+
+ return {}
diff --git a/roles/openshift_health_checker/openshift_checks/logging/kibana.py b/roles/openshift_health_checker/openshift_checks/logging/kibana.py
index 442f407b1..551e8dfa0 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/kibana.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/kibana.py
@@ -62,7 +62,7 @@ class Kibana(LoggingCheck):
# TODO(lmeyer): give users option to validate certs
status_code=302,
)
- result = self.execute_module('uri', args, task_vars)
+ result = self.execute_module('uri', args, None, task_vars)
if result.get('failed'):
return result['msg']
return None
diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging.py b/roles/openshift_health_checker/openshift_checks/logging/logging.py
index 1fddcd6f6..02a094007 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/logging.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/logging.py
@@ -55,12 +55,12 @@ class LoggingCheck(OpenShiftCheck):
"""Returns: list of pods not in a ready and running state"""
return [
pod for pod in pods
- if any(
+ if not pod.get("status", {}).get("containerStatuses") or any(
container['ready'] is False
for container in pod['status']['containerStatuses']
) or not any(
condition['type'] == 'Ready' and condition['status'] == 'True'
- for condition in pod['status']['conditions']
+ for condition in pod['status'].get('conditions', [])
)
]
@@ -79,7 +79,7 @@ class LoggingCheck(OpenShiftCheck):
"extra_args": list(extra_args) if extra_args else [],
}
- result = execute_module("ocutil", args, task_vars)
+ result = execute_module("ocutil", args, None, task_vars)
if result.get("failed"):
msg = (
'Unexpected error using `oc` to validate the logging stack components.\n'
diff --git a/roles/openshift_health_checker/openshift_checks/package_version.py b/roles/openshift_health_checker/openshift_checks/package_version.py
index 6a76bb93d..204752bd0 100644
--- a/roles/openshift_health_checker/openshift_checks/package_version.py
+++ b/roles/openshift_health_checker/openshift_checks/package_version.py
@@ -10,8 +10,8 @@ class PackageVersion(NotContainerizedMixin, OpenShiftCheck):
tags = ["preflight"]
openshift_to_ovs_version = {
- "3.6": "2.6",
- "3.5": "2.6",
+ "3.6": ["2.6", "2.7"],
+ "3.5": ["2.6", "2.7"],
"3.4": "2.4",
}
diff --git a/roles/openshift_health_checker/test/action_plugin_test.py b/roles/openshift_health_checker/test/action_plugin_test.py
index 6ebf0ebb2..9383b233c 100644
--- a/roles/openshift_health_checker/test/action_plugin_test.py
+++ b/roles/openshift_health_checker/test/action_plugin_test.py
@@ -59,7 +59,7 @@ def failed(result, msg_has=None):
if msg_has is not None:
assert 'msg' in result
for term in msg_has:
- assert term in result['msg']
+ assert term.lower() in result['msg'].lower()
return result.get('failed', False)
@@ -178,6 +178,16 @@ def test_action_plugin_run_check_exception(plugin, task_vars, monkeypatch):
assert not skipped(result)
+def test_action_plugin_resolve_checks_exception(plugin, task_vars, monkeypatch):
+ monkeypatch.setattr(plugin, 'load_known_checks', lambda: {})
+
+ result = plugin.run(tmp=None, task_vars=task_vars)
+
+ assert failed(result, msg_has=['unknown', 'name'])
+ assert not changed(result)
+ assert not skipped(result)
+
+
@pytest.mark.parametrize('names,all_checks,expected', [
([], [], set()),
(
diff --git a/roles/openshift_health_checker/test/aos_version_test.py b/roles/openshift_health_checker/test/aos_version_test.py
index 697805dd2..4100f6c70 100644
--- a/roles/openshift_health_checker/test/aos_version_test.py
+++ b/roles/openshift_health_checker/test/aos_version_test.py
@@ -18,7 +18,43 @@ expected_pkgs = {
}
-@pytest.mark.parametrize('pkgs, expect_not_found', [
+@pytest.mark.parametrize('pkgs,expected_pkgs_dict', [
+ (
+ # all found
+ [Package('spam', '3.2.1'), Package('eggs', '3.2.1')],
+ expected_pkgs,
+ ),
+ (
+ # found with more specific version
+ [Package('spam', '3.2.1'), Package('eggs', '3.2.1.5')],
+ expected_pkgs,
+ ),
+ (
+ [Package('ovs', '2.6'), Package('ovs', '2.4')],
+ {
+ "ovs": {
+ "name": "ovs",
+ "version": ["2.6", "2.7"],
+ "check_multi": False,
+ }
+ },
+ ),
+ (
+ [Package('ovs', '2.7')],
+ {
+ "ovs": {
+ "name": "ovs",
+ "version": ["2.6", "2.7"],
+ "check_multi": False,
+ }
+ },
+ ),
+])
+def test_check_precise_version_found(pkgs, expected_pkgs_dict):
+ aos_version._check_precise_version_found(pkgs, expected_pkgs_dict)
+
+
+@pytest.mark.parametrize('pkgs,expect_not_found', [
(
[],
{
@@ -55,14 +91,6 @@ expected_pkgs = {
}, # not the right version
),
(
- [Package('spam', '3.2.1'), Package('eggs', '3.2.1')],
- {}, # all found
- ),
- (
- [Package('spam', '3.2.1'), Package('eggs', '3.2.1.5')],
- {}, # found with more specific version
- ),
- (
[Package('eggs', '1.2.3'), Package('eggs', '3.2.1.5')],
{
"spam": {
@@ -73,64 +101,86 @@ expected_pkgs = {
}, # eggs found with multiple versions
),
])
-def test_check_pkgs_for_precise_version(pkgs, expect_not_found):
- if expect_not_found:
- with pytest.raises(aos_version.PreciseVersionNotFound) as e:
- aos_version._check_precise_version_found(pkgs, expected_pkgs)
-
- assert list(expect_not_found.values()) == e.value.problem_pkgs
- else:
+def test_check_precise_version_found_fail(pkgs, expect_not_found):
+ with pytest.raises(aos_version.PreciseVersionNotFound) as e:
aos_version._check_precise_version_found(pkgs, expected_pkgs)
+ assert list(expect_not_found.values()) == e.value.problem_pkgs
-@pytest.mark.parametrize('pkgs, expect_higher', [
+@pytest.mark.parametrize('pkgs,expected_pkgs_dict', [
(
[],
- [],
+ expected_pkgs,
),
(
+ # more precise but not strictly higher
[Package('spam', '3.2.1.9')],
- [], # more precise but not strictly higher
+ expected_pkgs,
),
(
+ [Package('ovs', '2.7')],
+ {
+ "ovs": {
+ "name": "ovs",
+ "version": ["2.6", "2.7"],
+ "check_multi": False,
+ }
+ },
+ ),
+])
+def test_check_higher_version_found(pkgs, expected_pkgs_dict):
+ aos_version._check_higher_version_found(pkgs, expected_pkgs_dict)
+
+
+@pytest.mark.parametrize('pkgs,expected_pkgs_dict,expect_higher', [
+ (
[Package('spam', '3.3')],
+ expected_pkgs,
['spam-3.3'], # lower precision, but higher
),
(
[Package('spam', '3.2.1'), Package('eggs', '3.3.2')],
+ expected_pkgs,
['eggs-3.3.2'], # one too high
),
(
[Package('eggs', '1.2.3'), Package('eggs', '3.2.1.5'), Package('eggs', '3.4')],
+ expected_pkgs,
['eggs-3.4'], # multiple versions, one is higher
),
(
[Package('eggs', '3.2.1'), Package('eggs', '3.4'), Package('eggs', '3.3')],
+ expected_pkgs,
['eggs-3.4'], # multiple versions, two are higher
),
+ (
+ [Package('ovs', '2.8')],
+ {
+ "ovs": {
+ "name": "ovs",
+ "version": ["2.6", "2.7"],
+ "check_multi": False,
+ }
+ },
+ ['ovs-2.8'],
+ ),
])
-def test_check_pkgs_for_greater_version(pkgs, expect_higher):
- if expect_higher:
- with pytest.raises(aos_version.FoundHigherVersion) as e:
- aos_version._check_higher_version_found(pkgs, expected_pkgs)
- assert set(expect_higher) == set(e.value.problem_pkgs)
- else:
- aos_version._check_higher_version_found(pkgs, expected_pkgs)
+def test_check_higher_version_found_fail(pkgs, expected_pkgs_dict, expect_higher):
+ with pytest.raises(aos_version.FoundHigherVersion) as e:
+ aos_version._check_higher_version_found(pkgs, expected_pkgs_dict)
+ assert set(expect_higher) == set(e.value.problem_pkgs)
-@pytest.mark.parametrize('pkgs, expect_to_flag_pkgs', [
- (
- [],
- [],
- ),
- (
- [Package('spam', '3.2.1')],
- [],
- ),
- (
- [Package('spam', '3.2.1'), Package('eggs', '3.2.2')],
- [],
- ),
+@pytest.mark.parametrize('pkgs', [
+ [],
+ [Package('spam', '3.2.1')],
+ [Package('spam', '3.2.1'), Package('eggs', '3.2.2')],
+])
+def test_check_multi_minor_release(pkgs):
+ aos_version._check_multi_minor_release(pkgs, expected_pkgs)
+
+
+@pytest.mark.parametrize('pkgs,expect_to_flag_pkgs', [
(
[Package('spam', '3.2.1'), Package('spam', '3.3.2')],
['spam'],
@@ -140,10 +190,7 @@ def test_check_pkgs_for_greater_version(pkgs, expect_higher):
['eggs'],
),
])
-def test_check_pkgs_for_multi_release(pkgs, expect_to_flag_pkgs):
- if expect_to_flag_pkgs:
- with pytest.raises(aos_version.FoundMultiRelease) as e:
- aos_version._check_multi_minor_release(pkgs, expected_pkgs)
- assert set(expect_to_flag_pkgs) == set(e.value.problem_pkgs)
- else:
+def test_check_multi_minor_release_fail(pkgs, expect_to_flag_pkgs):
+ with pytest.raises(aos_version.FoundMultiRelease) as e:
aos_version._check_multi_minor_release(pkgs, expected_pkgs)
+ assert set(expect_to_flag_pkgs) == set(e.value.problem_pkgs)
diff --git a/roles/openshift_health_checker/test/disk_availability_test.py b/roles/openshift_health_checker/test/disk_availability_test.py
index b353fa610..945b9eafc 100644
--- a/roles/openshift_health_checker/test/disk_availability_test.py
+++ b/roles/openshift_health_checker/test/disk_availability_test.py
@@ -3,22 +3,19 @@ import pytest
from openshift_checks.disk_availability import DiskAvailability, OpenShiftCheckException
-@pytest.mark.parametrize('group_names,is_containerized,is_active', [
- (['masters'], False, True),
- # ensure check is skipped on containerized installs
- (['masters'], True, False),
- (['nodes'], False, True),
- (['etcd'], False, True),
- (['masters', 'nodes'], False, True),
- (['masters', 'etcd'], False, True),
- ([], False, False),
- (['lb'], False, False),
- (['nfs'], False, False),
+@pytest.mark.parametrize('group_names,is_active', [
+ (['masters'], True),
+ (['nodes'], True),
+ (['etcd'], True),
+ (['masters', 'nodes'], True),
+ (['masters', 'etcd'], True),
+ ([], False),
+ (['lb'], False),
+ (['nfs'], False),
])
-def test_is_active(group_names, is_containerized, is_active):
+def test_is_active(group_names, is_active):
task_vars = dict(
group_names=group_names,
- openshift=dict(common=dict(is_containerized=is_containerized)),
)
assert DiskAvailability.is_active(task_vars=task_vars) == is_active
@@ -38,7 +35,7 @@ def test_cannot_determine_available_disk(ansible_mounts, extra_words):
with pytest.raises(OpenShiftCheckException) as excinfo:
check.run(tmp=None, task_vars=task_vars)
- for word in 'determine available disk'.split() + extra_words:
+ for word in 'determine disk availability'.split() + extra_words:
assert word in str(excinfo.value)
@@ -81,7 +78,7 @@ def test_cannot_determine_available_disk(ansible_mounts, extra_words):
[{
# not enough space on / ...
'mount': '/',
- 'size_available': 0,
+ 'size_available': 2 * 10**9,
}, {
# ... but enough on /var
'mount': '/var',
diff --git a/roles/openshift_health_checker/test/docker_image_availability_test.py b/roles/openshift_health_checker/test/docker_image_availability_test.py
index 0a7c0f8d3..3b9e097fb 100644
--- a/roles/openshift_health_checker/test/docker_image_availability_test.py
+++ b/roles/openshift_health_checker/test/docker_image_availability_test.py
@@ -259,3 +259,17 @@ def test_required_images(deployment_type, is_containerized, groups, oreg_url, ex
)
assert expected == DockerImageAvailability("DUMMY").required_images(task_vars)
+
+
+def test_containerized_etcd():
+ task_vars = dict(
+ openshift=dict(
+ common=dict(
+ is_containerized=True,
+ ),
+ ),
+ openshift_deployment_type="origin",
+ group_names=['etcd'],
+ )
+ expected = set(['registry.access.redhat.com/rhel7/etcd'])
+ assert expected == DockerImageAvailability("DUMMY").required_images(task_vars)
diff --git a/roles/openshift_health_checker/test/docker_storage_test.py b/roles/openshift_health_checker/test/docker_storage_test.py
index 876614b1d..99c529054 100644
--- a/roles/openshift_health_checker/test/docker_storage_test.py
+++ b/roles/openshift_health_checker/test/docker_storage_test.py
@@ -23,7 +23,8 @@ def test_is_active(is_containerized, group_names, is_active):
assert DockerStorage.is_active(task_vars=task_vars) == is_active
-non_atomic_task_vars = {"openshift": {"common": {"is_atomic": False}}}
+def non_atomic_task_vars():
+ return {"openshift": {"common": {"is_atomic": False}}}
@pytest.mark.parametrize('docker_info, failed, expect_msg', [
@@ -56,7 +57,7 @@ non_atomic_task_vars = {"openshift": {"common": {"is_atomic": False}}}
(
dict(info={
"Driver": "overlay2",
- "DriverStatus": []
+ "DriverStatus": [("Backing Filesystem", "xfs")],
}),
False,
[],
@@ -64,9 +65,30 @@ non_atomic_task_vars = {"openshift": {"common": {"is_atomic": False}}}
(
dict(info={
"Driver": "overlay",
+ "DriverStatus": [("Backing Filesystem", "btrfs")],
}),
True,
- ["unsupported Docker storage driver"],
+ ["storage is type 'btrfs'", "only supported with\n'xfs'"],
+ ),
+ (
+ dict(info={
+ "Driver": "overlay2",
+ "DriverStatus": [("Backing Filesystem", "xfs")],
+ "OperatingSystem": "Red Hat Enterprise Linux Server release 7.2 (Maipo)",
+ "KernelVersion": "3.10.0-327.22.2.el7.x86_64",
+ }),
+ True,
+ ["Docker reports kernel version 3.10.0-327"],
+ ),
+ (
+ dict(info={
+ "Driver": "overlay",
+ "DriverStatus": [("Backing Filesystem", "xfs")],
+ "OperatingSystem": "CentOS",
+ "KernelVersion": "3.10.0-514",
+ }),
+ False,
+ [],
),
(
dict(info={
@@ -85,8 +107,9 @@ def test_check_storage_driver(docker_info, failed, expect_msg):
return docker_info
check = dummy_check(execute_module=execute_module)
- check._check_dm_usage = lambda status, task_vars: dict() # stub out for this test
- result = check.run(tmp=None, task_vars=non_atomic_task_vars)
+ check.check_dm_usage = lambda status, task_vars: dict() # stub out for this test
+ check.check_overlay_usage = lambda info, task_vars: dict() # stub out for this test
+ result = check.run(tmp=None, task_vars=non_atomic_task_vars())
if failed:
assert result["failed"]
@@ -146,8 +169,8 @@ not_enough_space = {
])
def test_dm_usage(task_vars, driver_status, vg_free, success, expect_msg):
check = dummy_check()
- check._get_vg_free = lambda pool, task_vars: vg_free
- result = check._check_dm_usage(driver_status, task_vars)
+ check.get_vg_free = lambda pool, task_vars: vg_free
+ result = check.check_dm_usage(driver_status, task_vars)
result_success = not result.get("failed")
assert result_success is success
@@ -195,10 +218,10 @@ def test_vg_free(pool, command_returns, raises, returns):
check = dummy_check(execute_module=execute_module)
if raises:
with pytest.raises(OpenShiftCheckException) as err:
- check._get_vg_free(pool, {})
+ check.get_vg_free(pool, {})
assert raises in str(err.value)
else:
- ret = check._get_vg_free(pool, {})
+ ret = check.get_vg_free(pool, {})
assert ret == returns
@@ -209,7 +232,7 @@ def test_vg_free(pool, command_returns, raises, returns):
("12g", 12.0 * 1024**3),
])
def test_convert_to_bytes(string, expect_bytes):
- got = DockerStorage._convert_to_bytes(string)
+ got = DockerStorage.convert_to_bytes(string)
assert got == expect_bytes
@@ -219,6 +242,70 @@ def test_convert_to_bytes(string, expect_bytes):
])
def test_convert_to_bytes_error(string):
with pytest.raises(ValueError) as err:
- DockerStorage._convert_to_bytes(string)
+ DockerStorage.convert_to_bytes(string)
assert "Cannot convert" in str(err.value)
assert string in str(err.value)
+
+
+ansible_mounts_enough = [{
+ 'mount': '/var/lib/docker',
+ 'size_available': 50 * 10**9,
+ 'size_total': 50 * 10**9,
+}]
+ansible_mounts_not_enough = [{
+ 'mount': '/var/lib/docker',
+ 'size_available': 0,
+ 'size_total': 50 * 10**9,
+}]
+ansible_mounts_missing_fields = [dict(mount='/var/lib/docker')]
+ansible_mounts_zero_size = [{
+ 'mount': '/var/lib/docker',
+ 'size_available': 0,
+ 'size_total': 0,
+}]
+
+
+@pytest.mark.parametrize('ansible_mounts, threshold, expect_fail, expect_msg', [
+ (
+ ansible_mounts_enough,
+ None,
+ False,
+ [],
+ ),
+ (
+ ansible_mounts_not_enough,
+ None,
+ True,
+ ["usage percentage", "higher than threshold"],
+ ),
+ (
+ ansible_mounts_not_enough,
+ "bogus percent",
+ True,
+ ["is not a percentage"],
+ ),
+ (
+ ansible_mounts_missing_fields,
+ None,
+ True,
+ ["Ansible bug"],
+ ),
+ (
+ ansible_mounts_zero_size,
+ None,
+ True,
+ ["Ansible bug"],
+ ),
+])
+def test_overlay_usage(ansible_mounts, threshold, expect_fail, expect_msg):
+ check = dummy_check()
+ task_vars = non_atomic_task_vars()
+ task_vars["ansible_mounts"] = ansible_mounts
+ if threshold is not None:
+ task_vars["max_overlay_usage_percent"] = threshold
+ docker_info = dict(DockerRootDir="/var/lib/docker", Driver="overlay")
+ result = check.check_overlay_usage(docker_info, task_vars)
+
+ assert expect_fail == bool(result.get("failed"))
+ for msg in expect_msg:
+ assert msg in result["msg"]
diff --git a/roles/openshift_health_checker/test/etcd_traffic_test.py b/roles/openshift_health_checker/test/etcd_traffic_test.py
new file mode 100644
index 000000000..287175e29
--- /dev/null
+++ b/roles/openshift_health_checker/test/etcd_traffic_test.py
@@ -0,0 +1,80 @@
+import pytest
+
+from openshift_checks.etcd_traffic import EtcdTraffic
+
+
+@pytest.mark.parametrize('group_names,version,is_active', [
+ (['masters'], "3.5", False),
+ (['masters'], "3.6", False),
+ (['nodes'], "3.4", False),
+ (['etcd'], "3.4", True),
+ (['etcd'], "3.5", True),
+ (['etcd'], "3.1", False),
+ (['masters', 'nodes'], "3.5", False),
+ (['masters', 'etcd'], "3.5", True),
+ ([], "3.4", False),
+])
+def test_is_active(group_names, version, is_active):
+ task_vars = dict(
+ group_names=group_names,
+ openshift=dict(
+ common=dict(short_version=version),
+ ),
+ )
+ assert EtcdTraffic.is_active(task_vars=task_vars) == is_active
+
+
+@pytest.mark.parametrize('group_names,matched,failed,extra_words', [
+ (["masters"], True, True, ["Higher than normal", "traffic"]),
+ (["masters", "etcd"], False, False, []),
+ (["etcd"], False, False, []),
+])
+def test_log_matches_high_traffic_msg(group_names, matched, failed, extra_words):
+ def execute_module(module_name, args, task_vars):
+ return {
+ "matched": matched,
+ "failed": failed,
+ }
+
+ task_vars = dict(
+ group_names=group_names,
+ openshift=dict(
+ common=dict(service_type="origin", is_containerized=False),
+ )
+ )
+
+ check = EtcdTraffic(execute_module=execute_module)
+ result = check.run(tmp=None, task_vars=task_vars)
+
+ for word in extra_words:
+ assert word in result.get("msg", "")
+
+ assert result.get("failed", False) == failed
+
+
+@pytest.mark.parametrize('is_containerized,expected_unit_value', [
+ (False, "etcd"),
+ (True, "etcd_container"),
+])
+def test_systemd_unit_matches_deployment_type(is_containerized, expected_unit_value):
+ task_vars = dict(
+ openshift=dict(
+ common=dict(is_containerized=is_containerized),
+ )
+ )
+
+ def execute_module(module_name, args, task_vars):
+ assert module_name == "search_journalctl"
+ matchers = args["log_matchers"]
+
+ for matcher in matchers:
+ assert matcher["unit"] == expected_unit_value
+
+ return {"failed": False}
+
+ check = EtcdTraffic(execute_module=execute_module)
+ check.run(tmp=None, task_vars=task_vars)
+
+
+def fake_execute_module(*args):
+ raise AssertionError('this function should not be called')
diff --git a/roles/openshift_health_checker/test/kibana_test.py b/roles/openshift_health_checker/test/kibana_test.py
index 19140a1b6..40a5d19d8 100644
--- a/roles/openshift_health_checker/test/kibana_test.py
+++ b/roles/openshift_health_checker/test/kibana_test.py
@@ -169,7 +169,7 @@ def test_get_kibana_url(route, expect_url, expect_error):
),
])
def test_verify_url_internal_failure(exec_result, expect):
- check = Kibana(execute_module=lambda module_name, args, task_vars: dict(failed=True, msg=exec_result))
+ check = Kibana(execute_module=lambda module_name, args, tmp, task_vars: dict(failed=True, msg=exec_result))
check._get_kibana_url = lambda task_vars: ('url', None)
error = check._check_kibana_route({})
diff --git a/roles/openshift_health_checker/test/logging_check_test.py b/roles/openshift_health_checker/test/logging_check_test.py
index a19881e5b..4f71fbf52 100644
--- a/roles/openshift_health_checker/test/logging_check_test.py
+++ b/roles/openshift_health_checker/test/logging_check_test.py
@@ -50,6 +50,16 @@ plain_kibana_pod = {
}
}
+plain_kibana_pod_no_containerstatus = {
+ "metadata": {
+ "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"},
+ "name": "logging-kibana-1",
+ },
+ "status": {
+ "conditions": [{"status": "True", "type": "Ready"}],
+ }
+}
+
fluentd_pod_node1 = {
"metadata": {
"labels": {"component": "fluentd", "deploymentconfig": "logging-fluentd"},
@@ -80,7 +90,7 @@ plain_curator_pod = {
("Permission denied", "Unexpected error using `oc`"),
])
def test_oc_failure(problem, expect):
- def execute_module(module_name, args, task_vars):
+ def execute_module(module_name, args, tmp, task_vars):
if module_name == "ocutil":
return dict(failed=True, result=problem)
return dict(changed=False)
@@ -135,3 +145,23 @@ def test_get_pods_for_component(pod_output, expect_pods, expect_error):
{}
)
assert_error(error, expect_error)
+
+
+@pytest.mark.parametrize('name, pods, expected_pods', [
+ (
+ 'test single pod found, scheduled, but no containerStatuses field',
+ [plain_kibana_pod_no_containerstatus],
+ [plain_kibana_pod_no_containerstatus],
+ ),
+ (
+ 'set of pods has at least one pod with containerStatuses (scheduled); should still fail',
+ [plain_kibana_pod_no_containerstatus, plain_kibana_pod],
+ [plain_kibana_pod_no_containerstatus],
+ ),
+
+], ids=lambda argvals: argvals[0])
+def test_get_not_running_pods_no_container_status(name, pods, expected_pods):
+ check = canned_loggingcheck(lambda exec_module, namespace, cmd, args, task_vars: '')
+ result = check.not_running_pods(pods)
+
+ assert result == expected_pods
diff --git a/roles/openshift_health_checker/test/package_version_test.py b/roles/openshift_health_checker/test/package_version_test.py
index 91eace512..1bb6371ae 100644
--- a/roles/openshift_health_checker/test/package_version_test.py
+++ b/roles/openshift_health_checker/test/package_version_test.py
@@ -72,36 +72,6 @@ def test_package_version(openshift_release):
assert result is return_value
-@pytest.mark.parametrize('deployment_type,openshift_release,expected_ovs_version', [
- ("openshift-enterprise", "3.5", "2.6"),
- ("origin", "3.6", "2.6"),
- ("openshift-enterprise", "3.4", "2.4"),
- ("origin", "3.3", "2.4"),
-])
-def test_ovs_package_version(deployment_type, openshift_release, expected_ovs_version):
- task_vars = dict(
- openshift=dict(common=dict(service_type='origin')),
- openshift_release=openshift_release,
- openshift_image_tag='v' + openshift_release,
- openshift_deployment_type=deployment_type,
- )
- return_value = object()
-
- def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None):
- assert module_name == 'aos_version'
- assert "package_list" in module_args
-
- for pkg in module_args["package_list"]:
- if pkg["name"] == "openvswitch":
- assert pkg["version"] == expected_ovs_version
-
- return return_value
-
- check = PackageVersion(execute_module=execute_module)
- result = check.run(tmp=None, task_vars=task_vars)
- assert result is return_value
-
-
@pytest.mark.parametrize('deployment_type,openshift_release,expected_docker_version', [
("origin", "3.5", "1.12"),
("openshift-enterprise", "3.4", "1.12"),
diff --git a/roles/openshift_health_checker/test/search_journalctl_test.py b/roles/openshift_health_checker/test/search_journalctl_test.py
new file mode 100644
index 000000000..724928aa1
--- /dev/null
+++ b/roles/openshift_health_checker/test/search_journalctl_test.py
@@ -0,0 +1,157 @@
+import pytest
+import search_journalctl
+
+
+def canned_search_journalctl(get_log_output=None):
+ """Create a search_journalctl object with canned get_log_output method"""
+ module = search_journalctl
+ if get_log_output:
+ module.get_log_output = get_log_output
+ return module
+
+
+DEFAULT_TIMESTAMP = 1496341364
+
+
+def get_timestamp(modifier=0):
+ return DEFAULT_TIMESTAMP + modifier
+
+
+def get_timestamp_microseconds(modifier=0):
+ return get_timestamp(modifier) * 1000000
+
+
+def create_test_log_object(stamp, msg):
+ return '{{"__REALTIME_TIMESTAMP": "{}", "MESSAGE": "{}"}}'.format(stamp, msg)
+
+
+@pytest.mark.parametrize('name,matchers,log_input,expected_matches,expected_errors', [
+ (
+ 'test with valid params',
+ [
+ {
+ "start_regexp": r"Sample Logs Beginning",
+ "regexp": r"test log message",
+ "unit": "test",
+ },
+ ],
+ [
+ create_test_log_object(get_timestamp_microseconds(), "test log message"),
+ create_test_log_object(get_timestamp_microseconds(), "Sample Logs Beginning"),
+ ],
+ ["test log message"],
+ [],
+ ),
+ (
+ 'test with invalid json in log input',
+ [
+ {
+ "start_regexp": r"Sample Logs Beginning",
+ "regexp": r"test log message",
+ "unit": "test-unit",
+ },
+ ],
+ [
+ '{__REALTIME_TIMESTAMP: ' + str(get_timestamp_microseconds()) + ', "MESSAGE": "test log message"}',
+ ],
+ [],
+ [
+ ["invalid json", "test-unit", "test log message"],
+ ],
+ ),
+ (
+ 'test with invalid regexp',
+ [
+ {
+ "start_regexp": r"Sample Logs Beginning",
+ "regexp": r"test [ log message",
+ "unit": "test",
+ },
+ ],
+ [
+ create_test_log_object(get_timestamp_microseconds(), "test log message"),
+ create_test_log_object(get_timestamp_microseconds(), "sample log message"),
+ create_test_log_object(get_timestamp_microseconds(), "fake log message"),
+ create_test_log_object(get_timestamp_microseconds(), "dummy log message"),
+ create_test_log_object(get_timestamp_microseconds(), "Sample Logs Beginning"),
+ ],
+ [],
+ [
+ ["invalid regular expression"],
+ ],
+ ),
+], ids=lambda argval: argval[0])
+def test_get_log_matches(name, matchers, log_input, expected_matches, expected_errors):
+ def get_log_output(matcher):
+ return log_input
+
+ module = canned_search_journalctl(get_log_output)
+ matched_regexp, errors = module.get_log_matches(matchers, 500, 60 * 60)
+
+ assert set(matched_regexp) == set(expected_matches)
+ assert len(expected_errors) == len(errors)
+
+ for idx, partial_err_set in enumerate(expected_errors):
+ for partial_err_msg in partial_err_set:
+ assert partial_err_msg in errors[idx]
+
+
+@pytest.mark.parametrize('name,matcher,log_count_lim,stamp_lim_seconds,log_input,expected_match', [
+ (
+ 'test with matching log message, but out of bounds of log_count_lim',
+ {
+ "start_regexp": r"Sample Logs Beginning",
+ "regexp": r"dummy log message",
+ "unit": "test",
+ },
+ 3,
+ get_timestamp(-100 * 60 * 60),
+ [
+ create_test_log_object(get_timestamp_microseconds(), "test log message"),
+ create_test_log_object(get_timestamp_microseconds(), "sample log message"),
+ create_test_log_object(get_timestamp_microseconds(), "fake log message"),
+ create_test_log_object(get_timestamp_microseconds(), "dummy log message"),
+ create_test_log_object(get_timestamp_microseconds(), "Sample Logs Beginning"),
+ ],
+ None,
+ ),
+ (
+ 'test with matching log message, but with timestamp too old',
+ {
+ "start_regexp": r"Sample Logs Beginning",
+ "regexp": r"dummy log message",
+ "unit": "test",
+ },
+ 100,
+ get_timestamp(-10),
+ [
+ create_test_log_object(get_timestamp_microseconds(), "test log message"),
+ create_test_log_object(get_timestamp_microseconds(), "sample log message"),
+ create_test_log_object(get_timestamp_microseconds(), "fake log message"),
+ create_test_log_object(get_timestamp_microseconds(-1000), "dummy log message"),
+ create_test_log_object(get_timestamp_microseconds(-1000), "Sample Logs Beginning"),
+ ],
+ None,
+ ),
+ (
+ 'test with matching log message, and timestamp within time limit',
+ {
+ "start_regexp": r"Sample Logs Beginning",
+ "regexp": r"dummy log message",
+ "unit": "test",
+ },
+ 100,
+ get_timestamp(-1010),
+ [
+ create_test_log_object(get_timestamp_microseconds(), "test log message"),
+ create_test_log_object(get_timestamp_microseconds(), "sample log message"),
+ create_test_log_object(get_timestamp_microseconds(), "fake log message"),
+ create_test_log_object(get_timestamp_microseconds(-1000), "dummy log message"),
+ create_test_log_object(get_timestamp_microseconds(-1000), "Sample Logs Beginning"),
+ ],
+ create_test_log_object(get_timestamp_microseconds(-1000), "dummy log message"),
+ ),
+], ids=lambda argval: argval[0])
+def test_find_matches_skips_logs(name, matcher, log_count_lim, stamp_lim_seconds, log_input, expected_match):
+ match = search_journalctl.find_matches(log_input, matcher, log_count_lim, stamp_lim_seconds)
+ assert match == expected_match