From 931499b7cf9d4e03f2dcd4449650986d31886362 Mon Sep 17 00:00:00 2001 From: Tim Bielawa Date: Mon, 3 Oct 2016 08:20:38 -0700 Subject: Try to make boiler plate for cert expiry checking --- library/openshift_cert_expiry.py | 428 +++++++++++++++++++++ .../byo/openshift-cluster/check-cert-expiry.yaml | 35 ++ .../openshift-cluster/check-cert-expiry.yaml | 37 ++ utils/Makefile | 12 +- utils/test-requirements.txt | 1 + 5 files changed, 510 insertions(+), 3 deletions(-) create mode 100644 library/openshift_cert_expiry.py create mode 100644 playbooks/byo/openshift-cluster/check-cert-expiry.yaml create mode 100644 playbooks/common/openshift-cluster/check-cert-expiry.yaml diff --git a/library/openshift_cert_expiry.py b/library/openshift_cert_expiry.py new file mode 100644 index 000000000..cd8662f67 --- /dev/null +++ b/library/openshift_cert_expiry.py @@ -0,0 +1,428 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# etcd config file +import ConfigParser +# Expiration parsing +import datetime +# File path stuff +import os +# Config file parsing +import yaml +# Certificate loading +import OpenSSL.crypto + + +DOCUMENTATION = ''' +--- +module: openshift_cert_expiry +short_description: Check OpenShift Container Platform (OCP) and Kube certificate expirations on a cluster +description: + - The M(openshift_cert_expiry) module has two basic functions: to flag certificates which will expire in a set window of time from now, and to notify you about certificates which have already expired. + - When the module finishes, a summary of the examination is returned. Each certificate in the summary has a C(health) key with a value of one of the following: + - C(ok) - not expired, and outside of the expiration C(warning_days) window. + - C(warning) - not expired, but will expire between now and the C(warning_days) window. + - C(expired) - an expired certificate. + - Certificate flagging follow this logic: + - If the expiration date is before now then the certificate is classified as C(expired). + - The certificates time to live (expiration date - now) is calculated, if that time window is less than C(warning_days) the certificate is classified as C(warning). + - All other conditions are classified as C(ok). + - The following keys are ALSO present in the certificate summary: + - C(cert_cn) - The common name of the certificate (additional CNs present in SAN extensions are omitted) + - C(days_remaining) - The number of days until the certificate expires. + - C(expiry) - The date the certificate expires on. + - C(path) - The full path to the certificate on the examined host. +version_added: "0.0" +options: + config_base: + description: + - Base path to OCP system settings. + required: false + default: /etc/origin + warning_days: + description: + - Flag certificates which will expire in C(warning_days) days from now. + required: false + default: 30 + show_all: + description: + - Enable this option to show analysis of ALL certificates examined by this module. + - By default only certificates which have expired, or will expire within the C(warning_days) window will be reported. + required: false + default: false + +author: "Tim Bielawa (@tbielawa) " +''' + +EXAMPLES = ''' +# Default invocation, only notify about expired certificates or certificates which will expire within 30 days from now +- openshift_cert_expiry: + +# Expand the warning window to show certificates expiring within a year from now +- openshift_cert_expiry: warning_days=365 + +# Show expired, soon to expire (now + 30 days), and all other certificates examined +- openshift_cert_expiry: show_all=true +''' + + +###################################################################### +# etcd does not begin their config file with an opening [section] as +# required by the Python ConfigParser module. We hack around it by +# slipping one in ourselves prior to parsing. +# +# Source: Alex Martelli - http://stackoverflow.com/a/2819788/6490583 +class FakeSecHead(object): + def __init__(self, fp): + self.fp = fp + self.sechead = '[ETCD]\n' + + def readline(self): + if self.sechead: + try: + return self.sechead + finally: + self.sechead = None + else: + return self.fp.readline() + +###################################################################### + +def filter_paths(path_list): + # `path_list` - A list of file paths to check. Only files which + # exist will be returned + return filter( + lambda p: os.path.exists(os.path.realpath(p)), + path_list) + +def load_and_handle_cert(cert_string, now, base64decode=False): + """Load a certificate, split off the good parts, and return some +useful data + +Params: + +- `cert_string` (string) - a certificate loaded into a string object +- `now` (datetime) - a datetime object of the time to calculate the certificate 'time_remaining' against +- `base64decode` (bool) - run .decode('base64') on the input? + +Returns: +A 3-tuple of the form: (certificate_common_name, certificate_expiry_date, certificate_time_remaining) + + """ + if base64decode: + _cert_string = cert_string.decode('base-64') + else: + _cert_string = cert_string + + cert_loaded = OpenSSL.crypto.load_certificate( + OpenSSL.crypto.FILETYPE_PEM, _cert_string) + + # Strip the subject down to just the value of the first name + cert_subject = cert_loaded.get_subject().get_components()[0][1] + + # Grab the expiration date + cert_expiry = cert_loaded.get_notAfter() + cert_expiry_date = datetime.datetime.strptime( + cert_expiry, + # example get_notAfter() => 20180922170439Z + '%Y%m%d%H%M%SZ') + + time_remaining = cert_expiry_date - now + + return (cert_subject, cert_expiry_date, time_remaining) + +def classify_cert(cert_meta, now, time_remaining, expire_window, cert_list): + """Given metadata about a certificate under examination, classify it + into one of three categories, 'ok', 'warning', and 'expired'. + +Params: + +- `cert_meta` dict - A dict with certificate metadata. Required fields + include: 'cert_cn', 'path', 'expiry', 'days_remaining', 'health'. +- `now` (datetime) - a datetime object of the time to calculate the certificate 'time_remaining' against +- `time_remaining` (datetime.timedelta) - a timedelta for how long until the cert expires +- `expire_window` (datetime.timedelta) - a timedelta for how long the warning window is +- `cert_list` list - A list to shove the classified cert into + +Return: +- `cert_list` - The updated list of classified certificates + """ + expiry_str = str(cert_meta['expiry']) + # Categorization + if cert_meta['expiry'] < now: + # This already expired, must NOTIFY + cert_meta['health'] = 'expired' + elif time_remaining < expire_window: + # WARN about this upcoming expirations + cert_meta['health'] = 'warning' + else: + # Not expired or about to expire + cert_meta['health'] = 'ok' + + cert_meta['expiry'] = expiry_str + cert_list.append(cert_meta) + return cert_list + +def tabulate_summary(certificates, kubeconfigs): + """Calculate the summary text for when the module finishes +running. This includes counds of each classification and what have +you. + +Params: + +- `certificates` (list of dicts) - Processed `expire_check_result` + dicts with filled in `health` keys for system certificates. +- `kubeconfigs` (list of dicts) - Processed `expire_check_result` + dicts with filled in `health` keys for embedded kubeconfig + certificates. + +Return: +- `summary_results` (dict) - Counts of each cert/kubeconfig + classification and total items examined. + """ + summary_results = { + 'system_certificates': len(certificates), + 'kubeconfig_certificates': len(kubeconfigs), + 'total': len(certificates + kubeconfigs), + 'ok': 0, + 'warning': 0, + 'expired': 0 + } + + items = certificates + kubeconfigs + summary_results['expired'] = len([c for c in items if c['health'] == 'expired']) + summary_results['warning'] = len([c for c in items if c['health'] == 'warning']) + summary_results['ok'] = len([c for c in items if c['health'] == 'ok']) + + return summary_results + + +###################################################################### +def main(): + module = AnsibleModule( + argument_spec=dict( + config_base=dict( + required=False, + default="/etc/origin", + type='str'), + warning_days=dict( + required=False, + default=int(30), + type='int'), + show_all=dict( + required=False, + default="False", + type='bool') + ), + supports_check_mode=True, + ) + + # Basic scaffolding for OpenShift spcific certs + openshift_base_config_path = module.params['config_base'] + openshift_master_config_path = os.path.normpath( + os.path.join(openshift_base_config_path, "master/master-config.yaml") + ) + openshift_node_config_path = os.path.normpath( + os.path.join(openshift_base_config_path, "node/node-config.yaml") + ) + openshift_cert_check_paths = [ + openshift_master_config_path, + openshift_node_config_path, + ] + + # Paths for Kubeconfigs. Additional kubeconfigs are conditionally checked later in the code + kubeconfig_paths = [ + os.path.normpath( + os.path.join(openshift_base_config_path, "master/admin.kubeconfig") + ), + os.path.normpath( + os.path.join(openshift_base_config_path, "master/openshift-master.kubeconfig") + ), + os.path.normpath( + os.path.join(openshift_base_config_path, "master/openshift-node.kubeconfig") + ), + os.path.normpath( + os.path.join(openshift_base_config_path, "master/openshift-router.kubeconfig") + ), + ] + + # Expiry checking stuff + now = datetime.datetime.now() + # todo, catch exception for invalid input and return a fail_json + warning_days = int(module.params['warning_days']) + expire_window = datetime.timedelta(days=warning_days) + + # Module stuff + # + # The results of our cert checking to return from the task call + check_results = {} + check_results['meta'] = {} + check_results['meta']['warning_days'] = warning_days + check_results['meta']['checked_at_time'] = str(now) + check_results['meta']['warn_after_date'] = str(now + expire_window) + check_results['meta']['show_all'] = str(module.params['show_all']) + # All the analyzed certs accumulate here + certs = [] + + ###################################################################### + # Sure, why not? Let's enable check mode. + if module.check_mode: + check_results['certs'] = [] + module.exit_json( + check_results=check_results, + msg="Checked 0 certificates. Expired/Warning/OK: 0/0/0. Warning window: %s days" % module.params['warning_days'], + rc=0, + changed=False + ) + + ###################################################################### + # Check for OpenShift Container Platform specific certs + ###################################################################### + for os_cert in filter_paths(openshift_cert_check_paths): + # Open up that config file and locate the cert and CA + with open(os_cert, 'r') as fp: + cert_meta = {} + cfg = yaml.load(fp) + # cert files are specified in parsed `fp` as relative to the path + # of the original config file. 'master-config.yaml' with certFile + # = 'foo.crt' implies that 'foo.crt' is in the same + # directory. certFile = '../foo.crt' is in the parent directory. + cfg_path = os.path.dirname(fp.name) + cert_meta['certFile'] = os.path.join(cfg_path, cfg['servingInfo']['certFile']) + cert_meta['clientCA'] = os.path.join(cfg_path, cfg['servingInfo']['clientCA']) + + ###################################################################### + # Load the certificate and the CA, parse their expiration dates into + # datetime objects so we can manipulate them later + for _, v in cert_meta.iteritems(): + with open(v, 'r') as fp: + cert = fp.read() + cert_subject, cert_expiry_date, time_remaining = load_and_handle_cert(cert, now) + + expire_check_result = { + 'cert_cn': cert_subject, + 'path': fp.name, + 'expiry': cert_expiry_date, + 'days_remaining': time_remaining.days, + 'health': None, + } + + classify_cert(expire_check_result, now, time_remaining, expire_window, certs) + + ###################################################################### + # /Check for OpenShift Container Platform specific certs + ###################################################################### + + ###################################################################### + # Check service Kubeconfigs + ###################################################################### + kubeconfigs = [] + + # There may be additional kubeconfigs to check, but their naming + # is less predictable than the ones we've already assembled. + + try: + # Try to read the standard 'node-config.yaml' file to check if + # this host is a node. + with open(openshift_node_config_path, 'r') as fp: + cfg = yaml.load(fp) + # OK, the config file exists, therefore this is a + # node. Nodes have their own kubeconfig files to + # communicate with the master API. Let's read the relative + # path to that file from the node config. + node_masterKubeConfig = cfg['masterKubeConfig'] + # As before, the path to the 'masterKubeConfig' file is + # relative to `fp` + cfg_path = os.path.dirname(fp.name) + node_kubeconfig = os.path.join(cfg_path, node_masterKubeConfig) + with open(node_kubeconfig, 'r') as fp: + # Read in the nodes kubeconfig file and grab the good stuff + cfg = yaml.load(fp) + c = cfg['users'][0]['user']['client-certificate-data'] + (cert_subject, + cert_expiry_date, + time_remaining) = load_and_handle_cert(c, now, base64decode=True) + + expire_check_result = { + 'cert_cn': cert_subject, + 'path': fp.name, + 'expiry': cert_expiry_date, + 'days_remaining': time_remaining.days, + 'health': None, + } + + classify_cert(expire_check_result, now, time_remaining, expire_window, kubeconfigs) + except Exception: + # This is not a node + pass + + for kube in filter_paths(kubeconfig_paths): + with open(kube, 'r') as fp: + # TODO: Maybe consider catching exceptions here? + cfg = yaml.load(fp) + # Per conversation, "the kubeconfigs you care about: + # admin, router, registry should all be single + # value". Following that advice we only grab the data for + # the user at index 0 in the 'users' list. There should + # not be more than one user. + c = cfg['users'][0]['user']['client-certificate-data'] + (cert_subject, + cert_expiry_date, + time_remaining) = load_and_handle_cert(c, now, base64decode=True) + + expire_check_result = { + 'cert_cn': cert_subject, + 'path': fp.name, + 'expiry': cert_expiry_date, + 'days_remaining': time_remaining.days, + 'health': None, + } + + classify_cert(expire_check_result, now, time_remaining, expire_window, kubeconfigs) + + + ###################################################################### + # /Check service Kubeconfigs + ###################################################################### + res = tabulate_summary(certs, kubeconfigs) + + msg = "Checked {count} certificates and kubeconfigs. Expired/Warning/OK: {exp}/{warn}/{ok}. Warning window: {window} days".format( + count=res['total'], + exp=res['expired'], + warn=res['warning'], + ok=res['ok'], + window=int(module.params['warning_days']), + ) + + # By default we only return detailed information about expired or + # warning certificates. If show_all is true then we will print all + # the certificates examined. + if not module.params['show_all']: + check_results['certs'] = filter(lambda ctr: ctr['health'] in ['expired', 'warning'], certs) + check_results['kubeconfigs'] = filter(lambda ctr: ctr['health'] in ['expired', 'warning'], kubeconfigs) + else: + check_results['certs'] = certs + check_results['kubeconfigs'] = kubeconfigs + + # Sort the final results to report in order of ascending safety + # time. That is to say, the certificates which will expire sooner + # will be at the front of the list and certificates which will + # expire later are at the end. + check_results['certs'] = sorted(check_results['certs'], cmp=lambda x, y: cmp(x['days_remaining'], y['days_remaining'])) + check_results['kubeconfigs'] = sorted(check_results['kubeconfigs'], cmp=lambda x, y: cmp(x['days_remaining'], y['days_remaining'])) + # This module will never change anything, but we might want to + # change the return code parameter if there is some catastrophic + # error we noticed earlier + module.exit_json( + check_results=check_results, + summary=res, + msg=msg, + rc=0, + changed=False + ) + +###################################################################### +# import module snippets +from ansible.module_utils.basic import AnsibleModule +if __name__ == '__main__': + main() diff --git a/playbooks/byo/openshift-cluster/check-cert-expiry.yaml b/playbooks/byo/openshift-cluster/check-cert-expiry.yaml new file mode 100644 index 000000000..39efdbd36 --- /dev/null +++ b/playbooks/byo/openshift-cluster/check-cert-expiry.yaml @@ -0,0 +1,35 @@ +--- +# check-cert-expiry.yaml - A utility for cluster ops to scan through +# (critical) certificates for the ongoing operations of a cluster. + +# We do not support all Ansible versions. This is our safety net. +- include: ../../common/openshift-cluster/verify_ansible_version.yml + +- name: Generate the l_oo_all_hosts group + hosts: localhost + connection: local + become: no + gather_facts: no + tasks: + - include_vars: cluster_hosts.yml + - add_host: + name: "{{ item }}" + groups: l_oo_all_hosts + with_items: "{{ g_all_hosts | default([]) }}" + +# This may seem redundant, running `include_vars` again on the list of +# hosts in the group 'l_oo_all_hosts' which we just created. But the +# fact of the matter is that if we don't re-run include_vars on the +# new host group we created, then they will not have access to those +# same group variables they were birthed from. +# +# Go ahead and try to 'debug: var=g_all_hosts' later on (without this +# play) and you'll find that the result is UNDEFINED VARIABLE. +- name: Inject cluster hosts variables into l_oo_all_hosts + hosts: l_oo_all_hosts + gather_facts: no + tasks: + - include_vars: cluster_hosts.yml + +# This is where the actual business gets started: +- include: ../../common/openshift-cluster/check-cert-expiry.yaml diff --git a/playbooks/common/openshift-cluster/check-cert-expiry.yaml b/playbooks/common/openshift-cluster/check-cert-expiry.yaml new file mode 100644 index 000000000..e160383af --- /dev/null +++ b/playbooks/common/openshift-cluster/check-cert-expiry.yaml @@ -0,0 +1,37 @@ +# --- +# # This must be evaluated after cluster_hosts.yml has been evaluated on +# # 'localhost' to generate l_oo_all_hosts and then subsequently +# # evaluated again on the generated hosts group (l_oo_all_hosts). +# - include: evaluate_groups.yml +# tags: +# - always + +# # Initialize cluster facts for oo_all_hosts using the openshift_facts +# # role followed by the 'common' role +# - include: initialize_facts.yml +# tags: +# - always + +# # Get the version to install from the first master, then synchronize +# # that variable across all in oo_all_hosts +# - include: initialize_openshift_version.yml +# tags: +# - always + +# # Earlier 'initialize_facts' included the openshift_facts role and +# # used the openshift_facts module with the 'role' parameter set to +# # 'common'. Now we're applying the openshift_facts role AGAIN but just +# # to the subset of oo_all_hosts which require configuring. +# - name: Load openshift_facts +# hosts: oo_etcd_to_config:oo_masters_to_config:oo_nodes_to_config +# roles: +# - openshift_facts + +- hosts: all + become: yes + gather_facts: no + tasks: + - name: Check cert expirys on host + openshift_cert_expiry: + warning_days: 1500 + show_all: true diff --git a/utils/Makefile b/utils/Makefile index 59aff92fd..bc708964b 100644 --- a/utils/Makefile +++ b/utils/Makefile @@ -31,6 +31,8 @@ ASCII2MAN = a2x -D $(dir $@) -d manpage -f manpage $< MANPAGES := docs/man/man1/atomic-openshift-installer.1 VERSION := 1.3 +PEPEXCLUDES := E501,E121,E124 + sdist: clean python setup.py sdist rm -fR $(SHORTNAME).egg-info @@ -80,7 +82,7 @@ ci-pylint: @echo "#############################################" @echo "# Running PyLint Tests in virtualenv" @echo "#############################################" - . $(NAME)env/bin/activate && python -m pylint --rcfile ../git/.pylintrc src/ooinstall/cli_installer.py src/ooinstall/oo_config.py src/ooinstall/openshift_ansible.py src/ooinstall/variants.py ../callback_plugins/openshift_quick_installer.py + . $(NAME)env/bin/activate && python -m pylint --rcfile ../git/.pylintrc src/ooinstall/cli_installer.py src/ooinstall/oo_config.py src/ooinstall/openshift_ansible.py src/ooinstall/variants.py ../callback_plugins/openshift_quick_installer.py ../library/openshift_cert_expiry.py ci-list-deps: @echo "#############################################" @@ -94,13 +96,17 @@ ci-pyflakes: @echo "#################################################" . $(NAME)env/bin/activate && pyflakes src/ooinstall/*.py . $(NAME)env/bin/activate && pyflakes ../callback_plugins/openshift_quick_installer.py + . $(NAME)env/bin/activate && pyflakes ../library/openshift_cert_expiry.py ci-pep8: @echo "#############################################" @echo "# Running PEP8 Compliance Tests in virtualenv" @echo "#############################################" - . $(NAME)env/bin/activate && pep8 --ignore=E501,E121,E124 src/$(SHORTNAME)/ - . $(NAME)env/bin/activate && pep8 --ignore=E501,E121,E124 ../callback_plugins/openshift_quick_installer.py + . $(NAME)env/bin/activate && pep8 --ignore=$(PEPEXCLUDES) src/$(SHORTNAME)/ + . $(NAME)env/bin/activate && pep8 --ignore=$(PEPEXCLUDES) ../callback_plugins/openshift_quick_installer.py +# This one excludes E402 because it is an ansible module and the +# boilerplate import statement is expected to be at the bottom + . $(NAME)env/bin/activate && pep8 --ignore=$(PEPEXCLUDES),E402 ../library/openshift_cert_expiry.py ci: clean virtualenv ci-list-deps ci-pep8 ci-pylint ci-pyflakes ci-unittests : diff --git a/utils/test-requirements.txt b/utils/test-requirements.txt index f2216a177..af91ab6a7 100644 --- a/utils/test-requirements.txt +++ b/utils/test-requirements.txt @@ -9,3 +9,4 @@ flake8 PyYAML click backports.functools_lru_cache +pyOpenSSL -- cgit v1.2.1