summaryrefslogtreecommitdiffstats
path: root/roles/openshift_health_checker/openshift_checks/logging/kibana.py
blob: 16ec3a7f633ead11c4e462626c759820a9b880cf (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
"""
Module for performing checks on a Kibana logging deployment
"""

import json
import ssl

# pylint can't find the package when it's installed in a virtualenv
# pylint: disable=import-error,no-name-in-module
from ansible.module_utils.six.moves.urllib import request
# pylint: disable=import-error,no-name-in-module
from ansible.module_utils.six.moves.urllib.error import HTTPError, URLError

from openshift_checks.logging.logging import LoggingCheck, OpenShiftCheckException


class Kibana(LoggingCheck):
    """Module that checks an integrated logging Kibana deployment"""

    name = "kibana"
    tags = ["health", "logging"]

    def run(self):
        """
        Check the Kibana pods and the Kibana route.
        Problems are reported by the helper checks raising
        OpenShiftCheckException.
        Returns: result as hash (empty when every check passed)
        """

        kibana_pods = self.get_pods_for_component("kibana")
        self.check_kibana(kibana_pods)
        self.check_kibana_route()
        # TODO(lmeyer): run it all again for the ops cluster

        return {}

    def _verify_url_internal(self, url):
        """
        Try to reach a URL from the host by running the 'uri' module.
        Returns: None on success, otherwise a non-empty failure message.
        """
        args = dict(
            url=url,
            follow_redirects='none',
            validate_certs='no',  # likely to be signed with internal CA
            # TODO(lmeyer): give users option to validate certs
            status_code=302,
        )
        result = self.execute_module('uri', args)
        if result.get('failed'):
            # Callers treat a falsy return as success, so a failed result with
            # a missing or empty 'msg' must still yield a non-empty string.
            return result.get('msg') or 'The uri module failed without providing a message.'
        return None

    @staticmethod
    def _verify_url_external(url):
        """
        Try to reach a URL from ansible control host.
        Returns: None on success, otherwise a non-empty string describing
        the failure (this method reports errors by return value, it does
        not raise for connection or HTTP errors).
        """
        # This actually checks from the ansible control host, which may or may not
        # really be "external" to the cluster.

        # Disable SSL cert validation to work around internally signed certs
        ctx = ssl.create_default_context()
        ctx.check_hostname = False  # or setting CERT_NONE is refused
        ctx.verify_mode = ssl.CERT_NONE

        # Verify that the url is returning a valid response
        try:
            # We only care if the url connects and responds
            response = request.urlopen(url, context=ctx)
        except HTTPError as httperr:
            # HTTPError must be handled before URLError (it is a subclass).
            # An empty reason phrase would be read as "no error" by callers,
            # so fall back to a message carrying the status code.
            return str(httperr.reason) or 'HTTP error {}'.format(httperr.code)
        except URLError as urlerr:
            return str(urlerr)

        try:
            return_code = response.getcode()
        finally:
            # release the connection explicitly rather than relying on GC
            response.close()

        # there appears to be no way to prevent urlopen from following redirects
        if return_code != 200:
            return 'Expected success (200) but got return code {}'.format(int(return_code))

        return None

    def check_kibana(self, pods):
        """
        Check to see if Kibana is up and working.
        pods: list of Kibana pod definitions (dicts with metadata.name).
        Raises OpenShiftCheckException if there are no pods, or if any pod
        is reported as not running by not_running_pods().
        """

        if not pods:
            raise OpenShiftCheckException(
                "MissingComponentPods",
                "There are no Kibana pods deployed, so no access to the logging UI."
            )

        not_running = self.not_running_pods(pods)
        if len(not_running) == len(pods):
            raise OpenShiftCheckException(
                "NoRunningPods",
                "No Kibana pod is in a running state, so there is no access to the logging UI."
            )
        elif not_running:
            raise OpenShiftCheckException(
                "PodNotRunning",
                "The following Kibana pods are not currently in a running state:\n"
                "  {pods}\n"
                "However at least one is, so service may not be impacted.".format(
                    pods="\n  ".join(pod['metadata']['name'] for pod in not_running)
                )
            )

    def _get_kibana_url(self):
        """
        Get kibana route or report error.
        Returns: url (https://<route host>/)
        Raises OpenShiftCheckException if the route is missing, cannot be
        parsed, is not accepted by any router, or has no host.
        """

        # Get logging url
        get_route = self.exec_oc("get route logging-kibana -o json", [])
        if not get_route:
            raise OpenShiftCheckException(
                'no_route_exists',
                'No route is defined for Kibana in the logging namespace,\n'
                'so the logging stack is not accessible. Is logging deployed?\n'
                'Did something remove the logging-kibana route?'
            )

        try:
            route = json.loads(get_route)
            # check that the route has been accepted by a router
            ingress = route["status"]["ingress"]
        except (ValueError, KeyError, TypeError):
            # TypeError covers valid JSON of an unexpected shape, e.g. a
            # null "status" or a top-level value that is not an object.
            raise OpenShiftCheckException(
                'get_route_failed',
                '"oc get route" returned an unexpected response:\n' + get_route
            )

        # ingress can be null if there is no router, or empty if not routed
        if not ingress or not ingress[0]:
            raise OpenShiftCheckException(
                'route_not_accepted',
                'The logging-kibana route is not being routed by any router.\n'
                'Is the router deployed and working?'
            )

        # "spec" may be present but null, in which case .get() returns None
        host = (route.get("spec") or {}).get("host")
        if not host:
            raise OpenShiftCheckException(
                'route_missing_host',
                'The logging-kibana route has no hostname defined,\n'
                'which should never happen. Did something alter its definition?'
            )

        return 'https://{}/'.format(host)

    def check_kibana_route(self):
        """
        Check to see if kibana route is up and working.
        The route is first checked internally (via _verify_url_internal) and
        then, unless openshift_check_efk_kibana_external is set to something
        other than true, from the ansible control host as well.
        Raises exception if not.
        """

        kibana_url = self._get_kibana_url()

        # first, check that kibana is reachable from the master.
        error = self._verify_url_internal(kibana_url)
        if error:
            if 'urlopen error [Errno 111] Connection refused' in error:
                raise OpenShiftCheckException(
                    'FailedToConnectInternal',
                    'Failed to connect from this master to Kibana URL {url}\n'
                    'Is kibana running, and is at least one router routing to it?'.format(url=kibana_url)
                )
            elif 'urlopen error [Errno -2] Name or service not known' in error:
                raise OpenShiftCheckException(
                    'FailedToResolveInternal',
                    'Failed to connect from this master to Kibana URL {url}\n'
                    'because the hostname does not resolve.\n'
                    'Is DNS configured for the Kibana hostname?'.format(url=kibana_url)
                )
            elif 'Status code was not' in error:
                raise OpenShiftCheckException(
                    'WrongReturnCodeInternal',
                    'A request from this master to the Kibana URL {url}\n'
                    'did not return the correct status code (302).\n'
                    'This could mean that Kibana is malfunctioning, the hostname is\n'
                    'resolving incorrectly, or other network issues. The output was:\n'
                    '  {error}'.format(url=kibana_url, error=error)
                )
            raise OpenShiftCheckException(
                'MiscRouteErrorInternal',
                'Error validating the logging Kibana route internally:\n' + error
            )

        # in production we would like the kibana route to work from outside the
        # cluster too; but that may not be the case, so allow disabling just this part.
        check_external = self.get_var("openshift_check_efk_kibana_external", default="True")
        if not isinstance(check_external, bool):
            # The variable may arrive as a string ("True"/"false"/...); real
            # booleans (common in YAML inventories) have no .lower() and are
            # used as they are.
            check_external = str(check_external).lower() == "true"
        if not check_external:
            return
        error = self._verify_url_external(kibana_url)

        if not error:
            return

        error_fmt = (
            'Error validating the logging Kibana route:\n{error}\n'
            'To disable external Kibana route validation, set the variable:\n'
            '  openshift_check_efk_kibana_external=False'
        )
        if 'urlopen error [Errno 111] Connection refused' in error:
            msg = (
                'Failed to connect from the Ansible control host to Kibana URL {url}\n'
                'Is the router for the Kibana hostname exposed externally?'
            ).format(url=kibana_url)
            raise OpenShiftCheckException('FailedToConnect', error_fmt.format(error=msg))
        elif 'urlopen error [Errno -2] Name or service not known' in error:
            msg = (
                'Failed to resolve the Kibana hostname in {url}\n'
                'from the Ansible control host.\n'
                'Is DNS configured to resolve this Kibana hostname externally?'
            ).format(url=kibana_url)
            raise OpenShiftCheckException('FailedToResolve', error_fmt.format(error=msg))
        elif 'Expected success (200)' in error:
            msg = (
                'A request to Kibana at {url}\n'
                'returned the wrong error code:\n'
                '  {error}\n'
                'This could mean that Kibana is malfunctioning, the hostname is\n'
                'resolving incorrectly, or other network issues.'
            ).format(url=kibana_url, error=error)
            raise OpenShiftCheckException('WrongReturnCode', error_fmt.format(error=msg))
        raise OpenShiftCheckException(
            'MiscRouteError',
            'Error validating the logging Kibana route externally:\n' + error
        )