summaryrefslogtreecommitdiffstats
path: root/roles/os_zabbix/vars/template_openshift_node.yml
blob: e6daee8e472a7e1570fd08591a578d722619b968 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
---
# Zabbix template definition for OpenShift nodes. The mapping carries the
# template name plus its item (zitems), trigger (ztriggers) and action
# (zactions) definitions.
g_template_openshift_node:
  name: Template Openshift Node
  # Item definitions for this template. Every item is declared with type int
  # and is filed under the single application "Openshift Node".
  zitems:
  # Count of running OpenShift Node processes.
  - key: 'openshift.node.process.count'
    type: int
    applications: ['Openshift Node']
    description: 'Shows number of OpenShift Node processes running'

  # Count of running OVS process ids.
  - key: 'openshift.node.ovs.pids.count'
    type: int
    applications: ['Openshift Node']
    description: 'Shows number of ovs process ids running'

  # Count of defined OVS ports.
  - key: 'openshift.node.ovs.ports.count'
    type: int
    applications: ['Openshift Node']
    description: 'Shows number of OVS ports defined'

  # Count of stray OVS rules found/removed.
  - key: 'openshift.node.ovs.stray.rules'
    type: int
    applications: ['Openshift Node']
    description: 'Number of OVS stray rules found/removed'

  # Percentage of healthy registries in the cluster.
  - key: 'openshift.node.registry-pods.healthy_pct'
    type: int
    applications: ['Openshift Node']
    description: 'Shows the percentage of healthy registries in the cluster'

  # Result of pinging the docker-registry service from the node.
  - key: 'openshift.node.registry.service.ping'
    type: int
    applications: ['Openshift Node']
    description: 'Ping docker-registry service from node'

  # Trigger definitions. Each expression addresses an item key through the
  # "Template Openshift Node" template name. Long expressions are written as
  # folded block scalars (>-): line breaks fold into single spaces and the
  # trailing newline is stripped, so each value is one single-line string.
  #
  # NOTE(review): the registry triggers link to the "master" ref of ops-sop
  # while the node/OVS triggers link to a "node" ref - confirm that the
  # "blob/node/" URLs are intended.
  ztriggers:
  # Both the most recent and the second most recent healthy_pct samples are
  # below 100.
  - name: 'One or more Docker Registries is unhealthy according to {HOST.NAME}'
    priority: avg
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_registry.asciidoc'
    expression: >-
      {Template Openshift Node:openshift.node.registry-pods.healthy_pct.last(#2)}<100
      and {Template Openshift Node:openshift.node.registry-pods.healthy_pct.last(#1)}<100

  # Both the most recent and the second most recent registry service ping
  # samples are below 1.
  - name: 'Docker Registry service is unhealthy according to {HOST.NAME}'
    priority: avg
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_registry.asciidoc'
    expression: >-
      {Template Openshift Node:openshift.node.registry.service.ping.last(#2)}<1
      and {Template Openshift Node:openshift.node.registry.service.ping.last(#1)}<1

  # The maximum over the last 3 process-count samples is below 1.
  - name: 'Openshift Node process not running on {HOST.NAME}'
    priority: high
    url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc'
    expression: >-
      {Template Openshift Node:openshift.node.process.count.max(#3)}<1

  # The minimum over the last 3 process-count samples is above 1.
  - name: 'Too many Openshift Node processes running on {HOST.NAME}'
    priority: high
    url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc'
    expression: >-
      {Template Openshift Node:openshift.node.process.count.min(#3)}>1

  # Neither of the two most recent OVS pid-count samples equals 4. The name
  # of this trigger is what the zactions "trigger name" condition matches.
  - name: '[HEAL] OVS may not be running on {HOST.NAME}'
    priority: high
    url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc'
    expression: >-
      {Template Openshift Node:openshift.node.ovs.pids.count.last(#1)}<>4
      and {Template Openshift Node:openshift.node.ovs.pids.count.last(#2)}<>4

  # The most recent OVS port-count sample equals 0.
  - name: 'Number of OVS ports is 0 on {HOST.NAME}'
    priority: high
    url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc'
    expression: >-
      {Template Openshift Node:openshift.node.ovs.ports.count.last()}=0

  # Action definitions. The single action below is declared with status
  # "disabled" and runs a remote command for the "[HEAL] OVS may not be
  # running on" trigger.
  zactions:
  - name: '[HEAL] OVS may not be running on {HOST.NAME}'
    status: disabled
    # NOTE(review): unit is not stated in this file, presumably seconds - confirm.
    escalation_time: 60
    conditions_filter:
      calculation_type: 'and/or'
      conditions:
      # Maintenance status "not in"; no value is supplied for this condition.
      - conditiontype: 'maintenance status'
        operator: 'not in'
      # Trigger name contains the [HEAL] OVS trigger's name prefix.
      - conditiontype: 'trigger name'
        operator: 'like'
        value: '[HEAL] OVS may not be running on'
      # Trigger value is PROBLEM.
      - conditiontype: 'trigger value'
        operator: '='
        value: 'PROBLEM'
    operations:
    # Single operation covering escalation step 1 only.
    - operationtype: 'remote command'
      esc_step_from: 1
      esc_step_to: 1
      # NOTE(review): presumably 0 means "use the action's default period" - confirm.
      esc_period: 0
      opcommand:
        type: 'custom script'
        execute_on: 'zabbix server'
        # Folded block scalar: the lines below join with single spaces into
        # one command line. Backslashes are literal here, so the value keeps
        # each \" sequence as-is - presumably so the quotes survive the local
        # shell and reach the remote side of the ssh call; confirm.
        command: >-
          ssh -i /etc/openshift_tools/scriptrunner_id_rsa
          {{ ozb_scriptrunner_user }}@{{ ozb_scriptrunner_bastion_host }}
          remote-healer --host \"{HOST.NAME}\" --trigger \"{TRIGGER.NAME}\"
          --trigger-val \"{TRIGGER.VALUE}\"
      target_hosts:
      - target_type: 'zabbix server'
      # Only applies while the event is not acknowledged.
      opconditions:
      - conditiontype: 'event acknowledged'
        operator: '='
        value: 'not acknowledged'