summary | refs | log | tree | commit | diff | stats
path: root/roles/openshift_prometheus/templates/prometheus.yml.j2
diff options
context:
space:
mode:
Diffstat (limited to 'roles/openshift_prometheus/templates/prometheus.yml.j2')
-rw-r--r-- roles/openshift_prometheus/templates/prometheus.yml.j2 | 175
1 file changed, 121 insertions, 54 deletions
diff --git a/roles/openshift_prometheus/templates/prometheus.yml.j2 b/roles/openshift_prometheus/templates/prometheus.yml.j2
index 63430f834..005c2c564 100644
--- a/roles/openshift_prometheus/templates/prometheus.yml.j2
+++ b/roles/openshift_prometheus/templates/prometheus.yml.j2
@@ -1,10 +1,5 @@
rule_files:
- - 'prometheus.rules'
-{% if openshift_prometheus_additional_rules_file is defined and openshift_prometheus_additional_rules_file is not none %}
- - 'prometheus.additional.rules'
-{% endif %}
-
-
+ - '*.rules'
# A scrape configuration for running Prometheus on a Kubernetes cluster.
# This uses separate scrape configs for cluster components (i.e. API server, node)
@@ -39,31 +34,11 @@ scrape_configs:
action: keep
regex: default;kubernetes;https
-# Scrape config for nodes.
-#
-# Each node exposes a /metrics endpoint that contains operational metrics for
-# the Kubelet and other components.
-- job_name: 'kubernetes-nodes'
-
- scheme: https
- tls_config:
- ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
- bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
-
- kubernetes_sd_configs:
- - role: node
-
- relabel_configs:
- - action: labelmap
- regex: __meta_kubernetes_node_label_(.+)
-
# Scrape config for controllers.
#
# Each master node exposes a /metrics endpoint on :8444 that contains operational metrics for
# the controllers.
#
-# TODO: move this to a pure endpoints based metrics gatherer when controllers are exposed via
-# endpoints.
- job_name: 'kubernetes-controllers'
scheme: https
@@ -87,6 +62,27 @@ scrape_configs:
regex: (.+)(?::\d+)
replacement: $1:8444
+# Scrape config for nodes.
+#
+# Each node exposes a /metrics endpoint that contains operational metrics for
+# the Kubelet and other components.
+- job_name: 'kubernetes-nodes'
+ scheme: https
+ tls_config:
+ ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+ bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+ kubernetes_sd_configs:
+ - role: node
+ # Drop a very high cardinality metric that is incorrect in 3.7. It will be
+ # fixed in 3.9.
+ metric_relabel_configs:
+ - source_labels: [__name__]
+ action: drop
+ regex: 'openshift_sdn_pod_(setup|teardown)_latency(.*)'
+ relabel_configs:
+ - action: labelmap
+ regex: __meta_kubernetes_node_label_(.+)
+
# Scrape config for cAdvisor.
#
# Beginning in Kube 1.7, each node exposes a /metrics/cadvisor endpoint that
@@ -107,6 +103,14 @@ scrape_configs:
kubernetes_sd_configs:
- role: node
+ # Exclude a set of high cardinality metrics that can contribute to significant
+ # memory use in large clusters. These can be selectively enabled as necessary
+ # for medium or small clusters.
+ metric_relabel_configs:
+ - source_labels: [__name__]
+ action: drop
+ regex: 'container_(cpu_user_seconds_total|cpu_cfs_periods_total|memory_usage_bytes|memory_swap|memory_working_set_bytes|memory_cache|last_seen|fs_(read_seconds_total|write_seconds_total|sector_(.*)|io_(.*)|reads_merged_total|writes_merged_total)|tasks_state|memory_failcnt|memory_failures_total|spec_memory_swap_limit_bytes|fs_(.*)_bytes_total|spec_(.*))'
+
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
@@ -133,38 +137,101 @@ scrape_configs:
- role: endpoints
relabel_configs:
- - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
- action: keep
- regex: true
- - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
- action: replace
- target_label: __scheme__
- regex: (https?)
- - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
+ # only scrape infrastructure components
+ - source_labels: [__meta_kubernetes_namespace]
+ action: keep
+ regex: 'default|logging|metrics|kube-.+|openshift|openshift-.+'
+ # drop infrastructure components managed by other scrape targets
+ - source_labels: [__meta_kubernetes_service_name]
+ action: drop
+ regex: 'prometheus-node-exporter'
+ # only those that have requested scraping
+ - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
+ action: keep
+ regex: true
+ - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
+ action: replace
+ target_label: __scheme__
+ regex: (https?)
+ - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
+ action: replace
+ target_label: __metrics_path__
+ regex: (.+)
+ - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
+ action: replace
+ target_label: __address__
+ regex: (.+)(?::\d+);(\d+)
+ replacement: $1:$2
+ - action: labelmap
+ regex: __meta_kubernetes_service_label_(.+)
+ - source_labels: [__meta_kubernetes_namespace]
+ action: replace
+ target_label: kubernetes_namespace
+ - source_labels: [__meta_kubernetes_service_name]
+ action: replace
+ target_label: kubernetes_name
+
+# Scrape config for node-exporter, which is expected to be running on port 9100.
+- job_name: 'kubernetes-nodes-exporter'
+
+ tls_config:
+ ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+
+ kubernetes_sd_configs:
+ - role: node
+
+ metric_relabel_configs:
+ - source_labels: [__name__]
+ action: drop
+ regex: 'node_cpu|node_(disk|scrape_collector)_.+'
+ # preserve a subset of the network, netstat, vmstat, and filesystem series
+ - source_labels: [__name__]
action: replace
- target_label: __metrics_path__
- regex: (.+)
- - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
+ regex: '(node_(netstat_Ip_.+|vmstat_(nr|thp)_.+|filesystem_(free|size|device_error)|network_(transmit|receive)_(drop|errs)))'
+ target_label: __name__
+ replacement: renamed_$1
+ - source_labels: [__name__]
+ action: drop
+ regex: 'node_(netstat|vmstat|filesystem|network)_.+'
+ - source_labels: [__name__]
action: replace
+ regex: 'renamed_(.+)'
+ target_label: __name__
+ replacement: $1
+ # drop any partial expensive series
+ - source_labels: [__name__, device]
+ action: drop
+ regex: 'node_network_.+;veth.+'
+ - source_labels: [__name__, mountpoint]
+ action: drop
+ regex: 'node_filesystem_(free|size|device_error);([^/].*|/.+)'
+
+ relabel_configs:
+ - source_labels: [__address__]
+ regex: '(.*):10250'
+ replacement: '${1}:9100'
target_label: __address__
- regex: (.+)(?::\d+);(\d+)
- replacement: $1:$2
- - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_username]
- action: replace
- target_label: __basic_auth_username__
- regex: (.+)
- - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_password]
- action: replace
- target_label: __basic_auth_password__
- regex: (.+)
+ - source_labels: [__meta_kubernetes_node_label_kubernetes_io_hostname]
+ target_label: __instance__
- action: labelmap
- regex: __meta_kubernetes_service_label_(.+)
- - source_labels: [__meta_kubernetes_namespace]
- action: replace
- target_label: kubernetes_namespace
- - source_labels: [__meta_kubernetes_service_name]
- action: replace
- target_label: kubernetes_name
+ regex: __meta_kubernetes_node_label_(.+)
+
+# Scrape config for the template service broker
+- job_name: 'openshift-template-service-broker'
+ scheme: https
+ tls_config:
+ ca_file: /var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt
+ server_name: apiserver.openshift-template-service-broker.svc
+ bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+
+ kubernetes_sd_configs:
+ - role: endpoints
+
+ relabel_configs:
+ - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
+ action: keep
+ regex: openshift-template-service-broker;apiserver;https
+
alerting:
alertmanagers: