mirror of
https://github.com/bitnami/charts.git
synced 2026-03-03 06:58:45 +08:00
[bitnami/thanos]Fix: Make prometheus rules reliable with release name (#24655)
* [bitnami/thanos]Fix: Make prometheus rules reliable with release name (#24651)
  Signed-off-by: Pierre BLAIS <pierreblais@hotmail.fr>
* [bitnami/thanos]Fix: Make prometheus rules reliable with release name (#24651)
  Signed-off-by: Pierre BLAIS <pierreblais@hotmail.fr>
* Bump version
  Signed-off-by: David Gomez <davidbhlm@gmail.com>
---------
Signed-off-by: Pierre BLAIS <pierreblais@hotmail.fr>
Signed-off-by: David Gomez <davidbhlm@gmail.com>
Signed-off-by: David Gomez <dgomezleon@vmware.com>
Co-authored-by: David Gomez <davidbhlm@gmail.com>
Co-authored-by: David Gomez <dgomezleon@vmware.com>
This commit is contained in:
@@ -35,4 +35,4 @@ maintainers:
|
||||
name: thanos
|
||||
sources:
|
||||
- https://github.com/bitnami/charts/tree/main/bitnami/thanos
|
||||
version: 15.0.1
|
||||
version: 15.0.2
|
||||
|
||||
@@ -33,7 +33,7 @@ spec:
|
||||
runbook_url: {{ .Values.metrics.prometheusRule.runbookUrl }}thanoscompactisdown
|
||||
summary: Thanos component has disappeared.
|
||||
expr: |
|
||||
absent(up{job=~".*thanos-compact.*"} == 1)
|
||||
absent(up{job=~".*{{ include "thanos.compactor.fullname" . }}.*"} == 1)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
@@ -51,7 +51,7 @@ spec:
|
||||
runbook_url: {{ .Values.metrics.prometheusRule.runbookUrl }}thanosqueryisdown
|
||||
summary: Thanos component has disappeared.
|
||||
expr: |
|
||||
absent(up{job=~".*thanos-query.*"} == 1)
|
||||
absent(up{job=~".*{{ include "thanos.query.fullname" . }}.*"} == 1)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
@@ -69,7 +69,7 @@ spec:
|
||||
runbook_url: {{ .Values.metrics.prometheusRule.runbookUrl }}thanosreceiveisdown
|
||||
summary: Thanos component has disappeared.
|
||||
expr: |
|
||||
absent(up{job=~".*thanos-receive.*"} == 1)
|
||||
absent(up{job=~".*{{ include "thanos.receive.fullname" . }}.*"} == 1)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
@@ -87,7 +87,7 @@ spec:
|
||||
runbook_url: {{ .Values.metrics.prometheusRule.runbookUrl }}thanosruleisdown
|
||||
summary: Thanos component has disappeared.
|
||||
expr: |
|
||||
absent(up{job=~".*thanos-rule.*"} == 1)
|
||||
absent(up{job=~".*{{ include "thanos.ruler.fullname" . }}.*"} == 1)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
@@ -123,7 +123,7 @@ spec:
|
||||
runbook_url: {{ .Values.metrics.prometheusRule.runbookUrl }}thanosstoreisdown
|
||||
summary: Thanos component has disappeared.
|
||||
expr: |
|
||||
absent(up{job=~".*thanos-store.*"} == 1)
|
||||
absent(up{job=~".*{{ include "thanos.storegateway.fullname" . }}.*"} == 1)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
|
||||
@@ -32,7 +32,7 @@ spec:
|
||||
description: No more than one Thanos Compact instance should be running at once. There are {{`{{`}} $value {{`}}`}} instances running.
|
||||
runbook_url: {{ .Values.metrics.prometheusRule.runbookUrl }}thanoscompactmultiplerunning
|
||||
summary: Thanos Compact has multiple instances running.
|
||||
expr: sum by (job) (up{job=~".*thanos-compact.*"}) > 1
|
||||
# Fires when more than one compactor target is up for the same job.
# The job regex uses the compactor fullname helper as-is (no extra "-compact" suffix),
# the same way the compactor "is down" absent() rule selects this job.
# NOTE(review): assumes the helper already yields the component-qualified name - confirm in _helpers.tpl.
expr: sum by (job) (up{job=~".*{{ include "thanos.compactor.fullname" . }}.*"}) > 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
@@ -49,7 +49,7 @@ spec:
|
||||
description: Thanos Compact {{`{{`}} $labels.job {{`}}`}} has failed to run and now is halted.
|
||||
runbook_url: {{ .Values.metrics.prometheusRule.runbookUrl }}thanoscompacthalted
|
||||
summary: Thanos Compact has failed to run and is now halted.
|
||||
expr: thanos_compact_halted{job=~".*thanos-compact.*"} == 1
|
||||
# Fires when the compactor reports itself halted.
# The job regex uses the compactor fullname helper as-is (no extra "-compact" suffix),
# the same way the compactor "is down" absent() rule selects this job.
# NOTE(review): assumes the helper already yields the component-qualified name - confirm in _helpers.tpl.
expr: thanos_compact_halted{job=~".*{{ include "thanos.compactor.fullname" . }}.*"} == 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
@@ -68,9 +68,9 @@ spec:
|
||||
summary: Thanos Compact is failing to execute compactions.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~".*thanos-compact.*"}[5m]))
|
||||
# Compaction failure ratio per job. Both selectors use the compactor fullname helper as-is
# (no extra "-compact" suffix), like the compactor "is down" absent() rule.
sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~".*{{ include "thanos.compactor.fullname" . }}.*"}[5m]))
|
||||
/
|
||||
sum by (job) (rate(thanos_compact_group_compactions_total{job=~".*thanos-compact.*"}[5m]))
|
||||
sum by (job) (rate(thanos_compact_group_compactions_total{job=~".*{{ include "thanos.compactor.fullname" . }}.*"}[5m]))
|
||||
* 100 > 5
|
||||
)
|
||||
for: 15m
|
||||
@@ -91,9 +91,9 @@ spec:
|
||||
summary: Thanos Compact Bucket is having a high number of operation failures.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-compact.*"}[5m]))
|
||||
# Bucket operation failure ratio per job. Both selectors use the compactor fullname helper as-is
# (no extra "-compact" suffix), like the compactor "is down" absent() rule.
sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*{{ include "thanos.compactor.fullname" . }}.*"}[5m]))
|
||||
/
|
||||
sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-compact.*"}[5m]))
|
||||
sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*{{ include "thanos.compactor.fullname" . }}.*"}[5m]))
|
||||
* 100 > 5
|
||||
)
|
||||
for: 15m
|
||||
@@ -112,7 +112,7 @@ spec:
|
||||
description: Thanos Compact {{`{{`}} $labels.job {{`}}`}} has not uploaded anything for 24 hours.
|
||||
runbook_url: {{ .Values.metrics.prometheusRule.runbookUrl }}thanoscompacthasnotrun
|
||||
summary: Thanos Compact has not uploaded anything for last 24 hours.
|
||||
expr: (time() - max by (job) (max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~".*thanos-compact.*"}[24h]))) / 60 / 60 > 24
|
||||
# Hours since the last successful bucket upload by the compactor; alerts above 24.
# The job regex uses the compactor fullname helper as-is (no extra "-compact" suffix),
# the same way the compactor "is down" absent() rule selects this job.
# NOTE(review): assumes the helper already yields the component-qualified name - confirm in _helpers.tpl.
expr: (time() - max by (job) (max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~".*{{ include "thanos.compactor.fullname" . }}.*"}[24h]))) / 60 / 60 > 24
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.metrics.prometheusRule.additionalLabels }}
|
||||
|
||||
@@ -34,9 +34,9 @@ spec:
|
||||
summary: Thanos Query is failing to handle requests.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query"}[5m]))
|
||||
sum by (job) (rate(http_requests_total{code=~"5..", job=~".*{{ include "thanos.query.fullname" . }}.*", handler="query"}[5m]))
|
||||
/
|
||||
sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query"}[5m]))
|
||||
sum by (job) (rate(http_requests_total{job=~".*{{ include "thanos.query.fullname" . }}.*", handler="query"}[5m]))
|
||||
) * 100 > 5
|
||||
for: 5m
|
||||
labels:
|
||||
@@ -56,9 +56,9 @@ spec:
|
||||
summary: Thanos Query is failing to handle requests.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query_range"}[5m]))
|
||||
sum by (job) (rate(http_requests_total{code=~"5..", job=~".*{{ include "thanos.query.fullname" . }}.*", handler="query_range"}[5m]))
|
||||
/
|
||||
sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query_range"}[5m]))
|
||||
sum by (job) (rate(http_requests_total{job=~".*{{ include "thanos.query.fullname" . }}.*", handler="query_range"}[5m]))
|
||||
) * 100 > 5
|
||||
for: 5m
|
||||
labels:
|
||||
@@ -78,9 +78,9 @@ spec:
|
||||
summary: Thanos Query is failing to handle requests.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-query.*"}[5m]))
|
||||
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*{{ include "thanos.query.fullname" . }}.*"}[5m]))
|
||||
/
|
||||
sum by (job) (rate(grpc_server_started_total{job=~".*thanos-query.*"}[5m]))
|
||||
sum by (job) (rate(grpc_server_started_total{job=~".*{{ include "thanos.query.fullname" . }}.*"}[5m]))
|
||||
* 100 > 5
|
||||
)
|
||||
for: 5m
|
||||
@@ -101,9 +101,9 @@ spec:
|
||||
summary: Thanos Query is failing to send requests.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~".*thanos-query.*"}[5m]))
|
||||
sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~".*{{ include "thanos.query.fullname" . }}.*"}[5m]))
|
||||
/
|
||||
sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m]))
|
||||
sum by (job) (rate(grpc_client_started_total{job=~".*{{ include "thanos.query.fullname" . }}.*"}[5m]))
|
||||
) * 100 > 5
|
||||
for: 5m
|
||||
labels:
|
||||
@@ -123,9 +123,9 @@ spec:
|
||||
summary: Thanos Query is having high number of DNS failures.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~".*thanos-query.*"}[5m]))
|
||||
sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~".*{{ include "thanos.query.fullname" . }}.*"}[5m]))
|
||||
/
|
||||
sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m]))
|
||||
sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*{{ include "thanos.query.fullname" . }}.*"}[5m]))
|
||||
) * 100 > 1
|
||||
for: 15m
|
||||
labels:
|
||||
@@ -145,9 +145,9 @@ spec:
|
||||
summary: Thanos Query has high latency for queries.
|
||||
expr: |
|
||||
(
|
||||
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40
|
||||
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*{{ include "thanos.query.fullname" . }}.*", handler="query"}[5m]))) > 40
|
||||
and
|
||||
sum by (job) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m])) > 0
|
||||
sum by (job) (rate(http_request_duration_seconds_bucket{job=~".*{{ include "thanos.query.fullname" . }}.*", handler="query"}[5m])) > 0
|
||||
)
|
||||
for: 10m
|
||||
labels:
|
||||
@@ -167,9 +167,9 @@ spec:
|
||||
summary: Thanos Query has high latency for queries.
|
||||
expr: |
|
||||
(
|
||||
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query_range"}[5m]))) > 90
|
||||
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*{{ include "thanos.query.fullname" . }}.*", handler="query_range"}[5m]))) > 90
|
||||
and
|
||||
sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query_range"}[5m])) > 0
|
||||
sum by (job) (rate(http_request_duration_seconds_count{job=~".*{{ include "thanos.query.fullname" . }}.*", handler="query_range"}[5m])) > 0
|
||||
)
|
||||
for: 10m
|
||||
labels:
|
||||
|
||||
@@ -34,9 +34,9 @@ spec:
|
||||
summary: Thanos Receive is failing to handle requests.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-receive.*", handler="receive"}[5m]))
|
||||
sum by (job) (rate(http_requests_total{code=~"5..", job=~".*{{ include "thanos.receive.fullname" . }}.*", handler="receive"}[5m]))
|
||||
/
|
||||
sum by (job) (rate(http_requests_total{job=~".*thanos-receive.*", handler="receive"}[5m]))
|
||||
sum by (job) (rate(http_requests_total{job=~".*{{ include "thanos.receive.fullname" . }}.*", handler="receive"}[5m]))
|
||||
) * 100 > 5
|
||||
for: 5m
|
||||
labels:
|
||||
@@ -56,9 +56,9 @@ spec:
|
||||
summary: Thanos Receive has high HTTP requests latency.
|
||||
expr: |
|
||||
(
|
||||
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-receive.*", handler="receive"}[5m]))) > 10
|
||||
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*{{ include "thanos.receive.fullname" . }}.*", handler="receive"}[5m]))) > 10
|
||||
and
|
||||
sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-receive.*", handler="receive"}[5m])) > 0
|
||||
sum by (job) (rate(http_request_duration_seconds_count{job=~".*{{ include "thanos.receive.fullname" . }}.*", handler="receive"}[5m])) > 0
|
||||
)
|
||||
for: 10m
|
||||
labels:
|
||||
@@ -81,15 +81,15 @@ spec:
|
||||
and
|
||||
(
|
||||
(
|
||||
sum by (job) (rate(thanos_receive_replications_total{result="error", job=~".*thanos-receive.*"}[5m]))
|
||||
sum by (job) (rate(thanos_receive_replications_total{result="error", job=~".*{{ include "thanos.receive.fullname" . }}.*"}[5m]))
|
||||
/
|
||||
sum by (job) (rate(thanos_receive_replications_total{job=~".*thanos-receive.*"}[5m]))
|
||||
sum by (job) (rate(thanos_receive_replications_total{job=~".*{{ include "thanos.receive.fullname" . }}.*"}[5m]))
|
||||
)
|
||||
>
|
||||
(
|
||||
max by (job) (floor((thanos_receive_replication_factor{job=~".*thanos-receive.*"}+1) / 2))
|
||||
max by (job) (floor((thanos_receive_replication_factor{job=~".*{{ include "thanos.receive.fullname" . }}.*"}+1) / 2))
|
||||
/
|
||||
max by (job) (thanos_receive_hashring_nodes{job=~".*thanos-receive.*"})
|
||||
max by (job) (thanos_receive_hashring_nodes{job=~".*{{ include "thanos.receive.fullname" . }}.*"})
|
||||
)
|
||||
) * 100
|
||||
for: 5m
|
||||
@@ -110,9 +110,9 @@ spec:
|
||||
summary: Thanos Receive is failing to forward requests.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~".*thanos-receive.*"}[5m]))
|
||||
sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~".*{{ include "thanos.receive.fullname" . }}.*"}[5m]))
|
||||
/
|
||||
sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m]))
|
||||
sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*{{ include "thanos.receive.fullname" . }}.*"}[5m]))
|
||||
) * 100 > 20
|
||||
for: 5m
|
||||
labels:
|
||||
@@ -132,9 +132,9 @@ spec:
|
||||
summary: Thanos Receive is failing to refresh hasring file.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~".*thanos-receive.*"}[5m]))
|
||||
sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~".*{{ include "thanos.receive.fullname" . }}.*"}[5m]))
|
||||
/
|
||||
sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m]))
|
||||
sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*{{ include "thanos.receive.fullname" . }}.*"}[5m]))
|
||||
> 0
|
||||
)
|
||||
for: 15m
|
||||
@@ -153,7 +153,7 @@ spec:
|
||||
description: Thanos Receive {{`{{`}} $labels.job {{`}}`}} has not been able to reload hashring configurations.
|
||||
runbook_url: {{ .Values.metrics.prometheusRule.runbookUrl }}thanosreceiveconfigreloadfailure
|
||||
summary: Thanos Receive has not been able to reload configuration.
|
||||
expr: avg by (job) (thanos_receive_config_last_reload_successful{job=~".*thanos-receive.*"}) != 1
|
||||
expr: avg by (job) (thanos_receive_config_last_reload_successful{job=~".*{{ include "thanos.receive.fullname" . }}.*"}) != 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
@@ -171,9 +171,9 @@ spec:
|
||||
runbook_url: {{ .Values.metrics.prometheusRule.runbookUrl }}thanosreceivenoupload
|
||||
summary: Thanos Receive has not uploaded latest data to object storage.
|
||||
expr: |
|
||||
(up{job=~".*thanos-receive.*"} - 1)
|
||||
(up{job=~".*{{ include "thanos.receive.fullname" . }}.*"} - 1)
|
||||
+ on (job, instance) # filters to only alert on current instance last 3h
|
||||
(sum by (job, instance) (increase(thanos_shipper_uploads_total{job=~".*thanos-receive.*"}[3h])) == 0)
|
||||
(sum by (job, instance) (increase(thanos_shipper_uploads_total{job=~".*{{ include "thanos.receive.fullname" . }}.*"}[3h])) == 0)
|
||||
for: 3h
|
||||
labels:
|
||||
severity: critical
|
||||
@@ -192,9 +192,9 @@ spec:
|
||||
summary: Thanos Receive is experiencing low avg. 1-hr ingestion rate relative to avg. 12-hr ingestion rate.
|
||||
expr: |
|
||||
(
|
||||
avg_over_time(rate(http_requests_total{job=~".*thanos-receive.*", code=~"2..", handler="receive"}[5m])[1h:5m])
|
||||
avg_over_time(rate(http_requests_total{job=~".*{{ include "thanos.receive.fullname" . }}.*", code=~"2..", handler="receive"}[5m])[1h:5m])
|
||||
/
|
||||
avg_over_time(rate(http_requests_total{job=~".*thanos-receive.*", code=~"2..", handler="receive"}[5m])[12h:5m])
|
||||
avg_over_time(rate(http_requests_total{job=~".*{{ include "thanos.receive.fullname" . }}.*", code=~"2..", handler="receive"}[5m])[12h:5m])
|
||||
) * 100 < 50
|
||||
for: 1h
|
||||
labels:
|
||||
|
||||
@@ -34,9 +34,9 @@ spec:
|
||||
summary: Thanos Replicate is failing to run.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(thanos_replicate_replication_runs_total{result="error", job=~".*thanos-bucket-replicate.*"}[5m]))
|
||||
sum by (job) (rate(thanos_replicate_replication_runs_total{result="error", job=~".*{{ template "common.names.fullname" . }}-bucket-replicate.*"}[5m]))
|
||||
/ on (job) group_left
|
||||
sum by (job) (rate(thanos_replicate_replication_runs_total{job=~".*thanos-bucket-replicate.*"}[5m]))
|
||||
sum by (job) (rate(thanos_replicate_replication_runs_total{job=~".*{{ template "common.names.fullname" . }}-bucket-replicate.*"}[5m]))
|
||||
) * 100 >= 10
|
||||
for: 5m
|
||||
labels:
|
||||
@@ -56,9 +56,9 @@ spec:
|
||||
summary: Thanos Replicate has a high latency for replicate operations.
|
||||
expr: |
|
||||
(
|
||||
histogram_quantile(0.99, sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m]))) > 20
|
||||
# p99 replication run duration per job. The bucket series are aggregated by (job, le):
# histogram_quantile needs the "le" label to survive the sum, as in the query/receive latency rules.
histogram_quantile(0.99, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*{{ template "common.names.fullname" . }}-bucket-replicate.*"}[5m]))) > 20
|
||||
and
|
||||
sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m])) > 0
|
||||
sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*{{ template "common.names.fullname" . }}-bucket-replicate.*"}[5m])) > 0
|
||||
)
|
||||
for: 5m
|
||||
labels:
|
||||
|
||||
@@ -33,7 +33,7 @@ spec:
|
||||
runbook_url: {{ .Values.metrics.prometheusRule.runbookUrl }}thanosrulequeueisdroppingalerts
|
||||
summary: Thanos Rule is failing to queue alerts.
|
||||
expr: |
|
||||
sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0
|
||||
sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total{job=~".*{{ include "thanos.ruler.fullname" . }}.*"}[5m])) > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
@@ -51,7 +51,7 @@ spec:
|
||||
runbook_url: {{ .Values.metrics.prometheusRule.runbookUrl }}thanosrulesenderisfailingalerts
|
||||
summary: Thanos Rule is failing to send alerts to alertmanager.
|
||||
expr: |
|
||||
sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0
|
||||
sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job=~".*{{ include "thanos.ruler.fullname" . }}.*"}[5m])) > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
@@ -70,9 +70,9 @@ spec:
|
||||
summary: Thanos Rule is failing to evaluate rules.
|
||||
expr: |
|
||||
(
|
||||
sum by (job, instance) (rate(prometheus_rule_evaluation_failures_total{job=~".*thanos-rule.*"}[5m]))
|
||||
sum by (job, instance) (rate(prometheus_rule_evaluation_failures_total{job=~".*{{ include "thanos.ruler.fullname" . }}.*"}[5m]))
|
||||
/
|
||||
sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m]))
|
||||
sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*{{ include "thanos.ruler.fullname" . }}.*"}[5m]))
|
||||
* 100 > 5
|
||||
)
|
||||
for: 5m
|
||||
@@ -92,7 +92,7 @@ spec:
|
||||
runbook_url: {{ .Values.metrics.prometheusRule.runbookUrl }}thanosrulehighruleevaluationwarnings
|
||||
summary: Thanos Rule has high number of evaluation warnings.
|
||||
expr: |
|
||||
sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job=~".*thanos-rule.*"}[5m])) > 0
|
||||
sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job=~".*{{ include "thanos.ruler.fullname" . }}.*"}[5m])) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: info
|
||||
@@ -111,9 +111,9 @@ spec:
|
||||
summary: Thanos Rule has high rule evaluation latency.
|
||||
expr: |
|
||||
(
|
||||
sum by (job, instance, rule_group) (prometheus_rule_group_last_duration_seconds{job=~".*thanos-rule.*"})
|
||||
sum by (job, instance, rule_group) (prometheus_rule_group_last_duration_seconds{job=~".*{{ include "thanos.ruler.fullname" . }}.*"})
|
||||
>
|
||||
sum by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"})
|
||||
sum by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=~".*{{ include "thanos.ruler.fullname" . }}.*"})
|
||||
)
|
||||
for: 5m
|
||||
labels:
|
||||
@@ -133,9 +133,9 @@ spec:
|
||||
summary: Thanos Rule is failing to handle grpc requests.
|
||||
expr: |
|
||||
(
|
||||
sum by (job, instance) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-rule.*"}[5m]))
|
||||
sum by (job, instance) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*{{ include "thanos.ruler.fullname" . }}.*"}[5m]))
|
||||
/
|
||||
sum by (job, instance) (rate(grpc_server_started_total{job=~".*thanos-rule.*"}[5m]))
|
||||
sum by (job, instance) (rate(grpc_server_started_total{job=~".*{{ include "thanos.ruler.fullname" . }}.*"}[5m]))
|
||||
* 100 > 5
|
||||
)
|
||||
for: 5m
|
||||
@@ -154,7 +154,7 @@ spec:
|
||||
description: Thanos Rule {{`{{`}} $labels.job {{`}}`}} has not been able to reload its configuration.
|
||||
runbook_url: {{ .Values.metrics.prometheusRule.runbookUrl }}thanosruleconfigreloadfailure
|
||||
summary: Thanos Rule has not been able to reload configuration.
|
||||
expr: avg by (job, instance) (thanos_rule_config_last_reload_successful{job=~".*thanos-rule.*"}) != 1
|
||||
expr: avg by (job, instance) (thanos_rule_config_last_reload_successful{job=~".*{{ include "thanos.ruler.fullname" . }}.*"}) != 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: info
|
||||
@@ -173,9 +173,9 @@ spec:
|
||||
summary: Thanos Rule is having high number of DNS failures.
|
||||
expr: |
|
||||
(
|
||||
sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total{job=~".*thanos-rule.*"}[5m]))
|
||||
sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total{job=~".*{{ include "thanos.ruler.fullname" . }}.*"}[5m]))
|
||||
/
|
||||
sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m]))
|
||||
sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*{{ include "thanos.ruler.fullname" . }}.*"}[5m]))
|
||||
* 100 > 1
|
||||
)
|
||||
for: 15m
|
||||
@@ -196,9 +196,9 @@ spec:
|
||||
summary: Thanos Rule is having high number of DNS failures.
|
||||
expr: |
|
||||
(
|
||||
sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~".*thanos-rule.*"}[5m]))
|
||||
sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~".*{{ include "thanos.ruler.fullname" . }}.*"}[5m]))
|
||||
/
|
||||
sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m]))
|
||||
sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*{{ include "thanos.ruler.fullname" . }}.*"}[5m]))
|
||||
* 100 > 1
|
||||
)
|
||||
for: 15m
|
||||
@@ -218,9 +218,9 @@ spec:
|
||||
runbook_url: {{ .Values.metrics.prometheusRule.runbookUrl }}thanosrulenoevaluationfor10intervals
|
||||
summary: Thanos Rule has rule groups that did not evaluate for 10 intervals.
|
||||
expr: |
|
||||
time() - max by (job, instance, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~".*thanos-rule.*"})
|
||||
time() - max by (job, instance, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~".*{{ include "thanos.ruler.fullname" . }}.*"})
|
||||
>
|
||||
10 * max by (job, instance, group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"})
|
||||
10 * max by (job, instance, group) (prometheus_rule_group_interval_seconds{job=~".*{{ include "thanos.ruler.fullname" . }}.*"})
|
||||
for: 5m
|
||||
labels:
|
||||
severity: info
|
||||
@@ -238,9 +238,9 @@ spec:
|
||||
runbook_url: {{ .Values.metrics.prometheusRule.runbookUrl }}thanosnoruleevaluations
|
||||
summary: Thanos Rule did not perform any rule evaluations.
|
||||
expr: |
|
||||
sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) <= 0
|
||||
sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*{{ include "thanos.ruler.fullname" . }}.*"}[5m])) <= 0
|
||||
and
|
||||
sum by (job, instance) (thanos_rule_loaded_rules{job=~".*thanos-rule.*"}) > 0
|
||||
sum by (job, instance) (thanos_rule_loaded_rules{job=~".*{{ include "thanos.ruler.fullname" . }}.*"}) > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
|
||||
Reference in New Issue
Block a user