diff --git a/bitnami/thanos/Chart.yaml b/bitnami/thanos/Chart.yaml index 7c5626d922..49ccdcff9c 100644 --- a/bitnami/thanos/Chart.yaml +++ b/bitnami/thanos/Chart.yaml @@ -35,4 +35,4 @@ maintainers: name: thanos sources: - https://github.com/bitnami/charts/tree/main/bitnami/thanos -version: 15.0.1 +version: 15.0.2 diff --git a/bitnami/thanos/templates/alert-rule/absent_rules.yml b/bitnami/thanos/templates/alert-rule/absent_rules.yml index 86a60b2756..aa5dc6ca71 100644 --- a/bitnami/thanos/templates/alert-rule/absent_rules.yml +++ b/bitnami/thanos/templates/alert-rule/absent_rules.yml @@ -33,7 +33,7 @@ spec: runbook_url: {{ .Values.metrics.prometheusRule.runbookUrl }}thanoscompactisdown summary: Thanos component has disappeared. expr: | - absent(up{job=~".*thanos-compact.*"} == 1) + absent(up{job=~".*{{ include "thanos.compactor.fullname" . }}.*"} == 1) for: 5m labels: severity: critical @@ -51,7 +51,7 @@ spec: runbook_url: {{ .Values.metrics.prometheusRule.runbookUrl }}thanosqueryisdown summary: Thanos component has disappeared. expr: | - absent(up{job=~".*thanos-query.*"} == 1) + absent(up{job=~".*{{ include "thanos.query.fullname" . }}.*"} == 1) for: 5m labels: severity: critical @@ -69,7 +69,7 @@ spec: runbook_url: {{ .Values.metrics.prometheusRule.runbookUrl }}thanosreceiveisdown summary: Thanos component has disappeared. expr: | - absent(up{job=~".*thanos-receive.*"} == 1) + absent(up{job=~".*{{ include "thanos.receive.fullname" . }}.*"} == 1) for: 5m labels: severity: critical @@ -87,7 +87,7 @@ spec: runbook_url: {{ .Values.metrics.prometheusRule.runbookUrl }}thanosruleisdown summary: Thanos component has disappeared. expr: | - absent(up{job=~".*thanos-rule.*"} == 1) + absent(up{job=~".*{{ include "thanos.ruler.fullname" . }}.*"} == 1) for: 5m labels: severity: critical @@ -123,7 +123,7 @@ spec: runbook_url: {{ .Values.metrics.prometheusRule.runbookUrl }}thanosstoreisdown summary: Thanos component has disappeared. 
expr: | - absent(up{job=~".*thanos-store.*"} == 1) + absent(up{job=~".*{{ include "thanos.storegateway.fullname" . }}.*"} == 1) for: 5m labels: severity: critical diff --git a/bitnami/thanos/templates/alert-rule/compaction.yml b/bitnami/thanos/templates/alert-rule/compaction.yml index 681d836d3b..50477867ed 100644 --- a/bitnami/thanos/templates/alert-rule/compaction.yml +++ b/bitnami/thanos/templates/alert-rule/compaction.yml @@ -32,7 +32,7 @@ spec: description: No more than one Thanos Compact instance should be running at once. There are {{`{{`}} $value {{`}}`}} instances running. runbook_url: {{ .Values.metrics.prometheusRule.runbookUrl }}thanoscompactmultiplerunning summary: Thanos Compact has multiple instances running. - expr: sum by (job) (up{job=~".*thanos-compact.*"}) > 1 + expr: sum by (job) (up{job=~".*{{ include "thanos.compactor.fullname" . }}.*"}) > 1 for: 5m labels: severity: warning @@ -49,7 +49,7 @@ spec: description: Thanos Compact {{`{{`}} $labels.job {{`}}`}} has failed to run and now is halted. runbook_url: {{ .Values.metrics.prometheusRule.runbookUrl }}thanoscompacthalted summary: Thanos Compact has failed to run and is now halted. - expr: thanos_compact_halted{job=~".*thanos-compact.*"} == 1 + expr: thanos_compact_halted{job=~".*{{ include "thanos.compactor.fullname" . }}.*"} == 1 for: 5m labels: severity: warning @@ -68,9 +68,9 @@ spec: summary: Thanos Compact is failing to execute compactions. expr: | ( - sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~".*thanos-compact.*"}[5m])) + sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~".*{{ include "thanos.compactor.fullname" . }}.*"}[5m])) / - sum by (job) (rate(thanos_compact_group_compactions_total{job=~".*thanos-compact.*"}[5m])) + sum by (job) (rate(thanos_compact_group_compactions_total{job=~".*{{ include "thanos.compactor.fullname" .
}}.*"}[5m])) * 100 > 5 ) for: 15m @@ -91,9 +91,9 @@ spec: summary: Thanos Compact Bucket is having a high number of operation failures. expr: | ( - sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-compact.*"}[5m])) + sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*{{ include "thanos.compactor.fullname" . }}.*"}[5m])) / - sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-compact.*"}[5m])) + sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*{{ include "thanos.compactor.fullname" . }}.*"}[5m])) * 100 > 5 ) for: 15m @@ -112,7 +112,7 @@ spec: description: Thanos Compact {{`{{`}} $labels.job {{`}}`}} has not uploaded anything for 24 hours. runbook_url: {{ .Values.metrics.prometheusRule.runbookUrl }}thanoscompacthasnotrun summary: Thanos Compact has not uploaded anything for last 24 hours. - expr: (time() - max by (job) (max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~".*thanos-compact.*"}[24h]))) / 60 / 60 > 24 + expr: (time() - max by (job) (max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~".*{{ include "thanos.compactor.fullname" . }}.*"}[24h]))) / 60 / 60 > 24 labels: severity: warning {{- if .Values.metrics.prometheusRule.additionalLabels }} diff --git a/bitnami/thanos/templates/alert-rule/query.yml b/bitnami/thanos/templates/alert-rule/query.yml index d213572e59..a1049e0f27 100644 --- a/bitnami/thanos/templates/alert-rule/query.yml +++ b/bitnami/thanos/templates/alert-rule/query.yml @@ -34,9 +34,9 @@ spec: summary: Thanos Query is failing to handle requests. expr: | ( - sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query"}[5m])) + sum by (job) (rate(http_requests_total{code=~"5..", job=~".*{{ include "thanos.query.fullname" .
}}.*", handler="query"}[5m])) / - sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query"}[5m])) + sum by (job) (rate(http_requests_total{job=~".*{{ include "thanos.query.fullname" . }}.*", handler="query"}[5m])) ) * 100 > 5 for: 5m labels: @@ -56,9 +56,9 @@ spec: summary: Thanos Query is failing to handle requests. expr: | ( - sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query_range"}[5m])) + sum by (job) (rate(http_requests_total{code=~"5..", job=~".*{{ include "thanos.query.fullname" . }}.*", handler="query_range"}[5m])) / - sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query_range"}[5m])) + sum by (job) (rate(http_requests_total{job=~".*{{ include "thanos.query.fullname" . }}.*", handler="query_range"}[5m])) ) * 100 > 5 for: 5m labels: @@ -78,9 +78,9 @@ spec: summary: Thanos Query is failing to handle requests. expr: | ( - sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-query.*"}[5m])) + sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*{{ include "thanos.query.fullname" . }}.*"}[5m])) / - sum by (job) (rate(grpc_server_started_total{job=~".*thanos-query.*"}[5m])) + sum by (job) (rate(grpc_server_started_total{job=~".*{{ include "thanos.query.fullname" . }}.*"}[5m])) * 100 > 5 ) for: 5m @@ -101,9 +101,9 @@ spec: summary: Thanos Query is failing to send requests. expr: | ( - sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~".*thanos-query.*"}[5m])) + sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~".*{{ include "thanos.query.fullname" . }}.*"}[5m])) / - sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m])) + sum by (job) (rate(grpc_client_started_total{job=~".*{{ include "thanos.query.fullname" . 
}}.*"}[5m])) ) * 100 > 5 for: 5m labels: @@ -123,9 +123,9 @@ spec: summary: Thanos Query is having high number of DNS failures. expr: | ( - sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~".*thanos-query.*"}[5m])) + sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~".*{{ include "thanos.query.fullname" . }}.*"}[5m])) / - sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m])) + sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*{{ include "thanos.query.fullname" . }}.*"}[5m])) ) * 100 > 1 for: 15m labels: @@ -145,9 +145,9 @@ spec: summary: Thanos Query has high latency for queries. expr: | ( - histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40 + histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*{{ include "thanos.query.fullname" . }}.*", handler="query"}[5m]))) > 40 and - sum by (job) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m])) > 0 + sum by (job) (rate(http_request_duration_seconds_bucket{job=~".*{{ include "thanos.query.fullname" . }}.*", handler="query"}[5m])) > 0 ) for: 10m labels: @@ -167,9 +167,9 @@ spec: summary: Thanos Query has high latency for queries. expr: | ( - histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query_range"}[5m]))) > 90 + histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*{{ include "thanos.query.fullname" . }}.*", handler="query_range"}[5m]))) > 90 and - sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query_range"}[5m])) > 0 + sum by (job) (rate(http_request_duration_seconds_count{job=~".*{{ include "thanos.query.fullname" . 
}}.*", handler="query_range"}[5m])) > 0 ) for: 10m labels: diff --git a/bitnami/thanos/templates/alert-rule/receive.yml b/bitnami/thanos/templates/alert-rule/receive.yml index a65ba967de..06fdb8cd4d 100644 --- a/bitnami/thanos/templates/alert-rule/receive.yml +++ b/bitnami/thanos/templates/alert-rule/receive.yml @@ -34,9 +34,9 @@ spec: summary: Thanos Receive is failing to handle requests. expr: | ( - sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-receive.*", handler="receive"}[5m])) + sum by (job) (rate(http_requests_total{code=~"5..", job=~".*{{ include "thanos.receive.fullname" . }}.*", handler="receive"}[5m])) / - sum by (job) (rate(http_requests_total{job=~".*thanos-receive.*", handler="receive"}[5m])) + sum by (job) (rate(http_requests_total{job=~".*{{ include "thanos.receive.fullname" . }}.*", handler="receive"}[5m])) ) * 100 > 5 for: 5m labels: @@ -56,9 +56,9 @@ spec: summary: Thanos Receive has high HTTP requests latency. expr: | ( - histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-receive.*", handler="receive"}[5m]))) > 10 + histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*{{ include "thanos.receive.fullname" . }}.*", handler="receive"}[5m]))) > 10 and - sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-receive.*", handler="receive"}[5m])) > 0 + sum by (job) (rate(http_request_duration_seconds_count{job=~".*{{ include "thanos.receive.fullname" . }}.*", handler="receive"}[5m])) > 0 ) for: 10m labels: @@ -81,15 +81,15 @@ spec: and ( ( - sum by (job) (rate(thanos_receive_replications_total{result="error", job=~".*thanos-receive.*"}[5m])) + sum by (job) (rate(thanos_receive_replications_total{result="error", job=~".*{{ include "thanos.receive.fullname" . 
}}.*"}[5m])) / - sum by (job) (rate(thanos_receive_replications_total{job=~".*thanos-receive.*"}[5m])) + sum by (job) (rate(thanos_receive_replications_total{job=~".*{{ include "thanos.receive.fullname" . }}.*"}[5m])) ) > ( - max by (job) (floor((thanos_receive_replication_factor{job=~".*thanos-receive.*"}+1) / 2)) + max by (job) (floor((thanos_receive_replication_factor{job=~".*{{ include "thanos.receive.fullname" . }}.*"}+1) / 2)) / - max by (job) (thanos_receive_hashring_nodes{job=~".*thanos-receive.*"}) + max by (job) (thanos_receive_hashring_nodes{job=~".*{{ include "thanos.receive.fullname" . }}.*"}) ) ) * 100 for: 5m @@ -110,9 +110,9 @@ spec: summary: Thanos Receive is failing to forward requests. expr: | ( - sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~".*thanos-receive.*"}[5m])) + sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~".*{{ include "thanos.receive.fullname" . }}.*"}[5m])) / - sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m])) + sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*{{ include "thanos.receive.fullname" . }}.*"}[5m])) ) * 100 > 20 for: 5m labels: @@ -132,9 +132,9 @@ spec: summary: Thanos Receive is failing to refresh hasring file. expr: | ( - sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~".*thanos-receive.*"}[5m])) + sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~".*{{ include "thanos.receive.fullname" . }}.*"}[5m])) / - sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m])) + sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*{{ include "thanos.receive.fullname" . }}.*"}[5m])) > 0 ) for: 15m @@ -153,7 +153,7 @@ spec: description: Thanos Receive {{`{{`}} $labels.job {{`}}`}} has not been able to reload hashring configurations. 
runbook_url: {{ .Values.metrics.prometheusRule.runbookUrl }}thanosreceiveconfigreloadfailure summary: Thanos Receive has not been able to reload configuration. - expr: avg by (job) (thanos_receive_config_last_reload_successful{job=~".*thanos-receive.*"}) != 1 + expr: avg by (job) (thanos_receive_config_last_reload_successful{job=~".*{{ include "thanos.receive.fullname" . }}.*"}) != 1 for: 5m labels: severity: warning @@ -171,9 +171,9 @@ spec: runbook_url: {{ .Values.metrics.prometheusRule.runbookUrl }}thanosreceivenoupload summary: Thanos Receive has not uploaded latest data to object storage. expr: | - (up{job=~".*thanos-receive.*"} - 1) + (up{job=~".*{{ include "thanos.receive.fullname" . }}.*"} - 1) + on (job, instance) # filters to only alert on current instance last 3h - (sum by (job, instance) (increase(thanos_shipper_uploads_total{job=~".*thanos-receive.*"}[3h])) == 0) + (sum by (job, instance) (increase(thanos_shipper_uploads_total{job=~".*{{ include "thanos.receive.fullname" . }}.*"}[3h])) == 0) for: 3h labels: severity: critical @@ -192,9 +192,9 @@ spec: summary: Thanos Receive is experiencing low avg. 1-hr ingestion rate relative to avg. 12-hr ingestion rate. expr: | ( - avg_over_time(rate(http_requests_total{job=~".*thanos-receive.*", code=~"2..", handler="receive"}[5m])[1h:5m]) + avg_over_time(rate(http_requests_total{job=~".*{{ include "thanos.receive.fullname" . }}.*", code=~"2..", handler="receive"}[5m])[1h:5m]) / - avg_over_time(rate(http_requests_total{job=~".*thanos-receive.*", code=~"2..", handler="receive"}[5m])[12h:5m]) + avg_over_time(rate(http_requests_total{job=~".*{{ include "thanos.receive.fullname" . 
}}.*", code=~"2..", handler="receive"}[5m])[12h:5m]) ) * 100 < 50 for: 1h labels: diff --git a/bitnami/thanos/templates/alert-rule/replicate.yml b/bitnami/thanos/templates/alert-rule/replicate.yml index 5acc852f5c..7b213f1802 100644 --- a/bitnami/thanos/templates/alert-rule/replicate.yml +++ b/bitnami/thanos/templates/alert-rule/replicate.yml @@ -34,9 +34,9 @@ spec: summary: Thanos Replicate is failing to run. expr: | ( - sum by (job) (rate(thanos_replicate_replication_runs_total{result="error", job=~".*thanos-bucket-replicate.*"}[5m])) + sum by (job) (rate(thanos_replicate_replication_runs_total{result="error", job=~".*{{ include "common.names.fullname" . }}-bucket-replicate.*"}[5m])) / on (job) group_left - sum by (job) (rate(thanos_replicate_replication_runs_total{job=~".*thanos-bucket-replicate.*"}[5m])) + sum by (job) (rate(thanos_replicate_replication_runs_total{job=~".*{{ include "common.names.fullname" . }}-bucket-replicate.*"}[5m])) ) * 100 >= 10 for: 5m labels: @@ -56,9 +56,9 @@ spec: summary: Thanos Replicate has a high latency for replicate operations. expr: | ( - histogram_quantile(0.99, sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m]))) > 20 + histogram_quantile(0.99, sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*{{ include "common.names.fullname" . }}-bucket-replicate.*"}[5m]))) > 20 and - sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m])) > 0 + sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*{{ include "common.names.fullname" .
}}-bucket-replicate.*"}[5m])) > 0 ) for: 5m labels: diff --git a/bitnami/thanos/templates/alert-rule/ruler.yml b/bitnami/thanos/templates/alert-rule/ruler.yml index b778ff87c9..811a5dbffc 100644 --- a/bitnami/thanos/templates/alert-rule/ruler.yml +++ b/bitnami/thanos/templates/alert-rule/ruler.yml @@ -33,7 +33,7 @@ spec: runbook_url: {{ .Values.metrics.prometheusRule.runbookUrl }}thanosrulequeueisdroppingalerts summary: Thanos Rule is failing to queue alerts. expr: | - sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0 + sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total{job=~".*{{ include "thanos.ruler.fullname" . }}.*"}[5m])) > 0 for: 5m labels: severity: critical @@ -51,7 +51,7 @@ spec: runbook_url: {{ .Values.metrics.prometheusRule.runbookUrl }}thanosrulesenderisfailingalerts summary: Thanos Rule is failing to send alerts to alertmanager. expr: | - sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0 + sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job=~".*{{ include "thanos.ruler.fullname" . }}.*"}[5m])) > 0 for: 5m labels: severity: critical @@ -70,9 +70,9 @@ spec: summary: Thanos Rule is failing to evaluate rules. expr: | ( - sum by (job, instance) (rate(prometheus_rule_evaluation_failures_total{job=~".*thanos-rule.*"}[5m])) + sum by (job, instance) (rate(prometheus_rule_evaluation_failures_total{job=~".*{{ include "thanos.ruler.fullname" . }}.*"}[5m])) / - sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) + sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*{{ include "thanos.ruler.fullname" . }}.*"}[5m])) * 100 > 5 ) for: 5m @@ -92,7 +92,7 @@ spec: runbook_url: {{ .Values.metrics.prometheusRule.runbookUrl }}thanosrulehighruleevaluationwarnings summary: Thanos Rule has high number of evaluation warnings. 
expr: | - sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job=~".*thanos-rule.*"}[5m])) > 0 + sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job=~".*{{ include "thanos.ruler.fullname" . }}.*"}[5m])) > 0 for: 15m labels: severity: info @@ -111,9 +111,9 @@ spec: summary: Thanos Rule has high rule evaluation latency. expr: | ( - sum by (job, instance, rule_group) (prometheus_rule_group_last_duration_seconds{job=~".*thanos-rule.*"}) + sum by (job, instance, rule_group) (prometheus_rule_group_last_duration_seconds{job=~".*{{ include "thanos.ruler.fullname" . }}.*"}) > - sum by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"}) + sum by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=~".*{{ include "thanos.ruler.fullname" . }}.*"}) ) for: 5m labels: @@ -133,9 +133,9 @@ spec: summary: Thanos Rule is failing to handle grpc requests. expr: | ( - sum by (job, instance) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-rule.*"}[5m])) + sum by (job, instance) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*{{ include "thanos.ruler.fullname" . }}.*"}[5m])) / - sum by (job, instance) (rate(grpc_server_started_total{job=~".*thanos-rule.*"}[5m])) + sum by (job, instance) (rate(grpc_server_started_total{job=~".*{{ include "thanos.ruler.fullname" . }}.*"}[5m])) * 100 > 5 ) for: 5m @@ -154,7 +154,7 @@ spec: description: Thanos Rule {{`{{`}} $labels.job {{`}}`}} has not been able to reload its configuration. runbook_url: {{ .Values.metrics.prometheusRule.runbookUrl }}thanosruleconfigreloadfailure summary: Thanos Rule has not been able to reload configuration. 
- expr: avg by (job, instance) (thanos_rule_config_last_reload_successful{job=~".*thanos-rule.*"}) != 1 + expr: avg by (job, instance) (thanos_rule_config_last_reload_successful{job=~".*{{ include "thanos.ruler.fullname" . }}.*"}) != 1 for: 5m labels: severity: info @@ -173,9 +173,9 @@ spec: summary: Thanos Rule is having high number of DNS failures. expr: | ( - sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total{job=~".*thanos-rule.*"}[5m])) + sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total{job=~".*{{ include "thanos.ruler.fullname" . }}.*"}[5m])) / - sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) + sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*{{ include "thanos.ruler.fullname" . }}.*"}[5m])) * 100 > 1 ) for: 15m @@ -196,9 +196,9 @@ spec: summary: Thanos Rule is having high number of DNS failures. expr: | ( - sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~".*thanos-rule.*"}[5m])) + sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~".*{{ include "thanos.ruler.fullname" . }}.*"}[5m])) / - sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) + sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*{{ include "thanos.ruler.fullname" . }}.*"}[5m])) * 100 > 1 ) for: 15m @@ -218,9 +218,9 @@ spec: runbook_url: {{ .Values.metrics.prometheusRule.runbookUrl }}thanosrulenoevaluationfor10intervals summary: Thanos Rule has rule groups that did not evaluate for 10 intervals. expr: | - time() - max by (job, instance, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~".*thanos-rule.*"}) + time() - max by (job, instance, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~".*{{ include "thanos.ruler.fullname" . 
}}.*"}) > - 10 * max by (job, instance, group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"}) + 10 * max by (job, instance, group) (prometheus_rule_group_interval_seconds{job=~".*{{ include "thanos.ruler.fullname" . }}.*"}) for: 5m labels: severity: info @@ -238,9 +238,9 @@ spec: runbook_url: {{ .Values.metrics.prometheusRule.runbookUrl }}thanosnoruleevaluations summary: Thanos Rule did not perform any rule evaluations. expr: | - sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) <= 0 + sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*{{ include "thanos.ruler.fullname" . }}.*"}[5m])) <= 0 and - sum by (job, instance) (thanos_rule_loaded_rules{job=~".*thanos-rule.*"}) > 0 + sum by (job, instance) (thanos_rule_loaded_rules{job=~".*{{ include "thanos.ruler.fullname" . }}.*"}) > 0 for: 5m labels: severity: critical