2025-09-24 13:47:55 +07:00

39 lines
1.4 KiB
YAML

# Alerting rules for the "online" metrics (Prometheus rule-file layout:
# groups -> rules -> alert/expr/for/labels/annotations).
# Every rule below evaluates its expression over a 5-minute range and must
# stay true for a further 5 minutes (`for: 5m`) before the alert fires.
groups:
  - name: online_metrics_alerts
    rules:
      # More than 10 error-status increments of sms_online_status_set_total,
      # summed across all series, within a 5-minute window.
      - alert: OnlineStatusErrors
        expr: sum(increase(sms_online_status_set_total{status="error"}[5m])) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High number of online status errors"
          description: "The number of online status errors has exceeded 10 in the last 5 minutes."

      # More than 5 cache-operation errors within 5 minutes, evaluated
      # separately per `operation` label (one alert instance per operation).
      - alert: CacheOperationErrors
        expr: sum by (operation) (increase(sms_online_cache_operations_total{status="error"}[5m])) > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High number of cache operation errors"
          description: "Cache errors for operation={{ $labels.operation }} exceeded 5 in 5m."

      # Any increase of the persistence error counter within 5 minutes.
      # NOTE(review): the range window and the `for` duration are both 5m, so
      # a single short burst of errors may slide out of the window before the
      # pending period elapses; this rule may only fire on sustained errors.
      # Confirm that is the intended behavior for a critical alert.
      - alert: PersistenceErrors
        expr: sum(increase(sms_online_persistence_errors_total[5m])) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Persistence errors detected"
          description: "Persistence errors have been detected in the online module."

      # 95th-percentile persistence latency above 0.5 seconds, computed from
      # the histogram buckets aggregated over all series (grouped by `le`).
      - alert: HighPersistenceLatency
        expr: histogram_quantile(0.95, sum(rate(sms_online_persistence_latency_seconds_bucket[5m])) by (le)) > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High persistence latency"
          description: "The 95th percentile persistence latency has exceeded 0.5 seconds."