maj provisioning pour alerts via mail
parent
43441f012e
commit
379d491b39
@ -1,172 +0,0 @@
|
||||
groups:
|
||||
- name: targets
|
||||
rules:
|
||||
- alert: monitor_service_down
|
||||
expr: up == 0
|
||||
for: 30s
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Monitor service non-operational"
|
||||
description: "Service {{ $labels.instance }} is down."
|
||||
|
||||
## FOR HOST ##################################################################
|
||||
|
||||
- name: host
|
||||
rules:
|
||||
- alert: HostHighCpuLoad
|
||||
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host high CPU load (instance {{ $labels.instance }})
|
||||
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: high_memory_load
|
||||
expr: (sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) ) / sum(node_memory_MemTotal_bytes) * 100 > 85
|
||||
for: 30s
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Server memory is almost full"
|
||||
description: "Docker host memory usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
|
||||
|
||||
- alert: HostPhysicalComponentTooHot
|
||||
expr: node_hwmon_temp_celsius > 75
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host physical component too hot (instance {{ $labels.instance }})
|
||||
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostSwapIsFillingUp
|
||||
expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host swap is filling up (instance {{ $labels.instance }})
|
||||
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: high_storage_load
|
||||
expr: (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"}) / node_filesystem_size_bytes{fstype="aufs"} * 100 > 85
|
||||
for: 30s
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Server storage is almost full"
|
||||
description: "Docker host storage usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
|
||||
|
||||
- alert: HostOutOfMemory
|
||||
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host out of memory (instance {{ $labels.instance }})
|
||||
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualNetworkThroughputIn
|
||||
expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual network throughput in (instance {{ $labels.instance }})
|
||||
description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualNetworkThroughputOut
|
||||
expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual network throughput out (instance {{ $labels.instance }})
|
||||
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostOutOfDiskSpace
|
||||
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host out of disk space (instance {{ $labels.instance }})
|
||||
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
## FOR RAID ##########################################################
|
||||
|
||||
- alert: HostRaidArrayGotInactive
|
||||
expr: node_md_state{state="inactive"} > 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Host RAID array got inactive (instance {{ $labels.instance }})
|
||||
description: "RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostRaidDiskFailure
|
||||
expr: node_md_disks{state="failed"} > 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host RAID disk failure (instance {{ $labels.instance }})
|
||||
description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
|
||||
## FOR CONTAINERS #####################################################
|
||||
|
||||
- name: containers
|
||||
rules:
|
||||
- alert: nextcloud_down
|
||||
expr: absent(container_memory_usage_bytes{name="jenkins"})
|
||||
for: 30s
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Nextcloud down"
|
||||
description: "Nextcloud container is down for more than 30 seconds."
|
||||
|
||||
- alert: ContainerCpuUsage
|
||||
expr: (sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 80
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Container CPU usage (instance {{ $labels.instance }})
|
||||
description: "Container CPU usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ContainerMemoryUsage
|
||||
expr: (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Container Memory usage (instance {{ $labels.instance }})
|
||||
description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
|
||||
## FOR NGINX ##########################################################
|
||||
|
||||
- name: nginx
|
||||
rules:
|
||||
- alert: NginxHighHttp4xxErrorRate
|
||||
expr: sum(rate(nginx_http_requests_total{status=~"^4.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Nginx high HTTP 4xx error rate (instance {{ $labels.instance }})
|
||||
description: "Too many HTTP requests with status 4xx (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NginxHighHttp5xxErrorRate
|
||||
expr: sum(rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Nginx high HTTP 5xx error rate (instance {{ $labels.instance }})
|
||||
description: "Too many HTTP requests with status 5xx (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
@ -1,11 +0,0 @@
|
||||
route:
|
||||
receiver: 'slack'
|
||||
|
||||
receivers:
|
||||
- name: 'slack'
|
||||
slack_configs:
|
||||
- send_resolved: true
|
||||
text: "{{ .CommonAnnotations.description }}"
|
||||
username: 'Prometheus'
|
||||
channel: '#prometheus'
|
||||
api_url: 'https://hooks.slack.com/services/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'
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,398 +0,0 @@
|
||||
{
|
||||
"id": null,
|
||||
"title": "Nginx",
|
||||
"description": "Nginx exporter metrics",
|
||||
"tags": [
|
||||
"nginx"
|
||||
],
|
||||
"style": "dark",
|
||||
"timezone": "browser",
|
||||
"editable": true,
|
||||
"hideControls": false,
|
||||
"sharedCrosshair": true,
|
||||
"rows": [
|
||||
{
|
||||
"collapse": false,
|
||||
"editable": true,
|
||||
"height": "250px",
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"datasource": "Prometheus",
|
||||
"decimals": 2,
|
||||
"editable": true,
|
||||
"error": false,
|
||||
"fill": 1,
|
||||
"grid": {
|
||||
"threshold1": null,
|
||||
"threshold1Color": "rgba(216, 200, 27, 0.27)",
|
||||
"threshold2": null,
|
||||
"threshold2Color": "rgba(234, 112, 112, 0.22)"
|
||||
},
|
||||
"id": 3,
|
||||
"isNew": true,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": true,
|
||||
"current": true,
|
||||
"max": true,
|
||||
"min": true,
|
||||
"rightSide": true,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 2,
|
||||
"links": [],
|
||||
"nullPointMode": "connected",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"span": 12,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(irate(nginx_connections_processed_total{stage=\"any\"}[5m])) by (stage)",
|
||||
"hide": false,
|
||||
"interval": "",
|
||||
"intervalFactor": 10,
|
||||
"legendFormat": "requests",
|
||||
"metric": "",
|
||||
"refId": "B",
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Requests/sec",
|
||||
"tooltip": {
|
||||
"msResolution": false,
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "cumulative"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"show": true
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"datasource": "Prometheus",
|
||||
"decimals": 2,
|
||||
"editable": true,
|
||||
"error": false,
|
||||
"fill": 1,
|
||||
"grid": {
|
||||
"threshold1": null,
|
||||
"threshold1Color": "rgba(216, 200, 27, 0.27)",
|
||||
"threshold2": null,
|
||||
"threshold2Color": "rgba(234, 112, 112, 0.22)"
|
||||
},
|
||||
"id": 2,
|
||||
"isNew": true,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": true,
|
||||
"current": true,
|
||||
"max": true,
|
||||
"min": true,
|
||||
"rightSide": true,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 2,
|
||||
"links": [],
|
||||
"nullPointMode": "connected",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"span": 12,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(nginx_connections_current) by (state)",
|
||||
"interval": "",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{state}}",
|
||||
"metric": "",
|
||||
"refId": "A",
|
||||
"step": 2
|
||||
}
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Connections",
|
||||
"tooltip": {
|
||||
"msResolution": false,
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "cumulative"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"show": true
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"datasource": "Prometheus",
|
||||
"decimals": 2,
|
||||
"editable": true,
|
||||
"error": false,
|
||||
"fill": 1,
|
||||
"grid": {
|
||||
"threshold1": null,
|
||||
"threshold1Color": "rgba(216, 200, 27, 0.27)",
|
||||
"threshold2": null,
|
||||
"threshold2Color": "rgba(234, 112, 112, 0.22)"
|
||||
},
|
||||
"id": 1,
|
||||
"isNew": true,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": true,
|
||||
"current": true,
|
||||
"max": true,
|
||||
"min": true,
|
||||
"rightSide": true,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 2,
|
||||
"links": [],
|
||||
"nullPointMode": "connected",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"span": 12,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(irate(nginx_connections_processed_total{stage!=\"any\"}[5m])) by (stage)",
|
||||
"hide": false,
|
||||
"interval": "",
|
||||
"intervalFactor": 10,
|
||||
"legendFormat": "{{stage}}",
|
||||
"metric": "",
|
||||
"refId": "B",
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Connections rate",
|
||||
"tooltip": {
|
||||
"msResolution": false,
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "cumulative"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"show": true
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"title": "Nginx exporter metrics"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"editable": true,
|
||||
"height": "250px",
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"datasource": null,
|
||||
"editable": true,
|
||||
"error": false,
|
||||
"fill": 1,
|
||||
"grid": {
|
||||
"threshold1": null,
|
||||
"threshold1Color": "rgba(216, 200, 27, 0.27)",
|
||||
"threshold2": null,
|
||||
"threshold2Color": "rgba(234, 112, 112, 0.22)"
|
||||
},
|
||||
"id": 4,
|
||||
"isNew": true,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": true,
|
||||
"current": true,
|
||||
"max": true,
|
||||
"min": true,
|
||||
"rightSide": true,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 2,
|
||||
"links": [],
|
||||
"nullPointMode": "connected",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"span": 12,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(container_cpu_usage_seconds_total{name=~\"nginx\"}[5m])) / count(node_cpu_seconds_total{mode=\"system\"}) * 100",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "nginx",
|
||||
"refId": "A",
|
||||
"step": 2
|
||||
}
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "CPU usage",
|
||||
"tooltip": {
|
||||
"msResolution": false,
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "cumulative"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"show": true
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"title": "Nginx container metrics"
|
||||
}
|
||||
],
|
||||
"time": {
|
||||
"from": "now-15m",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {
|
||||
"refresh_intervals": [
|
||||
"5s",
|
||||
"10s",
|
||||
"30s",
|
||||
"1m",
|
||||
"5m",
|
||||
"15m",
|
||||
"30m",
|
||||
"1h",
|
||||
"2h",
|
||||
"1d"
|
||||
],
|
||||
"time_options": [
|
||||
"5m",
|
||||
"15m",
|
||||
"1h",
|
||||
"6h",
|
||||
"12h",
|
||||
"24h",
|
||||
"2d",
|
||||
"7d",
|
||||
"30d"
|
||||
]
|
||||
},
|
||||
"templating": {
|
||||
"list": []
|
||||
},
|
||||
"annotations": {
|
||||
"list": []
|
||||
},
|
||||
"refresh": "10s",
|
||||
"schemaVersion": 12,
|
||||
"version": 9,
|
||||
"links": [],
|
||||
"gnetId": null
|
||||
}
|
@ -0,0 +1,11 @@
|
||||
apiVersion: 1
|
||||
|
||||
notifiers:
|
||||
- name: garagenum
|
||||
type: email
|
||||
uid: 1
|
||||
isDefault: true
|
||||
sendReminder: true
|
||||
disableResolveMessage: false
|
||||
settings:
|
||||
addresses: email-1@mail.com;email-2@mail.com
|
Loading…
Reference in New Issue