Update provisioning for alerts via email

main
Grégory Lebreton 12 months ago
parent 43441f012e
commit 379d491b39

@@ -1,34 +1,71 @@
# PROMETHEUS & GRAFANA :bar_chart:
This project monitors servers using Prometheus to centralize scraping, Node Exporter for host metrics, cAdvisor for Docker container metrics, and Grafana to display the metrics as dashboards.
![PROM](docs/prom.png)
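The stack is orchestrated with Docker Compose. As a rough sketch of how the services fit together (image names and ports below are illustrative assumptions, not necessarily this repository's actual docker-compose.yml):
```yml
# Hypothetical minimal layout of the monitoring stack (adapt to the real docker-compose.yml)
services:
  prometheus:
    image: prom/prometheus            # scrapes all exporters and evaluates alert rules
    ports: ["9090:9090"]
  node-exporter:
    image: prom/node-exporter         # host metrics (CPU, memory, disk, network)
    ports: ["9100:9100"]
  cadvisor:
    image: gcr.io/cadvisor/cadvisor   # per-container metrics for Docker
    ports: ["8080:8080"]
  alertmanager:
    image: prom/alertmanager          # routes alerts (e.g. to email)
    ports: ["9093:9093"]
  grafana:
    image: grafana/grafana            # dashboards on top of Prometheus
    ports: ["3000:3000"]
```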
## PREREQUISITES :memo:
- [docker + compose plugin]() :whale:
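A quick way to check the prerequisite is met:
```bash
docker --version
docker compose version   # the compose plugin must be installed
```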
## CONFIGURATION :wrench:
### EMAIL ALERTS :email:
#### VIA ALERTMANAGER
- Configure the mailbox used for alerts:
```bash
nano alertmanager/alertmanager.yml
```
> Fill in the fields:
```yml
# same fields as in alertmanager/alertmanager.yml in this repository
receivers:
  - name: 'email'
    email_configs:
      - to: 'mail-1@mail.com, mail-2@mail.com'
        from: 'mail@mail.com'
        smarthost: 'smtp.mail-provider.net:port'
        auth_username: 'mail@mail.com'
        auth_password: 'password'
        require_tls: yes
        send_resolved: true
```
- Configure the alert rules:
```bash
nano alertmanager/alert.rules
```
> Fill in the fields:
```yml
# example rule taken from alertmanager/alert.rules in this repository
- alert: monitor_service_down
  expr: up == 0
  for: 30s
  labels:
    severity: critical
  annotations:
    summary: "Monitor service non-operational"
    description: "Service {{ $labels.instance }} is down."
```
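Optionally, both files can be checked for syntax errors before restarting the stack; a sketch assuming the official images (which ship amtool and promtool) and the usual in-container paths, to be adjusted to the compose file:
```bash
# validate the Alertmanager configuration
docker compose exec alertmanager amtool check-config /etc/alertmanager/alertmanager.yml
# validate the Prometheus alerting rules
docker compose exec prometheus promtool check rules /etc/prometheus/alert.rules
```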
#### VIA GRAFANA
- Configure the mailbox used for alerts:
```bash
nano grafana/config/grafana.ini
```
> Fill in the fields:
```ini
; standard Grafana [smtp] keys (values are placeholders to adapt)
[smtp]
enabled = true
host = smtp.mail-provider.net:port
user = mail@mail.com
password = password
from_address = mail@mail.com
```
- Configure the default email notifier:
```bash
nano grafana/provisioning/notifiers.yml
```
> Fill in the fields:
```yml
# fields as in grafana/provisioning/notifiers.yml in this repository
apiVersion: 1
notifiers:
  - name: garagenum
    type: email
    uid: 1
    isDefault: true
    sendReminder: true
    disableResolveMessage: false
    settings:
      addresses: email-1@mail.com;email-2@mail.com
```
- Comment out alertmanager in docker-compose.yml:
```bash
sed -i "" docker-compose.yml
```
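The sed expression above is left to fill in; the intent is simply that, when alerting is handled by Grafana, the Alertmanager pieces are disabled, roughly as follows (the compose service layout shown is an assumption):
```yml
# docker-compose.yml: comment out the Alertmanager service
#  alertmanager:
#    image: prom/alertmanager
#    ...

# prometheus.yml: also comment out the alerting block (see the note in that file)
# alerting:
#   alertmanagers:
#     - scheme: http
```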
### GRAFANA SSO KEYCLOAK :key:
- Create a confidential client in Keycloak to obtain the client-secret
- Enter the domain name of your Grafana instance
@@ -58,11 +95,28 @@ api_url = https://votre-keycloak/auth/realms/votre-royaume/protocol/openid-conne
#disable_login_form = true
```
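For reference, the full section in grafana/config/grafana.ini typically looks like the following (a sketch using Grafana's standard `[auth.generic_oauth]` keys; the client id, realm and Keycloak host are placeholders to replace with your own values):
```ini
[auth.generic_oauth]
enabled = true
name = Keycloak
allow_sign_up = true
client_id = grafana
client_secret = <client-secret>
scopes = openid profile email
auth_url = https://votre-keycloak/auth/realms/votre-royaume/protocol/openid-connect/auth
token_url = https://votre-keycloak/auth/realms/votre-royaume/protocol/openid-connect/token
api_url = https://votre-keycloak/auth/realms/votre-royaume/protocol/openid-connect/userinfo
```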
## USAGE :checkered_flag:
- Start the stack:
```bash
docker compose up -d
```
> Grafana is reachable at: http://<IP-SERVER>:3000
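To quickly confirm the services respond (assuming the stack's default ports):
```bash
curl -s http://<IP-SERVER>:9090/-/healthy    # Prometheus
curl -s http://<IP-SERVER>:3000/api/health   # Grafana
```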
### NODE EXPORTER TLS
To deploy a node exporter on a remote server:
> see the [node-exporter](https://git.legaragenumerique.fr/GARAGENUM/prometheus-monitoring/src/branch/node-exporter) branch
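The exact setup lives in that branch; as a rough illustration, scraping a remote node exporter over HTTPS relies on Prometheus' standard `scheme` and `tls_config` options in prometheus.yml (paths and target below are placeholders):
```yml
scrape_configs:
  - job_name: 'remote-node'
    scheme: https
    tls_config:
      ca_file: /etc/prometheus/certs/ca.crt   # CA that signed the exporter's certificate
      # insecure_skip_verify: true            # only for testing with self-signed certificates
    static_configs:
      - targets: ['remote-server:9100']
```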
## TO DO :bookmark_tabs:
- [x] node exporter
- [x] node exporter -> prometheus via https
- [x] alert manager config / grafana alert via mail config
- [x] provision dashboard / default notifier
- [ ] dashboard for Grafana amd64:
  - [x] host metrics
  - [ ] cadvisor for docker
- [ ] alert configuration in Grafana + images

@@ -1,172 +0,0 @@
groups:
- name: targets
  rules:
  - alert: monitor_service_down
    expr: up == 0
    for: 30s
    labels:
      severity: critical
    annotations:
      summary: "Monitor service non-operational"
      description: "Service {{ $labels.instance }} is down."
## FOR HOST ##################################################################
- name: host
  rules:
  - alert: HostHighCpuLoad
    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host high CPU load (instance {{ $labels.instance }})
      description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: high_memory_load
    expr: (sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) ) / sum(node_memory_MemTotal_bytes) * 100 > 85
    for: 30s
    labels:
      severity: warning
    annotations:
      summary: "Server memory is almost full"
      description: "Docker host memory usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
  - alert: HostPhysicalComponentTooHot
    expr: node_hwmon_temp_celsius > 75
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host physical component too hot (instance {{ $labels.instance }})
      description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostSwapIsFillingUp
    expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host swap is filling up (instance {{ $labels.instance }})
      description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: high_storage_load
    expr: (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"}) / node_filesystem_size_bytes{fstype="aufs"} * 100 > 85
    for: 30s
    labels:
      severity: warning
    annotations:
      summary: "Server storage is almost full"
      description: "Docker host storage usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
  - alert: HostOutOfMemory
    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host out of memory (instance {{ $labels.instance }})
      description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostUnusualNetworkThroughputIn
    expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host unusual network throughput in (instance {{ $labels.instance }})
      description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostUnusualNetworkThroughputOut
    expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host unusual network throughput out (instance {{ $labels.instance }})
      description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostOutOfDiskSpace
    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host out of disk space (instance {{ $labels.instance }})
      description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
## FOR RAID ##########################################################
  - alert: HostRaidArrayGotInactive
    expr: node_md_state{state="inactive"} > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Host RAID array got inactive (instance {{ $labels.instance }})
      description: "RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostRaidDiskFailure
    expr: node_md_disks{state="failed"} > 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host RAID disk failure (instance {{ $labels.instance }})
      description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
## FOR CONTAINERS #####################################################
- name: containers
  rules:
  - alert: nextcloud_down
    expr: absent(container_memory_usage_bytes{name="nextcloud"})
    for: 30s
    labels:
      severity: critical
    annotations:
      summary: "Nextcloud down"
      description: "Nextcloud container is down for more than 30 seconds."
  - alert: ContainerCpuUsage
    expr: (sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Container CPU usage (instance {{ $labels.instance }})
      description: "Container CPU usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: ContainerMemoryUsage
    expr: (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Container Memory usage (instance {{ $labels.instance }})
      description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
## FOR NGINX ##########################################################
- name: nginx
  rules:
  - alert: NginxHighHttp4xxErrorRate
    expr: sum(rate(nginx_http_requests_total{status=~"^4.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: Nginx high HTTP 4xx error rate (instance {{ $labels.instance }})
      description: "Too many HTTP requests with status 4xx (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: NginxHighHttp5xxErrorRate
    expr: sum(rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: Nginx high HTTP 5xx error rate (instance {{ $labels.instance }})
      description: "Too many HTTP requests with status 5xx (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

@@ -11,12 +11,11 @@ route:
receivers:
  - name: 'email'
    email_configs:
      - to: 'mail-1@mail.com, mail-2@mail.com'
        from: 'mail@mail.com'
        smarthost: 'smtp.mail-provider.net:port'
        auth_username: 'mail@mail.com'
        auth_password: 'password'
        require_tls: yes
        send_resolved: true

@@ -1,11 +0,0 @@
route:
  receiver: 'slack'
receivers:
  - name: 'slack'
    slack_configs:
      - send_resolved: true
        text: "{{ .CommonAnnotations.description }}"
        username: 'Prometheus'
        channel: '#prometheus'
        api_url: 'https://hooks.slack.com/services/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'

File diff suppressed because it is too large

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@@ -1,398 +0,0 @@
{
"id": null,
"title": "Nginx",
"description": "Nginx exporter metrics",
"tags": [
"nginx"
],
"style": "dark",
"timezone": "browser",
"editable": true,
"hideControls": false,
"sharedCrosshair": true,
"rows": [
{
"collapse": false,
"editable": true,
"height": "250px",
"panels": [
{
"aliasColors": {},
"bars": false,
"datasource": "Prometheus",
"decimals": 2,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 3,
"isNew": true,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": true,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 12,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(nginx_connections_processed_total{stage=\"any\"}[5m])) by (stage)",
"hide": false,
"interval": "",
"intervalFactor": 10,
"legendFormat": "requests",
"metric": "",
"refId": "B",
"step": 10
}
],
"timeFrom": null,
"timeShift": null,
"title": "Requests/sec",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"show": true
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"aliasColors": {},
"bars": false,
"datasource": "Prometheus",
"decimals": 2,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 2,
"isNew": true,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": true,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 12,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(nginx_connections_current) by (state)",
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{state}}",
"metric": "",
"refId": "A",
"step": 2
}
],
"timeFrom": null,
"timeShift": null,
"title": "Connections",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"show": true
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"aliasColors": {},
"bars": false,
"datasource": "Prometheus",
"decimals": 2,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 1,
"isNew": true,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": true,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 12,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(nginx_connections_processed_total{stage!=\"any\"}[5m])) by (stage)",
"hide": false,
"interval": "",
"intervalFactor": 10,
"legendFormat": "{{stage}}",
"metric": "",
"refId": "B",
"step": 10
}
],
"timeFrom": null,
"timeShift": null,
"title": "Connections rate",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"show": true
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
}
],
"title": "Nginx exporter metrics"
},
{
"collapse": false,
"editable": true,
"height": "250px",
"panels": [
{
"aliasColors": {},
"bars": false,
"datasource": null,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 4,
"isNew": true,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": true,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 12,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(container_cpu_usage_seconds_total{name=~\"nginx\"}[5m])) / count(node_cpu_seconds_total{mode=\"system\"}) * 100",
"intervalFactor": 2,
"legendFormat": "nginx",
"refId": "A",
"step": 2
}
],
"timeFrom": null,
"timeShift": null,
"title": "CPU usage",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"show": true
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
}
],
"title": "Nginx container metrics"
}
],
"time": {
"from": "now-15m",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"templating": {
"list": []
},
"annotations": {
"list": []
},
"refresh": "10s",
"schemaVersion": 12,
"version": 9,
"links": [],
"gnetId": null
}

@@ -0,0 +1,11 @@
apiVersion: 1
notifiers:
  - name: garagenum
    type: email
    uid: 1
    isDefault: true
    sendReminder: true
    disableResolveMessage: false
    settings:
      addresses: email-1@mail.com;email-2@mail.com

@@ -52,6 +52,7 @@ scrape_configs:
# labels:
# instance: serveur-distant
# IF ALERTING VIA ALERTMANAGER, OTHERWISE COMMENT OUT!
alerting:
  alertmanagers:
    - scheme: http
