Update provisioning for alerts via email

main
Grégory Lebreton 12 months ago
parent 43441f012e
commit 379d491b39

@@ -1,34 +1,71 @@
# PROMETHEUS & GRAFANA :bar_chart:
This project monitors servers using Prometheus to centralize scraping, Node Exporter for host metrics, cAdvisor for Docker container metrics, and Grafana to display the metrics as dashboards.
![PROM](docs/prom.png)
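The stack is orchestrated with Docker Compose. As a rough sketch of how the services fit together (image names and ports below are illustrative assumptions, not necessarily this repository's actual docker-compose.yml):
```yml
# Hypothetical minimal layout of the monitoring stack (adapt to the real docker-compose.yml)
services:
  prometheus:
    image: prom/prometheus            # scrapes all exporters and evaluates alert rules
    ports: ["9090:9090"]
  node-exporter:
    image: prom/node-exporter         # host metrics (CPU, memory, disk, network)
    ports: ["9100:9100"]
  cadvisor:
    image: gcr.io/cadvisor/cadvisor   # per-container metrics for Docker
    ports: ["8080:8080"]
  alertmanager:
    image: prom/alertmanager          # routes alerts (e.g. to email)
    ports: ["9093:9093"]
  grafana:
    image: grafana/grafana            # dashboards on top of Prometheus
    ports: ["3000:3000"]
```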
## PREREQUISITES :memo:
- [docker + compose plugin]() :whale:
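A quick way to check the prerequisite is met:
```bash
docker --version
docker compose version   # the compose plugin must be installed
```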
## CONFIGURATION :wrench:
### EMAIL ALERTS :email:
#### VIA ALERTMANAGER
- Configure the mailbox used for alerts:
```bash
nano alertmanager/alertmanager.yml
```
> Fill in the fields:
```yml
# same fields as in alertmanager/alertmanager.yml in this repository
receivers:
  - name: 'email'
    email_configs:
      - to: 'mail-1@mail.com, mail-2@mail.com'
        from: 'mail@mail.com'
        smarthost: 'smtp.mail-provider.net:port'
        auth_username: 'mail@mail.com'
        auth_password: 'password'
        require_tls: yes
        send_resolved: true
```
- Configure the alert rules:
```bash
nano alertmanager/alert.rules
```
> Fill in the fields:
```yml
# example rule taken from alertmanager/alert.rules in this repository
- alert: monitor_service_down
  expr: up == 0
  for: 30s
  labels:
    severity: critical
  annotations:
    summary: "Monitor service non-operational"
    description: "Service {{ $labels.instance }} is down."
```
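Optionally, both files can be checked for syntax errors before restarting the stack; a sketch assuming the official images (which ship amtool and promtool) and the usual in-container paths, to be adjusted to the compose file:
```bash
# validate the Alertmanager configuration
docker compose exec alertmanager amtool check-config /etc/alertmanager/alertmanager.yml
# validate the Prometheus alerting rules
docker compose exec prometheus promtool check rules /etc/prometheus/alert.rules
```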
#### VIA GRAFANA
- Configure the mailbox used for alerts:
```bash
nano grafana/config/grafana.ini
```
> Fill in the fields:
```ini
; standard Grafana [smtp] keys (values are placeholders to adapt)
[smtp]
enabled = true
host = smtp.mail-provider.net:port
user = mail@mail.com
password = password
from_address = mail@mail.com
```
- Configure the default email notifier:
```bash
nano grafana/provisioning/notifiers.yml
```
> Fill in the fields:
```yml
# fields as in grafana/provisioning/notifiers.yml in this repository
apiVersion: 1
notifiers:
  - name: garagenum
    type: email
    uid: 1
    isDefault: true
    sendReminder: true
    disableResolveMessage: false
    settings:
      addresses: email-1@mail.com;email-2@mail.com
```
- Comment out alertmanager in docker-compose.yml:
```bash
sed -i "" docker-compose.yml
```
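The sed expression above is left to fill in; the intent is simply that, when alerting is handled by Grafana, the Alertmanager pieces are disabled, roughly as follows (the compose service layout shown is an assumption):
```yml
# docker-compose.yml: comment out the Alertmanager service
#  alertmanager:
#    image: prom/alertmanager
#    ...

# prometheus.yml: also comment out the alerting block (see the note in that file)
# alerting:
#   alertmanagers:
#     - scheme: http
```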
### GRAFANA SSO KEYCLOAK :key:
- Create a confidential client in Keycloak to obtain the client-secret
- Enter the domain name of your Grafana instance
@@ -58,11 +95,28 @@ api_url = https://votre-keycloak/auth/realms/votre-royaume/protocol/openid-conne
#disable_login_form = true
```
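For reference, the full section in grafana/config/grafana.ini typically looks like the following (a sketch using Grafana's standard `[auth.generic_oauth]` keys; the client id, realm and Keycloak host are placeholders to replace with your own values):
```ini
[auth.generic_oauth]
enabled = true
name = Keycloak
allow_sign_up = true
client_id = grafana
client_secret = <client-secret>
scopes = openid profile email
auth_url = https://votre-keycloak/auth/realms/votre-royaume/protocol/openid-connect/auth
token_url = https://votre-keycloak/auth/realms/votre-royaume/protocol/openid-connect/token
api_url = https://votre-keycloak/auth/realms/votre-royaume/protocol/openid-connect/userinfo
```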
## USAGE :checkered_flag:
- Start the stack:
```bash
docker compose up -d
```
> Grafana is reachable at: http://<IP-SERVER>:3000
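To quickly confirm the services respond (assuming the stack's default ports):
```bash
curl -s http://<IP-SERVER>:9090/-/healthy    # Prometheus
curl -s http://<IP-SERVER>:3000/api/health   # Grafana
```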
### NODE EXPORTER TLS
To deploy a node exporter on a remote server:
> see the [node-exporter](https://git.legaragenumerique.fr/GARAGENUM/prometheus-monitoring/src/branch/node-exporter) branch
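The exact setup lives in that branch; as a rough illustration, scraping a remote node exporter over HTTPS relies on Prometheus' standard `scheme` and `tls_config` options in prometheus.yml (paths and target below are placeholders):
```yml
scrape_configs:
  - job_name: 'remote-node'
    scheme: https
    tls_config:
      ca_file: /etc/prometheus/certs/ca.crt   # CA that signed the exporter's certificate
      # insecure_skip_verify: true            # only for testing with self-signed certificates
    static_configs:
      - targets: ['remote-server:9100']
```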
## TO DO :bookmark_tabs:
- [x] node exporter
- [x] node exporter -> prometheus via https
- [x] alert manager config / grafana alert via mail config
- [x] provision dashboard / default notifier
- [ ] dashboard for Grafana amd64:
  - [x] host metrics
  - [ ] cadvisor for docker
- [ ] alert configuration in Grafana + images

@@ -1,172 +0,0 @@
groups:
- name: targets
  rules:
  - alert: monitor_service_down
    expr: up == 0
    for: 30s
    labels:
      severity: critical
    annotations:
      summary: "Monitor service non-operational"
      description: "Service {{ $labels.instance }} is down."
## FOR HOST ##################################################################
- name: host
  rules:
  - alert: HostHighCpuLoad
    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host high CPU load (instance {{ $labels.instance }})
      description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: high_memory_load
    expr: (sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) ) / sum(node_memory_MemTotal_bytes) * 100 > 85
    for: 30s
    labels:
      severity: warning
    annotations:
      summary: "Server memory is almost full"
      description: "Docker host memory usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
  - alert: HostPhysicalComponentTooHot
    expr: node_hwmon_temp_celsius > 75
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host physical component too hot (instance {{ $labels.instance }})
      description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostSwapIsFillingUp
    expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host swap is filling up (instance {{ $labels.instance }})
      description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: high_storage_load
    expr: (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"}) / node_filesystem_size_bytes{fstype="aufs"} * 100 > 85
    for: 30s
    labels:
      severity: warning
    annotations:
      summary: "Server storage is almost full"
      description: "Docker host storage usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
  - alert: HostOutOfMemory
    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host out of memory (instance {{ $labels.instance }})
      description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostUnusualNetworkThroughputIn
    expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host unusual network throughput in (instance {{ $labels.instance }})
      description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostUnusualNetworkThroughputOut
    expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host unusual network throughput out (instance {{ $labels.instance }})
      description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostOutOfDiskSpace
    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host out of disk space (instance {{ $labels.instance }})
      description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
## FOR RAID ##########################################################
  - alert: HostRaidArrayGotInactive
    expr: node_md_state{state="inactive"} > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Host RAID array got inactive (instance {{ $labels.instance }})
      description: "RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostRaidDiskFailure
    expr: node_md_disks{state="failed"} > 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host RAID disk failure (instance {{ $labels.instance }})
      description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
## FOR CONTAINERS #####################################################
- name: containers
  rules:
  - alert: nextcloud_down
    expr: absent(container_memory_usage_bytes{name="nextcloud"})
    for: 30s
    labels:
      severity: critical
    annotations:
      summary: "Nextcloud down"
      description: "Nextcloud container is down for more than 30 seconds."
  - alert: ContainerCpuUsage
    expr: (sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Container CPU usage (instance {{ $labels.instance }})
      description: "Container CPU usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: ContainerMemoryUsage
    expr: (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Container Memory usage (instance {{ $labels.instance }})
      description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
## FOR NGINX ##########################################################
- name: nginx
  rules:
  - alert: NginxHighHttp4xxErrorRate
    expr: sum(rate(nginx_http_requests_total{status=~"^4.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: Nginx high HTTP 4xx error rate (instance {{ $labels.instance }})
      description: "Too many HTTP requests with status 4xx (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: NginxHighHttp5xxErrorRate
    expr: sum(rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: Nginx high HTTP 5xx error rate (instance {{ $labels.instance }})
      description: "Too many HTTP requests with status 5xx (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

@@ -11,12 +11,11 @@ route:
receivers:
  - name: 'email'
    email_configs:
      - to: 'mail-1@mail.com, mail-2@mail.com'
        from: 'mail@mail.com'
        smarthost: 'smtp.mail-provider.net:port'
        auth_username: 'mail@mail.com'
        auth_password: 'password'
        require_tls: yes
        send_resolved: true

@@ -1,11 +0,0 @@
route:
  receiver: 'slack'
receivers:
  - name: 'slack'
    slack_configs:
      - send_resolved: true
        text: "{{ .CommonAnnotations.description }}"
        username: 'Prometheus'
        channel: '#prometheus'
        api_url: 'https://hooks.slack.com/services/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'

File diff suppressed because it is too large

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@@ -1,398 +0,0 @@
{
"id": null,
"title": "Nginx",
"description": "Nginx exporter metrics",
"tags": [
"nginx"
],
"style": "dark",
"timezone": "browser",
"editable": true,
"hideControls": false,
"sharedCrosshair": true,
"rows": [
{
"collapse": false,
"editable": true,
"height": "250px",
"panels": [
{
"aliasColors": {},
"bars": false,
"datasource": "Prometheus",
"decimals": 2,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 3,
"isNew": true,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": true,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 12,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(nginx_connections_processed_total{stage=\"any\"}[5m])) by (stage)",
"hide": false,
"interval": "",
"intervalFactor": 10,
"legendFormat": "requests",
"metric": "",
"refId": "B",
"step": 10
}
],
"timeFrom": null,
"timeShift": null,
"title": "Requests/sec",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"show": true
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"aliasColors": {},
"bars": false,
"datasource": "Prometheus",
"decimals": 2,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 2,
"isNew": true,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": true,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 12,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(nginx_connections_current) by (state)",
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{state}}",
"metric": "",
"refId": "A",
"step": 2
}
],
"timeFrom": null,
"timeShift": null,
"title": "Connections",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"show": true
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"aliasColors": {},
"bars": false,
"datasource": "Prometheus",
"decimals": 2,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 1,
"isNew": true,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": true,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 12,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(nginx_connections_processed_total{stage!=\"any\"}[5m])) by (stage)",
"hide": false,
"interval": "",
"intervalFactor": 10,
"legendFormat": "{{stage}}",
"metric": "",
"refId": "B",
"step": 10
}
],
"timeFrom": null,
"timeShift": null,
"title": "Connections rate",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"show": true
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
}
],
"title": "Nginx exporter metrics"
},
{
"collapse": false,
"editable": true,
"height": "250px",
"panels": [
{
"aliasColors": {},
"bars": false,
"datasource": null,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 4,
"isNew": true,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": true,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 12,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(container_cpu_usage_seconds_total{name=~\"nginx\"}[5m])) / count(node_cpu_seconds_total{mode=\"system\"}) * 100",
"intervalFactor": 2,
"legendFormat": "nginx",
"refId": "A",
"step": 2
}
],
"timeFrom": null,
"timeShift": null,
"title": "CPU usage",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"show": true
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
}
],
"title": "Nginx container metrics"
}
],
"time": {
"from": "now-15m",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"templating": {
"list": []
},
"annotations": {
"list": []
},
"refresh": "10s",
"schemaVersion": 12,
"version": 9,
"links": [],
"gnetId": null
}

@@ -0,0 +1,11 @@
apiVersion: 1
notifiers:
  - name: garagenum
    type: email
    uid: 1
    isDefault: true
    sendReminder: true
    disableResolveMessage: false
    settings:
      addresses: email-1@mail.com;email-2@mail.com

@@ -52,6 +52,7 @@ scrape_configs:
# labels:
# instance: serveur-distant
# IF ALERTING VIA ALERTMANAGER, OTHERWISE COMMENT OUT!
alerting:
  alertmanagers:
    - scheme: http
