Compare commits
7 Commits
main
...
node-expor
| Author | SHA1 | Date | |
|---|---|---|---|
| 3f9b2c4008 | |||
| 8d7b26bc02 | |||
| 6c6bef9cff | |||
| 8362a8e084 | |||
| d0f58fc3a9 | |||
| 4e141271b3 | |||
| 77f8c2ad18 |
45
README.md
45
README.md
@ -1,40 +1,18 @@
|
||||
# PROMETHEUS & GRAFANA
|
||||
|
||||
Ce projet vise à monitorer des serveurs via prometheus pour centraliser le scraping, node exporter pour les host metrics, cadvisor pour les metrics des conteneurs docker et grafana pour afficher les metrics sous forme de dashboards
|
||||
## NODE EXPORTER TLS
|
||||
|
||||

|
||||
|
||||
## CONFIGURATION
|
||||
|
||||
- Configuration de la boîte mail pour les alertes:
|
||||
```bash
|
||||
nano alertmanager/alertmanager.yml
|
||||
```
|
||||
|
||||
- Configuration des alertes:
|
||||
```bash
|
||||
nano alertmanager/alert.rules
|
||||
```
|
||||
|
||||
- Configuration des alertes:
|
||||
```bash
|
||||
nano alertmanager/alert.rules
|
||||
```
|
||||
|
||||
> Grafana est accessible via l'adresse: http://<IP-SERVER>:3000
|
||||
|
||||
### NODE EXPORTER TLS
|
||||
|
||||
- Créer un enregistrement DNS pointant vers votre serveur
|
||||
- Créer une configuration serveur pointant le DNS vers 127.0.0.1:9100 (port exposé par node exporter) [exemple pour nginx](docs/nginx-config)
|
||||
Cette branche à pour but de déployer un conteneur node exporter afin d'exposer les metrics du serveur (afin que prometheus puisse les récupérer).
|
||||
Les metrics transférées sont cryptées via TLS.
|
||||
|
||||
#### NODE EXPORTER HOST
|
||||
|
||||
- Create certs:
|
||||
|
||||
```bash
|
||||
openssl req -new -newkey rsa:4096 -days 365 -nodes -x509 -keyout gn-prod.key -out gn-prod.crt -subj "/C=FR/ST=PARIS/L=GarageNum/O=prom/CN=legaragenumerique.fr" -addext "subjectAltName = DNS:gnprod"
|
||||
openssl req -new -newkey rsa:4096 -days 3650 -nodes -x509 -keyout gn-prod.key -out gn-prod.crt -subj "/C=FR/ST=PARIS/L=GarageNum/O=prom/CN=legaragenumerique.fr" -addext "subjectAltName = DNS:gnprod"
|
||||
```
|
||||
> remplacer Les infos (C=FR, ST=PARIS, L=GarageNum, ...)
|
||||
|
||||
- Create password:
|
||||
|
||||
@ -54,7 +32,7 @@ basic_auth_users:
|
||||
|
||||
- Copy certs to prometheus host
|
||||
|
||||
## PROMETHEUS HOST
|
||||
#### PROMETHEUS HOST
|
||||
|
||||
- Config prometheus.yml (/etc/prometheus/prometheus.yml):
|
||||
|
||||
@ -74,13 +52,4 @@ scrape_configs:
|
||||
instance: friendly-instance-name
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
## TO DO
|
||||
|
||||
- [x] node exporter
|
||||
- [x] node exporter -> prometheus via https
|
||||
- [ ] dashboard for Grafana amd64:
|
||||
- [x] host metrics
|
||||
- [ ] cadvisor for docker
|
||||
> le scraping des metrics du serveur distant se font désormais via TLS
|
||||
|
||||
@ -1,172 +0,0 @@
|
||||
groups:
|
||||
- name: targets
|
||||
rules:
|
||||
- alert: monitor_service_down
|
||||
expr: up == 0
|
||||
for: 30s
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Monitor service non-operational"
|
||||
description: "Service {{ $labels.instance }} is down."
|
||||
|
||||
## FOR HOST ##################################################################
|
||||
|
||||
- name: host
|
||||
rules:
|
||||
- alert: HostHighCpuLoad
|
||||
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host high CPU load (instance {{ $labels.instance }})
|
||||
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: high_memory_load
|
||||
expr: (sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) ) / sum(node_memory_MemTotal_bytes) * 100 > 85
|
||||
for: 30s
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Server memory is almost full"
|
||||
description: "Docker host memory usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
|
||||
|
||||
- alert: HostPhysicalComponentTooHot
|
||||
expr: node_hwmon_temp_celsius > 75
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host physical component too hot (instance {{ $labels.instance }})
|
||||
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostSwapIsFillingUp
|
||||
expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host swap is filling up (instance {{ $labels.instance }})
|
||||
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: high_storage_load
|
||||
expr: (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"}) / node_filesystem_size_bytes{fstype="aufs"} * 100 > 85
|
||||
for: 30s
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Server storage is almost full"
|
||||
description: "Docker host storage usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
|
||||
|
||||
- alert: HostOutOfMemory
|
||||
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host out of memory (instance {{ $labels.instance }})
|
||||
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualNetworkThroughputIn
|
||||
expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual network throughput in (instance {{ $labels.instance }})
|
||||
description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualNetworkThroughputOut
|
||||
expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual network throughput out (instance {{ $labels.instance }})
|
||||
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostOutOfDiskSpace
|
||||
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host out of disk space (instance {{ $labels.instance }})
|
||||
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
## FOR RAID ##########################################################
|
||||
|
||||
- alert: HostRaidArrayGotInactive
|
||||
expr: node_md_state{state="inactive"} > 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Host RAID array got inactive (instance {{ $labels.instance }})
|
||||
description: "RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostRaidDiskFailure
|
||||
expr: node_md_disks{state="failed"} > 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host RAID disk failure (instance {{ $labels.instance }})
|
||||
description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
|
||||
## FOR CONTAINERS #####################################################
|
||||
|
||||
- name: containers
|
||||
rules:
|
||||
- alert: nextcloud_down
|
||||
expr: absent(container_memory_usage_bytes{name="jenkins"})
|
||||
for: 30s
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Nextcloud down"
|
||||
description: "Nextcloud container is down for more than 30 seconds."
|
||||
|
||||
- alert: ContainerCpuUsage
|
||||
expr: (sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 80
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Container CPU usage (instance {{ $labels.instance }})
|
||||
description: "Container CPU usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ContainerMemoryUsage
|
||||
expr: (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Container Memory usage (instance {{ $labels.instance }})
|
||||
description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
|
||||
## FOR NGINX ##########################################################
|
||||
|
||||
- name: nginx
|
||||
rules:
|
||||
- alert: NginxHighHttp4xxErrorRate
|
||||
expr: sum(rate(nginx_http_requests_total{status=~"^4.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Nginx high HTTP 4xx error rate (instance {{ $labels.instance }})
|
||||
description: "Too many HTTP requests with status 4xx (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NginxHighHttp5xxErrorRate
|
||||
expr: sum(rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Nginx high HTTP 5xx error rate (instance {{ $labels.instance }})
|
||||
description: "Too many HTTP requests with status 5xx (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
@ -1,172 +0,0 @@
|
||||
groups:
|
||||
- name: targets
|
||||
rules:
|
||||
- alert: monitor_service_down
|
||||
expr: up == 0
|
||||
for: 30s
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Monitor service non-operational"
|
||||
description: "Service {{ $labels.instance }} is down."
|
||||
|
||||
## FOR HOST ##################################################################
|
||||
|
||||
- name: host
|
||||
rules:
|
||||
- alert: HostHighCpuLoad
|
||||
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host high CPU load (instance {{ $labels.instance }})
|
||||
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: high_memory_load
|
||||
expr: (sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) ) / sum(node_memory_MemTotal_bytes) * 100 > 85
|
||||
for: 30s
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Server memory is almost full"
|
||||
description: "Docker host memory usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
|
||||
|
||||
- alert: HostPhysicalComponentTooHot
|
||||
expr: node_hwmon_temp_celsius > 75
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host physical component too hot (instance {{ $labels.instance }})
|
||||
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostSwapIsFillingUp
|
||||
expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host swap is filling up (instance {{ $labels.instance }})
|
||||
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: high_storage_load
|
||||
expr: (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"}) / node_filesystem_size_bytes{fstype="aufs"} * 100 > 85
|
||||
for: 30s
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Server storage is almost full"
|
||||
description: "Docker host storage usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
|
||||
|
||||
- alert: HostOutOfMemory
|
||||
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host out of memory (instance {{ $labels.instance }})
|
||||
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualNetworkThroughputIn
|
||||
expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual network throughput in (instance {{ $labels.instance }})
|
||||
description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualNetworkThroughputOut
|
||||
expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual network throughput out (instance {{ $labels.instance }})
|
||||
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostOutOfDiskSpace
|
||||
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host out of disk space (instance {{ $labels.instance }})
|
||||
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
## FOR RAID ##########################################################
|
||||
|
||||
- alert: HostRaidArrayGotInactive
|
||||
expr: node_md_state{state="inactive"} > 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Host RAID array got inactive (instance {{ $labels.instance }})
|
||||
description: "RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostRaidDiskFailure
|
||||
expr: node_md_disks{state="failed"} > 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host RAID disk failure (instance {{ $labels.instance }})
|
||||
description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
|
||||
## FOR CONTAINERS #####################################################
|
||||
|
||||
- name: containers
|
||||
rules:
|
||||
- alert: nextcloud_down
|
||||
expr: absent(container_memory_usage_bytes{name="jenkins"})
|
||||
for: 30s
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Nextcloud down"
|
||||
description: "Nextcloud container is down for more than 30 seconds."
|
||||
|
||||
- alert: ContainerCpuUsage
|
||||
expr: (sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 80
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Container CPU usage (instance {{ $labels.instance }})
|
||||
description: "Container CPU usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ContainerMemoryUsage
|
||||
expr: (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Container Memory usage (instance {{ $labels.instance }})
|
||||
description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
|
||||
## FOR NGINX ##########################################################
|
||||
|
||||
- name: nginx
|
||||
rules:
|
||||
- alert: NginxHighHttp4xxErrorRate
|
||||
expr: sum(rate(nginx_http_requests_total{status=~"^4.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Nginx high HTTP 4xx error rate (instance {{ $labels.instance }})
|
||||
description: "Too many HTTP requests with status 4xx (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NginxHighHttp5xxErrorRate
|
||||
expr: sum(rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Nginx high HTTP 5xx error rate (instance {{ $labels.instance }})
|
||||
description: "Too many HTTP requests with status 5xx (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
@ -1,39 +0,0 @@
|
||||
global:
|
||||
resolve_timeout: 5m
|
||||
|
||||
route:
|
||||
group_by: ['alertname']
|
||||
group_wait: 10s
|
||||
group_interval: 5m
|
||||
repeat_interval: 10m
|
||||
receiver: 'email'
|
||||
|
||||
receivers:
|
||||
- name: 'email'
|
||||
email_configs:
|
||||
- to: 'mail1@mail.com, mail2@mail.com'
|
||||
from: ''
|
||||
smarthost: ''
|
||||
auth_username: ''
|
||||
auth_identity: ''
|
||||
auth_password: ''
|
||||
require_tls: yes
|
||||
send_resolved: true
|
||||
|
||||
# mute_time_intervals:
|
||||
# - name: out-of-business-hours
|
||||
# time_intervals:
|
||||
# - weekdays: ['Saturday','Sunday']
|
||||
# - times:
|
||||
# - start_time: '00:00'
|
||||
# end_time: '08:00'
|
||||
# - start_time: '18:00'
|
||||
# end_time: '24:00'
|
||||
|
||||
inhibit_rules:
|
||||
- source_match:
|
||||
severity: 'critical'
|
||||
target_match:
|
||||
severity: 'warning'
|
||||
equal: ['alertname', 'dev', 'instance']
|
||||
|
||||
@ -1,11 +0,0 @@
|
||||
route:
|
||||
receiver: 'slack'
|
||||
|
||||
receivers:
|
||||
- name: 'slack'
|
||||
slack_configs:
|
||||
- send_resolved: true
|
||||
text: "{{ .CommonAnnotations.description }}"
|
||||
username: 'Prometheus'
|
||||
channel: '#prometheus'
|
||||
api_url: 'https://hooks.slack.com/services/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'
|
||||
@ -1,37 +0,0 @@
|
||||
# Whether to notify about resolved alerts.
|
||||
[ send_resolved: <boolean> | default = false ]
|
||||
|
||||
# The email address to send notifications to.
|
||||
to: <tmpl_string>
|
||||
|
||||
# The sender's address.
|
||||
[ from: <tmpl_string> | default = global.smtp_from ]
|
||||
|
||||
# The SMTP host through which emails are sent.
|
||||
[ smarthost: <string> | default = global.smtp_smarthost ]
|
||||
|
||||
# The hostname to identify to the SMTP server.
|
||||
[ hello: <string> | default = global.smtp_hello ]
|
||||
|
||||
# SMTP authentication information.
|
||||
[ auth_username: <string> | default = global.smtp_auth_username ]
|
||||
[ auth_password: <secret> | default = global.smtp_auth_password ]
|
||||
[ auth_secret: <secret> | default = global.smtp_auth_secret ]
|
||||
[ auth_identity: <string> | default = global.smtp_auth_identity ]
|
||||
|
||||
# The SMTP TLS requirement.
|
||||
# Note that Go does not support unencrypted connections to remote SMTP endpoints.
|
||||
[ require_tls: <bool> | default = global.smtp_require_tls ]
|
||||
|
||||
# TLS configuration.
|
||||
tls_config:
|
||||
[ <tls_config> ]
|
||||
|
||||
# The HTML body of the email notification.
|
||||
[ html: <tmpl_string> | default = '{{ template "email.default.html" . }}' ]
|
||||
# The text body of the email notification.
|
||||
[ text: <tmpl_string> ]
|
||||
|
||||
# Further headers email header key/value pairs. Overrides any headers
|
||||
# previously set by the notification implementation.
|
||||
[ headers: { <string>: <tmpl_string>, ... } ]
|
||||
22
compose.yml
Normal file
22
compose.yml
Normal file
@ -0,0 +1,22 @@
|
||||
services:
|
||||
|
||||
# HOST METRICS
|
||||
nodeexporter:
|
||||
image: prom/node-exporter:v1.8.2
|
||||
container_name: nodeexporter
|
||||
volumes:
|
||||
- /proc:/host/proc:ro
|
||||
- /sys:/host/sys:ro
|
||||
- /:/rootfs:ro
|
||||
- ./config:/etc/prometheus
|
||||
command:
|
||||
- '--path.procfs=/host/proc'
|
||||
- '--path.rootfs=/rootfs'
|
||||
- '--path.sysfs=/host/sys'
|
||||
- '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)'
|
||||
- '--web.config.file=/etc/prometheus/web.yml'
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- 9100:9100
|
||||
labels:
|
||||
org.label-schema.group: "monitoring"
|
||||
5
config/web.yml
Normal file
5
config/web.yml
Normal file
@ -0,0 +1,5 @@
|
||||
tls_server_config:
|
||||
cert_file: /etc/prometheus/gn-prod.crt
|
||||
key_file: /etc/prometheus/gn-prod.key
|
||||
basic_auth_users:
|
||||
prometheus: <hashed-password>
|
||||
@ -1,121 +0,0 @@
|
||||
version: '2.1'
|
||||
|
||||
|
||||
networks:
|
||||
monitor-net:
|
||||
driver: bridge
|
||||
|
||||
volumes:
|
||||
prometheus_data: {}
|
||||
grafana_data: {}
|
||||
|
||||
services:
|
||||
|
||||
# METRICS GATHERER
|
||||
prometheus:
|
||||
image: prom/prometheus:v2.17.1
|
||||
container_name: prometheus
|
||||
volumes:
|
||||
- ./prometheus:/etc/prometheus
|
||||
- prometheus_data:/prometheus
|
||||
command:
|
||||
- '--config.file=/etc/prometheus/prometheus.yml'
|
||||
- '--storage.tsdb.path=/prometheus'
|
||||
- '--web.console.libraries=/etc/prometheus/console_libraries'
|
||||
- '--web.console.templates=/etc/prometheus/consoles'
|
||||
- '--storage.tsdb.retention.time=200h'
|
||||
- '--web.enable-lifecycle'
|
||||
restart: unless-stopped
|
||||
expose:
|
||||
- 9090
|
||||
networks:
|
||||
- monitor-net
|
||||
labels:
|
||||
org.label-schema.group: "monitoring"
|
||||
|
||||
# FOR ALERTS
|
||||
alertmanager:
|
||||
image: prom/alertmanager:v0.20.0
|
||||
container_name: alertmanager
|
||||
volumes:
|
||||
- ./alertmanager:/etc/alertmanager
|
||||
command:
|
||||
#- '--config.file=/etc/alertmanager/config.yml'
|
||||
- '--config.file=/etc/alertmanager/alertmanager.yml'
|
||||
- '--storage.path=/alertmanager'
|
||||
restart: unless-stopped
|
||||
expose:
|
||||
- 9093
|
||||
networks:
|
||||
- monitor-net
|
||||
labels:
|
||||
org.label-schema.group: "monitoring"
|
||||
|
||||
# FOR HOST METRICS
|
||||
nodeexporter:
|
||||
image: prom/node-exporter:v0.18.1
|
||||
container_name: nodeexporter
|
||||
volumes:
|
||||
- /proc:/host/proc:ro
|
||||
- /sys:/host/sys:ro
|
||||
- /:/rootfs:ro
|
||||
command:
|
||||
- '--path.procfs=/host/proc'
|
||||
- '--path.rootfs=/rootfs'
|
||||
- '--path.sysfs=/host/sys'
|
||||
- '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)'
|
||||
restart: unless-stopped
|
||||
expose:
|
||||
- 9100
|
||||
networks:
|
||||
- monitor-net
|
||||
labels:
|
||||
org.label-schema.group: "monitoring"
|
||||
|
||||
# FOR DOCKER CONTAINERS
|
||||
cadvisor:
|
||||
image: gcr.io/cadvisor/cadvisor
|
||||
container_name: cadvisor
|
||||
volumes:
|
||||
- /:/rootfs:ro
|
||||
- /var/run:/var/run:rw
|
||||
- /sys:/sys:ro
|
||||
- /var/lib/docker:/var/lib/docker:ro
|
||||
- /cgroup:/cgroup:ro #doesn't work on MacOS only for Linux
|
||||
restart: unless-stopped
|
||||
expose:
|
||||
- 8080
|
||||
networks:
|
||||
- monitor-net
|
||||
labels:
|
||||
org.label-schema.group: "monitoring"
|
||||
|
||||
# POUR AFFICHAGE DASHBOARD
|
||||
grafana:
|
||||
image: grafana/grafana:6.7.2
|
||||
container_name: grafana
|
||||
volumes:
|
||||
- grafana_data:/var/lib/grafana
|
||||
- ./grafana/provisioning:/etc/grafana/provisioning
|
||||
environment:
|
||||
- GF_SECURITY_ADMIN_USER=${ADMIN_USER}
|
||||
- GF_SECURITY_ADMIN_PASSWORD=${ADMIN_PASSWORD}
|
||||
- GF_USERS_ALLOW_SIGN_UP=false
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- 3000;3000
|
||||
networks:
|
||||
- monitor-net
|
||||
labels:
|
||||
org.label-schema.group: "monitoring"
|
||||
|
||||
pushgateway:
|
||||
image: prom/pushgateway:v1.2.0
|
||||
container_name: pushgateway
|
||||
restart: unless-stopped
|
||||
expose:
|
||||
- 9091
|
||||
networks:
|
||||
- monitor-net
|
||||
labels:
|
||||
org.label-schema.group: "monitoring"
|
||||
@ -1,34 +0,0 @@
|
||||
upstream nodeexporter {
|
||||
server 127.0.0.1:9100;
|
||||
}
|
||||
|
||||
server {
|
||||
listen 80;
|
||||
listen [::]:80;
|
||||
server_name monitoring.mondomaine.tld;
|
||||
|
||||
location / {
|
||||
proxy_pass http://nodeexporter;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
server {
|
||||
listen 443 ssl;
|
||||
listen [::]:443 ssl;
|
||||
server_name monitoring.mondomaine.tld;
|
||||
|
||||
error_log /var/log/nginx/monitoring.mondomaine.tld-proxy-error.log;
|
||||
access_log /var/log/nginx/monitoring.mondomaine.tld-proxy-access.log;
|
||||
|
||||
ssl_certificate /etc/letsencrypt/live/monitoring.mondomaine.tld/fullchain.pem;
|
||||
ssl_certificate_key /etc/letsencrypt/live/monitoring.mondomaine.tld/privkey.pem;
|
||||
|
||||
ssl_protocols TLSv1 TLSv1.1 TLSv1.2;
|
||||
ssl_ciphers HIGH:!aNULL:!MD5;
|
||||
|
||||
location / {
|
||||
proxy_pass http://nodeexporter;
|
||||
}
|
||||
|
||||
}
|
||||
BIN
docs/prom.png
BIN
docs/prom.png
Binary file not shown.
|
Before Width: | Height: | Size: 286 KiB |
File diff suppressed because it is too large
Load Diff
@ -1,12 +0,0 @@
|
||||
apiVersion: 1
|
||||
|
||||
providers:
|
||||
- name: 'Prometheus'
|
||||
orgId: 1
|
||||
folder: ''
|
||||
type: file
|
||||
disableDeletion: false
|
||||
editable: true
|
||||
allowUiUpdates: true
|
||||
options:
|
||||
path: /etc/grafana/provisioning/dashboards
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,398 +0,0 @@
|
||||
{
|
||||
"id": null,
|
||||
"title": "Nginx",
|
||||
"description": "Nginx exporter metrics",
|
||||
"tags": [
|
||||
"nginx"
|
||||
],
|
||||
"style": "dark",
|
||||
"timezone": "browser",
|
||||
"editable": true,
|
||||
"hideControls": false,
|
||||
"sharedCrosshair": true,
|
||||
"rows": [
|
||||
{
|
||||
"collapse": false,
|
||||
"editable": true,
|
||||
"height": "250px",
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"datasource": "Prometheus",
|
||||
"decimals": 2,
|
||||
"editable": true,
|
||||
"error": false,
|
||||
"fill": 1,
|
||||
"grid": {
|
||||
"threshold1": null,
|
||||
"threshold1Color": "rgba(216, 200, 27, 0.27)",
|
||||
"threshold2": null,
|
||||
"threshold2Color": "rgba(234, 112, 112, 0.22)"
|
||||
},
|
||||
"id": 3,
|
||||
"isNew": true,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": true,
|
||||
"current": true,
|
||||
"max": true,
|
||||
"min": true,
|
||||
"rightSide": true,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 2,
|
||||
"links": [],
|
||||
"nullPointMode": "connected",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"span": 12,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(irate(nginx_connections_processed_total{stage=\"any\"}[5m])) by (stage)",
|
||||
"hide": false,
|
||||
"interval": "",
|
||||
"intervalFactor": 10,
|
||||
"legendFormat": "requests",
|
||||
"metric": "",
|
||||
"refId": "B",
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Requests/sec",
|
||||
"tooltip": {
|
||||
"msResolution": false,
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "cumulative"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"show": true
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"datasource": "Prometheus",
|
||||
"decimals": 2,
|
||||
"editable": true,
|
||||
"error": false,
|
||||
"fill": 1,
|
||||
"grid": {
|
||||
"threshold1": null,
|
||||
"threshold1Color": "rgba(216, 200, 27, 0.27)",
|
||||
"threshold2": null,
|
||||
"threshold2Color": "rgba(234, 112, 112, 0.22)"
|
||||
},
|
||||
"id": 2,
|
||||
"isNew": true,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": true,
|
||||
"current": true,
|
||||
"max": true,
|
||||
"min": true,
|
||||
"rightSide": true,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 2,
|
||||
"links": [],
|
||||
"nullPointMode": "connected",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"span": 12,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(nginx_connections_current) by (state)",
|
||||
"interval": "",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{state}}",
|
||||
"metric": "",
|
||||
"refId": "A",
|
||||
"step": 2
|
||||
}
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Connections",
|
||||
"tooltip": {
|
||||
"msResolution": false,
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "cumulative"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"show": true
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"datasource": "Prometheus",
|
||||
"decimals": 2,
|
||||
"editable": true,
|
||||
"error": false,
|
||||
"fill": 1,
|
||||
"grid": {
|
||||
"threshold1": null,
|
||||
"threshold1Color": "rgba(216, 200, 27, 0.27)",
|
||||
"threshold2": null,
|
||||
"threshold2Color": "rgba(234, 112, 112, 0.22)"
|
||||
},
|
||||
"id": 1,
|
||||
"isNew": true,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": true,
|
||||
"current": true,
|
||||
"max": true,
|
||||
"min": true,
|
||||
"rightSide": true,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 2,
|
||||
"links": [],
|
||||
"nullPointMode": "connected",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"span": 12,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(irate(nginx_connections_processed_total{stage!=\"any\"}[5m])) by (stage)",
|
||||
"hide": false,
|
||||
"interval": "",
|
||||
"intervalFactor": 10,
|
||||
"legendFormat": "{{stage}}",
|
||||
"metric": "",
|
||||
"refId": "B",
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Connections rate",
|
||||
"tooltip": {
|
||||
"msResolution": false,
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "cumulative"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"show": true
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"title": "Nginx exporter metrics"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"editable": true,
|
||||
"height": "250px",
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"datasource": null,
|
||||
"editable": true,
|
||||
"error": false,
|
||||
"fill": 1,
|
||||
"grid": {
|
||||
"threshold1": null,
|
||||
"threshold1Color": "rgba(216, 200, 27, 0.27)",
|
||||
"threshold2": null,
|
||||
"threshold2Color": "rgba(234, 112, 112, 0.22)"
|
||||
},
|
||||
"id": 4,
|
||||
"isNew": true,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": true,
|
||||
"current": true,
|
||||
"max": true,
|
||||
"min": true,
|
||||
"rightSide": true,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 2,
|
||||
"links": [],
|
||||
"nullPointMode": "connected",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"span": 12,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(container_cpu_usage_seconds_total{name=~\"nginx\"}[5m])) / count(node_cpu_seconds_total{mode=\"system\"}) * 100",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "nginx",
|
||||
"refId": "A",
|
||||
"step": 2
|
||||
}
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "CPU usage",
|
||||
"tooltip": {
|
||||
"msResolution": false,
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "cumulative"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"show": true
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"title": "Nginx container metrics"
|
||||
}
|
||||
],
|
||||
"time": {
|
||||
"from": "now-15m",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {
|
||||
"refresh_intervals": [
|
||||
"5s",
|
||||
"10s",
|
||||
"30s",
|
||||
"1m",
|
||||
"5m",
|
||||
"15m",
|
||||
"30m",
|
||||
"1h",
|
||||
"2h",
|
||||
"1d"
|
||||
],
|
||||
"time_options": [
|
||||
"5m",
|
||||
"15m",
|
||||
"1h",
|
||||
"6h",
|
||||
"12h",
|
||||
"24h",
|
||||
"2d",
|
||||
"7d",
|
||||
"30d"
|
||||
]
|
||||
},
|
||||
"templating": {
|
||||
"list": []
|
||||
},
|
||||
"annotations": {
|
||||
"list": []
|
||||
},
|
||||
"refresh": "10s",
|
||||
"schemaVersion": 12,
|
||||
"version": 9,
|
||||
"links": [],
|
||||
"gnetId": null
|
||||
}
|
||||
@ -1,11 +0,0 @@
|
||||
apiVersion: 1
|
||||
|
||||
datasources:
|
||||
- name: Prometheus
|
||||
type: prometheus
|
||||
access: proxy
|
||||
orgId: 1
|
||||
url: http://prometheus:9090
|
||||
basicAuth: false
|
||||
isDefault: true
|
||||
editable: true
|
||||
@ -1,70 +0,0 @@
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 15s
|
||||
|
||||
# Attach these labels to any time series or alerts when communicating with
|
||||
# external systems (federation, remote storage, Alertmanager).
|
||||
external_labels:
|
||||
monitor: 'docker-host-alpha'
|
||||
|
||||
# Load and evaluate rules in this file every 'evaluation_interval' seconds.
|
||||
rule_files:
|
||||
- "alert.rules"
|
||||
|
||||
# A scrape configuration containing exactly one endpoint to scrape.
|
||||
|
||||
# LOCAL SERVER
|
||||
scrape_configs:
|
||||
- job_name: 'nodeexporter'
|
||||
scrape_interval: 5s
|
||||
static_configs:
|
||||
- targets: ['nodeexporter:9100']
|
||||
|
||||
- job_name: 'cadvisor'
|
||||
scrape_interval: 5s
|
||||
static_configs:
|
||||
- targets: ['cadvisor:8080']
|
||||
|
||||
- job_name: 'prometheus'
|
||||
scrape_interval: 10s
|
||||
static_configs:
|
||||
- targets: ['localhost:9090']
|
||||
|
||||
- job_name: 'pushgateway'
|
||||
scrape_interval: 10s
|
||||
honor_labels: true
|
||||
static_configs:
|
||||
- targets: ['pushgateway:9091']
|
||||
|
||||
# DISTANT SERVER (WITH NODE EXPORTER)
|
||||
# - job_name: 'serveur-distant'
|
||||
# scheme: https
|
||||
# basic_auth:
|
||||
# username: 'prometheus'
|
||||
# password: 'htpassword-non-crypté'
|
||||
# tls_config:
|
||||
# ca_file: certif.crt
|
||||
# insecure_skip_verify: true
|
||||
# scrape_interval: 10s
|
||||
# honor_labels: true
|
||||
# static_configs:
|
||||
# - targets: ['monitoring.mondomaine.tld:9100']
|
||||
# labels:
|
||||
# instance: serveur-distant
|
||||
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- scheme: http
|
||||
static_configs:
|
||||
- targets:
|
||||
- 'alertmanager:9093'
|
||||
|
||||
# - job_name: 'nginx'
|
||||
# scrape_interval: 10s
|
||||
# static_configs:
|
||||
# - targets: ['nginxexporter:9113']
|
||||
|
||||
# - job_name: 'aspnetcore'
|
||||
# scrape_interval: 10s
|
||||
# static_configs:
|
||||
# - targets: ['eventlog-proxy:5000', 'eventlog:5000']
|
||||
Loading…
x
Reference in New Issue
Block a user