Compare commits

...

7 Commits

20 changed files with 34 additions and 8848 deletions

2
.env
View File

@ -1,2 +0,0 @@
ADMIN_USER=admin
ADMIN_PASSWORD=password

View File

@ -1,40 +1,18 @@
# PROMETHEUS & GRAFANA
Ce projet vise à monitorer des serveurs via prometheus pour centraliser le scraping, node exporter pour les host metrics, cadvisor pour les metrics des conteneurs docker et grafana pour afficher les metrics sous forme de dashboards
## NODE EXPORTER TLS
![PROM](docs/prom.png)
## CONFIGURATION
- Configuration de la boîte mail pour les alertes:
```bash
nano alertmanager/alertmanager.yml
```
- Configuration des alertes:
```bash
nano alertmanager/alert.rules
```
- Configuration des alertes:
```bash
nano alertmanager/alert.rules
```
> Grafana est accessible via l'adresse: http://<IP-SERVER>:3000
### NODE EXPORTER TLS
- Créer un enregistrement DNS pointant vers votre serveur
- Créer une configuration serveur pointant le DNS vers 127.0.0.1:9100 (port exposé par node exporter) [exemple pour nginx](docs/nginx-config)
Cette branche à pour but de déployer un conteneur node exporter afin d'exposer les metrics du serveur (afin que prometheus puisse les récupérer).
Les metrics transférées sont cryptées via TLS.
#### NODE EXPORTER HOST
- Create certs:
```bash
openssl req -new -newkey rsa:4096 -days 365 -nodes -x509 -keyout gn-prod.key -out gn-prod.crt -subj "/C=FR/ST=PARIS/L=GarageNum/O=prom/CN=legaragenumerique.fr" -addext "subjectAltName = DNS:gnprod"
openssl req -new -newkey rsa:4096 -days 3650 -nodes -x509 -keyout gn-prod.key -out gn-prod.crt -subj "/C=FR/ST=PARIS/L=GarageNum/O=prom/CN=legaragenumerique.fr" -addext "subjectAltName = DNS:gnprod"
```
> remplacer Les infos (C=FR, ST=PARIS, L=GarageNum, ...)
- Create password:
@ -54,7 +32,7 @@ basic_auth_users:
- Copy certs to prometheus host
## PROMETHEUS HOST
#### PROMETHEUS HOST
- Config prometheus.yml (/etc/prometheus/prometheus.yml):
@ -74,13 +52,4 @@ scrape_configs:
instance: friendly-instance-name
```
## TO DO
- [x] node exporter
- [x] node exporter -> prometheus via https
- [ ] dashboard for Grafana amd64:
- [x] host metrics
- [ ] cadvisor for docker
> le scraping des metrics du serveur distant se font désormais via TLS

View File

@ -1,172 +0,0 @@
groups:
- name: targets
rules:
- alert: monitor_service_down
expr: up == 0
for: 30s
labels:
severity: critical
annotations:
summary: "Monitor service non-operational"
description: "Service {{ $labels.instance }} is down."
## FOR HOST ##################################################################
- name: host
rules:
- alert: HostHighCpuLoad
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
for: 0m
labels:
severity: warning
annotations:
summary: Host high CPU load (instance {{ $labels.instance }})
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: high_memory_load
expr: (sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) ) / sum(node_memory_MemTotal_bytes) * 100 > 85
for: 30s
labels:
severity: warning
annotations:
summary: "Server memory is almost full"
description: "Docker host memory usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
- alert: HostPhysicalComponentTooHot
expr: node_hwmon_temp_celsius > 75
for: 5m
labels:
severity: warning
annotations:
summary: Host physical component too hot (instance {{ $labels.instance }})
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSwapIsFillingUp
expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
for: 2m
labels:
severity: warning
annotations:
summary: Host swap is filling up (instance {{ $labels.instance }})
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: high_storage_load
expr: (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"}) / node_filesystem_size_bytes{fstype="aufs"} * 100 > 85
for: 30s
labels:
severity: warning
annotations:
summary: "Server storage is almost full"
description: "Docker host storage usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
- alert: HostOutOfMemory
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
for: 2m
labels:
severity: warning
annotations:
summary: Host out of memory (instance {{ $labels.instance }})
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputIn
expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual network throughput in (instance {{ $labels.instance }})
description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputOut
expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual network throughput out (instance {{ $labels.instance }})
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfDiskSpace
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
for: 2m
labels:
severity: warning
annotations:
summary: Host out of disk space (instance {{ $labels.instance }})
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
## FOR RAID ##########################################################
- alert: HostRaidArrayGotInactive
expr: node_md_state{state="inactive"} > 0
for: 0m
labels:
severity: critical
annotations:
summary: Host RAID array got inactive (instance {{ $labels.instance }})
description: "RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostRaidDiskFailure
expr: node_md_disks{state="failed"} > 0
for: 2m
labels:
severity: warning
annotations:
summary: Host RAID disk failure (instance {{ $labels.instance }})
description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
## FOR CONTAINERS #####################################################
- name: containers
rules:
- alert: nextcloud_down
expr: absent(container_memory_usage_bytes{name="jenkins"})
for: 30s
labels:
severity: critical
annotations:
summary: "Nextcloud down"
description: "Nextcloud container is down for more than 30 seconds."
- alert: ContainerCpuUsage
expr: (sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 80
for: 2m
labels:
severity: warning
annotations:
summary: Container CPU usage (instance {{ $labels.instance }})
description: "Container CPU usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ContainerMemoryUsage
expr: (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80
for: 2m
labels:
severity: warning
annotations:
summary: Container Memory usage (instance {{ $labels.instance }})
description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
## FOR NGINX ##########################################################
- name: nginx
rules:
- alert: NginxHighHttp4xxErrorRate
expr: sum(rate(nginx_http_requests_total{status=~"^4.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5
for: 1m
labels:
severity: critical
annotations:
summary: Nginx high HTTP 4xx error rate (instance {{ $labels.instance }})
description: "Too many HTTP requests with status 4xx (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NginxHighHttp5xxErrorRate
expr: sum(rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5
for: 1m
labels:
severity: critical
annotations:
summary: Nginx high HTTP 5xx error rate (instance {{ $labels.instance }})
description: "Too many HTTP requests with status 5xx (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View File

@ -1,172 +0,0 @@
groups:
- name: targets
rules:
- alert: monitor_service_down
expr: up == 0
for: 30s
labels:
severity: critical
annotations:
summary: "Monitor service non-operational"
description: "Service {{ $labels.instance }} is down."
## FOR HOST ##################################################################
- name: host
rules:
- alert: HostHighCpuLoad
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
for: 0m
labels:
severity: warning
annotations:
summary: Host high CPU load (instance {{ $labels.instance }})
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: high_memory_load
expr: (sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) ) / sum(node_memory_MemTotal_bytes) * 100 > 85
for: 30s
labels:
severity: warning
annotations:
summary: "Server memory is almost full"
description: "Docker host memory usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
- alert: HostPhysicalComponentTooHot
expr: node_hwmon_temp_celsius > 75
for: 5m
labels:
severity: warning
annotations:
summary: Host physical component too hot (instance {{ $labels.instance }})
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSwapIsFillingUp
expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
for: 2m
labels:
severity: warning
annotations:
summary: Host swap is filling up (instance {{ $labels.instance }})
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: high_storage_load
expr: (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"}) / node_filesystem_size_bytes{fstype="aufs"} * 100 > 85
for: 30s
labels:
severity: warning
annotations:
summary: "Server storage is almost full"
description: "Docker host storage usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
- alert: HostOutOfMemory
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
for: 2m
labels:
severity: warning
annotations:
summary: Host out of memory (instance {{ $labels.instance }})
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputIn
expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual network throughput in (instance {{ $labels.instance }})
description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputOut
expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual network throughput out (instance {{ $labels.instance }})
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfDiskSpace
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
for: 2m
labels:
severity: warning
annotations:
summary: Host out of disk space (instance {{ $labels.instance }})
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
## FOR RAID ##########################################################
- alert: HostRaidArrayGotInactive
expr: node_md_state{state="inactive"} > 0
for: 0m
labels:
severity: critical
annotations:
summary: Host RAID array got inactive (instance {{ $labels.instance }})
description: "RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostRaidDiskFailure
expr: node_md_disks{state="failed"} > 0
for: 2m
labels:
severity: warning
annotations:
summary: Host RAID disk failure (instance {{ $labels.instance }})
description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
## FOR CONTAINERS #####################################################
- name: containers
rules:
- alert: nextcloud_down
expr: absent(container_memory_usage_bytes{name="jenkins"})
for: 30s
labels:
severity: critical
annotations:
summary: "Nextcloud down"
description: "Nextcloud container is down for more than 30 seconds."
- alert: ContainerCpuUsage
expr: (sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 80
for: 2m
labels:
severity: warning
annotations:
summary: Container CPU usage (instance {{ $labels.instance }})
description: "Container CPU usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ContainerMemoryUsage
expr: (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80
for: 2m
labels:
severity: warning
annotations:
summary: Container Memory usage (instance {{ $labels.instance }})
description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
## FOR NGINX ##########################################################
- name: nginx
rules:
- alert: NginxHighHttp4xxErrorRate
expr: sum(rate(nginx_http_requests_total{status=~"^4.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5
for: 1m
labels:
severity: critical
annotations:
summary: Nginx high HTTP 4xx error rate (instance {{ $labels.instance }})
description: "Too many HTTP requests with status 4xx (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NginxHighHttp5xxErrorRate
expr: sum(rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5
for: 1m
labels:
severity: critical
annotations:
summary: Nginx high HTTP 5xx error rate (instance {{ $labels.instance }})
description: "Too many HTTP requests with status 5xx (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View File

@ -1,39 +0,0 @@
global:
resolve_timeout: 5m
route:
group_by: ['alertname']
group_wait: 10s
group_interval: 5m
repeat_interval: 10m
receiver: 'email'
receivers:
- name: 'email'
email_configs:
- to: 'mail1@mail.com, mail2@mail.com'
from: ''
smarthost: ''
auth_username: ''
auth_identity: ''
auth_password: ''
require_tls: yes
send_resolved: true
# mute_time_intervals:
# - name: out-of-business-hours
# time_intervals:
# - weekdays: ['Saturday','Sunday']
# - times:
# - start_time: '00:00'
# end_time: '08:00'
# - start_time: '18:00'
# end_time: '24:00'
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'dev', 'instance']

View File

@ -1,11 +0,0 @@
route:
receiver: 'slack'
receivers:
- name: 'slack'
slack_configs:
- send_resolved: true
text: "{{ .CommonAnnotations.description }}"
username: 'Prometheus'
channel: '#prometheus'
api_url: 'https://hooks.slack.com/services/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'

View File

@ -1,37 +0,0 @@
# Whether to notify about resolved alerts.
[ send_resolved: <boolean> | default = false ]
# The email address to send notifications to.
to: <tmpl_string>
# The sender's address.
[ from: <tmpl_string> | default = global.smtp_from ]
# The SMTP host through which emails are sent.
[ smarthost: <string> | default = global.smtp_smarthost ]
# The hostname to identify to the SMTP server.
[ hello: <string> | default = global.smtp_hello ]
# SMTP authentication information.
[ auth_username: <string> | default = global.smtp_auth_username ]
[ auth_password: <secret> | default = global.smtp_auth_password ]
[ auth_secret: <secret> | default = global.smtp_auth_secret ]
[ auth_identity: <string> | default = global.smtp_auth_identity ]
# The SMTP TLS requirement.
# Note that Go does not support unencrypted connections to remote SMTP endpoints.
[ require_tls: <bool> | default = global.smtp_require_tls ]
# TLS configuration.
tls_config:
[ <tls_config> ]
# The HTML body of the email notification.
[ html: <tmpl_string> | default = '{{ template "email.default.html" . }}' ]
# The text body of the email notification.
[ text: <tmpl_string> ]
# Further headers email header key/value pairs. Overrides any headers
# previously set by the notification implementation.
[ headers: { <string>: <tmpl_string>, ... } ]

22
compose.yml Normal file
View File

@ -0,0 +1,22 @@
services:
# HOST METRICS
nodeexporter:
image: prom/node-exporter:v1.8.2
container_name: nodeexporter
volumes:
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
- ./config:/etc/prometheus
command:
- '--path.procfs=/host/proc'
- '--path.rootfs=/rootfs'
- '--path.sysfs=/host/sys'
- '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)'
- '--web.config.file=/etc/prometheus/web.yml'
restart: unless-stopped
ports:
- 9100:9100
labels:
org.label-schema.group: "monitoring"

5
config/web.yml Normal file
View File

@ -0,0 +1,5 @@
tls_server_config:
cert_file: /etc/prometheus/gn-prod.crt
key_file: /etc/prometheus/gn-prod.key
basic_auth_users:
prometheus: <hashed-password>

View File

@ -1,121 +0,0 @@
version: '2.1'
networks:
monitor-net:
driver: bridge
volumes:
prometheus_data: {}
grafana_data: {}
services:
# METRICS GATHERER
prometheus:
image: prom/prometheus:v2.17.1
container_name: prometheus
volumes:
- ./prometheus:/etc/prometheus
- prometheus_data:/prometheus
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--web.console.libraries=/etc/prometheus/console_libraries'
- '--web.console.templates=/etc/prometheus/consoles'
- '--storage.tsdb.retention.time=200h'
- '--web.enable-lifecycle'
restart: unless-stopped
expose:
- 9090
networks:
- monitor-net
labels:
org.label-schema.group: "monitoring"
# FOR ALERTS
alertmanager:
image: prom/alertmanager:v0.20.0
container_name: alertmanager
volumes:
- ./alertmanager:/etc/alertmanager
command:
#- '--config.file=/etc/alertmanager/config.yml'
- '--config.file=/etc/alertmanager/alertmanager.yml'
- '--storage.path=/alertmanager'
restart: unless-stopped
expose:
- 9093
networks:
- monitor-net
labels:
org.label-schema.group: "monitoring"
# FOR HOST METRICS
nodeexporter:
image: prom/node-exporter:v0.18.1
container_name: nodeexporter
volumes:
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
command:
- '--path.procfs=/host/proc'
- '--path.rootfs=/rootfs'
- '--path.sysfs=/host/sys'
- '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)'
restart: unless-stopped
expose:
- 9100
networks:
- monitor-net
labels:
org.label-schema.group: "monitoring"
# FOR DOCKER CONTAINERS
cadvisor:
image: gcr.io/cadvisor/cadvisor
container_name: cadvisor
volumes:
- /:/rootfs:ro
- /var/run:/var/run:rw
- /sys:/sys:ro
- /var/lib/docker:/var/lib/docker:ro
- /cgroup:/cgroup:ro #doesn't work on MacOS only for Linux
restart: unless-stopped
expose:
- 8080
networks:
- monitor-net
labels:
org.label-schema.group: "monitoring"
# POUR AFFICHAGE DASHBOARD
grafana:
image: grafana/grafana:6.7.2
container_name: grafana
volumes:
- grafana_data:/var/lib/grafana
- ./grafana/provisioning:/etc/grafana/provisioning
environment:
- GF_SECURITY_ADMIN_USER=${ADMIN_USER}
- GF_SECURITY_ADMIN_PASSWORD=${ADMIN_PASSWORD}
- GF_USERS_ALLOW_SIGN_UP=false
restart: unless-stopped
ports:
- 3000;3000
networks:
- monitor-net
labels:
org.label-schema.group: "monitoring"
pushgateway:
image: prom/pushgateway:v1.2.0
container_name: pushgateway
restart: unless-stopped
expose:
- 9091
networks:
- monitor-net
labels:
org.label-schema.group: "monitoring"

View File

@ -1,34 +0,0 @@
upstream nodeexporter {
server 127.0.0.1:9100;
}
server {
listen 80;
listen [::]:80;
server_name monitoring.mondomaine.tld;
location / {
proxy_pass http://nodeexporter;
}
}
server {
listen 443 ssl;
listen [::]:443 ssl;
server_name monitoring.mondomaine.tld;
error_log /var/log/nginx/monitoring.mondomaine.tld-proxy-error.log;
access_log /var/log/nginx/monitoring.mondomaine.tld-proxy-access.log;
ssl_certificate /etc/letsencrypt/live/monitoring.mondomaine.tld/fullchain.pem;
ssl_certificate_key /etc/letsencrypt/live/monitoring.mondomaine.tld/privkey.pem;
ssl_protocols TLSv1 TLSv1.1 TLSv1.2;
ssl_ciphers HIGH:!aNULL:!MD5;
location / {
proxy_pass http://nodeexporter;
}
}

Binary file not shown.

Before

Width:  |  Height:  |  Size: 286 KiB

File diff suppressed because it is too large Load Diff

View File

@ -1,12 +0,0 @@
apiVersion: 1
providers:
- name: 'Prometheus'
orgId: 1
folder: ''
type: file
disableDeletion: false
editable: true
allowUiUpdates: true
options:
path: /etc/grafana/provisioning/dashboards

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,398 +0,0 @@
{
"id": null,
"title": "Nginx",
"description": "Nginx exporter metrics",
"tags": [
"nginx"
],
"style": "dark",
"timezone": "browser",
"editable": true,
"hideControls": false,
"sharedCrosshair": true,
"rows": [
{
"collapse": false,
"editable": true,
"height": "250px",
"panels": [
{
"aliasColors": {},
"bars": false,
"datasource": "Prometheus",
"decimals": 2,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 3,
"isNew": true,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": true,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 12,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(nginx_connections_processed_total{stage=\"any\"}[5m])) by (stage)",
"hide": false,
"interval": "",
"intervalFactor": 10,
"legendFormat": "requests",
"metric": "",
"refId": "B",
"step": 10
}
],
"timeFrom": null,
"timeShift": null,
"title": "Requests/sec",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"show": true
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"aliasColors": {},
"bars": false,
"datasource": "Prometheus",
"decimals": 2,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 2,
"isNew": true,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": true,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 12,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(nginx_connections_current) by (state)",
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{state}}",
"metric": "",
"refId": "A",
"step": 2
}
],
"timeFrom": null,
"timeShift": null,
"title": "Connections",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"show": true
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"aliasColors": {},
"bars": false,
"datasource": "Prometheus",
"decimals": 2,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 1,
"isNew": true,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": true,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 12,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(nginx_connections_processed_total{stage!=\"any\"}[5m])) by (stage)",
"hide": false,
"interval": "",
"intervalFactor": 10,
"legendFormat": "{{stage}}",
"metric": "",
"refId": "B",
"step": 10
}
],
"timeFrom": null,
"timeShift": null,
"title": "Connections rate",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"show": true
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
}
],
"title": "Nginx exporter metrics"
},
{
"collapse": false,
"editable": true,
"height": "250px",
"panels": [
{
"aliasColors": {},
"bars": false,
"datasource": null,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 4,
"isNew": true,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": true,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 12,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(container_cpu_usage_seconds_total{name=~\"nginx\"}[5m])) / count(node_cpu_seconds_total{mode=\"system\"}) * 100",
"intervalFactor": 2,
"legendFormat": "nginx",
"refId": "A",
"step": 2
}
],
"timeFrom": null,
"timeShift": null,
"title": "CPU usage",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"show": true
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
}
],
"title": "Nginx container metrics"
}
],
"time": {
"from": "now-15m",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"templating": {
"list": []
},
"annotations": {
"list": []
},
"refresh": "10s",
"schemaVersion": 12,
"version": 9,
"links": [],
"gnetId": null
}

View File

@ -1,11 +0,0 @@
apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
access: proxy
orgId: 1
url: http://prometheus:9090
basicAuth: false
isDefault: true
editable: true

View File

@ -1,70 +0,0 @@
global:
scrape_interval: 15s
evaluation_interval: 15s
# Attach these labels to any time series or alerts when communicating with
# external systems (federation, remote storage, Alertmanager).
external_labels:
monitor: 'docker-host-alpha'
# Load and evaluate rules in this file every 'evaluation_interval' seconds.
rule_files:
- "alert.rules"
# A scrape configuration containing exactly one endpoint to scrape.
# LOCAL SERVER
scrape_configs:
- job_name: 'nodeexporter'
scrape_interval: 5s
static_configs:
- targets: ['nodeexporter:9100']
- job_name: 'cadvisor'
scrape_interval: 5s
static_configs:
- targets: ['cadvisor:8080']
- job_name: 'prometheus'
scrape_interval: 10s
static_configs:
- targets: ['localhost:9090']
- job_name: 'pushgateway'
scrape_interval: 10s
honor_labels: true
static_configs:
- targets: ['pushgateway:9091']
# DISTANT SERVER (WITH NODE EXPORTER)
# - job_name: 'serveur-distant'
# scheme: https
# basic_auth:
# username: 'prometheus'
# password: 'htpassword-non-crypté'
# tls_config:
# ca_file: certif.crt
# insecure_skip_verify: true
# scrape_interval: 10s
# honor_labels: true
# static_configs:
# - targets: ['monitoring.mondomaine.tld:9100']
# labels:
# instance: serveur-distant
alerting:
alertmanagers:
- scheme: http
static_configs:
- targets:
- 'alertmanager:9093'
# - job_name: 'nginx'
# scrape_interval: 10s
# static_configs:
# - targets: ['nginxexporter:9113']
# - job_name: 'aspnetcore'
# scrape_interval: 10s
# static_configs:
# - targets: ['eventlog-proxy:5000', 'eventlog:5000']