First commit

main
stef 2025-07-14 00:24:03 +02:00
commit 833005fc3e
9 changed files with 2662 additions and 0 deletions

.gitignore 100644

@ -0,0 +1 @@
.env

Readme.md 100644

@ -0,0 +1,84 @@
# Prometheus stack
## Components
- Prometheus
- Grafana
- sFlow-RT
- Alertmanager
## Installation
Create a .env file at the root of the project in the following format:
```
GRAFANA_ADMIN=<admin login>
GRAFANA_PASSWORD=<admin password>
```
This is the login/password you will use to sign in to the Grafana interface.
Note: the Docker Compose file uses ```Traefik```; remember to adapt the ```labels``` in docker-compose.yml.
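For example, to publish Grafana under your own domain, the host rule in the `grafana` service labels would change roughly as follows (`grafana.example.com` is a placeholder, and the `traefik` network and `secure` entrypoint must match your own Traefik setup):
```
labels:
  - "traefik.http.routers.grafana.rule=Host(`grafana.example.com`)"   # placeholder hostname
  - "traefik.http.routers.grafana.entrypoints=secure"                 # must exist in your Traefik config
```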
### Create the supervision network
```
docker network create supervision
```
### Create the following volumes
```
docker volume create prom_data
docker volume create sflow-rt_data
```
### Prometheus configuration
- Edit ```prometheus/prometheus.yml``` to configure which exporters are scraped
- Edit ```prometheus/alert.rules``` to configure your alerts (a short example of both follows this list)
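As a sketch (the job name and host below are placeholders), adding another node_exporter target and a minimal extra alert rule could look like this:
```
# prometheus/prometheus.yml: extra scrape job for a new exporter
scrape_configs:
- job_name: my_nodes
  static_configs:
  - targets:
    - newhost.example.lan:9100   # placeholder node_exporter endpoint

# prometheus/alert.rules: minimal additional rule
groups:
- name: my_alerts
  rules:
  - alert: InstanceDown
    expr: up == 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Instance {{ $labels.instance }} is down
```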
### Grafana configuration
Edit ```grafana/conf/grafana.ini``` to set the public access URL:
```
...
...
###################### Server ####################
[server]
...
root_url = <your access URL>
...
...
```
Note:
>Access to Prometheus stays inside the Docker stack, so there is no need to modify grafana/datasource/datasource.yml unless you change the hostname of the prometheus service (see the excerpt below).
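For reference, the datasource provisioned by this stack points at the service hostname; if you rename the prometheus service, the url in grafana/datasource/datasource.yml has to follow:
```
datasources:
  - name: Prometheus
    type: prometheus
    url: http://prometheus:9090   # must match the hostname of the prometheus service
```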
### Starting the application
```
docker compose up -d
```
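To check that the containers came up correctly, you can, for instance, run:
```
docker compose ps
docker compose logs -f grafana
```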
## Preparing the Open vSwitch configuration to send sFlow flows
Example with two Open vSwitch bridges:
- ovsbr0
- ovsbr1
On the Docker host:
```
ovs-vsctl -- --id=@sflow create sflow agent=enp2s0 target="\"addresse_sflow:6343\"" header=128 sampling=64 polling=10 -- set bridge ovsbr0 sflow=@sflow
ovs-vsctl -- --id=@sflow create sflow agent=enp2s0 target="\"addresse_sflow:6343\"" header=128 sampling=64 polling=10 -- set bridge ovsbr1 sflow=@sflow
```
where ```addresse_sflow``` is the address of the Docker host.
Verification:
```
ovs-vsctl list sflow
_uuid : d110f5c1-3b58-457e-8d7e-ba2c35ec302d
agent : enp2s0
external_ids : {}
header : 128
polling : 10
sampling : 64
targets : ["192.168.200.21:6343"]
_uuid : 36ef3b15-c3c2-4159-86a9-f0a1d8e877a1
agent : enp2s0
external_ids : {}
header : 128
polling : 10
sampling : 64
targets : ["192.168.200.21:6343"]
```

alertmanager/alertmanager.yml 100644

@ -0,0 +1,5 @@
route:
  group_by: ['alertname']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 1h

docker-compose.yml 100644

@ -0,0 +1,92 @@
services:
  prometheus:
    image: prom/prometheus
    container_name: prometheus
    hostname: prometheus
    networks:
      - traefik
      - supervision
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
    ports:
      - 9090:9090
    restart: unless-stopped
    labels:
      - "traefik.enable=true"
      - "traefik.docker.network=traefik"
      - "traefik.http.routers.prometheus.rule=Host(`prometheus.bv.stef.lan`)"
      - "traefik.http.routers.prometheus.tls=false"
      - "traefik.http.services.prometheus.loadbalancer.server.port=9090"
      - "traefik.http.routers.prometheus.entrypoints=insecure"
    volumes:
      - ./prometheus:/etc/prometheus
      - prom_data:/prometheus
  alertmanager:
    container_name: alertmanager
    hostname: alertmanager
    image: prom/alertmanager
    ports:
      - 9093:9093
    networks:
      - supervision
    volumes:
      - './alertmanager:/etc/alertmanager'
  grafana:
    image: grafana/grafana
    container_name: grafana
    hostname: grafana
    networks:
      - traefik
      - supervision
    labels:
      - "traefik.enable=true"
      - "traefik.docker.network=traefik"
      - "traefik.http.routers.grafana.rule=Host(`grafana.bv.stef.lan`)"
      - "traefik.http.services.grafana.loadbalancer.server.port=3000"
      - "traefik.http.routers.grafana.tls=true"
      - "traefik.http.routers.grafana.entrypoints=secure"
    restart: unless-stopped
    environment:
      - GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN}
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_PASSWORD}
    volumes:
      - ./grafana/conf:/etc/grafana
      - ./grafana/datasource:/etc/grafana/provisioning/datasources
  cadvisor:
    image: gcr.io/cadvisor/cadvisor
    container_name: cadvisor
    hostname: cadvisor
    restart: unless-stopped
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    networks:
      - supervision
    ports:
      - 8080:8080
  sflow-rt:
    image: sflow/prometheus
    container_name: sflow-rt
    restart: unless-stopped
    volumes:
      - sflow-rt_data:/sflow-rt/store
    ports:
      - '6343:6343/udp'
      - '8008:8008'
    networks:
      - supervision
volumes:
  prom_data:
    external: true
  sflow-rt_data:
    external: true
networks:
  traefik:
    name: traefik
    external: true
  supervision:
    name: supervision
    external: true

File diff suppressed because it is too large

grafana/conf/ldap.toml 100644

@ -0,0 +1,75 @@
# To troubleshoot and get more log info enable ldap debug logging in grafana.ini
# [log]
# filters = ldap:debug
[[servers]]
# Ldap server host (specify multiple hosts space separated)
host = "127.0.0.1"
# Default port is 389 or 636 if use_ssl = true
port = 389
# Set to true if LDAP server should use an encrypted TLS connection (either with STARTTLS or LDAPS)
use_ssl = false
# If set to true, use LDAP with STARTTLS instead of LDAPS
start_tls = false
# The value of an accepted TLS cipher. By default, this value is empty. Example value: ["TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384"]
# For a complete list of supported ciphers and TLS versions, refer to: https://go.dev/src/crypto/tls/cipher_suites.go
# Starting with Grafana v11.0 only ciphers with ECDHE support are accepted for TLS 1.2 connections.
tls_ciphers = []
# This is the minimum TLS version allowed. By default, this value is empty. Accepted values are: TLS1.1 (only for Grafana v10.4 or older), TLS1.2, TLS1.3.
min_tls_version = ""
# set to true if you want to skip ssl cert validation
ssl_skip_verify = false
# set to the path to your root CA certificate or leave unset to use system defaults
# root_ca_cert = "/path/to/certificate.crt"
# Authentication against LDAP servers requiring client certificates
# client_cert = "/path/to/client.crt"
# client_key = "/path/to/client.key"
# Search user bind dn
bind_dn = "cn=admin,dc=grafana,dc=org"
# Search user bind password
# If the password contains # or ; you have to wrap it with triple quotes. Ex """#password;"""
bind_password = 'grafana'
# We recommend using variable expansion for the bind_password, for more info https://grafana.com/docs/grafana/latest/setup-grafana/configure-grafana/#variable-expansion
# bind_password = '$__env{LDAP_BIND_PASSWORD}'
# Timeout in seconds (applies to each host specified in the 'host' entry (space separated))
timeout = 10
# User search filter, for example "(cn=%s)" or "(sAMAccountName=%s)" or "(uid=%s)"
search_filter = "(cn=%s)"
# An array of base dns to search through
search_base_dns = ["dc=grafana,dc=org"]
## For Posix or LDAP setups that do not support the member_of attribute you can define the settings below
## Please check grafana LDAP docs for examples
# group_search_filter = "(&(objectClass=posixGroup)(memberUid=%s))"
# group_search_base_dns = ["ou=groups,dc=grafana,dc=org"]
# group_search_filter_user_attribute = "uid"
# Specify names of the ldap attributes your ldap uses
[servers.attributes]
name = "givenName"
surname = "sn"
username = "cn"
member_of = "memberOf"
email = "email"
# Map ldap groups to grafana org roles
[[servers.group_mappings]]
group_dn = "cn=admins,ou=groups,dc=grafana,dc=org"
org_role = "Admin"
# To make user an instance admin (Grafana Admin) uncomment line below
# grafana_admin = true
# The Grafana organization database id, optional, if left out the default org (id 1) will be used
# org_id = 1
[[servers.group_mappings]]
group_dn = "cn=editors,ou=groups,dc=grafana,dc=org"
org_role = "Editor"
[[servers.group_mappings]]
# If you want to match all (or no ldap groups) then you can use wildcard
group_dn = "*"
org_role = "Viewer"

grafana/datasource/datasource.yml 100644

@ -0,0 +1,9 @@
apiVersion: 1
datasources:
  - name: Prometheus
    type: prometheus
    url: http://prometheus:9090
    isDefault: true
    access: proxy
    editable: true

prometheus/alert.rules 100644

@ -0,0 +1,301 @@
groups:
- name: node_exporter_alerts
  rules:
  - alert: NodeDown
    expr: up == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      title: Node {{ $labels.instance }} is down
      description: Failed to scrape {{ $labels.job }} on {{ $labels.instance }} for more than 2 minutes. Node seems down.
  - alert: HostOutOfMemory
    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host out of memory (instance {{ $labels.instance }})
      description: Node memory is filling up (< 10% left)\n VALUE = {{ $value }}
  - alert: HostMemoryUnderMemoryPressure
    expr: rate(node_vmstat_pgmajfault[1m]) > 1000
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host memory under memory pressure (instance {{ $labels.instance }})
      description: The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}
  - alert: HostUnusualNetworkThroughputIn
    expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host unusual network throughput in (instance {{ $labels.instance }})
      description: Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}
  - alert: HostUnusualNetworkThroughputOut
    expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host unusual network throughput out (instance {{ $labels.instance }})
      description: Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}
  - alert: HostUnusualDiskReadRate
    expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host unusual disk read rate (instance {{ $labels.instance }})
      description: Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}
  - alert: HostUnusualDiskWriteRate
    expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host unusual disk write rate (instance {{ $labels.instance }})
      description: Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}
  # Please add ignored mountpoints in node_exporter parameters like
  # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
  # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
  - alert: HostOutOfDiskSpace
    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host out of disk space (instance {{ $labels.instance }})
      description: Disk is almost full (< 10% left)\n VALUE = {{ $value }}
  # Please add ignored mountpoints in node_exporter parameters like
  # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
  # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
  - alert: HostDiskWillFillIn24Hours
    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
      description: Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}
  - alert: HostOutOfInodes
    expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host out of inodes (instance {{ $labels.instance }})
      description: Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}
  - alert: HostInodesWillFillIn24Hours
    expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
      description: Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}
  - alert: HostUnusualDiskReadLatency
    expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host unusual disk read latency (instance {{ $labels.instance }})
      description: Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}
  - alert: HostUnusualDiskWriteLatency
    expr: rate(node_disk_write_time_seconds_total{device!~"mmcblk.+"}[1m]) / rate(node_disk_writes_completed_total{device!~"mmcblk.+"}[1m]) > 0.1 and rate(node_disk_writes_completed_total{device!~"mmcblk.+"}[1m]) > 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host unusual disk write latency (instance {{ $labels.instance }})
      description: Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}
  - alert: HostHighCpuLoad
    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host high CPU load (instance {{ $labels.instance }})
      description: CPU load is > 80%\n VALUE = {{ $value }}
  - alert: HostCpuStealNoisyNeighbor
    expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
      description: CPU steal is > 10%. A noisy neighbor is killing VM performance or a spot instance may be out of credit.\n VALUE = {{ $value }}
  # 1000 context switches is an arbitrary number.
  # Alert threshold depends on nature of application.
  # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
  - alert: HostContextSwitching
    expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1000
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host context switching (instance {{ $labels.instance }})
      description: Context switching is growing on node (> 1000 / s)\n VALUE = {{ $value }}
  - alert: HostSwapIsFillingUp
    expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host swap is filling up (instance {{ $labels.instance }})
      description: Swap is filling up (>80%)\n VALUE = {{ $value }}
  - alert: HostSystemdServiceCrashed
    expr: node_systemd_unit_state{state="failed"} == 1
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host SystemD service crashed (instance {{ $labels.instance }})
      description: SystemD service crashed\n VALUE = {{ $value }}
  - alert: HostPhysicalComponentTooHot
    expr: node_hwmon_temp_celsius > 75
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host physical component too hot (instance {{ $labels.instance }})
      description: Physical hardware component too hot\n VALUE = {{ $value }}
  - alert: HostNodeOvertemperatureAlarm
    expr: node_hwmon_temp_crit_alarm_celsius == 1
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Host node overtemperature alarm (instance {{ $labels.instance }})
      description: Physical node temperature alarm triggered\n VALUE = {{ $value }}
  - alert: HostRaidArrayGotInactive
    expr: node_md_state{state="inactive"} > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Host RAID array got inactive (instance {{ $labels.instance }})
      description: RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.\n VALUE = {{ $value }}
  - alert: HostRaidDiskFailure
    expr: node_md_disks{state="failed"} > 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host RAID disk failure (instance {{ $labels.instance }})
      description: At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}
  - alert: HostKernelVersionDeviations
    expr: count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1
    for: 6h
    labels:
      severity: warning
    annotations:
      summary: Host kernel version deviations (instance {{ $labels.instance }})
      description: Different kernel versions are running\n VALUE = {{ $value }}
  - alert: HostOomKillDetected
    expr: increase(node_vmstat_oom_kill[1m]) > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host OOM kill detected (instance {{ $labels.instance }})
      description: OOM kill detected\n VALUE = {{ $value }}
  - alert: HostEdacCorrectableErrorsDetected
    expr: increase(node_edac_correctable_errors_total[1m]) > 0
    for: 0m
    labels:
      severity: info
    annotations:
      summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
      description: Instance has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}
  - alert: HostEdacUncorrectableErrorsDetected
    expr: node_edac_uncorrectable_errors_total > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
      description: Instance has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}
  - alert: HostNetworkReceiveErrors
    expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host Network Receive Errors (instance {{ $labels.instance }}:{{ $labels.device }})
      description: Instance interface has encountered {{ printf "%.0f" $value }} receive errors in the last five minutes.\n VALUE = {{ $value }}
  - alert: HostNetworkTransmitErrors
    expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host Network Transmit Errors (instance {{ $labels.instance }}:{{ $labels.device }})
      description: Instance has encountered {{ printf "%.0f" $value }} transmit errors in the last five minutes.\n VALUE = {{ $value }}
  - alert: HostNetworkInterfaceSaturated
    expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: Host Network Interface Saturated (instance {{ $labels.instance }}:{{ $labels.interface }})
      description: The network interface is getting overloaded.\n VALUE = {{ $value }}
  - alert: HostConntrackLimit
    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host conntrack limit (instance {{ $labels.instance }})
      description: The number of conntrack entries is approaching the limit\n VALUE = {{ $value }}
  - alert: HostClockSkew
    expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host clock skew (instance {{ $labels.instance }})
      description: Clock skew detected. Clock is out of sync.\n VALUE = {{ $value }}
  - alert: HostClockNotSynchronising
    expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host clock not synchronising (instance {{ $labels.instance }})
      description: Clock not synchronising.\n VALUE = {{ $value }}

prometheus/prometheus.yml 100644

@ -0,0 +1,69 @@
global:
  scrape_interval: 15s
  scrape_timeout: 10s
  evaluation_interval: 15s
alerting:
  alertmanagers:
  - static_configs:
    - targets: [alertmanager:9093]
    scheme: http
    timeout: 10s
    api_version: v1
rule_files:
- /etc/prometheus/alert.rules
scrape_configs:
- job_name: prometheus
  honor_timestamps: true
  scrape_interval: 15s
  scrape_timeout: 10s
  metrics_path: /metrics
  scheme: http
  static_configs:
  - targets:
    - infra.bv.stef.lan:9100
    - 192.168.1.50:9100
    - cadvisor:8080
    - 192.168.1.104:9100
    - appdemo.dell.stef.lan:80
- job_name: nodes_kube
  honor_timestamps: true
  scrape_interval: 15s
  scrape_timeout: 10s
  metrics_path: /metrics
  scheme: http
  static_configs:
  - targets:
    - knode01.bv.stef.lan:9100
    - knode02.bv.stef.lan:9100
    - knode03.bv.stef.lan:9100
- job_name: 'sflow-rt-analyzer'
  metrics_path: /prometheus/analyzer/txt
  static_configs:
  - targets: ['192.168.200.21:8008']
- job_name: 'sflow-rt-metrics'
  metrics_path: /prometheus/metrics/ALL/ALL/txt
  static_configs:
  - targets: ['192.168.200.21:8008']
- job_name: 'sflow-rt-flow-src-dst-bps'
  metrics_path: /app/prometheus/scripts/export.js/flows/ALL/txt
  static_configs:
  - targets: ['192.168.200.21:8008']
  params:
    metric: ['ip_src_dst_bps']
    key: ['ipsource','ipdestination']
    label: ['src','dst']
    value: ['bytes']
    scale: ['8']
    minValue: ['1000']
    maxFlows: ['100']
- job_name: minio-job
  bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJwcm9tZXRoZXVzIiwic3ViIjoiYWRtaW4iLCJleHAiOjQ5MDYwMzc3Njh9.VmApbIPOb0Ham_0rsPb-uIzN7MhjUjdIkCG6t7cC-tSc8A1UGHc82LOUGzTt7uP7sYH03sP_BGLY39BrNQ2Riw
  metrics_path: /minio/v2/metrics/cluster
  scheme: http
  static_configs:
  - targets: [minio.dell.stef.lan]