First commit (833005fc3e)

@@ -0,0 +1 @@
.env
@@ -0,0 +1,84 @@
# Prometheus stack

## Components

- Prometheus
- Grafana
- sflow
- alertmanager

## Installation

Create a `.env` file at the root of the project in the following format:

```
GRAFANA_ADMIN=<admin login>
GRAFANA_PASSWORD=<admin password>
```

This is the login/password you will use to sign in to the Grafana interface.

Note: the Docker Compose file uses `Traefik`; remember to adapt the `labels` in docker-compose.yml.
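In practice only the router `Host` rules usually need adapting; a sketch with placeholder hostnames (replace `example.lan` with a domain that resolves to your Traefik instance):

```
labels:
  - "traefik.http.routers.grafana.rule=Host(`grafana.example.lan`)"
  - "traefik.http.routers.prometheus.rule=Host(`prometheus.example.lan`)"
```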
### Create the supervision network

```
docker network create supervision
```

### Create the following storage

```
docker volume create prom_data
docker volume create sflow-rt_data
```

### Prometheus configuration

- Edit the `prometheus/prometheus.yml` file to configure scraping of the exporters
- Edit the `prometheus/alert.rules` file to configure your alerts

### Grafana configuration

Edit the `grafana/conf/grafana.ini` file to set the public access URL:

```
...
...
###################### Server ####################
[server]
...
root_url = <your access URL>
...
...
```

Note:
> Access to Prometheus is internal to the Docker stack; there is no need to modify grafana/datasource/datasource.yml unless you change the hostname of the prometheus service.

### Starting the application

```
docker compose up -d
```
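Once the containers are up, a quick health check (standard Docker Compose commands plus the Prometheus readiness endpoint, published on port 9090 by the compose file):

```
docker compose ps                      # all five services should be running
curl -s http://localhost:9090/-/ready  # readiness endpoint returns a short OK message
```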
## Preparing the Open vSwitch configuration to send sFlow streams

Example with 2 Open vSwitch bridges:

- ovsbr0
- ovsbr1

On the Docker host:

```
ovs-vsctl -- --id=@sflow create sflow agent=enp2s0 target="\"addresse_sflow:6343\"" header=128 sampling=64 polling=10 -- set bridge ovsbr0 sflow=@sflow
ovs-vsctl -- --id=@sflow create sflow agent=enp2s0 target="\"addresse_sflow:6343\"" header=128 sampling=64 polling=10 -- set bridge ovsbr1 sflow=@sflow
```

where `addresse_sflow` is the address of the Docker host.

Verification:

```
ovs-vsctl list sflow
_uuid               : d110f5c1-3b58-457e-8d7e-ba2c35ec302d
agent               : enp2s0
external_ids        : {}
header              : 128
polling             : 10
sampling            : 64
targets             : ["192.168.200.21:6343"]

_uuid               : 36ef3b15-c3c2-4159-86a9-f0a1d8e877a1
agent               : enp2s0
external_ids        : {}
header              : 128
polling             : 10
sampling            : 64
targets             : ["192.168.200.21:6343"]
```
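To remove the sFlow configuration from a bridge later, the record can be cleared (documented ovs-vsctl usage):

```
ovs-vsctl -- clear bridge ovsbr0 sflow
ovs-vsctl -- clear bridge ovsbr1 sflow
```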
@@ -0,0 +1,5 @@
route:
  receiver: 'default'  # Alertmanager requires a top-level receiver; replace this placeholder with your notification target
  group_by: ['alertname']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 1h

receivers:
  - name: 'default'  # placeholder receiver with no notification integration configured
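The route file can be validated with amtool, which is included in the prom/alertmanager image (assuming the file is mounted as /etc/alertmanager/alertmanager.yml, per the compose volume below):

```
docker compose exec alertmanager amtool check-config /etc/alertmanager/alertmanager.yml
```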
@@ -0,0 +1,92 @@
services:
  prometheus:
    image: prom/prometheus
    container_name: prometheus
    hostname: prometheus
    networks:
      - traefik
      - supervision
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
    ports:
      - 9090:9090
    restart: unless-stopped
    labels:
      - "traefik.enable=true"
      - "traefik.docker.network=traefik"
      - "traefik.http.routers.prometheus.rule=Host(`prometheus.bv.stef.lan`)"
      - "traefik.http.routers.prometheus.tls=false"
      - "traefik.http.services.prometheus.loadbalancer.server.port=9090"
      - "traefik.http.routers.prometheus.entrypoints=insecure"
    volumes:
      - ./prometheus:/etc/prometheus
      - prom_data:/prometheus
  alertmanager:
    container_name: alertmanager
    hostname: alertmanager
    image: prom/alertmanager
    ports:
      - 9093:9093
    networks:
      - supervision
    volumes:
      - './alertmanager:/etc/alertmanager'
  grafana:
    image: grafana/grafana
    container_name: grafana
    hostname: grafana
    networks:
      - traefik
      - supervision
    labels:
      - "traefik.enable=true"
      - "traefik.docker.network=traefik"
      - "traefik.http.routers.grafana.rule=Host(`grafana.bv.stef.lan`)"
      - "traefik.http.services.grafana.loadbalancer.server.port=3000"
      - "traefik.http.routers.grafana.tls=true"
      - "traefik.http.routers.grafana.entrypoints=secure"
    restart: unless-stopped
    environment:
      - GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN}
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_PASSWORD}
    volumes:
      - ./grafana/conf:/etc/grafana
      - ./grafana/datasource:/etc/grafana/provisioning/datasources
  cadvisor:
    image: gcr.io/cadvisor/cadvisor
    container_name: cadvisor
    hostname: cadvisor
    restart: unless-stopped
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    networks:
      - supervision
    ports:
      - 8080:8080
  sflow-rt:
    image: sflow/prometheus
    container_name: sflow-rt
    restart: unless-stopped
    volumes:
      - sflow-rt_data:/sflow-rt/store
    ports:
      - '6343:6343/udp'
      - '8008:8008'
    networks:
      - supervision

volumes:
  prom_data:
    external: true
  sflow-rt_data:
    external: true

networks:
  traefik:
    name: traefik
    external: true
  supervision:
    name: supervision
    external: true
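Before the first `docker compose up`, the merged configuration can be validated offline; `docker compose config` interpolates the .env variables and fails loudly on YAML errors:

```
docker compose config -q   # silent on success, prints the error otherwise
```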
File diff suppressed because it is too large
@@ -0,0 +1,75 @@
# To troubleshoot and get more log info enable ldap debug logging in grafana.ini
# [log]
# filters = ldap:debug

[[servers]]
# Ldap server host (specify multiple hosts space separated)
host = "127.0.0.1"
# Default port is 389 or 636 if use_ssl = true
port = 389
# Set to true if LDAP server should use an encrypted TLS connection (either with STARTTLS or LDAPS)
use_ssl = false
# If set to true, use LDAP with STARTTLS instead of LDAPS
start_tls = false
# The value of an accepted TLS cipher. By default, this value is empty. Example value: ["TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384"])
# For a complete list of supported ciphers and TLS versions, refer to: https://go.dev/src/crypto/tls/cipher_suites.go
# Starting with Grafana v11.0 only ciphers with ECDHE support are accepted for TLS 1.2 connections.
tls_ciphers = []
# This is the minimum TLS version allowed. By default, this value is empty. Accepted values are: TLS1.1 (only for Grafana v10.4 or older), TLS1.2, TLS1.3.
min_tls_version = ""
# set to true if you want to skip ssl cert validation
ssl_skip_verify = false
# set to the path to your root CA certificate or leave unset to use system defaults
# root_ca_cert = "/path/to/certificate.crt"
# Authentication against LDAP servers requiring client certificates
# client_cert = "/path/to/client.crt"
# client_key = "/path/to/client.key"

# Search user bind dn
bind_dn = "cn=admin,dc=grafana,dc=org"
# Search user bind password
# If the password contains # or ; you have to wrap it with triple quotes. Ex """#password;"""
bind_password = 'grafana'
# We recommend using variable expansion for the bind_password, for more info https://grafana.com/docs/grafana/latest/setup-grafana/configure-grafana/#variable-expansion
# bind_password = '$__env{LDAP_BIND_PASSWORD}'

# Timeout in seconds (applies to each host specified in the 'host' entry (space separated))
timeout = 10

# User search filter, for example "(cn=%s)" or "(sAMAccountName=%s)" or "(uid=%s)"
search_filter = "(cn=%s)"

# An array of base dns to search through
search_base_dns = ["dc=grafana,dc=org"]

## For Posix or LDAP setups that does not support member_of attribute you can define the below settings
## Please check grafana LDAP docs for examples
# group_search_filter = "(&(objectClass=posixGroup)(memberUid=%s))"
# group_search_base_dns = ["ou=groups,dc=grafana,dc=org"]
# group_search_filter_user_attribute = "uid"

# Specify names of the ldap attributes your ldap uses
[servers.attributes]
name = "givenName"
surname = "sn"
username = "cn"
member_of = "memberOf"
email = "email"

# Map ldap groups to grafana org roles
[[servers.group_mappings]]
group_dn = "cn=admins,ou=groups,dc=grafana,dc=org"
org_role = "Admin"
# To make user an instance admin (Grafana Admin) uncomment line below
# grafana_admin = true
# The Grafana organization database id, optional, if left out the default org (id 1) will be used
# org_id = 1

[[servers.group_mappings]]
group_dn = "cn=editors,ou=groups,dc=grafana,dc=org"
org_role = "Editor"

[[servers.group_mappings]]
# If you want to match all (or no ldap groups) then you can use wildcard
group_dn = "*"
org_role = "Viewer"
@@ -0,0 +1,9 @@
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    url: http://prometheus:9090
    isDefault: true
    access: proxy
    editable: true
@@ -0,0 +1,301 @@
groups:
- name: node_exporter_alerts
  rules:
  - alert: NodeDown
    expr: up == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      title: Node {{ $labels.instance }} is down
      description: Failed to scrape {{ $labels.job }} on {{ $labels.instance }} for more than 2 minutes. Node seems down.

  - alert: HostOutOfMemory
    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host out of memory (instance {{ $labels.instance }})
      description: Node memory is filling up (< 10% left)\n VALUE = {{ $value }}

  - alert: HostMemoryUnderMemoryPressure
    expr: rate(node_vmstat_pgmajfault[1m]) > 1000
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host memory under memory pressure (instance {{ $labels.instance }})
      description: The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}

  - alert: HostUnusualNetworkThroughputIn
    expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host unusual network throughput in (instance {{ $labels.instance }})
      description: Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}

  - alert: HostUnusualNetworkThroughputOut
    expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host unusual network throughput out (instance {{ $labels.instance }})
      description: Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}

  - alert: HostUnusualDiskReadRate
    expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host unusual disk read rate (instance {{ $labels.instance }})
      description: Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}

  - alert: HostUnusualDiskWriteRate
    expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host unusual disk write rate (instance {{ $labels.instance }})
      description: Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}

  # Please add ignored mountpoints in node_exporter parameters like
  # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
  # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
  - alert: HostOutOfDiskSpace
    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host out of disk space (instance {{ $labels.instance }})
      description: Disk is almost full (< 10% left)\n VALUE = {{ $value }}

  # Please add ignored mountpoints in node_exporter parameters like
  # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
  # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
  - alert: HostDiskWillFillIn24Hours
    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
      description: Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}

  - alert: HostOutOfInodes
    expr: node_filesystem_files_free{mountpoint="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host out of inodes (instance {{ $labels.instance }})
      description: Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}

  - alert: HostInodesWillFillIn24Hours
    expr: node_filesystem_files_free{mountpoint="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
      description: Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}

  - alert: HostUnusualDiskReadLatency
    expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host unusual disk read latency (instance {{ $labels.instance }})
      description: Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}

  - alert: HostUnusualDiskWriteLatency
    expr: rate(node_disk_write_time_seconds_total{device!~"mmcblk.+"}[1m]) / rate(node_disk_writes_completed_total{device!~"mmcblk.+"}[1m]) > 0.1 and rate(node_disk_writes_completed_total{device!~"mmcblk.+"}[1m]) > 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host unusual disk write latency (instance {{ $labels.instance }})
      description: Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}

  - alert: HostHighCpuLoad
    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host high CPU load (instance {{ $labels.instance }})
      description: CPU load is > 80%\n VALUE = {{ $value }}

  - alert: HostCpuStealNoisyNeighbor
    expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
      description: CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}

  # 1000 context switches is an arbitrary number.
  # Alert threshold depends on nature of application.
  # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
  - alert: HostContextSwitching
    expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1000
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host context switching (instance {{ $labels.instance }})
      description: Context switching is growing on node (> 1000 / s)\n VALUE = {{ $value }}

  - alert: HostSwapIsFillingUp
    expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host swap is filling up (instance {{ $labels.instance }})
      description: Swap is filling up (>80%)\n VALUE = {{ $value }}

  - alert: HostSystemdServiceCrashed
    expr: node_systemd_unit_state{state="failed"} == 1
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host SystemD service crashed (instance {{ $labels.instance }})
      description: SystemD service crashed\n VALUE = {{ $value }}

  - alert: HostPhysicalComponentTooHot
    expr: node_hwmon_temp_celsius > 75
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host physical component too hot (instance {{ $labels.instance }})
      description: Physical hardware component too hot\n VALUE = {{ $value }}

  - alert: HostNodeOvertemperatureAlarm
    expr: node_hwmon_temp_crit_alarm_celsius == 1
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Host node overtemperature alarm (instance {{ $labels.instance }})
      description: Physical node temperature alarm triggered\n VALUE = {{ $value }}

  - alert: HostRaidArrayGotInactive
    expr: node_md_state{state="inactive"} > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Host RAID array got inactive (instance {{ $labels.instance }})
      description: RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.\n VALUE = {{ $value }}

  - alert: HostRaidDiskFailure
    expr: node_md_disks{state="failed"} > 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host RAID disk failure (instance {{ $labels.instance }})
      description: At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}

  - alert: HostKernelVersionDeviations
    expr: count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1
    for: 6h
    labels:
      severity: warning
    annotations:
      summary: Host kernel version deviations (instance {{ $labels.instance }})
      description: Different kernel versions are running\n VALUE = {{ $value }}

  - alert: HostOomKillDetected
    expr: increase(node_vmstat_oom_kill[1m]) > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host OOM kill detected (instance {{ $labels.instance }})
      description: OOM kill detected\n VALUE = {{ $value }}

  - alert: HostEdacCorrectableErrorsDetected
    expr: increase(node_edac_correctable_errors_total[1m]) > 0
    for: 0m
    labels:
      severity: info
    annotations:
      summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
      description: Instance has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}

  - alert: HostEdacUncorrectableErrorsDetected
    expr: node_edac_uncorrectable_errors_total > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
      description: Instance has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}

  - alert: HostNetworkReceiveErrors
    expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host Network Receive Errors (instance {{ $labels.instance }}:{{ $labels.device }})
      description: Instance interface has encountered {{ printf "%.0f" $value }} receive errors in the last five minutes.\n VALUE = {{ $value }}

  - alert: HostNetworkTransmitErrors
    expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host Network Transmit Errors (instance {{ $labels.instance }}:{{ $labels.device }})
      description: Instance has encountered {{ printf "%.0f" $value }} transmit errors in the last five minutes.\n VALUE = {{ $value }}

  - alert: HostNetworkInterfaceSaturated
    expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: Host Network Interface Saturated (instance {{ $labels.instance }}:{{ $labels.interface }})
      description: The network interface is getting overloaded.\n VALUE = {{ $value }}

  - alert: HostConntrackLimit
    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host conntrack limit (instance {{ $labels.instance }})
      description: The number of conntrack is approaching limit\n VALUE = {{ $value }}

  - alert: HostClockSkew
    expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host clock skew (instance {{ $labels.instance }})
      description: Clock skew detected. Clock is out of sync.\n VALUE = {{ $value }}

  - alert: HostClockNotSynchronising
    expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host clock not synchronising (instance {{ $labels.instance }})
      description: Clock not synchronising.\n VALUE = {{ $value }}
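A sketch for syntax-checking the rules before (re)starting Prometheus; promtool ships in the prom/prometheus image and the path matches the volume mount:

```
docker compose exec prometheus promtool check rules /etc/prometheus/alert.rules
```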
@@ -0,0 +1,69 @@
global:
  scrape_interval: 15s
  scrape_timeout: 10s
  evaluation_interval: 15s
alerting:
  alertmanagers:
  - static_configs:
    - targets: [alertmanager:9093]
    scheme: http
    timeout: 10s
    api_version: v1
rule_files:
- /etc/prometheus/alert.rules

scrape_configs:
- job_name: prometheus
  honor_timestamps: true
  scrape_interval: 15s
  scrape_timeout: 10s
  metrics_path: /metrics
  scheme: http
  static_configs:
  - targets:
    - infra.bv.stef.lan:9100
    - 192.168.1.50:9100
    - cadvisor:8080
    - 192.168.1.104:9100
    - appdemo.dell.stef.lan:80

- job_name: nodes_kube
  honor_timestamps: true
  scrape_interval: 15s
  scrape_timeout: 10s
  metrics_path: /metrics
  scheme: http
  static_configs:
  - targets:
    - knode01.bv.stef.lan:9100
    - knode02.bv.stef.lan:9100
    - knode03.bv.stef.lan:9100

- job_name: 'sflow-rt-analyzer'
  metrics_path: /prometheus/analyzer/txt
  static_configs:
  - targets: ['192.168.200.21:8008']
- job_name: 'sflow-rt-metrics'
  metrics_path: /prometheus/metrics/ALL/ALL/txt
  static_configs:
  - targets: ['192.168.200.21:8008']
- job_name: 'sflow-rt-flow-src-dst-bps'
  metrics_path: /app/prometheus/scripts/export.js/flows/ALL/txt
  static_configs:
  - targets: ['192.168.200.21:8008']
  params:
    metric: ['ip_src_dst_bps']
    key: ['ipsource','ipdestination']
    label: ['src','dst']
    value: ['bytes']
    scale: ['8']
    minValue: ['1000']
    maxFlows: ['100']

- job_name: minio-job
  bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJwcm9tZXRoZXVzIiwic3ViIjoiYWRtaW4iLCJleHAiOjQ5MDYwMzc3Njh9.VmApbIPOb0Ham_0rsPb-uIzN7MhjUjdIkCG6t7cC-tSc8A1UGHc82LOUGzTt7uP7sYH03sP_BGLY39BrNQ2Riw
  metrics_path: /minio/v2/metrics/cluster
  scheme: http
  static_configs:
  - targets: [minio.dell.stef.lan]
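After editing this file, the whole configuration can be checked and the scrape targets inspected (promtool is in the prom/prometheus image; port 9090 is published by the compose file):

```
docker compose exec prometheus promtool check config /etc/prometheus/prometheus.yml
curl -s http://localhost:9090/api/v1/targets | grep -o '"health":"[^"]*"'
```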