Prometheus警报规则
作者:互联网
# Top-level list of Prometheus rule groups.
groups:
# Recording rules that precompute per-instance host metrics. The alert rules
# reference these series by their recorded names.
- name: node_rules
  rules:
  # CPU busy percentage per instance: 100 minus the averaged idle-mode rate.
  # rate() averages over the whole 5m window, matching the `avg_rate5m`
  # suffix; irate() would only look at the last two samples.
  - record: instance:node_cpu:avg_rate5m
    expr: 100 - avg by (instance) (rate(node_cpu_seconds_total{job="node_prod",mode="idle"}[5m])) * 100
  # Number of CPUs per instance (one idle-mode series exists per CPU).
  - record: instance:node_cpus:count
    expr: count by (instance) (node_cpu_seconds_total{mode="idle"})
  # Filtering comparison: a series is recorded only for instances whose
  # 1-minute load exceeds twice their CPU count. The value is node_load1.
  - record: instance:node_cpu_saturation_load1
    expr: node_load1 > on (instance) (2 * count by (instance) (node_cpu_seconds_total{job="node_prod",mode="idle"}))
  # Used-memory percentage: (total - (free + cached + buffers)) / total * 100.
  # The job selector on the denominator restricts the result to "node_prod"
  # through one-to-one label matching.
  - record: instance:node_memory_usage:percentage
    expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes)) / node_memory_MemTotal_bytes{job="node_prod"} * 100
  # Per-instance sum of the page-in and page-out rates, scaled by 1024.
  # NOTE(review): pgpgin/pgpgout count general paging; if swap traffic is
  # what the name intends, node_vmstat_pswpin/pswpout may be meant - confirm.
  - record: instance:node_memory_swap_io_bytes:sum_rate
    expr: |
      1024 * sum by (instance) (
        rate(node_vmstat_pgpgin[1m])
        + rate(node_vmstat_pgpgout[1m])
      )
  # Used-space percentage of the filesystem mounted at "/".
  - record: instance:root:node_filesystem_usage:percentage
    expr: (node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}) / node_filesystem_size_bytes{mountpoint="/"} * 100
# Host-level alerts for job "node_prod". This group is an item of the file's
# top-level `groups:` list; repeating the `groups:` key here would be a
# duplicate mapping key.
- name: node_alerts
  rules:
  # Recorded CPU usage percentage above 90 for 60 minutes.
  - alert: HighNodeCPU(CPU使用率)
    expr: instance:node_cpu:avg_rate5m > 90
    for: 60m
    labels:
      name: CPU
      severity: warning
    annotations:
      summary: "5分钟内的节点平均CPU使用率在至少60分钟内超过90%"
      description: "CPU使用率过高,5分钟内平均CPU使用率为 {{ humanize $value}}%"
  # The recorded series only exists while 1-minute load exceeds twice the CPU
  # count, so the bare series name is the alert condition.
  - alert: HighNodeLoad(CPU饱和度)
    expr: instance:node_cpu_saturation_load1
    for: 5m
    labels:
      name: Load
      severity: warning
    annotations:
      # Summary states the 2x threshold so it agrees with the description.
      summary: "CPU负载平均数超过了CPU数量的两倍"
      description: "CPU平均负载至少5分钟内超过主机CPU数量的两倍"
  # Recorded memory usage percentage above 95 for 5 minutes.
  - alert: HighNodeMem(内存使用率)
    expr: instance:node_memory_usage:percentage > 95
    for: 5m
    labels:
      name: Memory
      severity: warning
    annotations:
      summary: "使用的内存百分比至少在5分钟内超过95%"
      description: "内存使用率过高,目前值为{{ humanize $value}}%"
  # Recorded root filesystem usage percentage above 95 for 5 minutes.
  - alert: DiskUsage(磁盘使用量)
    expr: instance:root:node_filesystem_usage:percentage > 95
    for: 5m
    labels:
      name: Disk
      severity: warning
    annotations:
      summary: "{{$labels.device}}磁盘使用量超过95%"
      description: "{{$labels.instance}}的{{ $labels.mountpoint }}使用量为{{ humanize $value}}%"
  # Linear extrapolation of the last hour of free space on "/" drops below
  # zero within the next 4 hours (4*3600 seconds).
  - alert: DiskWillFillIn4Hours(线性回归预测磁盘空间将耗尽)
    expr: predict_linear(node_filesystem_free_bytes{mountpoint="/"}[1h], 4*3600) < 0
    for: 5m
    labels:
      name: Disk
      severity: critical
    annotations:
      summary: "基于最后一小时的增长历史记录,根文件系统的磁盘空间将在接下来的四小时内耗尽"
      description: "{{$labels.instance}}挂载在 {{ $labels.mountpoint }}的设备 {{$labels.device}}磁盘空间可能将用完"
  # The expression yields the disk I/O *idle* percentage per instance; the
  # alert fires when idle drops below 40, i.e. I/O busy time exceeds 60%.
  # NOTE(review): irate() uses only the last two samples; consider rate() if
  # short dips keep resetting the `for` timer.
  - alert: DiskIO(磁盘IO操作耗时)
    expr: 100 - (avg by (instance) (irate(node_disk_io_time_seconds_total{job="node_prod"}[1m])) * 100) < 40
    for: 5m
    labels:
      name: Disk
      severity: critical
    annotations:
      summary: "{{$labels.instance}}磁盘IO使用率过高,磁盘IO大于60%"
      description: "{{$labels.instance}}:磁盘IO空闲百分比为:{{humanize $value}}%"
  # A single target of the job failed its scrape.
  - alert: InstanceDown(实例已停止响应抓取)
    expr: up{job="node_prod"} == 0
    for: 10s
    labels:
      severity: critical
    annotations:
      summary: "Host {{ $labels.instance }} is down!"
      description: "实例 {{ $labels.instance }}已停止响应抓取"
  # Less than 75% of the job's targets are up.
  - alert: InstancesDown(作业中至少25%的实例无法响应抓取)
    expr: avg by (job) (up{job="node_prod"}) < 0.75
    for: 10s
    labels:
      severity: critical
    annotations:
      summary: "作业中25%以上的实例停止响应抓取"
      description: "作业 {{$labels.job}}中至少25%的实例无法响应抓取"
  # No `up` series exists for the job at all.
  - alert: InstancesGone(UP指标缺失警报)
    expr: absent(up{job="node_prod"})
    for: 10s
    labels:
      severity: critical
    annotations:
      summary: "节点作业中的UP指标消失"
      # absent() only carries labels taken from the equality matchers in its
      # selector (here: job), so there is no `instance` label to print.
      description: "作业 {{ $labels.job }} 的UP指标消失"
# Network throughput alerts for job "node_prod". This group is an item of the
# file's top-level `groups:` list; repeating the `groups:` key here would be a
# duplicate mapping key.
- name: network_alerts
  rules:
  # Receive throughput above 20 MiB/s on a non-virtual interface.
  # rate() averages over the whole 5m window, which is what the description
  # reports; irate() would only reflect the last two samples.
  - alert: HostNetwork_receive(网卡接收流量异常)
    expr: rate(node_network_receive_bytes_total{job="node_prod",device!~"lo|qb.*|qv.*|tap.*|bond[0-9]|cbr[0-9]|veth.*|virbr.*|ovs-system"}[5m]) / 1024 / 1024 > 20
    for: 1m
    labels:
      name: Network_receive
      severity: warning
    annotations:
      summary: "{{$labels.instance}} 网卡接收流量异常"
      description: "{{$labels.instance}} 网卡{{$labels.device}} 5分钟平均接收流量为 {{ humanize $value }}MB/s"
  # Transmit throughput above 20 MiB/s on a non-virtual interface.
  - alert: hostNetwork_transmit(网卡流出流量异常)
    expr: rate(node_network_transmit_bytes_total{job="node_prod",device!~"lo|qb.*|qv.*|tap.*|bond[0-9]|cbr[0-9]|veth.*|virbr.*|ovs-system"}[5m]) / 1024 / 1024 > 20
    for: 1m
    labels:
      name: Network_transmit
      severity: warning
    annotations:
      summary: "{{$labels.instance}} 网卡流出流量异常"
      # Reports outbound traffic; the text previously said "receive".
      description: "{{$labels.instance}} 网卡{{$labels.device}} 5分钟平均流出流量为 {{ humanize $value }}MB/s"
# TLS certificate expiry alert. This group is an item of the file's top-level
# `groups:` list; repeating the `groups:` key here would be a duplicate
# mapping key.
- name: SSL证书状态
  rules:
  # Remaining certificate lifetime, converted from seconds to days (86400
  # seconds per day), has been below 3 days for one hour.
  - alert: "SSL证书过期警告"
    expr: (probe_ssl_earliest_cert_expiry - time()) / 86400 < 3
    for: 1h
    labels:
      # Same severity vocabulary as the other rules in this file
      # (warning / critical), so label-based routing treats it alike.
      severity: warning
    annotations:
      description: '域名{{$labels.instance}}的证书还有{{ printf "%.1f" $value }}天就过期了,请尽快更新证书'
      summary: "SSL证书过期警告"
# Probe reachability alert. This group is an item of the file's top-level
# `groups:` list; repeating the `groups:` key here would be a duplicate
# mapping key.
- name: blackbox_network_stats
  rules:
  # A probe has reported failure (probe_success == 0) for 3 minutes.
  - alert: blackbox_network_stats
    expr: probe_success == 0
    for: 3m
    labels:
      severity: critical
    annotations:
      summary: "接口/主机/端口 {{ $labels.instance }} 无法联通"
      description: "请尽快检测"
# Self-monitoring alerts for the Prometheus server. This group is an item of
# the file's top-level `groups:` list; repeating the `groups:` key here would
# be a duplicate mapping key.
- name: prometheus_alerts
  rules:
  # The last configuration reload was unsuccessful for 10 minutes.
  - alert: PrometheusConfigReloadFailed(Prometheus配置重载失败)
    expr: prometheus_config_last_reload_successful == 0
    for: 10m
    labels:
      severity: warning
    annotations:
      # `summary` added so this alert carries the same annotation keys as the
      # other alerts in this file.
      summary: "Prometheus configuration reload has failed"
      description: "Reloading Prometheus' configuration has failed on {{ $labels.instance }}."
  # No Alertmanager has been discovered for 10 minutes.
  - alert: PrometheusNotConnectedToAlertmanagers(Prometheus没有发现任何Alertmanager)
    expr: prometheus_notifications_alertmanagers_discovered < 1
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "Prometheus is not connected to any Alertmanager"
      description: "Prometheus {{ $labels.instance }} is not connected to any Alertmanagers"
标签:node,name,expr,labels,alert,instance,Prometheus,警报,规则 来源: https://blog.51cto.com/9473774/2657248