prometheus监控方案(草稿)
作者:互联网
一、描述
两台服务器:192.168.11.109、192.168.11.121
第一台计划安装:prometheus、blackbox_exporter、snmp_exporter、consul、grafana、consul-template、thanos query、thanos sidecar、thanos rule
第二台计划安装:prometheus、blackbox_exporter、snmp_exporter、thanos sidecar
二、安装blackbox探针,2台都安装
tar -zxvf blackbox_exporter-0.19.0.linux-amd64.tar.gz
mv blackbox_exporter-0.19.0.linux-amd64 /usr/local/blackbox_exporter
cd /usr/local/blackbox_exporter
vim blackbox.yml
(实际我们只用到network_ping)
modules:
http_2xx:
prober: http
http_post_2xx:
prober: http
http:
method: POST
tcp_connect:
prober: tcp
pop3s_banner:
prober: tcp
tcp:
query_response:
- expect: "^+OK"
tls: true
tls_config:
insecure_skip_verify: false
ssh_banner:
prober: tcp
tcp:
query_response:
- expect: "^SSH-2.0-"
irc_banner:
prober: tcp
tcp:
query_response:
- send: "NICK prober"
- send: "USER prober prober prober :prober"
- expect: "PING :([^ ]+)"
send: "PONG ${1}"
- expect: "^:[^ ]+ 001"
network_ping:
prober: icmp
timeout: 5s
icmp:
preferred_ip_protocol: "ipv4"
payload_size: 100
dont_fragment: true
添加到systemctl服务
vim /etc/systemd/system/blackbox.service
[Unit]
Description=Blackbox
After=network.target
[Service]
User=root
Group=root
Type=simple
Restart=on-failure
ExecStart=/usr/local/blackbox_exporter/blackbox_exporter --config.file=/usr/local/blackbox_exporter/blackbox.yml
[Install]
WantedBy=multi-user.target
服务自启动
systemctl daemon-reload
systemctl enable blackbox
systemctl start blackbox
三、安装snmp探针,2台都安装
tar -zxvf snmp_exporter-0.20.0.linux-amd64.tar.gz
mv snmp_exporter-0.20.0.linux-amd64 /usr/local/snmp_exporter
cd /usr/local/snmp_exporter
vim snmp.yml
(三个module,分别是接口流量、cpu内存、sla侦测)
network_if_table:
walk:
- 1.3.6.1.2.1.31.1.1.1.1
- 1.3.6.1.2.1.31.1.1.1.10
- 1.3.6.1.2.1.31.1.1.1.6
- 1.3.6.1.2.1.31.1.1.1.2
- 1.3.6.1.2.1.31.1.1.1.3
- 1.3.6.1.2.1.31.1.1.1.4
- 1.3.6.1.2.1.31.1.1.1.5
- 1.3.6.1.2.1.31.1.1.1.7
- 1.3.6.1.2.1.31.1.1.1.11
- 1.3.6.1.2.1.31.1.1.1.15
- 1.3.6.1.2.1.31.1.1.1.18
- 1.3.6.1.2.1.2.2.1.14
- 1.3.6.1.2.1.2.2.1.13
- 1.3.6.1.2.1.2.2.1.20
- 1.3.6.1.2.1.2.2.1.19
- 1.3.6.1.2.1.2.2.1.7
- 1.3.6.1.2.1.2.2.1.8
metrics:
- name: ifName
oid: 1.3.6.1.2.1.31.1.1.1.1
type: DisplayString
help: The textual name of the interface - 1.3.6.1.2.1.31.1.1.1.1
indexes:
- labelname: ifIndex
type: gauge
lookups:
- labels:
- ifIndex
labelname: ifAlias
oid: 1.3.6.1.2.1.31.1.1.1.18
type: DisplayString
- name: ifHCOutOctets
oid: 1.3.6.1.2.1.31.1.1.1.10
type: counter
help: The total number of octets transmitted out of the interface, including framing
characters - 1.3.6.1.2.1.31.1.1.1.10
indexes:
- labelname: ifIndex
type: gauge
lookups:
- labels:
- ifIndex
labelname: ifAlias
oid: 1.3.6.1.2.1.31.1.1.1.18
type: DisplayString
- labels:
- ifIndex
labelname: ifName
oid: 1.3.6.1.2.1.31.1.1.1.1
type: DisplayString
- name: ifHCInOctets
oid: 1.3.6.1.2.1.31.1.1.1.6
type: counter
help: The total number of octets received on the interface, including framing
characters - 1.3.6.1.2.1.31.1.1.1.6
indexes:
- labelname: ifIndex
type: gauge
lookups:
- labels:
- ifIndex
labelname: ifAlias
oid: 1.3.6.1.2.1.31.1.1.1.18
type: DisplayString
- labels:
- ifIndex
labelname: ifName
oid: 1.3.6.1.2.1.31.1.1.1.1
type: DisplayString
- name: ifInMulticastPkts
oid: 1.3.6.1.2.1.31.1.1.1.2
type: counter
help: The total number of multicast packets received on the interface, including framing
characters - 1.3.6.1.2.1.31.1.1.1.2
indexes:
- labelname: ifIndex
type: gauge
lookups:
- labels:
- ifIndex
labelname: ifAlias
oid: 1.3.6.1.2.1.31.1.1.1.18
type: DisplayString
- labels:
- ifIndex
labelname: ifName
oid: 1.3.6.1.2.1.31.1.1.1.1
type: DisplayString
- name: ifInBroadcastPkts
oid: 1.3.6.1.2.1.31.1.1.1.3
type: counter
help: The total number of broadcast packets received on the interface, including framing
characters - 1.3.6.1.2.1.31.1.1.1.3
indexes:
- labelname: ifIndex
type: gauge
lookups:
- labels:
- ifIndex
labelname: ifAlias
oid: 1.3.6.1.2.1.31.1.1.1.18
type: DisplayString
- labels:
- ifIndex
labelname: ifName
oid: 1.3.6.1.2.1.31.1.1.1.1
type: DisplayString
- name: ifOutMulticastPkts
oid: 1.3.6.1.2.1.31.1.1.1.4
type: counter
help: The total number of multicast packets transmitted out of the interface, including framing
characters - 1.3.6.1.2.1.31.1.1.1.4
indexes:
- labelname: ifIndex
type: gauge
lookups:
- labels:
- ifIndex
labelname: ifAlias
oid: 1.3.6.1.2.1.31.1.1.1.18
type: DisplayString
- labels:
- ifIndex
labelname: ifName
oid: 1.3.6.1.2.1.31.1.1.1.1
type: DisplayString
- name: ifOutBroadcastPkts
oid: 1.3.6.1.2.1.31.1.1.1.5
type: counter
help: The total number of broadcast packets transmitted out of the interface, including framing
characters - 1.3.6.1.2.1.31.1.1.1.5
indexes:
- labelname: ifIndex
type: gauge
lookups:
- labels:
- ifIndex
labelname: ifAlias
oid: 1.3.6.1.2.1.31.1.1.1.18
type: DisplayString
- labels:
- ifIndex
labelname: ifName
oid: 1.3.6.1.2.1.31.1.1.1.1
type: DisplayString
- name: ifHCInUcastPkts
oid: 1.3.6.1.2.1.31.1.1.1.7
type: counter
help: The total number of unicast packets received on the interface, including framing
characters - 1.3.6.1.2.1.31.1.1.1.7
indexes:
- labelname: ifIndex
type: gauge
lookups:
- labels:
- ifIndex
labelname: ifAlias
oid: 1.3.6.1.2.1.31.1.1.1.18
type: DisplayString
- labels:
- ifIndex
labelname: ifName
oid: 1.3.6.1.2.1.31.1.1.1.1
type: DisplayString
- name: ifHCOutUcastPkts
oid: 1.3.6.1.2.1.31.1.1.1.11
type: counter
help: The total number of Unicast packets transmitted out of the interface, including framing
characters - 1.3.6.1.2.1.31.1.1.1.11
indexes:
- labelname: ifIndex
type: gauge
lookups:
- labels:
- ifIndex
labelname: ifAlias
oid: 1.3.6.1.2.1.31.1.1.1.18
type: DisplayString
- labels:
- ifIndex
labelname: ifName
oid: 1.3.6.1.2.1.31.1.1.1.1
type: DisplayString
- name: ifHighSpeed
oid: 1.3.6.1.2.1.31.1.1.1.15
type: gauge
help: An estimate of the interface's current bandwidth in units of 1,000,000 bits
per second - 1.3.6.1.2.1.31.1.1.1.15
indexes:
- labelname: ifIndex
type: gauge
lookups:
- labels:
- ifIndex
labelname: ifAlias
oid: 1.3.6.1.2.1.31.1.1.1.18
type: DisplayString
- labels:
- ifIndex
labelname: ifName
oid: 1.3.6.1.2.1.31.1.1.1.1
type: DisplayString
- name: ifInDiscards
oid: 1.3.6.1.2.1.2.2.1.13
type: counter
help: The number of inbound packets which were chosen to be discarded even though
no errors had been detected to prevent their being deliverable to a higher-layer
protocol - 1.3.6.1.2.1.2.2.1.13
indexes:
- labelname: ifIndex
type: gauge
lookups:
- labels:
- ifIndex
labelname: ifAlias
oid: 1.3.6.1.2.1.31.1.1.1.18
type: DisplayString
- labels:
- ifIndex
labelname: ifName
oid: 1.3.6.1.2.1.31.1.1.1.1
type: DisplayString
- name: ifInErrors
oid: 1.3.6.1.2.1.2.2.1.14
type: counter
help: For packet-oriented interfaces, the number of inbound packets that contained
errors preventing them from being deliverable to a higher-layer protocol - 1.3.6.1.2.1.2.2.1.14
indexes:
- labelname: ifIndex
type: gauge
lookups:
- labels:
- ifIndex
labelname: ifAlias
oid: 1.3.6.1.2.1.31.1.1.1.18
type: DisplayString
- labels:
- ifIndex
labelname: ifName
oid: 1.3.6.1.2.1.31.1.1.1.1
type: DisplayString
- name: ifOutDiscards
oid: 1.3.6.1.2.1.2.2.1.19
type: counter
help: The number of outbound packets which were chosen to be discarded even though
no errors had been detected to prevent their being transmitted - 1.3.6.1.2.1.2.2.1.19
indexes:
- labelname: ifIndex
type: gauge
lookups:
- labels:
- ifIndex
labelname: ifAlias
oid: 1.3.6.1.2.1.31.1.1.1.18
type: DisplayString
- labels:
- ifIndex
labelname: ifName
oid: 1.3.6.1.2.1.31.1.1.1.1
type: DisplayString
- name: ifOutErrors
oid: 1.3.6.1.2.1.2.2.1.20
type: counter
help: For packet-oriented interfaces, the number of outbound packets that could
not be transmitted because of errors - 1.3.6.1.2.1.2.2.1.20
indexes:
- labelname: ifIndex
type: gauge
lookups:
- labels:
- ifIndex
labelname: ifAlias
oid: 1.3.6.1.2.1.31.1.1.1.18
type: DisplayString
- labels:
- ifIndex
labelname: ifName
oid: 1.3.6.1.2.1.31.1.1.1.1
type: DisplayString
- name: ifOperStatus
oid: 1.3.6.1.2.1.2.2.1.8
type: gauge
help: The current operational state of the interface - 1.3.6.1.2.1.2.2.1.8
indexes:
- labelname: ifIndex
type: gauge
lookups:
- labels:
- ifIndex
labelname: ifAlias
oid: 1.3.6.1.2.1.31.1.1.1.18
type: DisplayString
- labels:
- ifIndex
labelname: ifName
oid: 1.3.6.1.2.1.31.1.1.1.1
type: DisplayString
- name: ifAdminStatus
oid: 1.3.6.1.2.1.2.2.1.7
type: gauge
help: The desired state of the interface - 1.3.6.1.2.1.2.2.1.7
indexes:
- labelname: ifIndex
type: gauge
lookups:
- labels:
- ifIndex
labelname: ifAlias
oid: 1.3.6.1.2.1.31.1.1.1.18
type: DisplayString
- labels:
- ifIndex
labelname: ifName
oid: 1.3.6.1.2.1.31.1.1.1.1
type: DisplayString
version: 2
auth:
community: linux
security_level: noAuthNoPriv
auth_protocol: MD5
priv_protocol: DES
#cisco ip sla metrics:
#ipsla_status: 1 success, 4 fail
#ipSlaTag: description in network device's ipsla configuration
network_cisco_ipsla:
walk:
- 1.3.6.1.4.1.9.9.42.1.2.10.1.1
- 1.3.6.1.4.1.9.9.42.1.2.10.1.2
- 1.3.6.1.4.1.9.9.42.1.2.1.1.3
metrics:
- name: ipSlaTag
oid: 1.3.6.1.4.1.9.9.42.1.2.1.1.3
type: DisplayString
help: The tag of ip sla, - 1.3.6.1.4.1.9.9.42.1.2.1.1.3
indexes:
- labelname: ipSlaIndex
type: gauge
- name: ipsla_status
oid: 1.3.6.1.4.1.9.9.42.1.2.10.1.2
type: gauge
indexes:
- labelname: ipSlaIndex
type: gauge
lookups:
- labels:
- ipSlaIndex
labelname: ipSlaTag
oid: 1.3.6.1.4.1.9.9.42.1.2.1.1.3
type: DisplayString
- name: ipsla_rtt
oid: 1.3.6.1.4.1.9.9.42.1.2.10.1.1
type: gauge
help: The rtt of ip sla, - 1.3.6.1.4.1.9.9.42.1.2.10.1.1
indexes:
- labelname: ipSlaIndex
type: gauge
lookups:
- labels:
- ipSlaIndex
labelname: ipSlaTag
oid: 1.3.6.1.4.1.9.9.42.1.2.1.1.3
type: DisplayString
version: 2
auth:
community: linux
security_level: noAuthNoPriv
auth_protocol: MD5
priv_protocol: DES
network_cisco_process:
walk:
- 1.3.6.1.4.1.9.9.109.1.1.1.1.4
- 1.3.6.1.4.1.9.9.48.1.1.1.6.1
- 1.3.6.1.4.1.9.9.48.1.1.1.6.2
- 1.3.6.1.4.1.9.9.48.1.1.1.5.1
- 1.3.6.1.4.1.9.9.48.1.1.1.5.2
metrics:
- name: CpuUsage1min
oid: 1.3.6.1.4.1.9.9.109.1.1.1.1.4
type: gauge
- name: ProcessorFreeMem
oid: 1.3.6.1.4.1.9.9.48.1.1.1.6.1
type: gauge
- name: IOFreeMem
oid: 1.3.6.1.4.1.9.9.48.1.1.1.6.2
type: gauge
- name: ProcessorUsedMem
oid: 1.3.6.1.4.1.9.9.48.1.1.1.5.1
type: gauge
- name: IOUsedMem
oid: 1.3.6.1.4.1.9.9.48.1.1.1.5.2
type: gauge
version: 2
auth:
community: linux
security_level: noAuthNoPriv
auth_protocol: MD5
priv_protocol: DES
添加systemctl服务
vim /etc/systemd/system/snmp_exporter.service
[Unit]
Description=snmp_exporter
After=network.target
[Service]
User=root
Group=root
Type=simple
Restart=on-failure
ExecStart=/usr/local/snmp_exporter/snmp_exporter --config.file=/usr/local/snmp_exporter/snmp.yml
[Install]
WantedBy=multi-user.target
服务自启动
systemctl daemon-reload
systemctl enable snmp_exporter
systemctl start snmp_exporter
四、安装consul,仅第一台安装
wget https://releases.hashicorp.com/consul/1.10.1/consul_1.10.1_linux_amd64.zip
unzip consul_1.10.1_linux_amd64.zip
mkdir /usr/local/consul
mv consul /usr/local/consul/
cd /usr/local/consul
新建配置文件路径和数据路径
mkdir conf
mkdir data
vim consul.pid
2008
vim conf/consul-srv.json
{
"datacenter": "mydc",
"data_dir": "/usr/local/consul/data",
"log_level": "info",
"node_name": "mydc1",
"server": true,
"bind_addr": "0.0.0.0",
"client_addr": "0.0.0.0",
"advertise_addr": "192.168.11.109",
"retry_interval": "30s",
"enable_debug": false,
"rejoin_after_leave": false,
"enable_syslog": true,
"syslog_facility": "local0",
"ui": true
}
vim conf/port.json
{
"ports":{
"http": 8500,
"dns": 8600,
"serf_lan": 8301,
"serf_wan": 8302,
"server": 8300
}
}
添加consul服务
vim /etc/systemd/system/consul.service
[Unit]
Description=Consul
Requires=network-online.target
After=network-online.target
[Service]
User=root
Group=root
Environment=GOMAXPROCS=2
Restart=on-failure
ExecStartPre=/usr/bin/rm -f /usr/local/consul/consul.pid
ExecStartPre=/usr/local/consul/consul validate /usr/local/consul/conf/
ExecStart=/usr/local/consul/consul agent -config-dir=/usr/local/consul/conf/ -pid-file=/usr/local/consul/consul.pid
ExecReload=/bin/kill -HUP $MAINPID
KillSignal=SIGTERM
TimeoutStopSec=5
LimitNOFILE=65535
[Install]
WantedBy=multi-user.target
服务自启动
systemctl daemon-reload
systemctl enable consul
systemctl start consul
http://192.168.11.109:8500/ui/mydc/kv
添加规则rules/router_sla和rules/switch_sla
- name: router_sla_fail
rules:
- alert: router_sla_fail
expr: sum_over_time(ipsla_status{device="router", host=~".+?", job="network_cisco_ipsla"}[1m]) >= 48
for: 1m
labels:
severity: minor
appType: net
annotations:
realvalue: "{{ $value }}"
ruleId: 2
summary: "{{$labels.idc}}:{{$labels.device}}:{{$labels.host}}:{{$labels.ipSlaTag}}:fail"
- name: switch_sla_fail
rules:
- alert: switch_sla_fail
expr: sum_over_time(ipsla_status{device="switch", host=~".+?", job="network_cisco_ipsla"}[1m]) >= 48
for: 1m
labels:
severity: major
appType: net
annotations:
realvalue: "{{ $value }}"
ruleId: 1
summary: "{{$labels.idc}}:{{$labels.device}}:{{$labels.host}}:{{$labels.ipSlaTag}}:fail"
也可以通过curl推送规则到consul
curl -X PUT -d '''
- name: router_sla_fail
rules:
- alert: router_sla_fail
expr: sum_over_time(ipsla_status{device="router", host=~".+?", job="network_cisco_ipsla"}[1m]) >= 48
for: 1m
labels:
severity: minor
appType: net
annotations:
realvalue: "{{ $value }}"
ruleId: 2
summary: "{{$labels.idc}}:{{$labels.device}}:{{$labels.host}}:{{$labels.ipSlaTag}}:fail"
''' http://localhost:8500/v1/kv/rules/router_sla
五、安装consul-template,仅第一台安装
wget https://releases.hashicorp.com/consul-template/0.26.0/consul-template_0.26.0_linux_amd64.zip
unzip consul-template_0.26.0_linux_amd64.zip
mkdir -p /usr/local/consul_template
mv consul-template /usr/local/consul_template/
cd /usr/local/consul_template
mkdir conf
vim conf/config_rule.hcl
consul {
address = "192.168.11.109:8500"
retry {
enabled = true
attempts = 5
backoff = "250ms"
}
}
log_level = "info"
wait {
min = "5s"
max = "10s"
}
template = {
source = "/usr/local/thanos_rule/conf/rule.tpl"
destination = "/usr/local/thanos_rule/conf/rule.rules"
command = "systemctl restart thanos_rule"
command_timeout = "60s"
wait {
min = "2s"
max = "10s"
}
}
添加consul_template服务
vim /etc/systemd/system/consul_template_rule.service
[Unit]
Description = consul_template_rule
Wants = network-online.target
After = network-online.target
[Service]
User = root
Group = root
Restart = on-failure
ExecStart = /usr/local/consul_template/consul-template -config "/usr/local/consul_template/conf/config_rule.hcl"
LimitNOFILE = 65535
[Install]
WantedBy = multi-user.target
服务自启动
systemctl daemon-reload
systemctl enable consul_template_rule
systemctl start consul_template_rule
六、安装prometheus
tar -zxvf prometheus-2.29.0-rc.0.linux-amd64.tar.gz
mv prometheus-2.29.0-rc.0.linux-amd64 /usr/local/prometheus
cd /usr/local/prometheus
vim prometheus.yml
第一台:
global:
scrape_interval: 60s
scrape_timeout: 60s
evaluation_interval: 1m
external_labels:
slave: '192.168.11.109:9090'
idc: 'mydc'
scrape_configs:
- job_name: 'snmp_5s'
consul_sd_configs:
- server: 192.168.11.109:8500
datacenter: mydc
scrape_interval: 5s
scrape_timeout: 5s
metrics_path: /snmp
relabel_configs:
- source_labels: [__address__]
modulus: 2
target_label: __tmp_hash
action: hashmod
- source_labels: [__tmp_hash]
regex: ^1$
action: keep
- source_labels: [__address__]
regex: '(\d+\.\d+\.\d+\.\d+):.*'
replacement: $1
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: 192.168.11.109:9116
- source_labels: [__meta_consul_address]
target_label: ip
- source_labels: [__meta_consul_node]
target_label: host
- source_labels: [__meta_consul_tags]
regex: ',(?:[^,]+,){3}([^=]+)=([^,]+),.*'
replacement: '${2}'
target_label: '${1}'
- source_labels: [job_name]
regex: '(snmp_5s)'
action: keep
- source_labels: [__meta_consul_tags]
regex: ',(?:[^,]+,){0}([^=]+)=([^,]+),.*'
replacement: '${2}'
target_label: '${1}'
- source_labels: [__meta_consul_tags]
regex: ',(?:[^,]+,){1}([^=]+)=([^,]+),.*'
replacement: '${2}'
target_label: '${1}'
- source_labels: [__meta_consul_tags]
regex: ',(?:[^,]+,){2}([^=]+)=([^,]+),.*'
replacement: '${2}'
target_label: '${1}'
- source_labels: [__meta_consul_service]
target_label: job
- source_labels: [__meta_consul_service]
target_label: __param_module
- job_name: 'snmp_10s'
consul_sd_configs:
- server: 192.168.11.109:8500
datacenter: mydc
scrape_interval: 10s
scrape_timeout: 10s
metrics_path: /snmp
relabel_configs:
- source_labels: [__address__]
modulus: 2
target_label: __tmp_hash
action: hashmod
- source_labels: [__tmp_hash]
regex: ^1$
action: keep
- source_labels: [__address__]
regex: '(\d+\.\d+\.\d+\.\d+):.*'
replacement: $1
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: 192.168.11.109:9116
- source_labels: [__meta_consul_address]
target_label: ip
- source_labels: [__meta_consul_node]
target_label: host
- source_labels: [__meta_consul_tags]
regex: ',(?:[^,]+,){3}([^=]+)=([^,]+),.*'
replacement: '${2}'
target_label: '${1}'
- source_labels: [job_name]
regex: '(snmp_10s)'
action: keep
- source_labels: [__meta_consul_tags]
regex: ',(?:[^,]+,){0}([^=]+)=([^,]+),.*'
replacement: '${2}'
target_label: '${1}'
- source_labels: [__meta_consul_tags]
regex: ',(?:[^,]+,){1}([^=]+)=([^,]+),.*'
replacement: '${2}'
target_label: '${1}'
- source_labels: [__meta_consul_tags]
regex: ',(?:[^,]+,){2}([^=]+)=([^,]+),.*'
replacement: '${2}'
target_label: '${1}'
- source_labels: [__meta_consul_service]
target_label: job
- source_labels: [__meta_consul_service]
target_label: __param_module
- job_name: 'snmp_60s'
consul_sd_configs:
- server: 192.168.11.109:8500
datacenter: mydc
scrape_interval: 60s
scrape_timeout: 60s
metrics_path: /snmp
relabel_configs:
- source_labels: [__address__]
modulus: 2
target_label: __tmp_hash
action: hashmod
- source_labels: [__tmp_hash]
regex: ^1$
action: keep
- source_labels: [__address__]
regex: '(\d+\.\d+\.\d+\.\d+):.*'
replacement: $1
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: 192.168.11.109:9116
- source_labels: [__meta_consul_address]
target_label: ip
- source_labels: [__meta_consul_node]
target_label: host
- source_labels: [__meta_consul_tags]
regex: ',(?:[^,]+,){3}([^=]+)=([^,]+),.*'
replacement: '${2}'
target_label: '${1}'
- source_labels: [job_name]
regex: '(snmp_60s)'
action: keep
- source_labels: [__meta_consul_tags]
regex: ',(?:[^,]+,){0}([^=]+)=([^,]+),.*'
replacement: '${2}'
target_label: '${1}'
- source_labels: [__meta_consul_tags]
regex: ',(?:[^,]+,){1}([^=]+)=([^,]+),.*'
replacement: '${2}'
target_label: '${1}'
- source_labels: [__meta_consul_tags]
regex: ',(?:[^,]+,){2}([^=]+)=([^,]+),.*'
replacement: '${2}'
target_label: '${1}'
- source_labels: [__meta_consul_service]
target_label: job
- source_labels: [__meta_consul_service]
target_label: __param_module
- job_name: 'black_5s'
consul_sd_configs:
- server: 192.168.11.109:8500
datacenter: mydc
scrape_interval: 5s
scrape_timeout: 5s
metrics_path: /probe
relabel_configs:
- source_labels: [__address__]
modulus: 2
target_label: __tmp_hash
action: hashmod
- source_labels: [__tmp_hash]
regex: ^1$
action: keep
- source_labels: [__address__]
regex: '(\d+\.\d+\.\d+\.\d+):.*'
replacement: $1
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: 192.168.11.109:9115
- source_labels: [__meta_consul_address]
target_label: ip
- source_labels: [__meta_consul_node]
target_label: host
- source_labels: [__meta_consul_tags]
regex: ',(?:[^,]+,){3}([^=]+)=([^,]+),.*'
replacement: '${2}'
target_label: '${1}'
- source_labels: [job_name]
regex: '(black_5s)'
action: keep
- source_labels: [__meta_consul_tags]
regex: ',(?:[^,]+,){0}([^=]+)=([^,]+),.*'
replacement: '${2}'
target_label: '${1}'
- source_labels: [__meta_consul_tags]
regex: ',(?:[^,]+,){1}([^=]+)=([^,]+),.*'
replacement: '${2}'
target_label: '${1}'
- source_labels: [__meta_consul_tags]
regex: ',(?:[^,]+,){2}([^=]+)=([^,]+),.*'
replacement: '${2}'
target_label: '${1}'
- source_labels: [__meta_consul_service]
target_label: job
- source_labels: [__meta_consul_service]
target_label: __param_module
- job_name: 'black_10s'
consul_sd_configs:
- server: 192.168.11.109:8500
datacenter: mydc
scrape_interval: 10s
scrape_timeout: 10s
metrics_path: /probe
relabel_configs:
- source_labels: [__address__]
modulus: 2
target_label: __tmp_hash
action: hashmod
- source_labels: [__tmp_hash]
regex: ^1$
action: keep
- source_labels: [__address__]
regex: '(\d+\.\d+\.\d+\.\d+):.*'
replacement: $1
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: 192.168.11.109:9115
- source_labels: [__meta_consul_address]
target_label: ip
- source_labels: [__meta_consul_node]
target_label: host
- source_labels: [__meta_consul_tags]
regex: ',(?:[^,]+,){3}([^=]+)=([^,]+),.*'
replacement: '${2}'
target_label: '${1}'
- source_labels: [job_name]
regex: '(black_10s)'
action: keep
- source_labels: [__meta_consul_tags]
regex: ',(?:[^,]+,){0}([^=]+)=([^,]+),.*'
replacement: '${2}'
target_label: '${1}'
- source_labels: [__meta_consul_tags]
regex: ',(?:[^,]+,){1}([^=]+)=([^,]+),.*'
replacement: '${2}'
target_label: '${1}'
- source_labels: [__meta_consul_tags]
regex: ',(?:[^,]+,){2}([^=]+)=([^,]+),.*'
replacement: '${2}'
target_label: '${1}'
- source_labels: [__meta_consul_service]
target_label: job
- source_labels: [__meta_consul_service]
target_label: __param_module
第二台:
global:
scrape_interval: 60s
scrape_timeout: 60s
evaluation_interval: 1m
external_labels:
slave: '192.168.11.121:9090'
idc: 'mydc'
scrape_configs:
- job_name: 'snmp_5s'
consul_sd_configs:
- server: 192.168.11.109:8500
datacenter: mydc
scrape_interval: 5s
scrape_timeout: 5s
metrics_path: /snmp
relabel_configs:
- source_labels: [__address__]
modulus: 2
target_label: __tmp_hash
action: hashmod
- source_labels: [__tmp_hash]
regex: ^0$
action: keep
- source_labels: [__address__]
regex: '(\d+\.\d+\.\d+\.\d+):.*'
replacement: $1
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: 192.168.11.121:9116
- source_labels: [__meta_consul_address]
target_label: ip
- source_labels: [__meta_consul_node]
target_label: host
- source_labels: [__meta_consul_tags]
regex: ',(?:[^,]+,){3}([^=]+)=([^,]+),.*'
replacement: '${2}'
target_label: '${1}'
- source_labels: [job_name]
regex: '(snmp_5s)'
action: keep
- source_labels: [__meta_consul_tags]
regex: ',(?:[^,]+,){0}([^=]+)=([^,]+),.*'
replacement: '${2}'
target_label: '${1}'
- source_labels: [__meta_consul_tags]
regex: ',(?:[^,]+,){1}([^=]+)=([^,]+),.*'
replacement: '${2}'
target_label: '${1}'
- source_labels: [__meta_consul_tags]
regex: ',(?:[^,]+,){2}([^=]+)=([^,]+),.*'
replacement: '${2}'
target_label: '${1}'
- source_labels: [__meta_consul_service]
target_label: job
- source_labels: [__meta_consul_service]
target_label: __param_module
- job_name: 'snmp_10s'
consul_sd_configs:
- server: 192.168.11.109:8500
datacenter: mydc
scrape_interval: 10s
scrape_timeout: 10s
metrics_path: /snmp
relabel_configs:
- source_labels: [__address__]
modulus: 2
target_label: __tmp_hash
action: hashmod
- source_labels: [__tmp_hash]
regex: ^0$
action: keep
- source_labels: [__address__]
regex: '(\d+\.\d+\.\d+\.\d+):.*'
replacement: $1
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: 192.168.11.121:9116
- source_labels: [__meta_consul_address]
target_label: ip
- source_labels: [__meta_consul_node]
target_label: host
- source_labels: [__meta_consul_tags]
regex: ',(?:[^,]+,){3}([^=]+)=([^,]+),.*'
replacement: '${2}'
target_label: '${1}'
- source_labels: [job_name]
regex: '(snmp_10s)'
action: keep
- source_labels: [__meta_consul_tags]
regex: ',(?:[^,]+,){0}([^=]+)=([^,]+),.*'
replacement: '${2}'
target_label: '${1}'
- source_labels: [__meta_consul_tags]
regex: ',(?:[^,]+,){1}([^=]+)=([^,]+),.*'
replacement: '${2}'
target_label: '${1}'
- source_labels: [__meta_consul_tags]
regex: ',(?:[^,]+,){2}([^=]+)=([^,]+),.*'
replacement: '${2}'
target_label: '${1}'
- source_labels: [__meta_consul_service]
target_label: job
- source_labels: [__meta_consul_service]
target_label: __param_module
- job_name: 'snmp_60s'
consul_sd_configs:
- server: 192.168.11.109:8500
datacenter: mydc
scrape_interval: 60s
scrape_timeout: 60s
metrics_path: /snmp
relabel_configs:
- source_labels: [__address__]
modulus: 2
target_label: __tmp_hash
action: hashmod
- source_labels: [__tmp_hash]
regex: ^0$
action: keep
- source_labels: [__address__]
regex: '(\d+\.\d+\.\d+\.\d+):.*'
replacement: $1
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: 192.168.11.121:9116
- source_labels: [__meta_consul_address]
target_label: ip
- source_labels: [__meta_consul_node]
target_label: host
- source_labels: [__meta_consul_tags]
regex: ',(?:[^,]+,){3}([^=]+)=([^,]+),.*'
replacement: '${2}'
target_label: '${1}'
- source_labels: [job_name]
regex: '(snmp_60s)'
action: keep
- source_labels: [__meta_consul_tags]
regex: ',(?:[^,]+,){0}([^=]+)=([^,]+),.*'
replacement: '${2}'
target_label: '${1}'
- source_labels: [__meta_consul_tags]
regex: ',(?:[^,]+,){1}([^=]+)=([^,]+),.*'
replacement: '${2}'
target_label: '${1}'
- source_labels: [__meta_consul_tags]
regex: ',(?:[^,]+,){2}([^=]+)=([^,]+),.*'
replacement: '${2}'
target_label: '${1}'
- source_labels: [__meta_consul_service]
target_label: job
- source_labels: [__meta_consul_service]
target_label: __param_module
- job_name: 'black_5s'
consul_sd_configs:
- server: 192.168.11.109:8500
datacenter: mydc
scrape_interval: 5s
scrape_timeout: 5s
metrics_path: /probe
relabel_configs:
- source_labels: [__address__]
modulus: 2
target_label: __tmp_hash
action: hashmod
- source_labels: [__tmp_hash]
regex: ^0$
action: keep
- source_labels: [__address__]
regex: '(\d+\.\d+\.\d+\.\d+):.*'
replacement: $1
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: 192.168.11.121:9115
- source_labels: [__meta_consul_address]
target_label: ip
- source_labels: [__meta_consul_node]
target_label: host
- source_labels: [__meta_consul_tags]
regex: ',(?:[^,]+,){3}([^=]+)=([^,]+),.*'
replacement: '${2}'
target_label: '${1}'
- source_labels: [job_name]
regex: '(black_5s)'
action: keep
- source_labels: [__meta_consul_tags]
regex: ',(?:[^,]+,){0}([^=]+)=([^,]+),.*'
replacement: '${2}'
target_label: '${1}'
- source_labels: [__meta_consul_tags]
regex: ',(?:[^,]+,){1}([^=]+)=([^,]+),.*'
replacement: '${2}'
target_label: '${1}'
- source_labels: [__meta_consul_tags]
regex: ',(?:[^,]+,){2}([^=]+)=([^,]+),.*'
replacement: '${2}'
target_label: '${1}'
- source_labels: [__meta_consul_service]
target_label: job
- source_labels: [__meta_consul_service]
target_label: __param_module
- job_name: 'black_10s'
consul_sd_configs:
- server: 192.168.11.109:8500
datacenter: mydc
scrape_interval: 10s
scrape_timeout: 10s
metrics_path: /probe
relabel_configs:
- source_labels: [__address__]
modulus: 2
target_label: __tmp_hash
action: hashmod
- source_labels: [__tmp_hash]
regex: ^0$
action: keep
- source_labels: [__address__]
regex: '(\d+\.\d+\.\d+\.\d+):.*'
replacement: $1
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: 192.168.11.121:9115
- source_labels: [__meta_consul_address]
target_label: ip
- source_labels: [__meta_consul_node]
target_label: host
- source_labels: [__meta_consul_tags]
regex: ',(?:[^,]+,){3}([^=]+)=([^,]+),.*'
replacement: '${2}'
target_label: '${1}'
- source_labels: [job_name]
regex: '(black_10s)'
action: keep
- source_labels: [__meta_consul_tags]
regex: ',(?:[^,]+,){0}([^=]+)=([^,]+),.*'
replacement: '${2}'
target_label: '${1}'
- source_labels: [__meta_consul_tags]
regex: ',(?:[^,]+,){1}([^=]+)=([^,]+),.*'
replacement: '${2}'
target_label: '${1}'
- source_labels: [__meta_consul_tags]
regex: ',(?:[^,]+,){2}([^=]+)=([^,]+),.*'
replacement: '${2}'
target_label: '${1}'
- source_labels: [__meta_consul_service]
target_label: job
- source_labels: [__meta_consul_service]
target_label: __param_module
添加systemctl服务,存储数据7天
vim /etc/systemd/system/prometheus.service
[Unit]
Description=Prometheus
Wants=network-online.target
After=network-online.target
[Service]
User=root
Group=root
Restart=on-failure
ExecStartPre=/usr/local/prometheus/promtool check config /usr/local/prometheus/prometheus.yml
ExecStart=/usr/local/prometheus/prometheus --config.file /usr/local/prometheus/prometheus.yml --storage.tsdb.path /usr/local/prometheus/data --web.enable-lifecycle --storage.tsdb.retention.time=7d
ExecReload=/usr/bin/curl -X POST localhost:9090/-/reload
[Install]
WantedBy=multi-user.target
服务自启动
systemctl daemon-reload
systemctl enable prometheus
systemctl start prometheus
七、安装thanos
tar -zxvf thanos-0.22.0-rc.0.linux-amd64.tar.gz
mv thanos-0.22.0-rc.0.linux-amd64 /usr/local/thanos
添加thanos sidecar服务(两台)
vim /etc/systemd/system/thanos_sidecar.service
第一台:
[Unit]
Description=thanos_sidecar
[Service]
Type=simple
User=root
ExecStart=/usr/local/thanos/thanos sidecar --log.level=info --grpc-address=192.168.11.109:10901 --http-address=192.168.11.109:10902 --prometheus.url=http://192.168.11.109:9090 --tsdb.path=/usr/local/prometheus/data
[Install]
WantedBy=multi-user.target
第二台:
[Unit]
Description=thanos_sidecar
[Service]
Type=simple
User=root
ExecStart=/usr/local/thanos/thanos sidecar --log.level=info --grpc-address=192.168.11.121:10901 --http-address=192.168.11.121:10902 --prometheus.url=http://192.168.11.121:9090 --tsdb.path=/usr/local/prometheus/data
[Install]
WantedBy=multi-user.target
启动服务
systemctl daemon-reload
systemctl enable thanos_sidecar
systemctl start thanos_sidecar
添加thanos query服务(第一台)
vim /etc/systemd/system/thanos_query.service
[Unit]
Description=thanos_query
[Service]
Type=simple
User=root
ExecStart=/usr/local/thanos/thanos query --query.auto-downsampling --store=192.168.11.109:10901 --store=192.168.11.121:10901 --grpc-address=192.168.11.109:20901 --http-address=192.168.11.109:80
[Install]
WantedBy=multi-user.target
启动服务
systemctl daemon-reload
systemctl enable thanos_query
systemctl start thanos_query
添加thanos rule服务(第一台)
mkdir -p /usr/local/thanos_rule/{conf,data}
cd /usr/local/thanos_rule
vim conf/rule.tpl
groups:
{{ range ls "rules"}}
{{ .Value}}
{{ end }}
vim /etc/systemd/system/thanos_rule.service
[Unit]
Description = thanos_rule
Wants = network-online.target
After = network-online.target
[Service]
User = root
Group = root
Restart = on-failure
ExecStart = /usr/local/thanos/thanos rule --data-dir "/usr/local/thanos_rule/data" --rule-file "/usr/local/thanos_rule/conf/*.rules" --alert.query-url "http://192.168.11.109" --query "192.168.11.109:80" --alertmanagers.url "http://192.168.11.109:9093" --log.level "info" --grpc-address "192.168.11.109:10911" --http-address "192.168.11.109:10912" --eval-interval "60s"
LimitNOFILE = 65535
[Install]
WantedBy = multi-user.target
启动服务
systemctl daemon-reload
systemctl enable thanos_rule
systemctl start thanos_rule
八、思科路由器配置sla和snmp,192.168.11.134是另一台pc
sla:
ip sla 1
icmp-echo 192.168.11.134 source-ip 192.168.11.106
tag sla1
frequency 5
ip sla schedule 1 life forever start-time now
snmp:
snmp-server community linux RO
九、添加设备和需要启用的监控模板到consul
curl --request PUT --data '{"node": "RT4", "address": "192.168.11.106","service": { "service": "network_if_table", "tags": ["idc=mydc","device=router","owner=zc","job_name=snmp_60s"]}}' http://192.168.11.109:8500/v1/catalog/register
curl --request PUT --data '{"node": "RT4", "address": "192.168.11.106","service": { "service": "network_cisco_ipsla", "tags": ["idc=mydc","device=router","owner=zc","job_name=snmp_5s"]}}' http://192.168.11.109:8500/v1/catalog/register
curl --request PUT --data '{"node": "RT4", "address": "192.168.11.106","service": { "service": "network_cisco_process", "tags": ["idc=mydc","device=router","owner=zc","job_name=snmp_5s"]}}' http://192.168.11.109:8500/v1/catalog/register
curl --request PUT --data '{"node": "RT4", "address": "192.168.11.106","service": { "service": "network_ping", "tags": ["idc=mydc","device=router","owner=zc","job_name=black_5s"]}}' http://192.168.11.109:8500/v1/catalog/register
如果要删除设备node
curl --request PUT --data '{ "node": "RT4", "address": "192.168.11.106"}' http://192.168.11.109:8500/v1/catalog/deregister
十、安装grafana
wget https://dl.grafana.com/enterprise/release/grafana-enterprise-6.2.5-1.x86_64.rpm
yum install grafana-enterprise-6.2.5-1.x86_64.rpm
systemctl enable grafana-server
systemctl start grafana-server
添加数据源(类型选择Prometheus,名称填mydc,需与下面Dashboard中的datasource一致;URL填thanos query的地址http://192.168.11.109:80)
添加Dashboard,在json model中导入下面代码
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
},
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"id": 3,
"iteration": 1659508685852,
"links": [],
"panels": [
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "mydc",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 0,
"fillGradient": 0,
"gridPos": {
"h": 7,
"w": 24,
"x": 0,
"y": 0
},
"hiddenSeries": false,
"id": 1,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.2.1",
"pointradius": 1,
"points": true,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "irate(ifHCInOctets{host=\"$HOST\",ifName=\"$SWITCHPORT\",job=\"network_if_table\"}[5m])*8",
"format": "time_series",
"interval": "60s",
"intervalFactor": 2,
"legendFormat": "{{host}}-{{ifName}}-inBound",
"refId": "A",
"refld": "A"
},
{
"expr": "irate(ifHCOutOctets{host=\"$HOST\",ifName=\"$SWITCHPORT\",job=\"network_if_table\"}[5m])*8",
"format": "time_series",
"interval": "60s",
"intervalFactor": 2,
"legendFormat": "{{host}}-{{ifName}}-outBound",
"refId": "B",
"refld": "B"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Port Traffic",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "bps",
"label": null,
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "mydc",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 0,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 7
},
"hiddenSeries": false,
"id": 6,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.2.1",
"pointradius": 1,
"points": true,
"renderer": "flot",
"repeat": "SWITCHPORT",
"repeatDirection": "h",
"scopedVars": {
"SWITCHPORT": {
"selected": true,
"text": "Et0/0",
"value": "Et0/0"
}
},
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "irate(ifHCInUcastPkts{host=\"$HOST\",ifName=\"$SWITCHPORT\",job=\"network_if_table\"}[5m])",
"format": "time_series",
"instant": false,
"interval": "60s",
"intervalFactor": 2,
"legendFormat": "{{host}}-{{ifName}}-UcastIN",
"refId": "A",
"refld": "A"
},
{
"expr": "irate(ifHCOutUcastPkts{host=\"$HOST\",ifName=\"$SWITCHPORT\",job=\"network_if_table\"}[5m])",
"format": "time_series",
"interval": "60s",
"intervalFactor": 2,
"legendFormat": "{{host}}-{{ifName}}-UcastOUT",
"refId": "B",
"refld": "B"
},
{
"expr": "irate(ifInMulticastPkts{host=\"$HOST\",ifName=\"$SWITCHPORT\",job=\"network_if_table\"}[5m])",
"format": "time_series",
"interval": "60s",
"intervalFactor": 2,
"legendFormat": "{{host}}-{{ifName}}-MulticastIN",
"refId": "C",
"refld": "C"
},
{
"expr": "irate(ifOutMulticastPkts{host=\"$HOST\",ifName=\"$SWITCHPORT\",job=\"network_if_table\"}[5m])",
"format": "time_series",
"interval": "60s",
"intervalFactor": 2,
"legendFormat": "{{host}}-{{ifName}}-MulticastOUT",
"refId": "D",
"refld": "D"
},
{
"expr": "irate(ifInBroadcastPkts{host=\"$HOST\",ifName=\"$SWITCHPORT\",job=\"network_if_table\"}[5m])",
"format": "time_series",
"interval": "60s",
"intervalFactor": 2,
"legendFormat": "{{host}}-{{ifName}}-BroadcastIN",
"refId": "E",
"refld": "E"
},
{
"expr": "irate(ifOutBroadcastPkts{host=\"$HOST\",ifName=\"$SWITCHPORT\",job=\"network_if_table\"}[5m])",
"format": "time_series",
"interval": "60s",
"intervalFactor": 2,
"legendFormat": "{{host}}-{{ifName}}-BroadcastOUT",
"refId": "F",
"refld": "F"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Port Packets",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "pps",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "mydc",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 0,
"fillGradient": 0,
"gridPos": {
"h": 7,
"w": 24,
"x": 0,
"y": 15
},
"hiddenSeries": false,
"id": 2,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.2.1",
"pointradius": 1,
"points": true,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "irate(ifInDiscards{host=\"$HOST\",ifName=\"$SWITCHPORT\",job=\"network_if_table\"}[5m])",
"format": "time_series",
"interval": "60s",
"intervalFactor": 2,
"legendFormat": "{{host}}-{{ifName}}-DiscardsIN",
"refId": "A",
"refld": "A"
},
{
"expr": "irate(ifOutDiscards{host=\"$HOST\",ifName=\"$SWITCHPORT\",job=\"network_if_table\"}[5m])",
"format": "time_series",
"instant": false,
"interval": "60s",
"intervalFactor": 2,
"legendFormat": "{{host}}-{{ifName}}-DiscardOUT",
"refId": "B",
"refld": "B"
},
{
"expr": "irate(ifInErrors{host=\"$HOST\",ifName=\"$SWITCHPORT\",job=\"network_if_table\"}[5m])",
"format": "time_series",
"interval": "60s",
"intervalFactor": 2,
"legendFormat": "{{host}}-{{ifName}}-ErrorsIN",
"refId": "C",
"refld": "C"
},
{
"expr": "irate(ifOutErrors{host=\"$HOST\",ifName=\"$SWITCHPORT\",job=\"network_if_table\"}[5m])",
"format": "time_series",
"interval": "60s",
"intervalFactor": 2,
"legendFormat": "{{host}}-{{ifName}}-ErrorsOUT",
"refId": "D",
"refld": "D"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Port Errors",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "pps",
"label": null,
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "mydc",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 0,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 24,
"x": 0,
"y": 22
},
"hiddenSeries": false,
"id": 8,
"interval": "",
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.2.1",
"pointradius": 1,
"points": true,
"renderer": "flot",
"seriesOverrides": [
{
"alias": "ifOperStatus",
"yaxis": 2
},
{
"alias": "ifOperStatus: 1 up 2 down 3 testing",
"yaxis": 2
},
{
"alias": "ifStatus:1 up / 0,2 down",
"yaxis": 2
}
],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "ifHighSpeed{host=\"$HOST\",ifName=\"$SWITCHPORT\",job=\"network_if_table\"}",
"formal": "time_series",
"format": "time_series",
"interval": "60s",
"intervalFactor": 1,
"legendFormat": "ifAlias: {{ifAlias}}",
"refId": "A",
"refld": "A"
},
{
"expr": "ifOperStatus{host=\"$HOST\",ifName=\"$SWITCHPORT\",job=\"network_if_table\"}",
"format": "time_series",
"interval": "60s",
"intervalFactor": 1,
"legendFormat": "ifStatus:1 up / 0,2 down",
"refId": "B",
"refld": "B"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Port Speed / Port Status",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"decimals": 3,
"format": "Mbits",
"label": "Speed",
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"decimals": 0,
"format": "short",
"label": "ifStatus",
"logBase": 1,
"max": "5",
"min": "0",
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "mydc",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 0,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 31
},
"hiddenSeries": false,
"id": 10,
"interval": "5s",
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.2.1",
"pointradius": 1,
"points": true,
"renderer": "flot",
"seriesOverrides": [
{
"alias": "ipSlaStatus: 1 up / 0,4 down",
"yaxis": 2
}
],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "ipsla_rtt{host=\"$HOST\",ipSlaTag=\"$SLA\"}",
"format": "time_series",
"interval": "5s",
"intervalFactor": 1,
"legendFormat": "ipSlaRtt: {{ipSlaTag}}",
"refId": "A",
"refld": "A"
},
{
"expr": "ipsla_status{host=\"$HOST\",ipSlaTag =\"$SLA\"}",
"format": "time_series",
"interval": "5s",
"intervalFactor": 1,
"legendFormat": "ipSlaStatus: 1 up / 0,4 down",
"refId": "B",
"refld": "B"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "ipSlaRtt / ipSlaStatus",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "ms",
"label": "ipSlaRtt",
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"decimals": 0,
"format": "short",
"label": "ipSlaStatus",
"logBase": 1,
"max": "5",
"min": "0",
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "mydc",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 0,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 39
},
"hiddenSeries": false,
"id": 4,
"legend": {
"alignAsTable": false,
"avg": false,
"current": true,
"max": false,
"min": false,
"rightside": false,
"show": true,
"sort": "current",
"sortDesc": false,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.2.1",
"pointradius": 1,
"points": true,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "CpuUsage1min{PhysicalName=~\"Board.*|RPU *\",host=~\"$HOST\",job=\"network_router_h3c\"} or CpuUsage1min{host=~\"$HOST\",job=\"network_cisco_process\"}",
"format": "time_series",
"interval": "60s",
"intervalFactor": 1,
"legendFormat": "{{host}}: CpuUsage",
"refId": "A",
"refld": "A"
},
{
"expr": "ProcessorUsedMem{host=~\"$HOST\",job=\"network_cisco_process\"} / (ProcessorUsedMem{host=~\"$HOST\",job=\"network_cisco_process\"} + ProcessorFreeMem{host=~\"$HOST\",job=\"network_cisco_process\"}) * 100",
"format": "time_series",
"interval": "60s",
"intervalFactor": 1,
"legendFormat": "{{host}}: ProcessorMemUsage",
"refId": "O"
},
{
"expr": "IOUsedMem{host=~\"$HOST\",job=\"network_cisco_process\"} / (IOUsedMem{host=~\"$HOST\",job=\"network_cisco_process\"} + IOFreeMem{host=~\"$HOST\",job=\"network_cisco_process\"}) * 100",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{host}}: IOMemUsage",
"refId": "P"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "CPU/MEM/Session",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"decimals": 0,
"format": "percent",
"label": "CPU / MEM",
"logBase": 1,
"max": "100",
"min": "0",
"show": true
},
{
"format": "none",
"label": "SESSION",
"logBase": 1,
"max": null,
"min": "0",
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "mydc",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 0,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 39
},
"hiddenSeries": false,
"id": 11,
"interval": "5s",
"legend": {
"alignAsTable": false,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"sort": "current",
"sortDesc": false,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.2.1",
"pointradius": 1,
"points": true,
"renderer": "flot",
"seriesOverrides": [
{
"alias": "pingStatus: 1 up / 0 down",
"yaxis": 2
}
],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "probe_success{host=\"$HOST\",job=\"network_ping\"}",
"format": "time_series",
"interval": "5s",
"intervalFactor": 1,
"legendFormat": "pingStatus: 1 up / 0 down",
"refId": "A",
"refld": "A"
},
{
"expr": "probe_icmp_duration_seconds{host=\"$HOST\",job=\"network_ping\",phase=\"rtt\"}",
"format": "time_series",
"interval": "5s",
"intervalFactor": 1,
"legendFormat": "pingRtt",
"refId": "B",
"refld": "B"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "pingRtt / pingStatus",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"decimals": null,
"format": "s",
"label": "pingRtt",
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"decimals": 0,
"format": "short",
"label": "pingStatus",
"logBase": 1,
"max": "5",
"min": "0",
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
}
],
"refresh": false,
"schemaVersion": 18,
"style": "dark",
"tags": [],
"templating": {
"list": [
{
"allValue": null,
"current": {
"selected": true,
"text": "RT4",
"value": "RT4"
},
"datasource": "mydc",
"definition": "up{owner='zc'}",
"hide": 0,
"includeAll": false,
"label": null,
"multi": true,
"name": "HOST",
"options": [],
"query": "up{owner='zc'}",
"refresh": 1,
"regex": "/host=\"(.*?)\"/",
"skipUrlSync": false,
"sort": 0,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": null,
"current": {
"selected": true,
"text": "Et0/0",
"value": "Et0/0"
},
"datasource": "mydc",
"definition": "ifHighSpeed{host=~\"$HOST\",owner=\"zc\"}",
"hide": 0,
"includeAll": false,
"label": null,
"multi": true,
"name": "SWITCHPORT",
"options": [],
"query": "ifHighSpeed{host=~\"$HOST\",owner=\"zc\"}",
"refresh": 1,
"regex": "/ifName=\"(.*?)\"/",
"skipUrlSync": false,
"sort": 0,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": null,
"current": {
"selected": true,
"text": "sla1",
"value": "sla1"
},
"datasource": "mydc",
"definition": "ipSlaTag{host=~\"$HOST\",owner=\"zc\"}",
"hide": 0,
"includeAll": false,
"label": null,
"multi": true,
"name": "SLA",
"options": [],
"query": "ipSlaTag{host=~\"$HOST\",owner=\"zc\"}",
"refresh": 1,
"regex": "/ipSlaTag=\"(.*?)\"/",
"skipUrlSync": false,
"sort": 0,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
}
]
},
"time": {
"from": "now-2h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "",
"title": "mydc",
"uid": "GmRQdHkmz",
"version": 59
}
十一、安装alertmanager
tar -zxvf alertmanager-0.21.0.linux-amd64.tar.gz
mv alertmanager-0.21.0.linux-amd64 /usr/local/alertmanager
cd /usr/local/alertmanager
mkdir -p {data,template}
vim alertmanager.yml
global:
  resolve_timeout: 5m
  smtp_from: 2********@qq.com
  smtp_auth_username: 2********
  smtp_auth_password: 1111111111
  smtp_require_tls: false
  smtp_smarthost: smtp.qq.com:25
templates:
  - "/usr/local/alertmanager/template/alertmanager.tmpl"
route:
  group_by: ['instance']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 30m
  receiver: 'all.mail'
  routes:
    - receiver: 'router.mail'
      group_wait: 10s
      match:
        device: router
    - receiver: 'switch.mail'
      group_wait: 10s
      match:
        device: switch
receivers:
  - name: 'router.mail'
    email_configs:
      - to: 2********@qq.com
        send_resolved: true
  - name: 'switch.mail'
    email_configs:
      - to: '2********@163.com,e***********@sina.com'
        send_resolved: true
  - name: 'all.mail'
    email_configs:
      - to: e***********@sina.com
        send_resolved: true
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'major'
    equal: ['instance']
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'minor'
    equal: ['instance']
  - source_match:
      severity: 'major'
    target_match:
      severity: 'minor'
    equal: ['instance']
vim template/alertmanager.tmpl
{{ define "__text_alert_list" }}
{{ range . }}
告警名称:{{ .Labels.alertname }}
<br>
告警级别:{{ .Labels.severity }}
<br>
主机地址: {{ .Labels.instance }}
<br>
主机名称: {{ .Labels.host }}
<br>
告警描述: {{ .Annotations.summary }}
<br>
触发时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
<br>
-----------
<br>
{{ end }}
{{ end }}
{{ define "__text_resolve_list" }}
{{ range . }}
告警名称:{{ .Labels.alertname }}
<br>
告警级别:{{ .Labels.severity }}
<br>
主机地址: {{ .Labels.instance }}
<br>
主机名称: {{ .Labels.host }}
<br>
告警描述: {{ .Annotations.summary }}
<br>
触发时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
<br>
恢复时间: {{ .EndsAt.Format "2006-01-02 15:04:05" }}
<br>
-----------
<br>
{{ end }}
{{ end }}
{{ define "email.default.html" }}
{{ if gt (len .Alerts.Firing) 0 -}}
告警列表:
<br>
------------------------
<br>
{{ template "__text_alert_list" .Alerts.Firing }}
<br>
<br>
{{- end }}
{{ if gt (len .Alerts.Resolved) 0 -}}
恢复列表:
<br>
------------------------
<br>
{{ template "__text_resolve_list" .Alerts.Resolved }}
<br>
<br>
{{- end }}
{{- end }}
添加服务
vim /etc/systemd/system/alertmanager.service
[Unit]
Description=alertmanager
After=network.target
#StartLimitIntervalSec=0
[Service]
Type=simple
User=root
ExecStart=/usr/local/alertmanager/alertmanager --storage.path="/usr/local/alertmanager/data/" --config.file=/usr/local/alertmanager/alertmanager.yml --web.external-url=http://192.168.11.109:9093
Restart=always
RestartSec=1
# Restart=on-failure
[Install]
WantedBy=multi-user.target
启动服务
systemctl daemon-reload
systemctl enable alertmanager
systemctl start alertmanager
告警邮件示范:
告警列表:
------------------------
告警名称:router_sla_fail
告警级别:minor
主机地址: 192.168.11.106
主机名称: RT4
告警描述: mydc:router:RT4:sla2:fail
触发时间: 2022-08-03 07:44:34
-----------
恢复邮件示范:
恢复列表:
------------------------
告警名称:router_sla_fail
告警级别:minor
主机地址: 192.168.11.106
主机名称: RT4
告警描述: mydc:router:RT4:sla2:fail
触发时间: 2022-08-03 07:44:34
恢复时间: 2022-08-03 08:00:34
-----------
thanos_rule的告警通知报文,需要抓包才能看见
{
"labels": {
"alertname": "ip_sla_fail",
"appType": "net",
"device": "router",
"host": "RT4",
"idc": "mydc",
"instance": "192.168.11.106",
"ip": "192.168.11.106",
"ipSlaIndex": "1",
"ipSlaTag": "sla1",
"job": "network_cisco_ipsla",
"severity": "minor",
"slave": "192.168.11.109:9090"
},
"annotations": {
"realvalue": "12",
"ruleId": "73",
"summary": "mydc:router:RT4:sla1:fail"
},
"startsAt": "2021-07-23T02:48:47.924941435Z",
"endsAt": "2021-07-23T04:06:47.924941435Z",
"generatorURL": "192.168.11.109/graph?g0.expr=sum_over_time%28ipsla_status%7Bdevice%3D%22router%22%2Chost%3D~%22.%2B%3F%22%2Cjob%3D%22network_cisco_ipsla%22%7D%5B1m%5D%29+%3C%3D+48\u0026g0.tab=1"
}
标签:__,target,草稿,consul,labels,label,source,prometheus,监控 来源: https://www.cnblogs.com/choujin/p/16552809.html