其他分享
首页 > 其他分享> > zabbix4.0监控gpu

zabbix4.0监控gpu

作者:互联网

zabbix监控gpu

安装监控工具

yum install munin --nogpgcheck -y


1.编写获取gpu参数的脚本

# vim /usr/local/zabbix_agents/scripts/get_gpus_info.sh

#!/bin/bash

result=$(/usr/bin/nvidia-smi -L | sed 's/^GPU \([0-9]*\):.*(UUID: \(.*\))$/,{"{#GPUINDEX}":"\1","{#GPUUUID}":"\2"}/g')

first=1

echo "{"
echo "\"data\":["

for line in ${result[@]}
do
  if [ "$first" == "1" ]; then
    echo ${line:1}
    first=0
  else
    echo -n $line
  fi
done

echo 
echo "]"
echo "}"

2.添加监控项

UserParameter=gpu.number,/usr/bin/nvidia-smi -L | /usr/bin/wc -l
UserParameter=gpu.discovery,/usr/local/zabbix_agents_3.2.0/scripts/get_gpus_info.sh
UserParameter=gpu.fanspeed[*],/usr/bin/nvidia-smi --query-gpu=fan.speed --format=csv,noheader,nounits -i $1 | tr -d "\n"
UserParameter=gpu.power[*],/usr/bin/nvidia-smi --query-gpu=power.draw --format=csv,noheader,nounits -i $1 | tr -d "\n"
UserParameter=gpu.temp[*],/usr/bin/nvidia-smi --query-gpu=temperature.gpu --format=csv,noheader,nounits -i $1 | tr -d "\n"
UserParameter=gpu.utilization[*],/usr/bin/nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits -i $1 | tr -d "\n"
UserParameter=gpu.memfree[*],/usr/bin/nvidia-smi --query-gpu=memory.free --format=csv,noheader,nounits -i $1 | tr -d "\n"
UserParameter=gpu.memused[*],/usr/bin/nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i $1 | tr -d "\n"
UserParameter=gpu.memtotal[*],/usr/bin/nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits -i $1 | tr -d "\n"

3.添加模板

zbx_nvidia-smi-multi-gpu.xml

<?xml version="1.0" encoding="UTF-8"?>
<zabbix_export>
    <version>3.0</version>
    <date>2018-06-05T20:56:12Z</date>
    <groups>
        <group>
            <name>Templates</name>
        </group>
    </groups>
    <templates>
        <template>
            <template>Template Nvidia GPUs Performance</template>
            <name>Template Nvidia GPUs Performance</name>
            <description/>
            <groups>
                <group>
                    <name>Templates</name>
                </group>
            </groups>
            <applications>
                <application>
                    <name>Nvidia</name>
                </application>
            </applications>
            <items>
                <item>
                    <name>Number of GPUs</name>
                    <type>0</type>
                    <snmp_community/>
                    <multiplier>0</multiplier>
                    <snmp_oid/>
                    <key>gpu.number</key>
                    <delay>30</delay>
                    <history>90</history>
                    <trends>365</trends>
                    <status>0</status>
                    <value_type>0</value_type>
                    <allowed_hosts/>
                    <units/>
                    <delta>0</delta>
                    <snmpv3_contextname/>
                    <snmpv3_securityname/>
                    <snmpv3_securitylevel>0</snmpv3_securitylevel>
                    <snmpv3_authprotocol>0</snmpv3_authprotocol>
                    <snmpv3_authpassphrase/>
                    <snmpv3_privprotocol>0</snmpv3_privprotocol>
                    <snmpv3_privpassphrase/>
                    <formula>1</formula>
                    <delay_flex/>
                    <params/>
                    <ipmi_sensor/>
                    <data_type>0</data_type>
                    <authtype>0</authtype>
                    <username/>
                    <password/>
                    <publickey/>
                    <privatekey/>
                    <port/>
                    <description>The number of GPUs present on this system.</description>
                    <inventory_link>0</inventory_link>
                    <applications>
                        <application>
                            <name>Nvidia</name>
                        </application>
                    </applications>
                    <valuemap/>
                    <logtimefmt/>
                </item>
            </items>
            <discovery_rules>
                <discovery_rule>
                    <name>GPU discovery</name>
                    <type>0</type>
                    <snmp_community/>
                    <snmp_oid/>
                    <key>gpu.discovery</key>
                    <delay>600</delay>
                    <status>0</status>
                    <allowed_hosts/>
                    <snmpv3_contextname/>
                    <snmpv3_securityname/>
                    <snmpv3_securitylevel>0</snmpv3_securitylevel>
                    <snmpv3_authprotocol>0</snmpv3_authprotocol>
                    <snmpv3_authpassphrase/>
                    <snmpv3_privprotocol>0</snmpv3_privprotocol>
                    <snmpv3_privpassphrase/>
                    <delay_flex/>
                    <params/>
                    <ipmi_sensor/>
                    <authtype>0</authtype>
                    <username/>
                    <password/>
                    <publickey/>
                    <privatekey/>
                    <port/>
                    <filter>
                        <evaltype>0</evaltype>
                        <formula/>
                        <conditions/>
                    </filter>
                    <lifetime>30</lifetime>
                    <description>Discovery of graphics cards.</description>
                    <item_prototypes>
                        <item_prototype>
                            <name>GPU $1 Fan Speed</name>
                            <type>0</type>
                            <snmp_community/>
                            <multiplier>1</multiplier>
                            <snmp_oid/>
                            <key>gpu.fanspeed[{#GPUINDEX}]</key>
                            <delay>60</delay>
                            <history>7</history>
                            <trends>365</trends>
                            <status>0</status>
                            <value_type>3</value_type>
                            <allowed_hosts/>
                            <units>%</units>
                            <delta>0</delta>
                            <snmpv3_contextname/>
                            <snmpv3_securityname/>
                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
                            <snmpv3_authpassphrase/>
                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
                            <snmpv3_privpassphrase/>
                            <formula>1</formula>
                            <delay_flex/>
                            <params/>
                            <ipmi_sensor/>
                            <data_type>0</data_type>
                            <authtype>0</authtype>
                            <username/>
                            <password/>
                            <publickey/>
                            <privatekey/>
                            <port/>
                            <description/>
                            <inventory_link>0</inventory_link>
                            <applications>
                                <application>
                                    <name>Nvidia</name>
                                </application>
                            </applications>
                            <valuemap/>
                            <logtimefmt/>
                            <application_prototypes/>
                        </item_prototype>
                        <item_prototype>
                            <name>GPU $1 Memory Free</name>
                            <type>0</type>
                            <snmp_community/>
                            <multiplier>0</multiplier>
                            <snmp_oid/>
                            <key>gpu.memfree[{#GPUINDEX}]</key>
                            <delay>60</delay>
                            <history>7</history>
                            <trends>365</trends>
                            <status>0</status>
                            <value_type>3</value_type>
                            <allowed_hosts/>
                            <units>MB</units>
                            <delta>0</delta>
                            <snmpv3_contextname/>
                            <snmpv3_securityname/>
                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
                            <snmpv3_authpassphrase/>
                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
                            <snmpv3_privpassphrase/>
                            <formula>1</formula>
                            <delay_flex/>
                            <params/>
                            <ipmi_sensor/>
                            <data_type>0</data_type>
                            <authtype>0</authtype>
                            <username/>
                            <password/>
                            <publickey/>
                            <privatekey/>
                            <port/>
                            <description/>
                            <inventory_link>0</inventory_link>
                            <applications>
                                <application>
                                    <name>Nvidia</name>
                                </application>
                            </applications>
                            <valuemap/>
                            <logtimefmt/>
                            <application_prototypes/>
                        </item_prototype>
                        <item_prototype>
                            <name>GPU $1 Memory Total</name>
                            <type>0</type>
                            <snmp_community/>
                            <multiplier>0</multiplier>
                            <snmp_oid/>
                            <key>gpu.memtotal[{#GPUINDEX}]</key>
                            <delay>60</delay>
                            <history>7</history>
                            <trends>365</trends>
                            <status>0</status>
                            <value_type>3</value_type>
                            <allowed_hosts/>
                            <units>MB</units>
                            <delta>0</delta>
                            <snmpv3_contextname/>
                            <snmpv3_securityname/>
                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
                            <snmpv3_authpassphrase/>
                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
                            <snmpv3_privpassphrase/>
                            <formula>1</formula>
                            <delay_flex/>
                            <params/>
                            <ipmi_sensor/>
                            <data_type>0</data_type>
                            <authtype>0</authtype>
                            <username/>
                            <password/>
                            <publickey/>
                            <privatekey/>
                            <port/>
                            <description/>
                            <inventory_link>0</inventory_link>
                            <applications>
                                <application>
                                    <name>Nvidia</name>
                                </application>
                            </applications>
                            <valuemap/>
                            <logtimefmt/>
                            <application_prototypes/>
                        </item_prototype>
                        <item_prototype>
                            <name>GPU $1 Memory Used</name>
                            <type>0</type>
                            <snmp_community/>
                            <multiplier>0</multiplier>
                            <snmp_oid/>
                            <key>gpu.memused[{#GPUINDEX}]</key>
                            <delay>60</delay>
                            <history>7</history>
                            <trends>365</trends>
                            <status>0</status>
                            <value_type>3</value_type>
                            <allowed_hosts/>
                            <units>MB</units>
                            <delta>0</delta>
                            <snmpv3_contextname/>
                            <snmpv3_securityname/>
                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
                            <snmpv3_authpassphrase/>
                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
                            <snmpv3_privpassphrase/>
                            <formula>1</formula>
                            <delay_flex/>
                            <params/>
                            <ipmi_sensor/>
                            <data_type>0</data_type>
                            <authtype>0</authtype>
                            <username/>
                            <password/>
                            <publickey/>
                            <privatekey/>
                            <port/>
                            <description/>
                            <inventory_link>0</inventory_link>
                            <applications>
                                <application>
                                    <name>Nvidia</name>
                                </application>
                            </applications>
                            <valuemap/>
                            <logtimefmt/>
                            <application_prototypes/>
                        </item_prototype>
                        <item_prototype>
                            <name>GPU $1 Power in decaWatts</name>
                            <type>0</type>
                            <snmp_community/>
                            <multiplier>1</multiplier>
                            <snmp_oid/>
                            <key>gpu.power[{#GPUINDEX}]</key>
                            <delay>60</delay>
                            <history>7</history>
                            <trends>365</trends>
                            <status>0</status>
                            <value_type>0</value_type>
                            <allowed_hosts/>
                            <units>dW</units>
                            <delta>0</delta>
                            <snmpv3_contextname/>
                            <snmpv3_securityname/>
                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
                            <snmpv3_authpassphrase/>
                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
                            <snmpv3_privpassphrase/>
                            <formula>0.1</formula>
                            <delay_flex/>
                            <params/>
                            <ipmi_sensor/>
                            <data_type>0</data_type>
                            <authtype>0</authtype>
                            <username/>
                            <password/>
                            <publickey/>
                            <privatekey/>
                            <port/>
                            <description/>
                            <inventory_link>0</inventory_link>
                            <applications>
                                <application>
                                    <name>Nvidia</name>
                                </application>
                            </applications>
                            <valuemap/>
                            <logtimefmt/>
                            <application_prototypes/>
                        </item_prototype>
                        <item_prototype>
                            <name>GPU $1 Temperature</name>
                            <type>0</type>
                            <snmp_community/>
                            <multiplier>0</multiplier>
                            <snmp_oid/>
                            <key>gpu.temp[{#GPUINDEX}]</key>
                            <delay>60</delay>
                            <history>7</history>
                            <trends>365</trends>
                            <status>0</status>
                            <value_type>0</value_type>
                            <allowed_hosts/>
                            <units>C</units>
                            <delta>0</delta>
                            <snmpv3_contextname/>
                            <snmpv3_securityname/>
                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
                            <snmpv3_authpassphrase/>
                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
                            <snmpv3_privpassphrase/>
                            <formula>1</formula>
                            <delay_flex/>
                            <params/>
                            <ipmi_sensor/>
                            <data_type>0</data_type>
                            <authtype>0</authtype>
                            <username/>
                            <password/>
                            <publickey/>
                            <privatekey/>
                            <port/>
                            <description/>
                            <inventory_link>0</inventory_link>
                            <applications>
                                <application>
                                    <name>Nvidia</name>
                                </application>
                            </applications>
                            <valuemap/>
                            <logtimefmt/>
                            <application_prototypes/>
                        </item_prototype>
                        <item_prototype>
                            <name>GPU $1 Utilization</name>
                            <type>0</type>
                            <snmp_community/>
                            <multiplier>0</multiplier>
                            <snmp_oid/>
                            <key>gpu.utilization[{#GPUINDEX}]</key>
                            <delay>60</delay>
                            <history>7</history>
                            <trends>365</trends>
                            <status>0</status>
                            <value_type>3</value_type>
                            <allowed_hosts/>
                            <units>%</units>
                            <delta>0</delta>
                            <snmpv3_contextname/>
                            <snmpv3_securityname/>
                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
                            <snmpv3_authpassphrase/>
                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
                            <snmpv3_privpassphrase/>
                            <formula>1</formula>
                            <delay_flex/>
                            <params/>
                            <ipmi_sensor/>
                            <data_type>0</data_type>
                            <authtype>0</authtype>
                            <username/>
                            <password/>
                            <publickey/>
                            <privatekey/>
                            <port/>
                            <description/>
                            <inventory_link>0</inventory_link>
                            <applications>
                                <application>
                                    <name>Nvidia</name>
                                </application>
                            </applications>
                            <valuemap/>
                            <logtimefmt/>
                            <application_prototypes/>
                        </item_prototype>
                    </item_prototypes>
                    <trigger_prototypes>
                        <trigger_prototype>
                            <expression>{Template Nvidia GPUs Performance:gpu.temp[{#GPUINDEX}].last()}&gt;80</expression>
                            <name>GPU {#GPUINDEX} Temperature is extremely high</name>
                            <url/>
                            <status>0</status>
                            <priority>5</priority>
                            <description>A GPU's temperature is getting extremely high!</description>
                            <type>0</type>
                            <dependencies/>
                        </trigger_prototype>
                        <trigger_prototype>
                            <expression>{Template Nvidia GPUs Performance:gpu.temp[{#GPUINDEX}].last()}&gt;70</expression>
                            <name>GPU {#GPUINDEX} Temperature is high</name>
                            <url/>
                            <status>0</status>
                            <priority>2</priority>
                            <description>A GPU's temperature is getting high!</description>
                            <type>0</type>
                            <dependencies>
                                <dependency>
                                    <name>GPU {#GPUINDEX} Temperature is very high</name>
                                    <expression>{Template Nvidia GPUs Performance:gpu.temp[{#GPUINDEX}].last()}&gt;75</expression>
                                </dependency>
                            </dependencies>
                        </trigger_prototype>
                        <trigger_prototype>
                            <expression>{Template Nvidia GPUs Performance:gpu.temp[{#GPUINDEX}].last()}&gt;75</expression>
                            <name>GPU {#GPUINDEX} Temperature is very high</name>
                            <url/>
                            <status>0</status>
                            <priority>4</priority>
                            <description>A GPU's temperature is getting very high!</description>
                            <type>0</type>
                            <dependencies>
                                <dependency>
                                    <name>GPU {#GPUINDEX} Temperature is extremely high</name>
                                    <expression>{Template Nvidia GPUs Performance:gpu.temp[{#GPUINDEX}].last()}&gt;80</expression>
                                </dependency>
                            </dependencies>
                        </trigger_prototype>
                    </trigger_prototypes>
                    <graph_prototypes>
                        <graph_prototype>
                            <name>GPU {#GPUINDEX} Memory</name>
                            <width>900</width>
                            <height>200</height>
                            <yaxismin>0.0000</yaxismin>
                            <yaxismax>100.0000</yaxismax>
                            <show_work_period>1</show_work_period>
                            <show_triggers>1</show_triggers>
                            <type>0</type>
                            <show_legend>1</show_legend>
                            <show_3d>0</show_3d>
                            <percent_left>0.0000</percent_left>
                            <percent_right>0.0000</percent_right>
                            <ymin_type_1>0</ymin_type_1>
                            <ymax_type_1>0</ymax_type_1>
                            <ymin_item_1>0</ymin_item_1>
                            <ymax_item_1>0</ymax_item_1>
                            <graph_items>
                                <graph_item>
                                    <sortorder>0</sortorder>
                                    <drawtype>0</drawtype>
                                    <color>00AA00</color>
                                    <yaxisside>0</yaxisside>
                                    <calc_fnc>2</calc_fnc>
                                    <type>0</type>
                                    <item>
                                        <host>Template Nvidia GPUs Performance</host>
                                        <key>gpu.memfree[{#GPUINDEX}]</key>
                                    </item>
                                </graph_item>
                                <graph_item>
                                    <sortorder>1</sortorder>
                                    <drawtype>0</drawtype>
                                    <color>0000DD</color>
                                    <yaxisside>0</yaxisside>
                                    <calc_fnc>2</calc_fnc>
                                    <type>0</type>
                                    <item>
                                        <host>Template Nvidia GPUs Performance</host>
                                        <key>gpu.memused[{#GPUINDEX}]</key>
                                    </item>
                                </graph_item>
                            </graph_items>
                        </graph_prototype>
                        <graph_prototype>
                            <name>GPU {#GPUINDEX} Temperature, Fan Speed and Power</name>
                            <width>900</width>
                            <height>200</height>
                            <yaxismin>0.0000</yaxismin>
                            <yaxismax>100.0000</yaxismax>
                            <show_work_period>1</show_work_period>
                            <show_triggers>1</show_triggers>
                            <type>0</type>
                            <show_legend>1</show_legend>
                            <show_3d>0</show_3d>
                            <percent_left>0.0000</percent_left>
                            <percent_right>0.0000</percent_right>
                            <ymin_type_1>0</ymin_type_1>
                            <ymax_type_1>0</ymax_type_1>
                            <ymin_item_1>0</ymin_item_1>
                            <ymax_item_1>0</ymax_item_1>
                            <graph_items>
                                <graph_item>
                                    <sortorder>0</sortorder>
                                    <drawtype>0</drawtype>
                                    <color>1A7C11</color>
                                    <yaxisside>0</yaxisside>
                                    <calc_fnc>2</calc_fnc>
                                    <type>0</type>
                                    <item>
                                        <host>Template Nvidia GPUs Performance</host>
                                        <key>gpu.power[{#GPUINDEX}]</key>
                                    </item>
                                </graph_item>
                                <graph_item>
                                    <sortorder>1</sortorder>
                                    <drawtype>0</drawtype>
                                    <color>2774A4</color>
                                    <yaxisside>0</yaxisside>
                                    <calc_fnc>2</calc_fnc>
                                    <type>0</type>
                                    <item>
                                        <host>Template Nvidia GPUs Performance</host>
                                        <key>gpu.fanspeed[{#GPUINDEX}]</key>
                                    </item>
                                </graph_item>
                                <graph_item>
                                    <sortorder>2</sortorder>
                                    <drawtype>0</drawtype>
                                    <color>F63100</color>
                                    <yaxisside>0</yaxisside>
                                    <calc_fnc>2</calc_fnc>
                                    <type>0</type>
                                    <item>
                                        <host>Template Nvidia GPUs Performance</host>
                                        <key>gpu.temp[{#GPUINDEX}]</key>
                                    </item>
                                </graph_item>
                            </graph_items>
                        </graph_prototype>
                        <graph_prototype>
                            <name>GPU {#GPUINDEX} Utilization</name>
                            <width>900</width>
                            <height>200</height>
                            <yaxismin>0.0000</yaxismin>
                            <yaxismax>100.0000</yaxismax>
                            <show_work_period>1</show_work_period>
                            <show_triggers>1</show_triggers>
                            <type>0</type>
                            <show_legend>1</show_legend>
                            <show_3d>0</show_3d>
                            <percent_left>0.0000</percent_left>
                            <percent_right>0.0000</percent_right>
                            <ymin_type_1>0</ymin_type_1>
                            <ymax_type_1>0</ymax_type_1>
                            <ymin_item_1>0</ymin_item_1>
                            <ymax_item_1>0</ymax_item_1>
                            <graph_items>
                                <graph_item>
                                    <sortorder>0</sortorder>
                                    <drawtype>0</drawtype>
                                    <color>2774A4</color>
                                    <yaxisside>0</yaxisside>
                                    <calc_fnc>2</calc_fnc>
                                    <type>0</type>
                                    <item>
                                        <host>Template Nvidia GPUs Performance</host>
                                        <key>gpu.utilization[{#GPUINDEX}]</key>
                                    </item>
                                </graph_item>
                            </graph_items>
                        </graph_prototype>
                    </graph_prototypes>
                    <host_prototypes/>
                </discovery_rule>
            </discovery_rules>
            <macros/>
            <templates/>
            <screens/>
        </template>
    </templates>
</zabbix_export>

参考:https://github.com/zhangyuteng/zabbix-nvidia-smi-multi-gpu

标签:--,Nvidia,GPUs,监控,GPU,gpu,zabbix4.0,GPUINDEX
来源: https://www.cnblogs.com/reblue520/p/14662988.html