From 654c9bef2ffa9e0c3613ea20d774f679ce2a5b0b Mon Sep 17 00:00:00 2001 From: Evgeny Naumov Date: Mon, 10 Sep 2018 10:32:31 -0400 Subject: [PATCH] add nvml module --- Makefile.am | 8 ++++ README | 7 +++ configure.ac | 53 ++++++++++++++++++ src/collectd.conf.in | 6 ++ src/collectd.conf.pod | 23 ++++++++ src/gpu_nvml.c | 151 ++++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 248 insertions(+) create mode 100644 src/gpu_nvml.c diff --git a/Makefile.am b/Makefile.am index 48a7cb3d..3949101f 100644 --- a/Makefile.am +++ b/Makefile.am @@ -952,6 +952,14 @@ gps_la_LDFLAGS = $(PLUGIN_LDFLAGS) $(BUILD_WITH_LIBGPS_LDFLAGS) gps_la_LIBADD = -lpthread $(BUILD_WITH_LIBGPS_LIBS) endif +if BUILD_PLUGIN_GPU_NVML +pkglib_LTLIBRARIES += gpu_nvml.la +gpu_nvml_la_SOURCES = src/gpu_nvml.c +gpu_nvml_la_CFLAGS = $(AM_CFLAGS) $(BUILD_WITH_CUDA_CFLAGS) +gpu_nvml_la_LDFLAGS = $(PLUGIN_LDFLAGS) $(BUILD_WITH_CUDA_LDFLAGS) +gpu_nvml_la_LIBADD = $(BUILD_WITH_CUDA_LIBS) +endif + if BUILD_PLUGIN_GRPC pkglib_LTLIBRARIES += grpc.la grpc_la_SOURCES = src/grpc.cc diff --git a/README b/README index a5947038..fb10a641 100644 --- a/README +++ b/README @@ -135,6 +135,9 @@ Features - gps Monitor gps related data through gpsd. + - gpu_nvml + Monitor NVIDIA GPU statistics available through NVML. + - hddtemp Hard disk temperatures using hddtempd. @@ -749,6 +752,10 @@ Prerequisites particular. + * CUDA (optional) + Used by the `gpu_nvml' plugin + + * libatasmart (optional) Used by the `smart' plugin. 
diff --git a/configure.ac b/configure.ac index 95caefff..72361439 100644 --- a/configure.ac +++ b/configure.ac @@ -2074,6 +2074,55 @@ if test "x$with_kvm_openfiles" = "xyes"; then with_libkvm="yes" fi +# --with-cuda {{{ +# only CUDA provides the nvml.h header +AC_ARG_WITH([cuda], + [AS_HELP_STRING([--with-cuda@<:@=PREFIX@:>@], [Path to cuda.])], + [ + if test "x$withval" = "xyes"; then + with_cuda="yes" + else if test "x$withval" = "xno"; then + with_cuda="no" + else + with_cuda="yes" + CUDA_CFLAGS="$CUDA_CFLAGS -I$withval/include" + CUDA_LDFLAGS="$CUDA_LDFLAGS -L$withval/lib" + fi; fi + ], + [ with_cuda="yes" + CUDA_CFLAGS="$CUDA_CFLAGS -I/opt/cuda/include" + CUDA_LDFLAGS="$CUDA_LDFLAGS -L/opt/cuda/lib64" + ] +) + +SAVE_CFLAGS="$CFLAGS" +SAVE_LDFLAGS="$LDFLAGS" +CFLAGS="$CFLAGS $CUDA_CFLAGS" +LDFLAGS="$LDFLAGS $CUDA_LDFLAGS" + +if test "x$with_cuda" = "xyes"; then + AC_CHECK_HEADERS([nvml.h], + [with_cuda="yes"], + [with_cuda="no (header file missing)"] + ) +fi + +# Undo the probe flags so they do not leak into later checks; the plugin gets them through BUILD_WITH_CUDA_*. +CFLAGS="$SAVE_CFLAGS" +LDFLAGS="$SAVE_LDFLAGS" + +if test "x$with_cuda" = "xyes"; then + BUILD_WITH_CUDA_CFLAGS="$CUDA_CFLAGS" + BUILD_WITH_CUDA_LDFLAGS="$CUDA_LDFLAGS" + BUILD_WITH_CUDA_LIBS="-lnvidia-ml" +fi + +AC_SUBST([BUILD_WITH_CUDA_CFLAGS]) +AC_SUBST([BUILD_WITH_CUDA_LDFLAGS]) +AC_SUBST([BUILD_WITH_CUDA_LIBS]) + +# }}} + # --with-libaquaero5 {{{ AC_ARG_WITH([libaquaero5], [AS_HELP_STRING([--with-libaquaero5@<:@=PREFIX@:>@], [Path to aquatools-ng source code.])], @@ -6310,6 +6359,7 @@ plugin_ethstat="no" plugin_fhcount="no" plugin_fscache="no" plugin_gps="no" +plugin_gpu_nvml="no" plugin_grpc="no" plugin_hugepages="no" plugin_intel_pmu="no" @@ -6735,6 +6785,7 @@ AC_PLUGIN([filecount], [yes], [Count files in dire AC_PLUGIN([fscache], [$plugin_fscache], [fscache statistics]) AC_PLUGIN([gmond], [$with_libganglia], [Ganglia plugin]) AC_PLUGIN([gps], [$plugin_gps], [GPS plugin]) +AC_PLUGIN([gpu_nvml], [$with_cuda], [NVIDIA 
GPU plugin]) AC_PLUGIN([grpc], [$plugin_grpc], [gRPC plugin]) AC_PLUGIN([hddtemp], [yes], [Query hddtempd]) AC_PLUGIN([hugepages], [$plugin_hugepages], [Hugepages statistics]) @@ -7044,6 +7095,7 @@ AC_MSG_RESULT([ YACC . . . . . . . . $YACC]) AC_MSG_RESULT([ YFLAGS . . . . . . . $YFLAGS]) AC_MSG_RESULT() AC_MSG_RESULT([ Libraries:]) +AC_MSG_RESULT([ cuda . . . . . . . . $with_cuda]) AC_MSG_RESULT([ intel mic . . . . . . $with_mic]) AC_MSG_RESULT([ libaquaero5 . . . . . $with_libaquaero5]) AC_MSG_RESULT([ libatasmart . . . . . $with_libatasmart]) @@ -7157,6 +7209,7 @@ AC_MSG_RESULT([ filecount . . . . . . $enable_filecount]) AC_MSG_RESULT([ fscache . . . . . . . $enable_fscache]) AC_MSG_RESULT([ gmond . . . . . . . . $enable_gmond]) AC_MSG_RESULT([ gps . . . . . . . . . $enable_gps]) +AC_MSG_RESULT([ gpu_nvml . . . . . . $enable_gpu_nvml]) AC_MSG_RESULT([ grpc . . . . . . . . $enable_grpc]) AC_MSG_RESULT([ hddtemp . . . . . . . $enable_hddtemp]) AC_MSG_RESULT([ hugepages . . . . . . $enable_hugepages]) diff --git a/src/collectd.conf.in b/src/collectd.conf.in index af652145..7b6acac8 100644 --- a/src/collectd.conf.in +++ b/src/collectd.conf.in @@ -656,6 +656,12 @@ # PauseConnect 5 #</Plugin> +#<Plugin gpu_nvml> +# GPUIndex 0 +# GPUIndex 2 +# IgnoreSelected false +#</Plugin> + #<Plugin grpc> # <Server "example.com" "50051"> # EnableSSL true diff --git a/src/collectd.conf.pod b/src/collectd.conf.pod index 6e6d6eaf..3ae10af1 100644 --- a/src/collectd.conf.pod +++ b/src/collectd.conf.pod @@ -3206,6 +3206,29 @@ Pause to apply between attempts of connection to gpsd in seconds (default 5 sec) =back +=head2 Plugin C<gpu_nvml> + +Collects various statistics from the system's NVIDIA GPUs using the NVML +library. Currently collected are fan speed, core temperature, percent load, and +percent memory used. + +=over 4 + +=item B<GPUIndex> + +If one or more of these options is specified, only GPUs at that index (as +determined by nvidia-utils through I<nvidia-smi>) have statistics collected. +If no instance of this option is specified, all GPUs are monitored. 
+
+=item B<IgnoreSelected>
+
+If set to true, all detected GPUs B<except> the ones at indices specified by
+B<GPUIndex> entries are collected. For greater clarity, setting IgnoreSelected
+without any GPUIndex directives will result in B<no> statistics being
+collected.
+
+=back
+
 =head2 Plugin C<grpc>
 
 The I<grpc> plugin provides an RPC interface to submit values to or query
diff --git a/src/gpu_nvml.c b/src/gpu_nvml.c
new file mode 100644
index 00000000..9bbcce3c
--- /dev/null
+++ b/src/gpu_nvml.c
@@ -0,0 +1,216 @@
+/**
+ * collectd - src/gpu_nvml.c
+ *
+ * Read plugin reporting per-GPU memory usage, utilization, fan speed and core
+ * temperature as obtained from NVIDIA's NVML library.
+ **/
+
+#include "daemon/collectd.h"
+#include "daemon/common.h"
+#include "daemon/plugin.h"
+
+#include <nvml.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#define MAX_DEVNAME_LEN 256
+#define PLUGIN_NAME "gpu_nvml"
+/* conf_match_mask holds one bit per device index, hence at most 64 GPUs. */
+#define MAX_DEVICES 64
+
+/* Status and source text of the most recent failed NVML call. TRY_CATCH()
+ * fills these in so the handler at the jump target can report them. */
+static nvmlReturn_t nv_status = NVML_SUCCESS;
+static const char *nv_errline = "";
+
+/* Evaluate NVML call `f`; on failure record it and jump to label `catch`.
+ * The do/while(0) wrapper makes the macro usable as a single statement. */
+#define TRY_CATCH(f, catch)                                                    \
+  do {                                                                         \
+    if ((nv_status = (f)) != NVML_SUCCESS) {                                   \
+      nv_errline = #f;                                                         \
+      goto catch;                                                              \
+    }                                                                          \
+  } while (0)
+#define TRY(f) TRY_CATCH(f, catch)
+#define WRAPGAUGE(x) ((value_t){.gauge = (gauge_t)(x)})
+
+static const char *config_keys[] = {
+    "GPUIndex",
+    "IgnoreSelected",
+};
+static const unsigned int n_config_keys = STATIC_ARRAY_SIZE(config_keys);
+
+/* Bit i is set once "GPUIndex i" has been seen; 0 means nothing selected. */
+static uint64_t conf_match_mask = 0;
+/* Set by "IgnoreSelected true": skip the matching GPUs instead of the rest. */
+static bool conf_mask_is_exclude = false;
+
+/*
+ * Configuration callback for the keys listed in config_keys.
+ *
+ * Returns 0 on success, -1 if a GPUIndex value is not a plain decimal number,
+ * -2 if the index does not fit into conf_match_mask and -10 for an unknown
+ * key.
+ */
+static int nvml_config(const char *key, const char *value) {
+  if (strcasecmp(key, config_keys[0]) == 0) {
+    char *eptr;
+    unsigned long device_ix = strtoul(value, &eptr, 10);
+
+    /* Reject empty values and trailing garbage such as "1x". */
+    if (eptr == value || *eptr != '\0') {
+      return -1;
+    }
+    /* Only bits 0..63 exist; shifting a 64-bit value by 64 is undefined. */
+    if (device_ix >= MAX_DEVICES) {
+      return -2;
+    }
+    /* Shift a 64-bit one: a plain `1` is an int and overflows at bit 31. */
+    conf_match_mask |= UINT64_C(1) << device_ix;
+  } else if (strcasecmp(key, config_keys[1]) == 0) {
+    if (IS_TRUE(value)) {
+      conf_mask_is_exclude = true;
+    }
+  } else {
+    return -10;
+  }
+
+  return 0;
+}
+
+/* Init callback: brings up NVML. Returns 0 on success, -1 on failure. */
+static int nvml_init(void) {
+  TRY(nvmlInit());
+  return 0;
+
+catch:
+  ERROR("NVML init failed with %d", nv_status);
+  return -1;
+}
+
+/* Shutdown callback: releases NVML. Returns 0 on success, -1 on failure. */
+static int nvml_shutdown(void) {
+  TRY(nvmlShutdown());
+  return 0;
+
+catch:
+  ERROR("NVML shutdown failed with %d", nv_status);
+  return -1;
+}
+
+/*
+ * Dispatches one value under this plugin's name. `type_instance` may be NULL,
+ * in which case the field keeps the value VALUE_LIST_INIT gave it.
+ */
+static void nvml_submit(const char *plugin_instance, const char *type,
+                        const char *type_instance, value_t nvml) {
+  value_list_t vl = VALUE_LIST_INIT;
+
+  vl.values = &nvml;
+  vl.values_len = 1;
+
+  sstrncpy(vl.plugin, PLUGIN_NAME, sizeof(vl.plugin));
+  sstrncpy(vl.plugin_instance, plugin_instance, sizeof(vl.plugin_instance));
+  sstrncpy(vl.type, type, sizeof(vl.type));
+  if (type_instance != NULL) {
+    sstrncpy(vl.type_instance, type_instance, sizeof(vl.type_instance));
+  }
+
+  plugin_dispatch_values(&vl);
+}
+
+/*
+ * Tells the caller whether the result of one per-metric NVML query may be
+ * used. Failures are logged, except NVML_ERROR_NOT_SUPPORTED: NVML documents
+ * that code for hardware lacking the sensor (e.g. a board without a fan), so
+ * logging it would repeat on every read.
+ */
+static bool nvml_metric_ok(nvmlReturn_t status, const char *call) {
+  if (status == NVML_SUCCESS) {
+    return true;
+  }
+  if (status != NVML_ERROR_NOT_SUPPORTED) {
+    WARNING("NVML call \"%s\" failed with code %d!", call, status);
+  }
+  return false;
+}
+
+/*
+ * Read callback: dispatches the statistics of every selected GPU.
+ *
+ * Each metric is queried on its own, so a failing or unsupported query does
+ * not suppress the other metrics of that device. Returns 0, or -1 when the
+ * devices cannot be enumerated.
+ */
+static int nvml_read(void) {
+  unsigned int device_count;
+  TRY_CATCH(nvmlDeviceGetCount(&device_count), catch_nocount);
+
+  if (device_count > MAX_DEVICES) {
+    device_count = MAX_DEVICES;
+  }
+
+  for (unsigned int ix = 0; ix < device_count; ix++) {
+    /* An empty mask matches every device. */
+    bool is_match =
+        (conf_match_mask == 0) || (((conf_match_mask >> ix) & 1) != 0);
+    if (conf_mask_is_exclude == is_match) {
+      continue;
+    }
+
+    nvmlDevice_t dev;
+    char dev_name[MAX_DEVNAME_LEN + 1] = "";
+    /* "<index>-<name>", so that identical boards get distinct instances. */
+    char instance[MAX_DEVNAME_LEN + 16];
+    nvmlMemory_t meminfo;
+    nvmlUtilization_t utilization;
+    unsigned int fan_speed;
+    unsigned int core_temp;
+
+    TRY(nvmlDeviceGetHandleByIndex(ix, &dev));
+    TRY(nvmlDeviceGetName(dev, dev_name, sizeof(dev_name)));
+    snprintf(instance, sizeof(instance), "%u-%s", ix, dev_name);
+
+    if (nvml_metric_ok(nvmlDeviceGetMemoryInfo(dev, &meminfo),
+                       "nvmlDeviceGetMemoryInfo") &&
+        meminfo.total > 0) { /* a zero total would make the ratio NaN or inf */
+      double pct_mem_used =
+          100. * (double)meminfo.used / (double)meminfo.total;
+      nvml_submit(instance, "percent", "mem_used", WRAPGAUGE(pct_mem_used));
+    }
+    if (nvml_metric_ok(nvmlDeviceGetUtilizationRates(dev, &utilization),
+                       "nvmlDeviceGetUtilizationRates")) {
+      nvml_submit(instance, "percent", "gpu_used", WRAPGAUGE(utilization.gpu));
+    }
+    if (nvml_metric_ok(nvmlDeviceGetFanSpeed(dev, &fan_speed),
+                       "nvmlDeviceGetFanSpeed")) {
+      nvml_submit(instance, "fanspeed", "GPU", WRAPGAUGE(fan_speed));
+    }
+    if (nvml_metric_ok(
+            nvmlDeviceGetTemperature(dev, NVML_TEMPERATURE_GPU, &core_temp),
+            "nvmlDeviceGetTemperature")) {
+      nvml_submit(instance, "temperature", "GPU", WRAPGAUGE(core_temp));
+    }
+    continue;
+
+  catch:
+    WARNING("NVML call \"%s\" failed with code %d!", nv_errline, nv_status);
+    continue;
+  }
+
+  return 0;
+
+catch_nocount:
+  ERROR("Failed to enumerate NVIDIA GPUs (\"%s\" returned %d)", nv_errline,
+        nv_status);
+  return -1;
+}
+
+/* Registers the plugin's callbacks with the daemon. */
+void module_register(void) {
+  plugin_register_init(PLUGIN_NAME, nvml_init);
+  plugin_register_config(PLUGIN_NAME, nvml_config, config_keys, n_config_keys);
+  plugin_register_read(PLUGIN_NAME, nvml_read);
+  plugin_register_shutdown(PLUGIN_NAME, nvml_shutdown);
+}
-- 
2.11.0