src/gpu_nvidia.c

   1 /*
   2 Copyright 2018 Evgeny Naumov
   3
   4 Permission is hereby granted, free of charge, to any person obtaining a copy of
   5 this software and associated documentation files (the "Software"), to deal in
   6 the Software without restriction, including without limitation the rights to
   7 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
   8 of the Software, and to permit persons to whom the Software is furnished to do
   9 so, subject to the following conditions:
  10
  11 The above copyright notice and this permission notice shall be included in all
  12 copies or substantial portions of the Software.
  13
  14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  20 SOFTWARE.
  21 */
  22
  23 #include "daemon/collectd.h"
  24 #include "daemon/plugin.h"
  25 #include "utils/common/common.h"
  26
  27 #include <nvml.h>
  28 #include <stdint.h>
  29 #include <stdio.h>
  30
  31 #define MAX_DEVNAME_LEN 256
  32 #define PLUGIN_NAME "gpu_nvidia"
  33
  34 static nvmlReturn_t nv_status = NVML_SUCCESS;
  35 static char *nv_errline = "";
  36
  37 #define TRY_CATCH(f, catch)                                                    \
  38   if ((nv_status = f) != NVML_SUCCESS) {                                       \
  39     nv_errline = #f;                                                           \
  40     goto catch;                                                                \
  41   }
  42
  43 #define TRY_CATCH_OPTIONAL(f, catch)                                           \
  44   if ((nv_status = f) != NVML_SUCCESS &&                                       \
  45       nv_status != NVML_ERROR_NOT_SUPPORTED) {                                 \
  46     nv_errline = #f;                                                           \
  47     goto catch;                                                                \
  48   }
  49
  50 #define TRY(f) TRY_CATCH(f, catch)
  51 #define TRYOPT(f) TRY_CATCH_OPTIONAL(f, catch)
  52
  53 #define KEY_GPUINDEX "GPUIndex"
  54 #define KEY_IGNORESELECTED "IgnoreSelected"
  55
  56 static const char *config_keys[] = {
  57     KEY_GPUINDEX,
  58     KEY_IGNORESELECTED,
  59 };
  60 static const unsigned int n_config_keys = STATIC_ARRAY_SIZE(config_keys);
  61
  62 // This is a bitflag, necessitating the (extremely conservative) assumption
  63 // that there are no more than 64 GPUs on this system.
  64 static uint64_t conf_match_mask = 0;
  65 static bool conf_mask_is_exclude = 0;
  66
  67 static int nvml_config(const char *key, const char *value) {
  68
  69   if (strcasecmp(key, KEY_GPUINDEX) == 0) {
  70     char *eptr;
  71     unsigned long device_ix = strtoul(value, &eptr, 10);
  72     if (eptr == value) {
  73       ERROR(PLUGIN_NAME ": Failed to parse GPUIndex value \"%s\"", value);
  74       return -1;
  75     }
  76     if (device_ix >= 64) {
  77       ERROR(PLUGIN_NAME
  78             ": At most 64 GPUs (0 <= GPUIndex < 64) are supported!");
  79       return -2;
  80     }
  81     conf_match_mask |= (1 << device_ix);
  82   } else if (strcasecmp(key, KEY_IGNORESELECTED)) {
  83     conf_mask_is_exclude = IS_TRUE(value);
  84   } else {
  85     ERROR(PLUGIN_NAME ": Unrecognized config option %s", key);
  86     return -10;
  87   }
  88
  89   return 0;
  90 }
  91
  92 static int nvml_init(void) {
  93   TRY(nvmlInit());
  94   return 0;
  95
  96   catch : ERROR(PLUGIN_NAME ": NVML init failed with %d", nv_status);
  97   return -1;
  98 }
  99
 100 static int nvml_shutdown(void) {
 101   TRY(nvmlShutdown())
 102   return 0;
 103
 104   catch : ERROR(PLUGIN_NAME ": NVML shutdown failed with %d", nv_status);
 105   return -1;
 106 }
 107
 108 static void nvml_submit_gauge(const char *plugin_instance, const char *type,
 109                               const char *type_instance, gauge_t nvml) {
 110
 111   value_list_t vl = VALUE_LIST_INIT;
 112
 113   vl.values = &(value_t){.gauge = nvml};
 114   vl.values_len = 1;
 115
 116   sstrncpy(vl.plugin, PLUGIN_NAME, sizeof(vl.plugin));
 117   sstrncpy(vl.plugin_instance, plugin_instance, sizeof(vl.plugin_instance));
 118
 119   sstrncpy(vl.type, type, sizeof(vl.type));
 120
 121   if (type_instance != NULL) {
 122     sstrncpy(vl.type_instance, type_instance, sizeof(vl.type_instance));
 123   }
 124
 125   plugin_dispatch_values(&vl);
 126 }
 127
 128 static int nvml_read(void) {
 129
 130   unsigned int device_count;
 131   TRY_CATCH(nvmlDeviceGetCount(&device_count), catch_nocount);
 132
 133   if (device_count > 64) {
 134     device_count = 64;
 135   }
 136
 137   for (unsigned int ix = 0; ix < device_count; ix++) {
 138
 139     unsigned int is_match =
 140         ((1 << ix) & conf_match_mask) || (conf_match_mask == 0);
 141     if (conf_mask_is_exclude == !!is_match) {
 142       continue;
 143     }
 144
 145     nvmlDevice_t dev;
 146     TRY(nvmlDeviceGetHandleByIndex(ix, &dev));
 147
 148     char dev_name[MAX_DEVNAME_LEN + 1] = {0};
 149     TRY(nvmlDeviceGetName(dev, dev_name, sizeof(dev_name) - 1));
 150
 151     // Try to be as lenient as possible with the variety of devices that are
 152     // out there, ignoring any NOT_SUPPORTED errors gently.
 153     nvmlMemory_t meminfo;
 154     TRYOPT(nvmlDeviceGetMemoryInfo(dev, &meminfo))
 155     if (nv_status == NVML_SUCCESS) {
 156       nvml_submit_gauge(dev_name, "memory", "used", meminfo.used);
 157       nvml_submit_gauge(dev_name, "memory", "free", meminfo.free);
 158     }
 159
 160     nvmlUtilization_t utilization;
 161     TRYOPT(nvmlDeviceGetUtilizationRates(dev, &utilization))
 162     if (nv_status == NVML_SUCCESS)
 163       nvml_submit_gauge(dev_name, "percent", "gpu_used", utilization.gpu);
 164
 165     unsigned int fan_speed;
 166     TRYOPT(nvmlDeviceGetFanSpeed(dev, &fan_speed))
 167     if (nv_status == NVML_SUCCESS)
 168       nvml_submit_gauge(dev_name, "fanspeed", NULL, fan_speed);
 169
 170     unsigned int core_temp;
 171     TRYOPT(nvmlDeviceGetTemperature(dev, NVML_TEMPERATURE_GPU, &core_temp))
 172     if (nv_status == NVML_SUCCESS)
 173       nvml_submit_gauge(dev_name, "temperature", "core", core_temp);
 174
 175     unsigned int sm_clk_mhz;
 176     TRYOPT(nvmlDeviceGetClockInfo(dev, NVML_CLOCK_SM, &sm_clk_mhz))
 177     if (nv_status == NVML_SUCCESS)
 178       nvml_submit_gauge(dev_name, "frequency", "multiprocessor",
 179                         1e6 * sm_clk_mhz);
 180
 181     unsigned int mem_clk_mhz;
 182     TRYOPT(nvmlDeviceGetClockInfo(dev, NVML_CLOCK_MEM, &mem_clk_mhz))
 183     if (nv_status == NVML_SUCCESS)
 184       nvml_submit_gauge(dev_name, "frequency", "memory", 1e6 * mem_clk_mhz);
 185
 186     unsigned int power_mW;
 187     TRYOPT(nvmlDeviceGetPowerUsage(dev, &power_mW))
 188     if (nv_status == NVML_SUCCESS)
 189       nvml_submit_gauge(dev_name, "power", NULL, 1e-3 * power_mW);
 190
 191     continue;
 192
 193     // Failures here indicate transient errors or removal of GPU. In either
 194     // case it will either be resolved or the GPU will no longer be enumerated
 195     // the next time round.
 196     catch : WARNING(PLUGIN_NAME
 197                     ": NVML call \"%s\" failed (%d) on dev at index %d!",
 198                     nv_errline, nv_status, ix);
 199     continue;
 200   }
 201
 202   return 0;
 203
 204 // Failures here indicate serious misconfiguration; we bail out totally.
 205 catch_nocount:
 206   ERROR(PLUGIN_NAME ": Failed to enumerate NVIDIA GPUs (\"%s\" returned %d)",
 207         nv_errline, nv_status);
 208   return -1;
 209 }
 210
/*
 * Registers this file's init, config, read and shutdown callbacks under
 * PLUGIN_NAME. The config registration also passes the table of accepted
 * option names (config_keys) and its length.
 */
void module_register(void) {
  plugin_register_init(PLUGIN_NAME, nvml_init);
  plugin_register_config(PLUGIN_NAME, nvml_config, config_keys, n_config_keys);
  plugin_register_read(PLUGIN_NAME, nvml_read);
  plugin_register_shutdown(PLUGIN_NAME, nvml_shutdown);
}