src/gpu_nvml.c

   1 /*
   2 Copyright 2018 Evgeny Naumov
   3
   4 Permission is hereby granted, free of charge, to any person obtaining a copy of
   5 this software and associated documentation files (the "Software"), to deal in
   6 the Software without restriction, including without limitation the rights to
   7 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
   8 of the Software, and to permit persons to whom the Software is furnished to do
   9 so, subject to the following conditions:
  10
  11 The above copyright notice and this permission notice shall be included in all
  12 copies or substantial portions of the Software.
  13
  14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  20 SOFTWARE.
  21 */
  22
  23 #include "daemon/collectd.h"
  24 #include "daemon/common.h"
  25 #include "daemon/plugin.h"
  26
  27 #include <nvml.h>
  28 #include <stdint.h>
  29 #include <stdio.h>
  30
  31 #define MAX_DEVNAME_LEN 256
  32 #define PLUGIN_NAME "gpu_nvml"
  33
  34 static nvmlReturn_t nv_status = NVML_SUCCESS;
  35 static char *nv_errline = "";
  36
  37 #define TRY_CATCH(f, catch)                                                    \
  38   if ((nv_status = f) != NVML_SUCCESS) {                                       \
  39     nv_errline = #f;                                                           \
  40     goto catch;                                                                \
  41   }
  42
  43 #define TRY_CATCH_OPTIONAL(f, catch)                                           \
  44   if ((nv_status = f) != NVML_SUCCESS &&                                       \
  45       nv_status != NVML_ERROR_NOT_SUPPORTED) {                                 \
  46     nv_errline = #f;                                                           \
  47     goto catch;                                                                \
  48   }
  49
  50 #define TRY(f) TRY_CATCH(f, catch)
  51 #define TRYOPT(f) TRY_CATCH_OPTIONAL(f, catch)
  52
  53 #define KEY_GPUINDEX "GPUIndex"
  54 #define KEY_IGNORESELECTED "IgnoreSelected"
  55
  56 static const char *config_keys[] = {
  57     KEY_GPUINDEX,
  58     KEY_IGNORESELECTED,
  59 };
  60 static const unsigned int n_config_keys = STATIC_ARRAY_SIZE(config_keys);
  61
  62 // This is a bitflag, necessitating the (extremely conservative) assumption
  63 // that there are no more than 64 GPUs on this system.
  64 static uint64_t conf_match_mask = 0;
  65 static bool conf_mask_is_exclude = 0;
  66
  67 static int nvml_config(const char *key, const char *value) {
  68
  69   char *eptr;
  70
  71   if (strcasecmp(key, KEY_GPUINDEX) == 0) {
  72     unsigned long device_ix = strtoul(value, &eptr, 10);
  73     if (eptr == value) {
  74       ERROR(PLUGIN_NAME ": Failed to parse GPUIndex value \"%s\"", value);
  75       return -1;
  76     }
  77     if (device_ix >= 64) {
  78       ERROR(PLUGIN_NAME
  79             ": At most 64 GPUs (0 <= GPUIndex < 64) are supported!");
  80       return -2;
  81     }
  82     conf_match_mask |= (1 << device_ix);
  83   } else if (strcasecmp(key, KEY_IGNORESELECTED)) {
  84     conf_mask_is_exclude = IS_TRUE(value);
  85   } else {
  86     ERROR(PLUGIN_NAME ": Unrecognized config option %s", key);
  87     return -10;
  88   }
  89
  90   return 0;
  91 }
  92
  93 static int nvml_init(void) {
  94   TRY(nvmlInit());
  95   return 0;
  96
  97   catch : ERROR(PLUGIN_NAME ": NVML init failed with %d", nv_status);
  98   return -1;
  99 }
 100
 101 static int nvml_shutdown(void) {
 102   TRY(nvmlShutdown())
 103   return 0;
 104
 105   catch : ERROR(PLUGIN_NAME ": NVML shutdown failed with %d", nv_status);
 106   return -1;
 107 }
 108
 109 static void nvml_submit(const char *plugin_instance, const char *type,
 110                         const char *type_instance, gauge_t nvml) {
 111
 112   value_list_t vl = VALUE_LIST_INIT;
 113
 114   vl.values = &(value_t){.gauge = nvml};
 115   vl.values_len = 1;
 116
 117   sstrncpy(vl.plugin, PLUGIN_NAME, sizeof(vl.plugin));
 118   sstrncpy(vl.plugin_instance, plugin_instance, sizeof(vl.plugin_instance));
 119
 120   sstrncpy(vl.type, type, sizeof(vl.type));
 121
 122   if (type_instance != NULL) {
 123     sstrncpy(vl.type_instance, type_instance, sizeof(vl.type_instance));
 124   }
 125
 126   plugin_dispatch_values(&vl);
 127 }
 128
 129 static int nvml_read(void) {
 130
 131   unsigned int device_count;
 132   TRY_CATCH(nvmlDeviceGetCount(&device_count), catch_nocount);
 133
 134   if (device_count > 64) {
 135     device_count = 64;
 136   }
 137
 138   for (unsigned int ix = 0; ix < device_count; ix++) {
 139
 140     unsigned int is_match =
 141         ((1 << ix) & conf_match_mask) || (conf_match_mask == 0);
 142     if (conf_mask_is_exclude == !!is_match) {
 143       continue;
 144     }
 145
 146     nvmlDevice_t dev;
 147     TRY(nvmlDeviceGetHandleByIndex(ix, &dev));
 148
 149     char dev_name[MAX_DEVNAME_LEN + 1] = {0};
 150     TRY(nvmlDeviceGetName(dev, dev_name, sizeof(dev_name) - 1));
 151
 152     // Try to be as lenient as possible with the variety of devices that are
 153     // out there, ignoring any NOT_SUPPORTED errors gently.
 154     nvmlMemory_t meminfo;
 155     TRYOPT(nvmlDeviceGetMemoryInfo(dev, &meminfo))
 156     if (nv_status == NVML_SUCCESS) {
 157       nvml_submit(dev_name, "memory", "used", meminfo.used);
 158       nvml_submit(dev_name, "memory", "free", meminfo.free);
 159     }
 160
 161     nvmlUtilization_t utilization;
 162     TRYOPT(nvmlDeviceGetUtilizationRates(dev, &utilization))
 163     if (nv_status == NVML_SUCCESS)
 164       nvml_submit(dev_name, "percent", "gpu_used", utilization.gpu);
 165
 166     unsigned int fan_speed;
 167     TRYOPT(nvmlDeviceGetFanSpeed(dev, &fan_speed))
 168     if (nv_status == NVML_SUCCESS)
 169       nvml_submit(dev_name, "fanspeed", NULL, fan_speed);
 170
 171     unsigned int core_temp;
 172     TRYOPT(nvmlDeviceGetTemperature(dev, NVML_TEMPERATURE_GPU, &core_temp))
 173     if (nv_status == NVML_SUCCESS)
 174       nvml_submit(dev_name, "temperature", "core", core_temp);
 175
 176     unsigned int sm_clk_mhz;
 177     TRYOPT(nvmlDeviceGetClockInfo(dev, NVML_CLOCK_SM, &sm_clk_mhz))
 178     if (nv_status == NVML_SUCCESS)
 179       nvml_submit(dev_name, "frequency", "sm", 1e6 * sm_clk_mhz);
 180
 181     unsigned int mem_clk_mhz;
 182     TRYOPT(nvmlDeviceGetClockInfo(dev, NVML_CLOCK_MEM, &mem_clk_mhz))
 183     if (nv_status == NVML_SUCCESS)
 184       nvml_submit(dev_name, "frequency", "mem", 1e6 * mem_clk_mhz);
 185
 186     unsigned int power_mW;
 187     TRYOPT(nvmlDeviceGetPowerUsage(dev, &power_mW))
 188     if (nv_status == NVML_SUCCESS)
 189       nvml_submit(dev_name, "power", NULL, 1e-3 * power_mW);
 190
 191     continue;
 192
 193     // Failures here indicate transient errors or removal of GPU. In either
 194     // case it will either be resolved or the GPU will no longer be enumerated
 195     // the next time round.
 196     catch : WARNING(PLUGIN_NAME
 197                     ": NVML call \"%s\" failed (%d) on dev at index %d!",
 198                     nv_errline, nv_status, ix);
 199     continue;
 200   }
 201
 202   return 0;
 203
 204 // Failures here indicate serious misconfiguration; we bail out totally.
 205 catch_nocount:
 206   ERROR(PLUGIN_NAME ": Failed to enumerate NVIDIA GPUs (\"%s\" returned %d)",
 207         nv_errline, nv_status);
 208   return -1;
 209 }
 210
 211 void module_register(void) {
 212   plugin_register_init(PLUGIN_NAME, nvml_init);
 213   plugin_register_config(PLUGIN_NAME, nvml_config, config_keys, n_config_keys);
 214   plugin_register_read(PLUGIN_NAME, nvml_read);
 215   plugin_register_shutdown(PLUGIN_NAME, nvml_shutdown);
 216 }