From: Florian Forster Date: Wed, 6 Dec 2017 21:24:34 +0000 (+0100) Subject: processes plugin: Implement the "CollectDelayAccounting" option. X-Git-Url: https://git.octo.it/?p=collectd.git;a=commitdiff_plain;h=4ea7a57256f5e4d77f4fff052490b7f67a9a3829 processes plugin: Implement the "CollectDelayAccounting" option. --- diff --git a/Makefile.am b/Makefile.am index def61ed6..69e65fb9 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1433,11 +1433,16 @@ endif if BUILD_PLUGIN_PROCESSES pkglib_LTLIBRARIES += processes.la processes_la_SOURCES = src/processes.c +processes_la_CPPFLAGS = $(AM_CPPFLAGS) processes_la_LDFLAGS = $(PLUGIN_LDFLAGS) processes_la_LIBADD = if BUILD_WITH_LIBKVM_GETPROCS processes_la_LIBADD += -lkvm endif +if HAVE_LIBMNL +processes_la_CPPFLAGS += -DHAVE_LIBTASKSTATS=1 +processes_la_LIBADD += libtaskstats.la +endif endif if BUILD_PLUGIN_PROTOCOLS diff --git a/configure.ac b/configure.ac index e869a6a0..5eae1146 100644 --- a/configure.ac +++ b/configure.ac @@ -3668,6 +3668,7 @@ if test "x$with_libmnl" = "xyes"; then fi AC_SUBST([BUILD_WITH_LIBMNL_CFLAGS]) AC_SUBST([BUILD_WITH_LIBMNL_LIBS]) +AM_CONDITIONAL([HAVE_LIBMNL], [test "x$with_libmnl" = "xyes"]) # }}} # --with-libnetapp {{{ diff --git a/src/collectd.conf.in b/src/collectd.conf.in index 4efa29e4..efbeba83 100644 --- a/src/collectd.conf.in +++ b/src/collectd.conf.in @@ -1200,11 +1200,13 @@ # CollectFileDescriptor true # CollectContextSwitch true # CollectMemoryMaps true +# CollectDelayAccounting false # Process "name" # ProcessMatch "name" "regex" # # CollectFileDescriptor false # CollectContextSwitch false +# CollectDelayAccounting true # # # CollectFileDescriptor false diff --git a/src/collectd.conf.pod b/src/collectd.conf.pod index 0e7a6046..2b2c1e9b 100644 --- a/src/collectd.conf.pod +++ b/src/collectd.conf.pod @@ -6842,22 +6842,25 @@ The statistics collected for matched processes are: - number of memory mapped files (under Linux) - io data (where available) - context switches (under Linux) - - 
minor and major pagefaults. + - minor and major pagefaults + - Delay Accounting information (Linux only, requires libmnl) B - CollectFileDescriptor true - CollectContextSwitch true + CollectFileDescriptor true + CollectContextSwitch true + CollectDelayAccounting false Process "name" ProcessMatch "name" "regex" - CollectFileDescriptor false - CollectContextSwitch false + CollectFileDescriptor false + CollectContextSwitch false + CollectDelayAccounting true CollectFileDescriptor false - CollectContextSwitch true + CollectContextSwitch true @@ -6883,6 +6886,17 @@ I must not contain slashes. Collect the number of context switches for matched processes. Disabled by default. +=item B<CollectDelayAccounting> I<Boolean> + +If enabled, collect Linux Delay Accounting information for matching processes. +Delay Accounting provides the time processes wait for the CPU to become +available, for I/O operations to finish, for pages to be swapped in and for +freed pages to be reclaimed. The metrics are reported as a percentage, e.g. +C<percent-delay-cpu>. Disabled by default. + +This option is only available on Linux, requires the C<libmnl> library and +requires root privileges at runtime. + =item B I Collect number of file descriptors of matched processes. @@ -6896,9 +6910,12 @@ the Linux kernel. =back -Options B and B may be used inside -B and B blocks - then they affect corresponding match -only. Otherwise they set the default value for subsequent matches. +The B<CollectContextSwitch>, B<CollectDelayAccounting>, +B<CollectFileDescriptor> and B<CollectMemoryMaps> options may be used inside +B<Process> and B<ProcessMatch> blocks. When used there, these options affect +reporting the corresponding processes only. Outside of B<Process> and +B<ProcessMatch> blocks these options set the default value for subsequent +matches. 
=head2 Plugin C diff --git a/src/processes.c b/src/processes.c index 30c4954a..30f4e32f 100644 --- a/src/processes.c +++ b/src/processes.c @@ -1,7 +1,7 @@ /** * collectd - src/processes.c * Copyright (C) 2005 Lyonel Vincent - * Copyright (C) 2006-2010 Florian octo Forster + * Copyright (C) 2006-2017 Florian octo Forster * Copyright (C) 2008 Oleg King * Copyright (C) 2009 Sebastian Harl * Copyright (C) 2009 Andrés J. Díaz @@ -41,6 +41,11 @@ #include "common.h" #include "plugin.h" +#if HAVE_LIBTASKSTATS +#include "utils_complain.h" +#include "utils_taskstats.h" +#endif + /* Include header files for the mach system, if they exist.. */ #if HAVE_THREAD_INFO #if HAVE_MACH_MACH_INIT_H @@ -195,6 +200,11 @@ typedef struct process_entry_s { derive_t cswitch_invol; _Bool has_cswitch; +#if HAVE_LIBTASKSTATS + ts_delay_t delay; +#endif + _Bool has_delay; + _Bool has_fd; _Bool has_maps; @@ -221,6 +231,13 @@ typedef struct procstat_entry_s { derive_t cswitch_vol; derive_t cswitch_invol; +#if HAVE_LIBTASKSTATS + value_to_rate_state_t delay_cpu; + value_to_rate_state_t delay_blkio; + value_to_rate_state_t delay_swapin; + value_to_rate_state_t delay_freepages; +#endif + struct procstat_entry_s *next; } procstat_entry_t; @@ -257,9 +274,16 @@ typedef struct procstat { derive_t cswitch_vol; derive_t cswitch_invol; + /* Linux Delay Accounting. Unit is ns/s. 
*/ + gauge_t delay_cpu; + gauge_t delay_blkio; + gauge_t delay_swapin; + gauge_t delay_freepages; + _Bool report_fd_num; _Bool report_maps_num; _Bool report_ctx_switch; + _Bool report_delay; struct procstat *next; struct procstat_entry_s *instances; @@ -271,6 +295,7 @@ static _Bool want_init = 1; static _Bool report_ctx_switch = 0; static _Bool report_fd_num = 0; static _Bool report_maps_num = 0; +static _Bool report_delay = 0; #if HAVE_THREAD_INFO static mach_port_t port_host_self; @@ -304,6 +329,10 @@ int getthrds64(pid_t, void *, int, tid64_t *, int); int getargs(void *processBuffer, int bufferLen, char *argsBuffer, int argsLen); #endif /* HAVE_PROCINFO_H */ +#if HAVE_LIBTASKSTATS +static ts_t *taskstats_handle = NULL; +#endif + /* put name of process from config to list_head_g tree * list_head_g is a list of 'procstat_t' structs with * processes names we want to watch */ @@ -331,6 +360,7 @@ static procstat_t *ps_list_register(const char *name, const char *regexp) { new->report_fd_num = report_fd_num; new->report_maps_num = report_maps_num; new->report_ctx_switch = report_ctx_switch; + new->report_delay = report_delay; #if HAVE_REGEX_H if (regexp != NULL) { @@ -439,6 +469,39 @@ static void ps_update_counter(derive_t *group_counter, derive_t *curr_counter, *group_counter += curr_value; } +#if HAVE_LIBTASKSTATS +static void ps_update_delay_one(gauge_t *out_rate_sum, + value_to_rate_state_t *state, uint64_t cnt, + cdtime_t t) { + gauge_t rate = NAN; + int status = value_to_rate(&rate, (value_t){.counter = (counter_t)cnt}, + DS_TYPE_COUNTER, t, state); + if ((status != 0) || isnan(rate)) { + return; + } + + if (isnan(*out_rate_sum)) { + *out_rate_sum = rate; + } else { + *out_rate_sum += rate; + } +} + +static void ps_update_delay(procstat_t *out, procstat_entry_t *prev, + process_entry_t *curr) { + cdtime_t now = cdtime(); + + ps_update_delay_one(&out->delay_cpu, &prev->delay_cpu, curr->delay.cpu_ns, + now); + ps_update_delay_one(&out->delay_blkio, 
&prev->delay_blkio, + curr->delay.blkio_ns, now); + ps_update_delay_one(&out->delay_swapin, &prev->delay_swapin, + curr->delay.swapin_ns, now); + ps_update_delay_one(&out->delay_freepages, &prev->delay_freepages, + curr->delay.freepages_ns, now); +} +#endif + /* add process entry to 'instances' of process 'name' (or refresh it) */ static void ps_list_add(const char *name, const char *cmdline, process_entry_t *entry) { @@ -518,6 +581,10 @@ static void ps_list_add(const char *name, const char *cmdline, entry->cpu_user_counter); ps_update_counter(&ps->cpu_system_counter, &pse->cpu_system_counter, entry->cpu_system_counter); + +#if HAVE_LIBTASKSTATS + ps_update_delay(ps, pse, entry); +#endif } } @@ -537,6 +604,11 @@ static void ps_list_reset(void) { ps->vmem_code = 0; ps->stack_size = 0; + ps->delay_cpu = NAN; + ps->delay_blkio = NAN; + ps->delay_swapin = NAN; + ps->delay_freepages = NAN; + pse_prev = NULL; pse = ps->instances; while (pse != NULL) { @@ -573,8 +645,15 @@ static void ps_tune_instance(oconfig_item_t *ci, procstat_t *ps) { cf_util_get_boolean(c, &ps->report_fd_num); else if (strcasecmp(c->key, "CollectMemoryMaps") == 0) cf_util_get_boolean(c, &ps->report_maps_num); - else { - ERROR("processes plugin: Option `%s' not allowed here.", c->key); + else if (strcasecmp(c->key, "CollectDelayAccounting") == 0) { +#if HAVE_LIBTASKSTATS + cf_util_get_boolean(c, &ps->report_delay); +#else + WARNING("processes plugin: The plugin has been compiled without support " + "for the \"CollectDelayAccounting\" option."); +#endif + } else { + ERROR("processes plugin: Option `%s' not allowed heeere.", c->key); } } /* for (ci->children) */ } /* void ps_tune_instance */ @@ -633,6 +712,13 @@ static int ps_config(oconfig_item_t *ci) { cf_util_get_boolean(c, &report_fd_num); } else if (strcasecmp(c->key, "CollectMemoryMaps") == 0) { cf_util_get_boolean(c, &report_maps_num); + } else if (strcasecmp(c->key, "CollectDelayAccounting") == 0) { +#if HAVE_LIBTASKSTATS + 
cf_util_get_boolean(c, &report_delay); +#else + WARNING("processes plugin: The plugin has been compiled without support " + "for the \"CollectDelayAccounting\" option."); +#endif } else { ERROR("processes plugin: The `%s' configuration option is not " "understood and will be ignored.", @@ -670,6 +756,15 @@ static int ps_init(void) { #elif KERNEL_LINUX pagesize_g = sysconf(_SC_PAGESIZE); DEBUG("pagesize_g = %li; CONFIG_HZ = %i;", pagesize_g, CONFIG_HZ); + +#if HAVE_LIBTASKSTATS + if (taskstats_handle == NULL) { + taskstats_handle = ts_create(); + if (taskstats_handle == NULL) { + WARNING("processes plugin: Creating taskstats handle failed."); + } + } +#endif /* #endif KERNEL_LINUX */ #elif HAVE_LIBKVM_GETPROCS && \ @@ -804,6 +899,42 @@ static void ps_submit_proc_list(procstat_t *ps) { plugin_dispatch_values(&vl); } + /* The ps->delay_* metrics are in nanoseconds per second. This factor converts + * them to a percentage. */ + gauge_t const delay_factor = 100.0 / 1000000000.0; + + if (!isnan(ps->delay_cpu)) { + sstrncpy(vl.type, "percent", sizeof(vl.type)); + sstrncpy(vl.type_instance, "delay-cpu", sizeof(vl.type_instance)); + vl.values[0].gauge = ps->delay_cpu * delay_factor; + vl.values_len = 1; + plugin_dispatch_values(&vl); + } + + if (!isnan(ps->delay_blkio)) { + sstrncpy(vl.type, "percent", sizeof(vl.type)); + sstrncpy(vl.type_instance, "delay-blkio", sizeof(vl.type_instance)); + vl.values[0].gauge = ps->delay_blkio * delay_factor; + vl.values_len = 1; + plugin_dispatch_values(&vl); + } + + if (!isnan(ps->delay_swapin)) { + sstrncpy(vl.type, "percent", sizeof(vl.type)); + sstrncpy(vl.type_instance, "delay-swapin", sizeof(vl.type_instance)); + vl.values[0].gauge = ps->delay_swapin * delay_factor; + vl.values_len = 1; + plugin_dispatch_values(&vl); + } + + if (!isnan(ps->delay_freepages)) { + sstrncpy(vl.type, "percent", sizeof(vl.type)); + sstrncpy(vl.type_instance, "delay-freepages", sizeof(vl.type_instance)); + vl.values[0].gauge = ps->delay_freepages * 
delay_factor; + vl.values_len = 1; + plugin_dispatch_values(&vl); + } + DEBUG( "name = %s; num_proc = %lu; num_lwp = %lu; num_fd = %lu; num_maps = %lu; " "vmem_size = %lu; vmem_rss = %lu; vmem_data = %lu; " @@ -813,13 +944,16 @@ static void ps_submit_proc_list(procstat_t *ps) { "io_rchar = %" PRIi64 "; io_wchar = %" PRIi64 "; " "io_syscr = %" PRIi64 "; io_syscw = %" PRIi64 "; " "io_diskr = %" PRIi64 "; io_diskw = %" PRIi64 "; " - "cswitch_vol = %" PRIi64 "; cswitch_invol = %" PRIi64 ";", + "cswitch_vol = %" PRIi64 "; cswitch_invol = %" PRIi64 "; " + "delay_cpu = %g; delay_blkio = %g; " + "delay_swapin = %g; delay_freepages = %g;", ps->name, ps->num_proc, ps->num_lwp, ps->num_fd, ps->num_maps, ps->vmem_size, ps->vmem_rss, ps->vmem_data, ps->vmem_code, ps->vmem_minflt_counter, ps->vmem_majflt_counter, ps->cpu_user_counter, ps->cpu_system_counter, ps->io_rchar, ps->io_wchar, ps->io_syscr, ps->io_syscw, ps->io_diskr, ps->io_diskw, ps->cswitch_vol, - ps->cswitch_invol); + ps->cswitch_invol, ps->delay_cpu, ps->delay_blkio, ps->delay_swapin, + ps->delay_freepages); } /* void ps_submit_proc_list */ @@ -1072,6 +1206,33 @@ static int ps_count_fd(int pid) { return (count >= 1) ? count : 1; } /* int ps_count_fd (pid) */ +#if HAVE_LIBTASKSTATS +static int ps_delay(process_entry_t *ps) { + if (taskstats_handle == NULL) { + return ENOTCONN; + } + + int status = ts_delay_by_tgid(taskstats_handle, (uint32_t)ps->id, &ps->delay); + if (status == EPERM) { + static c_complain_t c; + c_complain(LOG_ERR, &c, "processes plugin: reading delay information " + "failed: \"%s\". 
This is probably because the " + "taskstats interface requires root privileges.", + STRERROR(status)); + return status; + } else if (status != 0) { + ERROR("processes plugin: ts_delay_by_tgid failed: %s", STRERROR(status)); + return status; + } + + return 0; +} +#else +static int ps_delay(__attribute__((unused)) process_entry_t *unused) { + return -1; +} +#endif + static void ps_fill_details(const procstat_t *ps, process_entry_t *entry) { if (entry->has_io == 0) { ps_read_io(entry); @@ -1100,8 +1261,17 @@ static void ps_fill_details(const procstat_t *ps, process_entry_t *entry) { } entry->has_fd = 1; } + +#if HAVE_LIBTASKSTATS + if (ps->report_delay && !entry->has_delay) { + if (ps_delay(entry) == 0) { + entry->has_delay = 1; + } + } +#endif } /* void ps_fill_details (...) */ +/* ps_read_process reads process counters on Linux. */ static int ps_read_process(long pid, process_entry_t *ps, char *state) { char filename[64]; char buffer[1024];