From b57fb475c34d4cbc71a0256bff4db961df6077fc Mon Sep 17 00:00:00 2001 From: "Pshyk, SerhiyX" Date: Mon, 19 Sep 2016 14:21:43 +0100 Subject: [PATCH] rdtmon: Implement RDT monitoring plugin The rdtmon plugin collects information provided by monitoring features of Intel Resource Director Technology (Intel(R) RDT) like Cache Monitoring Technology (CMT), Memory Bandwidth Monitoring (MBM). Change-Id: Ie45344c1035c522fcd918a1dd2427a2da2e173bb Signed-off-by: Serhiy Pshyk --- README | 14 + configure.ac | 53 ++++ src/Makefile.am | 8 + src/collectd.conf.in | 5 + src/collectd.conf.pod | 53 ++++ src/rdtmon.c | 694 ++++++++++++++++++++++++++++++++++++++++++++++++++ src/types.db | 3 + 7 files changed, 830 insertions(+) create mode 100644 src/rdtmon.c diff --git a/README b/README index bb72ac2c..250259b9 100644 --- a/README +++ b/README @@ -313,6 +313,15 @@ Features collectd without the need to start a heavy interpreter every interval. See collectd-python(5) for details. + - rdtmon + The rdtmon plugin collects information provided by monitoring features of + Intel Resource Director Technology (Intel(R) RDT) like Cache Monitoring + Technology (CMT), Memory Bandwidth Monitoring (MBM). These features + provide information about utilization of shared resources like last level + cache occupacy, local memory bandwidth usage, remote memory bandwidth + usage, instructions per clock. + + - redis The redis plugin gathers information from a Redis server, including: uptime, used memory, total connections etc. @@ -814,6 +823,11 @@ Prerequisites The PostgreSQL C client library used by the `postgresql' plugin. + * libpqos (optional) + The PQoS library for Intel(R) Resource Director Technology used by the + `rdtmon' plugin. + + * libprotobuf, protoc 3.0+ (optional) Used by the `grpc' plugin to generate service stubs and code to handle network packets of collectd's protobuf-based network protocol. diff --git a/configure.ac b/configure.ac index 31c842af..a74864b8 100644 --- a/configure.ac +++ b/configure.ac @@ -4098,6 +4098,55 @@ fi AM_CONDITIONAL(BUILD_WITH_LIBPQ, test "x$with_libpq" = "xyes") # }}} +# --with-libpqos {{{ +with_libpqos_cppflags="" +with_libpqos_ldflags="" +AC_ARG_WITH(libpqos, [AS_HELP_STRING([--with-libpqos@<:@=PREFIX@:>@], [Path to libpqos.])], +[ + if test "x$withval" != "xno" && test "x$withval" != "xyes" + then + with_libpqos_cppflags="-I$withval/include" + with_libpqos_ldflags="-L$withval/lib" + with_libpqos="yes" + else + with_libpqos="$withval" + fi +], +[ + with_libpqos="yes" +]) +if test "x$with_libpqos" = "xyes" +then + SAVE_CPPFLAGS="$CPPFLAGS" + CPPFLAGS="$CPPFLAGS $with_libpqos_cppflags" + + AC_CHECK_HEADERS(pqos.h, [with_libpqos="yes"], [with_libpqos="no (pqos.h not found)"]) + + CPPFLAGS="$SAVE_CPPFLAGS" +fi +if test "x$with_libpqos" = "xyes" +then + SAVE_CPPFLAGS="$CPPFLAGS" + SAVE_LDFLAGS="$LDFLAGS" + CPPFLAGS="$CPPFLAGS $with_libpqos_cppflags" + LDFLAGS="$LDFLAGS $with_libpqos_ldflags" + + AC_CHECK_LIB(pqos, pqos_init, [with_libpqos="yes"], [with_libpqos="no (Can't find libpqos)"]) + + CPPFLAGS="$SAVE_CPPFLAGS" + LDFLAGS="$SAVE_LDFLAGS" +fi +if test "x$with_libpqos" = "xyes" +then + BUILD_WITH_LIBPQOS_CPPFLAGS="$with_libpqos_cppflags" + BUILD_WITH_LIBPQOS_LDFLAGS="$with_libpqos_ldflags" + BUILD_WITH_LIBPQOS_LIBS="-lpqos" + AC_SUBST(BUILD_WITH_LIBPQOS_CPPFLAGS) + AC_SUBST(BUILD_WITH_LIBPQOS_LDFLAGS) + AC_SUBST(BUILD_WITH_LIBPQOS_LIBS) +fi +# }}} + # --with-libprotobuf {{{ with_libprotobuf_cppflags="" with_libprotobuf_ldflags="" @@ -5801,6 +5850,7 @@ plugin_pinba="no" plugin_processes="no" plugin_protocols="no" plugin_python="no" +plugin_rdtmon="no" plugin_serial="no" plugin_smart="no" plugin_swap="no" @@ -6298,6 +6348,7 @@ AC_PLUGIN([powerdns], [yes], [PowerDNS statistics AC_PLUGIN([processes], [$plugin_processes], [Process statistics]) AC_PLUGIN([protocols], [$plugin_protocols], [Protocol (IP, TCP, ...) statistics]) AC_PLUGIN([python], [$plugin_python], [Embed a Python interpreter]) +AC_PLUGIN([rdtmon], [$with_libpqos], [RDT monitor plugin]) AC_PLUGIN([redis], [$with_libhiredis], [Redis plugin]) AC_PLUGIN([routeros], [$with_librouteros], [RouterOS plugin]) AC_PLUGIN([rrdcached], [$librrd_rrdc_update], [RRDTool output plugin]) @@ -6604,6 +6655,7 @@ AC_MSG_RESULT([ libpcap . . . . . . . $with_libpcap]) AC_MSG_RESULT([ libperfstat . . . . . $with_perfstat]) AC_MSG_RESULT([ libperl . . . . . . . $with_libperl]) AC_MSG_RESULT([ libpq . . . . . . . . $with_libpq]) +AC_MSG_RESULT([ libpqos . . . . . . . $with_libpqos]) AC_MSG_RESULT([ libprotobuf . . . . . $with_libprotobuf]) AC_MSG_RESULT([ libprotobuf-c . . . . $with_libprotobuf_c]) AC_MSG_RESULT([ libpython . . . . . . $with_libpython]) @@ -6731,6 +6783,7 @@ AC_MSG_RESULT([ processes . . . . . . $enable_processes]) AC_MSG_RESULT([ protocols . . . . . . $enable_protocols]) AC_MSG_RESULT([ python . . . . . . . $enable_python]) AC_MSG_RESULT([ redis . . . . . . . . $enable_redis]) +AC_MSG_RESULT([ rdtmon . . . . . . . $enable_rdtmon]) AC_MSG_RESULT([ routeros . . . . . . $enable_routeros]) AC_MSG_RESULT([ rrdcached . . . . . . $enable_rrdcached]) AC_MSG_RESULT([ rrdtool . . . . . . . $enable_rrdtool]) diff --git a/src/Makefile.am b/src/Makefile.am index 99a7c024..5001fdef 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -962,6 +962,14 @@ protocols_la_SOURCES = protocols.c protocols_la_LDFLAGS = $(PLUGIN_LDFLAGS) endif +if BUILD_PLUGIN_RDTMON +pkglib_LTLIBRARIES += rdtmon.la +rdtmon_la_SOURCES = rdtmon.c +rdtmon_la_LDFLAGS = $(PLUGIN_LDFLAGS) $(BUILD_WITH_LIBPQOS_LDFLAGS) +rdtmon_la_CFLAGS = $(AM_CFLAGS) $(BUILD_WITH_LIBPQOS_CPPFLAGS) +rdtmon_la_LIBADD = $(BUILD_WITH_LIBPQOS_LIBS) +endif + if BUILD_PLUGIN_REDIS pkglib_LTLIBRARIES += redis.la redis_la_SOURCES = redis.c diff --git a/src/collectd.conf.in b/src/collectd.conf.in index 04950674..52028338 100644 --- a/src/collectd.conf.in +++ b/src/collectd.conf.in @@ -175,6 +175,7 @@ #@BUILD_PLUGIN_PROTOCOLS_TRUE@LoadPlugin protocols #@BUILD_PLUGIN_PYTHON_TRUE@LoadPlugin python #@BUILD_PLUGIN_REDIS_TRUE@LoadPlugin redis +#@BUILD_PLUGIN_RDTMON_TRUE@LoadPlugin rdtmon #@BUILD_PLUGIN_ROUTEROS_TRUE@LoadPlugin routeros #@BUILD_PLUGIN_RRDCACHED_TRUE@LoadPlugin rrdcached @LOAD_PLUGIN_RRDTOOL@LoadPlugin rrdtool @@ -1097,6 +1098,10 @@ # # +# +# Cores "0-2" +# + # # # Host "redis.example.com" diff --git a/src/collectd.conf.pod b/src/collectd.conf.pod index 00cd781a..b36426fe 100644 --- a/src/collectd.conf.pod +++ b/src/collectd.conf.pod @@ -6295,6 +6295,59 @@ Defaults to B. =back +=head2 Plugin C + +The I plugin collects information provided by monitoring features of +Intel Resource Director Technology (Intel(R) RDT) like Cache Monitoring +Technology (CMT), Memory Bandwidth Monitoring (MBM). These features provide +information about utilization of shared resources. CMT monitors last level cache +occupancy (LLC). MBM supports two types of events reporting local and remote +memory bandwidth. Local memory bandwidth (MBL) reports the bandwidth of +accessing memory associated with the local socket. Remote memory bandwidth (MBR) +reports the bandwidth of accessing the remote socket. Also this technology +allows to monitor instructions per clock (IPC). +Monitor events are hardware dependant. Monitoring capabilities are detected on +plugin initialization and only supported events are monitored. + +B + + + Cores "0-2" "3,4,6" "8-10,15" + + +B + +=over 4 + +=item B I + +The interval within which to retrieve statistics on monitored events in seconds. +For milliseconds divide the time by 1000 for example if the desired interval +is 50ms, set interval to 0.05. Due to limited capacity of counters it is not +recommended to set interval higher than 1 sec. + +=item B I + +All events are reported on a per core basis. Monitoring of the events can be +configured for group of cores (aggregated statistics). This field defines groups +of cores on which to monitor supported events. The field is represented as list +of strings with core group values. Each string represents a list of cores in a +group. +Allowed formats are: + 0,1,2,3 + 0-10,20-18 + 1,3,5-8,10,0x10-12 +If an empty string is provided as value for this field default cores +configuration is applied - a separate group is created for each core. +=back + +B By default global interval is used to retrieve statistics on monitored +events. To configure a plugin specific interval use B option of the +rdtmon block. For milliseconds divide the time by 1000 for example +if the desired interval is 50ms, set interval to 0.05. +Due to limited capacity of counters it is not recommended to set interval higher +than 1 sec. + =head2 Plugin C The I connects to one or more Redis servers and gathers diff --git a/src/rdtmon.c b/src/rdtmon.c new file mode 100644 index 00000000..48624f9a --- /dev/null +++ b/src/rdtmon.c @@ -0,0 +1,694 @@ +/** + * collectd - src/rdtmon.c + * + * Copyright(c) 2016 Intel Corporation. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is furnished to do + * so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Serhiy Pshyk + **/ + +#include + +#include "common.h" + +#define RDTMON_PLUGIN "rdtmon" + +#define RDTMON_MAX_SOCKETS 8 +#define RDTMON_MAX_SOCKET_CORES 64 +#define RDTMON_MAX_CORES (RDTMON_MAX_SOCKET_CORES * RDTMON_MAX_SOCKETS) + +struct rdtmon_core_group_s { + char *desc; + int num_cores; + unsigned *cores; + enum pqos_mon_event events; +}; +typedef struct rdtmon_core_group_s rdtmon_core_group_t; + +struct rdtmon_ctx_s { + rdtmon_core_group_t cgroups[RDTMON_MAX_CORES]; + struct pqos_mon_data *pgroups[RDTMON_MAX_CORES]; + int num_groups; + const struct pqos_cpuinfo *pqos_cpu; + const struct pqos_cap *pqos_cap; + const struct pqos_capability *cap_mon; +}; +typedef struct rdtmon_ctx_s rdtmon_ctx_t; + +static rdtmon_ctx_t *g_rdtmon = NULL; + +static int isdup(const uint64_t *nums, unsigned size, uint64_t val) { + for (unsigned i = 0; i < size; i++) + if (nums[i] == val) + return 1; + return 0; +} + +static int strtouint64(const char *s, uint64_t *n) { + char *endptr = NULL; + + assert(s != NULL); + assert(n != NULL); + + *n = strtoull(s, &endptr, 0); + + if (!(*s != '\0' && *endptr == '\0')) { + DEBUG(RDTMON_PLUGIN ": Error converting '%s' to unsigned number.", s); + return (-EINVAL); + } + + return (0); +} + +/* + * NAME + * strlisttonums + * + * DESCRIPTION + * Converts string of characters representing list of numbers into array of + * numbers. Allowed formats are: + * 0,1,2,3 + * 0-10,20-18 + * 1,3,5-8,10,0x10-12 + * + * Numbers can be in decimal or hexadecimal format. + * + * PARAMETERS + * `s' String representing list of unsigned numbers. + * `nums' Array to put converted numeric values into. + * `max' Maximum number of elements that nums can accommodate. + * + * RETURN VALUE + * Number of elements placed into nums. + */ +static unsigned strlisttonums(char *s, uint64_t *nums, unsigned max) { + int ret; + unsigned index = 0; + char *saveptr = NULL; + + if (s == NULL || nums == NULL || max == 0) + return index; + + for (;;) { + char *p = NULL; + char *token = NULL; + + token = strtok_r(s, ",", &saveptr); + if (token == NULL) + break; + + s = NULL; + + while (isspace(*token)) + token++; + if (*token == '\0') + continue; + + p = strchr(token, '-'); + if (p != NULL) { + uint64_t n, start, end; + *p = '\0'; + ret = strtouint64(token, &start); + if (ret < 0) + return (0); + ret = strtouint64(p + 1, &end); + if (ret < 0) + return (0); + if (start > end) { + n = start; + start = end; + end = n; + } + for (n = start; n <= end; n++) { + if (!(isdup(nums, index, n))) { + nums[index] = n; + index++; + } + if (index >= max) + return index; + } + } else { + uint64_t val; + + ret = strtouint64(token, &val); + if (ret < 0) + return (0); + + if (!(isdup(nums, index, val))) { + nums[index] = val; + index++; + } + if (index >= max) + return index; + } + } + + return index; +} + +/* + * NAME + * cgroup_cmp + * + * DESCRIPTION + * Function to compare cores in 2 core groups. + * + * PARAMETERS + * `cg_a' Pointer to core group a. + * `cg_b' Pointer to core group b. + * + * RETURN VALUE + * 1 if both groups contain the same cores + * 0 if none of their cores match + * -1 if some but not all cores match + */ +static int cgroup_cmp(const rdtmon_core_group_t *cg_a, + const rdtmon_core_group_t *cg_b) { + int found = 0; + + assert(cg_a != NULL); + assert(cg_b != NULL); + + const int sz_a = cg_a->num_cores; + const int sz_b = cg_b->num_cores; + const unsigned *tab_a = cg_a->cores; + const unsigned *tab_b = cg_b->cores; + + for (int i = 0; i < sz_a; i++) { + for (int j = 0; j < sz_b; j++) + if (tab_a[i] == tab_b[j]) + found++; + } + /* if no cores are the same */ + if (!found) + return 0; + /* if group contains same cores */ + if (sz_a == sz_b && sz_b == found) + return 1; + /* if not all cores are the same */ + return -1; +} + +static int cgroup_set(rdtmon_core_group_t *cg, char *desc, uint64_t *cores, + int num_cores) { + assert(cg != NULL); + assert(desc != NULL); + assert(cores != NULL); + assert(num_cores > 0); + + cg->cores = malloc(sizeof(unsigned) * num_cores); + if (cg->cores == NULL) { + ERROR(RDTMON_PLUGIN ": Error allocating core group table"); + return (-ENOMEM); + } + cg->num_cores = num_cores; + cg->desc = desc; + + for (int i = 0; i < num_cores; i++) + cg->cores[i] = (unsigned)cores[i]; + + return 0; +} + +/* + * NAME + * oconfig_to_cgroups + * + * DESCRIPTION + * Function to set the descriptions and cores for each core group. + * Takes a config option containing list of strings that are used to set + * core group values. + * + * PARAMETERS + * `item' Config option containing core groups. + * `groups' Table of core groups to set values in. + * `max' Maximum number of core groups allowed. + * + * RETURN VALUE + * On success, the number of core groups set up. On error, appropriate + * negative error value. + */ +static int oconfig_to_cgroups(oconfig_item_t *item, rdtmon_core_group_t *groups, + unsigned max) { + int ret; + unsigned n, index = 0; + uint64_t cores[RDTMON_MAX_CORES]; + char value[DATA_MAX_NAME_LEN]; + + assert(groups != NULL); + assert(max > 0); + assert(item != NULL); + + for (int j = 0; j < item->values_num; j++) { + if (item->values[j].value.string != NULL && + strlen(item->values[j].value.string)) { + char *desc = NULL; + + sstrncpy(value, item->values[j].value.string, sizeof(value)); + + memset(cores, 0, sizeof(cores)); + + n = strlisttonums(value, cores, RDTMON_MAX_CORES); + if (n == 0) { + ERROR(RDTMON_PLUGIN ": Error parsing core group (%s)", value); + return (-EINVAL); + } + + desc = strdup(item->values[j].value.string); + + /* set core group info */ + ret = cgroup_set(&groups[index], desc, cores, n); + if (ret < 0) { + free(desc); + return ret; + } + + index++; + + if (index >= max) { + WARNING(RDTMON_PLUGIN ": Too many core groups configured"); + return index; + } + } + } + + return index; +} + +#if COLLECT_DEBUG +static void rdtmon_dump_cgroups(void) { + char cores[RDTMON_MAX_CORES * 4]; + + if (g_rdtmon == NULL) + return; + + DEBUG(RDTMON_PLUGIN ": Core Groups Dump"); + DEBUG(RDTMON_PLUGIN ": groups count: %d", g_rdtmon->num_groups); + + for (int i = 0; i < g_rdtmon->num_groups; i++) { + + memset(cores, 0, sizeof(cores)); + for (int j = 0; j < g_rdtmon->cgroups[i].num_cores; j++) { + snprintf(cores + strlen(cores), sizeof(cores) - strlen(cores) - 1, " %d", + g_rdtmon->cgroups[i].cores[j]); + } + + DEBUG(RDTMON_PLUGIN ": group[%d]:", i); + DEBUG(RDTMON_PLUGIN ": description: %s", g_rdtmon->cgroups[i].desc); + DEBUG(RDTMON_PLUGIN ": cores: %s", cores); + DEBUG(RDTMON_PLUGIN ": events: 0x%X", g_rdtmon->cgroups[i].events); + } + + return; +} + +static inline double bytes_to_kb(const double bytes) { return bytes / 1024.0; } + +static inline double bytes_to_mb(const double bytes) { + return bytes / (1024.0 * 1024.0); +} + +static void rdtmon_dump_data(void) { + /* + * CORE - monitored group of cores + * RMID - Resource Monitoring ID associated with the monitored group + * LLC - last level cache occupancy + * MBL - local memory bandwidth + * MBR - remote memory bandwidth + */ + DEBUG(" CORE RMID LLC[KB] MBL[MB] MBR[MB]"); + for (int i = 0; i < g_rdtmon->num_groups; i++) { + + const struct pqos_event_values *pv = &g_rdtmon->pgroups[i]->values; + + double llc = bytes_to_kb(pv->llc); + double mbr = bytes_to_mb(pv->mbm_remote_delta); + double mbl = bytes_to_mb(pv->mbm_local_delta); + + DEBUG(" [%s] %8u %10.1f %10.1f %10.1f", g_rdtmon->cgroups[i].desc, + g_rdtmon->pgroups[i]->poll_ctx[0].rmid, llc, mbl, mbr); + } +} +#endif /* COLLECT_DEBUG */ + +static void rdtmon_free_cgroups(void) { + for (int i = 0; i < RDTMON_MAX_CORES; i++) { + if (g_rdtmon->cgroups[i].desc) { + sfree(g_rdtmon->cgroups[i].desc); + } + + if (g_rdtmon->cgroups[i].cores) { + sfree(g_rdtmon->cgroups[i].cores); + g_rdtmon->cgroups[i].num_cores = 0; + } + + if (g_rdtmon->pgroups[i]) { + sfree(g_rdtmon->pgroups[i]); + } + } +} + +static int rdtmon_default_cgroups(void) { + int ret; + + /* configure each core in separate group */ + for (int i = 0; i < g_rdtmon->pqos_cpu->num_cores; i++) { + char *desc; + uint64_t core = i; + + desc = ssnprintf_alloc("%d", g_rdtmon->pqos_cpu->cores[i].lcore); + if (desc == NULL) + return (-ENOMEM); + + /* set core group info */ + ret = cgroup_set(&g_rdtmon->cgroups[i], desc, &core, 1); + if (ret < 0) { + free(desc); + return ret; + } + } + + return g_rdtmon->pqos_cpu->num_cores; +} + +static int rdtmon_config_cgroups(oconfig_item_t *item) { + int n = 0; + enum pqos_mon_event events = 0; + + if (item == NULL) { + DEBUG(RDTMON_PLUGIN ": cgroups_config: Invalid argument."); + return (-EINVAL); + } + + DEBUG(RDTMON_PLUGIN ": Core groups [%d]:", item->values_num); + for (int j = 0; j < item->values_num; j++) { + if (item->values[j].type != OCONFIG_TYPE_STRING) { + ERROR(RDTMON_PLUGIN ": given core group value is not a string [idx=%d]", + j); + return (-EINVAL); + } + DEBUG(RDTMON_PLUGIN ": [%d]: %s", j, item->values[j].value.string); + } + + n = oconfig_to_cgroups(item, g_rdtmon->cgroups, RDTMON_MAX_CORES); + if (n < 0) { + rdtmon_free_cgroups(); + ERROR(RDTMON_PLUGIN ": Error parsing core groups configuration."); + return (-EINVAL); + } + + if (n == 0) { + /* create default core groups if "Cores" config option is empty */ + n = rdtmon_default_cgroups(); + if (n < 0) { + rdtmon_free_cgroups(); + ERROR(RDTMON_PLUGIN + ": Error creating default core groups configuration."); + return n; + } + INFO(RDTMON_PLUGIN + ": No core groups configured. Default core groups created."); + } + + /* Get all available events on this platform */ + for (int i = 0; i < g_rdtmon->cap_mon->u.mon->num_events; i++) + events |= g_rdtmon->cap_mon->u.mon->events[i].type; + + events &= ~(PQOS_PERF_EVENT_LLC_MISS); + + DEBUG(RDTMON_PLUGIN ": Available events to monitor [0x%X]", events); + + g_rdtmon->num_groups = n; + for (int i = 0; i < n; i++) { + int found = 0; + + for (int j = 0; j < i; j++) { + found = cgroup_cmp(&g_rdtmon->cgroups[j], &g_rdtmon->cgroups[i]); + if (found != 0) { + rdtmon_free_cgroups(); + ERROR(RDTMON_PLUGIN ": Cannot monitor same cores in different groups."); + return (-EINVAL); + } + } + + g_rdtmon->cgroups[i].events = events; + g_rdtmon->pgroups[i] = malloc(sizeof(struct pqos_mon_data)); + if (g_rdtmon->pgroups[i] == NULL) { + rdtmon_free_cgroups(); + ERROR(RDTMON_PLUGIN ": Failed to allocate memory for monitoring data."); + return (-ENOMEM); + } + } + + return (0); +} + +static int rdtmon_preinit(void) { + struct pqos_config pqos_cfg; + int ret; + + if (g_rdtmon != NULL) { + /* already initialized if config callback was called before init callback */ + return (0); + } + + g_rdtmon = malloc(sizeof(rdtmon_ctx_t)); + if (g_rdtmon == NULL) { + ERROR(RDTMON_PLUGIN ": Failed to allocate memory for rdtmon context."); + return (-ENOMEM); + } + + memset(g_rdtmon, 0, sizeof(rdtmon_ctx_t)); + + /* init PQoS library */ + memset(&pqos_cfg, 0, sizeof(pqos_cfg)); + /* TODO: + * stdout should not be used here. Will be reworked when support of log + * callback is added to PQoS library. + */ + pqos_cfg.fd_log = STDOUT_FILENO; + pqos_cfg.verbose = 0; + + /* In case previous instance of the application was not closed properly + * call fini and ignore return code. */ + pqos_fini(); + + ret = pqos_init(&pqos_cfg); + if (ret != PQOS_RETVAL_OK) { + ERROR(RDTMON_PLUGIN ": Error initializing PQoS library!"); + goto rdtmon_preinit_error1; + } + + ret = pqos_cap_get(&g_rdtmon->pqos_cap, &g_rdtmon->pqos_cpu); + if (ret != PQOS_RETVAL_OK) { + ERROR(RDTMON_PLUGIN ": Error retrieving PQoS capabilities."); + goto rdtmon_preinit_error2; + } + + ret = pqos_cap_get_type(g_rdtmon->pqos_cap, PQOS_CAP_TYPE_MON, + &g_rdtmon->cap_mon); + if (ret == PQOS_RETVAL_PARAM) { + ERROR(RDTMON_PLUGIN ": Error retrieving monitoring capabilities."); + goto rdtmon_preinit_error2; + } + + if (g_rdtmon->cap_mon == NULL) { + ERROR( + RDTMON_PLUGIN + ": Monitoring capability not detected. Nothing to do for the plugin."); + goto rdtmon_preinit_error2; + } + + return (0); + +rdtmon_preinit_error2: + pqos_fini(); + +rdtmon_preinit_error1: + + sfree(g_rdtmon); + + return (-1); +} + +static int rdtmon_config(oconfig_item_t *ci) { + int ret = 0; + + ret = rdtmon_preinit(); + if (ret != 0) + return ret; + + for (int i = 0; i < ci->children_num; i++) { + oconfig_item_t *child = ci->children + i; + + if (strcasecmp("Cores", child->key) == 0) { + + ret = rdtmon_config_cgroups(child); + if (ret != 0) + return ret; + +#if COLLECT_DEBUG + rdtmon_dump_cgroups(); +#endif /* COLLECT_DEBUG */ + + } else { + ERROR(RDTMON_PLUGIN ": Unknown configuration parameter \"%s\".", + child->key); + } + } + + return (0); +} + +static void rdtmon_submit_gauge(char *cgroup, char *type, gauge_t value) { + value_t values[1]; + value_list_t vl = VALUE_LIST_INIT; + + values[0].gauge = value; + + vl.values = values; + vl.values_len = STATIC_ARRAY_SIZE(values); + + sstrncpy(vl.host, hostname_g, sizeof(vl.host)); + sstrncpy(vl.plugin, RDTMON_PLUGIN, sizeof(vl.plugin)); + snprintf(vl.plugin_instance, sizeof(vl.plugin_instance), "[%s]", cgroup); + sstrncpy(vl.type, type, sizeof(vl.type)); + + plugin_dispatch_values(&vl); +} + +static void rdtmon_submit_mbm(char *cgroup, + const struct pqos_event_values *pv) { + value_t values[6]; + value_list_t vl = VALUE_LIST_INIT; + + values[0].gauge = pv->mbm_local; + values[1].gauge = pv->mbm_remote; + values[2].gauge = pv->mbm_total; + values[3].gauge = pv->mbm_local_delta; + values[4].gauge = pv->mbm_remote_delta; + values[5].gauge = pv->mbm_total_delta; + + vl.values = values; + vl.values_len = STATIC_ARRAY_SIZE(values); + + sstrncpy(vl.host, hostname_g, sizeof(vl.host)); + sstrncpy(vl.plugin, RDTMON_PLUGIN, sizeof(vl.plugin)); + snprintf(vl.plugin_instance, sizeof(vl.plugin_instance), "[%s]", cgroup); + sstrncpy(vl.type, "mbm", sizeof(vl.type)); + + plugin_dispatch_values(&vl); +} + +static int rdtmon_read(user_data_t *ud) { + int ret; + + if (g_rdtmon == NULL) { + ERROR(RDTMON_PLUGIN ": rdtmon_read: plugin not initialized."); + return (-EINVAL); + } + + ret = pqos_mon_poll(&g_rdtmon->pgroups[0], (unsigned)g_rdtmon->num_groups); + if (ret != PQOS_RETVAL_OK) { + ERROR(RDTMON_PLUGIN ": Failed to poll monitoring data."); + return (-1); + } + +#if COLLECT_DEBUG + rdtmon_dump_data(); +#endif /* COLLECT_DEBUG */ + + for (int i = 0; i < g_rdtmon->num_groups; i++) { + enum pqos_mon_event mbm_events = + (PQOS_MON_EVENT_LMEM_BW | PQOS_MON_EVENT_TMEM_BW | + PQOS_MON_EVENT_RMEM_BW); + + const struct pqos_event_values *pv = &g_rdtmon->pgroups[i]->values; + + /* Submit only monitored events data */ + + if (g_rdtmon->cgroups[i].events & PQOS_MON_EVENT_L3_OCCUP) + rdtmon_submit_gauge(g_rdtmon->cgroups[i].desc, "llc", pv->llc); + + if (g_rdtmon->cgroups[i].events & PQOS_PERF_EVENT_IPC) + rdtmon_submit_gauge(g_rdtmon->cgroups[i].desc, "ipc", pv->ipc); + + if (g_rdtmon->cgroups[i].events & mbm_events) + rdtmon_submit_mbm(g_rdtmon->cgroups[i].desc, pv); + } + + return (0); +} + +static int rdtmon_init(void) { + int ret; + + ret = rdtmon_preinit(); + if (ret != 0) + return ret; + + /* Start monitoring */ + for (int i = 0; i < g_rdtmon->num_groups; i++) { + rdtmon_core_group_t *cg = &g_rdtmon->cgroups[i]; + + ret = pqos_mon_start(cg->num_cores, cg->cores, cg->events, (void *)cg->desc, + g_rdtmon->pgroups[i]); + + if (ret != PQOS_RETVAL_OK) { + ERROR(RDTMON_PLUGIN ": Error starting monitoring (pqos status=%d)", ret); + return (-1); + } + } + + return (0); +} + +static int rdtmon_shutdown(void) { + int ret; + + DEBUG(RDTMON_PLUGIN ": rdtmon_shutdown."); + + if (g_rdtmon == NULL) { + ERROR(RDTMON_PLUGIN ": rdtmon_shutdown: plugin not initialized."); + return (-EINVAL); + } + + /* Stop monitoring */ + for (int i = 0; i < g_rdtmon->num_groups; i++) { + pqos_mon_stop(g_rdtmon->pgroups[i]); + } + + ret = pqos_fini(); + if (ret != PQOS_RETVAL_OK) + ERROR(RDTMON_PLUGIN ": Error shutting down PQoS library."); + + rdtmon_free_cgroups(); + sfree(g_rdtmon); + + return (0); +} + +void module_register(void) { + plugin_register_init(RDTMON_PLUGIN, rdtmon_init); + plugin_register_complex_config(RDTMON_PLUGIN, rdtmon_config); + plugin_register_complex_read(NULL, RDTMON_PLUGIN, rdtmon_read, 0, NULL); + plugin_register_shutdown(RDTMON_PLUGIN, rdtmon_shutdown); +} diff --git a/src/types.db b/src/types.db index e3da48a4..970d100a 100644 --- a/src/types.db +++ b/src/types.db @@ -114,12 +114,15 @@ if_tx_packets value:DERIVE:0:U invocations value:DERIVE:0:U io_octets rx:DERIVE:0:U, tx:DERIVE:0:U io_packets rx:DERIVE:0:U, tx:DERIVE:0:U +ipc value:GAUGE:0:U ipt_bytes value:DERIVE:0:U ipt_packets value:DERIVE:0:U irq value:DERIVE:0:U latency value:GAUGE:0:U links value:GAUGE:0:U +llc value:GAUGE:0:U load shortterm:GAUGE:0:5000, midterm:GAUGE:0:5000, longterm:GAUGE:0:5000 +mbm local:GAUGE:0:U, remote:GAUGE:0:U, total:GAUGE:0:U, local_delta:DERIVE:0:U, remote_delta:DERIVE:0:U, total_delta:DERIVE:0:U md_disks value:GAUGE:0:U memcached_command value:DERIVE:0:U memcached_connections value:GAUGE:0:U -- 2.11.0