2 * collectd - src/intel_rdt.c
4 * Copyright(c) 2016-2019 Intel Corporation. All rights reserved.
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25 * Serhiy Pshyk <serhiyx.pshyk@intel.com>
26 * Starzyk, Mateusz <mateuszx.starzyk@intel.com>
27 * Wojciech Andralojc <wojciechx.andralojc@intel.com>
28 * Michał Aleksiński <michalx.aleksinski@intel.com>
32 #include "utils/common/common.h"
33 #include "utils/config_cores/config_cores.h"
34 #include "utils/proc_pids/proc_pids.h"
37 #define RDT_PLUGIN "intel_rdt"
39 /* libpqos v2.0 or newer is required for process monitoring */
41 #if defined(PQOS_VERSION) && PQOS_VERSION >= 20000
45 #define RDT_PLUGIN "intel_rdt"
47 #define RDT_MAX_SOCKETS 8
48 #define RDT_MAX_SOCKET_CORES 64
49 #define RDT_MAX_CORES (RDT_MAX_SOCKET_CORES * RDT_MAX_SOCKETS)
53 * Process name inside comm file is limited to 16 chars.
54 * More info here: http://man7.org/linux/man-pages/man5/proc.5.html
56 #define RDT_MAX_NAMES_GROUPS 64
57 #define RDT_PROC_PATH "/proc"
66 struct rdt_name_group_s {
70 proc_pids_t **proc_pids;
71 size_t monitored_pids_count;
72 enum pqos_mon_event events;
74 typedef struct rdt_name_group_s rdt_name_group_t;
78 core_groups_list_t cores;
79 enum pqos_mon_event events[RDT_MAX_CORES];
80 struct pqos_mon_data *pcgroups[RDT_MAX_CORES];
82 rdt_name_group_t ngroups[RDT_MAX_NAMES_GROUPS];
83 struct pqos_mon_data *pngroups[RDT_MAX_NAMES_GROUPS];
85 proc_pids_t **proc_pids;
88 const struct pqos_cpuinfo *pqos_cpu;
89 const struct pqos_cap *pqos_cap;
90 const struct pqos_capability *cap_mon;
92 typedef struct rdt_ctx_s rdt_ctx_t;
94 static rdt_ctx_t *g_rdt;
96 static rdt_config_status g_state = UNKNOWN;
98 static int g_interface = -1;
100 static void rdt_submit_derive(const char *cgroup, const char *type,
101 const char *type_instance, derive_t value) {
102 value_list_t vl = VALUE_LIST_INIT;
104 vl.values = &(value_t){.derive = value};
107 sstrncpy(vl.plugin, RDT_PLUGIN, sizeof(vl.plugin));
108 ssnprintf(vl.plugin_instance, sizeof(vl.plugin_instance), "%s", cgroup);
109 sstrncpy(vl.type, type, sizeof(vl.type));
111 sstrncpy(vl.type_instance, type_instance, sizeof(vl.type_instance));
113 plugin_dispatch_values(&vl);
116 static void rdt_submit_gauge(const char *cgroup, const char *type,
117 const char *type_instance, gauge_t value) {
118 value_list_t vl = VALUE_LIST_INIT;
120 vl.values = &(value_t){.gauge = value};
123 sstrncpy(vl.plugin, RDT_PLUGIN, sizeof(vl.plugin));
124 ssnprintf(vl.plugin_instance, sizeof(vl.plugin_instance), "%s", cgroup);
125 sstrncpy(vl.type, type, sizeof(vl.type));
127 sstrncpy(vl.type_instance, type_instance, sizeof(vl.type_instance));
129 plugin_dispatch_values(&vl);
133 static void rdt_dump_cgroups(void) {
134 char cores[RDT_MAX_CORES * 4];
139 DEBUG(RDT_PLUGIN ": Core Groups Dump");
140 DEBUG(RDT_PLUGIN ": groups count: %" PRIsz, g_rdt->cores.num_cgroups);
142 for (size_t i = 0; i < g_rdt->cores.num_cgroups; i++) {
143 core_group_t *cgroup = g_rdt->cores.cgroups + i;
145 memset(cores, 0, sizeof(cores));
146 for (size_t j = 0; j < cgroup->num_cores; j++) {
147 ssnprintf(cores + strlen(cores), sizeof(cores) - strlen(cores) - 1, " %d",
151 DEBUG(RDT_PLUGIN ": group[%zu]:", i);
152 DEBUG(RDT_PLUGIN ": description: %s", cgroup->desc);
153 DEBUG(RDT_PLUGIN ": cores: %s", cores);
154 DEBUG(RDT_PLUGIN ": events: 0x%X", g_rdt->events[i]);
161 static void rdt_dump_ngroups(void) {
163 char names[DATA_MAX_NAME_LEN];
168 DEBUG(RDT_PLUGIN ": Process Names Groups Dump");
169 DEBUG(RDT_PLUGIN ": groups count: %" PRIsz, g_rdt->num_ngroups);
171 for (size_t i = 0; i < g_rdt->num_ngroups; i++) {
172 memset(names, 0, sizeof(names));
173 for (size_t j = 0; j < g_rdt->ngroups[i].num_names; j++)
174 ssnprintf(names + strlen(names), sizeof(names) - strlen(names) - 1, " %s",
175 g_rdt->ngroups[i].names[j]);
177 DEBUG(RDT_PLUGIN ": group[%d]:", (int)i);
178 DEBUG(RDT_PLUGIN ": description: %s", g_rdt->ngroups[i].desc);
179 DEBUG(RDT_PLUGIN ": process names:%s", names);
180 DEBUG(RDT_PLUGIN ": events: 0x%X", g_rdt->ngroups[i].events);
185 #endif /* LIBPQOS2 */
/* Scale a size given in bytes down to kilobytes (1 KB = 1024 bytes). */
static inline double bytes_to_kb(const double bytes) {
  const double bytes_per_kb = 1024.0;

  return bytes / bytes_per_kb;
}
/*
 * Scale a size given in bytes down to megabytes (1 MB = 1024 * 1024 bytes).
 *
 * PARAMETERS
 *   `bytes'     Size in bytes.
 *
 * RETURN VALUE
 *   `bytes' divided by 1048576.
 */
static inline double bytes_to_mb(const double bytes) {
  const double bytes_per_mb = 1024.0 * 1024.0;

  return bytes / bytes_per_mb;
}
193 static void rdt_dump_cores_data(void) {
195 * CORE - monitored group of cores
196 * RMID - Resource Monitoring ID associated with the monitored group
197 * This is not available for monitoring with resource control
198 * LLC - last level cache occupancy
199 * MBL - local memory bandwidth
200 * MBR - remote memory bandwidth
203 if (g_interface == PQOS_INTER_OS_RESCTRL_MON) {
204 DEBUG(RDT_PLUGIN ": CORE LLC[KB] MBL[MB] MBR[MB]");
206 DEBUG(RDT_PLUGIN ": CORE RMID LLC[KB] MBL[MB] MBR[MB]");
209 DEBUG(RDT_PLUGIN ": CORE RMID LLC[KB] MBL[MB] MBR[MB]");
210 #endif /* LIBPQOS2 */
212 for (int i = 0; i < g_rdt->cores.num_cgroups; i++) {
213 const struct pqos_event_values *pv = &g_rdt->pcgroups[i]->values;
215 double llc = bytes_to_kb(pv->llc);
216 double mbr = bytes_to_mb(pv->mbm_remote_delta);
217 double mbl = bytes_to_mb(pv->mbm_local_delta);
219 if (g_interface == PQOS_INTER_OS_RESCTRL_MON) {
220 DEBUG(RDT_PLUGIN ": [%s] %10.1f %10.1f %10.1f",
221 g_rdt->cores.cgroups[i].desc, llc, mbl, mbr);
223 DEBUG(RDT_PLUGIN ": [%s] %8u %10.1f %10.1f %10.1f",
224 g_rdt->cores.cgroups[i].desc, g_rdt->pcgroups[i]->poll_ctx[0].rmid,
228 DEBUG(RDT_PLUGIN ": [%s] %8u %10.1f %10.1f %10.1f",
229 g_rdt->cores.cgroups[i].desc, g_rdt->pcgroups[i]->poll_ctx[0].rmid,
231 #endif /* LIBPQOS2 */
236 static void rdt_dump_pids_data(void) {
238 * NAME - monitored group of processes
239 * PIDs - list of PID numbers in the NAME group
240 * LLC - last level cache occupancy
241 * MBL - local memory bandwidth
242 * MBR - remote memory bandwidth
245 DEBUG(RDT_PLUGIN ": NAME PIDs");
246 char pids[DATA_MAX_NAME_LEN];
247 for (size_t i = 0; i < g_rdt->num_ngroups; ++i) {
248 memset(pids, 0, sizeof(pids));
249 for (size_t j = 0; j < g_rdt->ngroups[i].num_names; ++j) {
250 pids_list_t *list = g_rdt->ngroups[i].proc_pids[j]->curr;
251 for (size_t k = 0; k < list->size; k++)
252 ssnprintf(pids + strlen(pids), sizeof(pids) - strlen(pids) - 1, " %u",
255 DEBUG(RDT_PLUGIN ": [%s] %s", g_rdt->ngroups[i].desc, pids);
258 DEBUG(RDT_PLUGIN ": NAME LLC[KB] MBL[MB] MBR[MB]");
259 for (size_t i = 0; i < g_rdt->num_ngroups; i++) {
261 const struct pqos_event_values *pv = &g_rdt->pngroups[i]->values;
263 double llc = bytes_to_kb(pv->llc);
264 double mbr = bytes_to_mb(pv->mbm_remote_delta);
265 double mbl = bytes_to_mb(pv->mbm_local_delta);
267 DEBUG(RDT_PLUGIN ": [%s] %10.1f %10.1f %10.1f", g_rdt->ngroups[i].desc,
271 #endif /* LIBPQOS2 */
272 #endif /* COLLECT_DEBUG */
275 static int isdupstr(const char *names[], const size_t size, const char *name) {
276 for (size_t i = 0; i < size; i++)
277 if (strncmp(names[i], name, (size_t)MAX_PROC_NAME_LEN) == 0)
288 * Converts string representing list of strings into array of strings.
290 * name,name1,name2,name3
293 * `str_list' String representing list of strings.
294 * `names' Array to put extracted strings into.
295 * `names_num' Variable to put number of extracted strings.
298 * Number of elements placed into names.
300 static int strlisttoarray(char *str_list, char ***names, size_t *names_num) {
301 char *saveptr = NULL;
303 if (str_list == NULL || names == NULL)
306 if (strstr(str_list, ",,")) {
307 /* strtok ignores empty words between separators.
308 * This condition handles that by rejecting strings
309 * with consecutive separators */
310 ERROR(RDT_PLUGIN ": Empty process name");
315 char *token = strtok_r(str_list, ",", &saveptr);
321 while (isspace(*token))
327 if ((isdupstr((const char **)*names, *names_num, token))) {
328 if (str_list != NULL)
329 ERROR(RDT_PLUGIN ": Duplicated process name \'%s\' in group \'%s\'",
332 ERROR(RDT_PLUGIN ": Duplicated process name \'%s\'", token);
336 if (0 != strarray_add(names, names_num, token)) {
337 ERROR(RDT_PLUGIN ": Error allocating process name string");
351 * Function to compare names in two name groups.
354 * `ng_a' Pointer to name group a.
355 * `ng_b' Pointer to name group b.
358 * 1 if both groups contain the same names
359 * 0 if none of their names match
360 * -1 if some but not all names match
362 static int ngroup_cmp(const rdt_name_group_t *ng_a,
363 const rdt_name_group_t *ng_b) {
366 assert(ng_a != NULL);
367 assert(ng_b != NULL);
369 const size_t sz_a = (unsigned)ng_a->num_names;
370 const size_t sz_b = (unsigned)ng_b->num_names;
371 const char **tab_a = (const char **)ng_a->names;
372 const char **tab_b = (const char **)ng_b->names;
374 for (size_t i = 0; i < sz_a; i++) {
375 for (size_t j = 0; j < sz_b; j++)
376 if (strncmp(tab_a[i], tab_b[j], (size_t)MAX_PROC_NAME_LEN) == 0)
379 /* if no names are the same */
382 /* if group contains same names */
383 if (sz_a == sz_b && sz_b == (size_t)found)
385 /* if not all names are the same */
394 * Function to set the descriptions and names for each process names group.
395 * Takes a config option containing list of strings that are used to set
396 * process group values.
399 * `item' Config option containing process names groups.
400 * `groups' Table of process name groups to set values in.
401 * `max_groups' Maximum number of process name groups allowed.
404 * On success, the number of name groups set up. On error, appropriate
405 * negative error value.
407 static int oconfig_to_ngroups(const oconfig_item_t *item,
408 rdt_name_group_t *groups,
409 const size_t max_groups) {
412 assert(groups != NULL);
413 assert(max_groups > 0);
414 assert(item != NULL);
416 for (int j = 0; j < item->values_num; j++) {
418 char value[DATA_MAX_NAME_LEN];
420 if ((item->values[j].value.string == NULL) ||
421 (strlen(item->values[j].value.string) == 0)) {
422 ERROR(RDT_PLUGIN ": Error - empty group");
426 sstrncpy(value, item->values[j].value.string, sizeof(value));
428 ret = strlisttoarray(value, &groups[index].names, &groups[index].num_names);
429 if (ret != 0 || groups[index].num_names == 0) {
430 ERROR(RDT_PLUGIN ": Error parsing process names group (%s)",
431 item->values[j].value.string);
435 /* set group description info */
436 groups[index].desc = sstrdup(item->values[j].value.string);
437 if (groups[index].desc == NULL) {
438 ERROR(RDT_PLUGIN ": Error allocating name group description");
442 groups[index].proc_pids = NULL;
443 groups[index].monitored_pids_count = 0;
447 if (index >= (const int)max_groups) {
448 WARNING(RDT_PLUGIN ": Too many process names groups configured");
461 * Function to deallocate memory allocated for name groups.
464 * `rdt' Pointer to rdt context
466 static void rdt_free_ngroups(rdt_ctx_t *rdt) {
467 for (int i = 0; i < RDT_MAX_NAMES_GROUPS; i++) {
468 if (rdt->ngroups[i].desc)
469 DEBUG(RDT_PLUGIN ": Freeing pids \'%s\' group\'s data...",
470 rdt->ngroups[i].desc);
471 sfree(rdt->ngroups[i].desc);
472 strarray_free(rdt->ngroups[i].names, rdt->ngroups[i].num_names);
474 if (rdt->ngroups[i].proc_pids)
475 proc_pids_free(rdt->ngroups[i].proc_pids, rdt->ngroups[i].num_names);
477 rdt->ngroups[i].num_names = 0;
478 sfree(rdt->pngroups[i]);
481 sfree(rdt->proc_pids);
483 rdt->num_ngroups = 0;
491 * Reads name groups configuration.
494 * `rdt` Pointer to rdt context
495 * `item' Config option containing process names groups.
498 * 0 on success. Negative number on error.
500 static int rdt_config_ngroups(rdt_ctx_t *rdt, const oconfig_item_t *item) {
502 enum pqos_mon_event events = 0;
505 DEBUG(RDT_PLUGIN ": ngroups_config: Invalid argument.");
509 DEBUG(RDT_PLUGIN ": Process names groups [%d]:", item->values_num);
510 for (int j = 0; j < item->values_num; j++) {
511 if (item->values[j].type != OCONFIG_TYPE_STRING) {
513 ": given process names group value is not a string [idx=%d]",
517 DEBUG(RDT_PLUGIN ": [%d]: %s", j, item->values[j].value.string);
520 n = oconfig_to_ngroups(item, rdt->ngroups, RDT_MAX_NAMES_GROUPS);
522 rdt_free_ngroups(rdt);
523 ERROR(RDT_PLUGIN ": Error parsing process name groups configuration.");
527 /* validate configured process name values */
528 for (int group_idx = 0; group_idx < n; group_idx++) {
529 DEBUG(RDT_PLUGIN ": checking group [%d]: %s", group_idx,
530 rdt->ngroups[group_idx].desc);
531 for (size_t name_idx = 0; name_idx < rdt->ngroups[group_idx].num_names;
533 DEBUG(RDT_PLUGIN ": checking process name [%zu]: %s", name_idx,
534 rdt->ngroups[group_idx].names[name_idx]);
535 if (!proc_pids_is_name_valid(rdt->ngroups[group_idx].names[name_idx])) {
536 ERROR(RDT_PLUGIN ": Process name group '%s' contains invalid name '%s'",
537 rdt->ngroups[group_idx].desc,
538 rdt->ngroups[group_idx].names[name_idx]);
539 rdt_free_ngroups(rdt);
546 ERROR(RDT_PLUGIN ": Empty process name groups configured.");
550 /* Get all available events on this platform */
551 for (unsigned i = 0; i < rdt->cap_mon->u.mon->num_events; i++)
552 events |= rdt->cap_mon->u.mon->events[i].type;
554 events &= ~(PQOS_PERF_EVENT_LLC_MISS);
556 DEBUG(RDT_PLUGIN ": Available events to monitor: %#x", events);
558 rdt->num_ngroups = n;
559 for (int i = 0; i < n; i++) {
560 for (int j = 0; j < i; j++) {
561 int found = ngroup_cmp(&rdt->ngroups[j], &rdt->ngroups[i]);
563 rdt_free_ngroups(rdt);
565 ": Cannot monitor same process name in different groups.");
570 rdt->ngroups[i].events = events;
571 rdt->pngroups[i] = calloc(1, sizeof(*rdt->pngroups[i]));
572 if (rdt->pngroups[i] == NULL) {
573 rdt_free_ngroups(rdt);
575 ": Failed to allocate memory for process name monitoring data.");
588 * Refresh pids monitored by name group.
591 * `ngroup` Pointer to name group.
592 * `group_mon_data' PQoS monitoring context.
595 * 0 on success. Negative number on error.
597 static int rdt_refresh_ngroup(rdt_name_group_t *ngroup,
598 struct pqos_mon_data *group_mon_data) {
605 if (NULL == ngroup->proc_pids) {
607 ": rdt_refresh_ngroup: \'%s\' uninitialized process pids array.",
613 DEBUG(RDT_PLUGIN ": rdt_refresh_ngroup: \'%s\' process names group.",
616 proc_pids_t **proc_pids = ngroup->proc_pids;
617 pids_list_t added_pids;
618 pids_list_t removed_pids;
620 memset(&added_pids, 0, sizeof(added_pids));
621 memset(&removed_pids, 0, sizeof(removed_pids));
623 for (size_t i = 0; i < ngroup->num_names; ++i) {
624 int diff_result = pids_list_diff(proc_pids[i], &added_pids, &removed_pids);
625 if (0 != diff_result) {
627 ": rdt_refresh_ngroup: \'%s\'. Error [%d] during PID diff.",
628 ngroup->desc, diff_result);
634 DEBUG(RDT_PLUGIN ": rdt_refresh_ngroup: \'%s\' process names group, added: "
636 ngroup->desc, (unsigned)added_pids.size, (unsigned)removed_pids.size);
638 if (added_pids.size > 0) {
640 /* no pids are monitored for this group yet: start monitoring */
641 if (0 == ngroup->monitored_pids_count) {
644 pqos_mon_start_pids(added_pids.size, added_pids.pids, ngroup->events,
645 (void *)ngroup->desc, group_mon_data);
646 if (PQOS_RETVAL_OK == start_result) {
647 ngroup->monitored_pids_count = added_pids.size;
649 ERROR(RDT_PLUGIN ": rdt_refresh_ngroup: \'%s\'. Error [%d] while "
650 "STARTING pids monitoring",
651 ngroup->desc, start_result);
653 goto pqos_error_recovery;
659 pqos_mon_add_pids(added_pids.size, added_pids.pids, group_mon_data);
660 if (PQOS_RETVAL_OK == add_result)
661 ngroup->monitored_pids_count += added_pids.size;
664 ": rdt_refresh_ngroup: \'%s\'. Error [%d] while ADDING pids.",
665 ngroup->desc, add_result);
667 goto pqos_error_recovery;
672 if (removed_pids.size > 0) {
674 /* all pids are removed: stop monitoring */
675 if (removed_pids.size == ngroup->monitored_pids_count) {
676 /* all pids for this group are lost: stop monitoring */
677 int stop_result = pqos_mon_stop(group_mon_data);
678 if (PQOS_RETVAL_OK != stop_result) {
679 ERROR(RDT_PLUGIN ": rdt_refresh_ngroup: \'%s\'. Error [%d] while "
680 "STOPPING monitoring",
681 ngroup->desc, stop_result);
683 goto pqos_error_recovery;
685 ngroup->monitored_pids_count = 0;
687 int remove_result = pqos_mon_remove_pids(
688 removed_pids.size, removed_pids.pids, group_mon_data);
689 if (PQOS_RETVAL_OK == remove_result) {
690 ngroup->monitored_pids_count -= removed_pids.size;
693 ": rdt_refresh_ngroup: \'%s\'. Error [%d] while REMOVING pids.",
694 ngroup->desc, remove_result);
696 goto pqos_error_recovery;
705 * Resources might be temporarily unavailable.
708 * Collectd will halt the reading thread for this
709 * plugin if it returns an error.
710 * Consecutive errors keep increasing the read period,
711 * up to an interval of 1 day.
712 * On pqos error stop monitoring current group
713 * and reset the proc_pids array
714 * monitoring will be restarted on next collectd read cycle
716 DEBUG(RDT_PLUGIN ": rdt_refresh_ngroup: \'%s\' group RESET after error.",
718 pqos_mon_stop(group_mon_data);
719 for (size_t i = 0; i < ngroup->num_names; ++i)
720 if (ngroup->proc_pids[i]->curr)
721 ngroup->proc_pids[i]->curr->size = 0;
723 ngroup->monitored_pids_count = 0;
726 pids_list_clear(&added_pids);
727 pids_list_clear(&removed_pids);
737 * Poll monitoring statistics for name groups
740 * 0 on success. Negative number on error.
742 static int read_pids_data() {
744 if (0 == g_rdt->num_ngroups) {
745 DEBUG(RDT_PLUGIN ": read_pids_data: not configured - PIDs read skipped");
749 DEBUG(RDT_PLUGIN ": read_pids_data: Scanning active groups");
750 struct pqos_mon_data *active_groups[RDT_MAX_NAMES_GROUPS] = {0};
751 size_t active_group_idx = 0;
752 for (size_t pngroups_idx = 0;
753 pngroups_idx < STATIC_ARRAY_SIZE(g_rdt->pngroups); ++pngroups_idx)
754 if (0 != g_rdt->ngroups[pngroups_idx].monitored_pids_count)
755 active_groups[active_group_idx++] = g_rdt->pngroups[pngroups_idx];
759 if (0 == active_group_idx) {
760 DEBUG(RDT_PLUGIN ": read_pids_data: no active groups - PIDs read skipped");
764 DEBUG(RDT_PLUGIN ": read_pids_data: PIDs data polling");
766 int poll_result = pqos_mon_poll(active_groups, active_group_idx);
767 if (poll_result != PQOS_RETVAL_OK) {
768 ERROR(RDT_PLUGIN ": read_pids_data: Failed to poll monitoring data for "
775 for (size_t i = 0; i < g_rdt->num_ngroups; i++) {
776 enum pqos_mon_event mbm_events =
777 (PQOS_MON_EVENT_LMEM_BW | PQOS_MON_EVENT_TMEM_BW |
778 PQOS_MON_EVENT_RMEM_BW);
780 if (g_rdt->pngroups[i] == NULL ||
781 g_rdt->ngroups[i].monitored_pids_count == 0)
784 const struct pqos_event_values *pv = &g_rdt->pngroups[i]->values;
786 /* Submit only monitored events data */
788 if (g_rdt->ngroups[i].events & PQOS_MON_EVENT_L3_OCCUP)
789 rdt_submit_gauge(g_rdt->ngroups[i].desc, "bytes", "llc", pv->llc);
791 if (g_rdt->ngroups[i].events & PQOS_PERF_EVENT_IPC)
792 rdt_submit_gauge(g_rdt->ngroups[i].desc, "ipc", NULL, pv->ipc);
794 if (g_rdt->ngroups[i].events & mbm_events) {
795 rdt_submit_derive(g_rdt->ngroups[i].desc, "memory_bandwidth", "local",
796 pv->mbm_local_delta);
797 rdt_submit_derive(g_rdt->ngroups[i].desc, "memory_bandwidth", "remote",
798 pv->mbm_remote_delta);
803 rdt_dump_pids_data();
804 #endif /* COLLECT_DEBUG */
807 ret = proc_pids_update(RDT_PROC_PATH, g_rdt->proc_pids, g_rdt->num_proc_pids);
809 ERROR(RDT_PLUGIN ": Initial update of proc pids failed");
813 for (size_t i = 0; i < g_rdt->num_ngroups; i++) {
815 rdt_refresh_ngroup(&(g_rdt->ngroups[i]), g_rdt->pngroups[i]);
817 if (0 != refresh_result) {
818 ERROR(RDT_PLUGIN ": read_pids_data: NGroup %zu refresh failed. Error: %d",
821 /* refresh error will be escalated only if there were no
824 ret = refresh_result;
835 * rdt_init_pids_monitoring
838 * Initialize pids monitoring for all name groups
840 static void rdt_init_pids_monitoring() {
841 for (size_t group_idx = 0; group_idx < g_rdt->num_ngroups; group_idx++) {
843 * Each group must have not-null proc_pids array.
844 * Initial refresh is not mandatory for proper
845 * PIDs statistics detection.
847 rdt_name_group_t *ng = &g_rdt->ngroups[group_idx];
849 proc_pids_init((const char **)ng->names, ng->num_names, &ng->proc_pids);
850 if (0 != init_result) {
852 ": Initialization of proc_pids for group %zu failed. Error: %d",
853 group_idx, init_result);
857 /* update global proc_pids table */
858 proc_pids_t **proc_pids =
859 realloc(g_rdt->proc_pids, (g_rdt->num_proc_pids + ng->num_names) *
860 sizeof(*g_rdt->proc_pids));
861 if (NULL == proc_pids) {
862 ERROR(RDT_PLUGIN ": Alloc error\n");
866 for (size_t i = 0; i < ng->num_names; i++)
867 proc_pids[g_rdt->num_proc_pids + i] = ng->proc_pids[i];
869 g_rdt->proc_pids = proc_pids;
870 g_rdt->num_proc_pids += ng->num_names;
873 if (g_rdt->num_ngroups > 0) {
875 proc_pids_update(RDT_PROC_PATH, g_rdt->proc_pids, g_rdt->num_proc_pids);
876 if (0 != update_result)
877 ERROR(RDT_PLUGIN ": Initial update of proc pids failed");
880 for (size_t group_idx = 0; group_idx < g_rdt->num_ngroups; group_idx++) {
881 int refresh_result = rdt_refresh_ngroup(&(g_rdt->ngroups[group_idx]),
882 g_rdt->pngroups[group_idx]);
883 if (0 != refresh_result)
884 ERROR(RDT_PLUGIN ": Initial refresh of group %zu failed. Error: %d",
885 group_idx, refresh_result);
888 #endif /* LIBPQOS2 */
894 * Function to deallocate memory allocated for core groups.
896 static void rdt_free_cgroups(void) {
897 config_cores_cleanup(&g_rdt->cores);
898 for (int i = 0; i < RDT_MAX_CORES; i++) {
899 sfree(g_rdt->pcgroups[i]);
901 g_rdt->cores.num_cgroups = 0;
904 static int rdt_default_cgroups(void) {
905 unsigned num_cores = g_rdt->pqos_cpu->num_cores;
907 g_rdt->cores.cgroups = calloc(num_cores, sizeof(*(g_rdt->cores.cgroups)));
908 if (g_rdt->cores.cgroups == NULL) {
909 ERROR(RDT_PLUGIN ": Error allocating core groups array");
912 g_rdt->cores.num_cgroups = num_cores;
914 /* configure each core in separate group */
915 for (unsigned i = 0; i < num_cores; i++) {
916 core_group_t *cgroup = g_rdt->cores.cgroups + i;
917 char desc[DATA_MAX_NAME_LEN];
919 /* set core group info */
920 cgroup->cores = calloc(1, sizeof(*cgroup->cores));
921 if (cgroup->cores == NULL) {
922 ERROR(RDT_PLUGIN ": Error allocating cores array");
926 cgroup->num_cores = 1;
927 cgroup->cores[0] = i;
929 ssnprintf(desc, sizeof(desc), "%d", g_rdt->pqos_cpu->cores[i].lcore);
930 cgroup->desc = strdup(desc);
931 if (cgroup->desc == NULL) {
932 ERROR(RDT_PLUGIN ": Error allocating core group description");
941 static int rdt_is_core_id_valid(unsigned int core_id) {
943 for (unsigned int i = 0; i < g_rdt->pqos_cpu->num_cores; i++)
944 if (core_id == g_rdt->pqos_cpu->cores[i].lcore)
950 static int rdt_config_cgroups(oconfig_item_t *item) {
952 enum pqos_mon_event events = 0;
954 if (config_cores_parse(item, &g_rdt->cores) < 0) {
956 ERROR(RDT_PLUGIN ": Error parsing core groups configuration.");
959 n = g_rdt->cores.num_cgroups;
961 /* validate configured core id values */
962 for (size_t group_idx = 0; group_idx < n; group_idx++) {
963 core_group_t *cgroup = g_rdt->cores.cgroups + group_idx;
964 for (size_t core_idx = 0; core_idx < cgroup->num_cores; core_idx++) {
965 if (!rdt_is_core_id_valid(cgroup->cores[core_idx])) {
966 ERROR(RDT_PLUGIN ": Core group '%s' contains invalid core id '%u'",
967 cgroup->desc, cgroup->cores[core_idx]);
975 /* create default core groups if "Cores" config option is empty */
976 int ret = rdt_default_cgroups();
979 ERROR(RDT_PLUGIN ": Error creating default core groups configuration.");
984 ": No core groups configured. Default core groups created.");
987 /* Get all available events on this platform */
988 for (unsigned int i = 0; i < g_rdt->cap_mon->u.mon->num_events; i++)
989 events |= g_rdt->cap_mon->u.mon->events[i].type;
991 events &= ~(PQOS_PERF_EVENT_LLC_MISS);
993 DEBUG(RDT_PLUGIN ": Number of cores in the system: %u",
994 g_rdt->pqos_cpu->num_cores);
995 DEBUG(RDT_PLUGIN ": Available events to monitor: %#x", events);
997 g_rdt->cores.num_cgroups = n;
998 for (int i = 0; i < n; i++) {
999 for (int j = 0; j < i; j++) {
1001 found = config_cores_cmp_cgroups(&g_rdt->cores.cgroups[j],
1002 &g_rdt->cores.cgroups[i]);
1005 ERROR(RDT_PLUGIN ": Cannot monitor same cores in different groups.");
1010 g_rdt->events[i] = events;
1011 g_rdt->pcgroups[i] = calloc(1, sizeof(*g_rdt->pcgroups[i]));
1012 if (g_rdt->pcgroups[i] == NULL) {
1014 ERROR(RDT_PLUGIN ": Failed to allocate memory for monitoring data.");
1022 static void rdt_pqos_log(void *context, const size_t size, const char *msg) {
1023 DEBUG(RDT_PLUGIN ": %s", msg);
1026 static int rdt_preinit(void) {
1029 if (g_rdt != NULL) {
1030 /* already initialized if config callback was called before init callback */
1034 g_rdt = calloc(1, sizeof(*g_rdt));
1035 if (g_rdt == NULL) {
1036 ERROR(RDT_PLUGIN ": Failed to allocate memory for rdt context.");
1040 struct pqos_config pqos = {.fd_log = -1,
1041 .callback_log = rdt_pqos_log,
1042 .context_log = NULL,
1045 .interface = PQOS_INTER_OS_RESCTRL_MON};
1046 DEBUG(RDT_PLUGIN ": Initializing PQoS with RESCTRL interface");
1048 .interface = PQOS_INTER_MSR};
1049 DEBUG(RDT_PLUGIN ": Initializing PQoS with MSR interface");
1052 ret = pqos_init(&pqos);
1053 DEBUG(RDT_PLUGIN ": PQoS initialization result: [%d]", ret);
1056 if (ret == PQOS_RETVAL_INTER) {
1057 pqos.interface = PQOS_INTER_MSR;
1058 DEBUG(RDT_PLUGIN ": Initializing PQoS with MSR interface");
1059 ret = pqos_init(&pqos);
1060 DEBUG(RDT_PLUGIN ": PQoS initialization result: [%d]", ret);
1064 if (ret != PQOS_RETVAL_OK) {
1065 ERROR(RDT_PLUGIN ": Error initializing PQoS library!");
1066 goto rdt_preinit_error1;
1069 g_interface = pqos.interface;
1071 ret = pqos_cap_get(&g_rdt->pqos_cap, &g_rdt->pqos_cpu);
1072 if (ret != PQOS_RETVAL_OK) {
1073 ERROR(RDT_PLUGIN ": Error retrieving PQoS capabilities.");
1074 goto rdt_preinit_error2;
1077 ret = pqos_cap_get_type(g_rdt->pqos_cap, PQOS_CAP_TYPE_MON, &g_rdt->cap_mon);
1078 if (ret == PQOS_RETVAL_PARAM) {
1079 ERROR(RDT_PLUGIN ": Error retrieving monitoring capabilities.");
1080 goto rdt_preinit_error2;
1083 if (g_rdt->cap_mon == NULL) {
1086 ": Monitoring capability not detected. Nothing to do for the plugin.");
1087 goto rdt_preinit_error2;
1090 /* Reset pqos monitoring groups registers */
1104 static int rdt_config(oconfig_item_t *ci) {
1105 if (rdt_preinit() != 0) {
1106 g_state = CONFIGURATION_ERROR;
1107 /* if we return -1 at this point collectd
1108 reports a failure in configuration and
1114 for (int i = 0; i < ci->children_num; i++) {
1115 oconfig_item_t *child = ci->children + i;
1117 if (strncasecmp("Cores", child->key, (size_t)strlen("Cores")) == 0) {
1118 if (g_rdt->cores.num_cgroups > 0) {
1120 ": Configuration parameter \"%s\" can be used only once.",
1122 g_state = CONFIGURATION_ERROR;
1123 } else if (rdt_config_cgroups(child) != 0)
1124 g_state = CONFIGURATION_ERROR;
1126 if (g_state == CONFIGURATION_ERROR)
1127 /* if we return -1 at this point collectd
1128 reports a failure in configuration and
1135 #endif /* COLLECT_DEBUG */
1136 } else if (strncasecmp("Processes", child->key,
1137 (size_t)strlen("Processes")) == 0) {
1139 if (g_interface != PQOS_INTER_OS_RESCTRL_MON) {
1140 ERROR(RDT_PLUGIN ": Configuration parameter \"%s\" not supported. "
1141 "Resctrl monitoring is needed for PIDs monitoring.",
1143 g_state = CONFIGURATION_ERROR;
1146 else if (g_rdt->num_ngroups > 0) {
1148 ": Configuration parameter \"%s\" can be used only once.",
1150 g_state = CONFIGURATION_ERROR;
1153 else if (rdt_config_ngroups(g_rdt, child) != 0)
1154 g_state = CONFIGURATION_ERROR;
1156 if (g_state == CONFIGURATION_ERROR)
1157 /* if we return -1 at this point collectd
1158 reports a failure in configuration and
1165 #endif /* COLLECT_DEBUG */
1166 #else /* !LIBPQOS2 */
1167 ERROR(RDT_PLUGIN ": Configuration parameter \"%s\" not supported, please "
1168 "recompile collectd with libpqos version 2.0 or newer.",
1170 #endif /* LIBPQOS2 */
1172 ERROR(RDT_PLUGIN ": Unknown configuration parameter \"%s\".", child->key);
1179 static int read_cores_data() {
1181 if (0 == g_rdt->cores.num_cgroups) {
1182 DEBUG(RDT_PLUGIN ": read_cores_data: not configured - Cores read skipped");
1185 DEBUG(RDT_PLUGIN ": read_cores_data: Cores data poll");
1188 pqos_mon_poll(&g_rdt->pcgroups[0], (unsigned)g_rdt->cores.num_cgroups);
1189 if (ret != PQOS_RETVAL_OK) {
1190 ERROR(RDT_PLUGIN ": read_cores_data: Failed to poll monitoring data for "
1191 "cores. Error [%d].",
1196 for (size_t i = 0; i < g_rdt->cores.num_cgroups; i++) {
1197 core_group_t *cgroup = g_rdt->cores.cgroups + i;
1198 enum pqos_mon_event mbm_events =
1199 (PQOS_MON_EVENT_LMEM_BW | PQOS_MON_EVENT_TMEM_BW |
1200 PQOS_MON_EVENT_RMEM_BW);
1202 const struct pqos_event_values *pv = &g_rdt->pcgroups[i]->values;
1204 /* Submit only monitored events data */
1206 if (g_rdt->events[i] & PQOS_MON_EVENT_L3_OCCUP)
1207 rdt_submit_gauge(cgroup->desc, "bytes", "llc", pv->llc);
1209 if (g_rdt->events[i] & PQOS_PERF_EVENT_IPC)
1210 rdt_submit_gauge(cgroup->desc, "ipc", NULL, pv->ipc);
1212 if (g_rdt->events[i] & mbm_events) {
1213 rdt_submit_derive(cgroup->desc, "memory_bandwidth", "local",
1214 pv->mbm_local_delta);
1215 rdt_submit_derive(cgroup->desc, "memory_bandwidth", "remote",
1216 pv->mbm_remote_delta);
1221 rdt_dump_cores_data();
1222 #endif /* COLLECT_DEBUG */
1227 static int rdt_read(__attribute__((unused)) user_data_t *ud) {
1229 if (g_rdt == NULL) {
1230 ERROR(RDT_PLUGIN ": rdt_read: plugin not initialized.");
1234 int cores_read_result = read_cores_data();
1237 int pids_read_result = read_pids_data();
1238 #endif /* LIBPQOS2 */
1240 if (0 != cores_read_result)
1241 return cores_read_result;
1244 if (0 != pids_read_result)
1245 return pids_read_result;
1246 #endif /* LIBPQOS2 */
1251 static void rdt_init_cores_monitoring() {
1252 for (size_t i = 0; i < g_rdt->cores.num_cgroups; i++) {
1253 core_group_t *cg = g_rdt->cores.cgroups + i;
1255 int mon_start_result =
1256 pqos_mon_start(cg->num_cores, cg->cores, g_rdt->events[i],
1257 (void *)cg->desc, g_rdt->pcgroups[i]);
1259 if (mon_start_result != PQOS_RETVAL_OK)
1261 ": Error starting cores monitoring group %s (pqos status=%d)",
1262 cg->desc, mon_start_result);
1266 static int rdt_init(void) {
1268 if (g_state == CONFIGURATION_ERROR) {
1269 if (g_rdt != NULL) {
1270 if (g_rdt->cores.num_cgroups > 0)
1273 if (g_rdt->num_ngroups > 0)
1274 rdt_free_ngroups(g_rdt);
1280 int rdt_preinint_result = rdt_preinit();
1281 if (rdt_preinint_result != 0)
1282 return rdt_preinint_result;
1284 rdt_init_cores_monitoring();
1286 rdt_init_pids_monitoring();
1287 #endif /* LIBPQOS2 */
1292 static int rdt_shutdown(void) {
1295 DEBUG(RDT_PLUGIN ": rdt_shutdown.");
1300 /* Stop monitoring cores */
1301 for (size_t i = 0; i < g_rdt->cores.num_cgroups; i++) {
1302 pqos_mon_stop(g_rdt->pcgroups[i]);
1305 /* Stop pids monitoring */
1307 for (size_t i = 0; i < g_rdt->num_ngroups; i++)
1308 pqos_mon_stop(g_rdt->pngroups[i]);
1312 if (ret != PQOS_RETVAL_OK)
1313 ERROR(RDT_PLUGIN ": Error shutting down PQoS library.");
1316 rdt_free_ngroups(g_rdt);
1317 #endif /* LIBPQOS2 */
1323 void module_register(void) {
1324 plugin_register_init(RDT_PLUGIN, rdt_init);
1325 plugin_register_complex_config(RDT_PLUGIN, rdt_config);
1326 plugin_register_complex_read(NULL, RDT_PLUGIN, rdt_read, 0, NULL);
1327 plugin_register_shutdown(RDT_PLUGIN, rdt_shutdown);