+ DEBUG(RDT_PLUGIN ": rdt_refresh_ngroup: \'%s\' process names group.",
+ ngroup->desc);
+
+ proc_pids_t *proc_pids_array_prev = ngroup->proc_pids_array;
+ proc_pids_t *proc_pids_array_curr = NULL;
+
+ int fetch_result =
+ fetch_pids_for_procs(RDT_PROC_PATH, (const char **)ngroup->names,
+ ngroup->num_names, &proc_pids_array_curr);
+
+ if (0 != fetch_result) {
+ ERROR(RDT_PLUGIN ": rdt_refresh_ngroup: \'%s\' failed to fetch PIDs.",
+ ngroup->desc);
+ return fetch_result;
+ }
+
+ pids_list_t *new_pids = NULL;
+ size_t new_pids_count = 0;
+
+ pids_list_t *lost_pids = NULL;
+ size_t lost_pids_count = 0;
+
+ for (size_t i = 0; i < ngroup->num_names; ++i) {
+ if (NULL == proc_pids_array_prev[i].pids &&
+ NULL == proc_pids_array_curr[i].pids)
+ continue;
+ int diff_result = rdt_pid_list_diff(
+ proc_pids_array_prev[i].pids, proc_pids_array_curr[i].pids, &new_pids,
+ &new_pids_count, &lost_pids, &lost_pids_count);
+ if (0 != diff_result) {
+ ERROR(RDT_PLUGIN
+ ": rdt_refresh_ngroup: \'%s\'. Error [%d] during PID diff.",
+ ngroup->desc, diff_result);
+ result = -1;
+ goto cleanup;
+ }
+ }
+
+ DEBUG(RDT_PLUGIN ": rdt_refresh_ngroup: \'%s\' process names group, added: "
+ "%u, removed: %u.",
+ ngroup->desc, (unsigned)new_pids_count, (unsigned)lost_pids_count);
+
+ if (new_pids_count != 0 || lost_pids_count != 0) {
+
+ if (new_pids) {
+ pid_t new_pids_array[new_pids_count];
+ pids_list_to_array(new_pids_array, new_pids,
+ STATIC_ARRAY_SIZE(new_pids_array));
+
+ /* no pids are monitored for this group yet: start monitoring */
+ if (0 == ngroup->monitored_pids_count) {
+
+ int start_result =
+ pqos_mon_start_pids(new_pids_count, new_pids_array, ngroup->events,
+ (void *)ngroup->desc, group_mon_data);
+ if (PQOS_RETVAL_OK == start_result) {
+ ngroup->monitored_pids_count = new_pids_count;
+ } else {
+ ERROR(RDT_PLUGIN ": rdt_refresh_ngroup: \'%s\'. Error [%d] while "
+ "STARTING pids monitoring",
+ ngroup->desc, start_result);
+ result = -1;
+ goto pqos_error_recovery;
+ }
+
+ } else {
+
+ int add_result =
+ pqos_mon_add_pids(new_pids_count, new_pids_array, group_mon_data);
+ if (PQOS_RETVAL_OK == add_result)
+ ngroup->monitored_pids_count += new_pids_count;
+ else {
+ ERROR(RDT_PLUGIN
+ ": rdt_refresh_ngroup: \'%s\'. Error [%d] while ADDING pids.",
+ ngroup->desc, add_result);
+ result = -1;
+ goto pqos_error_recovery;
+ }
+ }
+ }
+
+ if (lost_pids) {
+ pid_t lost_pids_array[lost_pids_count];
+ pids_list_to_array(lost_pids_array, lost_pids,
+ STATIC_ARRAY_SIZE(lost_pids_array));
+
+ if (lost_pids_count == ngroup->monitored_pids_count) {
+ /* all pids for this group are lost: stop monitoring */
+ int stop_result = pqos_mon_stop(group_mon_data);
+ if (PQOS_RETVAL_OK != stop_result) {
+ ERROR(RDT_PLUGIN ": rdt_refresh_ngroup: \'%s\'. Error [%d] while "
+ "STOPPING monitoring",
+ ngroup->desc, stop_result);
+ result = -1;
+ goto pqos_error_recovery;
+ }
+ ngroup->monitored_pids_count = 0;
+ } else {
+ assert(lost_pids_count < ngroup->monitored_pids_count);
+ int remove_result = pqos_mon_remove_pids(
+ lost_pids_count, lost_pids_array, group_mon_data);
+ if (PQOS_RETVAL_OK == remove_result) {
+ ngroup->monitored_pids_count -= lost_pids_count;
+ } else {
+ ERROR(RDT_PLUGIN
+ ": rdt_refresh_ngroup: \'%s\'. Error [%d] while REMOVING pids.",
+ ngroup->desc, remove_result);
+ result = -1;
+ goto pqos_error_recovery;
+ }
+ }
+ }
+
+ ngroup->proc_pids_array = proc_pids_array_curr;
+ }
+
+ goto cleanup;
+
+pqos_error_recovery:
+ /* Why?
+ * Resources might be temporarily unavailable.
+ *
+ * How?
+ * Collectd will halt the reading thread for this
+ * plugin if it returns an error.
+ * Consecutive errors will keep increasing the read period,
+ * up to a 1-day interval.
+ * On a pqos error, stop monitoring the current group
+ * and reset the proc_pids array;
+ * monitoring will be restarted on the next collectd read cycle.
+ */
+ DEBUG(RDT_PLUGIN ": rdt_refresh_ngroup: \'%s\' group RESET after error.",
+ ngroup->desc);
+ pqos_mon_stop(group_mon_data);
+ for (size_t i = 0; i < ngroup->num_names; ++i) {
+ if (ngroup->proc_pids_array[i].pids)
+ pids_list_free(ngroup->proc_pids_array[i].pids);
+ }
+ sfree(ngroup->proc_pids_array);
+
+ initialize_proc_pids((const char **)ngroup->names, ngroup->num_names,
+ &ngroup->proc_pids_array);
+ ngroup->monitored_pids_count = 0;
+
+cleanup:
+ if (ngroup->proc_pids_array == proc_pids_array_curr) {
+ assert(proc_pids_array_curr);
+ /* new list was successfully saved, free the old one */
+ for (size_t i = 0; i < ngroup->num_names; ++i)
+ if (proc_pids_array_prev[i].pids)
+ pids_list_free(proc_pids_array_prev[i].pids);
+
+ sfree(proc_pids_array_prev);
+
+ } else {
+ /* new list was not saved. Free the new list, keep the old one*/
+ for (size_t i = 0; i < ngroup->num_names; ++i)
+ if (proc_pids_array_curr[i].pids)
+ pids_list_free(proc_pids_array_curr[i].pids);
+
+ sfree(proc_pids_array_curr);
+ }
+
+ if (new_pids)
+ pids_list_free(new_pids);
+
+ if (lost_pids)
+ pids_list_free(lost_pids);
+
+ return result;
+}
+
+static int read_pids_data() {
+
+ if (0 == g_rdt->num_ngroups) {
+ DEBUG(RDT_PLUGIN ": read_pids_data: not configured - PIDs read skipped");
+ return 0;
+ }
+
+ DEBUG(RDT_PLUGIN ": read_pids_data: Scanning active groups");
+ struct pqos_mon_data *active_groups[RDT_MAX_NAMES_GROUPS] = {0};
+ size_t active_group_idx = 0;
+ for (size_t pngroups_idx = 0;
+ pngroups_idx < STATIC_ARRAY_SIZE(g_rdt->pngroups); ++pngroups_idx)
+ if (0 != g_rdt->ngroups[pngroups_idx].monitored_pids_count)
+ active_groups[active_group_idx++] = g_rdt->pngroups[pngroups_idx];
+
+ int ret = 0;
+
+ if (0 == active_group_idx) {
+ DEBUG(RDT_PLUGIN ": read_pids_data: no active groups - PIDs read skipped");
+ goto groups_refresh;
+ }
+
+ DEBUG(RDT_PLUGIN ": read_pids_data: PIDs data polling");
+
+ int poll_result = pqos_mon_poll(active_groups, active_group_idx);
+ if (poll_result != PQOS_RETVAL_OK) {
+ ERROR(RDT_PLUGIN ": read_pids_data: Failed to poll monitoring data for "
+ "pids. Error [%d].",
+ poll_result);
+ ret = -poll_result;
+ goto groups_refresh;
+ }
+
+ for (size_t i = 0; i < g_rdt->num_ngroups; i++) {
+ enum pqos_mon_event mbm_events =
+ (PQOS_MON_EVENT_LMEM_BW | PQOS_MON_EVENT_TMEM_BW |
+ PQOS_MON_EVENT_RMEM_BW);
+
+ if (g_rdt->pngroups[i] == NULL ||
+ g_rdt->ngroups[i].monitored_pids_count == 0)
+ continue;
+
+ const struct pqos_event_values *pv = &g_rdt->pngroups[i]->values;
+
+ /* Submit only monitored events data */
+
+ if (g_rdt->ngroups[i].events & PQOS_MON_EVENT_L3_OCCUP)
+ rdt_submit_gauge(g_rdt->ngroups[i].desc, "bytes", "llc", pv->llc);
+
+ if (g_rdt->ngroups[i].events & PQOS_PERF_EVENT_IPC)
+ rdt_submit_gauge(g_rdt->ngroups[i].desc, "ipc", NULL, pv->ipc);
+
+ if (g_rdt->ngroups[i].events & mbm_events) {
+ rdt_submit_derive(g_rdt->ngroups[i].desc, "memory_bandwidth", "local",
+ pv->mbm_local_delta);
+ rdt_submit_derive(g_rdt->ngroups[i].desc, "memory_bandwidth", "remote",
+ pv->mbm_remote_delta);
+ }
+ }
+