2 * collectd - src/threshold.c
3 * Copyright (C) 2007-2010 Florian Forster
4 * Copyright (C) 2008-2009 Sebastian Harl
5 * Copyright (C) 2009 Andrés J. Díaz
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU General Public License as published by the
9 * Free Software Foundation; only version 2 of the License is applicable.
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
16 * You should have received a copy of the GNU General Public License along
17 * with this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 * Florian octo Forster <octo at collectd.org>
22 * Sebastian Harl <sh at tokkee.org>
23 * Andrés J. Díaz <ajdiaz at connectical.com>
29 #include "utils/avltree/avltree.h"
30 #include "utils/common/common.h"
31 #include "utils_cache.h"
32 #include "utils_threshold.h"
/*
 * Threshold management
 * ====================
 * The following functions add, delete, search, etc. configured thresholds to
 * the underlying AVL trees.
 */
42 * int ut_threshold_add
44 * Adds a threshold configuration to the list of thresholds. The threshold_t
45 * structure is copied and may be destroyed after this call. Returns zero on
46 * success, non-zero otherwise.
48 static int ut_threshold_add(const threshold_t *th) { /* {{{ */
49 char name[6 * DATA_MAX_NAME_LEN];
55 if (format_name(name, sizeof(name), th->host, th->plugin, th->plugin_instance,
56 th->type, th->type_instance) != 0) {
57 ERROR("ut_threshold_add: format_name failed.");
61 name_copy = strdup(name);
62 if (name_copy == NULL) {
63 ERROR("ut_threshold_add: strdup failed.");
67 th_copy = malloc(sizeof(*th_copy));
68 if (th_copy == NULL) {
70 ERROR("ut_threshold_add: malloc failed.");
73 memcpy(th_copy, th, sizeof(threshold_t));
75 DEBUG("ut_threshold_add: Adding entry `%s'", name);
77 pthread_mutex_lock(&threshold_lock);
79 th_ptr = threshold_get(th->host, th->plugin, th->plugin_instance, th->type,
82 while ((th_ptr != NULL) && (th_ptr->next != NULL))
83 th_ptr = th_ptr->next;
85 if (th_ptr == NULL) /* no such threshold yet */
87 status = c_avl_insert(threshold_tree, name_copy, th_copy);
88 } else /* th_ptr points to the last threshold in the list */
90 th_ptr->next = th_copy;
91 /* name_copy isn't needed */
95 pthread_mutex_unlock(&threshold_lock);
98 ERROR("ut_threshold_add: c_avl_insert (%s) failed.", name);
104 } /* }}} int ut_threshold_add */
/*
 * The following functions are used to handle the configuration and fill the
 * threshold list.
 */
112 static int ut_config_type(const threshold_t *th_orig, oconfig_item_t *ci) {
116 if ((ci->values_num != 1) || (ci->values[0].type != OCONFIG_TYPE_STRING)) {
117 WARNING("threshold values: The `Type' block needs exactly one string "
122 if (ci->children_num < 1) {
123 WARNING("threshold values: The `Type' block needs at least one option.");
127 memcpy(&th, th_orig, sizeof(th));
128 sstrncpy(th.type, ci->values[0].value.string, sizeof(th.type));
130 th.warning_min = NAN;
131 th.warning_max = NAN;
132 th.failure_min = NAN;
133 th.failure_max = NAN;
136 th.flags = UT_FLAG_INTERESTING; /* interesting by default */
138 for (int i = 0; i < ci->children_num; i++) {
139 oconfig_item_t *option = ci->children + i;
141 if (strcasecmp("Instance", option->key) == 0)
142 status = cf_util_get_string_buffer(option, th.type_instance,
143 sizeof(th.type_instance));
144 else if (strcasecmp("DataSource", option->key) == 0)
145 status = cf_util_get_string_buffer(option, th.data_source,
146 sizeof(th.data_source));
147 else if (strcasecmp("WarningMax", option->key) == 0)
148 status = cf_util_get_double(option, &th.warning_max);
149 else if (strcasecmp("FailureMax", option->key) == 0)
150 status = cf_util_get_double(option, &th.failure_max);
151 else if (strcasecmp("WarningMin", option->key) == 0)
152 status = cf_util_get_double(option, &th.warning_min);
153 else if (strcasecmp("FailureMin", option->key) == 0)
154 status = cf_util_get_double(option, &th.failure_min);
155 else if (strcasecmp("Interesting", option->key) == 0)
156 status = cf_util_get_flag(option, &th.flags, UT_FLAG_INTERESTING);
157 else if (strcasecmp("Invert", option->key) == 0)
158 status = cf_util_get_flag(option, &th.flags, UT_FLAG_INVERT);
159 else if (strcasecmp("Persist", option->key) == 0)
160 status = cf_util_get_flag(option, &th.flags, UT_FLAG_PERSIST);
161 else if (strcasecmp("PersistOK", option->key) == 0)
162 status = cf_util_get_flag(option, &th.flags, UT_FLAG_PERSIST_OK);
163 else if (strcasecmp("Percentage", option->key) == 0)
164 status = cf_util_get_flag(option, &th.flags, UT_FLAG_PERCENTAGE);
165 else if (strcasecmp("Hits", option->key) == 0)
166 status = cf_util_get_int(option, &th.hits);
167 else if (strcasecmp("Hysteresis", option->key) == 0)
168 status = cf_util_get_double(option, &th.hysteresis);
170 WARNING("threshold values: Option `%s' not allowed inside a `Type' "
181 status = ut_threshold_add(&th);
185 } /* int ut_config_type */
187 static int ut_config_plugin(const threshold_t *th_orig, oconfig_item_t *ci) {
191 if ((ci->values_num != 1) || (ci->values[0].type != OCONFIG_TYPE_STRING)) {
192 WARNING("threshold values: The `Plugin' block needs exactly one string "
197 if (ci->children_num < 1) {
198 WARNING("threshold values: The `Plugin' block needs at least one nested "
203 memcpy(&th, th_orig, sizeof(th));
204 sstrncpy(th.plugin, ci->values[0].value.string, sizeof(th.plugin));
206 for (int i = 0; i < ci->children_num; i++) {
207 oconfig_item_t *option = ci->children + i;
209 if (strcasecmp("Type", option->key) == 0)
210 status = ut_config_type(&th, option);
211 else if (strcasecmp("Instance", option->key) == 0)
212 status = cf_util_get_string_buffer(option, th.plugin_instance,
213 sizeof(th.plugin_instance));
215 WARNING("threshold values: Option `%s' not allowed inside a `Plugin' "
226 } /* int ut_config_plugin */
228 static int ut_config_host(const threshold_t *th_orig, oconfig_item_t *ci) {
232 if ((ci->values_num != 1) || (ci->values[0].type != OCONFIG_TYPE_STRING)) {
233 WARNING("threshold values: The `Host' block needs exactly one string "
238 if (ci->children_num < 1) {
239 WARNING("threshold values: The `Host' block needs at least one nested "
244 memcpy(&th, th_orig, sizeof(th));
245 sstrncpy(th.host, ci->values[0].value.string, sizeof(th.host));
247 for (int i = 0; i < ci->children_num; i++) {
248 oconfig_item_t *option = ci->children + i;
250 if (strcasecmp("Type", option->key) == 0)
251 status = ut_config_type(&th, option);
252 else if (strcasecmp("Plugin", option->key) == 0)
253 status = ut_config_plugin(&th, option);
255 WARNING("threshold values: Option `%s' not allowed inside a `Host' "
266 } /* int ut_config_host */
/*
 * End of the functions used to configure threshold values.
 */
273 * int ut_report_state
275 * Checks if the `state' differs from the old state and creates a notification
279 static int ut_report_state(const data_set_t *ds, const value_list_t *vl,
280 const threshold_t *th, const gauge_t *values,
281 int ds_index, int state) { /* {{{ */
290 /* Check if hits matched */
291 if ((th->hits != 0)) {
292 int hits = uc_get_hits(ds, vl);
293 /* STATE_OKAY resets hits unless PERSIST_OK flag is set. Hits resets if
294 * threshold is hit. */
295 if (((state == STATE_OKAY) && ((th->flags & UT_FLAG_PERSIST_OK) == 0)) ||
297 DEBUG("ut_report_state: reset uc_get_hits = 0");
298 uc_set_hits(ds, vl, 0); /* reset hit counter and notify */
300 DEBUG("ut_report_state: th->hits = %d, uc_get_hits = %d", th->hits,
301 uc_get_hits(ds, vl));
302 (void)uc_inc_hits(ds, vl, 1); /* increase hit counter */
305 } /* end check hits */
307 state_old = uc_get_state(ds, vl);
309 /* If the state didn't change, report if `persistent' is specified. If the
310 * state is `okay', then only report if `persist_ok` flag is set. */
311 if (state == state_old) {
312 if (state == STATE_UNKNOWN) {
313 /* From UNKNOWN to UNKNOWN. Persist doesn't apply here. */
315 } else if ((th->flags & UT_FLAG_PERSIST) == 0)
317 else if ((state == STATE_OKAY) && ((th->flags & UT_FLAG_PERSIST_OK) == 0))
321 if (state != state_old)
322 uc_set_state(ds, vl, state);
324 NOTIFICATION_INIT_VL(&n, vl);
327 bufsize = sizeof(n.message);
329 if (state == STATE_OKAY)
330 n.severity = NOTIF_OKAY;
331 else if (state == STATE_WARNING)
332 n.severity = NOTIF_WARNING;
334 n.severity = NOTIF_FAILURE;
338 status = ssnprintf(buf, bufsize, "Host %s, plugin %s", vl->host, vl->plugin);
342 if (vl->plugin_instance[0] != '\0') {
343 status = ssnprintf(buf, bufsize, " (instance %s)", vl->plugin_instance);
348 status = ssnprintf(buf, bufsize, " type %s", vl->type);
352 if (vl->type_instance[0] != '\0') {
353 status = ssnprintf(buf, bufsize, " (instance %s)", vl->type_instance);
358 plugin_notification_meta_add_string(&n, "DataSource", ds->ds[ds_index].name);
359 plugin_notification_meta_add_double(&n, "CurrentValue", values[ds_index]);
360 plugin_notification_meta_add_double(&n, "WarningMin", th->warning_min);
361 plugin_notification_meta_add_double(&n, "WarningMax", th->warning_max);
362 plugin_notification_meta_add_double(&n, "FailureMin", th->failure_min);
363 plugin_notification_meta_add_double(&n, "FailureMax", th->failure_max);
365 /* Send an okay notification */
366 if (state == STATE_OKAY) {
367 if (state_old == STATE_MISSING)
368 ssnprintf(buf, bufsize, ": Value is no longer missing.");
370 ssnprintf(buf, bufsize,
371 ": All data sources are within range again. "
372 "Current value of \"%s\" is %f.",
373 ds->ds[ds_index].name, values[ds_index]);
374 } else if (state == STATE_UNKNOWN) {
375 ERROR("ut_report_state: metric transition to UNKNOWN from a different "
376 "state. This shouldn't happen.");
382 min = (state == STATE_ERROR) ? th->failure_min : th->warning_min;
383 max = (state == STATE_ERROR) ? th->failure_max : th->warning_max;
385 if (th->flags & UT_FLAG_INVERT) {
386 if (!isnan(min) && !isnan(max)) {
387 ssnprintf(buf, bufsize,
388 ": Data source \"%s\" is currently "
389 "%f. That is within the %s region of %f%s and %f%s.",
390 ds->ds[ds_index].name, values[ds_index],
391 (state == STATE_ERROR) ? "failure" : "warning", min,
392 ((th->flags & UT_FLAG_PERCENTAGE) != 0) ? "%" : "", max,
393 ((th->flags & UT_FLAG_PERCENTAGE) != 0) ? "%" : "");
395 ssnprintf(buf, bufsize,
396 ": Data source \"%s\" is currently "
397 "%f. That is %s the %s threshold of %f%s.",
398 ds->ds[ds_index].name, values[ds_index],
399 isnan(min) ? "below" : "above",
400 (state == STATE_ERROR) ? "failure" : "warning",
401 isnan(min) ? max : min,
402 ((th->flags & UT_FLAG_PERCENTAGE) != 0) ? "%" : "");
404 } else if (th->flags & UT_FLAG_PERCENTAGE) {
409 for (size_t i = 0; i < vl->values_len; i++) {
410 if (isnan(values[i]))
419 value = 100.0 * values[ds_index] / sum;
421 ssnprintf(buf, bufsize,
422 ": Data source \"%s\" is currently "
423 "%g (%.2f%%). That is %s the %s threshold of %.2f%%.",
424 ds->ds[ds_index].name, values[ds_index], value,
425 (value < min) ? "below" : "above",
426 (state == STATE_ERROR) ? "failure" : "warning",
427 (value < min) ? min : max);
428 } else /* is not inverted */
430 ssnprintf(buf, bufsize,
431 ": Data source \"%s\" is currently "
432 "%f. That is %s the %s threshold of %f.",
433 ds->ds[ds_index].name, values[ds_index],
434 (values[ds_index] < min) ? "below" : "above",
435 (state == STATE_ERROR) ? "failure" : "warning",
436 (values[ds_index] < min) ? min : max);
440 plugin_dispatch_notification(&n);
442 plugin_notification_meta_free(n.meta);
444 } /* }}} int ut_report_state */
447 * int ut_check_one_data_source
449 * Checks one data source against the given threshold configuration. If the
450 * `DataSource' option is set in the threshold, and the name does NOT match,
451 * `okay' is returned. If the threshold does match, its failure and warning
452 * min and max values are checked and `failure' or `warning' is returned if
456 static int ut_check_one_data_source(
457 const data_set_t *ds, const value_list_t __attribute__((unused)) * vl,
458 const threshold_t *th, const gauge_t *values, int ds_index) { /* {{{ */
462 int prev_state = STATE_OKAY;
464 /* check if this threshold applies to this data source */
466 ds_name = ds->ds[ds_index].name;
467 if ((th->data_source[0] != 0) && (strcmp(ds_name, th->data_source) != 0))
468 return STATE_UNKNOWN;
471 if ((th->flags & UT_FLAG_INVERT) != 0) {
476 /* XXX: This is an experimental code, not optimized, not fast, not reliable,
477 * and probably, do not work as you expect. Enjoy! :D */
478 if (th->hysteresis > 0) {
479 prev_state = uc_get_state(ds, vl);
480 /* The purpose of hysteresis is elliminating flapping state when the value
481 * oscilates around the thresholds. In other words, what is important is
482 * the previous state; if the new value would trigger a transition, make
483 * sure that we artificially widen the range which is considered to apply
484 * for the previous state, and only trigger the notification if the value
485 * is outside of this expanded range.
487 * There is no hysteresis for the OKAY state.
489 gauge_t hysteresis_for_warning = 0, hysteresis_for_failure = 0;
490 switch (prev_state) {
492 hysteresis_for_failure = th->hysteresis;
495 hysteresis_for_warning = th->hysteresis;
499 /* do nothing -- the hysteresis only applies to the non-normal states */
503 if ((!isnan(th->failure_min) &&
504 (th->failure_min + hysteresis_for_failure > values[ds_index])) ||
505 (!isnan(th->failure_max) &&
506 (th->failure_max - hysteresis_for_failure < values[ds_index])))
509 if ((!isnan(th->warning_min) &&
510 (th->warning_min + hysteresis_for_warning > values[ds_index])) ||
511 (!isnan(th->warning_max) &&
512 (th->warning_max - hysteresis_for_warning < values[ds_index])))
515 } else { /* no hysteresis */
516 if ((!isnan(th->failure_min) && (th->failure_min > values[ds_index])) ||
517 (!isnan(th->failure_max) && (th->failure_max < values[ds_index])))
520 if ((!isnan(th->warning_min) && (th->warning_min > values[ds_index])) ||
521 (!isnan(th->warning_max) && (th->warning_max < values[ds_index])))
529 return STATE_WARNING;
532 } /* }}} int ut_check_one_data_source */
535 * int ut_check_one_threshold
537 * Checks all data sources of a value list against the given threshold, using
538 * the ut_check_one_data_source function above. Returns the worst status,
539 * which is `okay' if nothing has failed or `unknown' if no valid datasource was
541 * Returns less than zero if the data set doesn't have any data sources.
543 static int ut_check_one_threshold(const data_set_t *ds, const value_list_t *vl,
544 const threshold_t *th, const gauge_t *values,
545 int *ret_ds_index) { /* {{{ */
548 gauge_t values_copy[ds->ds_num];
550 memcpy(values_copy, values, sizeof(values_copy));
552 if ((th->flags & UT_FLAG_PERCENTAGE) != 0) {
556 if (ds->ds_num == 1) {
558 "ut_check_one_threshold: The %s type has only one data "
559 "source, but you have configured to check this as a percentage. "
560 "That doesn't make much sense, because the percentage will always "
565 /* Prepare `sum' and `num'. */
566 for (size_t i = 0; i < ds->ds_num; i++)
567 if (!isnan(values[i])) {
572 if ((num == 0) /* All data sources are undefined. */
573 || (sum == 0.0)) /* Sum is zero, cannot calculate percentage. */
575 for (size_t i = 0; i < ds->ds_num; i++)
576 values_copy[i] = NAN;
577 } else /* We can actually calculate the percentage. */
579 for (size_t i = 0; i < ds->ds_num; i++)
580 values_copy[i] = 100.0 * values[i] / sum;
582 } /* if (UT_FLAG_PERCENTAGE) */
584 for (size_t i = 0; i < ds->ds_num; i++) {
587 status = ut_check_one_data_source(ds, vl, th, values_copy, i);
592 } /* for (ds->ds_num) */
594 if (ret_ds_index != NULL)
595 *ret_ds_index = ds_index;
598 } /* }}} int ut_check_one_threshold */
601 * int ut_check_threshold
603 * Gets a list of matching thresholds and searches for the worst status by one
604 * of the thresholds. Then reports that status using the ut_report_state
606 * Returns zero on success and if no threshold has been configured. Returns
607 * less than zero on failure.
609 static int ut_check_threshold(const data_set_t *ds, const value_list_t *vl,
610 __attribute__((unused))
611 user_data_t *ud) { /* {{{ */
616 int worst_state = -1;
617 threshold_t *worst_th = NULL;
618 int worst_ds_index = -1;
620 if (threshold_tree == NULL)
623 /* Is this lock really necessary? So far, thresholds are only inserted at
625 pthread_mutex_lock(&threshold_lock);
626 th = threshold_search(vl);
627 pthread_mutex_unlock(&threshold_lock);
631 DEBUG("ut_check_threshold: Found matching threshold(s)");
633 values = uc_get_rate(ds, vl);
640 status = ut_check_one_threshold(ds, vl, th, values, &ds_index);
642 ERROR("ut_check_threshold: ut_check_one_threshold failed.");
647 if (worst_state < status) {
648 worst_state = status;
650 worst_ds_index = ds_index;
657 ut_report_state(ds, vl, worst_th, values, worst_ds_index, worst_state);
659 ERROR("ut_check_threshold: ut_report_state failed.");
667 } /* }}} int ut_check_threshold */
672 * This function is called whenever a value goes "missing".
674 static int ut_missing(const value_list_t *vl,
675 __attribute__((unused)) user_data_t *ud) { /* {{{ */
677 cdtime_t missing_time;
678 char identifier[6 * DATA_MAX_NAME_LEN];
682 if (threshold_tree == NULL)
685 th = threshold_search(vl);
686 /* dispatch notifications for "interesting" values only */
687 if ((th == NULL) || ((th->flags & UT_FLAG_INTERESTING) == 0))
691 missing_time = now - vl->time;
692 FORMAT_VL(identifier, sizeof(identifier), vl);
694 NOTIFICATION_INIT_VL(&n, vl);
695 ssnprintf(n.message, sizeof(n.message),
696 "%s has not been updated for %.3f seconds.", identifier,
697 CDTIME_T_TO_DOUBLE(missing_time));
700 plugin_dispatch_notification(&n);
703 } /* }}} int ut_missing */
705 static int ut_config(oconfig_item_t *ci) { /* {{{ */
707 int old_size = c_avl_size(threshold_tree);
709 if (threshold_tree == NULL) {
710 threshold_tree = c_avl_create((int (*)(const void *, const void *))strcmp);
711 if (threshold_tree == NULL) {
712 ERROR("ut_config: c_avl_create failed.");
722 .flags = UT_FLAG_INTERESTING /* interesting by default */
725 for (int i = 0; i < ci->children_num; i++) {
726 oconfig_item_t *option = ci->children + i;
728 if (strcasecmp("Type", option->key) == 0)
729 status = ut_config_type(&th, option);
730 else if (strcasecmp("Plugin", option->key) == 0)
731 status = ut_config_plugin(&th, option);
732 else if (strcasecmp("Host", option->key) == 0)
733 status = ut_config_host(&th, option);
735 WARNING("threshold values: Option `%s' not allowed here.", option->key);
743 /* register callbacks if this is the first time we see a valid config */
744 if ((old_size == 0) && (c_avl_size(threshold_tree) > 0)) {
745 plugin_register_missing("threshold", ut_missing,
746 /* user data = */ NULL);
747 plugin_register_write("threshold", ut_check_threshold,
748 /* user data = */ NULL);
752 } /* }}} int um_config */
754 void module_register(void) {
755 plugin_register_complex_config("threshold", ut_config);