From: Pavel Rochnyak Date: Thu, 5 Jul 2018 09:03:17 +0000 (+0700) Subject: Merge pull request #2733 from elfiesmelfie/feat_pcie_aer X-Git-Url: https://git.octo.it/?p=collectd.git;a=commitdiff_plain;h=829683c47113c0f6305c9089424170ff706d047c;hp=e9c6bf25649bb8ead1bf383e51426b6552f08251 Merge pull request #2733 from elfiesmelfie/feat_pcie_aer New plugin to read PCIe errors --- diff --git a/Makefile.am b/Makefile.am index 190ce8ec..4d32f74c 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1381,6 +1381,24 @@ ovs_stats_la_LDFLAGS = $(PLUGIN_LDFLAGS) $(BUILD_WITH_LIBYAJL_LDFLAGS) ovs_stats_la_LIBADD = $(BUILD_WITH_LIBYAJL_LIBS) endif +if BUILD_PLUGIN_PCIE_ERRORS +pkglib_LTLIBRARIES += pcie_errors.la +pcie_errors_la_SOURCES = src/pcie_errors.c +pcie_errors_la_CPPFLAGS = $(AM_CPPFLAGS) +pcie_errors_la_LDFLAGS = $(PLUGIN_LDFLAGS) + +test_plugin_pcie_errors_SOURCES = \ + src/pcie_errors_test.c \ + src/daemon/utils_llist.c \ + src/daemon/configfile.c \ + src/daemon/types_list.c +test_plugin_pcie_errors_CPPFLAGS = $(AM_CPPFLAGS) +test_plugin_pcie_errors_LDFLAGS = $(PLUGIN_LDFLAGS) +test_plugin_pcie_errors_LDADD = liboconfig.la libplugin_mock.la +check_PROGRAMS += test_plugin_pcie_errors +TESTS += test_plugin_pcie_errors +endif + if BUILD_PLUGIN_PERL pkglib_LTLIBRARIES += perl.la perl_la_SOURCES = src/perl.c diff --git a/README b/README index 2210b2b9..a111e84a 100644 --- a/README +++ b/README @@ -314,6 +314,10 @@ Features OVS documentation. + - pcie_errors + Read errors from PCI Express Device Status and AER extended capabilities. + + - perl The perl plugin implements a Perl-interpreter into collectd. 
You can write your own plugins in Perl and return arbitrary values using this diff --git a/configure.ac b/configure.ac index 7a14e01b..a91b755e 100644 --- a/configure.ac +++ b/configure.ac @@ -550,6 +550,12 @@ if test "x$ac_system" = "xLinux"; then AC_DEFINE([HAVE_CAPABILITY], [1], [Define to 1 if you have cap_get_proc() (-lcap).]) fi + # For pcie_errors plugin + AC_CHECK_HEADERS([linux/pci_regs.h], + [have_pci_regs_h="yes"], + [have_pci_regs_h="no (linux/pci_regs.h not found)"] + ) + else have_linux_raid_md_u_h="no" have_linux_wireless_h="no" @@ -6288,6 +6294,7 @@ plugin_nfs="no" plugin_numa="no" plugin_ovs_events="no" plugin_ovs_stats="no" +plugin_pcie_errors="no" plugin_perl="no" plugin_pinba="no" plugin_processes="no" @@ -6366,6 +6373,10 @@ if test "x$ac_system" = "xLinux"; then plugin_ovs_events="yes" plugin_ovs_stats="yes" fi + + if test "x$have_pci_regs_h" = "xyes"; then + plugin_pcie_errors="yes" + fi fi if test "x$ac_system" = "xOpenBSD"; then @@ -6743,6 +6754,7 @@ AC_PLUGIN([openvpn], [yes], [OpenVPN client stat AC_PLUGIN([oracle], [$with_oracle], [Oracle plugin]) AC_PLUGIN([ovs_events], [$plugin_ovs_events], [OVS events plugin]) AC_PLUGIN([ovs_stats], [$plugin_ovs_stats], [OVS statistics plugin]) +AC_PLUGIN([pcie_errors], [$plugin_pcie_errors], [PCIe errors plugin]) AC_PLUGIN([perl], [$plugin_perl], [Embed a Perl interpreter]) AC_PLUGIN([pf], [$have_net_pfvar_h], [BSD packet filter (PF) statistics]) # FIXME: Check for libevent, too. @@ -7164,6 +7176,7 @@ AC_MSG_RESULT([ openvpn . . . . . . . $enable_openvpn]) AC_MSG_RESULT([ oracle . . . . . . . $enable_oracle]) AC_MSG_RESULT([ ovs_events . . . . . $enable_ovs_events]) AC_MSG_RESULT([ ovs_stats . . . . . . $enable_ovs_stats]) +AC_MSG_RESULT([ pcie_errors . . . . . $enable_pcie_errors]) AC_MSG_RESULT([ perl . . . . . . . . $enable_perl]) AC_MSG_RESULT([ pf . . . . . . . . . $enable_pf]) AC_MSG_RESULT([ pinba . . . . . . . . 
$enable_pinba]) diff --git a/src/collectd.conf.in b/src/collectd.conf.in index 74b6c888..bb83bda9 100644 --- a/src/collectd.conf.in +++ b/src/collectd.conf.in @@ -173,6 +173,7 @@ #@BUILD_PLUGIN_ORACLE_TRUE@LoadPlugin oracle #@BUILD_PLUGIN_OVS_EVENTS_TRUE@LoadPlugin ovs_events #@BUILD_PLUGIN_OVS_STATS_TRUE@LoadPlugin ovs_stats +#@BUILD_PLUGIN_PCIE_ERRORS_TRUE@LoadPlugin pcie_errors #@BUILD_PLUGIN_PERL_TRUE@LoadPlugin perl #@BUILD_PLUGIN_PINBA_TRUE@LoadPlugin pinba #@BUILD_PLUGIN_PING_TRUE@LoadPlugin ping @@ -1130,6 +1131,12 @@ # Bridges "br0" "br_ext" #</Plugin> +#<Plugin pcie_errors> +# Source "sysfs" +# ReportMasked false +# PersistentNotifications false +#</Plugin> + #<Plugin perl> # IncludeDir "/my/include/path" # BaseName "Collectd::Plugins" diff --git a/src/collectd.conf.pod b/src/collectd.conf.pod index 9cae9c2c..01fae1b0 100644 --- a/src/collectd.conf.pod +++ b/src/collectd.conf.pod @@ -6281,6 +6281,52 @@ Default: empty (monitor all bridges) =back +=head2 Plugin C<pcie_errors> + +The I<pcie_errors> plugin collects PCI Express errors from Device Status in Capability +structure and from Advanced Error Reporting Extended Capability where available. +At every read it polls config space of PCI Express devices and dispatches +notification for every error that is set. It checks for new errors at every read. +The device is indicated in plugin_instance according to format "domain:bus:dev.fn". +Errors are divided into categories indicated by type_instance: "correctable", and +for uncorrectable errors "non_fatal" or "fatal". +Fatal errors are reported as I<NOTIF_FAILURE> and all others as I<NOTIF_WARNING>. + +B<Synopsis:> + + <Plugin "pcie_errors"> + Source "sysfs" + AccessDir "/sys/bus/pci" + ReportMasked false + PersistentNotifications false + </Plugin> + +B<Options:> + +=over 4 + +=item B<Source> B<sysfs>|B<proc> + +Use B<sysfs> or B<proc> to read data from /sysfs or /proc. +The default value is B<sysfs>. + +=item B<AccessDir> I<dir> + +Directory used to access device config space. It is optional and defaults to +/sys/bus/pci for B<sysfs> and to /proc/bus/pci for B<proc>. + +=item B<ReportMasked> B<false>|B<true> + +If true plugin will notify about errors that are set to masked in Error Mask register. 
+Such errors are not reported to the PCI Express Root Complex. Defaults to B<false>. + +=item B<PersistentNotifications> B<false>|B<true> + +If false plugin will dispatch notification only on set/clear of error. +The ones already reported will be ignored. Defaults to B<false>. + +=back + =head2 Plugin C<perl> This plugin embeds a Perl-interpreter into collectd and provides an interface diff --git a/src/pcie_errors.c b/src/pcie_errors.c new file mode 100644 index 00000000..b239a8c5 --- /dev/null +++ b/src/pcie_errors.c @@ -0,0 +1,795 @@ +/** + * collectd - src/pcie_errors.c + * + * Copyright(c) 2018 Intel Corporation. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Authors: + * Kamil Wiatrowski + **/ + +#include "collectd.h" + +#include "common.h" +#include "utils_llist.h" + +#include + +#define PCIE_ERRORS_PLUGIN "pcie_errors" +#define PCIE_DEFAULT_PROCDIR "/proc/bus/pci" +#define PCIE_DEFAULT_SYSFSDIR "/sys/bus/pci" +#define PCIE_NAME_LEN 512 +#define PCIE_BUFF_SIZE 1024 + +#define PCIE_ERROR "pcie_error" +#define PCIE_SEV_CE "correctable" +#define PCIE_SEV_FATAL "fatal" +#define PCIE_SEV_NOFATAL "non_fatal" + +#define PCIE_DEV(x) (((x) >> 3) & 0x1f) +#define PCIE_FN(x) ((x)&0x07) + +#define PCIE_ECAP_OFFSET 0x100 /* ECAP always begin at offset 0x100 */ + +typedef struct pcie_config_s { + bool use_sysfs; + bool notif_masked; + bool persistent; + char access_dir[PATH_MAX]; +} pcie_config_t; + +typedef struct pcie_device_s { + int fd; + int domain; + uint8_t bus; + uint8_t device; + uint8_t function; + int cap_exp; + int ecap_aer; + uint16_t device_status; + uint32_t correctable_errors; + uint32_t uncorrectable_errors; +} pcie_device_t; + +typedef struct pcie_fops_s { + int (*list_devices)(llist_t *dev_list); + int (*open)(pcie_device_t *dev); + void (*close)(pcie_device_t *dev); + int (*read)(pcie_device_t *dev, void *buff, int size, int pos); +} pcie_fops_t; + +typedef struct pcie_error_s { + int mask; + const char *desc; +} pcie_error_t; + +static llist_t *pcie_dev_list; +static pcie_config_t pcie_config = {.access_dir = "", .use_sysfs = true}; +static pcie_fops_t pcie_fops; + +/* Device Error Status */ +static const pcie_error_t pcie_base_errors[] = { + {PCI_EXP_DEVSTA_CED, "Correctable Error"}, + {PCI_EXP_DEVSTA_NFED, "Non-Fatal Error"}, + {PCI_EXP_DEVSTA_FED, "Fatal Error"}, + {PCI_EXP_DEVSTA_URD, "Unsupported Request"}}; +static const int pcie_base_errors_num = STATIC_ARRAY_SIZE(pcie_base_errors); + +/* Uncorrectable Error Status */ +static const pcie_error_t pcie_aer_ues[] = { +#ifdef PCI_ERR_UNC_DLP + {PCI_ERR_UNC_DLP, "Data Link Protocol"}, +#endif +#ifdef PCI_ERR_UNC_SURPDN + {PCI_ERR_UNC_SURPDN, "Surprise 
Down"}, +#endif +#ifdef PCI_ERR_UNC_POISON_TLP + {PCI_ERR_UNC_POISON_TLP, "Poisoned TLP"}, +#endif +#ifdef PCI_ERR_UNC_FCP + {PCI_ERR_UNC_FCP, "Flow Control Protocol"}, +#endif +#ifdef PCI_ERR_UNC_COMP_TIME + {PCI_ERR_UNC_COMP_TIME, "Completion Timeout"}, +#endif +#ifdef PCI_ERR_UNC_COMP_ABORT + {PCI_ERR_UNC_COMP_ABORT, "Completer Abort"}, +#endif +#ifdef PCI_ERR_UNC_UNX_COMP + {PCI_ERR_UNC_UNX_COMP, "Unexpected Completion"}, +#endif +#ifdef PCI_ERR_UNC_RX_OVER + {PCI_ERR_UNC_RX_OVER, "Receiver Overflow"}, +#endif +#ifdef PCI_ERR_UNC_MALF_TLP + {PCI_ERR_UNC_MALF_TLP, "Malformed TLP"}, +#endif +#ifdef PCI_ERR_UNC_ECRC + {PCI_ERR_UNC_ECRC, "ECRC Error Status"}, +#endif +#ifdef PCI_ERR_UNC_UNSUP + {PCI_ERR_UNC_UNSUP, "Unsupported Request"}, +#endif +#ifdef PCI_ERR_UNC_ACSV + {PCI_ERR_UNC_ACSV, "ACS Violation"}, +#endif +#ifdef PCI_ERR_UNC_INTN + {PCI_ERR_UNC_INTN, "Internal"}, +#endif +#ifdef PCI_ERR_UNC_MCBTLP + {PCI_ERR_UNC_MCBTLP, "MC blocked TLP"}, +#endif +#ifdef PCI_ERR_UNC_ATOMEG + {PCI_ERR_UNC_ATOMEG, "Atomic egress blocked"}, +#endif +#ifdef PCI_ERR_UNC_TLPPRE + {PCI_ERR_UNC_TLPPRE, "TLP prefix blocked"}, +#endif +}; +static const int pcie_aer_ues_num = STATIC_ARRAY_SIZE(pcie_aer_ues); + +/* Correctable Error Status */ +static const pcie_error_t pcie_aer_ces[] = { +#ifdef PCI_ERR_COR_RCVR + {PCI_ERR_COR_RCVR, "Receiver Error Status"}, +#endif +#ifdef PCI_ERR_COR_BAD_TLP + {PCI_ERR_COR_BAD_TLP, "Bad TLP Status"}, +#endif +#ifdef PCI_ERR_COR_BAD_DLLP + {PCI_ERR_COR_BAD_DLLP, "Bad DLLP Status"}, +#endif +#ifdef PCI_ERR_COR_REP_ROLL + {PCI_ERR_COR_REP_ROLL, "REPLAY_NUM Rollover"}, +#endif +#ifdef PCI_ERR_COR_REP_TIMER + {PCI_ERR_COR_REP_TIMER, "Replay Timer Timeout"}, +#endif +#ifdef PCI_ERR_COR_ADV_NFAT + {PCI_ERR_COR_ADV_NFAT, "Advisory Non-Fatal"}, +#endif +#ifdef PCI_ERR_COR_INTERNAL + {PCI_ERR_COR_INTERNAL, "Corrected Internal"}, +#endif +#ifdef PCI_ERR_COR_LOG_OVER + {PCI_ERR_COR_LOG_OVER, "Header Log Overflow"}, +#endif +}; +static const int 
pcie_aer_ces_num = STATIC_ARRAY_SIZE(pcie_aer_ces); + +static int pcie_add_device(llist_t *list, int domain, uint8_t bus, + uint8_t device, uint8_t fn) { + llentry_t *entry; + pcie_device_t *dev = calloc(1, sizeof(*dev)); + if (dev == NULL) { + ERROR(PCIE_ERRORS_PLUGIN ": Failed to allocate device"); + return -ENOMEM; + } + + dev->domain = domain; + dev->bus = bus; + dev->device = device; + dev->function = fn; + dev->cap_exp = -1; + dev->ecap_aer = -1; + entry = llentry_create(NULL, dev); + if (entry == NULL) { + ERROR(PCIE_ERRORS_PLUGIN ": Failed to create llentry"); + sfree(dev); + return -ENOMEM; + } + llist_append(list, entry); + + DEBUG(PCIE_ERRORS_PLUGIN ": pci device added to list: %04x:%02x:%02x.%d", + domain, bus, device, fn); + return 0; +} + +static void pcie_clear_list(llist_t *list) { + if (list == NULL) + return; + + for (llentry_t *e = llist_head(list); e != NULL; e = e->next) + sfree(e->value); + + llist_destroy(list); +} + +static int pcie_list_devices_proc(llist_t *dev_list) { + FILE *fd; + char file_name[PCIE_NAME_LEN]; + char buf[PCIE_BUFF_SIZE]; + unsigned int i = 0; + int ret = 0; + + if (dev_list == NULL) + return -EINVAL; + + ret = snprintf(file_name, sizeof(file_name), "%s/devices", + pcie_config.access_dir); + if (ret < 1 || (size_t)ret >= sizeof(file_name)) { + ERROR(PCIE_ERRORS_PLUGIN ": Access dir `%s' is too long (%d)", + pcie_config.access_dir, ret); + return -EINVAL; + } + fd = fopen(file_name, "r"); + if (!fd) { + char errbuf[PCIE_BUFF_SIZE]; + ERROR(PCIE_ERRORS_PLUGIN ": Cannot open file %s to get devices list: %s", + file_name, sstrerror(errno, errbuf, sizeof(errbuf))); + return -ENOENT; + } + + while (fgets(buf, sizeof(buf), fd)) { + unsigned int slot; + + if (sscanf(buf, "%x", &slot) != 1) { + ERROR(PCIE_ERRORS_PLUGIN ": Failed to read line %u from %s", i + 1, + file_name); + continue; + } + + uint8_t bus = slot >> 8U; + uint8_t dev = PCIE_DEV(slot); + uint8_t fn = PCIE_FN(slot); + ret = pcie_add_device(dev_list, 0, bus, dev, 
fn); + if (ret) + break; + + ++i; + } + + fclose(fd); + return ret; +} + +static int pcie_list_devices_sysfs(llist_t *dev_list) { + DIR *dir; + struct dirent *item; + char dir_name[PCIE_NAME_LEN]; + int ret = 0; + + if (dev_list == NULL) + return -EINVAL; + + ret = snprintf(dir_name, sizeof(dir_name), "%s/devices", + pcie_config.access_dir); + if (ret < 1 || (size_t)ret >= sizeof(dir_name)) { + ERROR(PCIE_ERRORS_PLUGIN ": Access dir `%s' is too long (%d)", + pcie_config.access_dir, ret); + return -EINVAL; + } + dir = opendir(dir_name); + if (!dir) { + char errbuf[PCIE_BUFF_SIZE]; + ERROR(PCIE_ERRORS_PLUGIN ": Cannot open dir %s to get devices list: %s", + dir_name, sstrerror(errno, errbuf, sizeof(errbuf))); + return -ENOENT; + } + + while ((item = readdir(dir))) { + unsigned int dom, bus, dev; + int fn; + + /* Omit special non-device entries */ + if (item->d_name[0] == '.') + continue; + + if (sscanf(item->d_name, "%x:%x:%x.%d", &dom, &bus, &dev, &fn) != 4) { + ERROR(PCIE_ERRORS_PLUGIN ": Failed to parse entry %s", item->d_name); + continue; + } + + ret = pcie_add_device(dev_list, dom, bus, dev, fn); + if (ret) + break; + } + + closedir(dir); + return ret; +} + +static void pcie_close(pcie_device_t *dev) { + if (close(dev->fd) == -1) { + char errbuf[PCIE_BUFF_SIZE]; + ERROR(PCIE_ERRORS_PLUGIN ": Failed to close %04x:%02x:%02x.%d, fd=%d: %s", + dev->domain, dev->bus, dev->device, dev->function, dev->fd, + sstrerror(errno, errbuf, sizeof(errbuf))); + } + + dev->fd = -1; +} + +static int pcie_open(pcie_device_t *dev, const char *name) { + dev->fd = open(name, O_RDONLY); + if (dev->fd == -1) { + char errbuf[PCIE_BUFF_SIZE]; + ERROR(PCIE_ERRORS_PLUGIN ": Failed to open file %s: %s", name, + sstrerror(errno, errbuf, sizeof(errbuf))); + return -ENOENT; + } + + return 0; +} + +static int pcie_open_proc(pcie_device_t *dev) { + char file_name[PCIE_NAME_LEN]; + + int ret = + snprintf(file_name, sizeof(file_name), "%s/%02x/%02x.%d", + pcie_config.access_dir, dev->bus, 
dev->device, dev->function); + if (ret < 1 || (size_t)ret >= sizeof(file_name)) { + ERROR(PCIE_ERRORS_PLUGIN ": Access dir `%s' is too long (%d)", + pcie_config.access_dir, ret); + return -EINVAL; + } + + return pcie_open(dev, file_name); +} + +static int pcie_open_sysfs(pcie_device_t *dev) { + char file_name[PCIE_NAME_LEN]; + + int ret = + snprintf(file_name, sizeof(file_name), + "%s/devices/%04x:%02x:%02x.%d/config", pcie_config.access_dir, + dev->domain, dev->bus, dev->device, dev->function); + if (ret < 1 || (size_t)ret >= sizeof(file_name)) { + ERROR(PCIE_ERRORS_PLUGIN ": Access dir `%s' is too long (%d)", + pcie_config.access_dir, ret); + return -EINVAL; + } + + return pcie_open(dev, file_name); +} + +static int pcie_read(pcie_device_t *dev, void *buff, int size, int pos) { + int len = pread(dev->fd, buff, size, pos); + if (len == size) + return 0; + + if (len == -1) { + char errbuf[PCIE_BUFF_SIZE]; + ERROR(PCIE_ERRORS_PLUGIN ": Failed to read %04x:%02x:%02x.%d at pos %d: %s", + dev->domain, dev->bus, dev->device, dev->function, pos, + sstrerror(errno, errbuf, sizeof(errbuf))); + } else { + ERROR(PCIE_ERRORS_PLUGIN + ": %04x:%02x:%02x.%d Read only %d bytes, should be %d", + dev->domain, dev->bus, dev->device, dev->function, len, size); + } + return -1; +} + +static uint8_t pcie_read8(pcie_device_t *dev, int pos) { + uint8_t value; + if (pcie_fops.read(dev, &value, 1, pos)) + return 0; + return value; +} + +static uint16_t pcie_read16(pcie_device_t *dev, int pos) { + uint16_t value; + if (pcie_fops.read(dev, &value, 2, pos)) + return 0; + return value; +} + +static uint32_t pcie_read32(pcie_device_t *dev, int pos) { + uint32_t value; + if (pcie_fops.read(dev, &value, 4, pos)) + return 0; + return value; +} + +static void pcie_dispatch_notification(pcie_device_t *dev, notification_t *n, + const char *type, + const char *type_instance) { + sstrncpy(n->host, hostname_g, sizeof(n->host)); + snprintf(n->plugin_instance, sizeof(n->plugin_instance), 
"%04x:%02x:%02x.%d", + dev->domain, dev->bus, dev->device, dev->function); + sstrncpy(n->type, type, sizeof(n->type)); + sstrncpy(n->type_instance, type_instance, sizeof(n->type_instance)); + + plugin_dispatch_notification(n); +} + +/* Report errors found in AER Correctable Error Status register */ +static void pcie_dispatch_correctable_errors(pcie_device_t *dev, + uint32_t errors, uint32_t masked) { + for (int i = 0; i < pcie_aer_ces_num; i++) { + const pcie_error_t *err = pcie_aer_ces + i; + notification_t n = {.severity = NOTIF_WARNING, + .time = cdtime(), + .plugin = PCIE_ERRORS_PLUGIN, + .meta = NULL}; + + /* If not specifically set by config option omit masked errors */ + if (!pcie_config.notif_masked && (err->mask & masked)) + continue; + + if (err->mask & errors) { + /* Error already reported, notify only if persistent is set */ + if (!pcie_config.persistent && (err->mask & dev->correctable_errors)) + continue; + + DEBUG(PCIE_ERRORS_PLUGIN ": %04x:%02x:%02x.%d: %s set", dev->domain, + dev->bus, dev->device, dev->function, err->desc); + snprintf(n.message, sizeof(n.message), "Correctable Error set: %s", + err->desc); + pcie_dispatch_notification(dev, &n, PCIE_ERROR, PCIE_SEV_CE); + + } else if (err->mask & dev->correctable_errors) { + DEBUG(PCIE_ERRORS_PLUGIN ": %04x:%02x:%02x.%d: %s cleared", dev->domain, + dev->bus, dev->device, dev->function, err->desc); + + n.severity = NOTIF_OKAY; + snprintf(n.message, sizeof(n.message), "Correctable Error cleared: %s", + err->desc); + pcie_dispatch_notification(dev, &n, PCIE_ERROR, PCIE_SEV_CE); + } + } +} + +/* Report errors found in AER Uncorrectable Error Status register */ +static void pcie_dispatch_uncorrectable_errors(pcie_device_t *dev, + uint32_t errors, uint32_t masked, + uint32_t severity) { + for (int i = 0; i < pcie_aer_ues_num; i++) { + const pcie_error_t *err = pcie_aer_ues + i; + const char *type_instance = + (severity & err->mask) ? 
PCIE_SEV_FATAL : PCIE_SEV_NOFATAL; + notification_t n = { + .time = cdtime(), .plugin = PCIE_ERRORS_PLUGIN, .meta = NULL}; + + /* If not specifically set by config option omit masked errors */ + if (!pcie_config.notif_masked && (err->mask & masked)) + continue; + + if (err->mask & errors) { + /* Error already reported, notify only if persistent is set */ + if (!pcie_config.persistent && (err->mask & dev->uncorrectable_errors)) + continue; + + DEBUG(PCIE_ERRORS_PLUGIN ": %04x:%02x:%02x.%d: %s(%s) set", dev->domain, + dev->bus, dev->device, dev->function, err->desc, type_instance); + + n.severity = (severity & err->mask) ? NOTIF_FAILURE : NOTIF_WARNING; + snprintf(n.message, sizeof(n.message), "Uncorrectable(%s) Error set: %s", + type_instance, err->desc); + pcie_dispatch_notification(dev, &n, PCIE_ERROR, type_instance); + + } else if (err->mask & dev->uncorrectable_errors) { + DEBUG(PCIE_ERRORS_PLUGIN ": %04x:%02x:%02x.%d: %s(%s) cleared", + dev->domain, dev->bus, dev->device, dev->function, err->desc, + type_instance); + + n.severity = NOTIF_OKAY; + snprintf(n.message, sizeof(n.message), + "Uncorrectable(%s) Error cleared: %s", type_instance, err->desc); + pcie_dispatch_notification(dev, &n, PCIE_ERROR, type_instance); + } + } +} + +/* Find offset of PCI Express Capability Structure + * in PCI configuration space. + * Returns offset, -1 if not found. +**/ +static int pcie_find_cap_exp(pcie_device_t *dev) { + int pos = pcie_read8(dev, PCI_CAPABILITY_LIST) & ~3; + + while (pos) { + uint8_t id = pcie_read8(dev, pos + PCI_CAP_LIST_ID); + + if (id == 0xff) + break; + if (id == PCI_CAP_ID_EXP) + return pos; + + pos = pcie_read8(dev, pos + PCI_CAP_LIST_NEXT) & ~3; + } + + DEBUG(PCIE_ERRORS_PLUGIN ": Cannot find CAP EXP for %04x:%02x:%02x.%d", + dev->domain, dev->bus, dev->device, dev->function); + + return -1; +} + +/* Find offset of Advanced Error Reporting Capability. + * Returns AER offset, -1 if not found. 
+**/ +static int pcie_find_ecap_aer(pcie_device_t *dev) { + int pos = PCIE_ECAP_OFFSET; + uint32_t header = pcie_read32(dev, pos); + int id = PCI_EXT_CAP_ID(header); + int next = PCI_EXT_CAP_NEXT(header); + + if (!id && !next) + return -1; + + if (id == PCI_EXT_CAP_ID_ERR) + return pos; + + while (next) { + if (next <= PCIE_ECAP_OFFSET) + break; + + header = pcie_read32(dev, next); + id = PCI_EXT_CAP_ID(header); + + if (id == PCI_EXT_CAP_ID_ERR) + return next; + + next = PCI_EXT_CAP_NEXT(header); + } + + return -1; +} + +static void pcie_check_dev_status(pcie_device_t *dev, int pos) { + /* Read Device Status register with mask for errors only */ + uint16_t new_status = pcie_read16(dev, pos + PCI_EXP_DEVSTA) & 0xf; + + /* Check if anything new should be reported */ + if (!(pcie_config.persistent && new_status) && + (new_status == dev->device_status)) + return; + + /* Report errors found in Device Status register */ + for (int i = 0; i < pcie_base_errors_num; i++) { + const pcie_error_t *err = pcie_base_errors + i; + const char *type_instance = (err->mask == PCI_EXP_DEVSTA_FED) + ? PCIE_SEV_FATAL + : (err->mask == PCI_EXP_DEVSTA_CED) + ? PCIE_SEV_CE + : PCIE_SEV_NOFATAL; + int severity = + (err->mask == PCI_EXP_DEVSTA_FED) ? 
NOTIF_FAILURE : NOTIF_WARNING; + notification_t n = {.severity = severity, + .time = cdtime(), + .plugin = PCIE_ERRORS_PLUGIN, + .meta = NULL}; + + if (err->mask & new_status) { + /* Error already reported, notify only if persistent is set */ + if (!pcie_config.persistent && (err->mask & dev->device_status)) + continue; + + DEBUG(PCIE_ERRORS_PLUGIN ": %04x:%02x:%02x.%d: %s set", dev->domain, + dev->bus, dev->device, dev->function, err->desc); + snprintf(n.message, sizeof(n.message), "Device Status Error set: %s", + err->desc); + pcie_dispatch_notification(dev, &n, PCIE_ERROR, type_instance); + + } else if (err->mask & dev->device_status) { + DEBUG(PCIE_ERRORS_PLUGIN ": %04x:%02x:%02x.%d: %s cleared", dev->domain, + dev->bus, dev->device, dev->function, err->desc); + n.severity = NOTIF_OKAY; + snprintf(n.message, sizeof(n.message), "Device Status Error cleared: %s", + err->desc); + pcie_dispatch_notification(dev, &n, PCIE_ERROR, type_instance); + } + } + + dev->device_status = new_status; +} + +static void pcie_check_aer(pcie_device_t *dev, int pos) { + /* Check for AER uncorrectable errors */ + uint32_t errors = pcie_read32(dev, pos + PCI_ERR_UNCOR_STATUS); + + if ((pcie_config.persistent && errors) || + (errors != dev->uncorrectable_errors)) { + uint32_t masked = pcie_read32(dev, pos + PCI_ERR_UNCOR_MASK); + uint32_t severity = pcie_read32(dev, pos + PCI_ERR_UNCOR_SEVER); + pcie_dispatch_uncorrectable_errors(dev, errors, masked, severity); + } + dev->uncorrectable_errors = errors; + + /* Check for AER correctable errors */ + errors = pcie_read32(dev, pos + PCI_ERR_COR_STATUS); + if ((pcie_config.persistent && errors) || + (errors != dev->correctable_errors)) { + uint32_t masked = pcie_read32(dev, pos + PCI_ERR_COR_MASK); + pcie_dispatch_correctable_errors(dev, errors, masked); + } + dev->correctable_errors = errors; +} + +static int pcie_process_devices(llist_t *devs) { + int ret = 0; + if (devs == NULL) + return -1; + + for (llentry_t *e = llist_head(devs); e != 
NULL; e = e->next) { + pcie_device_t *dev = e->value; + + if (pcie_fops.open(dev) == 0) { + pcie_check_dev_status(dev, dev->cap_exp); + if (dev->ecap_aer != -1) + pcie_check_aer(dev, dev->ecap_aer); + + pcie_fops.close(dev); + } else { + notification_t n = {.severity = NOTIF_FAILURE, + .time = cdtime(), + .message = "Failed to read device status", + .plugin = PCIE_ERRORS_PLUGIN, + .meta = NULL}; + pcie_dispatch_notification(dev, &n, "", ""); + ret = -1; + } + } + + return ret; +} + +/* This function is to be called during init to filter out no pcie devices */ +static void pcie_preprocess_devices(llist_t *devs) { + llentry_t *e_next; + + if (devs == NULL) + return; + + for (llentry_t *e = llist_head(devs); e != NULL; e = e_next) { + pcie_device_t *dev = e->value; + bool del = false; + + if (pcie_fops.open(dev) == 0) { + uint16_t status = pcie_read16(dev, PCI_STATUS); + if (status & PCI_STATUS_CAP_LIST) + dev->cap_exp = pcie_find_cap_exp(dev); + + /* Every PCIe device must have Capability Structure */ + if (dev->cap_exp == -1) { + DEBUG(PCIE_ERRORS_PLUGIN ": Not PCI Express device: %04x:%02x:%02x.%d", + dev->domain, dev->bus, dev->device, dev->function); + del = true; + } else { + dev->ecap_aer = pcie_find_ecap_aer(dev); + if (dev->ecap_aer == -1) + INFO(PCIE_ERRORS_PLUGIN + ": Device is not AER capable: %04x:%02x:%02x.%d", + dev->domain, dev->bus, dev->device, dev->function); + } + + pcie_fops.close(dev); + } else { + ERROR(PCIE_ERRORS_PLUGIN ": %04x:%02x:%02x.%d: failed to open", + dev->domain, dev->bus, dev->device, dev->function); + del = true; + } + + e_next = e->next; + if (del) { + sfree(dev); + llist_remove(devs, e); + llentry_destroy(e); + } + } +} + +static int pcie_plugin_read(__attribute__((unused)) user_data_t *ud) { + + if (pcie_process_devices(pcie_dev_list) < 0) { + ERROR(PCIE_ERRORS_PLUGIN ": Failed to read devices state"); + return -1; + } + return 0; +} + +static void pcie_access_config(void) { + /* Set functions for register access to + * use proc 
or sysfs depending on config. */ + if (pcie_config.use_sysfs) { + pcie_fops.list_devices = pcie_list_devices_sysfs; + pcie_fops.open = pcie_open_sysfs; + if (pcie_config.access_dir[0] == '\0') + sstrncpy(pcie_config.access_dir, PCIE_DEFAULT_SYSFSDIR, + sizeof(pcie_config.access_dir)); + } else { + /* use proc */ + pcie_fops.list_devices = pcie_list_devices_proc; + pcie_fops.open = pcie_open_proc; + if (pcie_config.access_dir[0] == '\0') + sstrncpy(pcie_config.access_dir, PCIE_DEFAULT_PROCDIR, + sizeof(pcie_config.access_dir)); + } + /* Common functions */ + pcie_fops.close = pcie_close; + pcie_fops.read = pcie_read; +} + +static int pcie_plugin_config(oconfig_item_t *ci) { + int status = 0; + + for (int i = 0; i < ci->children_num; i++) { + oconfig_item_t *child = ci->children + i; + + if (strcasecmp("Source", child->key) == 0) { + if ((child->values_num != 1) || + (child->values[0].type != OCONFIG_TYPE_STRING)) { + status = -1; + } else if (strcasecmp("proc", child->values[0].value.string) == 0) { + pcie_config.use_sysfs = false; + } else if (strcasecmp("sysfs", child->values[0].value.string) != 0) { + ERROR(PCIE_ERRORS_PLUGIN ": Allowed sources are 'proc' or 'sysfs'."); + status = -1; + } + } else if (strcasecmp("AccessDir", child->key) == 0) { + status = cf_util_get_string_buffer(child, pcie_config.access_dir, + sizeof(pcie_config.access_dir)); + } else if (strcasecmp("ReportMasked", child->key) == 0) { + status = cf_util_get_boolean(child, &pcie_config.notif_masked); + } else if (strcasecmp("PersistentNotifications", child->key) == 0) { + status = cf_util_get_boolean(child, &pcie_config.persistent); + } else { + ERROR(PCIE_ERRORS_PLUGIN ": Invalid configuration option \"%s\".", + child->key); + status = -1; + break; + } + + if (status) { + ERROR(PCIE_ERRORS_PLUGIN ": Invalid configuration parameter \"%s\".", + child->key); + break; + } + } + + return status; +} + +static int pcie_shutdown(void) { + pcie_clear_list(pcie_dev_list); + pcie_dev_list = NULL; + + 
return 0; +} + +static int pcie_init(void) { + + pcie_access_config(); + pcie_dev_list = llist_create(); + if (pcie_fops.list_devices(pcie_dev_list) != 0) { + ERROR(PCIE_ERRORS_PLUGIN ": Failed to find devices."); + pcie_shutdown(); + return -1; + } + pcie_preprocess_devices(pcie_dev_list); + if (llist_size(pcie_dev_list) == 0) { + /* No any PCI Express devices were found on the system */ + ERROR(PCIE_ERRORS_PLUGIN ": No PCIe devices found in %s", + pcie_config.access_dir); + pcie_shutdown(); + return -1; + } + + return 0; +} + +void module_register(void) { + plugin_register_init(PCIE_ERRORS_PLUGIN, pcie_init); + plugin_register_complex_config(PCIE_ERRORS_PLUGIN, pcie_plugin_config); + plugin_register_complex_read(NULL, PCIE_ERRORS_PLUGIN, pcie_plugin_read, 0, + NULL); + plugin_register_shutdown(PCIE_ERRORS_PLUGIN, pcie_shutdown); +} diff --git a/src/pcie_errors_test.c b/src/pcie_errors_test.c new file mode 100644 index 00000000..5cb95fa4 --- /dev/null +++ b/src/pcie_errors_test.c @@ -0,0 +1,570 @@ +/** + * collectd - src/pcie_errors.c + * + * Copyright(c) 2018 Intel Corporation. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Kamil Wiatrowski + **/ + +#define plugin_dispatch_notification plugin_dispatch_notification_pcie_test + +#include "pcie_errors.c" /* sic */ +#include "testing.h" + +#define TEST_DOMAIN 1 +#define TEST_BUS 5 +#define TEST_DEVICE 0xc +#define TEST_FUNCTION 2 +#define TEST_DEVICE_STR "0001:05:0c.2" + +#define G_BUFF_LEN 4 + +static notification_t last_notif; +static char g_buff[G_BUFF_LEN]; + +/* mock functions */ +int plugin_dispatch_notification_pcie_test(const notification_t *notif) { + last_notif = *notif; + return ENOTSUP; +} + +ssize_t pread(__attribute__((unused)) int fd, void *buf, size_t count, + __attribute__((unused)) off_t offset) { + if (count == 0 || count > G_BUFF_LEN) + return -1; + + memcpy(buf, g_buff, count); + return count; +} +/* end mock functions */ + +DEF_TEST(clear_dev_list) { + pcie_clear_list(NULL); + + llist_t *test_list = llist_create(); + CHECK_NOT_NULL(test_list); + + pcie_device_t *dev = calloc(1, sizeof(*dev)); + CHECK_NOT_NULL(dev); + + llentry_t *entry = llentry_create(NULL, dev); + CHECK_NOT_NULL(entry); + + llist_append(test_list, entry); + + for (llentry_t *e = llist_head(test_list); e != NULL; e = e->next) { + EXPECT_EQ_PTR(dev, e->value); + } + + pcie_clear_list(test_list); + + return 0; +} + +DEF_TEST(add_to_list) { + llist_t *test_list = llist_create(); + CHECK_NOT_NULL(test_list); + + int ret = pcie_add_device(test_list, TEST_DOMAIN, TEST_BUS, TEST_DEVICE, + TEST_FUNCTION); + EXPECT_EQ_INT(0, ret); + + llentry_t *e = llist_head(test_list); + CHECK_NOT_NULL(e); + OK(NULL == e->next); + + pcie_device_t *dev = e->value; + CHECK_NOT_NULL(dev); + EXPECT_EQ_INT(TEST_DOMAIN, dev->domain); + EXPECT_EQ_INT(TEST_BUS, dev->bus); + 
EXPECT_EQ_INT(TEST_DEVICE, dev->device); + EXPECT_EQ_INT(TEST_FUNCTION, dev->function); + EXPECT_EQ_INT(-1, dev->cap_exp); + EXPECT_EQ_INT(-1, dev->ecap_aer); + + pcie_clear_list(test_list); + + return 0; +} + +DEF_TEST(pcie_read) { + int ret; + pcie_device_t dev = {0}; + uint32_t val = 0; + g_buff[0] = 4; + g_buff[1] = 3; + g_buff[2] = 2; + g_buff[3] = 1; + + ret = pcie_read(&dev, &val, 1, 0); + EXPECT_EQ_INT(0, ret); + EXPECT_EQ_INT(4, val); + + ret = pcie_read(&dev, &val, 2, 0); + EXPECT_EQ_INT(0, ret); + EXPECT_EQ_INT(0x304, val); + + ret = pcie_read(&dev, &val, 3, 0); + EXPECT_EQ_INT(0, ret); + EXPECT_EQ_INT(0x20304, val); + + ret = pcie_read(&dev, &val, 4, 0); + EXPECT_EQ_INT(0, ret); + EXPECT_EQ_INT(0x1020304, val); + + ret = pcie_read(&dev, &val, G_BUFF_LEN + 1, 0); + EXPECT_EQ_INT(-1, ret); + + pcie_fops.read = pcie_read; + + uint8_t val8 = pcie_read8(&dev, 0); + EXPECT_EQ_INT(4, val8); + + uint16_t val16 = pcie_read16(&dev, 0); + EXPECT_EQ_INT(0x304, val16); + + uint32_t val32 = pcie_read32(&dev, 0); + EXPECT_EQ_INT(0x1020304, val32); + + return 0; +} + +DEF_TEST(dispatch_notification) { + pcie_device_t dev = {0, TEST_DOMAIN, TEST_BUS, TEST_DEVICE, TEST_FUNCTION, + 0, 0, 0, 0, 0}; + cdtime_t t = cdtime(); + notification_t n = { + .severity = 1, .time = t, .plugin = "pcie_errors_test", .meta = NULL}; + + pcie_dispatch_notification(&dev, &n, "test_type", "test_type_instance"); + EXPECT_EQ_INT(1, last_notif.severity); + EXPECT_EQ_UINT64(t, last_notif.time); + EXPECT_EQ_STR("pcie_errors_test", last_notif.plugin); + OK(NULL == last_notif.meta); + EXPECT_EQ_STR(hostname_g, last_notif.host); + EXPECT_EQ_STR(TEST_DEVICE_STR, last_notif.plugin_instance); + EXPECT_EQ_STR("test_type", last_notif.type); + EXPECT_EQ_STR("test_type_instance", last_notif.type_instance); + + return 0; +} + +DEF_TEST(access_config) { + pcie_config.use_sysfs = 0; + pcie_access_config(); + EXPECT_EQ_PTR(pcie_list_devices_proc, pcie_fops.list_devices); + EXPECT_EQ_PTR(pcie_open_proc, 
pcie_fops.open); + EXPECT_EQ_PTR(pcie_close, pcie_fops.close); + EXPECT_EQ_PTR(pcie_read, pcie_fops.read); + EXPECT_EQ_STR(PCIE_DEFAULT_PROCDIR, pcie_config.access_dir); + + sstrncpy(pcie_config.access_dir, "Test", sizeof(pcie_config.access_dir)); + pcie_access_config(); + EXPECT_EQ_STR("Test", pcie_config.access_dir); + + pcie_config.use_sysfs = 1; + pcie_access_config(); + EXPECT_EQ_PTR(pcie_list_devices_sysfs, pcie_fops.list_devices); + EXPECT_EQ_PTR(pcie_open_sysfs, pcie_fops.open); + EXPECT_EQ_PTR(pcie_close, pcie_fops.close); + EXPECT_EQ_PTR(pcie_read, pcie_fops.read); + EXPECT_EQ_STR("Test", pcie_config.access_dir); + + pcie_config.access_dir[0] = '\0'; + pcie_access_config(); + EXPECT_EQ_STR(PCIE_DEFAULT_SYSFSDIR, pcie_config.access_dir); + + return 0; +} + +DEF_TEST(plugin_config_fail) { + oconfig_item_t test_cfg_parent = {"pcie_errors", NULL, 0, NULL, NULL, 0}; + char value_buff[256] = "procs"; + char key_buff[256] = "Sources"; + oconfig_value_t test_cfg_value = {{value_buff}, OCONFIG_TYPE_STRING}; + oconfig_item_t test_cfg = { + key_buff, &test_cfg_value, 1, &test_cfg_parent, NULL, 0}; + + test_cfg_parent.children = &test_cfg; + test_cfg_parent.children_num = 1; + + int ret = pcie_plugin_config(&test_cfg_parent); + EXPECT_EQ_INT(-1, ret); + + sstrncpy(key_buff, "Source", sizeof(key_buff)); + ret = pcie_plugin_config(&test_cfg_parent); + EXPECT_EQ_INT(-1, ret); + + sstrncpy(value_buff, "proc", sizeof(value_buff)); + test_cfg_value.type = OCONFIG_TYPE_NUMBER; + ret = pcie_plugin_config(&test_cfg_parent); + EXPECT_EQ_INT(-1, ret); + + sstrncpy(key_buff, "AccessDir", sizeof(key_buff)); + ret = pcie_plugin_config(&test_cfg_parent); + EXPECT_EQ_INT(-1, ret); + + return 0; +} + +DEF_TEST(plugin_config) { + oconfig_item_t test_cfg_parent = {"pcie_errors", NULL, 0, NULL, NULL, 0}; + char value_buff[256] = "proc"; + char key_buff[256] = "source"; + oconfig_value_t test_cfg_value = {{value_buff}, OCONFIG_TYPE_STRING}; + oconfig_item_t test_cfg = { + key_buff, 
&test_cfg_value, 1, &test_cfg_parent, NULL, 0}; + + test_cfg_parent.children = &test_cfg; + test_cfg_parent.children_num = 1; + + pcie_config.use_sysfs = 1; + int ret = pcie_plugin_config(&test_cfg_parent); + EXPECT_EQ_INT(0, ret); + EXPECT_EQ_INT(0, pcie_config.use_sysfs); + + pcie_config.use_sysfs = 1; + sstrncpy(value_buff, "sysfs", sizeof(value_buff)); + ret = pcie_plugin_config(&test_cfg_parent); + EXPECT_EQ_INT(0, ret); + EXPECT_EQ_INT(1, pcie_config.use_sysfs); + + sstrncpy(key_buff, "AccessDir", sizeof(key_buff)); + sstrncpy(value_buff, "some/test/value", sizeof(value_buff)); + ret = pcie_plugin_config(&test_cfg_parent); + EXPECT_EQ_INT(0, ret); + EXPECT_EQ_STR("some/test/value", pcie_config.access_dir); + + memset(&test_cfg_value.value, 0, sizeof(test_cfg_value.value)); + test_cfg_value.value.boolean = 1; + test_cfg_value.type = OCONFIG_TYPE_BOOLEAN; + sstrncpy(key_buff, "ReportMasked", sizeof(key_buff)); + ret = pcie_plugin_config(&test_cfg_parent); + EXPECT_EQ_INT(0, ret); + EXPECT_EQ_INT(1, pcie_config.notif_masked); + + sstrncpy(key_buff, "PersistentNotifications", sizeof(key_buff)); + ret = pcie_plugin_config(&test_cfg_parent); + EXPECT_EQ_INT(0, ret); + EXPECT_EQ_INT(1, pcie_config.persistent); + + return 0; +} + +#define BAD_TLP_SET_MSG "Correctable Error set: Bad TLP Status" +#define BAD_TLP_CLEAR_MSG "Correctable Error cleared: Bad TLP Status" + +DEF_TEST(dispatch_correctable_errors) { + pcie_device_t dev = {0, TEST_DOMAIN, TEST_BUS, TEST_DEVICE, TEST_FUNCTION, + 0, 0, 0, 0, 0}; + pcie_config.notif_masked = 0; + pcie_config.persistent = 0; + + pcie_dispatch_correctable_errors(&dev, PCI_ERR_COR_BAD_TLP, + ~(PCI_ERR_COR_BAD_TLP)); + EXPECT_EQ_INT(NOTIF_WARNING, last_notif.severity); + EXPECT_EQ_STR(PCIE_ERRORS_PLUGIN, last_notif.plugin); + OK(NULL == last_notif.meta); + EXPECT_EQ_STR(TEST_DEVICE_STR, last_notif.plugin_instance); + EXPECT_EQ_STR(PCIE_ERROR, last_notif.type); + EXPECT_EQ_STR(PCIE_SEV_CE, last_notif.type_instance); + 
EXPECT_EQ_STR(BAD_TLP_SET_MSG, last_notif.message); + + memset(&last_notif, 0, sizeof(last_notif)); + dev.correctable_errors = PCI_ERR_COR_BAD_TLP; + pcie_dispatch_correctable_errors(&dev, PCI_ERR_COR_BAD_TLP, + ~(PCI_ERR_COR_BAD_TLP)); + EXPECT_EQ_STR("", last_notif.plugin_instance); + + pcie_config.persistent = 1; + pcie_dispatch_correctable_errors(&dev, PCI_ERR_COR_BAD_TLP, + ~(PCI_ERR_COR_BAD_TLP)); + EXPECT_EQ_INT(NOTIF_WARNING, last_notif.severity); + EXPECT_EQ_STR(PCIE_ERRORS_PLUGIN, last_notif.plugin); + OK(NULL == last_notif.meta); + EXPECT_EQ_STR(TEST_DEVICE_STR, last_notif.plugin_instance); + EXPECT_EQ_STR(PCIE_ERROR, last_notif.type); + EXPECT_EQ_STR(PCIE_SEV_CE, last_notif.type_instance); + EXPECT_EQ_STR(BAD_TLP_SET_MSG, last_notif.message); + + memset(&last_notif, 0, sizeof(last_notif)); + pcie_dispatch_correctable_errors(&dev, PCI_ERR_COR_BAD_TLP, + PCI_ERR_COR_BAD_TLP); + EXPECT_EQ_STR("", last_notif.plugin_instance); + + pcie_config.notif_masked = 1; + pcie_dispatch_correctable_errors(&dev, PCI_ERR_COR_BAD_TLP, + PCI_ERR_COR_BAD_TLP); + EXPECT_EQ_INT(NOTIF_WARNING, last_notif.severity); + EXPECT_EQ_STR(PCIE_ERRORS_PLUGIN, last_notif.plugin); + OK(NULL == last_notif.meta); + EXPECT_EQ_STR(TEST_DEVICE_STR, last_notif.plugin_instance); + EXPECT_EQ_STR(PCIE_ERROR, last_notif.type); + EXPECT_EQ_STR(PCIE_SEV_CE, last_notif.type_instance); + EXPECT_EQ_STR(BAD_TLP_SET_MSG, last_notif.message); + + pcie_config.persistent = 0; + memset(&last_notif, 0, sizeof(last_notif)); + pcie_dispatch_correctable_errors(&dev, PCI_ERR_COR_BAD_TLP, + PCI_ERR_COR_BAD_TLP); + EXPECT_EQ_STR("", last_notif.plugin_instance); + + dev.correctable_errors = 0; + pcie_dispatch_correctable_errors(&dev, PCI_ERR_COR_BAD_TLP, + PCI_ERR_COR_BAD_TLP); + EXPECT_EQ_INT(NOTIF_WARNING, last_notif.severity); + EXPECT_EQ_STR(PCIE_ERRORS_PLUGIN, last_notif.plugin); + OK(NULL == last_notif.meta); + EXPECT_EQ_STR(TEST_DEVICE_STR, last_notif.plugin_instance); + EXPECT_EQ_STR(PCIE_ERROR, 
last_notif.type); + EXPECT_EQ_STR(PCIE_SEV_CE, last_notif.type_instance); + EXPECT_EQ_STR(BAD_TLP_SET_MSG, last_notif.message); + + pcie_dispatch_correctable_errors(&dev, PCI_ERR_COR_BAD_TLP, + ~(PCI_ERR_COR_BAD_TLP)); + EXPECT_EQ_INT(NOTIF_WARNING, last_notif.severity); + EXPECT_EQ_STR(PCIE_ERRORS_PLUGIN, last_notif.plugin); + OK(NULL == last_notif.meta); + EXPECT_EQ_STR(TEST_DEVICE_STR, last_notif.plugin_instance); + EXPECT_EQ_STR(PCIE_ERROR, last_notif.type); + EXPECT_EQ_STR(PCIE_SEV_CE, last_notif.type_instance); + EXPECT_EQ_STR(BAD_TLP_SET_MSG, last_notif.message); + + pcie_config.notif_masked = 0; + dev.correctable_errors = PCI_ERR_COR_BAD_TLP; + pcie_dispatch_correctable_errors(&dev, 0, ~(PCI_ERR_COR_BAD_TLP)); + EXPECT_EQ_INT(NOTIF_OKAY, last_notif.severity); + EXPECT_EQ_STR(PCIE_ERRORS_PLUGIN, last_notif.plugin); + OK(NULL == last_notif.meta); + EXPECT_EQ_STR(TEST_DEVICE_STR, last_notif.plugin_instance); + EXPECT_EQ_STR(PCIE_ERROR, last_notif.type); + EXPECT_EQ_STR(PCIE_SEV_CE, last_notif.type_instance); + EXPECT_EQ_STR(BAD_TLP_CLEAR_MSG, last_notif.message); + + return 0; +} + +#define FCP_NF_SET_MSG \ + "Uncorrectable(non_fatal) Error set: Flow Control Protocol" +#define FCP_F_SET_MSG "Uncorrectable(fatal) Error set: Flow Control Protocol" +#define FCP_NF_CLEAR_MSG \ + "Uncorrectable(non_fatal) Error cleared: Flow Control Protocol" +#define FCP_F_CLEAR_MSG \ + "Uncorrectable(fatal) Error cleared: Flow Control Protocol" + +DEF_TEST(dispatch_uncorrectable_errors) { + pcie_device_t dev = {0, TEST_DOMAIN, TEST_BUS, TEST_DEVICE, TEST_FUNCTION, + 0, 0, 0, 0, 0}; + pcie_config.notif_masked = 0; + pcie_config.persistent = 0; + + pcie_dispatch_uncorrectable_errors(&dev, PCI_ERR_UNC_FCP, ~(PCI_ERR_UNC_FCP), + ~(PCI_ERR_UNC_FCP)); + EXPECT_EQ_INT(NOTIF_WARNING, last_notif.severity); + EXPECT_EQ_STR(PCIE_ERRORS_PLUGIN, last_notif.plugin); + OK(NULL == last_notif.meta); + EXPECT_EQ_STR(TEST_DEVICE_STR, last_notif.plugin_instance); + EXPECT_EQ_STR(PCIE_ERROR, 
last_notif.type); + EXPECT_EQ_STR(PCIE_SEV_NOFATAL, last_notif.type_instance); + EXPECT_EQ_STR(FCP_NF_SET_MSG, last_notif.message); + + pcie_dispatch_uncorrectable_errors(&dev, PCI_ERR_UNC_FCP, ~(PCI_ERR_UNC_FCP), + PCI_ERR_UNC_FCP); + EXPECT_EQ_INT(NOTIF_FAILURE, last_notif.severity); + EXPECT_EQ_STR(PCIE_ERRORS_PLUGIN, last_notif.plugin); + OK(NULL == last_notif.meta); + EXPECT_EQ_STR(TEST_DEVICE_STR, last_notif.plugin_instance); + EXPECT_EQ_STR(PCIE_ERROR, last_notif.type); + EXPECT_EQ_STR(PCIE_SEV_FATAL, last_notif.type_instance); + EXPECT_EQ_STR(FCP_F_SET_MSG, last_notif.message); + + memset(&last_notif, 0, sizeof(last_notif)); + dev.uncorrectable_errors = PCI_ERR_UNC_FCP; + pcie_dispatch_uncorrectable_errors(&dev, PCI_ERR_UNC_FCP, ~(PCI_ERR_UNC_FCP), + PCI_ERR_UNC_FCP); + EXPECT_EQ_STR("", last_notif.plugin_instance); + + pcie_config.persistent = 1; + pcie_dispatch_uncorrectable_errors(&dev, PCI_ERR_UNC_FCP, ~(PCI_ERR_UNC_FCP), + PCI_ERR_UNC_FCP); + EXPECT_EQ_INT(NOTIF_FAILURE, last_notif.severity); + EXPECT_EQ_STR(PCIE_ERRORS_PLUGIN, last_notif.plugin); + OK(NULL == last_notif.meta); + EXPECT_EQ_STR(TEST_DEVICE_STR, last_notif.plugin_instance); + EXPECT_EQ_STR(PCIE_ERROR, last_notif.type); + EXPECT_EQ_STR(PCIE_SEV_FATAL, last_notif.type_instance); + EXPECT_EQ_STR(FCP_F_SET_MSG, last_notif.message); + + memset(&last_notif, 0, sizeof(last_notif)); + pcie_dispatch_uncorrectable_errors(&dev, PCI_ERR_UNC_FCP, PCI_ERR_UNC_FCP, + PCI_ERR_UNC_FCP); + EXPECT_EQ_STR("", last_notif.plugin_instance); + + pcie_config.notif_masked = 1; + pcie_dispatch_uncorrectable_errors(&dev, PCI_ERR_UNC_FCP, PCI_ERR_UNC_FCP, + PCI_ERR_UNC_FCP); + EXPECT_EQ_INT(NOTIF_FAILURE, last_notif.severity); + EXPECT_EQ_STR(PCIE_ERRORS_PLUGIN, last_notif.plugin); + OK(NULL == last_notif.meta); + EXPECT_EQ_STR(TEST_DEVICE_STR, last_notif.plugin_instance); + EXPECT_EQ_STR(PCIE_ERROR, last_notif.type); + EXPECT_EQ_STR(PCIE_SEV_FATAL, last_notif.type_instance); + EXPECT_EQ_STR(FCP_F_SET_MSG, 
last_notif.message); + + pcie_config.persistent = 0; + dev.uncorrectable_errors = 0; + memset(&last_notif, 0, sizeof(last_notif)); + pcie_dispatch_uncorrectable_errors(&dev, PCI_ERR_UNC_FCP, ~(PCI_ERR_UNC_FCP), + PCI_ERR_UNC_FCP); + EXPECT_EQ_INT(NOTIF_FAILURE, last_notif.severity); + EXPECT_EQ_STR(PCIE_ERRORS_PLUGIN, last_notif.plugin); + OK(NULL == last_notif.meta); + EXPECT_EQ_STR(TEST_DEVICE_STR, last_notif.plugin_instance); + EXPECT_EQ_STR(PCIE_ERROR, last_notif.type); + EXPECT_EQ_STR(PCIE_SEV_FATAL, last_notif.type_instance); + EXPECT_EQ_STR(FCP_F_SET_MSG, last_notif.message); + + pcie_config.notif_masked = 0; + dev.uncorrectable_errors = PCI_ERR_UNC_FCP; + pcie_dispatch_uncorrectable_errors(&dev, 0, ~(PCI_ERR_UNC_FCP), + ~(PCI_ERR_UNC_FCP)); + EXPECT_EQ_INT(NOTIF_OKAY, last_notif.severity); + EXPECT_EQ_STR(PCIE_ERRORS_PLUGIN, last_notif.plugin); + OK(NULL == last_notif.meta); + EXPECT_EQ_STR(TEST_DEVICE_STR, last_notif.plugin_instance); + EXPECT_EQ_STR(PCIE_ERROR, last_notif.type); + EXPECT_EQ_STR(PCIE_SEV_NOFATAL, last_notif.type_instance); + EXPECT_EQ_STR(FCP_NF_CLEAR_MSG, last_notif.message); + + memset(&last_notif, 0, sizeof(last_notif)); + pcie_dispatch_uncorrectable_errors(&dev, 0, ~(PCI_ERR_UNC_FCP), + PCI_ERR_UNC_FCP); + EXPECT_EQ_INT(NOTIF_OKAY, last_notif.severity); + EXPECT_EQ_STR(PCIE_ERRORS_PLUGIN, last_notif.plugin); + OK(NULL == last_notif.meta); + EXPECT_EQ_STR(TEST_DEVICE_STR, last_notif.plugin_instance); + EXPECT_EQ_STR(PCIE_ERROR, last_notif.type); + EXPECT_EQ_STR(PCIE_SEV_FATAL, last_notif.type_instance); + EXPECT_EQ_STR(FCP_F_CLEAR_MSG, last_notif.message); + + return 0; +} + +#define UR_SET_MSG "Device Status Error set: Unsupported Request" +#define UR_CLEAR_MSG "Device Status Error cleared: Unsupported Request" +#define FE_SET_MSG "Device Status Error set: Fatal Error" +#define FE_CLEAR_MSG "Device Status Error cleared: Fatal Error" + +DEF_TEST(device_status_errors) { + pcie_device_t dev = {0, TEST_DOMAIN, TEST_BUS, TEST_DEVICE, 
TEST_FUNCTION, + 0, 0, 0, 0, 0}; + pcie_config.persistent = 0; + g_buff[0] = (PCI_EXP_DEVSTA_URD & 0xff); + + memset(&last_notif, 0, sizeof(last_notif)); + pcie_check_dev_status(&dev, 0); + EXPECT_EQ_INT(NOTIF_WARNING, last_notif.severity); + EXPECT_EQ_STR(PCIE_ERRORS_PLUGIN, last_notif.plugin); + OK(NULL == last_notif.meta); + EXPECT_EQ_STR(TEST_DEVICE_STR, last_notif.plugin_instance); + EXPECT_EQ_STR(PCIE_ERROR, last_notif.type); + EXPECT_EQ_STR(PCIE_SEV_NOFATAL, last_notif.type_instance); + EXPECT_EQ_STR(UR_SET_MSG, last_notif.message); + + memset(&last_notif, 0, sizeof(last_notif)); + pcie_check_dev_status(&dev, 0); + EXPECT_EQ_STR("", last_notif.plugin_instance); + + pcie_config.persistent = 1; + pcie_check_dev_status(&dev, 0); + EXPECT_EQ_INT(NOTIF_WARNING, last_notif.severity); + EXPECT_EQ_STR(PCIE_ERRORS_PLUGIN, last_notif.plugin); + OK(NULL == last_notif.meta); + EXPECT_EQ_STR(TEST_DEVICE_STR, last_notif.plugin_instance); + EXPECT_EQ_STR(PCIE_ERROR, last_notif.type); + EXPECT_EQ_STR(PCIE_SEV_NOFATAL, last_notif.type_instance); + EXPECT_EQ_STR(UR_SET_MSG, last_notif.message); + + g_buff[0] = 0; + pcie_check_dev_status(&dev, 0); + EXPECT_EQ_INT(NOTIF_OKAY, last_notif.severity); + EXPECT_EQ_STR(PCIE_ERRORS_PLUGIN, last_notif.plugin); + OK(NULL == last_notif.meta); + EXPECT_EQ_STR(TEST_DEVICE_STR, last_notif.plugin_instance); + EXPECT_EQ_STR(PCIE_ERROR, last_notif.type); + EXPECT_EQ_STR(PCIE_SEV_NOFATAL, last_notif.type_instance); + EXPECT_EQ_STR(UR_CLEAR_MSG, last_notif.message); + + pcie_config.persistent = 0; + dev.device_status = PCI_EXP_DEVSTA_URD; + pcie_check_dev_status(&dev, 0); + EXPECT_EQ_INT(NOTIF_OKAY, last_notif.severity); + EXPECT_EQ_STR(PCIE_ERRORS_PLUGIN, last_notif.plugin); + OK(NULL == last_notif.meta); + EXPECT_EQ_STR(TEST_DEVICE_STR, last_notif.plugin_instance); + EXPECT_EQ_STR(PCIE_ERROR, last_notif.type); + EXPECT_EQ_STR(PCIE_SEV_NOFATAL, last_notif.type_instance); + EXPECT_EQ_STR(UR_CLEAR_MSG, last_notif.message); + + memset(&last_notif, 
0, sizeof(last_notif)); + pcie_check_dev_status(&dev, 0); + EXPECT_EQ_STR("", last_notif.plugin_instance); + + g_buff[0] = (PCI_EXP_DEVSTA_FED & 0xff); + pcie_check_dev_status(&dev, 0); + EXPECT_EQ_INT(NOTIF_FAILURE, last_notif.severity); + EXPECT_EQ_STR(PCIE_ERRORS_PLUGIN, last_notif.plugin); + OK(NULL == last_notif.meta); + EXPECT_EQ_STR(TEST_DEVICE_STR, last_notif.plugin_instance); + EXPECT_EQ_STR(PCIE_ERROR, last_notif.type); + EXPECT_EQ_STR(PCIE_SEV_FATAL, last_notif.type_instance); + EXPECT_EQ_STR(FE_SET_MSG, last_notif.message); + + g_buff[0] = 0; + pcie_check_dev_status(&dev, 0); + EXPECT_EQ_INT(NOTIF_OKAY, last_notif.severity); + EXPECT_EQ_STR(PCIE_ERRORS_PLUGIN, last_notif.plugin); + OK(NULL == last_notif.meta); + EXPECT_EQ_STR(TEST_DEVICE_STR, last_notif.plugin_instance); + EXPECT_EQ_STR(PCIE_ERROR, last_notif.type); + EXPECT_EQ_STR(PCIE_SEV_FATAL, last_notif.type_instance); + EXPECT_EQ_STR(FE_CLEAR_MSG, last_notif.message); + + return 0; +} + +int main(void) { + RUN_TEST(clear_dev_list); + RUN_TEST(add_to_list); + RUN_TEST(pcie_read); + RUN_TEST(dispatch_notification); + + RUN_TEST(access_config); + RUN_TEST(plugin_config_fail); + RUN_TEST(plugin_config); + + RUN_TEST(dispatch_correctable_errors); + RUN_TEST(dispatch_uncorrectable_errors); + RUN_TEST(device_status_errors); + + END_TEST; +} diff --git a/src/testing.h b/src/testing.h index 5cf69559..fd7e6c66 100644 --- a/src/testing.h +++ b/src/testing.h @@ -100,6 +100,18 @@ static int check_count__; printf("ok %i - %s = %" PRIu64 "\n", ++check_count__, #actual, got__); \ } while (0) +#define EXPECT_EQ_PTR(expect, actual) \ + do { \ + void *want__ = expect; \ + void *got__ = actual; \ + if (got__ != want__) { \ + printf("not ok %i - %s = %p, want %p\n", ++check_count__, #actual, \ + got__, want__); \ + return -1; \ + } \ + printf("ok %i - %s = %p\n", ++check_count__, #actual, got__); \ + } while (0) + #define EXPECT_EQ_DOUBLE(expect, actual) \ do { \ double want__ = (double)expect; \