diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c index 8dd75e0bb4da..ecf20418e494 100644 --- a/cmd/zed/agents/zfs_mod.c +++ b/cmd/zed/agents/zfs_mod.c @@ -141,6 +141,17 @@ zfs_unavail_pool(zpool_handle_t *zhp, void *data) return (0); } +/* + * Write an array of strings to the zed log + */ +static void lines_to_zed_log_msg(char **lines, int lines_cnt) +{ + int i; + for (i = 0; i < lines_cnt; i++) { + zed_log_msg(LOG_INFO, "%s", lines[i]); + } +} + /* * Two stage replace on Linux * since we get disk notifications @@ -195,6 +206,8 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) boolean_t is_mpath_wholedisk = B_FALSE; uint_t c; vdev_stat_t *vs; + char **lines = NULL; + int lines_cnt = 0; if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0) return; @@ -209,8 +222,12 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) } (void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_PHYS_PATH, &physpath); + + update_vdev_config_dev_sysfs_path(vdev, path, + ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); (void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, &enc_sysfs_path); + (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_OFFLINE, &offline); (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_FAULTED, &faulted); @@ -377,6 +394,22 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) if (is_mpath_wholedisk) { /* Don't label device mapper or multipath disks. */ + zed_log_msg(LOG_INFO, + " it's a multipath wholedisk, don't label"); + if (zpool_prepare_disk(zhp, vdev, "autoreplace", &lines, + &lines_cnt) != 0) { + zed_log_msg(LOG_INFO, + " zpool_prepare_disk: could not " + "prepare '%s' (%s)", fullpath, + libzfs_error_description(g_zfshdl)); + if (lines_cnt > 0) { + zed_log_msg(LOG_INFO, + " zfs_prepare_disk output:"); + lines_to_zed_log_msg(lines, lines_cnt); + } + libzfs_free_str_array(lines, lines_cnt); + return; + } } else if (!labeled) { /* * we're auto-replacing a raw disk, so label it first @@ -399,10 +432,18 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) * If this is a request to label a whole disk, then attempt to * write out the label. */ - if (zpool_label_disk(g_zfshdl, zhp, leafname) != 0) { - zed_log_msg(LOG_INFO, " zpool_label_disk: could not " + if (zpool_prepare_and_label_disk(g_zfshdl, zhp, leafname, + vdev, "autoreplace", &lines, &lines_cnt) != 0) { + zed_log_msg(LOG_INFO, + " zpool_prepare_and_label_disk: could not " "label '%s' (%s)", leafname, libzfs_error_description(g_zfshdl)); + if (lines_cnt > 0) { + zed_log_msg(LOG_INFO, + " zfs_prepare_disk output:"); + lines_to_zed_log_msg(lines, lines_cnt); + } + libzfs_free_str_array(lines, lines_cnt); (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT, &newstate); @@ -457,6 +498,8 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) DEV_BYID_PATH, new_devid); } + libzfs_free_str_array(lines, lines_cnt); + /* * Construct the root vdev to pass to zpool_vdev_attach(). While adding * the entire vdev structure is harmless, we construct a reduced set of diff --git a/cmd/zed/zed.d/statechange-slot_off.sh b/cmd/zed/zed.d/statechange-slot_off.sh index 150012abe71a..06acce93b8aa 100755 --- a/cmd/zed/zed.d/statechange-slot_off.sh +++ b/cmd/zed/zed.d/statechange-slot_off.sh @@ -5,7 +5,7 @@ # # Bad SCSI disks can often "disappear and reappear" causing all sorts of chaos # as they flip between FAULTED and ONLINE. 
If -# ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT is set in zed.rc, and the disk gets +# ZED_POWER_OFF_ENCLOSURE_SLOT_ON_FAULT is set in zed.rc, and the disk gets # FAULTED, then power down the slot via sysfs: # # /sys/class/enclosure///power_status @@ -19,7 +19,7 @@ # Exit codes: # 0: slot successfully powered off # 1: enclosure not available -# 2: ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT disabled +# 2: ZED_POWER_OFF_ENCLOSURE_SLOT_ON_FAULT disabled # 3: vdev was not FAULTED # 4: The enclosure sysfs path passed from ZFS does not exist # 5: Enclosure slot didn't actually turn off after we told it to @@ -32,7 +32,7 @@ if [ ! -d /sys/class/enclosure ] ; then exit 1 fi -if [ "${ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT}" != "1" ] ; then +if [ "${ZED_POWER_OFF_ENCLOSURE_SLOT_ON_FAULT}" != "1" ] ; then exit 2 fi diff --git a/cmd/zed/zed.d/zed.rc b/cmd/zed/zed.d/zed.rc index 1dfd43454a41..91ceea79e5c5 100644 --- a/cmd/zed/zed.d/zed.rc +++ b/cmd/zed/zed.d/zed.rc @@ -147,4 +147,4 @@ ZED_SYSLOG_SUBCLASS_EXCLUDE="history_event" # Power off the drive's slot in the enclosure if it becomes FAULTED. This can # help silence misbehaving drives. This assumes your drive enclosure fully # supports slot power control via sysfs. -#ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT=1 +#ZED_POWER_OFF_ENCLOSURE_SLOT_ON_FAULT=1 diff --git a/cmd/zed/zed_event.c b/cmd/zed/zed_event.c index 9eaad0e92fbb..e384b16dc8f0 100644 --- a/cmd/zed/zed_event.c +++ b/cmd/zed/zed_event.c @@ -35,6 +35,7 @@ #include "zed_strings.h" #include "agents/zfs_agents.h" +#include #define MAXBUF 4096 @@ -907,6 +908,25 @@ _zed_event_add_time_strings(uint64_t eid, zed_strings_t *zsp, int64_t etime[]) } } + +static void +_zed_event_update_enc_sysfs_path(nvlist_t *nvl) +{ + char *vdev_path; + + if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, + &vdev_path) != 0) { + return; /* some other kind of event, ignore it */ + } + + if (vdev_path == NULL) { + return; + } + + update_vdev_config_dev_sysfs_path(nvl, vdev_path, + FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH); +} + /* * Service the next zevent, blocking until one is available. */ @@ -954,6 +974,17 @@ zed_event_service(struct zed_conf *zcp) zed_log_msg(LOG_WARNING, "Failed to lookup zevent class (eid=%llu)", eid); } else { + /* + * Special case: If we can dynamically detect an enclosure sysfs + * path, then use that value rather than the one stored in the + * vd->vdev_enc_sysfs_path. There have been rare cases where + * vd->vdev_enc_sysfs_path becomes outdated. However, there + * will be other times when we can not dynamically detect the + * sysfs path (like if a disk disappears) and have to rely on + * the old value for things like turning on the fault LED. 
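+		 * For example (illustrative): for a statechange event whose
+		 * payload carries vdev_path=/dev/sda, the helper below ends
+		 * up calling update_vdev_config_dev_sysfs_path(nvl,
+		 * "/dev/sda", FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH) to
+		 * refresh the enclosure sysfs path in the event payload.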
+ */ + _zed_event_update_enc_sysfs_path(nvl); + /* let internal modules see this event first */ zfs_agent_post_event(class, NULL, nvl); diff --git a/cmd/zpool/os/freebsd/zpool_vdev_os.c b/cmd/zpool/os/freebsd/zpool_vdev_os.c index aa66d29fa604..08f382fb527b 100644 --- a/cmd/zpool/os/freebsd/zpool_vdev_os.c +++ b/cmd/zpool/os/freebsd/zpool_vdev_os.c @@ -116,3 +116,24 @@ after_zpool_upgrade(zpool_handle_t *zhp) "details.\n"), zpool_get_name(zhp)); } } + +int +zpool_power_current_state(zpool_handle_t *zhp, char *vdev) +{ + + (void) zhp; + (void) vdev; + /* Enclosure slot power not supported on FreeBSD yet */ + return (-1); +} + +int +zpool_power(zpool_handle_t *zhp, char *vdev, boolean_t turn_on) +{ + + (void) zhp; + (void) vdev; + (void) turn_on; + /* Enclosure slot power not supported on FreeBSD yet */ + return (ENOTSUP); +} diff --git a/cmd/zpool/os/linux/zpool_vdev_os.c b/cmd/zpool/os/linux/zpool_vdev_os.c index da87aa79f365..cfaeef56a20e 100644 --- a/cmd/zpool/os/linux/zpool_vdev_os.c +++ b/cmd/zpool/os/linux/zpool_vdev_os.c @@ -410,3 +410,258 @@ void after_zpool_upgrade(zpool_handle_t *zhp) { } + +/* + * Read from a sysfs file and return an allocated string. Removes + * the newline from the end of the string if there is one. + * + * Returns a string on success (which must be freed), or NULL on error. + */ +static char *zpool_sysfs_gets(char *path) +{ + int fd; + struct stat statbuf; + char *buf = NULL; + ssize_t count = 0; + fd = open(path, O_RDONLY); + if (fd < 0) + return (NULL); + + if (fstat(fd, &statbuf) != 0) { + close(fd); + return (NULL); + } + + buf = calloc(sizeof (*buf), statbuf.st_size + 1); + if (buf == NULL) { + close(fd); + return (NULL); + } + + /* + * Note, we can read less bytes than st_size, and that's ok. Sysfs + * files will report their size is 4k even if they only return a small + * string. + */ + count = read(fd, buf, statbuf.st_size); + if (count < 0) { + /* Error doing read() or we overran the buffer */ + close(fd); + free(buf); + return (NULL); + } + + /* Remove trailing newline */ + if (buf[count - 1] == '\n') + buf[count - 1] = 0; + + close(fd); + + return (buf); +} + +/* + * Write a string to a sysfs file. + * + * Returns 0 on success, non-zero otherwise. + */ +static int zpool_sysfs_puts(char *path, char *str) +{ + FILE *file; + + file = fopen(path, "w"); + if (!file) { + return (-1); + } + + if (fputs(str, file) < 0) { + fclose(file); + return (-2); + } + fclose(file); + return (0); +} + +/* Given a vdev nvlist_t, rescan its enclosure sysfs path */ +static void +rescan_vdev_config_dev_sysfs_path(nvlist_t *vdev_nv) +{ + update_vdev_config_dev_sysfs_path(vdev_nv, + fnvlist_lookup_string(vdev_nv, ZPOOL_CONFIG_PATH), + ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); +} + +/* + * Given a power string: "on", "off", "1", or "0", return 0 if it's an + * off value, 1 if it's an on value, and -1 if the value is unrecognized. + */ +static int zpool_power_parse_value(char *str) +{ + if ((strcmp(str, "off") == 0) || (strcmp(str, "0") == 0)) + return (0); + + if ((strcmp(str, "on") == 0) || (strcmp(str, "1") == 0)) + return (1); + + return (-1); +} + +/* + * Given a vdev string return an allocated string containing the sysfs path to + * its power control file. Also do a check if the power control file really + * exists and has correct permissions. + * + * Example returned strings: + * + * /sys/class/enclosure/0:0:122:0/10/power_status + * /sys/bus/pci/slots/10/power + * + * Returns allocated string on success (which must be freed), NULL on failure. 
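+ *
+ * For example, the probe order implemented below is (an illustrative
+ * summary, not new behavior):
+ *
+ *	1. <enc_sysfs_dir>/power_status	(SES enclosure slot, needs W_OK)
+ *	2. <enc_sysfs_dir>/power	(PCIe/NVMe slot, needs R_OK|W_OK)
+ *	3. otherwise return NULL	(no slot power control available)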
+ */ +static char * +zpool_power_sysfs_path(zpool_handle_t *zhp, char *vdev) +{ + char *enc_sysfs_dir = NULL; + char *path = NULL; + nvlist_t *vdev_nv = zpool_find_vdev(zhp, vdev, NULL, NULL, NULL); + + if (vdev_nv == NULL) { + return (NULL); + } + + /* Make sure we're getting the updated enclosure sysfs path */ + rescan_vdev_config_dev_sysfs_path(vdev_nv); + + if (nvlist_lookup_string(vdev_nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, + &enc_sysfs_dir) != 0) { + return (NULL); + } + + if (asprintf(&path, "%s/power_status", enc_sysfs_dir) == -1) + return (NULL); + + if (access(path, W_OK) != 0) { + free(path); + path = NULL; + /* No HDD 'power_control' file, maybe it's NVMe? */ + if (asprintf(&path, "%s/power", enc_sysfs_dir) == -1) { + return (NULL); + } + + if (access(path, R_OK | W_OK) != 0) { + /* Not NVMe either */ + free(path); + return (NULL); + } + } + + return (path); +} + +/* + * Given a path to a sysfs power control file, return B_TRUE if you should use + * "on/off" words to control it, or B_FALSE otherwise ("0/1" to control). + */ +static boolean_t +zpool_power_use_word(char *sysfs_path) +{ + if (strcmp(&sysfs_path[strlen(sysfs_path) - strlen("power_status")], + "power_status") == 0) { + return (B_TRUE); + } + return (B_FALSE); +} + +/* + * Check the sysfs power control value for a vdev. + * + * Returns: + * 0 - Power is off + * 1 - Power is on + * -1 - Error or unsupported + */ +int +zpool_power_current_state(zpool_handle_t *zhp, char *vdev) +{ + char *val; + int rc; + + char *path = zpool_power_sysfs_path(zhp, vdev); + if (path == NULL) + return (-1); + + val = zpool_sysfs_gets(path); + if (val == NULL) { + free(path); + return (-1); + } + + rc = zpool_power_parse_value(val); + free(val); + free(path); + return (rc); +} + +/* + * Turn on or off the slot to a device + * + * Device path is the full path to the device (like /dev/sda or /dev/sda1). + * + * Return code: + * 0: Success + * ENOTSUP: Power control not supported for OS + * EBADSLT: Couldn't read current power state + * ENOENT: No sysfs path to power control + * EIO: Couldn't write sysfs power value + * EBADE: Sysfs power value didn't change + */ +int +zpool_power(zpool_handle_t *zhp, char *vdev, boolean_t turn_on) +{ + char *sysfs_path; + const char *val; + int rc; + int timeout_ms; + + rc = zpool_power_current_state(zhp, vdev); + if (rc == -1) { + return (EBADSLT); + } + + /* Already correct value? */ + if (rc == (int)turn_on) + return (0); + + sysfs_path = zpool_power_sysfs_path(zhp, vdev); + if (sysfs_path == NULL) + return (ENOENT); + + if (zpool_power_use_word(sysfs_path)) { + val = turn_on ? "on" : "off"; + } else { + val = turn_on ? "1" : "0"; + } + + rc = zpool_sysfs_puts(sysfs_path, (char *)val); + + free(sysfs_path); + if (rc != 0) { + return (EIO); + } + + /* + * Wait up to 30 seconds for sysfs power value to change after + * writing it. 
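+	 *
+	 * For example (illustrative), a caller could shrink the poll
+	 * window before powering a slot on:
+	 *
+	 *	setenv("ZPOOL_POWER_ON_SLOT_TIMEOUT_MS", "5000", 1);
+	 *	if (zpool_power(zhp, "sda", B_TRUE) == 0)
+	 *		assert(zpool_power_current_state(zhp, "sda") == 1);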
+ */ + timeout_ms = zpool_getenv_int("ZPOOL_POWER_ON_SLOT_TIMEOUT_MS", 30000); + for (int i = 0; i < MAX(1, timeout_ms / 200); i++) { + rc = zpool_power_current_state(zhp, vdev); + if (rc == (int)turn_on) + return (0); /* success */ + + fsleep(0.200); /* 200ms */ + } + + /* sysfs value never changed */ + return (EBADE); +} diff --git a/cmd/zpool/zpool.d/ses b/cmd/zpool/zpool.d/ses index b51fe31894ab..8be3eb57acf2 100755 --- a/cmd/zpool/zpool.d/ses +++ b/cmd/zpool/zpool.d/ses @@ -32,10 +32,18 @@ for i in $scripts ; do val="" case $i in enc) - val=$(ls "$VDEV_ENC_SYSFS_PATH/../../" 2>/dev/null) + if echo "$VDEV_ENC_SYSFS_PATH" | grep -q '/sys/bus/pci/slots' ; then + val="$VDEV_ENC_SYSFS_PATH" + else + val="$(ls """$VDEV_ENC_SYSFS_PATH/../../""" 2>/dev/null)" + fi ;; slot) - val=$(cat "$VDEV_ENC_SYSFS_PATH/slot" 2>/dev/null) + if echo "$VDEV_ENC_SYSFS_PATH" | grep -q '/sys/bus/pci/slots' ; then + val="$(basename """$VDEV_ENC_SYSFS_PATH""")" + else + val="$(cat """$VDEV_ENC_SYSFS_PATH/slot""" 2>/dev/null)" + fi ;; encdev) val=$(ls "$VDEV_ENC_SYSFS_PATH/../device/scsi_generic" 2>/dev/null) diff --git a/cmd/zpool/zpool_iter.c b/cmd/zpool/zpool_iter.c index abfa2b7f6b90..82250f692700 100644 --- a/cmd/zpool/zpool_iter.c +++ b/cmd/zpool/zpool_iter.c @@ -439,39 +439,23 @@ static void vdev_run_cmd(vdev_cmd_data_t *data, char *cmd) { int rc; - char *argv[2] = {cmd, 0}; - char *env[5] = {"PATH=/bin:/sbin:/usr/bin:/usr/sbin", NULL, NULL, NULL, - NULL}; + char *argv[2] = {cmd}; + char **env; char **lines = NULL; int lines_cnt = 0; int i; - /* Setup our custom environment variables */ - rc = asprintf(&env[1], "VDEV_PATH=%s", - data->path ? data->path : ""); - if (rc == -1) { - env[1] = NULL; + env = zpool_vdev_script_alloc_env(data->pool, data->path, data->upath, + data->vdev_enc_sysfs_path, NULL, NULL); + if (env == NULL) goto out; - } - - rc = asprintf(&env[2], "VDEV_UPATH=%s", - data->upath ? data->upath : ""); - if (rc == -1) { - env[2] = NULL; - goto out; - } - - rc = asprintf(&env[3], "VDEV_ENC_SYSFS_PATH=%s", - data->vdev_enc_sysfs_path ? - data->vdev_enc_sysfs_path : ""); - if (rc == -1) { - env[3] = NULL; - goto out; - } /* Run the command */ rc = libzfs_run_process_get_stdout_nopath(cmd, argv, env, &lines, &lines_cnt); + + zpool_vdev_script_free_env(env); + if (rc != 0) goto out; @@ -483,10 +467,6 @@ vdev_run_cmd(vdev_cmd_data_t *data, char *cmd) out: if (lines != NULL) libzfs_free_str_array(lines, lines_cnt); - - /* Start with i = 1 since env[0] was statically allocated */ - for (i = 1; i < ARRAY_SIZE(env); i++) - free(env[i]); } /* @@ -571,6 +551,10 @@ for_each_vdev_run_cb(void *zhp_data, nvlist_t *nv, void *cb_vcdl) if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0) return (1); + /* Make sure we're getting the updated enclosure sysfs path */ + update_vdev_config_dev_sysfs_path(nv, path, + ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); + nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, &vdev_enc_sysfs_path); diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index a06af9aeceb4..c69bfbdaa07a 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -345,7 +345,7 @@ get_usage(zpool_help_t idx) return (gettext("\tattach [-fsw] [-o property=value] " " \n")); case HELP_CLEAR: - return (gettext("\tclear [-nF] [device]\n")); + return (gettext("\tclear [[--power]|[-nF]] [device]\n")); case HELP_CREATE: return (gettext("\tcreate [-fnd] [-o property=value] ... \n" "\t [-O file-system-property=value] ... \n" @@ -381,9 +381,11 @@ get_usage(zpool_help_t idx) "[-T d|u] [pool] ... 
\n" "\t [interval [count]]\n")); case HELP_OFFLINE: - return (gettext("\toffline [-f] [-t] ...\n")); + return (gettext("\toffline [--power]|[[-f][-t]] " + " ...\n")); case HELP_ONLINE: - return (gettext("\tonline [-e] ...\n")); + return (gettext("\tonline [--power][-e] " + "...\n")); case HELP_REPLACE: return (gettext("\treplace [-fsw] [-o property=value] " " [new-device]\n")); @@ -402,7 +404,7 @@ get_usage(zpool_help_t idx) return (gettext("\ttrim [-dw] [-r ] [-c | -s] " "[ ...]\n")); case HELP_STATUS: - return (gettext("\tstatus [-c [script1,script2,...]] " + return (gettext("\tstatus [--power] [-c [script1,script2,...]] " "[-igLpPstvxD] [-T d|u] [pool] ... \n" "\t [interval [count]]\n")); case HELP_UPGRADE: @@ -485,6 +487,77 @@ print_prop_cb(int prop, void *cb) return (ZPROP_CONT); } +/* + * Given a leaf vdev name like 'L5' return its VDEV_CONFIG_PATH like + * '/dev/disk/by-vdev/L5'. + */ +static const char * +vdev_name_to_path(zpool_handle_t *zhp, char *vdev) +{ + nvlist_t *vdev_nv = zpool_find_vdev(zhp, vdev, NULL, NULL, NULL); + if (vdev_nv == NULL) { + return (NULL); + } + return (fnvlist_lookup_string(vdev_nv, ZPOOL_CONFIG_PATH)); +} + +static int +zpool_power_on(zpool_handle_t *zhp, char *vdev) +{ + return (zpool_power(zhp, vdev, B_TRUE)); +} + +static int +zpool_power_on_and_disk_wait(zpool_handle_t *zhp, char *vdev) +{ + int rc; + + rc = zpool_power_on(zhp, vdev); + if (rc != 0) + return (rc); + + zpool_disk_wait(vdev_name_to_path(zhp, vdev)); + + return (0); +} + +static int +zpool_power_on_pool_and_wait_for_devices(zpool_handle_t *zhp) +{ + nvlist_t *nv; + const char *path = NULL; + int rc; + + /* Power up all the devices first */ + FOR_EACH_REAL_LEAF_VDEV(zhp, nv) { + path = fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH); + if (path != NULL) { + rc = zpool_power_on(zhp, (char *)path); + if (rc != 0) { + return (rc); + } + } + } + + /* + * Wait for their devices to show up. Since we powered them on + * at roughly the same time, they should all come online around + * the same time. + */ + FOR_EACH_REAL_LEAF_VDEV(zhp, nv) { + path = fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH); + zpool_disk_wait(path); + } + + return (0); +} + +static int +zpool_power_off(zpool_handle_t *zhp, char *vdev) +{ + return (zpool_power(zhp, vdev, B_FALSE)); +} + /* * Display usage message. If we're inside a command, display only the usage for * that command. Otherwise, iterate over the entire command table and display @@ -2054,11 +2127,13 @@ typedef struct status_cbdata { boolean_t cb_explain; boolean_t cb_first; boolean_t cb_dedup_stats; + boolean_t cb_print_unhealthy; boolean_t cb_print_status; boolean_t cb_print_slow_ios; boolean_t cb_print_vdev_init; boolean_t cb_print_vdev_trim; vdev_cmd_data_list_t *vcdl; + boolean_t cb_print_power; } status_cbdata_t; /* Return 1 if string is NULL, empty, or whitespace; return 0 otherwise. */ @@ -2251,6 +2326,35 @@ health_str_to_color(const char *health) return (NULL); } +/* + * Called for each leaf vdev. Returns 0 if the vdev is healthy. + * A vdev is unhealthy if any of the following are true: + * 1) there are read, write, or checksum errors, + * 2) its state is not ONLINE, or + * 3) slow IO reporting was requested (-s) and there are slow IOs. 
+ */ +static int +vdev_health_check_cb(void *hdl_data, nvlist_t *nv, void *data) +{ + status_cbdata_t *cb = data; + vdev_stat_t *vs; + uint_t vsc; + (void) hdl_data; + + if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &vsc) != 0) + return (1); + + if (vs->vs_checksum_errors || vs->vs_read_errors || + vs->vs_write_errors || vs->vs_state != VDEV_STATE_HEALTHY) + return (1); + + if (cb->cb_print_slow_ios && vs->vs_slow_ios) + return (1); + + return (0); +} + /* * Print out configuration state as requested by status_callback. */ @@ -2269,7 +2373,8 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, const char *state; char *type; char *path = NULL; - char *rcolor = NULL, *wcolor = NULL, *ccolor = NULL; + char *rcolor = NULL, *wcolor = NULL, *ccolor = NULL, + *scolor = NULL; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) @@ -2296,6 +2401,15 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, state = gettext("AVAIL"); } + /* + * If '-e' is specified then top-level vdevs and their children + * can be pruned if all of their leaves are healthy. + */ + if (cb->cb_print_unhealthy && depth > 0 && + for_each_vdev_in_nvlist(nv, vdev_health_check_cb, cb) == 0) { + return; + } + printf_color(health_str_to_color(state), "\t%*s%-*s %-8s", depth, "", cb->cb_namewidth - depth, name, state); @@ -2310,6 +2424,9 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, if (vs->vs_checksum_errors) ccolor = ANSI_RED; + if (vs->vs_slow_ios) + scolor = ANSI_BLUE; + if (cb->cb_literal) { printf(" "); printf_color(rcolor, "%5llu", @@ -2342,9 +2459,30 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, } if (cb->cb_literal) - printf(" %5llu", (u_longlong_t)vs->vs_slow_ios); + printf_color(scolor, " %5llu", + (u_longlong_t)vs->vs_slow_ios); else - printf(" %5s", rbuf); + printf_color(scolor, " %5s", rbuf); + } + if (cb->cb_print_power) { + if (children == 0) { + /* Only leaf vdevs have physical slots */ + switch (zpool_power_current_state(zhp, (char *) + fnvlist_lookup_string(nv, + ZPOOL_CONFIG_PATH))) { + case 0: + printf_color(ANSI_RED, " %5s", + gettext("off")); + break; + case 1: + printf(" %5s", gettext("on")); + break; + default: + printf(" %5s", "-"); + } + } else { + printf(" %5s", "-"); + } } } @@ -5385,19 +5523,6 @@ get_interval_count_filter_guids(int *argc, char **argv, float *interval, interval, count); } -/* - * Floating point sleep(). Allows you to pass in a floating point value for - * seconds. - */ -static void -fsleep(float sec) -{ - struct timespec req; - req.tv_sec = floor(sec); - req.tv_nsec = (sec - (float)req.tv_sec) * NANOSEC; - nanosleep(&req, NULL); -} - /* * Terminal height, in rows. Returns -1 if stdout is not connected to a TTY or * if we were unable to determine its size. @@ -6886,10 +7011,12 @@ zpool_do_split(int argc, char **argv) return (ret); } - +#define POWER_OPT 1024 /* - * zpool online ... + * zpool online [--power] ... 
+ * + * --power: Power on the enclosure slot to the drive (if possible) */ int zpool_do_online(int argc, char **argv) @@ -6900,13 +7027,21 @@ zpool_do_online(int argc, char **argv) int ret = 0; vdev_state_t newstate; int flags = 0; + boolean_t is_power_on = B_FALSE; + struct option long_options[] = { + {"power", no_argument, NULL, POWER_OPT}, + {0, 0, 0, 0} + }; /* check options */ - while ((c = getopt(argc, argv, "e")) != -1) { + while ((c = getopt_long(argc, argv, "e", long_options, NULL)) != -1) { switch (c) { case 'e': flags |= ZFS_ONLINE_EXPAND; break; + case POWER_OPT: + is_power_on = B_TRUE; + break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -6914,6 +7049,9 @@ zpool_do_online(int argc, char **argv) } } + if (libzfs_envvar_is_set("ZPOOL_AUTO_POWER_ON_SLOT")) + is_power_on = B_TRUE; + argc -= optind; argv += optind; @@ -6935,6 +7073,18 @@ zpool_do_online(int argc, char **argv) for (i = 1; i < argc; i++) { vdev_state_t oldstate; boolean_t avail_spare, l2cache; + int rc; + + if (is_power_on) { + rc = zpool_power_on_and_disk_wait(zhp, argv[i]); + if (rc == ENOTSUP) { + (void) fprintf(stderr, + gettext("Power control not supported\n")); + } + if (rc != 0) + return (rc); + } + nvlist_t *tgt = zpool_find_vdev(zhp, argv[i], &avail_spare, &l2cache, NULL); if (tgt == NULL) { @@ -6980,12 +7130,15 @@ zpool_do_online(int argc, char **argv) } /* - * zpool offline [-ft] ... + * zpool offline [-ft]|[--power] ... + * * * -f Force the device into a faulted state. * * -t Only take the device off-line temporarily. The offline/faulted * state will not be persistent across reboots. + * + * --power Power off the enclosure slot to the drive (if possible) */ /* ARGSUSED */ int @@ -6997,9 +7150,15 @@ zpool_do_offline(int argc, char **argv) int ret = 0; boolean_t istmp = B_FALSE; boolean_t fault = B_FALSE; + boolean_t is_power_off = B_FALSE; + + struct option long_options[] = { + {"power", no_argument, NULL, POWER_OPT}, + {0, 0, 0, 0} + }; /* check options */ - while ((c = getopt(argc, argv, "ft")) != -1) { + while ((c = getopt_long(argc, argv, "ft", long_options, NULL)) != -1) { switch (c) { case 'f': fault = B_TRUE; @@ -7007,6 +7166,9 @@ zpool_do_offline(int argc, char **argv) case 't': istmp = B_TRUE; break; + case POWER_OPT: + is_power_off = B_TRUE; + break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -7014,6 +7176,20 @@ zpool_do_offline(int argc, char **argv) } } + if (is_power_off && fault) { + (void) fprintf(stderr, + gettext("-0 and -f cannot be used together\n")); + usage(B_FALSE); + return (1); + } + + if (is_power_off && istmp) { + (void) fprintf(stderr, + gettext("-0 and -t cannot be used together\n")); + usage(B_FALSE); + return (1); + } + argc -= optind; argv += optind; @@ -7033,8 +7209,22 @@ zpool_do_offline(int argc, char **argv) return (1); for (i = 1; i < argc; i++) { - if (fault) { - uint64_t guid = zpool_vdev_path_to_guid(zhp, argv[i]); + uint64_t guid = zpool_vdev_path_to_guid(zhp, argv[i]); + if (is_power_off) { + /* + * Note: we have to power off first, then set REMOVED, + * or else zpool_vdev_set_removed_state() returns + * EAGAIN. 
+ */ + ret = zpool_power_off(zhp, argv[i]); + if (ret != 0) { + (void) fprintf(stderr, "%s %s %d\n", + gettext("unable to power off slot for"), + argv[i], ret); + } + zpool_vdev_set_removed_state(zhp, guid, VDEV_AUX_NONE); + + } else if (fault) { vdev_aux_t aux; if (istmp == B_FALSE) { /* Force the fault to persist across imports */ @@ -7057,7 +7247,7 @@ zpool_do_offline(int argc, char **argv) } /* - * zpool clear [device] + * zpool clear [-nF]|[--power] [device] * * Clear all errors associated with a pool or a particular device. */ @@ -7069,13 +7259,20 @@ zpool_do_clear(int argc, char **argv) boolean_t dryrun = B_FALSE; boolean_t do_rewind = B_FALSE; boolean_t xtreme_rewind = B_FALSE; + boolean_t is_power_on = B_FALSE; uint32_t rewind_policy = ZPOOL_NO_REWIND; nvlist_t *policy = NULL; zpool_handle_t *zhp; char *pool, *device; + struct option long_options[] = { + {"power", no_argument, NULL, POWER_OPT}, + {0, 0, 0, 0} + }; + /* check options */ - while ((c = getopt(argc, argv, "FnX")) != -1) { + while ((c = getopt_long(argc, argv, "FnX", long_options, + NULL)) != -1) { switch (c) { case 'F': do_rewind = B_TRUE; @@ -7086,6 +7283,9 @@ zpool_do_clear(int argc, char **argv) case 'X': xtreme_rewind = B_TRUE; break; + case POWER_OPT: + is_power_on = B_TRUE; + break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -7093,6 +7293,9 @@ zpool_do_clear(int argc, char **argv) } } + if (libzfs_envvar_is_set("ZPOOL_AUTO_POWER_ON_SLOT")) + is_power_on = B_TRUE; + argc -= optind; argv += optind; @@ -7133,6 +7336,14 @@ zpool_do_clear(int argc, char **argv) return (1); } + if (is_power_on) { + if (device == NULL) { + zpool_power_on_pool_and_wait_for_devices(zhp); + } else { + zpool_power_on_and_disk_wait(zhp, device); + } + } + if (zpool_clear(zhp, device, policy) != 0) ret = 1; @@ -8641,6 +8852,10 @@ status_callback(zpool_handle_t *zhp, void *data) printf_color(ANSI_BOLD, " %5s", gettext("SLOW")); } + if (cbp->cb_print_power) { + printf_color(ANSI_BOLD, " %5s", gettext("POWER")); + } + if (cbp->vcdl != NULL) print_cmd_columns(cbp->vcdl, 0); @@ -8688,11 +8903,13 @@ status_callback(zpool_handle_t *zhp, void *data) if (nerr == 0) (void) printf(gettext("errors: No known data " "errors\n")); - else if (!cbp->cb_verbose) + else if (!cbp->cb_verbose) { + color_start(ANSI_RED); (void) printf(gettext("errors: %llu data " "errors, use '-v' for a list\n"), (u_longlong_t)nerr); - else + color_end(); + } else print_error_log(zhp); } @@ -8707,10 +8924,11 @@ status_callback(zpool_handle_t *zhp, void *data) } /* - * zpool status [-c [script1,script2,...]] [-igLpPstvx] [-T d|u] [pool] ... - * [interval [count]] + * zpool status [-c [script1,script2,...]] [-igLpPstvx] [--power] [-T d|u] ... + * [pool] [interval [count]] * * -c CMD For each vdev, run command CMD + * -e Display only unhealthy vdevs * -i Display vdev initialization status. * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. @@ -8722,6 +8940,7 @@ status_callback(zpool_handle_t *zhp, void *data) * -D Display dedup status (undocumented) * -t Display vdev TRIM status. * -T Display a timestamp in date(1) or Unix format + * --power Display vdev enclosure slot power status * * Describes the health status of all pools or some subset. 
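+ *
+ * For example (illustrative): `zpool status -e tank` prints only vdevs that
+ * have errors or a non-ONLINE state (plus slow-IO offenders when -s is also
+ * given), and `zpool status --power tank` adds a POWER column that reads
+ * on, off, or - for each leaf vdev.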
*/ @@ -8735,8 +8954,14 @@ zpool_do_status(int argc, char **argv) status_cbdata_t cb = { 0 }; char *cmd = NULL; + struct option long_options[] = { + {"power", no_argument, NULL, POWER_OPT}, + {0, 0, 0, 0} + }; + /* check options */ - while ((c = getopt(argc, argv, "c:igLpPsvxDtT:")) != -1) { + while ((c = getopt_long(argc, argv, "c:eigLpPsvxDtT:", long_options, + NULL)) != -1) { switch (c) { case 'c': if (cmd != NULL) { @@ -8762,6 +8987,9 @@ zpool_do_status(int argc, char **argv) } cmd = optarg; break; + case 'e': + cb.cb_print_unhealthy = B_TRUE; + break; case 'i': cb.cb_print_vdev_init = B_TRUE; break; @@ -8795,6 +9023,9 @@ zpool_do_status(int argc, char **argv) case 'T': get_timestamp_arg(*optarg); break; + case POWER_OPT: + cb.cb_print_power = B_TRUE; + break; case '?': if (optopt == 'c') { print_zpool_script_list("status"); diff --git a/cmd/zpool/zpool_util.h b/cmd/zpool/zpool_util.h index da75866f5145..8fb389d6113f 100644 --- a/cmd/zpool/zpool_util.h +++ b/cmd/zpool/zpool_util.h @@ -124,6 +124,10 @@ vdev_cmd_data_list_t *all_pools_for_each_vdev_run(int argc, char **argv, void free_vdev_cmd_data_list(vdev_cmd_data_list_t *vcdl); +void free_vdev_cmd_data(vdev_cmd_data_t *data); + +int vdev_run_cmd_simple(char *path, char *cmd); + int check_device(const char *path, boolean_t force, boolean_t isspare, boolean_t iswholedisk); boolean_t check_sector_size_database(char *path, int *sector_size); @@ -131,6 +135,9 @@ void vdev_error(const char *fmt, ...); int check_file(const char *file, boolean_t force, boolean_t isspare); void after_zpool_upgrade(zpool_handle_t *zhp); +int zpool_power(zpool_handle_t *zhp, char *vdev, boolean_t turn_on); +int zpool_power_current_state(zpool_handle_t *zhp, char *vdev); + #ifdef __cplusplus } #endif diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c index 3d83da641ecb..c8daa730e721 100644 --- a/cmd/zpool/zpool_vdev.c +++ b/cmd/zpool/zpool_vdev.c @@ -373,6 +373,10 @@ make_leaf_vdev(nvlist_t *props, const char *arg, boolean_t is_primary) verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0); verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0); + /* Lookup and add the enclosure sysfs path (if exists) */ + update_vdev_config_dev_sysfs_path(vdev, path, + ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); + if (strcmp(type, VDEV_TYPE_DISK) == 0) verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, (uint64_t)wholedisk) == 0); @@ -921,6 +925,15 @@ zero_label(char *path) return (0); } +static void +lines_to_stderr(char *lines[], int lines_cnt) +{ + int i; + for (i = 0; i < lines_cnt; i++) { + fprintf(stderr, "%s\n", lines[i]); + } +} + /* * Go through and find any whole disks in the vdev specification, labelling them * as appropriate. When constructing the vdev spec, we were unable to open this @@ -932,7 +945,7 @@ zero_label(char *path) * need to get the devid after we label the disk. */ static int -make_disks(zpool_handle_t *zhp, nvlist_t *nv) +make_disks(zpool_handle_t *zhp, nvlist_t *nv, boolean_t replacing) { nvlist_t **child; uint_t c, children; @@ -1017,6 +1030,8 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv) */ if (!is_exclusive && !is_spare(NULL, udevpath)) { char *devnode = strrchr(devpath, '/') + 1; + char **lines = NULL; + int lines_cnt = 0; ret = strncmp(udevpath, UDISK_ROOT, strlen(UDISK_ROOT)); if (ret == 0) { @@ -1028,9 +1043,27 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv) /* * When labeling a pool the raw device node name * is provided as it appears under /dev/. + * + * Note that 'zhp' will be NULL when we're creating a + * pool. 
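+			 * The prepare reason passed below becomes the
+			 * script's VDEV_PREPARE value: "create" when zhp is
+			 * NULL, "replace" when replacing, else "add".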
*/ - if (zpool_label_disk(g_zfs, zhp, devnode) == -1) + if (zpool_prepare_and_label_disk(g_zfs, zhp, devnode, + nv, zhp == NULL ? "create" : + replacing ? "replace" : "add", &lines, + &lines_cnt) != 0) { + (void) fprintf(stderr, + gettext( + "Error preparing/labeling disk.\n")); + if (lines_cnt > 0) { + (void) fprintf(stderr, + gettext("zfs_prepare_disk output:\n")); + lines_to_stderr(lines, lines_cnt); + } + + libzfs_free_str_array(lines, lines_cnt); return (-1); + } + libzfs_free_str_array(lines, lines_cnt); /* * Wait for udev to signal the device is available @@ -1067,19 +1100,19 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv) } for (c = 0; c < children; c++) - if ((ret = make_disks(zhp, child[c])) != 0) + if ((ret = make_disks(zhp, child[c], replacing)) != 0) return (ret); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0) for (c = 0; c < children; c++) - if ((ret = make_disks(zhp, child[c])) != 0) + if ((ret = make_disks(zhp, child[c], replacing)) != 0) return (ret); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) for (c = 0; c < children; c++) - if ((ret = make_disks(zhp, child[c])) != 0) + if ((ret = make_disks(zhp, child[c], replacing)) != 0) return (ret); return (0); @@ -1740,7 +1773,7 @@ split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props, return (NULL); } - if (!flags.dryrun && make_disks(zhp, newroot) != 0) { + if (!flags.dryrun && make_disks(zhp, newroot, B_FALSE) != 0) { nvlist_free(newroot); return (NULL); } @@ -1861,7 +1894,7 @@ make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep, /* * Run through the vdev specification and label any whole disks found. */ - if (!dryrun && make_disks(zhp, newroot) != 0) { + if (!dryrun && make_disks(zhp, newroot, replacing) != 0) { nvlist_free(newroot); return (NULL); } diff --git a/config/Rules.am b/config/Rules.am index 3b24e3630102..8c484d18c2a0 100644 --- a/config/Rules.am +++ b/config/Rules.am @@ -41,6 +41,7 @@ AM_CPPFLAGS += -D_REENTRANT AM_CPPFLAGS += -D_FILE_OFFSET_BITS=64 AM_CPPFLAGS += -D_LARGEFILE64_SOURCE AM_CPPFLAGS += -DLIBEXECDIR=\"$(libexecdir)\" +AM_CPPFLAGS += -DZFSEXECDIR=\"$(zfsexecdir)\" AM_CPPFLAGS += -DRUNSTATEDIR=\"$(runstatedir)\" AM_CPPFLAGS += -DSBINDIR=\"$(sbindir)\" AM_CPPFLAGS += -DSYSCONFDIR=\"$(sysconfdir)\" diff --git a/include/libzfs.h b/include/libzfs.h index 214a188f9474..6c335bbc4af9 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -311,6 +311,8 @@ _LIBZFS_H int zpool_vdev_remove_wanted(zpool_handle_t *, const char *); extern int zpool_vdev_fault(zpool_handle_t *, uint64_t, vdev_aux_t); extern int zpool_vdev_degrade(zpool_handle_t *, uint64_t, vdev_aux_t); +extern int zpool_vdev_set_removed_state(zpool_handle_t *, uint64_t, + vdev_aux_t); extern int zpool_vdev_clear(zpool_handle_t *, uint64_t); extern nvlist_t *zpool_find_vdev(zpool_handle_t *, const char *, boolean_t *, @@ -318,6 +320,15 @@ extern nvlist_t *zpool_find_vdev(zpool_handle_t *, const char *, boolean_t *, extern nvlist_t *zpool_find_vdev_by_physpath(zpool_handle_t *, const char *, boolean_t *, boolean_t *, boolean_t *); extern int zpool_label_disk(libzfs_handle_t *, zpool_handle_t *, const char *); +extern int zpool_prepare_disk(zpool_handle_t *zhp, nvlist_t *vdev_nv, + const char *prepare_str, char **lines[], int *lines_cnt); +extern int zpool_prepare_and_label_disk(libzfs_handle_t *hdl, + zpool_handle_t *, const char *, nvlist_t *vdev_nv, const char *prepare_str, + char **lines[], int *lines_cnt); +extern char ** 
zpool_vdev_script_alloc_env(const char *pool_name, + const char *vdev_path, const char *vdev_upath, + const char *vdev_enc_sysfs_path, const char *opt_key, const char *opt_val); +extern void zpool_vdev_script_free_env(char **env); extern uint64_t zpool_vdev_path_to_guid(zpool_handle_t *zhp, const char *path); const char *zpool_get_state_str(zpool_handle_t *); diff --git a/include/libzutil.h b/include/libzutil.h index 15024a4e8888..af0f74318729 100644 --- a/include/libzutil.h +++ b/include/libzutil.h @@ -78,6 +78,7 @@ extern int zpool_find_config(void *, const char *, nvlist_t **, importargs_t *, extern const char * const * zpool_default_search_paths(size_t *count); extern int zpool_read_label(int, nvlist_t **, int *); extern int zpool_label_disk_wait(const char *, int); +extern int zpool_disk_wait(const char *); struct udev_device; @@ -143,6 +144,8 @@ extern void zfs_niceraw(uint64_t, char *, size_t); extern void zpool_dump_ddt(const ddt_stat_t *, const ddt_histogram_t *); extern int zpool_history_unpack(char *, uint64_t, uint64_t *, nvlist_t ***, uint_t *); +extern void fsleep(float sec); +extern int zpool_getenv_int(const char *env, int default_val); struct zfs_cmd; int zfs_ioctl_fd(int fd, unsigned long request, struct zfs_cmd *zc); @@ -184,9 +187,65 @@ _LIBZUTIL_H void zfs_setproctitle(const char *fmt, ...); typedef int (*pool_vdev_iter_f)(void *, nvlist_t *, void *); int for_each_vdev_cb(void *zhp, nvlist_t *nv, pool_vdev_iter_f func, void *data); +int for_each_vdev_macro_helper_func(void *zhp_data, nvlist_t *nv, void *data); +int for_each_real_leaf_vdev_macro_helper_func(void *zhp_data, nvlist_t *nv, + void *data); +/* + * Often you'll want to iterate over all the vdevs in the pool, but don't want + * to use for_each_vdev() since it requires a callback function. + * + * Instead you can use FOR_EACH_VDEV(): + * + * zpool_handle_t *zhp // Assume this is initialized + * nvlist_t *nv + * ... + * FOR_EACH_VDEV(zhp, nv) { + * const char *path = NULL; + * nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path); + * printf("Looking at vdev %s\n", path); + * } + * + * Note: FOR_EACH_VDEV runs in O(n^2) time where n = number of vdevs. However, + * there's an upper limit of 256 vdevs per dRAID top-level vdevs (TLDs), 255 for + * raidz2 TLDs, a real world limit of ~500 vdevs for mirrors, so this shouldn't + * really be an issue. + * + * Here are some micro-benchmarks of a complete FOR_EACH_VDEV loop on a RAID0 + * pool: + * + * 100 vdevs = 0.7ms + * 500 vdevs = 17ms + * 750 vdevs = 40ms + * 1000 vdevs = 82ms + * + * The '__nv += 0' at the end of the for() loop gets around a "comma or + * semicolon followed by non-blank" checkstyle error. Note on most compliers + * the '__nv += 0' can just be replaced with 'NULL', but gcc on Centos 7 + * will give a 'warning: statement with no effect' error if you do that. + */ +#define __FOR_EACH_VDEV(__zhp, __nv, __func) { \ + __nv = zpool_get_config(__zhp, NULL); \ + VERIFY0(nvlist_lookup_nvlist(__nv, ZPOOL_CONFIG_VDEV_TREE, &__nv)); \ + } \ + for (nvlist_t *__root_nv = __nv, *__state = (nvlist_t *)0; \ + for_each_vdev_cb(&__state, __root_nv, __func, &__nv) == 1; \ + __nv += 0) + +#define FOR_EACH_VDEV(__zhp, __nv) \ + __FOR_EACH_VDEV(__zhp, __nv, for_each_vdev_macro_helper_func) + +/* + * "real leaf" vdevs are leaf vdevs that are real devices (disks or files). + * This excludes leaf vdevs like like draid spares. 
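+ *
+ * For example (a minimal sketch, mirroring the FOR_EACH_VDEV example above):
+ *
+ * FOR_EACH_REAL_LEAF_VDEV(zhp, nv) {
+ *	const char *path = NULL;
+ *	nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path);
+ *	printf("Real leaf vdev: %s\n", path);
+ * }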
+ */
+#define	FOR_EACH_REAL_LEAF_VDEV(__zhp, __nv) \
+	__FOR_EACH_VDEV(__zhp, __nv, for_each_real_leaf_vdev_macro_helper_func)
+
 int for_each_vdev_in_nvlist(nvlist_t *nvroot, pool_vdev_iter_f func, void *data);
 void update_vdevs_config_dev_sysfs_path(nvlist_t *config);
+_LIBZUTIL_H void update_vdev_config_dev_sysfs_path(nvlist_t *nv,
+    const char *path, const char *key);
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/libnvpair/libnvpair.abi b/lib/libnvpair/libnvpair.abi
index 4f961c83667d..fa21e485f6ed 100644
--- a/lib/libnvpair/libnvpair.abi
+++ b/lib/libnvpair/libnvpair.abi
[libabigail-generated ABI XML hunks: the element markup did not survive extraction and is omitted]
diff --git a/lib/libuutil/libuutil.abi b/lib/libuutil/libuutil.abi
index 21418ec1d4a0..7b1a2cd3af60 100644
--- a/lib/libuutil/libuutil.abi
+++ b/lib/libuutil/libuutil.abi
[libabigail-generated ABI XML hunks omitted, as above]
diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi
index 58c2d7635f11..7d5f07d8fa55 100644
--- a/lib/libzfs/libzfs.abi
+++ b/lib/libzfs/libzfs.abi
[libabigail-generated ABI XML hunks omitted, as above]
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index fc6c6e8e2cf6..b3e8948cefcb 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -2808,6 +2808,9 @@ zpool_vdev_is_interior(const char *name)
 	return (B_FALSE);
 }
 
+/*
+ * Lookup the nvlist for a given vdev.
+ */
 nvlist_t *
 zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare,
     boolean_t *l2cache, boolean_t *log)
@@ -2815,6 +2818,7 @@
 	char *end;
 	nvlist_t *nvroot, *search, *ret;
 	uint64_t guid;
+	boolean_t __avail_spare, __l2cache, __log;
 
 	verify(nvlist_alloc(&search, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
@@ -2830,6 +2834,18 @@ zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare,
 	verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE,
 	    &nvroot) == 0);
 
+	/*
+	 * User can pass NULL for avail_spare, l2cache, and log, but
+	 * we still need to provide variables to vdev_to_nvlist_iter(), so
+	 * just point them to junk variables here.
+	 */
+	if (!avail_spare)
+		avail_spare = &__avail_spare;
+	if (!l2cache)
+		l2cache = &__l2cache;
+	if (!log)
+		log = &__log;
+
 	*avail_spare = B_FALSE;
 	*l2cache = B_FALSE;
 	if (log != NULL)
@@ -3233,21 +3249,23 @@ zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux)
 }
 
 /*
- * Mark the given vdev degraded.
+ * Generic set vdev state function
  */
-int
-zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux)
+static int
+zpool_vdev_set_state(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux,
+    vdev_state_t state)
 {
 	zfs_cmd_t zc = {"\0"};
 	char msg[1024];
 	libzfs_handle_t *hdl = zhp->zpool_hdl;
 
 	(void) snprintf(msg, sizeof (msg),
-	    dgettext(TEXT_DOMAIN, "cannot degrade %llu"), (u_longlong_t)guid);
+	    dgettext(TEXT_DOMAIN, "cannot set %s %llu"),
+	    zpool_state_to_name(state, aux), (u_longlong_t)guid);
 
 	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
 	zc.zc_guid = guid;
-	zc.zc_cookie = VDEV_STATE_DEGRADED;
+	zc.zc_cookie = state;
 	zc.zc_obj = aux;
 
 	if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
@@ -3256,6 +3274,27 @@ zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux)
 	return (zpool_standard_error(hdl, errno, msg));
 }
 
+/*
+ * Mark the given vdev degraded.
+ */
+int
+zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux)
+{
+	return (zpool_vdev_set_state(zhp, guid, aux, VDEV_STATE_DEGRADED));
+}
+
+/*
+ * Mark the given vdev as in a removed state (as if the device does not exist).
+ *
+ * This is different than zpool_vdev_remove() which does a removal of a device
+ * from the pool (but the device does exist).
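+ *
+ * For example (illustrative), `zpool offline --power` powers the slot off
+ * first and then does roughly:
+ *
+ *	uint64_t guid = zpool_vdev_path_to_guid(zhp, "sda");
+ *	zpool_vdev_set_removed_state(zhp, guid, VDEV_AUX_NONE);
+ *
+ * (doing it in the other order would make this call return EAGAIN).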
+ */ +int +zpool_vdev_set_removed_state(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) +{ + return (zpool_vdev_set_state(zhp, guid, aux, VDEV_STATE_REMOVED)); +} + /* * Returns TRUE if the given nvlist is a vdev that was originally swapped in as * a hot spare. diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index 7c4d310782eb..256ae9d1b778 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -2121,3 +2121,196 @@ printf_color(const char *color, char *format, ...) return (rc); } + +/* PATH + 5 env vars + a NULL entry = 7 */ +#define ZPOOL_VDEV_SCRIPT_ENV_COUNT 7 + +/* + * There's a few places where ZFS will call external scripts (like the script + * in zpool.d/ and `zfs_prepare_disk`). These scripts are called with a + * reduced $PATH, and some vdev specific environment vars set. This function + * will allocate an populate the environment variable array that is passed to + * these scripts. The user must free the arrays with zpool_vdev_free_env() when + * they are done. + * + * The following env vars will be set (but value could be blank): + * + * POOL_NAME + * VDEV_PATH + * VDEV_UPATH + * VDEV_ENC_SYSFS_PATH + * + * In addition, you can set an optional environment variable named 'opt_key' + * to 'opt_val' if you want. + * + * Returns allocated env[] array on success, NULL otherwise. + */ +char ** +zpool_vdev_script_alloc_env(const char *pool_name, + const char *vdev_path, const char *vdev_upath, + const char *vdev_enc_sysfs_path, const char *opt_key, const char *opt_val) +{ + char **env = NULL; + int rc; + + env = calloc(ZPOOL_VDEV_SCRIPT_ENV_COUNT, sizeof (*env)); + if (!env) + return (NULL); + + env[0] = strdup("PATH=/bin:/sbin:/usr/bin:/usr/sbin"); + if (!env[0]) + goto error; + + /* Setup our custom environment variables */ + rc = asprintf(&env[1], "POOL_NAME=%s", pool_name ? pool_name : ""); + if (rc == -1) { + env[1] = NULL; + goto error; + } + + rc = asprintf(&env[2], "VDEV_PATH=%s", vdev_path ? vdev_path : ""); + if (rc == -1) { + env[2] = NULL; + goto error; + } + + rc = asprintf(&env[3], "VDEV_UPATH=%s", vdev_upath ? vdev_upath : ""); + if (rc == -1) { + env[3] = NULL; + goto error; + } + + rc = asprintf(&env[4], "VDEV_ENC_SYSFS_PATH=%s", + vdev_enc_sysfs_path ? vdev_enc_sysfs_path : ""); + if (rc == -1) { + env[4] = NULL; + goto error; + } + + if (opt_key != NULL) { + rc = asprintf(&env[5], "%s=%s", opt_key, + opt_val ? opt_val : ""); + if (rc == -1) { + env[5] = NULL; + goto error; + } + } + + return (env); + +error: + for (int i = 0; i < ZPOOL_VDEV_SCRIPT_ENV_COUNT; i++) + free(env[i]); + + free(env); + + return (NULL); +} + +/* + * Free the env[] array that was allocated by zpool_vdev_script_alloc_env(). + */ +void +zpool_vdev_script_free_env(char **env) +{ + for (int i = 0; i < ZPOOL_VDEV_SCRIPT_ENV_COUNT; i++) + free(env[i]); + + free(env); +} + +/* + * Prepare a disk by (optionally) running a program before labeling the disk. + * This can be useful for installing disk firmware or doing some pre-flight + * checks on the disk before it becomes part of the pool. The program run is + * located at ZFSEXECDIR/zfs_prepare_disk + * (E.x: /usr/local/libexec/zfs/zfs_prepare_disk). + * + * Return 0 on success, non-zero on failure. 
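+ *
+ * A typical call site pairs the env helpers above like so (a minimal
+ * sketch; compare vdev_run_cmd() in cmd/zpool/zpool_iter.c):
+ *
+ *	char **env = zpool_vdev_script_alloc_env(pool, path, upath,
+ *	    enc_sysfs_path, NULL, NULL);
+ *	if (env != NULL) {
+ *		rc = libzfs_run_process_get_stdout_nopath(cmd, argv, env,
+ *		    &lines, &lines_cnt);
+ *		zpool_vdev_script_free_env(env);
+ *	}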
+ */ +int +zpool_prepare_disk(zpool_handle_t *zhp, nvlist_t *vdev_nv, + const char *prepare_str, char **lines[], int *lines_cnt) +{ + const char *script_path = ZFSEXECDIR "/zfs_prepare_disk"; + const char *pool_name; + int rc = 0; + + /* Path to script and a NULL entry */ + char *argv[2] = {(char *)script_path}; + char **env = NULL; + char *path = NULL, *enc_sysfs_path = NULL; + char *upath; + *lines_cnt = 0; + + if (access(script_path, X_OK) != 0) { + /* No script, nothing to do */ + return (0); + } + + (void) nvlist_lookup_string(vdev_nv, ZPOOL_CONFIG_PATH, &path); + (void) nvlist_lookup_string(vdev_nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, + &enc_sysfs_path); + + upath = zfs_get_underlying_path(path); + pool_name = zhp ? zpool_get_name(zhp) : NULL; + + env = zpool_vdev_script_alloc_env(pool_name, path, upath, + enc_sysfs_path, "VDEV_PREPARE", prepare_str); + + free(upath); + + if (env == NULL) { + return (ENOMEM); + } + + rc = libzfs_run_process_get_stdout(script_path, argv, env, lines, + lines_cnt); + + zpool_vdev_script_free_env(env); + + return (rc); +} + +/* + * Optionally run a script and then label a disk. The script can be used to + * prepare a disk for inclusion into the pool. For example, it might update + * the disk's firmware or check its health. + * + * The 'name' provided is the short name, stripped of any leading + * /dev path, and is passed to zpool_label_disk. vdev_nv is the nvlist for + * the vdev. prepare_str is a string that gets passed as the VDEV_PREPARE + * env variable to the script. + * + * The following env vars are passed to the script: + * + * POOL_NAME: The pool name (blank during zpool create) + * VDEV_PREPARE: Reason why the disk is being prepared for inclusion: + * "create", "add", "replace", or "autoreplace" + * VDEV_PATH: Path to the disk + * VDEV_UPATH: One of the 'underlying paths' to the disk. This is + * useful for DM devices. + * VDEV_ENC_SYSFS_PATH: Path to the disk's enclosure sysfs path, if available. + * + * Note, some of these values can be blank. + * + * Return 0 on success, non-zero otherwise. 
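+ *
+ * For example (illustrative), an autoreplace caller does roughly:
+ *
+ *	if (zpool_prepare_and_label_disk(hdl, zhp, "sda", vdev_nv,
+ *	    "autoreplace", &lines, &lines_cnt) != 0)
+ *		log the captured `lines` output;
+ *	libzfs_free_str_array(lines, lines_cnt);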
+ */
+int
+zpool_prepare_and_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp,
+    const char *name, nvlist_t *vdev_nv, const char *prepare_str,
+    char **lines[], int *lines_cnt)
+{
+	int rc;
+	char vdev_path[MAXPATHLEN];
+	(void) snprintf(vdev_path, sizeof (vdev_path), "%s/%s", DISK_ROOT,
+	    name);
+
+	/* zhp will be NULL when creating a pool */
+	rc = zpool_prepare_disk(zhp, vdev_nv, prepare_str, lines, lines_cnt);
+	if (rc != 0)
+		return (rc);
+
+	rc = zpool_label_disk(hdl, zhp, name);
+	return (rc);
+}
diff --git a/lib/libzfs_core/libzfs_core.abi b/lib/libzfs_core/libzfs_core.abi
index 7ede3e0972d5..089cf48ae9ef 100644
--- a/lib/libzfs_core/libzfs_core.abi
+++ b/lib/libzfs_core/libzfs_core.abi
[libabigail-generated ABI XML hunks: the element markup did not survive extraction and is omitted]
diff --git a/lib/libzfsbootenv/libzfsbootenv.abi b/lib/libzfsbootenv/libzfsbootenv.abi
index 0ddd41d0630e..8a7fa782d8a0 100644
--- a/lib/libzfsbootenv/libzfsbootenv.abi
+++ b/lib/libzfsbootenv/libzfsbootenv.abi
[libabigail-generated ABI XML hunks omitted, as above]
diff --git a/lib/libzutil/os/freebsd/zutil_import_os.c b/lib/libzutil/os/freebsd/zutil_import_os.c
index 7c48e06f9315..a57a2db57eba 100644
--- a/lib/libzutil/os/freebsd/zutil_import_os.c
+++ b/lib/libzutil/os/freebsd/zutil_import_os.c
@@ -248,7 +248,24 @@ zfs_dev_flush(int fd __unused)
 	return (0);
 }
 
+void
+update_vdev_config_dev_sysfs_path(nvlist_t *nv, const char *path,
+    const char *key)
+{
+	(void) nv;
+	(void) path;
+	(void) key;
+}
+
 void
 update_vdevs_config_dev_sysfs_path(nvlist_t *config)
 {
 }
+
+int
+zpool_disk_wait(const char *path)
+{
+
+	(void) path;
+	return (ENOTSUP);
+}
diff --git a/lib/libzutil/os/linux/zutil_import_os.c b/lib/libzutil/os/linux/zutil_import_os.c
index 6c406d373a0c..ebf20956a213 100644
--- a/lib/libzutil/os/linux/zutil_import_os.c
+++ b/lib/libzutil/os/linux/zutil_import_os.c
@@ -188,25 +188,17 @@ zpool_open_func(void *arg)
 	if (rn->rn_labelpaths) {
 		char *path = NULL;
 		char *devid =
NULL; - char *env = NULL; rdsk_node_t *slice; avl_index_t where; - int timeout; int error; if (label_paths(rn->rn_hdl, rn->rn_config, &path, &devid)) return; - env = getenv("ZPOOL_IMPORT_UDEV_TIMEOUT_MS"); - if ((env == NULL) || sscanf(env, "%d", &timeout) != 1 || - timeout < 0) { - timeout = DISK_LABEL_WAIT; - } - /* * Allow devlinks to stabilize so all paths are available. */ - zpool_label_disk_wait(rn->rn_name, timeout); + zpool_disk_wait(rn->rn_name); if (path != NULL) { slice = zutil_alloc(hdl, sizeof (rdsk_node_t)); @@ -700,6 +692,20 @@ zpool_label_disk_wait(const char *path, int timeout_ms) #endif /* HAVE_LIBUDEV */ } +/* + * Simplified version of zpool_label_disk_wait() where we wait for a device + * to appear using the default timeouts. + */ +int +zpool_disk_wait(const char *path) +{ + int timeout; + timeout = zpool_getenv_int("ZPOOL_IMPORT_UDEV_TIMEOUT_MS", + DISK_LABEL_WAIT); + + return (zpool_label_disk_wait(path, timeout)); +} + /* * Encode the persistent devices strings * used for the vdev disk label @@ -781,20 +787,37 @@ encode_device_strings(const char *path, vdev_dev_strs_t *ds, * Rescan the enclosure sysfs path for turning on enclosure LEDs and store it * in the nvlist * (if applicable). Like: * vdev_enc_sysfs_path: '/sys/class/enclosure/11:0:1:0/SLOT 4' + * + * If an old path was in the nvlist, and the rescan can not find a new path, + * then keep the old path, since the disk may have been removed. + * + * path: The vdev path (value from ZPOOL_CONFIG_PATH) + * key: The nvlist_t name (like ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH) */ -static void -update_vdev_config_dev_sysfs_path(nvlist_t *nv, char *path) +void +update_vdev_config_dev_sysfs_path(nvlist_t *nv, const char *path, + const char *key) { char *upath, *spath; + char *oldpath = NULL; + + (void) nvlist_lookup_string(nv, key, &oldpath); /* Add enclosure sysfs path (if disk is in an enclosure). */ upath = zfs_get_underlying_path(path); spath = zfs_get_enclosure_sysfs_path(upath); if (spath) { - nvlist_add_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, spath); + (void) nvlist_add_string(nv, key, spath); } else { - nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); + /* + * We couldn't dynamically scan the disk's enclosure sysfs path. + * This could be because the disk went away. If there's an old + * enclosure sysfs path in the nvlist, then keep using it. + */ + if (!oldpath) { + (void) nvlist_remove_all(nv, key); + } } free(upath); @@ -812,7 +835,8 @@ sysfs_path_pool_vdev_iter_f(void *hdl_data, nvlist_t *nv, void *data) return (1); /* Rescan our enclosure sysfs path for this vdev */ - update_vdev_config_dev_sysfs_path(nv, path); + update_vdev_config_dev_sysfs_path(nv, path, + ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); return (0); } @@ -901,7 +925,8 @@ update_vdev_config_dev_strs(nvlist_t *nv) (void) nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, vds.vds_devphys); } - update_vdev_config_dev_sysfs_path(nv, path); + update_vdev_config_dev_sysfs_path(nv, path, + ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); } else { /* Clear out any stale entries. */ (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID); diff --git a/lib/libzutil/zutil_import.c b/lib/libzutil/zutil_import.c index 98f138957ba6..b9a0b67f2b8c 100644 --- a/lib/libzutil/zutil_import.c +++ b/lib/libzutil/zutil_import.c @@ -1836,6 +1836,104 @@ zpool_find_config(void *hdl, const char *target, nvlist_t **configp, return (0); } +/* Return if a vdev is a leaf vdev. Note: draid spares are leaf vdevs. 
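+ *
+ * (A leaf vdev's nvlist has no ZPOOL_CONFIG_CHILDREN array; interior vdevs
+ * such as mirror and raidz always do.)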
diff --git a/lib/libzutil/zutil_import.c b/lib/libzutil/zutil_import.c
index 98f138957ba6..b9a0b67f2b8c 100644
--- a/lib/libzutil/zutil_import.c
+++ b/lib/libzutil/zutil_import.c
@@ -1836,6 +1836,104 @@ zpool_find_config(void *hdl, const char *target, nvlist_t **configp,
 	return (0);
 }
 
+/* Return whether a vdev is a leaf vdev. Note: draid spares are leaf vdevs. */
+static boolean_t
+vdev_is_leaf(nvlist_t *nv)
+{
+	uint_t children = 0;
+	nvlist_t **child;
+
+	(void) nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children);
+
+	return (children == 0);
+}
+
+/* Return whether a vdev is a leaf vdev and a real device (disk or file) */
+static boolean_t
+vdev_is_real_leaf(nvlist_t *nv)
+{
+	char *type = NULL;
+	if (!vdev_is_leaf(nv))
+		return (B_FALSE);
+
+	(void) nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type);
+	if ((strcmp(type, VDEV_TYPE_DISK) == 0) ||
+	    (strcmp(type, VDEV_TYPE_FILE) == 0)) {
+		return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * This function is called by our FOR_EACH_VDEV() macros.
+ *
+ * state: State machine status (stored inside of a (nvlist_t *))
+ * nv: The current vdev nvlist_t we are iterating over.
+ * last_nv: The previous vdev nvlist_t we returned to the user in
+ *          the last iteration of FOR_EACH_VDEV(). We use it
+ *          to find the next vdev nvlist_t we should return.
+ * real_leaves_only: Only return real leaf vdevs (disks or files).
+ *
+ * Returns 1 if we found the next vdev nvlist_t for this iteration. 0 if
+ * we're still searching for it.
+ */
+static int
+__for_each_vdev_macro_helper_func(void *state, nvlist_t *nv, void *last_nv,
+    boolean_t real_leaves_only)
+{
+	enum {FIRST_NV = 0, NEXT_IS_MATCH = 1, STOP_LOOKING = 2};
+
+	/* The very first entry in the NV list is a special case */
+	if (*((nvlist_t **)state) == (nvlist_t *)FIRST_NV) {
+		if (real_leaves_only && !vdev_is_real_leaf(nv))
+			return (0);
+
+		*((nvlist_t **)last_nv) = nv;
+		*((nvlist_t **)state) = (nvlist_t *)STOP_LOOKING;
+		return (1);
+	}
+
+	/*
+	 * We came across our last_nv, meaning the next one is the one we
+	 * want
+	 */
+	if (nv == *((nvlist_t **)last_nv)) {
+		/* Next iteration of this function will return the nvlist_t */
+		*((nvlist_t **)state) = (nvlist_t *)NEXT_IS_MATCH;
+		return (0);
+	}
+
+	/*
+	 * We marked NEXT_IS_MATCH on the previous iteration, so this is the
+	 * one we want.
+	 */
+	if (*(nvlist_t **)state == (nvlist_t *)NEXT_IS_MATCH) {
+		if (real_leaves_only && !vdev_is_real_leaf(nv))
+			return (0);
+
+		*((nvlist_t **)last_nv) = nv;
+		*((nvlist_t **)state) = (nvlist_t *)STOP_LOOKING;
+		return (1);
+	}
+
+	return (0);
+}
+
+int
+for_each_vdev_macro_helper_func(void *state, nvlist_t *nv, void *last_nv)
+{
+	return (__for_each_vdev_macro_helper_func(state, nv, last_nv, B_FALSE));
+}
+
+int
+for_each_real_leaf_vdev_macro_helper_func(void *state, nvlist_t *nv,
+    void *last_nv)
+{
+	return (__for_each_vdev_macro_helper_func(state, nv, last_nv, B_TRUE));
+}
+
 /*
  * Internal function for iterating over the vdevs.
 *
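The three-state helper above is easier to see with the nvlist machinery stripped away: each FOR_EACH_VDEV() iteration re-walks the whole vdev tree from the top, and the FIRST_NV / NEXT_IS_MATCH / STOP_LOOKING states pick out exactly one new element per walk. A self-contained toy model over an int array; the next_item() driver and all names here are illustrative only, since the real macros walk the tree through the callbacks above:

#include <stdio.h>

enum { FIRST = 0, NEXT_IS_MATCH = 1, STOP = 2 };

static int *
next_item(int items[], int n, int **last_p)
{
	/* A fresh walk starts in FIRST only when nothing was returned yet */
	int state = (*last_p == NULL) ? FIRST : STOP;

	for (int i = 0; i < n; i++) {
		int *item = &items[i];

		if (state == FIRST) {		/* very first element wins */
			*last_p = item;
			return (item);
		}
		if (item == *last_p) {		/* found the previous result */
			state = NEXT_IS_MATCH;
			continue;
		}
		if (state == NEXT_IS_MATCH) {	/* the one right after it */
			*last_p = item;
			return (item);
		}
	}
	return (NULL);	/* walked past the end: iteration is done */
}

int
main(void)
{
	int vdevs[] = { 10, 20, 30 };
	int *last = NULL, *it;

	while ((it = next_item(vdevs, 3, &last)) != NULL)
		printf("visited %d\n", *it);	/* 10, then 20, then 30 */
	return (0);
}

Re-scanning from the start on every iteration is O(n^2) over the tree, but it lets the macros keep their entire iteration state in two opaque pointers, with no allocation.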
+ */ +int +zpool_getenv_int(const char *env, int default_val) +{ + char *str; + int val; + str = getenv(env); + if ((str == NULL) || sscanf(str, "%d", &val) != 1 || + val < 0) { + val = default_val; + } + return (val); +} diff --git a/man/Makefile.am b/man/Makefile.am index 64650c2b988a..2608461625d4 100644 --- a/man/Makefile.am +++ b/man/Makefile.am @@ -104,7 +104,8 @@ dist_man_MANS = \ nodist_man_MANS = \ man8/zed.8 \ - man8/zfs-mount-generator.8 + man8/zfs-mount-generator.8 \ + man8/zfs_prepare_disk.8 SUBSTFILES += $(nodist_man_MANS) diff --git a/man/man8/.gitignore b/man/man8/.gitignore index f2fc702147e9..a468f9cbf9d3 100644 --- a/man/man8/.gitignore +++ b/man/man8/.gitignore @@ -1,2 +1,3 @@ /zed.8 /zfs-mount-generator.8 +/zfs_prepare_disk.8 diff --git a/man/man8/zfs_prepare_disk.8.in b/man/man8/zfs_prepare_disk.8.in new file mode 100644 index 000000000000..2a741531e415 --- /dev/null +++ b/man/man8/zfs_prepare_disk.8.in @@ -0,0 +1,70 @@ +.\" +.\" Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). +.\" Copyright (C) 2023 Lawrence Livermore National Security, LLC. +.\" Refer to the OpenZFS git commit log for authoritative copyright attribution. +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License Version 1.0 (CDDL-1.0). +.\" You can obtain a copy of the license from the top-level file +.\" "OPENSOLARIS.LICENSE" or at . +.\" You may not use this file except in compliance with the license. +.\" +.\" Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049) +.\" +.Dd August 30, 2023 +.Dt ZFS_PREPARE_DISK 8 +.Os +. +.Sh NAME +.Nm zfs_prepare_disk +.Nd special script that gets run before bringing a disk into a pool +.Sh DESCRIPTION +.Nm +is an optional script that gets called by libzfs before bringing a disk into a +pool. +It can be modified by the user to run whatever commands are necessary to prepare +a disk for inclusion into the pool. +For example, users can add lines to +.Nm zfs_prepare_disk +to do things like update the drive's firmware or check the drive's health. +.Nm zfs_prepare_disk +is optional and can be removed if not needed. +libzfs will look for the script at @zfsexecdir@/zfs_prepare_disk. +. +.Ss Properties +.Nm zfs_prepare_disk +will be passed the following environment variables: +.sp +.Bl -tag -compact -width "VDEV_ENC_SYSFS_PATH" +. +.It Nm POOL_NAME +.No Name of the pool +.It Nm VDEV_PATH +.No Path to the disk (like /dev/sda) +.It Nm VDEV_PREPARE +.No Reason why the disk is being prepared for inclusion +('create', 'add', 'replace', or 'autoreplace'). +This can be useful if you only want the script to be run under certain actions. +.It Nm VDEV_UPATH +.No Path to one of the underlying devices for the +disk. +For multipath this would return one of the /dev/sd* paths to the disk. +If the device is not a device mapper device, then +.Nm VDEV_UPATH +just returns the same value as +.Nm VDEV_PATH +.It Nm VDEV_ENC_SYSFS_PATH +.No Path to the disk's enclosure sysfs path, if available +.El +.Pp +Note that some of these variables may have a blank value. +.Nm POOL_NAME +is blank at pool creation time, for example. +.Sh ENVIRONMENT +.Nm zfs_prepare_disk +runs with a limited $PATH. +.Sh EXIT STATUS +.Nm zfs_prepare_disk +should return 0 on success, non-zero otherwise. +If non-zero is returned, the disk will not be included in the pool. +. 
diff --git a/man/man8/zpool-clear.8 b/man/man8/zpool-clear.8
index 0b256b28bd21..19861a319000 100644
--- a/man/man8/zpool-clear.8
+++ b/man/man8/zpool-clear.8
@@ -36,6 +36,7 @@
 .Sh SYNOPSIS
 .Nm zpool
 .Cm clear
+.Op Fl -power
 .Ar pool
 .Oo Ar device Oc Ns …
 .
@@ -52,6 +53,16 @@ Pools with
 enabled which have been suspended cannot be resumed.
 While the pool was suspended, it may have been imported on another host,
 and resuming I/O could result in pool damage.
+.Bl -tag -width Ds
+.It Fl -power
+Power on the device's slot in the storage enclosure and wait for the device
+to show up before attempting to clear errors.
+This is done on all the devices specified.
+Alternatively, you can set the
+.Sy ZPOOL_AUTO_POWER_ON_SLOT
+environment variable to always enable this behavior.
+Note: This flag currently works on Linux only.
+.El
 .
 .Sh SEE ALSO
 .Xr zdb 8 ,
diff --git a/man/man8/zpool-offline.8 b/man/man8/zpool-offline.8
index 9b2cf59cf414..011cefed2f13 100644
--- a/man/man8/zpool-offline.8
+++ b/man/man8/zpool-offline.8
@@ -36,12 +36,13 @@
 .Sh SYNOPSIS
 .Nm zpool
 .Cm offline
-.Op Fl ft
+.Op Fl Sy -power Ns | Ns Op Fl Sy ft
 .Ar pool
 .Ar device Ns …
 .Nm zpool
 .Cm online
-.Op Fl e
+.Op Fl Sy -power
+.Op Fl Sy e
 .Ar pool
 .Ar device Ns …
 .
@@ -50,7 +51,7 @@
 .It Xo
 .Nm zpool
 .Cm offline
-.Op Fl ft
+.Op Fl Sy -power Ns | Ns Op Fl Sy ft
 .Ar pool
 .Ar device Ns …
 .Xc
@@ -60,6 +61,9 @@ While the
 is offline, no attempt is made to read or write to the device.
 This command is not applicable to spares.
 .Bl -tag -width Ds
+.It Fl -power
+Power off the device's slot in the storage enclosure.
+This flag currently works on Linux only.
 .It Fl f
 Force fault.
 Instead of offlining the disk, put it into a faulted state.
@@ -73,6 +77,7 @@ Upon reboot, the specified physical device reverts to its previous state.
 .It Xo
 .Nm zpool
 .Cm online
+.Op Fl -power
 .Op Fl e
 .Ar pool
 .Ar device Ns …
@@ -80,6 +85,13 @@ Upon reboot, the specified physical device reverts to its previous state.
 Brings the specified physical device online.
 This command is not applicable to spares.
 .Bl -tag -width Ds
+.It Fl -power
+Power on the device's slot in the storage enclosure and wait for the device
+to show up before attempting to online it.
+Alternatively, you can set the
+.Sy ZPOOL_AUTO_POWER_ON_SLOT
+environment variable to always enable this behavior.
+This flag currently works on Linux only.
 .It Fl e
 Expand the device to use all available space.
 If the device is part of a mirror or raidz then all devices must be expanded
diff --git a/man/man8/zpool-status.8 b/man/man8/zpool-status.8
index 7c825f69d8e2..11d0696ad5eb 100644
--- a/man/man8/zpool-status.8
+++ b/man/man8/zpool-status.8
@@ -36,7 +36,7 @@
 .Sh SYNOPSIS
 .Nm zpool
 .Cm status
-.Op Fl DigLpPstvx
+.Op Fl DeigLpPstvx
 .Op Fl T Sy u Ns | Ns Sy d
 .Op Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns …
 .Oo Ar pool Oc Ns …
@@ -57,6 +57,8 @@ and the estimated time to completion.
 Both of these are only approximate, because the amount of data in the pool
 and the other workloads on the system can change.
 .Bl -tag -width Ds
+.It Fl -power
+Display vdev enclosure slot power status (on or off).
 .It Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns …
 Run a script (or scripts) on each vdev and include the output as a new column
 in the
@@ -67,6 +69,8 @@ See the
 option of
 .Nm zpool Cm iostat
 for complete details.
+.It Fl e
+Only show unhealthy vdevs (not-ONLINE or with errors).
 .It Fl i
 Display vdev initialization status.
 .It Fl g
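Behind all of the new --power options, slot power is a sysfs attribute that gets written and then polled until the value reads back, bounded by the ZPOOL_POWER_ON_SLOT_TIMEOUT_MS variable documented in the zpool.8 hunk below. An illustrative sketch only, assuming the /sys/class/enclosure/<id>/<slot>/power_status layout referenced by the statechange-slot_off.sh zedlet; set_slot_power() and its 100ms polling interval are assumptions, not the actual zpool implementation:

#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int
set_slot_power(const char *slot_dir, const char *value, int timeout_ms)
{
	char path[256], buf[16];
	FILE *fp;

	snprintf(path, sizeof (path), "%s/power_status", slot_dir);

	/* Request the new power state */
	if ((fp = fopen(path, "w")) == NULL)
		return (-1);
	fprintf(fp, "%s\n", value);
	fclose(fp);

	/* Poll until the enclosure reports the value back, or we time out */
	for (int waited = 0; waited < timeout_ms; waited += 100) {
		if ((fp = fopen(path, "r")) == NULL)
			return (-1);
		if (fgets(buf, sizeof (buf), fp) != NULL &&
		    strncmp(buf, value, strlen(value)) == 0) {
			fclose(fp);
			return (0);
		}
		fclose(fp);
		usleep(100 * 1000);	/* 100ms between polls */
	}
	return (-1);	/* enclosure never confirmed the new state */
}

int
main(void)
{
	/* hypothetical slot path; 30000ms mirrors the documented default */
	return (set_slot_power("/sys/class/enclosure/0:0:0:0/Slot 1", "on",
	    30000) ? 1 : 0);
}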
diff --git a/man/man8/zpool.8 b/man/man8/zpool.8
index e5d7c8515177..591c7772c749 100644
--- a/man/man8/zpool.8
+++ b/man/man8/zpool.8
@@ -424,7 +424,7 @@ rpool 14.6G 54.9G 4 55 250K 2.69M
 .El
 .
 .Sh ENVIRONMENT VARIABLES
-.Bl -tag -compact -width "ZPOOL_IMPORT_UDEV_TIMEOUT_MS"
+.Bl -tag -compact -width "ZPOOL_STATUS_NON_NATIVE_ASHIFT_IGNORE"
 .It Sy ZFS_ABORT
 Cause
 .Nm
@@ -436,6 +436,23 @@ Use ANSI color in
 and
 .Nm zpool iostat
 output.
+.It Sy ZPOOL_AUTO_POWER_ON_SLOT
+Automatically attempt to turn on a drive's enclosure slot power when
+running the
+.Nm zpool Cm online
+or
+.Nm zpool Cm clear
+commands.
+This has the same effect as passing the
+.Fl -power
+option to those commands.
+.It Sy ZPOOL_POWER_ON_SLOT_TIMEOUT_MS
+The maximum time in milliseconds to wait for a slot power sysfs value
+to return the correct value after writing it.
+For example, after writing "on" to the sysfs enclosure slot power_control file,
+it can take some time for the enclosure to power up the slot and return
+"on" if you read back the 'power_control' value.
+Defaults to 30 seconds (30000ms) if not set.
 .It Sy ZPOOL_IMPORT_PATH
 The search path for devices or files to use with the pool.
 This is a colon-separated list of directories in which
diff --git a/module/lua/ldebug.c b/module/lua/ldebug.c
index da005c44376e..c724e4632ee5 100644
--- a/module/lua/ldebug.c
+++ b/module/lua/ldebug.c
@@ -112,10 +112,11 @@ static const char *upvalname (Proto *p, int uv) {
 
 static const char *findvararg (CallInfo *ci, int n, StkId *pos) {
   int nparams = clLvalue(ci->func)->p->numparams;
-  if (n >= ci->u.l.base - ci->func - nparams)
+  int nvararg = cast_int(ci->u.l.base - ci->func) - nparams;
+  if (n <= -nvararg)
     return NULL;  /* no such vararg */
   else {
-    *pos = ci->func + nparams + n;
+    *pos = ci->func + nparams - n;
     return "(*vararg)";  /* generic name for any vararg */
   }
 }
@@ -127,7 +128,7 @@ static const char *findlocal (lua_State *L, CallInfo *ci, int n,
   StkId base;
   if (isLua(ci)) {
     if (n < 0)  /* access to vararg values? */
-      return findvararg(ci, -n, pos);
+      return findvararg(ci, n, pos);
     else {
       base = ci->u.l.base;
       name = luaF_getlocalname(ci_func(ci)->p, n, currentpc(ci));
diff --git a/scripts/Makefile.am b/scripts/Makefile.am
index 047ae7eaca6d..f3c133a29ae8 100644
--- a/scripts/Makefile.am
+++ b/scripts/Makefile.am
@@ -9,6 +9,9 @@ dist_pkgdata_SCRIPTS = \
 	zloop.sh \
 	zfs-helpers.sh
 
+dist_zfsexec_SCRIPTS = \
+	zfs_prepare_disk
+
 EXTRA_SCRIPTS = \
 	commitcheck.sh \
 	common.sh.in \
diff --git a/scripts/zfs_prepare_disk b/scripts/zfs_prepare_disk
new file mode 100755
index 000000000000..02aa9f8a7728
--- /dev/null
+++ b/scripts/zfs_prepare_disk
@@ -0,0 +1,17 @@
+#!/bin/sh
+#
+# This is an optional helper script that is automatically called by libzfs
+# before a disk is about to be added into the pool. It can be modified by
+# the user to run whatever commands are necessary to prepare a disk for
+# inclusion into the pool. For example, users can add lines to this
+# script to do things like update the drive's firmware or check the drive's
+# health. The script is optional and can be removed if it is not needed.
+#
+# See the zfs_prepare_disk(8) man page for details.
+# +# Example: +# +# echo "Prepare disk $VDEV_PATH ($VDEV_UPATH) for $VDEV_PREPARE in $POOL_NAME" +# + +exit 0 diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 6c2296d4c269..5097e08b1b55 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -482,7 +482,7 @@ tags = ['functional', 'cli_root', 'zpool_split'] [tests/functional/cli_root/zpool_status] tests = ['zpool_status_001_pos', 'zpool_status_002_pos', - 'zpool_status_features_001_pos'] + 'zpool_status_008_pos', 'zpool_status_features_001_pos'] tags = ['functional', 'cli_root', 'zpool_status'] [tests/functional/cli_root/zpool_sync] @@ -575,7 +575,7 @@ tests = ['compress_001_pos', 'compress_002_pos', 'compress_003_pos', tags = ['functional', 'compression'] [tests/functional/cp_files] -tests = ['cp_files_001_pos'] +tests = ['cp_files_001_pos', 'cp_stress'] tags = ['functional', 'cp_files'] [tests/functional/crtime] diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index d686f27232cf..9e7f46d5f7d1 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -37,6 +37,12 @@ . ${STF_SUITE}/include/tunables.cfg +# On AlmaLinux 9 we will see $PWD = '.' instead of the full path. This causes +# some tests to fail. Fix it up here. +if [ "$PWD" = "." ] ; then + PWD="$(readlink -f $PWD)" +fi + # # Apply constrained path when available. This is required since the # PATH may have been modified by sudo's secure_path behavior. @@ -3356,6 +3362,18 @@ function is_te_enabled fi } +# Return the number of CPUs (cross-platform) +function get_num_cpus +{ + if is_linux ; then + grep -c '^processor' /proc/cpuinfo + elif is_freebsd; then + sysctl -n kern.smp.cpus + else + psrinfo | wc -l + fi +} + # Utility function to determine if a system has multiple cpus. 
 function is_mp
 {
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib b/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib
index f7461437c615..3a8cba7b0f16 100644
--- a/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib
+++ b/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib
@@ -124,7 +124,10 @@ if not httpd:
 with open('$HTTPS_PORT_FILE', 'w') as portf:
     print(port, file=portf)
 
-httpd.socket = ssl.wrap_socket(httpd.socket, server_side=True, keyfile='/$TESTPOOL/snakeoil.key', certfile='$SSL_CA_CERT_FILE', ssl_version=ssl.PROTOCOL_TLS)
+sslctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
+sslctx.check_hostname = False
+sslctx.load_cert_chain(certfile='$SSL_CA_CERT_FILE', keyfile='/$TESTPOOL/snakeoil.key')
+httpd.socket = sslctx.wrap_socket(httpd.socket, server_side=True)
 
 os.chdir('$STF_SUITE/tests/functional/cli_root/zfs_load-key')
 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_status/Makefile.am
index 5553061c67b3..8dff69381a4e 100644
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_status/Makefile.am
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/Makefile.am
@@ -4,4 +4,5 @@ dist_pkgdata_SCRIPTS = \
 	cleanup.ksh \
 	zpool_status_001_pos.ksh \
 	zpool_status_002_pos.ksh \
+	zpool_status_008_pos.ksh \
 	zpool_status_features_001_pos.ksh
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_002_pos.ksh
index e2751b112597..976906bce0c9 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_002_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_002_pos.ksh
@@ -51,7 +51,7 @@ else
 fi
 
 set -A args "" "-x" "-v" "-x $testpool" "-v $testpool" "-xv $testpool" \
-    "-vx $testpool"
+    "-vx $testpool" "-e $testpool" "-es $testpool"
 
 log_assert "Executing 'zpool status' with correct options succeeds"
 
@@ -64,4 +64,6 @@ while [[ $i -lt ${#args[*]} ]]; do
 	(( i = i + 1 ))
 done
 
+cleanup
+
 log_pass "'zpool status' with correct options succeeded"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh
new file mode 100755
index 000000000000..6be2ad5a7410
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh
@@ -0,0 +1,104 @@
+#!/bin/ksh -p
+
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2024 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Verify 'zpool status -e' only shows unhealthy devices.
+#
+# STRATEGY:
+# 1. Create zpool
+# 2. Force DEGRADE, FAULT, or inject slow IOs for vdevs
+# 3. Verify vdevs are reported correctly with -e and -s
+# 4. Verify parents are reported as DEGRADED
+# 5. Verify healthy children are not reported
+#
+
+function cleanup
+{
+	log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO
+	zinject -c all
+	poolexists $TESTPOOL2 && destroy_pool $TESTPOOL2
+	log_must rm -f $all_vdevs
+}
+
+log_assert "Verify 'zpool status -e'"
+
+log_onexit cleanup
+
+all_vdevs=$(echo $TESTDIR/vdev{1..6})
+log_must mkdir -p $TESTDIR
+log_must truncate -s $MINVDEVSIZE $all_vdevs
+
+OLD_SLOW_IO=$(get_tunable ZIO_SLOW_IO_MS)
+
+for raid_type in "draid2:3d:6c:1s" "raidz2"; do
+
+	log_must zpool create -f $TESTPOOL2 $raid_type $all_vdevs
+
+	# Check DEGRADED vdevs are shown.
+	log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev4 "ONLINE"
+	log_must zinject -d $TESTDIR/vdev4 -A degrade $TESTPOOL2
+	log_must eval "zpool status -e $TESTPOOL2 | grep $TESTDIR/vdev4 | grep DEGRADED"
+
+	# Check FAULTED vdevs are shown.
+	log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev5 "ONLINE"
+	log_must zinject -d $TESTDIR/vdev5 -A fault $TESTPOOL2
+	log_must eval "zpool status -e $TESTPOOL2 | grep $TESTDIR/vdev5 | grep FAULTED"
+
+	# Check no ONLINE vdevs are shown
+	log_mustnot eval "zpool status -e $TESTPOOL2 | grep ONLINE"
+
+	# Check no ONLINE slow vdevs are shown. Then mark IOs greater than
+	# 10ms slow, delay IOs 20ms to vdev6, and check for slow IOs.
+	log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev6 "ONLINE"
+	log_mustnot eval "zpool status -es $TESTPOOL2 | grep ONLINE"
+
+	log_must set_tunable64 ZIO_SLOW_IO_MS 10
+	log_must zinject -d $TESTDIR/vdev6 -D20:100 $TESTPOOL2
+	log_must mkfile 1048576 /$TESTPOOL2/testfile
+	sync_pool $TESTPOOL2
+	log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO
+
+	# Check vdev6 slow IOs are only shown when requested with -s.
+	log_mustnot eval "zpool status -e $TESTPOOL2 | grep $TESTDIR/vdev6 | grep ONLINE"
+	log_must eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev6 | grep ONLINE"
+
+	# Pool level and top-vdev level status must be DEGRADED.
+	log_must eval "zpool status -e $TESTPOOL2 | grep $TESTPOOL2 | grep DEGRADED"
+	log_must eval "zpool status -e $TESTPOOL2 | grep $raid_type | grep DEGRADED"
+
+	# Check that healthy vdevs[1-3] aren't shown with -e.
+ log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev1 "ONLINE" + log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev2 "ONLINE" + log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev3 "ONLINE" + log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev1 | grep ONLINE" + log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev2 | grep ONLINE" + log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev3 | grep ONLINE" + + log_must zinject -c all + log_must zpool status -es $TESTPOOL2 + + zpool destroy $TESTPOOL2 +done + +log_pass "Verify zpool status -e shows only unhealthy vdevs" diff --git a/tests/zfs-tests/tests/functional/cp_files/.gitignore b/tests/zfs-tests/tests/functional/cp_files/.gitignore index eac05e155378..1f29418c0175 100644 --- a/tests/zfs-tests/tests/functional/cp_files/.gitignore +++ b/tests/zfs-tests/tests/functional/cp_files/.gitignore @@ -1 +1,2 @@ -/cp_files +cp_files +seekflood diff --git a/tests/zfs-tests/tests/functional/cp_files/Makefile.am b/tests/zfs-tests/tests/functional/cp_files/Makefile.am index 06c31f5f3f92..f01f1c3402a2 100644 --- a/tests/zfs-tests/tests/functional/cp_files/Makefile.am +++ b/tests/zfs-tests/tests/functional/cp_files/Makefile.am @@ -4,10 +4,12 @@ pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cp_files dist_pkgdata_SCRIPTS = \ cp_files_001_pos.ksh \ + cp_stress.ksh \ cleanup.ksh \ setup.ksh pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cp_files -pkgexec_PROGRAMS = cp_files +pkgexec_PROGRAMS = cp_files seekflood cp_files_SOURCES= cp_files.c +seekflood_SOURCES = seekflood.c diff --git a/tests/zfs-tests/tests/functional/cp_files/cp_stress.ksh b/tests/zfs-tests/tests/functional/cp_files/cp_stress.ksh new file mode 100755 index 000000000000..43bb8ab572d2 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cp_files/cp_stress.ksh @@ -0,0 +1,73 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END + +# +# Copyright (c) 2023 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# +# https://github.com/openzfs/zfs/issues/15526 identified a dirty dnode +# SEEK_HOLE/SEEK_DATA bug. https://github.com/openzfs/zfs/pull/15571 +# fixed the bug, and was backported to 2.1.14 and 2.2.2. +# +# This test is to ensure that the bug, as understood, will not recur. +# +# STRATEGY: +# +# 1. Run the 'seekflood' binary, for creation of files with timing +# characteristics that can trigger #15526. +# 2. A single run is not always a trigger, so run repeatedly. 
+
+verify_runnable "global"
+
+function cleanup
+{
+	rm -rf /$TESTPOOL/cp_stress
+}
+
+log_assert "Run the 'seekflood' binary repeatedly to try to trigger #15526"
+
+log_onexit cleanup
+
+log_must mkdir /$TESTPOOL/cp_stress
+
+MYPWD="$PWD"
+cd /$TESTPOOL/cp_stress
+CPUS=$(get_num_cpus)
+
+if is_freebsd ; then
+	# 'seekflood' takes longer on FreeBSD and can timeout the test
+	RUNS=3
+else
+	RUNS=10
+fi
+
+for i in $(seq 1 $RUNS) ; do
+	# Each run takes around 12 seconds.
+	log_must $STF_SUITE/tests/functional/cp_files/seekflood 2000 $CPUS
+done
+cd "$MYPWD"
+
+log_pass "No corruption detected"
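seekflood.c below exercises the invariant at scale and across racing writer processes; reduced to a single file, the check it performs is just this. A minimal sketch (the filename is arbitrary, and one quiet process will not reproduce the #15526 race; this only shows the failing condition the harness counts):

#ifndef _GNU_SOURCE
#define _GNU_SOURCE	/* SEEK_DATA on Linux */
#endif

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd = open("seekcheck.dat", O_RDWR | O_CREAT, 0600);

	if (fd < 0 || write(fd, "x", 1) != 1) {
		perror("setup");
		return (1);
	}

	/* The file has one byte, so SEEK_DATA from offset 0 must succeed */
	if (lseek(fd, 0, SEEK_DATA) < 0) {
		perror("SEEK_DATA");	/* ENXIO here would be the bug */
		return (1);
	}

	close(fd);
	return (0);
}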
+ */ + if (st.st_size == 0) + return (0); + + /* size is real, and we only write, so SEEK_DATA must find something */ + if (lseek(fd, 0, SEEK_DATA) < 0) { + if (errno == ENXIO) + return (1); + fprintf(stderr, "Error: lseek '%d_%d': %s\n", + getpid(), n, strerror(errno)); + exit(2); + } + + return (0); +} + +int +main(int argc, char **argv) +{ + int nfiles = 0; + int nthreads = 0; + + if (argc < 3 || (nfiles = atoi(argv[1])) == 0 || + (nthreads = atoi(argv[2])) == 0) { + printf("usage: seekflood \n"); + exit(1); + } + + memset(data, 0x5a, DATASIZE); + + /* fork off some flood threads */ + for (int i = 0; i < nthreads; i++) { + if (!fork()) { + /* thread main */ + + /* create zero file */ + int fd = _open_file(0, 1); + _write_file(0, fd); + close(fd); + + int count = 0; + + int h = 0, i, j, rfd, wfd; + for (i = 0; i < nfiles; i += 2, h++) { + j = i+1; + + /* seek h, write i */ + rfd = _open_file(h, 0); + wfd = _open_file(i, 1); + count += _seek_file(h, rfd); + _write_file(i, wfd); + close(rfd); + close(wfd); + + /* seek i, write j */ + rfd = _open_file(i, 0); + wfd = _open_file(j, 1); + count += _seek_file(i, rfd); + _write_file(j, wfd); + close(rfd); + close(wfd); + } + + /* return count of failed seeks to parent */ + exit(count < 256 ? count : 255); + } + } + + /* wait for threads, take their seek fail counts from exit code */ + int count = 0, crashed = 0; + for (int i = 0; i < nthreads; i++) { + int wstatus; + wait(&wstatus); + if (WIFEXITED(wstatus)) + count += WEXITSTATUS(wstatus); + else + crashed++; + } + + if (crashed) { + fprintf(stderr, "Error: child crashed; test failed\n"); + exit(1); + } + + if (count) { + fprintf(stderr, "Error: %d seek failures; test failed\n", + count); + exit(1); + } + + exit(0); +}