diff --git a/include/os/freebsd/spl/sys/mod.h b/include/os/freebsd/spl/sys/mod.h index 2aa66bbe19b7..f0f5e182560e 100644 --- a/include/os/freebsd/spl/sys/mod.h +++ b/include/os/freebsd/spl/sys/mod.h @@ -77,6 +77,9 @@ #define param_set_active_allocator_args(var) \ CTLTYPE_STRING, NULL, 0, param_set_active_allocator, "A" +#define param_set_active_weightfunc_args(var) \ + CTLTYPE_STRING, NULL, 0, param_set_active_weightfunc, "A" + #define param_set_deadman_synctime_args(var) \ CTLTYPE_U64, NULL, 0, param_set_deadman_synctime, "QU" diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h index 36cbe06bacce..d9759c746d9f 100644 --- a/include/sys/metaslab.h +++ b/include/sys/metaslab.h @@ -44,9 +44,13 @@ typedef struct metaslab_ops { uint64_t (*msop_alloc)(metaslab_t *, uint64_t, uint64_t, uint64_t *); } metaslab_ops_t; - extern const metaslab_ops_t zfs_metaslab_ops; +typedef struct metaslab_wfs { + const char *mswf_name; + uint64_t (*mswf_func)(metaslab_t *); +} metaslab_wfs_t; + int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t, metaslab_t **); void metaslab_fini(metaslab_t *); @@ -103,7 +107,7 @@ void metaslab_trace_init(zio_alloc_list_t *); void metaslab_trace_fini(zio_alloc_list_t *); metaslab_class_t *metaslab_class_create(spa_t *, const char *, - const metaslab_ops_t *, boolean_t); + const metaslab_ops_t *, const metaslab_wfs_t *, boolean_t); void metaslab_class_destroy(metaslab_class_t *); void metaslab_class_validate(metaslab_class_t *); void metaslab_class_balance(metaslab_class_t *mc, boolean_t onsync); diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h index 6ce995d0a086..ba17bf4e9aa3 100644 --- a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h @@ -77,11 +77,14 @@ typedef enum trace_alloc_type { #define METASLAB_WEIGHT_PRIMARY (1ULL << 63) #define METASLAB_WEIGHT_SECONDARY (1ULL << 62) #define METASLAB_WEIGHT_CLAIM (1ULL << 61) -#define METASLAB_WEIGHT_TYPE (1ULL << 60) +#define METASLAB_WEIGHT_MASK 
((1ULL << 60) | 1ULL << 59) #define METASLAB_ACTIVE_MASK \ (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY | \ METASLAB_WEIGHT_CLAIM) +#define METASLAB_WEIGHT_MAX_IDX 58 +#define METASLAB_WEIGHT_MAX ((1ULL << (METASLAB_WEIGHT_MAX_IDX + 1)) - 1) + /* * The metaslab weight is used to encode the amount of free space in a * metaslab, such that the "best" metaslab appears first when sorting the @@ -103,18 +106,30 @@ typedef enum trace_alloc_type { * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ - * |PSC1| weighted-free space | + * |PSC10| weighted-free space | * +-------+-------+-------+-------+-------+-------+-------+-------+ * * PS - indicates primary and secondary activation * C - indicates activation for claimed block zio * space - the fragmentation-weighted space * + * Space-based weight v2: + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * |PSC11| weighted-free space | idx | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * PS - indicates primary and secondary activation + * C - indicates activation for claimed block zio + * idx - index for the highest bucket in the histogram + * space - the fragmentation-weighted space + * * Segment-based weight: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ - * |PSC0| idx| count of segments in region | + * |PSC00| idx| count of segments in region | * +-------+-------+-------+-------+-------+-------+-------+-------+ * * PS - indicates primary and secondary activation @@ -125,17 +140,22 @@ typedef enum trace_alloc_type { #define WEIGHT_GET_ACTIVE(weight) BF64_GET((weight), 61, 3) #define WEIGHT_SET_ACTIVE(weight, x) BF64_SET((weight), 61, 3, x) +#define WEIGHT_GET_TYPE(weight) BF64_GET((weight), 59, 2) +#define WEIGHT_SET_TYPE(weight, x) BF64_SET((weight), 59, 2, x) #define WEIGHT_IS_SPACEBASED(weight) \ - ((weight) == 0 || BF64_GET((weight), 
60, 1)) -#define WEIGHT_SET_SPACEBASED(weight) BF64_SET((weight), 60, 1, 1) + ((weight) == 0 || WEIGHT_GET_TYPE((weight))) +#define WEIGHT_SET_SPACEBASED(weight) WEIGHT_SET_TYPE((weight), 2) +#define WEIGHT_IS_SPACEBASED_V2(weight) \ + ((weight) == 0 || WEIGHT_GET_TYPE((weight)) == 3) +#define WEIGHT_SET_SPACEBASED_V2(weight) WEIGHT_SET_TYPE((weight), 3) /* * These macros are only applicable to segment-based weighting. */ -#define WEIGHT_GET_INDEX(weight) BF64_GET((weight), 54, 6) -#define WEIGHT_SET_INDEX(weight, x) BF64_SET((weight), 54, 6, x) -#define WEIGHT_GET_COUNT(weight) BF64_GET((weight), 0, 54) -#define WEIGHT_SET_COUNT(weight, x) BF64_SET((weight), 0, 54, x) +#define WEIGHT_GET_INDEX(weight) BF64_GET((weight), 53, 6) +#define WEIGHT_SET_INDEX(weight, x) BF64_SET((weight), 53, 6, x) +#define WEIGHT_GET_COUNT(weight) BF64_GET((weight), 0, 53) +#define WEIGHT_SET_COUNT(weight, x) BF64_SET((weight), 0, 53, x) /* * Per-allocator data structure. @@ -183,6 +203,7 @@ struct metaslab_class { spa_t *mc_spa; const char *mc_name; const metaslab_ops_t *mc_ops; + const metaslab_wfs_t *mc_wfs; /* * Track the number of metaslab groups that have been initialized diff --git a/include/sys/spa.h b/include/sys/spa.h index db30b5a066de..314db678bf47 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -1124,6 +1124,8 @@ extern uint64_t spa_dirty_data(spa_t *spa); extern spa_autotrim_t spa_get_autotrim(spa_t *spa); extern int spa_get_allocator(spa_t *spa); extern void spa_set_allocator(spa_t *spa, const char *allocator); +extern int spa_get_weightfunc(spa_t *spa); +extern void spa_set_weightfunc(spa_t *spa, const char *weightfunc); /* Miscellaneous support routines */ extern void spa_load_failed(spa_t *spa, const char *fmt, ...) 
@@ -1282,6 +1284,7 @@ int param_set_deadman_synctime(ZFS_MODULE_PARAM_ARGS); int param_set_slop_shift(ZFS_MODULE_PARAM_ARGS); int param_set_deadman_failmode(ZFS_MODULE_PARAM_ARGS); int param_set_active_allocator(ZFS_MODULE_PARAM_ARGS); +int param_set_active_weightfunc(ZFS_MODULE_PARAM_ARGS); #ifdef ZFS_DEBUG #define dprintf_bp(bp, fmt, ...) do { \ diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 62b062984d36..9367ef57ef68 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -275,6 +275,7 @@ struct spa { spa_allocs_use_t *spa_allocs_use; int spa_alloc_count; int spa_active_allocator; /* selectable allocator */ + int spa_active_weightfunc; /* selectable weight function */ /* per-allocator sync thread taskqs */ taskq_t *spa_sync_tq; @@ -501,7 +502,9 @@ extern void spa_set_deadman_synctime(hrtime_t ns); extern void spa_set_deadman_ziotime(hrtime_t ns); extern const char *spa_history_zone(void); extern const char *zfs_active_allocator; +extern const char *zfs_active_weightfunc; extern int param_set_active_allocator_common(const char *val); +extern int param_set_active_weightfunc_common(const char *val); #ifdef __cplusplus } diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index a2faec4e18c4..e9e0dc636657 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -325,6 +325,8 @@ Prevent log spacemaps from being destroyed during pool exports and destroys. . .It Sy zfs_metaslab_segment_weight_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Enable/disable segment-based metaslab selection. +Note that this tunable has been deprecated in favor of +.Sy zfs_active_weightfunc . . .It Sy zfs_metaslab_switch_threshold Ns = Ns Sy 2 Pq int When using segment-based metaslab selection, continue allocating @@ -720,6 +722,25 @@ depends on kernel configuration. This is the minimum allocation size that will use scatter (page-based) ABDs. Smaller allocations will use linear ABDs. . 
+.It Sy zfs_active_weightfunc Ns = Ns Sy auto Pq string +Selects the metaslab weighting function to use. +The weighting function can be one of auto, space, space_v2, and segment. +This determines what weighting algorithm will be used to sort metaslabs for +selection during the allocation process. +The space_v2 and segment algorithms require the spacemap histogram feature to be +enabled, and will fall back to the space algorithm if that feature is not +enabled on the pool. +.Pp +The space algorithm combines the total free space in the metaslab with the +fragmentation metric to balance toward metaslabs with more contiguous free +space. +The segment algorithm uses the spacemap histograms to calculate the weight from +the largest free segment size, bucketed to powers of two, and the number of +segments in that bucket. +The space_v2 algorithm considers not only the largest free segment bucket, but +the smaller ones as well, providing a higher weight to larger contiguous chunks. +Auto provides the recommended algorithm (currently space_v2). +. .It Sy zfs_arc_dnode_limit Ns = Ns Sy 0 Ns B Pq u64 When the number of bytes consumed by dnodes in the ARC exceeds this number of bytes, try to unpin some of it in response to demand for non-metadata. 
diff --git a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c
index 393bfaa65ff5..10777212af88 100644
--- a/module/os/freebsd/zfs/sysctl_os.c
+++ b/module/os/freebsd/zfs/sysctl_os.c
@@ -273,6 +273,24 @@ param_set_active_allocator(SYSCTL_HANDLER_ARGS)
 	return (param_set_active_allocator_common(buf));
 }
 
+int
+param_set_active_weightfunc(SYSCTL_HANDLER_ARGS)
+{
+	char buf[16];
+	int rc;
+
+	/* Always seed buf: sysctl_handle_string() reads it for the old value. */
+	strlcpy(buf, zfs_active_weightfunc, sizeof (buf));
+
+	rc = sysctl_handle_string(oidp, buf, sizeof (buf), req);
+	if (rc || req->newptr == NULL)
+		return (rc);
+	if (strcmp(buf, zfs_active_weightfunc) == 0)
+		return (0);
+
+	return (param_set_active_weightfunc_common(buf));
+}
+
 /*
  * In pools where the log space map feature is not enabled we touch
  * multiple metaslabs (and their respective space maps) with each
diff --git a/module/os/linux/zfs/spa_misc_os.c b/module/os/linux/zfs/spa_misc_os.c
index d6323fd56a8f..1fa102dd4ca7 100644
--- a/module/os/linux/zfs/spa_misc_os.c
+++ b/module/os/linux/zfs/spa_misc_os.c
@@ -116,6 +116,18 @@ param_set_active_allocator(const char *val, zfs_kernel_param_t *kp)
 	return (error);
 }
 
+int
+param_set_active_weightfunc(const char *val, zfs_kernel_param_t *kp)
+{
+	int error;
+
+	error = -param_set_active_weightfunc_common(val);
+	if (error == 0)
+		error = param_set_charp(val, kp);
+
+	return (error);
+}
+
 const char *
 spa_history_zone(void)
 {
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 3f649ffb44e4..255823f75363 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -253,11 +253,6 @@ static int metaslab_perf_bias = 1;
  */
 static const boolean_t zfs_remap_blkptr_enable = B_TRUE;
 
-/*
- * Enable/disable segment-based metaslab selection.
- */ -static int zfs_metaslab_segment_weight_enabled = B_TRUE; - /* * When using segment-based metaslab selection, we will continue * allocating from the active metaslab until we have exhausted @@ -423,7 +418,7 @@ metaslab_stat_fini(void) */ metaslab_class_t * metaslab_class_create(spa_t *spa, const char *name, - const metaslab_ops_t *ops, boolean_t is_log) + const metaslab_ops_t *ops, const metaslab_wfs_t *wfs, boolean_t is_log) { metaslab_class_t *mc; @@ -433,6 +428,7 @@ metaslab_class_create(spa_t *spa, const char *name, mc->mc_spa = spa; mc->mc_name = name; mc->mc_ops = ops; + mc->mc_wfs = wfs; mc->mc_is_log = is_log; mc->mc_alloc_io_size = SPA_OLD_MAXBLOCKSIZE; mc->mc_alloc_max = UINT64_MAX; @@ -2339,7 +2335,7 @@ metaslab_verify_weight_and_frag(metaslab_t *msp) uint64_t weight = msp->ms_weight; uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; - boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight); + uint8_t type = WEIGHT_GET_TYPE(msp->ms_weight); uint64_t frag = msp->ms_fragmentation; uint64_t max_segsize = msp->ms_max_size; @@ -2367,8 +2363,7 @@ metaslab_verify_weight_and_frag(metaslab_t *msp) * If the weight type changed then there is no point in doing * verification. Revert fields to their original values. 
*/ - if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) || - (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) { + if (type != WEIGHT_GET_TYPE(msp->ms_weight)) { msp->ms_fragmentation = frag; msp->ms_weight = weight; return; @@ -2753,6 +2748,7 @@ metaslab_unload(metaslab_t *msp) return; zfs_range_tree_vacate(msp->ms_allocatable, NULL, NULL); + msp->ms_loaded = B_FALSE; msp->ms_unload_time = gethrtime(); @@ -3072,6 +3068,233 @@ metaslab_fini(metaslab_t *msp) kmem_free(msp, sizeof (metaslab_t)); } +static uint64_t metaslab_space_weight(metaslab_t *msp); +static uint64_t metaslab_segment_weight(metaslab_t *msp); +static uint64_t metaslab_space_weight_v2(metaslab_t *msp); +metaslab_wfs_t *metaslab_weightfunc(spa_t *spa); + +static metaslab_wfs_t metaslab_weightfuncs[] = { + { "auto", metaslab_space_weight_v2 }, + { "space", metaslab_space_weight }, + { "space_v2", metaslab_space_weight_v2 }, + { "segment", metaslab_segment_weight }, +}; + +static int +spa_find_weightfunc_byname(const char *val) +{ + int a = ARRAY_SIZE(metaslab_weightfuncs) - 1; + for (; a >= 0; a--) { + if (strcmp(val, metaslab_weightfuncs[a].mswf_name) == 0) + return (a); + } + return (-1); +} + +void +spa_set_weightfunc(spa_t *spa, const char *weightfunc) +{ + int a = spa_find_weightfunc_byname(weightfunc); + if (a < 0) a = 0; + if (a != 1 && !spa_feature_is_enabled(spa, + SPA_FEATURE_SPACEMAP_HISTOGRAM)) { + zfs_dbgmsg("warning: weight function %s will not be used for " + "pool %s since space map histograms are not enabled", + weightfunc, spa_name(spa)); + } + spa->spa_active_weightfunc = a; + zfs_dbgmsg("spa weight function: %s", + metaslab_weightfuncs[a].mswf_name); +} + +int +spa_get_weightfunc(spa_t *spa) +{ + return (spa->spa_active_weightfunc); +} + +#if defined(_KERNEL) +int +param_set_active_weightfunc_common(const char *val) +{ + char *p; + + if (val == NULL) + return (SET_ERROR(EINVAL)); + + if ((p = strchr(val, '\n')) != NULL) + *p = '\0'; + + int a = 
spa_find_weightfunc_byname(val); + if (a < 0) + return (SET_ERROR(EINVAL)); + + zfs_active_weightfunc = metaslab_weightfuncs[a].mswf_name; + return (0); +} +#endif + +metaslab_wfs_t * +metaslab_weightfunc(spa_t *spa) +{ + int weightfunc = spa_get_weightfunc(spa); + return (&metaslab_weightfuncs[weightfunc]); +} + +/* + * Return the weight of the specified metaslab, according to the new space-based + * weighting algorithm. The metaslab must be loaded. This function can + * be called within a sync pass since it relies only on the metaslab's + * range tree which is always accurate when the metaslab is loaded. + */ +static uint64_t +metaslab_space_weight_from_range_tree(metaslab_t *msp) +{ + uint64_t weight = 0; + uint8_t vd_shift = msp->ms_group->mg_vd->vdev_ashift; + ASSERT3U(vd_shift, >=, 3); + + ASSERT(msp->ms_loaded); + ASSERT3U(vd_shift, >=, SPA_MINBLOCKSHIFT); + + for (int i = ZFS_RANGE_TREE_HISTOGRAM_SIZE - 1; i >= vd_shift; + i--) { + uint8_t seg_shift = ((3 * (i - vd_shift)) / 2) + 6; + uint64_t segments = msp->ms_allocatable->rt_histogram[i]; + if (segments == 0) + continue; + if (weight == 0) + weight = i; + // Prevent overflow using log_2 math + if (seg_shift + highbit64(segments) > METASLAB_WEIGHT_MAX_IDX) + return (METASLAB_WEIGHT_MAX); + weight = MIN(METASLAB_WEIGHT_MAX, + weight + (segments << seg_shift)); + } + return (weight); +} + +/* + * Calculate the new space-based weight based on the on-disk histogram. + * Should be applied only to unloaded metaslabs (i.e no incoming allocations) + * in-order to give results consistent with the on-disk state. 
+ */
+static uint64_t
+metaslab_space_weight_from_spacemap(metaslab_t *msp)
+{
+	space_map_t *sm = msp->ms_sm;
+	ASSERT(!msp->ms_loaded);
+	ASSERT(sm != NULL);
+	ASSERT3U(space_map_object(sm), !=, 0);
+	ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
+	uint8_t vd_shift = msp->ms_group->mg_vd->vdev_ashift;
+	ASSERT3U(vd_shift, >=, 3);
+
+	/*
+	 * Create a joint histogram from all the segments that have made
+	 * it to the metaslab's space map histogram, that are not yet
+	 * available for allocation because they are still in the freeing
+	 * pipeline (e.g. freeing, freed, and defer trees). Then subtract
+	 * these segments from the space map's histogram to get a more
+	 * accurate weight.
+	 */
+	uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0};
+	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
+		deferspace_histogram[i] += msp->ms_synchist[i];
+	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+		for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
+			deferspace_histogram[i] += msp->ms_deferhist[t][i];
+		}
+	}
+
+	uint64_t weight = 0;
+	for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
+		uint8_t seg_shift = (3 * (i + sm->sm_shift - vd_shift) / 2) + 6;
+		uint64_t segments = sm->sm_phys->smp_histogram[i];
+		if (segments <= deferspace_histogram[i]) /* avoids wrap */
+			continue;
+		segments -= deferspace_histogram[i];
+		if (weight == 0)
+			weight = i + sm->sm_shift;
+		// Prevent overflow using log_2 math
+		if (seg_shift + highbit64(segments) > METASLAB_WEIGHT_MAX_IDX)
+			return (METASLAB_WEIGHT_MAX);
+		weight = MIN(METASLAB_WEIGHT_MAX,
+		    weight + (segments << seg_shift));
+	}
+	return (weight);
+}
+
+/*
+ * The space weight v2 algorithm uses information from the free space
+ * histograms to provide a more useful weighting of the free space in
+ * the metaslab. Rather than simply using the fragmentation metric, we
+ * actually use the number of segments in each bucket to determine the
+ * weight.
The weight is calculated as follows:
+ *
+ * sum over buckets j of N(j) * 2^{floor(3j/2) + 6}, where N(j) is the number
+ * of free segments of size 2^{j + shift}
+ *
+ * N(j) * 2^j is just the space used by the segments in a bucket divided
+ * by 2^shift, and the additional factor of 2^{j/2} weights the larger
+ * segments more heavily. The low six bits hold the index of the highest
+ * non-empty bucket. If a term could overflow the 59-bit weight field, we
+ * max out the weight. That metaslab is free enough for any purpose.
+ */
+static uint64_t
+metaslab_space_weight_v2(metaslab_t *msp)
+{
+	metaslab_group_t *mg = msp->ms_group;
+	spa_t *spa = mg->mg_vd->vdev_spa;
+	uint64_t weight = 0;
+	uint8_t shift = mg->mg_vd->vdev_ashift;
+
+	if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) ||
+	    (msp->ms_sm != NULL && msp->ms_sm->sm_dbuf->db_size !=
+	    sizeof (space_map_phys_t))) {
+		return (metaslab_space_weight(msp));
+	}
+
+	if (metaslab_allocated_space(msp) == 0) {
+		int idx = highbit64(msp->ms_size) - shift - 1 + 3;
+		weight = 1ULL << MIN(METASLAB_WEIGHT_MAX_IDX, 2 * idx);
+		weight += highbit64(msp->ms_size) - 1;
+		WEIGHT_SET_SPACEBASED_V2(weight);
+		return (weight);
+	}
+
+	ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
+
+	/*
+	 * If the metaslab is fully allocated then just make the weight 0.
+	 */
+	if (metaslab_allocated_space(msp) == msp->ms_size) {
+		WEIGHT_SET_SPACEBASED_V2(weight);
+		return (weight);
+	}
+
+	/*
+	 * If the metaslab is already loaded, then use the range tree to
+	 * determine the weight. Otherwise, we rely on the space map information
+	 * to generate the weight.
+	 */
+	if (msp->ms_loaded)
+		weight = metaslab_space_weight_from_range_tree(msp);
+	else
+		weight = metaslab_space_weight_from_spacemap(msp);
+	ASSERT3U(weight, <=, METASLAB_WEIGHT_MAX);
+
+	/*
+	 * If the metaslab was active the last time we calculated its weight
+	 * then keep it active. We want to consume the entire region that
+	 * is associated with this weight.
+	 */
+	if (msp->ms_activation_weight != 0 && weight != 0)
+		WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
+	WEIGHT_SET_SPACEBASED_V2(weight);
+	return (weight);
+}
+
 /*
  * This table defines a segment size based fragmentation metric that will
  * allow each metaslab to derive its own fragmentation value. This is done
@@ -3368,11 +3591,18 @@ static uint64_t
 metaslab_segment_weight(metaslab_t *msp)
 {
 	metaslab_group_t *mg = msp->ms_group;
+	spa_t *spa = mg->mg_vd->vdev_spa;
 	uint64_t weight = 0;
 	uint8_t shift = mg->mg_vd->vdev_ashift;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
+	if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) ||
+	    (msp->ms_sm != NULL && msp->ms_sm->sm_dbuf->db_size !=
+	    sizeof (space_map_phys_t))) {
+		return (metaslab_space_weight(msp));
+	}
+
 	/*
 	 * The metaslab is completely free.
 	 */
@@ -3463,9 +3693,12 @@ metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard)
 		 */
 		should_allocate = (asize <
 		    1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
+	} else if (WEIGHT_IS_SPACEBASED_V2(msp->ms_weight)) {
+		uint64_t idx = msp->ms_weight & 0x3f;	/* 63 if saturated */
+		should_allocate = (idx >= 63 || asize < 1ULL << (idx + 1));
 	} else {
 		should_allocate = (asize <=
-		    (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
+		    (msp->ms_weight & ~METASLAB_WEIGHT_MASK));
 	}
 
 	return (should_allocate);
@@ -3474,8 +3707,6 @@ metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard)
 static uint64_t
 metaslab_weight(metaslab_t *msp, boolean_t nodirty)
 {
-	vdev_t *vd = msp->ms_group->mg_vd;
-	spa_t *spa = vd->vdev_spa;
 	uint64_t weight;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
@@ -3499,17 +3730,7 @@ metaslab_weight(metaslab_t *msp, boolean_t nodirty)
 		    metaslab_largest_unflushed_free(msp));
 	}
 
-	/*
-	 * Segment-based weighting requires space map histogram support.
- */ - if (zfs_metaslab_segment_weight_enabled && - spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && - (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size == - sizeof (space_map_phys_t))) { - weight = metaslab_segment_weight(msp); - } else { - weight = metaslab_space_weight(msp); - } + weight = msp->ms_group->mg_class->mc_wfs->mswf_func(msp); return (weight); } @@ -3678,14 +3899,15 @@ metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp, static void metaslab_passivate(metaslab_t *msp, uint64_t weight) { - uint64_t size __maybe_unused = weight & ~METASLAB_WEIGHT_TYPE; + uint64_t size __maybe_unused = weight & ~METASLAB_WEIGHT_MASK; /* * If size < SPA_MINBLOCKSIZE, then we will not allocate from * this metaslab again. In that case, it had better be empty, * or we would be leaving space on the table. */ - ASSERT(!WEIGHT_IS_SPACEBASED(msp->ms_weight) || + ASSERT(!(WEIGHT_IS_SPACEBASED(msp->ms_weight) && + !WEIGHT_IS_SPACEBASED_V2(msp->ms_weight)) || size >= SPA_MINBLOCKSIZE || zfs_range_tree_space(msp->ms_allocatable) == 0); ASSERT0(weight & METASLAB_ACTIVE_MASK); @@ -6398,9 +6620,6 @@ ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, bias_enabled, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, perf_bias, INT, ZMOD_RW, "Enable performance-based metaslab group biasing"); -ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, segment_weight_enabled, INT, - ZMOD_RW, "Enable segment-based metaslab selection"); - ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, switch_threshold, INT, ZMOD_RW, "Segment-based metaslab selection maximum buckets before switching"); @@ -6431,3 +6650,7 @@ ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, find_max_tries, UINT, ZMOD_RW, ZFS_MODULE_PARAM_CALL(zfs, zfs_, active_allocator, param_set_active_allocator, param_get_charp, ZMOD_RW, "SPA active allocator"); + +ZFS_MODULE_PARAM_CALL(zfs, zfs_, active_weightfunc, + param_set_active_weightfunc, param_get_charp, ZMOD_RW, + "SPA active weight function"); diff --git 
a/module/zfs/spa.c b/module/zfs/spa.c
index 6c0ef0656744..2c51f8131624 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -1762,6 +1762,7 @@ spa_thread(void *arg)
 #endif
 
 extern metaslab_ops_t *metaslab_allocator(spa_t *spa);
+extern metaslab_wfs_t *metaslab_weightfunc(spa_t *spa);
 
 /*
  * Activate an uninitialized pool.
@@ -1769,7 +1770,8 @@ extern metaslab_ops_t *metaslab_allocator(spa_t *spa);
 static void
 spa_activate(spa_t *spa, spa_mode_t mode)
 {
-	metaslab_ops_t *msp = metaslab_allocator(spa);
+	metaslab_ops_t *mso = metaslab_allocator(spa);
+	metaslab_wfs_t *msw = metaslab_weightfunc(spa);
 
 	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
 	spa->spa_state = POOL_STATE_ACTIVE;
@@ -1778,16 +1780,17 @@ spa_activate(spa_t *spa, spa_mode_t mode)
 	spa->spa_read_spacemaps = spa_mode_readable_spacemaps;
 
 	spa->spa_normal_class = metaslab_class_create(spa, "normal",
-	    msp, B_FALSE);
-	spa->spa_log_class = metaslab_class_create(spa, "log", msp, B_TRUE);
+	    mso, msw, B_FALSE);
+	spa->spa_log_class = metaslab_class_create(spa, "log", mso, msw,
+	    B_TRUE);
 	spa->spa_embedded_log_class = metaslab_class_create(spa,
-	    "embedded_log", msp, B_TRUE);
+	    "embedded_log", mso, msw, B_TRUE);
 	spa->spa_special_class = metaslab_class_create(spa, "special",
-	    msp, B_FALSE);
+	    mso, msw, B_FALSE);
 	spa->spa_special_embedded_log_class = metaslab_class_create(spa,
-	    "special_embedded_log", msp, B_TRUE);
+	    "special_embedded_log", mso, msw, B_TRUE);
 	spa->spa_dedup_class = metaslab_class_create(spa, "dedup",
-	    msp, B_FALSE);
+	    mso, msw, B_FALSE);
 
 	/* Try to create a covering process */
 	mutex_enter(&spa->spa_proc_lock);
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index aee170e9ea51..d5521fee61e0 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -404,6 +404,12 @@ static int spa_cpus_per_allocator = 4;
  */
 const char *zfs_active_allocator = "dynamic";
 
+/*
+ * Spa active weight function.
+ * Valid values are zfs_active_weightfunc=<auto|space|space_v2|segment>.
+ */ +const char *zfs_active_weightfunc = "auto"; + void spa_load_failed(spa_t *spa, const char *fmt, ...) { @@ -795,6 +801,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa->spa_deadman_ziotime = MSEC2NSEC(zfs_deadman_ziotime_ms); spa_set_deadman_failmode(spa, zfs_deadman_failmode); spa_set_allocator(spa, zfs_active_allocator); + spa_set_weightfunc(spa, zfs_active_weightfunc); zfs_refcount_create(&spa->spa_refcount); spa_config_lock_init(spa); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 62c3cb0c039a..b98778d5780b 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -763,7 +763,8 @@ tags = ['functional', 'features', 'large_dnode'] [tests/functional/gang_blocks] tests = ['gang_blocks_001_pos', 'gang_blocks_redundant', 'gang_blocks_ddt_copies', 'gang_blocks_dyn_header_pos', - 'gang_blocks_dyn_header_neg', 'gang_blocks_dyn_multi'] + 'gang_blocks_dyn_header_neg', 'gang_blocks_dyn_multi', + 'metaslab_tuning_001_pos'] tags = ['functional', 'gang_blocks'] [tests/functional/grow] diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index 3d697726a763..32fb7dbbb756 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -3405,6 +3405,17 @@ function set_tunable32 set_tunable_impl "$1" "$2" W } +# +# Set a global system tunable (string value) +# +# $1 tunable name (use a NAME defined in tunables.cfg) +# $2 tunable values +# +function set_tunable_string +{ + set_tunable_impl "$1" "$2" S +} + function set_tunable_impl { typeset name="$1" @@ -3459,6 +3470,16 @@ function restore_tunable fi } +function restore_tunable_string +{ + if tunable_exists $1 ; then + [[ ! 
-e $TEST_BASE_DIR/tunable-$1 ]] && return 1 + val="$(cat $TEST_BASE_DIR/tunable-"""$1""")" + set_tunable_string "$1" "$val" + rm $TEST_BASE_DIR/tunable-$1 + fi +} + # # Get a global system tunable # diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index 54b50c9dba77..9e17d5d60f31 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -16,6 +16,7 @@ UNAME=$(uname) # NAME FreeBSD tunable Linux tunable cat <<%%%% | +ACTIVE_WEIGHTFUNC zfs.active_weightfunc zfs_active_weightfunc ADMIN_SNAPSHOT UNSUPPORTED zfs_admin_snapshot ALLOW_REDACTED_DATASET_MOUNT allow_redacted_dataset_mount zfs_allow_redacted_dataset_mount ARC_MAX arc.max zfs_arc_max diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index f973d606166b..03f5f044edfa 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1622,6 +1622,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/gang_blocks/gang_blocks_dyn_header_neg.ksh \ functional/gang_blocks/gang_blocks_dyn_header_pos.ksh \ functional/gang_blocks/gang_blocks_dyn_multi.ksh \ + functional/gang_blocks/metaslab_tuning_001_pos.ksh \ functional/gang_blocks/setup.ksh \ functional/grow/grow_pool_001_pos.ksh \ functional/grow/grow_replicas_001_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/gang_blocks/metaslab_tuning_001_pos.ksh b/tests/zfs-tests/tests/functional/gang_blocks/metaslab_tuning_001_pos.ksh new file mode 100755 index 000000000000..ede8ad37af8f --- /dev/null +++ b/tests/zfs-tests/tests/functional/gang_blocks/metaslab_tuning_001_pos.ksh @@ -0,0 +1,49 @@ +#!/bin/ksh +# SPDX-License-Identifier: CDDL-1.0 +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. 
+# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2026 by Klara Inc. +# + +# +# Description: +# Verify that metaslab weight algorithm selection works correctly. +# +# Strategy: +# 1. Set the zfs_active_weightfunc to auto. +# 2. Create a pool. +# 3. Repeat steps 1 and 2 with the other valid values. +# + +. $STF_SUITE/include/libtest.shlib + +log_assert "Metaslab weight algorithm selection works correctly." + +function cleanup +{ + restore_tunable_string ACTIVE_WEIGHTFUNC + poolexists $TESTPOOL && zpool destroy $TESTPOOL +} +log_onexit cleanup + +save_tunable ACTIVE_WEIGHTFUNC + +for value in "auto" "space" "space_v2" "segment" +do + log_must set_tunable_string ACTIVE_WEIGHTFUNC $value + log_must zpool create -f $TESTPOOL $DISKS + log_must fill_fs /$TESTPOOL 1 100 1048576 R + log_must zpool destroy $TESTPOOL +done + +log_pass "Metaslab weight algorithm selection works correctly."