Skip to content

Commit 4951514

Browse files
committed
weight selector
Signed-off-by: Paul Dagnelie <[email protected]>
1 parent fa7b45e commit 4951514

File tree

11 files changed

+178
-53
lines changed

11 files changed

+178
-53
lines changed

include/os/freebsd/spl/sys/mod.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,9 @@
7777
#define param_set_active_allocator_args(var) \
7878
CTLTYPE_STRING, NULL, 0, param_set_active_allocator, "A"
7979

80+
#define param_set_active_weightfunc_args(var) \
81+
CTLTYPE_STRING, NULL, 0, param_set_active_weightfunc, "A"
82+
8083
#define param_set_deadman_synctime_args(var) \
8184
CTLTYPE_U64, NULL, 0, param_set_deadman_synctime, "QU"
8285

include/sys/metaslab.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,13 @@ typedef struct metaslab_ops {
4444
uint64_t (*msop_alloc)(metaslab_t *, uint64_t, uint64_t, uint64_t *);
4545
} metaslab_ops_t;
4646

47-
4847
extern const metaslab_ops_t zfs_metaslab_ops;
4948

49+
/*
 * Descriptor for one selectable metaslab weight function.
 *
 * mswf_name is the string a requested weight function name is compared
 * against (with strcmp()) when it is looked up; mswf_func takes a metaslab
 * and returns its 64-bit weight.
 */
typedef struct metaslab_wfs {
50+
const char *mswf_name;
51+
uint64_t (*mswf_func)(metaslab_t *);
52+
} metaslab_wfs_t;
53+
5054
int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t,
5155
metaslab_t **);
5256
void metaslab_fini(metaslab_t *);
@@ -103,7 +107,7 @@ void metaslab_trace_init(zio_alloc_list_t *);
103107
void metaslab_trace_fini(zio_alloc_list_t *);
104108

105109
metaslab_class_t *metaslab_class_create(spa_t *, const char *,
106-
const metaslab_ops_t *, boolean_t);
110+
const metaslab_ops_t *, const metaslab_wfs_t *, boolean_t);
107111
void metaslab_class_destroy(metaslab_class_t *);
108112
void metaslab_class_validate(metaslab_class_t *);
109113
void metaslab_class_balance(metaslab_class_t *mc, boolean_t onsync);

include/sys/metaslab_impl.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,7 @@ struct metaslab_class {
203203
spa_t *mc_spa;
204204
const char *mc_name;
205205
const metaslab_ops_t *mc_ops;
206+
const metaslab_wfs_t *mc_wfs;
206207

207208
/*
208209
* Track the number of metaslab groups that have been initialized

include/sys/spa.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1124,6 +1124,8 @@ extern uint64_t spa_dirty_data(spa_t *spa);
11241124
extern spa_autotrim_t spa_get_autotrim(spa_t *spa);
11251125
extern int spa_get_allocator(spa_t *spa);
11261126
extern void spa_set_allocator(spa_t *spa, const char *allocator);
1127+
extern int spa_get_weightfunc(spa_t *spa);
1128+
extern void spa_set_weightfunc(spa_t *spa, const char *weightfunc);
11271129

11281130
/* Miscellaneous support routines */
11291131
extern void spa_load_failed(spa_t *spa, const char *fmt, ...)
@@ -1282,6 +1284,7 @@ int param_set_deadman_synctime(ZFS_MODULE_PARAM_ARGS);
12821284
int param_set_slop_shift(ZFS_MODULE_PARAM_ARGS);
12831285
int param_set_deadman_failmode(ZFS_MODULE_PARAM_ARGS);
12841286
int param_set_active_allocator(ZFS_MODULE_PARAM_ARGS);
1287+
int param_set_active_weightfunc(ZFS_MODULE_PARAM_ARGS);
12851288

12861289
#ifdef ZFS_DEBUG
12871290
#define dprintf_bp(bp, fmt, ...) do { \

include/sys/spa_impl.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -275,6 +275,7 @@ struct spa {
275275
spa_allocs_use_t *spa_allocs_use;
276276
int spa_alloc_count;
277277
int spa_active_allocator; /* selectable allocator */
278+
int spa_active_weightfunc; /* selectable weight function */
278279

279280
/* per-allocator sync thread taskqs */
280281
taskq_t *spa_sync_tq;
@@ -501,7 +502,9 @@ extern void spa_set_deadman_synctime(hrtime_t ns);
501502
extern void spa_set_deadman_ziotime(hrtime_t ns);
502503
extern const char *spa_history_zone(void);
503504
extern const char *zfs_active_allocator;
505+
extern const char *zfs_active_weightfunc;
504506
extern int param_set_active_allocator_common(const char *val);
507+
extern int param_set_active_weightfunc_common(const char *val);
505508

506509
#ifdef __cplusplus
507510
}

man/man4/zfs.4

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -728,6 +728,25 @@ depends on kernel configuration.
728728
This is the minimum allocation size that will use scatter (page-based) ABDs.
729729
Smaller allocations will use linear ABDs.
730730
.
731+
.It Sy zfs_active_weightfunc Ns = Ns Sy auto Pq string
732+
Selects the metaslab weighting function to use.
733+
The weighting function can be one of auto, space, space_v2, or segment.
734+
This determines what weighting algorithm will be used to sort metaslabs for
735+
selection during the allocation process.
736+
The space_v2 and segment algorithms require the spacemap histogram feature to be
737+
enabled, and will fall back to the space algorithm if that feature is not
738+
enabled on the pool.
739+
.Pp
740+
The space algorithm combines the total free space in the metaslab with the
741+
fragmentation metric to balance toward metaslabs with more contiguous free
742+
space.
743+
The segment algorithm uses the spacemap histograms to calculate the weight from
744+
the largest free segment size, bucketed to powers of two, and the number of
745+
segments in that bucket.
746+
The space_v2 algorithm considers not only the largest free segment bucket, but
747+
the smaller ones as well, providing a higher weight to larger contiguous chunks.
748+
Auto provides the recommended algorithm (currently space_v2).
749+
.
731750
.It Sy zfs_arc_dnode_limit Ns = Ns Sy 0 Ns B Pq u64
732751
When the number of bytes consumed by dnodes in the ARC exceeds this number of
733752
bytes, try to unpin some of it in response to demand for non-metadata.

module/os/freebsd/zfs/sysctl_os.c

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,24 @@ param_set_active_allocator(SYSCTL_HANDLER_ARGS)
273273
return (param_set_active_allocator_common(buf));
274274
}
275275

276+
int
277+
param_set_active_weightfunc(SYSCTL_HANDLER_ARGS)
278+
{
279+
char buf[16];
280+
int rc;
281+
282+
if (req->newptr == NULL)
283+
strlcpy(buf, zfs_active_weightfunc, sizeof (buf));
284+
285+
rc = sysctl_handle_string(oidp, buf, sizeof (buf), req);
286+
if (rc || req->newptr == NULL)
287+
return (rc);
288+
if (strcmp(buf, zfs_active_weightfunc) == 0)
289+
return (0);
290+
291+
return (param_set_active_weightfunc_common(buf));
292+
}
293+
276294
/*
277295
* In pools where the log space map feature is not enabled we touch
278296
* multiple metaslabs (and their respective space maps) with each

module/os/linux/zfs/spa_misc_os.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,18 @@ param_set_active_allocator(const char *val, zfs_kernel_param_t *kp)
116116
return (error);
117117
}
118118

119+
int
120+
param_set_active_weightfunc(const char *val, zfs_kernel_param_t *kp)
121+
{
122+
int error;
123+
124+
error = -param_set_active_weightfunc_common(val);
125+
if (error == 0)
126+
error = param_set_charp(val, kp);
127+
128+
return (error);
129+
}
130+
119131
const char *
120132
spa_history_zone(void)
121133
{

module/zfs/metaslab.c

Lines changed: 96 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -253,21 +253,6 @@ static int metaslab_perf_bias = 1;
253253
*/
254254
static const boolean_t zfs_remap_blkptr_enable = B_TRUE;
255255

256-
/*
257-
* Enable/disable segment-based metaslab selection.
258-
*/
259-
static int zfs_metaslab_segment_weight_enabled = B_TRUE;
260-
261-
/*
262-
* Enable/disable the new space-based metaslab selection algorithm.
263-
*
264-
* The new space-based algorithm attempts to take into account not only the
265-
* largest free segment, as the segment-based weight does, but other segments
266-
* that are almost as large. This can improve metaslab selection and reduce the
267-
* number of metaslab loads needed to satisfy a given set of allocations.
268-
*/
269-
static int zfs_metaslab_space_weight_v2_enabled = B_TRUE;
270-
271256
/*
272257
* When using segment-based metaslab selection, we will continue
273258
* allocating from the active metaslab until we have exhausted
@@ -433,7 +418,7 @@ metaslab_stat_fini(void)
433418
*/
434419
metaslab_class_t *
435420
metaslab_class_create(spa_t *spa, const char *name,
436-
const metaslab_ops_t *ops, boolean_t is_log)
421+
const metaslab_ops_t *ops, const metaslab_wfs_t *wfs, boolean_t is_log)
437422
{
438423
metaslab_class_t *mc;
439424

@@ -443,6 +428,7 @@ metaslab_class_create(spa_t *spa, const char *name,
443428
mc->mc_spa = spa;
444429
mc->mc_name = name;
445430
mc->mc_ops = ops;
431+
mc->mc_wfs = wfs;
446432
mc->mc_is_log = is_log;
447433
mc->mc_alloc_io_size = SPA_OLD_MAXBLOCKSIZE;
448434
mc->mc_alloc_max = UINT64_MAX;
@@ -3082,6 +3068,79 @@ metaslab_fini(metaslab_t *msp)
30823068
kmem_free(msp, sizeof (metaslab_t));
30833069
}
30843070

3071+
static uint64_t metaslab_space_weight(metaslab_t *msp);
3072+
static uint64_t metaslab_segment_weight(metaslab_t *msp);
3073+
static uint64_t metaslab_space_weight_v2(metaslab_t *msp);
3074+
metaslab_wfs_t *metaslab_weightfunc(spa_t *spa);
3075+
3076+
/*
 * Table of selectable metaslab weight functions, searched by name in
 * spa_find_weightfunc_byname().  A pool records the index of its entry in
 * spa_active_weightfunc.
 *
 * Entry 0 ("auto") is the entry spa_set_weightfunc() selects when the
 * requested name is not found; it currently points at the same function as
 * "space_v2".
 */
static metaslab_wfs_t metaslab_weightfuncs[] = {
3077+
{ "auto", metaslab_space_weight_v2 },
3078+
{ "space", metaslab_space_weight },
3079+
{ "space_v2", metaslab_space_weight_v2 },
3080+
{ "segment", metaslab_segment_weight },
3081+
};
3082+
3083+
static int
3084+
spa_find_weightfunc_byname(const char *val)
3085+
{
3086+
int a = ARRAY_SIZE(metaslab_weightfuncs) - 1;
3087+
for (; a >= 0; a--) {
3088+
if (strcmp(val, metaslab_weightfuncs[a].mswf_name) == 0)
3089+
return (a);
3090+
}
3091+
return (-1);
3092+
}
3093+
3094+
void
3095+
spa_set_weightfunc(spa_t *spa, const char *weightfunc)
3096+
{
3097+
int a = spa_find_weightfunc_byname(weightfunc);
3098+
if (a < 0) a = 0;
3099+
if (a != 1 && !spa_feature_is_enabled(spa,
3100+
SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
3101+
zfs_dbgmsg("warning: weight function %s will not be used for "
3102+
"pool %s since space map histograms are not enabled",
3103+
weightfunc, spa_name(spa));
3104+
}
3105+
spa->spa_active_weightfunc = a;
3106+
zfs_dbgmsg("spa weight function: %s",
3107+
metaslab_weightfuncs[a].mswf_name);
3108+
}
3109+
3110+
int
3111+
spa_get_weightfunc(spa_t *spa)
3112+
{
3113+
return (spa->spa_active_weightfunc);
3114+
}
3115+
3116+
#if defined(_KERNEL)
3117+
int
3118+
param_set_active_weightfunc_common(const char *val)
3119+
{
3120+
char *p;
3121+
3122+
if (val == NULL)
3123+
return (SET_ERROR(EINVAL));
3124+
3125+
if ((p = strchr(val, '\n')) != NULL)
3126+
*p = '\0';
3127+
3128+
int a = spa_find_weightfunc_byname(val);
3129+
if (a < 0)
3130+
return (SET_ERROR(EINVAL));
3131+
3132+
zfs_active_weightfunc = metaslab_weightfuncs[a].mswf_name;
3133+
return (0);
3134+
}
3135+
#endif
3136+
3137+
metaslab_wfs_t *
3138+
metaslab_weightfunc(spa_t *spa)
3139+
{
3140+
int weightfunc = spa_get_weightfunc(spa);
3141+
return (&metaslab_weightfuncs[weightfunc]);
3142+
}
3143+
30853144
/*
30863145
* Return the weight of the specified metaslab, according to the new space-based
30873146
* weighting algorithm. The metaslab must be loaded. This function can
@@ -3156,7 +3215,7 @@ metaslab_space_weight_from_spacemap(metaslab_t *msp)
31563215
if (segments == 0)
31573216
continue;
31583217
if (weight == 0)
3159-
weight = i + sm->sm_shift;
3218+
weight = i + sm->sm_shift;
31603219
// Prevent overflow using log_2 math
31613220
if (seg_shift + highbit64(segments) > METASLAB_WEIGHT_MAX_IDX)
31623221
return (METASLAB_WEIGHT_MAX);
@@ -3186,8 +3245,16 @@ static uint64_t
31863245
metaslab_space_weight_v2(metaslab_t *msp)
31873246
{
31883247
metaslab_group_t *mg = msp->ms_group;
3248+
spa_t *spa = mg->mg_vd->vdev_spa;
31893249
uint64_t weight = 0;
31903250
uint8_t shift = mg->mg_vd->vdev_ashift;
3251+
3252+
if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) ||
3253+
(msp->ms_sm != NULL && msp->ms_sm->sm_dbuf->db_size !=
3254+
sizeof (space_map_phys_t))) {
3255+
return (metaslab_space_weight(msp));
3256+
}
3257+
31913258
if (metaslab_allocated_space(msp) == 0) {
31923259
int idx = highbit64(msp->ms_size) - shift - 1 + 3;
31933260
weight = 1ULL << MIN(METASLAB_WEIGHT_MAX_IDX, 2 * idx);
@@ -3365,18 +3432,10 @@ metaslab_space_weight(metaslab_t *msp)
33653432
{
33663433
metaslab_group_t *mg = msp->ms_group;
33673434
vdev_t *vd = mg->mg_vd;
3368-
spa_t *spa = vd->vdev_spa;
33693435
uint64_t weight, space;
33703436

33713437
ASSERT(MUTEX_HELD(&msp->ms_lock));
33723438

3373-
if (zfs_metaslab_space_weight_v2_enabled &&
3374-
spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
3375-
(msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
3376-
sizeof (space_map_phys_t))) {
3377-
return (metaslab_space_weight_v2(msp));
3378-
}
3379-
33803439
/*
33813440
* The baseline weight is the metaslab's free space.
33823441
*/
@@ -3532,11 +3591,18 @@ static uint64_t
35323591
metaslab_segment_weight(metaslab_t *msp)
35333592
{
35343593
metaslab_group_t *mg = msp->ms_group;
3594+
spa_t *spa = mg->mg_vd->vdev_spa;
35353595
uint64_t weight = 0;
35363596
uint8_t shift = mg->mg_vd->vdev_ashift;
35373597

35383598
ASSERT(MUTEX_HELD(&msp->ms_lock));
35393599

3600+
if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) ||
3601+
(msp->ms_sm != NULL && msp->ms_sm->sm_dbuf->db_size !=
3602+
sizeof (space_map_phys_t))) {
3603+
return (metaslab_space_weight(msp));
3604+
}
3605+
35403606
/*
35413607
* The metaslab is completely free.
35423608
*/
@@ -3641,8 +3707,6 @@ metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard)
36413707
static uint64_t
36423708
metaslab_weight(metaslab_t *msp, boolean_t nodirty)
36433709
{
3644-
vdev_t *vd = msp->ms_group->mg_vd;
3645-
spa_t *spa = vd->vdev_spa;
36463710
uint64_t weight;
36473711

36483712
ASSERT(MUTEX_HELD(&msp->ms_lock));
@@ -3666,17 +3730,7 @@ metaslab_weight(metaslab_t *msp, boolean_t nodirty)
36663730
metaslab_largest_unflushed_free(msp));
36673731
}
36683732

3669-
/*
3670-
* Segment-based weighting requires space map histogram support.
3671-
*/
3672-
if (zfs_metaslab_segment_weight_enabled &&
3673-
spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
3674-
(msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
3675-
sizeof (space_map_phys_t))) {
3676-
weight = metaslab_segment_weight(msp);
3677-
} else {
3678-
weight = metaslab_space_weight(msp);
3679-
}
3733+
weight = msp->ms_group->mg_class->mc_wfs->mswf_func(msp);
36803734
return (weight);
36813735
}
36823736

@@ -6566,12 +6620,6 @@ ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, bias_enabled, INT, ZMOD_RW,
65666620
ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, perf_bias, INT, ZMOD_RW,
65676621
"Enable performance-based metaslab group biasing");
65686622

6569-
ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, segment_weight_enabled, INT,
6570-
ZMOD_RW, "Enable segment-based metaslab selection");
6571-
6572-
ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, space_weight_v2_enabled, INT,
6573-
ZMOD_RW, "Enable new space-based metaslab selection");
6574-
65756623
ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, switch_threshold, INT, ZMOD_RW,
65766624
"Segment-based metaslab selection maximum buckets before switching");
65776625

@@ -6602,3 +6650,7 @@ ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, find_max_tries, UINT, ZMOD_RW,
66026650
ZFS_MODULE_PARAM_CALL(zfs, zfs_, active_allocator,
66036651
param_set_active_allocator, param_get_charp, ZMOD_RW,
66046652
"SPA active allocator");
6653+
6654+
ZFS_MODULE_PARAM_CALL(zfs, zfs_, active_weightfunc,
6655+
param_set_active_weightfunc, param_get_charp, ZMOD_RW,
6656+
"SPA active weight function");

0 commit comments

Comments
 (0)