Skip to content

Commit b7654bd

Browse files
authored
Trim L2ARC
The l2arc_evict() function is responsible for evicting buffers which reference the next bytes of the L2ARC device to be overwritten. Teach this function to additionally TRIM that vdev space before it is overwritten if the device has been filled with data. This is done by vdev_trim_simple() which trims by issuing a new type of TRIM, TRIM_TYPE_SIMPLE. We also implement a "Trim Ahead" feature. It is a zfs module parameter, expressed in % of the current write size. This trims ahead of the current write size. A minimum of 64MB will be trimmed. The default is 0 which disables TRIM on L2ARC as it can put significant stress on the underlying storage devices. To enable TRIM on L2ARC we set l2arc_trim_ahead > 0. We also implement TRIM of the whole cache device upon addition to a pool, pool creation or when the header of the device is invalid upon importing a pool or onlining a cache device. This is dependent on l2arc_trim_ahead > 0. TRIM of the whole device is done with TRIM_TYPE_MANUAL so that its status can be monitored by zpool status -t. We save the TRIM state for the whole device and the time of completion on-disk in the header, and restore these upon L2ARC rebuild so that zpool status -t can correctly report them. Whole device TRIM is done asynchronously so that the user can export the pool or remove the cache device while it is trimming (i.e. if it is too slow). We do not TRIM the whole device if persistent L2ARC has been disabled by l2arc_rebuild_enabled = 0 because we may not want to lose all cached buffers (e.g. we may want to import the pool with l2arc_rebuild_enabled = 0 only once because of memory pressure). If persistent L2ARC has been disabled by setting the module parameter l2arc_rebuild_blocks_min_l2size to a value greater than the size of the cache device then the whole device is trimmed upon creation or import of a pool if l2arc_trim_ahead > 0. Reviewed-by: Brian Behlendorf <[email protected]> Reviewed-by: Adam D. Moss <[email protected]> Signed-off-by: George Amanakis <[email protected]> Closes #9713 Closes #9789 Closes #10224
1 parent 32f26ea commit b7654bd

File tree

18 files changed

+573
-51
lines changed

18 files changed

+573
-51
lines changed

cmd/zdb/zdb.c

+5-1
Original file line numberDiff line numberDiff line change
@@ -3707,8 +3707,12 @@ dump_l2arc_header(int fd)
37073707
(u_longlong_t)l2dhdr.dh_evict);
37083708
(void) printf(" lb_asize_refcount: %llu\n",
37093709
(u_longlong_t)l2dhdr.dh_lb_asize);
3710-
(void) printf(" lb_count_refcount: %llu\n\n",
3710+
(void) printf(" lb_count_refcount: %llu\n",
37113711
(u_longlong_t)l2dhdr.dh_lb_count);
3712+
(void) printf(" trim_action_time: %llu\n",
3713+
(u_longlong_t)l2dhdr.dh_trim_action_time);
3714+
(void) printf(" trim_state: %llu\n\n",
3715+
(u_longlong_t)l2dhdr.dh_trim_state);
37123716
}
37133717

37143718
dump_l2arc_log_blocks(fd, l2dhdr, &rebuild);

include/sys/arc_impl.h

+13-1
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,14 @@ typedef struct l2arc_dev_hdr_phys {
240240
*/
241241
uint64_t dh_lb_asize; /* mirror of l2ad_lb_asize */
242242
uint64_t dh_lb_count; /* mirror of l2ad_lb_count */
243-
const uint64_t dh_pad[32]; /* pad to 512 bytes */
243+
/*
244+
* Mirrors of vdev_trim_action_time and vdev_trim_state, used to
245+
* display when the cache device was fully trimmed for the last
246+
* time.
247+
*/
248+
uint64_t dh_trim_action_time;
249+
uint64_t dh_trim_state;
250+
const uint64_t dh_pad[30]; /* pad to 512 bytes */
244251
zio_eck_t dh_tail;
245252
} l2arc_dev_hdr_phys_t;
246253
CTASSERT_GLOBAL(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE);
@@ -399,6 +406,7 @@ typedef struct l2arc_dev {
399406
* Number of log blocks present on the device.
400407
*/
401408
zfs_refcount_t l2ad_lb_count;
409+
boolean_t l2ad_trim_all; /* TRIM whole device */
402410
} l2arc_dev_t;
403411

404412
/*
@@ -902,6 +910,10 @@ extern int param_set_arc_int(ZFS_MODULE_PARAM_ARGS);
902910
boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev,
903911
const l2arc_log_blkptr_t *lbp);
904912

913+
/* used in vdev_trim.c */
914+
void l2arc_dev_hdr_update(l2arc_dev_t *dev);
915+
l2arc_dev_t *l2arc_vdev_get(vdev_t *vd);
916+
905917
#ifdef __cplusplus
906918
}
907919
#endif

include/sys/spa.h

+8
Original file line numberDiff line numberDiff line change
@@ -745,6 +745,7 @@ typedef enum {
745745
typedef enum trim_type {
746746
TRIM_TYPE_MANUAL = 0,
747747
TRIM_TYPE_AUTO = 1,
748+
TRIM_TYPE_SIMPLE = 2
748749
} trim_type_t;
749750

750751
/* state manipulation functions */
@@ -788,6 +789,7 @@ extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
788789
#define SPA_ASYNC_TRIM_RESTART 0x200
789790
#define SPA_ASYNC_AUTOTRIM_RESTART 0x400
790791
#define SPA_ASYNC_L2CACHE_REBUILD 0x800
792+
#define SPA_ASYNC_L2CACHE_TRIM 0x1000
791793

792794
/*
793795
* Controls the behavior of spa_vdev_remove().
@@ -940,6 +942,12 @@ typedef struct spa_iostats {
940942
kstat_named_t autotrim_bytes_skipped;
941943
kstat_named_t autotrim_extents_failed;
942944
kstat_named_t autotrim_bytes_failed;
945+
kstat_named_t simple_trim_extents_written;
946+
kstat_named_t simple_trim_bytes_written;
947+
kstat_named_t simple_trim_extents_skipped;
948+
kstat_named_t simple_trim_bytes_skipped;
949+
kstat_named_t simple_trim_extents_failed;
950+
kstat_named_t simple_trim_bytes_failed;
943951
} spa_iostats_t;
944952

945953
extern void spa_stats_init(spa_t *spa);

include/sys/vdev_impl.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -301,7 +301,7 @@ struct vdev {
301301
uint64_t vdev_initialize_inflight;
302302
kmutex_t vdev_trim_io_lock;
303303
kcondvar_t vdev_trim_io_cv;
304-
uint64_t vdev_trim_inflight[2];
304+
uint64_t vdev_trim_inflight[3];
305305

306306
/*
307307
* Values stored in the config for an indirect or removing vdev.

include/sys/vdev_trim.h

+2
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ extern void vdev_autotrim(spa_t *spa);
4444
extern void vdev_autotrim_stop_all(spa_t *spa);
4545
extern void vdev_autotrim_stop_wait(vdev_t *vd);
4646
extern void vdev_autotrim_restart(spa_t *spa);
47+
extern int vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size);
48+
extern void vdev_trim_l2arc(spa_t *spa);
4749

4850
#ifdef __cplusplus
4951
}

man/man5/zfs-module-parameters.5

+21-1
Original file line numberDiff line numberDiff line change
@@ -194,11 +194,31 @@ Default value: \fB2\fR.
194194
.ad
195195
.RS 12n
196196
Scales \fBl2arc_headroom\fR by this percentage when L2ARC contents are being
197-
successfully compressed before writing. A value of 100 disables this feature.
197+
successfully compressed before writing. A value of \fB100\fR disables this
198+
feature.
198199
.sp
199200
Default value: \fB200\fR%.
200201
.RE
201202

203+
.sp
204+
.ne 2
205+
.na
206+
\fBl2arc_trim_ahead\fR (ulong)
207+
.ad
208+
.RS 12n
209+
Trims ahead of the current write size (\fBl2arc_write_max\fR) on L2ARC devices
210+
by this percentage of write size if we have filled the device. If set to
211+
\fB100\fR we TRIM twice the space required to accommodate upcoming writes. A
212+
minimum of 64MB will be trimmed. It also enables TRIM of the whole L2ARC device
213+
upon creation or addition to an existing pool or if the header of the device is
214+
invalid upon importing a pool or onlining a cache device. A value of \fB0\fR
215+
disables TRIM on L2ARC altogether and is the default as it can put significant
216+
stress on the underlying storage devices. This will vary depending on how well
217+
the specific device handles these commands.
218+
.sp
219+
Default value: \fB0\fR%.
220+
.RE
221+
202222
.sp
203223
.ne 2
204224
.na

man/man8/zpoolprops.8

+2
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,8 @@ this property is
238238
Automatic TRIM does not immediately reclaim blocks after a free. Instead,
239239
it will optimistically delay allowing smaller ranges to be aggregated into
240240
a few larger ones. These can then be issued more efficiently to the storage.
241+
TRIM on L2ARC devices is enabled by setting
242+
.Sy l2arc_trim_ahead > 0 .
241243
.Pp
242244
Be aware that automatic trimming of recently freed data blocks can put
243245
significant stress on the underlying storage devices. This will vary

module/os/linux/zfs/spa_stats.c

+14-1
Original file line numberDiff line numberDiff line change
@@ -903,6 +903,12 @@ static spa_iostats_t spa_iostats_template = {
903903
{ "autotrim_bytes_skipped", KSTAT_DATA_UINT64 },
904904
{ "autotrim_extents_failed", KSTAT_DATA_UINT64 },
905905
{ "autotrim_bytes_failed", KSTAT_DATA_UINT64 },
906+
{ "simple_trim_extents_written", KSTAT_DATA_UINT64 },
907+
{ "simple_trim_bytes_written", KSTAT_DATA_UINT64 },
908+
{ "simple_trim_extents_skipped", KSTAT_DATA_UINT64 },
909+
{ "simple_trim_bytes_skipped", KSTAT_DATA_UINT64 },
910+
{ "simple_trim_extents_failed", KSTAT_DATA_UINT64 },
911+
{ "simple_trim_bytes_failed", KSTAT_DATA_UINT64 },
906912
};
907913

908914
#define SPA_IOSTATS_ADD(stat, val) \
@@ -929,13 +935,20 @@ spa_iostats_trim_add(spa_t *spa, trim_type_t type,
929935
SPA_IOSTATS_ADD(trim_bytes_skipped, bytes_skipped);
930936
SPA_IOSTATS_ADD(trim_extents_failed, extents_failed);
931937
SPA_IOSTATS_ADD(trim_bytes_failed, bytes_failed);
932-
} else {
938+
} else if (type == TRIM_TYPE_AUTO) {
933939
SPA_IOSTATS_ADD(autotrim_extents_written, extents_written);
934940
SPA_IOSTATS_ADD(autotrim_bytes_written, bytes_written);
935941
SPA_IOSTATS_ADD(autotrim_extents_skipped, extents_skipped);
936942
SPA_IOSTATS_ADD(autotrim_bytes_skipped, bytes_skipped);
937943
SPA_IOSTATS_ADD(autotrim_extents_failed, extents_failed);
938944
SPA_IOSTATS_ADD(autotrim_bytes_failed, bytes_failed);
945+
} else {
946+
SPA_IOSTATS_ADD(simple_trim_extents_written, extents_written);
947+
SPA_IOSTATS_ADD(simple_trim_bytes_written, bytes_written);
948+
SPA_IOSTATS_ADD(simple_trim_extents_skipped, extents_skipped);
949+
SPA_IOSTATS_ADD(simple_trim_bytes_skipped, bytes_skipped);
950+
SPA_IOSTATS_ADD(simple_trim_extents_failed, extents_failed);
951+
SPA_IOSTATS_ADD(simple_trim_bytes_failed, bytes_failed);
939952
}
940953
}
941954

0 commit comments

Comments
 (0)