Skip to content

Commit df30667

Browse files
author
DocMAX
committed
btrfs-progs: scrub: improve stability during device disconnections
This commit enhances the scrub command to handle temporary device disconnections more gracefully, improving stability and preventing data loss:

- Preserve the maximum last_physical position when resuming after an interruption.
- Treat device disconnection errors (ENODEV, ENOTCONN, EIO) as an interruption rather than a cancellation, allowing scrub to resume from the last position.
- Add retry logic for temporarily unavailable devices, with automatic reconnection.
- Force more frequent progress saving when device issues are detected.
- Preserve progress data during temporary disconnections (e.g., USB hub resets).

These changes ensure that scrub operations can survive transient hardware issues and resume properly without losing progress, making filesystem maintenance more robust in environments with less reliable storage devices.
1 parent a522c59 commit df30667

File tree

1 file changed

+81
-4
lines changed

1 file changed

+81
-4
lines changed

cmds/scrub.c

Lines changed: 81 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -816,7 +816,18 @@ static struct scrub_progress *scrub_resumed_stats(struct scrub_progress *data,
816816
_SCRUB_SUM(dest, data, malloc_errors);
817817
_SCRUB_SUM(dest, data, uncorrectable_errors);
818818
_SCRUB_SUM(dest, data, corrected_errors);
819-
_SCRUB_COPY(dest, data, last_physical);
819+
820+
/*
821+
* Preserve the maximum last_physical position from resumed or current data.
822+
* This handles the case where last_physical was reset to 0 due to device
823+
* disconnection but we still want to resume from the highest position
824+
* we actually reached.
825+
*/
826+
if (data->resumed->p.last_physical > data->scrub_args.progress.last_physical)
827+
dest->scrub_args.progress.last_physical = data->resumed->p.last_physical;
828+
else
829+
dest->scrub_args.progress.last_physical = data->scrub_args.progress.last_physical;
830+
820831
dest->stats.canceled = data->stats.canceled;
821832
dest->stats.finished = data->stats.finished;
822833
dest->stats.t_resumed = data->stats.t_start;
@@ -968,10 +979,23 @@ static void *scrub_one_dev(void *ctx)
968979
sp->stats.duration = tv.tv_sec - sp->stats.t_start;
969980
sp->stats.canceled = !!ret;
970981
sp->ioctl_errno = errno;
982+
983+
/*
984+
* For device disconnection errors, preserve the progress by marking
985+
* as interrupted rather than canceled, to allow resume to continue
986+
* from the last position
987+
*/
988+
if (ret && (errno == ENODEV || errno == ENOTCONN || errno == EIO)) {
989+
sp->stats.canceled = 0;
990+
sp->stats.finished = 0; /* Mark as interrupted for resume */
991+
} else {
992+
sp->stats.canceled = !!ret;
993+
sp->stats.finished = 1;
994+
}
995+
971996
ret = pthread_mutex_lock(&sp->progress_mutex);
972997
if (ret)
973998
return ERR_PTR(-ret);
974-
sp->stats.finished = 1;
975999
ret = pthread_mutex_unlock(&sp->progress_mutex);
9761000
if (ret)
9771001
return ERR_PTR(-ret);
@@ -1051,12 +1075,26 @@ static void *scrub_progress_cycle(void *ctx)
10511075
gettimeofday(&tv, NULL);
10521076
this = (this + 1)%2;
10531077
last = (last + 1)%2;
1078+
10541079
for (i = 0; i < ndev; ++i) {
10551080
sp = &spc->progress[this * ndev + i];
10561081
sp_last = &spc->progress[last * ndev + i];
10571082
sp_shared = &spc->shared_progress[i];
1083+
10581084
if (sp->stats.finished)
10591085
continue;
1086+
1087+
/*
1088+
* For devices with recent connection issues, try to
1089+
* reconnect by retrying the progress ioctl a few times
1090+
* in case the device comes back online
1091+
*/
1092+
int retry_count = 0;
1093+
if (sp_last->ioctl_errno == ENODEV || sp_last->ioctl_errno == ENOTCONN) {
1094+
retry_count = 3;
1095+
}
1096+
1097+
retry_progress:
10601098
progress_one_dev(sp);
10611099
sp->stats.duration = tv.tv_sec - sp->stats.t_start;
10621100
if (!sp->ret)
@@ -1066,11 +1104,27 @@ static void *scrub_progress_cycle(void *ctx)
10661104
ret = -sp->ioctl_errno;
10671105
goto out;
10681106
}
1107+
1108+
/*
1109+
* If device is temporarily unavailable and we have retries left,
1110+
* wait a moment and try again
1111+
*/
1112+
if (retry_count > 0 && (sp->ioctl_errno == ENODEV || sp->ioctl_errno == ENOTCONN)) {
1113+
struct timespec sleep_time = {0, 500000000}; /* 0.5 seconds */
1114+
nanosleep(&sleep_time, NULL);
1115+
retry_count--;
1116+
goto retry_progress;
1117+
}
1118+
10691119
/*
10701120
* scrub finished or device removed, check the
10711121
* finished flag. if unset, just use the last
10721122
* result we got for the current write and go
10731123
* on. flag should be set on next cycle, then.
1124+
*
1125+
* For device removal (ENODEV), preserve the last_physical
1126+
* position in case this was caused by a temporary
1127+
* disconnection like USB hub reset.
10741128
*/
10751129
perr = pthread_setcancelstate(
10761130
PTHREAD_CANCEL_DISABLE, &old);
@@ -1080,6 +1134,13 @@ static void *scrub_progress_cycle(void *ctx)
10801134
if (perr)
10811135
goto out;
10821136
if (!sp_shared->stats.finished) {
1137+
/*
1138+
* Preserve the last_physical position to avoid
1139+
* losing progress on temporary disconnections
1140+
*/
1141+
if (sp->ioctl_errno == ENODEV && sp_last->scrub_args.progress.last_physical > 0) {
1142+
sp_shared->scrub_args.progress.last_physical = sp_last->scrub_args.progress.last_physical;
1143+
}
10831144
perr = pthread_mutex_unlock(
10841145
&sp_shared->progress_mutex);
10851146
if (perr)
@@ -1120,8 +1181,24 @@ static void *scrub_progress_cycle(void *ctx)
11201181
}
11211182
if (!spc->do_record)
11221183
continue;
1123-
ret = scrub_write_progress(spc->write_mutex, fsid,
1124-
&spc->progress[this * ndev], ndev);
1184+
1185+
/*
1186+
* Force progress saving more frequently if we have device issues
1187+
* to prevent data loss during temporary disconnections
1188+
*/
1189+
int force_write = 0;
1190+
for (i = 0; i < ndev; ++i) {
1191+
struct scrub_progress *sp_check = &spc->progress[this * ndev + i];
1192+
if (sp_check->ioctl_errno == ENODEV || sp_check->ioctl_errno == ENOTCONN) {
1193+
force_write = 1;
1194+
break;
1195+
}
1196+
}
1197+
1198+
if (force_write || (tv.tv_sec % 30) == 0) {
1199+
ret = scrub_write_progress(spc->write_mutex, fsid,
1200+
&spc->progress[this * ndev], ndev);
1201+
}
11251202
if (ret)
11261203
goto out;
11271204
}

0 commit comments

Comments
 (0)