@@ -816,7 +816,18 @@ static struct scrub_progress *scrub_resumed_stats(struct scrub_progress *data,
816816 _SCRUB_SUM (dest , data , malloc_errors );
817817 _SCRUB_SUM (dest , data , uncorrectable_errors );
818818 _SCRUB_SUM (dest , data , corrected_errors );
819- _SCRUB_COPY (dest , data , last_physical );
819+
820+ /*
821+ * Preserve the maximum last_physical position from resumed or current data.
822+ * This handles the case where last_physical was reset to 0 due to device
823+ * disconnection but we still want to resume from the highest position
824+ * we actually reached.
825+ */
826+ if (data -> resumed -> p .last_physical > data -> scrub_args .progress .last_physical )
827+ dest -> scrub_args .progress .last_physical = data -> resumed -> p .last_physical ;
828+ else
829+ dest -> scrub_args .progress .last_physical = data -> scrub_args .progress .last_physical ;
830+
820831 dest -> stats .canceled = data -> stats .canceled ;
821832 dest -> stats .finished = data -> stats .finished ;
822833 dest -> stats .t_resumed = data -> stats .t_start ;
@@ -968,10 +979,23 @@ static void *scrub_one_dev(void *ctx)
968979 sp -> stats .duration = tv .tv_sec - sp -> stats .t_start ;
969980 sp -> stats .canceled = !!ret ;
970981 sp -> ioctl_errno = errno ;
982+
983+ /*
984+ * For device disconnection errors, preserve the progress by marking
985+ * as interrupted rather than canceled, to allow resume to continue
986+ * from the last position
987+ */
988+ if (ret && (errno == ENODEV || errno == ENOTCONN || errno == EIO )) {
989+ sp -> stats .canceled = 0 ;
990+ sp -> stats .finished = 0 ; /* Mark as interrupted for resume */
991+ } else {
992+ sp -> stats .canceled = !!ret ;
993+ sp -> stats .finished = 1 ;
994+ }
995+
971996 ret = pthread_mutex_lock (& sp -> progress_mutex );
972997 if (ret )
973998 return ERR_PTR (- ret );
974- sp -> stats .finished = 1 ;
975999 ret = pthread_mutex_unlock (& sp -> progress_mutex );
9761000 if (ret )
9771001 return ERR_PTR (- ret );
@@ -1051,12 +1075,26 @@ static void *scrub_progress_cycle(void *ctx)
10511075 gettimeofday (& tv , NULL );
10521076 this = (this + 1 )%2 ;
10531077 last = (last + 1 )%2 ;
1078+
10541079 for (i = 0 ; i < ndev ; ++ i ) {
10551080 sp = & spc -> progress [this * ndev + i ];
10561081 sp_last = & spc -> progress [last * ndev + i ];
10571082 sp_shared = & spc -> shared_progress [i ];
1083+
10581084 if (sp -> stats .finished )
10591085 continue ;
1086+
1087+ /*
1088+ * For devices with recent connection issues, try to
1089+ * reconnect by retrying the progress ioctl a few times
1090+ * in case the device comes back online
1091+ */
1092+ int retry_count = 0 ;
1093+ if (sp_last -> ioctl_errno == ENODEV || sp_last -> ioctl_errno == ENOTCONN ) {
1094+ retry_count = 3 ;
1095+ }
1096+
1097+ retry_progress :
10601098 progress_one_dev (sp );
10611099 sp -> stats .duration = tv .tv_sec - sp -> stats .t_start ;
10621100 if (!sp -> ret )
@@ -1066,11 +1104,27 @@ static void *scrub_progress_cycle(void *ctx)
10661104 ret = - sp -> ioctl_errno ;
10671105 goto out ;
10681106 }
1107+
1108+ /*
1109+ * If device is temporarily unavailable and we have retries left,
1110+ * wait a moment and try again
1111+ */
1112+ if (retry_count > 0 && (sp -> ioctl_errno == ENODEV || sp -> ioctl_errno == ENOTCONN )) {
1113+ struct timespec sleep_time = {0 , 500000000 }; /* 0.5 seconds */
1114+ nanosleep (& sleep_time , NULL );
1115+ retry_count -- ;
1116+ goto retry_progress ;
1117+ }
1118+
10691119 /*
10701120 * scrub finished or device removed, check the
10711121 * finished flag. if unset, just use the last
10721122 * result we got for the current write and go
10731123 * on. flag should be set on next cycle, then.
1124+ *
1125+ * For device removal (ENODEV), preserve the last_physical
1126+ * position in case this was caused by a temporary
1127+ * disconnection like USB hub reset.
10741128 */
10751129 perr = pthread_setcancelstate (
10761130 PTHREAD_CANCEL_DISABLE , & old );
@@ -1080,6 +1134,13 @@ static void *scrub_progress_cycle(void *ctx)
10801134 if (perr )
10811135 goto out ;
10821136 if (!sp_shared -> stats .finished ) {
1137+ /*
1138+ * Preserve the last_physical position to avoid
1139+ * losing progress on temporary disconnections
1140+ */
1141+ if (sp -> ioctl_errno == ENODEV && sp_last -> scrub_args .progress .last_physical > 0 ) {
1142+ sp_shared -> scrub_args .progress .last_physical = sp_last -> scrub_args .progress .last_physical ;
1143+ }
10831144 perr = pthread_mutex_unlock (
10841145 & sp_shared -> progress_mutex );
10851146 if (perr )
@@ -1120,8 +1181,24 @@ static void *scrub_progress_cycle(void *ctx)
11201181 }
11211182 if (!spc -> do_record )
11221183 continue ;
1123- ret = scrub_write_progress (spc -> write_mutex , fsid ,
1124- & spc -> progress [this * ndev ], ndev );
1184+
1185+ /*
1186+ * Force progress saving more frequently if we have device issues
1187+ * to prevent data loss during temporary disconnections
1188+ */
1189+ int force_write = 0 ;
1190+ for (i = 0 ; i < ndev ; ++ i ) {
1191+ struct scrub_progress * sp_check = & spc -> progress [this * ndev + i ];
1192+ if (sp_check -> ioctl_errno == ENODEV || sp_check -> ioctl_errno == ENOTCONN ) {
1193+ force_write = 1 ;
1194+ break ;
1195+ }
1196+ }
1197+
1198+ if (force_write || (tv .tv_sec % 30 ) == 0 ) {
1199+ ret = scrub_write_progress (spc -> write_mutex , fsid ,
1200+ & spc -> progress [this * ndev ], ndev );
1201+ }
11251202 if (ret )
11261203 goto out ;
11271204 }
0 commit comments