Skip to content

Commit 6756b81

Browse files
disable restart for destroy-pending repl-dev (#605)
1 parent 69ea506 commit 6756b81

File tree

5 files changed

+30
-6
lines changed

5 files changed

+30
-6
lines changed

conanfile.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
class HomestoreConan(ConanFile):
1111
name = "homestore"
12-
version = "6.5.25"
12+
version = "6.5.26"
1313

1414
homepage = "https://github.com/eBay/Homestore"
1515
description = "HomeStore Storage Engine"

src/lib/logstore/log_dev.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -641,7 +641,10 @@ void LogDev::remove_log_store(logstore_id_t store_id) {
641641
{
642642
folly::SharedMutexWritePriority::WriteHolder holder(m_store_map_mtx);
643643
auto ret = m_id_logstore_map.erase(store_id);
644-
HS_REL_ASSERT((ret == 1), "try to remove invalid store_id {}-{}", m_logdev_id, store_id);
644+
if (ret == 0) {
645+
LOGWARN("try to remove invalid store_id {}-{}", m_logdev_id, store_id);
646+
return;
647+
}
645648
}
646649
unreserve_store_id(store_id);
647650
}

src/lib/logstore/log_store_service.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -286,7 +286,10 @@ void LogStoreService::remove_log_store(logdev_id_t logdev_id, logstore_id_t stor
286286
folly::SharedMutexWritePriority::WriteHolder holder(m_logdev_map_mtx);
287287
COUNTER_INCREMENT(m_metrics, logstores_count, 1);
288288
const auto it = m_id_logdev_map.find(logdev_id);
289-
HS_REL_ASSERT((it != m_id_logdev_map.end()), "logdev id {} doesnt exist", logdev_id);
289+
if (it == m_id_logdev_map.end()) {
290+
HS_LOG(WARN, logstore, "logdev id {} doesnt exist", logdev_id);
291+
return;
292+
}
290293
it->second->remove_log_store(store_id);
291294
COUNTER_DECREMENT(m_metrics, logstores_count, 1);
292295
}

src/lib/replication/repl_dev/raft_repl_dev.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1197,11 +1197,13 @@ std::shared_ptr< nuraft::state_machine > RaftReplDev::get_state_machine() { retu
11971197

11981198
void RaftReplDev::permanent_destroy() {
11991199
RD_LOGI("Permanent destroy for raft repl dev group_id={}", group_id_str());
1200-
m_rd_sb.destroy();
12011200
m_raft_config_sb.destroy();
12021201
m_data_journal->remove_store();
12031202
logstore_service().destroy_log_dev(m_data_journal->logdev_id());
12041203
m_stage.update([](auto* stage) { *stage = repl_dev_stage_t::PERMANENT_DESTROYED; });
1204+
// we should destroy repl_dev superblk only after all the resources are cleaned up, so that is crash recovery
1205+
// occurs, we have a chance to find the stale repl_dev and reclaim all the stale resources.
1206+
m_rd_sb.destroy();
12051207
}
12061208

12071209
void RaftReplDev::leave() {

src/lib/replication/service/raft_repl_service.cpp

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,8 @@ void RaftReplService::start() {
128128
// We need to first load the repl_dev with its config and then attach the raft config to that repl dev.
129129
for (auto const& [buf, mblk] : m_config_sb_bufs) {
130130
auto rdev = raft_group_config_found(buf, voidptr_cast(mblk));
131-
rdev->on_restart();
131+
// if repl_dev is in destroy_pending state, it will not be loaded.
132+
if (rdev) rdev->on_restart();
132133
}
133134
m_config_sb_bufs.clear();
134135
LOGINFO("Repl devs load completed, calling upper layer on_repl_devs_init_completed");
@@ -383,7 +384,22 @@ void RaftReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki
383384
}
384385

385386
if (rd_sb->destroy_pending == 0x1) {
386-
LOGINFOMOD(replication, "ReplDev group_id={} was destroyed, skipping the load", group_id);
387+
LOGINFOMOD(replication, "ReplDev group_id={} was destroyed, reclaim the stale resource", group_id);
388+
// if we do not add the repl_dev to m_rd_map, it will not be permanently destroyed since gc thread finds the
389+
// pending destroy repl_dev only from m_rd_map. so, we should try to reclaim all the repl_dev stale resources
390+
// here.
391+
392+
// 1 since we permanantly destroy the repl_dev here, it will not join_raft group where raft_server will be
393+
// created. hence , no need to detroy it through nuraft_mesg, where raft_server will be shutdown.
394+
// 2 m_raft_config_sb will be destroyed in raft_group_config_found() method if repl_dev is is not found, so
395+
// skip it.
396+
397+
// 3 logdev will be destroyed in delete_unopened_logdevs() if we don't open it(create repl_dev) here, so skip
398+
// it.
399+
400+
// 4 destroy the superblk, and after this, the repl_dev will not be loaded and found again.
401+
rd_sb.destroy();
402+
387403
return;
388404
}
389405

0 commit comments

Comments
 (0)