diff --git a/mysql-test/suite/rocksdb/r/recovery_after_alter_crash.result b/mysql-test/suite/rocksdb/r/recovery_after_alter_crash.result new file mode 100644 index 000000000000..2e7e36b22934 --- /dev/null +++ b/mysql-test/suite/rocksdb/r/recovery_after_alter_crash.result @@ -0,0 +1,42 @@ +# Include helper scripts to check if RocksDB and debug sync features are available +# 1. Disable query logging temporarily to suppress expected warnings during test +# 2. Create a test table with MyRocks engine, a primary key, and one secondary index +CREATE TABLE t ( +i INT PRIMARY KEY, +c1 INT, +c2 INT, +KEY k1 (c1) +) ENGINE=ROCKSDB; +# 3. Insert sample data into the table +INSERT INTO t VALUES (1, 10, 100), (2, 20, 200), (3, 30, 300); +# 4. Set debug sync to simulate crash after RocksDB commits an ALTER operation +SET SESSION debug='+d,crash_commit_after_prepare'; +# 5. Expect the server to crash when committing the ALTER statement +ALTER TABLE t ADD INDEX k2 (c2), ALGORITHM=INPLACE; +ERROR HY000: Lost connection to MySQL server during query +# 6. Wait until the server is fully disconnected due to the crash +# 7. Restart the MySQL server; MyRocks should automatically rebuild the corrupted table definition +# restart +# 8. Verify that the data in the table is intact after reconstruction +SELECT * FROM t ORDER BY i; +i c1 c2 +1 10 100 +2 20 200 +3 30 300 +# 9. Verify that the table structure is consistent and not corrupted +ANALYZE TABLE t; +Table Op Msg_type Msg_text +test.t analyze status OK +# 10. Check that the original index (k1) exists and can be used in queries +EXPLAIN SELECT * FROM t WHERE c1 = 200; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t NULL ref k1 k1 5 const 1 100.00 NULL +Warnings: +Note 1003 /* select#1 */ select `test`.`t`.`i` AS `i`,`test`.`t`.`c1` AS `c1`,`test`.`t`.`c2` AS `c2` from `test`.`t` where (`test`.`t`.`c1` = 200) +# 11. Assert that RocksDB successfully reconstructed the corrupted table +include/assert_grep.inc [RocksDB table reconstruction finished] +# 12. Attempt to add a new index k2 on column c2 using the INPLACE algorithm. +# This command causes a server crash when `rocksdb_validate_tables=1`, which is the default setting +ALTER TABLE t ADD INDEX k2 (c2), ALGORITHM=INPLACE; +# 13. Clean up the test table +DROP TABLE t; diff --git a/mysql-test/suite/rocksdb/t/recovery_after_alter_crash-master.opt b/mysql-test/suite/rocksdb/t/recovery_after_alter_crash-master.opt new file mode 100644 index 000000000000..b4855a4160ec --- /dev/null +++ b/mysql-test/suite/rocksdb/t/recovery_after_alter_crash-master.opt @@ -0,0 +1 @@ +--loose-rocksdb_validate_tables=2 diff --git a/mysql-test/suite/rocksdb/t/recovery_after_alter_crash.test b/mysql-test/suite/rocksdb/t/recovery_after_alter_crash.test new file mode 100644 index 000000000000..bbb11743343d --- /dev/null +++ b/mysql-test/suite/rocksdb/t/recovery_after_alter_crash.test @@ -0,0 +1,56 @@ +--echo # Include helper scripts to check if RocksDB and debug sync features are available +--source include/have_rocksdb.inc +--source include/have_debug_sync.inc + +--echo # 1. Disable query logging temporarily to suppress expected warnings during test +--disable_query_log +CALL mtr.add_suppression("Plugin rocksdb reported: 'Table '.*' definition differs between MyRocks .* ADD INDEX'"); +--enable_query_log + +--echo # 2. Create a test table with MyRocks engine, a primary key, and one secondary index +CREATE TABLE t ( + i INT PRIMARY KEY, + c1 INT, + c2 INT, + KEY k1 (c1) +) ENGINE=ROCKSDB; + +--echo # 3. Insert sample data into the table +INSERT INTO t VALUES (1, 10, 100), (2, 20, 200), (3, 30, 300); + +--echo # 4. Set debug sync to simulate crash after RocksDB commits an ALTER operation +SET SESSION debug='+d,crash_commit_after_prepare'; + +--echo # 5. Expect the server to crash when committing the ALTER statement +--source include/expect_crash.inc +--error CR_SERVER_LOST +ALTER TABLE t ADD INDEX k2 (c2), ALGORITHM=INPLACE; + +--echo # 6. Wait until the server is fully disconnected due to the crash +--source include/wait_until_disconnected.inc + +--echo # 7. Restart the MySQL server; MyRocks should automatically rebuild the corrupted table definition +--source include/start_mysqld.inc + +--echo # 8. Verify that the data in the table is intact after reconstruction +SELECT * FROM t ORDER BY i; + +--echo # 9. Verify that the table structure is consistent and not corrupted +ANALYZE TABLE t; + +--echo # 10. Check that the original index (k1) exists and can be used in queries +EXPLAIN SELECT * FROM t WHERE c1 = 200; + +--echo # 11. Assert that RocksDB successfully reconstructed the corrupted table +--let $assert_text = RocksDB table reconstruction finished +--let $assert_file = $MYSQLTEST_VARDIR/log/mysqld.1.err +--let $assert_select = Reconstruction of the corrupted table .* has finished. +--let $assert_count = 1 +--source include/assert_grep.inc + +--echo # 12. Attempt to add a new index k2 on column c2 using the INPLACE algorithm. +--echo # This command causes a server crash when `rocksdb_validate_tables=1`, which is the default setting +ALTER TABLE t ADD INDEX k2 (c2), ALGORITHM=INPLACE; + +--echo # 13. Clean up the test table +DROP TABLE t; diff --git a/storage/rocksdb/ChangeLog.md b/storage/rocksdb/ChangeLog.md index 7d605a9219e5..455265ab7400 100644 --- a/storage/rocksdb/ChangeLog.md +++ b/storage/rocksdb/ChangeLog.md @@ -1,5 +1,10 @@ # MyRocks Changelog +## MyRocks 9.3.1-4 +**This version was shipped with Percona Server 8.0.45-36 and 8.4.8-8.** +* PS-10287 – Add automatic table definition recovery after crash on ALTER +· [Jira](https://perconadev.atlassian.net/browse/PS-10287) + ## MyRocks 9.3.1-3 **This version was shipped with Percona Server 8.0.44-35 and 8.4.7-7.** * PS-9680 – Fix potential data corruption in MyRocks after RocksDB 7.10.0 changes diff --git a/storage/rocksdb/ha_rocksdb.cc b/storage/rocksdb/ha_rocksdb.cc index 58d469328068..cfa91db819b5 100644 --- a/storage/rocksdb/ha_rocksdb.cc +++ b/storage/rocksdb/ha_rocksdb.cc @@ -8069,11 +8069,18 @@ int ha_rocksdb::open(const char *const name, int mode, uint test_if_locked, uint n_keys = table->s->keys + (has_hidden_pk(*table) ? 1 : 0); if (m_tbl_def->m_key_count != n_keys) { + const char *alter_op; + if (m_tbl_def->m_key_count > n_keys) + alter_op = "ALTER TABLE ... ADD INDEX"; + else + alter_op = "ALTER TABLE ... DROP INDEX"; + LogPluginErrMsg(ERROR_LEVEL, 0, - "Table '%s' definition mismatch between MyRocks " - "(m_key_count=%d) and data dictionary (n_keys=%d)", + "Table '%s' definition differs between MyRocks " + "(m_key_count=%d) and the data dictionary (n_keys=%d). " + "This was likely caused by a crash during '%s'.", m_tbl_def->full_tablename().c_str(), m_tbl_def->m_key_count, - n_keys); + n_keys, alter_op); for (uint i = 0; i < table->s->keys; i++) { const char *key_name; @@ -8081,25 +8088,52 @@ int ha_rocksdb::open(const char *const name, int mode, uint test_if_locked, assert((*table).key_info[i].name != nullptr); key_name = (*table).key_info[i].name; - LogPluginErrMsg( - ERROR_LEVEL, 0, - "ha_rocksdb::open TABLE table_name=%s i=%d/%d key_name=%s", - table->s->table_name.str, i, table->s->keys, key_name); + LogPluginErrMsg(INFORMATION_LEVEL, 0, + "TABLE table_name=%s i=%d/%d key_name=%s", + table->s->table_name.str, i, table->s->keys, key_name); } uint pk = table->s->primary_key; LogPluginErrMsg( - ERROR_LEVEL, 0, "ha_rocksdb::open TABLE PK key_name=%s", + INFORMATION_LEVEL, 0, "TABLE PK key_name=%s", pk == MAX_INDEXES ? HIDDEN_PK_NAME : (*table).key_info[pk].name); for (uint i = 0; i < m_tbl_def->m_key_count; i++) { const char *rdb_name = m_tbl_def->m_key_descr_arr[i]->m_name.c_str(); - LogPluginErrMsg( - ERROR_LEVEL, 0, - "ha_rocksdb::open KEY_descr_arr table_name=%s i=%d/%d key_name=%s", - m_tbl_def->full_tablename().c_str(), i, m_tbl_def->m_key_count, - rdb_name); + LogPluginErrMsg(INFORMATION_LEVEL, 0, + "KEY_descr_arr table_name=%s i=%d/%d key_name=%s", + m_tbl_def->full_tablename().c_str(), i, + m_tbl_def->m_key_count, rdb_name); } + +#if defined(ROCKSDB_INCLUDE_VALIDATE_TABLES) && ROCKSDB_INCLUDE_VALIDATE_TABLES + if (m_tbl_def->m_key_count > n_keys && rocksdb_validate_tables > 0) { + if (rocksdb_validate_tables == 1) { + LogPluginErrMsg( + ERROR_LEVEL, 0, + "Start the server with \"rocksdb_validate_tables=2\" to attempt to " + "fix the problem."); + } else { + const std::string tbl_name = m_tbl_def->full_tablename(); + LogPluginErrMsg(INFORMATION_LEVEL, 0, + "Starting reconstruction of the corrupted table %s.", + m_tbl_def->full_tablename().c_str()); + + if (rebuild_table_def_from_table(table) != HA_EXIT_SUCCESS) { + LogPluginErrMsg(ERROR_LEVEL, 0, + "Failed to rebuild table definition for table '%s'", + tbl_name.c_str()); + my_error(ER_KEY_CREATE_DURING_ALTER, MYF(0)); + return HA_EXIT_FAILURE; + } + + LogPluginErrMsg( + INFORMATION_LEVEL, 0, + "Reconstruction of the corrupted table %s has finished.", + m_tbl_def->full_tablename().c_str()); + } + } +#endif } // close() above has already called free_key_buffers(). No need to do it here. @@ -8685,8 +8719,6 @@ std::unordered_map ha_rocksdb::get_old_key_positions( const TABLE &old_table_arg, const Rdb_tbl_def &old_tbl_def_arg) const { DBUG_ENTER_FUNC(); - std::shared_ptr *const old_key_descr = - old_tbl_def_arg.m_key_descr_arr; std::unordered_map old_key_pos; std::unordered_map new_key_pos; uint i; @@ -8697,7 +8729,7 @@ std::unordered_map ha_rocksdb::get_old_key_positions( for (i = 0; i < old_tbl_def_arg.m_key_count; i++) { if (is_hidden_pk(i, old_table_arg, old_tbl_def_arg)) { - old_key_pos[old_key_descr[i]->m_name] = i; + old_key_pos[HIDDEN_PK_NAME] = i; continue; } @@ -8710,6 +8742,9 @@ std::unordered_map ha_rocksdb::get_old_key_positions( CREATE TABLE t1 (a INT, b INT, KEY ka(a)) ENGINE=RocksDB; ALTER TABLE t1 DROP INDEX ka, ADD INDEX ka(b), ALGORITHM=INPLACE; */ + if (i >= old_table_arg.s->keys) { + continue; + } const KEY *const old_key = &old_table_arg.key_info[i]; const auto &it = new_key_pos.find(old_key->name); if (it == new_key_pos.end()) { @@ -13019,6 +13054,40 @@ int ha_rocksdb::delete_table(Rdb_tbl_def *const tbl) { DBUG_RETURN(HA_EXIT_SUCCESS); } +// A simplified version of ha_rocksdb::delete_table +// that leaves the table's data intact +int ha_rocksdb::delete_table_def(Rdb_tbl_def *const tbl) { + DBUG_ENTER_FUNC(); + + assert(tbl != nullptr); + assert(m_tbl_def == nullptr || m_tbl_def == tbl); + uint table_default_cf_id = tbl->m_key_descr_arr[0]->get_gl_index_id().cf_id; + auto local_dict_manager = + dict_manager.get_dict_manager_selector_non_const(table_default_cf_id); + const std::unique_ptr wb = local_dict_manager->begin(); + rocksdb::WriteBatch *const batch = wb.get(); + + { + /* + Remove the table entry in data dictionary (this will also remove it from + the persistent data dictionary). + */ + ddl_manager.remove(tbl, batch, table_default_cf_id, true); + + int err = local_dict_manager->commit(batch); + if (err) { + DBUG_RETURN(err); + } + } + + // avoid dangling pointer + m_tbl_def = nullptr; + m_iterator = nullptr; + m_pk_iterator = nullptr; + inited = NONE; + DBUG_RETURN(HA_EXIT_SUCCESS); +} + /* Note: the following function is called when the table is not open. That is, this->table==nullptr, pk_key_descr==nullptr, etc. @@ -14402,6 +14471,84 @@ bool ha_rocksdb::prepare_inplace_alter_table( DBUG_RETURN(HA_EXIT_SUCCESS); } +// Reconstructs a corrupted m_tbl_def (Rdb_tbl_def) by removing the +// partially added index left after an incomplete "ALTER ... ADD INDEX". +// Derived from ha_rocksdb::prepare_inplace_alter_table. +bool ha_rocksdb::rebuild_table_def_from_table(TABLE *altered_table) { + DBUG_ENTER_FUNC(); + assert(altered_table != nullptr); + + Rdb_tbl_def *new_tdef = nullptr; + std::shared_ptr *new_key_descr = nullptr; + uint new_n_keys = + altered_table->s->keys + (has_hidden_pk(*altered_table) ? 1 : 0); + const std::string tbl_name = m_tbl_def->full_tablename(); + + try { + new_key_descr = new std::shared_ptr[new_n_keys]; + new_tdef = new Rdb_tbl_def(tbl_name); + } catch (const std::bad_alloc &) { + my_error(ER_OUTOFMEMORY, MYF(0)); + DBUG_RETURN(HA_EXIT_FAILURE); + } + + new_tdef->m_key_descr_arr = new_key_descr; + new_tdef->m_key_count = new_n_keys; + new_tdef->m_pk_index = altered_table->s->primary_key; + new_tdef->m_auto_incr_val = + m_tbl_def->m_auto_incr_val.load(std::memory_order_relaxed); + new_tdef->m_hidden_pk_val = + m_tbl_def->m_hidden_pk_val.load(std::memory_order_relaxed); + + if (create_key_defs(*altered_table, *new_tdef, + "" /* actual_user_table_name */, false /* is_dd_tbl */, + table, m_tbl_def)) { + goto error; + } + + // delete old table_def but keep data + if (delete_table_def(m_tbl_def) != HA_EXIT_SUCCESS) { + LogPluginErrMsg(ERROR_LEVEL, 0, "Failure when trying to drop table %s", + tbl_name.c_str()); + goto error; + } + + { + auto local_dict_manager = + dict_manager.get_dict_manager_selector_non_const(false); + const std::unique_ptr wb = local_dict_manager->begin(); + rocksdb::WriteBatch *const batch = wb.get(); + + std::lock_guard dm_lock(*local_dict_manager); + + int err = ddl_manager.put_and_write(new_tdef, batch); + if (err != HA_EXIT_SUCCESS) goto error; + + err = local_dict_manager->commit(batch); + if (err != HA_EXIT_SUCCESS) goto error; + } + + m_tbl_def = new_tdef; + m_key_descr_arr = m_tbl_def->m_key_descr_arr; + m_pk_descr = m_key_descr_arr[pk_index(*altered_table, *m_tbl_def)]; + + DBUG_RETURN(HA_EXIT_SUCCESS); + +error: + if (new_tdef) { + if (new_tdef->m_key_descr_arr) { + delete[] new_tdef->m_key_descr_arr; + new_tdef->m_key_descr_arr = nullptr; + } + delete new_tdef; + } else if (new_key_descr) { + delete[] new_key_descr; + } + + my_error(ER_KEY_CREATE_DURING_ALTER, MYF(0)); + DBUG_RETURN(HA_EXIT_FAILURE); +} + /** Alter the table structure in-place with operations specified using HA_ALTER_FLAGS and Alter_inplace_info. The level of concurrency allowed diff --git a/storage/rocksdb/ha_rocksdb.h b/storage/rocksdb/ha_rocksdb.h index dbd23a7bcfbe..ab7b0076bcc1 100644 --- a/storage/rocksdb/ha_rocksdb.h +++ b/storage/rocksdb/ha_rocksdb.h @@ -861,6 +861,7 @@ class ha_rocksdb : public my_core::handler, public blob_buffer { key_range *const max_key) override MY_ATTRIBUTE((__warn_unused_result__)); + int delete_table_def(Rdb_tbl_def *const tbl); int delete_table(Rdb_tbl_def *const tbl); int delete_table(const char *const from, const dd::Table *table_def) override MY_ATTRIBUTE((__warn_unused_result__)); @@ -900,6 +901,7 @@ class ha_rocksdb : public my_core::handler, public blob_buffer { TABLE *altered_table, my_core::Alter_inplace_info *ha_alter_info) override; + bool rebuild_table_def_from_table(TABLE *altered_table); bool prepare_inplace_alter_table(TABLE *altered_table, my_core::Alter_inplace_info *ha_alter_info, const dd::Table *old_table_def,