Skip to content

Commit 8452dd1

Browse files
committed
PS-10287 [8.0]: Add automatic table definition recovery after crash on ALTER
Add mechanisms to detect and recover corrupted table definitions (`Rdb_tbl_def`) in MyRocks after incomplete `ALTER TABLE ... ADD INDEX` operations. - Log detailed mismatch information between `m_key_count` and actual table keys, including the likely ALTER operation that caused it. - Introduce `delete_table_def()` to remove table definitions without touching table data. - Implement `rebuild_table_def_from_table()` to reconstruct corrupted `Rdb_tbl_def` by removing partially added indexes, preserving existing data. - Reduce verbosity of informational key logging from `ERROR_LEVEL` to `INFORMATION_LEVEL`. - Add optional automatic table validation/reconstruction controlled by `rocksdb_validate_tables` server variable. - Introduce `rocksdb.recovery_after_alter_crash` server variable to enable automatic detection and recovery of table definitions immediately after a crash during ALTER operations.
1 parent c2363d0 commit 8452dd1

File tree

6 files changed

+269
-16
lines changed

6 files changed

+269
-16
lines changed
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# Include helper scripts to check if RocksDB and debug sync features are available
2+
# 1. Disable query logging temporarily to suppress expected warnings during test
3+
# 2. Create a test table with MyRocks engine, a primary key, and one secondary index
4+
CREATE TABLE t (
5+
i INT PRIMARY KEY,
6+
c1 INT,
7+
c2 INT,
8+
KEY k1 (c1)
9+
) ENGINE=ROCKSDB;
10+
# 3. Insert sample data into the table
11+
INSERT INTO t VALUES (1, 10, 100), (2, 20, 200), (3, 30, 300);
12+
# 4. Set debug sync to simulate crash after RocksDB commits an ALTER operation
13+
SET SESSION debug='+d,crash_commit_after_prepare';
14+
# 5. Expect the server to crash when committing the ALTER statement
15+
ALTER TABLE t ADD INDEX k2 (c2), ALGORITHM=INPLACE;
16+
ERROR HY000: Lost connection to MySQL server during query
17+
# 6. Wait until the server is fully disconnected due to the crash
18+
# 7. Restart the MySQL server; MyRocks should automatically rebuild the corrupted table definition
19+
# restart
20+
# 8. Verify that the data in the table is intact after reconstruction
21+
SELECT * FROM t ORDER BY i;
22+
i c1 c2
23+
1 10 100
24+
2 20 200
25+
3 30 300
26+
# 9. Verify that the table structure is consistent and not corrupted
27+
ANALYZE TABLE t;
28+
Table Op Msg_type Msg_text
29+
test.t analyze status OK
30+
# 10. Check that the original index (k1) exists and can be used in queries
31+
EXPLAIN SELECT * FROM t WHERE c1 = 200;
32+
id select_type table partitions type possible_keys key key_len ref rows filtered Extra
33+
1 SIMPLE t NULL ref k1 k1 5 const 1 100.00 NULL
34+
Warnings:
35+
Note 1003 /* select#1 */ select `test`.`t`.`i` AS `i`,`test`.`t`.`c1` AS `c1`,`test`.`t`.`c2` AS `c2` from `test`.`t` where (`test`.`t`.`c1` = 200)
36+
# 11. Assert that RocksDB successfully reconstructed the corrupted table
37+
include/assert_grep.inc [RocksDB table reconstruction finished]
38+
# 12. Attempt to add a new index k2 on column c2 using the INPLACE algorithm.
39+
# This command causes a server crash when `rocksdb_validate_tables=1`, which is the default setting
40+
ALTER TABLE t ADD INDEX k2 (c2), ALGORITHM=INPLACE;
41+
# 13. Clean up the test table
42+
DROP TABLE t;
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
--loose-rocksdb_validate_tables=2
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
--echo # Include helper scripts to check if RocksDB and debug sync features are available
2+
--source include/have_rocksdb.inc
3+
--source include/have_debug_sync.inc
4+
5+
--echo # 1. Disable query logging temporarily to suppress expected warnings during test
6+
--disable_query_log
7+
CALL mtr.add_suppression("Plugin rocksdb reported: 'Table '.*' definition differs between MyRocks .* ADD INDEX'");
8+
--enable_query_log
9+
10+
--echo # 2. Create a test table with MyRocks engine, a primary key, and one secondary index
11+
CREATE TABLE t (
12+
i INT PRIMARY KEY,
13+
c1 INT,
14+
c2 INT,
15+
KEY k1 (c1)
16+
) ENGINE=ROCKSDB;
17+
18+
--echo # 3. Insert sample data into the table
19+
INSERT INTO t VALUES (1, 10, 100), (2, 20, 200), (3, 30, 300);
20+
21+
--echo # 4. Set debug sync to simulate crash after RocksDB commits an ALTER operation
22+
SET SESSION debug='+d,crash_commit_after_prepare';
23+
24+
--echo # 5. Expect the server to crash when committing the ALTER statement
25+
--source include/expect_crash.inc
26+
--error CR_SERVER_LOST
27+
ALTER TABLE t ADD INDEX k2 (c2), ALGORITHM=INPLACE;
28+
29+
--echo # 6. Wait until the server is fully disconnected due to the crash
30+
--source include/wait_until_disconnected.inc
31+
32+
--echo # 7. Restart the MySQL server; MyRocks should automatically rebuild the corrupted table definition
33+
--source include/start_mysqld.inc
34+
35+
--echo # 8. Verify that the data in the table is intact after reconstruction
36+
SELECT * FROM t ORDER BY i;
37+
38+
--echo # 9. Verify that the table structure is consistent and not corrupted
39+
ANALYZE TABLE t;
40+
41+
--echo # 10. Check that the original index (k1) exists and can be used in queries
42+
EXPLAIN SELECT * FROM t WHERE c1 = 200;
43+
44+
--echo # 11. Assert that RocksDB successfully reconstructed the corrupted table
45+
--let $assert_text = RocksDB table reconstruction finished
46+
--let $assert_file = $MYSQLTEST_VARDIR/log/mysqld.1.err
47+
--let $assert_select = Reconstruction of the corrupted table .* has finished.
48+
--let $assert_count = 1
49+
--source include/assert_grep.inc
50+
51+
--echo # 12. Attempt to add a new index k2 on column c2 using the INPLACE algorithm.
52+
--echo # This command causes a server crash when `rocksdb_validate_tables=1`, which is the default setting
53+
ALTER TABLE t ADD INDEX k2 (c2), ALGORITHM=INPLACE;
54+
55+
--echo # 13. Clean up the test table
56+
DROP TABLE t;

storage/rocksdb/ChangeLog.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
# MyRocks Changelog
22

3+
## MyRocks 9.3.1-4
4+
**This version was shipped with Percona Server 8.0.45-36 and 8.4.8-8.**
5+
* PS-10287 – Add automatic table definition recovery after crash on ALTER
6+
· [Jira](https://perconadev.atlassian.net/browse/PS-10287)
7+
38
## MyRocks 9.3.1-3
49
**This version was shipped with Percona Server 8.0.44-35 and 8.4.7-7.**
510
* PS-9680 – Fix potential data corruption in MyRocks after RocksDB 7.10.0 changes

storage/rocksdb/ha_rocksdb.cc

Lines changed: 163 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -8069,37 +8069,71 @@ int ha_rocksdb::open(const char *const name, int mode, uint test_if_locked,
80698069
uint n_keys = table->s->keys + (has_hidden_pk(*table) ? 1 : 0);
80708070

80718071
if (m_tbl_def->m_key_count != n_keys) {
8072+
const char *alter_op;
8073+
if (m_tbl_def->m_key_count > n_keys)
8074+
alter_op = "ALTER TABLE ... ADD INDEX";
8075+
else
8076+
alter_op = "ALTER TABLE ... DROP INDEX";
8077+
80728078
LogPluginErrMsg(ERROR_LEVEL, 0,
8073-
"Table '%s' definition mismatch between MyRocks "
8074-
"(m_key_count=%d) and data dictionary (n_keys=%d)",
8079+
"Table '%s' definition differs between MyRocks "
8080+
"(m_key_count=%d) and the data dictionary (n_keys=%d). "
8081+
"This was likely caused by a crash during '%s'.",
80758082
m_tbl_def->full_tablename().c_str(), m_tbl_def->m_key_count,
8076-
n_keys);
8083+
n_keys, alter_op);
80778084

80788085
for (uint i = 0; i < table->s->keys; i++) {
80798086
const char *key_name;
80808087
assert((*table).key_info != nullptr);
80818088
assert((*table).key_info[i].name != nullptr);
80828089
key_name = (*table).key_info[i].name;
80838090

8084-
LogPluginErrMsg(
8085-
ERROR_LEVEL, 0,
8086-
"ha_rocksdb::open TABLE table_name=%s i=%d/%d key_name=%s",
8087-
table->s->table_name.str, i, table->s->keys, key_name);
8091+
LogPluginErrMsg(INFORMATION_LEVEL, 0,
8092+
"TABLE table_name=%s i=%d/%d key_name=%s",
8093+
table->s->table_name.str, i, table->s->keys, key_name);
80888094
}
80898095

80908096
uint pk = table->s->primary_key;
80918097
LogPluginErrMsg(
8092-
ERROR_LEVEL, 0, "ha_rocksdb::open TABLE PK key_name=%s",
8098+
INFORMATION_LEVEL, 0, "TABLE PK key_name=%s",
80938099
pk == MAX_INDEXES ? HIDDEN_PK_NAME : (*table).key_info[pk].name);
80948100

80958101
for (uint i = 0; i < m_tbl_def->m_key_count; i++) {
80968102
const char *rdb_name = m_tbl_def->m_key_descr_arr[i]->m_name.c_str();
8097-
LogPluginErrMsg(
8098-
ERROR_LEVEL, 0,
8099-
"ha_rocksdb::open KEY_descr_arr table_name=%s i=%d/%d key_name=%s",
8100-
m_tbl_def->full_tablename().c_str(), i, m_tbl_def->m_key_count,
8101-
rdb_name);
8103+
LogPluginErrMsg(INFORMATION_LEVEL, 0,
8104+
"KEY_descr_arr table_name=%s i=%d/%d key_name=%s",
8105+
m_tbl_def->full_tablename().c_str(), i,
8106+
m_tbl_def->m_key_count, rdb_name);
81028107
}
8108+
8109+
#if defined(ROCKSDB_INCLUDE_VALIDATE_TABLES) && ROCKSDB_INCLUDE_VALIDATE_TABLES
8110+
if (m_tbl_def->m_key_count > n_keys && rocksdb_validate_tables > 0) {
8111+
if (rocksdb_validate_tables == 1) {
8112+
LogPluginErrMsg(
8113+
ERROR_LEVEL, 0,
8114+
"Start the server with \"rocksdb_validate_tables=2\" to attempt to "
8115+
"fix the problem.");
8116+
} else {
8117+
const std::string tbl_name = m_tbl_def->full_tablename();
8118+
LogPluginErrMsg(INFORMATION_LEVEL, 0,
8119+
"Starting reconstruction of the corrupted table %s.",
8120+
m_tbl_def->full_tablename().c_str());
8121+
8122+
if (rebuild_table_def_from_table(table) != HA_EXIT_SUCCESS) {
8123+
LogPluginErrMsg(ERROR_LEVEL, 0,
8124+
"Failed to rebuild table definition for table '%s'",
8125+
tbl_name.c_str());
8126+
my_error(ER_KEY_CREATE_DURING_ALTER, MYF(0));
8127+
return HA_EXIT_FAILURE;
8128+
}
8129+
8130+
LogPluginErrMsg(
8131+
INFORMATION_LEVEL, 0,
8132+
"Reconstruction of the corrupted table %s has finished.",
8133+
m_tbl_def->full_tablename().c_str());
8134+
}
8135+
}
8136+
#endif
81038137
}
81048138

81058139
// close() above has already called free_key_buffers(). No need to do it here.
@@ -8685,8 +8719,6 @@ std::unordered_map<std::string, uint> ha_rocksdb::get_old_key_positions(
86858719
const TABLE &old_table_arg, const Rdb_tbl_def &old_tbl_def_arg) const {
86868720
DBUG_ENTER_FUNC();
86878721

8688-
std::shared_ptr<Rdb_key_def> *const old_key_descr =
8689-
old_tbl_def_arg.m_key_descr_arr;
86908722
std::unordered_map<std::string, uint> old_key_pos;
86918723
std::unordered_map<std::string, uint> new_key_pos;
86928724
uint i;
@@ -8697,7 +8729,7 @@ std::unordered_map<std::string, uint> ha_rocksdb::get_old_key_positions(
86978729

86988730
for (i = 0; i < old_tbl_def_arg.m_key_count; i++) {
86998731
if (is_hidden_pk(i, old_table_arg, old_tbl_def_arg)) {
8700-
old_key_pos[old_key_descr[i]->m_name] = i;
8732+
old_key_pos[HIDDEN_PK_NAME] = i;
87018733
continue;
87028734
}
87038735

@@ -8710,6 +8742,9 @@ std::unordered_map<std::string, uint> ha_rocksdb::get_old_key_positions(
87108742
CREATE TABLE t1 (a INT, b INT, KEY ka(a)) ENGINE=RocksDB;
87118743
ALTER TABLE t1 DROP INDEX ka, ADD INDEX ka(b), ALGORITHM=INPLACE;
87128744
*/
8745+
if (i >= old_table_arg.s->keys) {
8746+
continue;
8747+
}
87138748
const KEY *const old_key = &old_table_arg.key_info[i];
87148749
const auto &it = new_key_pos.find(old_key->name);
87158750
if (it == new_key_pos.end()) {
@@ -13019,6 +13054,40 @@ int ha_rocksdb::delete_table(Rdb_tbl_def *const tbl) {
1301913054
DBUG_RETURN(HA_EXIT_SUCCESS);
1302013055
}
1302113056

13057+
// A simplified version of ha_rocksdb::delete_table
13058+
// that leaves the table's data intact
13059+
int ha_rocksdb::delete_table_def(Rdb_tbl_def *const tbl) {
13060+
DBUG_ENTER_FUNC();
13061+
13062+
assert(tbl != nullptr);
13063+
assert(m_tbl_def == nullptr || m_tbl_def == tbl);
13064+
uint table_default_cf_id = tbl->m_key_descr_arr[0]->get_gl_index_id().cf_id;
13065+
auto local_dict_manager =
13066+
dict_manager.get_dict_manager_selector_non_const(table_default_cf_id);
13067+
const std::unique_ptr<rocksdb::WriteBatch> wb = local_dict_manager->begin();
13068+
rocksdb::WriteBatch *const batch = wb.get();
13069+
13070+
{
13071+
/*
13072+
Remove the table entry in data dictionary (this will also remove it from
13073+
the persistent data dictionary).
13074+
*/
13075+
ddl_manager.remove(tbl, batch, table_default_cf_id, true);
13076+
13077+
int err = local_dict_manager->commit(batch);
13078+
if (err) {
13079+
DBUG_RETURN(err);
13080+
}
13081+
}
13082+
13083+
// avoid dangling pointer
13084+
m_tbl_def = nullptr;
13085+
m_iterator = nullptr;
13086+
m_pk_iterator = nullptr;
13087+
inited = NONE;
13088+
DBUG_RETURN(HA_EXIT_SUCCESS);
13089+
}
13090+
1302213091
/*
1302313092
Note: the following function is called when the table is not open. That is,
1302413093
this->table==nullptr, pk_key_descr==nullptr, etc.
@@ -14402,6 +14471,84 @@ bool ha_rocksdb::prepare_inplace_alter_table(
1440214471
DBUG_RETURN(HA_EXIT_SUCCESS);
1440314472
}
1440414473

14474+
// Reconstructs a corrupted m_tbl_def (Rdb_tbl_def) by removing the
14475+
// partially added index left after an incomplete "ALTER ... ADD INDEX".
14476+
// Derived from ha_rocksdb::prepare_inplace_alter_table.
14477+
bool ha_rocksdb::rebuild_table_def_from_table(TABLE *altered_table) {
14478+
DBUG_ENTER_FUNC();
14479+
assert(altered_table != nullptr);
14480+
14481+
Rdb_tbl_def *new_tdef = nullptr;
14482+
std::shared_ptr<Rdb_key_def> *new_key_descr = nullptr;
14483+
uint new_n_keys =
14484+
altered_table->s->keys + (has_hidden_pk(*altered_table) ? 1 : 0);
14485+
const std::string tbl_name = m_tbl_def->full_tablename();
14486+
14487+
try {
14488+
new_key_descr = new std::shared_ptr<Rdb_key_def>[new_n_keys];
14489+
new_tdef = new Rdb_tbl_def(tbl_name);
14490+
} catch (const std::bad_alloc &) {
14491+
my_error(ER_OUTOFMEMORY, MYF(0));
14492+
DBUG_RETURN(HA_EXIT_FAILURE);
14493+
}
14494+
14495+
new_tdef->m_key_descr_arr = new_key_descr;
14496+
new_tdef->m_key_count = new_n_keys;
14497+
new_tdef->m_pk_index = altered_table->s->primary_key;
14498+
new_tdef->m_auto_incr_val =
14499+
m_tbl_def->m_auto_incr_val.load(std::memory_order_relaxed);
14500+
new_tdef->m_hidden_pk_val =
14501+
m_tbl_def->m_hidden_pk_val.load(std::memory_order_relaxed);
14502+
14503+
if (create_key_defs(*altered_table, *new_tdef,
14504+
"" /* actual_user_table_name */, false /* is_dd_tbl */,
14505+
table, m_tbl_def)) {
14506+
goto error;
14507+
}
14508+
14509+
// delete old table_def but keep data
14510+
if (delete_table_def(m_tbl_def) != HA_EXIT_SUCCESS) {
14511+
LogPluginErrMsg(ERROR_LEVEL, 0, "Failure when trying to drop table %s",
14512+
tbl_name.c_str());
14513+
goto error;
14514+
}
14515+
14516+
{
14517+
auto local_dict_manager =
14518+
dict_manager.get_dict_manager_selector_non_const(false);
14519+
const std::unique_ptr<rocksdb::WriteBatch> wb = local_dict_manager->begin();
14520+
rocksdb::WriteBatch *const batch = wb.get();
14521+
14522+
std::lock_guard<Rdb_dict_manager> dm_lock(*local_dict_manager);
14523+
14524+
int err = ddl_manager.put_and_write(new_tdef, batch);
14525+
if (err != HA_EXIT_SUCCESS) goto error;
14526+
14527+
err = local_dict_manager->commit(batch);
14528+
if (err != HA_EXIT_SUCCESS) goto error;
14529+
}
14530+
14531+
m_tbl_def = new_tdef;
14532+
m_key_descr_arr = m_tbl_def->m_key_descr_arr;
14533+
m_pk_descr = m_key_descr_arr[pk_index(*altered_table, *m_tbl_def)];
14534+
14535+
DBUG_RETURN(HA_EXIT_SUCCESS);
14536+
14537+
error:
14538+
if (new_tdef) {
14539+
if (new_tdef->m_key_descr_arr) {
14540+
delete[] new_tdef->m_key_descr_arr;
14541+
new_tdef->m_key_descr_arr = nullptr;
14542+
}
14543+
delete new_tdef;
14544+
} else if (new_key_descr) {
14545+
delete[] new_key_descr;
14546+
}
14547+
14548+
my_error(ER_KEY_CREATE_DURING_ALTER, MYF(0));
14549+
DBUG_RETURN(HA_EXIT_FAILURE);
14550+
}
14551+
1440514552
/**
1440614553
Alter the table structure in-place with operations specified using
1440714554
HA_ALTER_FLAGS and Alter_inplace_info. The level of concurrency allowed

storage/rocksdb/ha_rocksdb.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -861,6 +861,7 @@ class ha_rocksdb : public my_core::handler, public blob_buffer {
861861
key_range *const max_key) override
862862
MY_ATTRIBUTE((__warn_unused_result__));
863863

864+
int delete_table_def(Rdb_tbl_def *const tbl);
864865
int delete_table(Rdb_tbl_def *const tbl);
865866
int delete_table(const char *const from, const dd::Table *table_def) override
866867
MY_ATTRIBUTE((__warn_unused_result__));
@@ -900,6 +901,7 @@ class ha_rocksdb : public my_core::handler, public blob_buffer {
900901
TABLE *altered_table,
901902
my_core::Alter_inplace_info *ha_alter_info) override;
902903

904+
bool rebuild_table_def_from_table(TABLE *altered_table);
903905
bool prepare_inplace_alter_table(TABLE *altered_table,
904906
my_core::Alter_inplace_info *ha_alter_info,
905907
const dd::Table *old_table_def,

0 commit comments

Comments
 (0)