From dde8a8b7c3de9ae7e4ee4177800c9192035f0cf4 Mon Sep 17 00:00:00 2001 From: Satya Bodapati Date: Fri, 14 Jun 2024 10:48:50 +0100 Subject: [PATCH] PS-9683 : Enable CHECK TABLE EXTENDED to detect InnoDB LOB corruptions https://perconadev.atlassian.net/browse/PS-9683 Problem: -------- In some customer environments, it was found that an external LOB's first page is shared between two records. This shouldn't be possible, but it can happen in rare cases. The root cause is not known yet. Using a table in this state can lead to corruption and assertion failures. Fix: --- We can detect such a scenario by scanning all records and the external LOB's first page. The EXTENDED keyword is currently ignored by InnoDB. We use it to enable the LOB checks and mark the index as corrupted if an external LOB's first page is shared between two records. A thread-local blob map is used to identify the duplicate user record that has the same external LOB page. Usage: ------ CHECK TABLE t1 EXTENDED A sample error log when such corruption is detected: 2025-04-11T10:30:28.078607Z 9 [ERROR] [MY-011825] [InnoDB] Invalid record! 
External LOB first page cannot be shared between two records 2025-04-11T10:30:28.078625Z 9 [ERROR] [MY-011825] [InnoDB] The external LOB first page is [page id: space=6, page number=347] 2025-04-11T10:30:28.078631Z 9 [ERROR] [MY-011825] [InnoDB] The first occurence of the external LOB first page is in record : page_no: 3 with heap_no: 6 2025-04-11T10:30:28.078638Z 9 [ERROR] [MY-011825] [InnoDB] The second occurence of the external LOB first page is in record: page_no: 4 with heap no: 7 2025-04-11T10:30:28.078646Z 9 [ERROR] [MY-012738] [InnoDB] Apparent corruption in space 6 page 4 index `PRIMARY` 2025-04-11T10:30:28.078663Z 9 [ERROR] [MY-013050] [InnoDB] In page 4 of index `PRIMARY` of table `test`.`t1` 2025-04-11T10:30:28.088156Z 9 [Warning] [MY-012382] [InnoDB] Cannot open table test/t1Please refer to http://dev.mysql.com/doc/refman/8.0/en/innodb-troubleshooting.html for how to resolve the issue. --- .../percona_extended_check_table_debug.result | 47 +++++++ .../t/percona_extended_check_table_debug.test | 51 +++++++ storage/innobase/fsp/fsp0fsp.cc | 10 +- storage/innobase/handler/ha_innodb.cc | 26 +++- storage/innobase/include/page0page.h | 18 +++ storage/innobase/page/page0page.cc | 130 ++++++++++++++++++ 6 files changed, 277 insertions(+), 5 deletions(-) create mode 100644 mysql-test/suite/innodb/r/percona_extended_check_table_debug.result create mode 100644 mysql-test/suite/innodb/t/percona_extended_check_table_debug.test diff --git a/mysql-test/suite/innodb/r/percona_extended_check_table_debug.result b/mysql-test/suite/innodb/r/percona_extended_check_table_debug.result new file mode 100644 index 000000000000..500a7b27c391 --- /dev/null +++ b/mysql-test/suite/innodb/r/percona_extended_check_table_debug.result @@ -0,0 +1,47 @@ +# +# PS-9638 - Enable CHECK TABLE EXTENDED to detect InnoDB LOB corruptions +# +call mtr.add_suppression("\\[ERROR\\] .* Invalid record! 
External LOB first page cannot be shared between two records"); +call mtr.add_suppression("\\[ERROR\\] .* The external LOB first page is \\[page id: space=\\d+, page number=\\d+\\]"); +call mtr.add_suppression("\\[ERROR\\] .* The first occurence of the external LOB first page is in record : page_no: \\d+ with heap_no: \\d+"); +call mtr.add_suppression("\\[ERROR\\] .* The second occurence of the external LOB first page is in record: page_no: \\d+ with heap no: \\d+"); +call mtr.add_suppression("\\[ERROR\\] .* Apparent corruption in space \\d+ page \\d+ index `PRIMARY`"); +call mtr.add_suppression("\\[ERROR\\] .* In page \\d+ of index `PRIMARY` of table `test`.`t1`"); +call mtr.add_suppression("\\[Warning\\] .* Cannot open table test/t1Please refer to .*innodb-troubleshooting.html for how to resolve the issue."); +CREATE TABLE t1 (id INT PRIMARY KEY, make_big CHAR(200), val LONGBLOB, INDEX idx1(val(50))); +INSERT INTO t1 (id,val) VALUES (1,REPEAT('a',1000000)); +INSERT INTO t1 (id,val) VALUES (2,REPEAT('b',1000000)); +INSERT INTO t1 (id,val) VALUES (3,REPEAT('c',1000000)); +INSERT INTO t1 (id,val) VALUES (4,REPEAT('d',1000000)); +INSERT INTO t1 (id,val) VALUES (5,REPEAT('e',1000000)); +INSERT INTO t1 (id,val) VALUES (6,REPEAT('f',1000000)); +SET DEBUG='+d, simulate_lob_corruption'; +CHECK TABLE t1; +Table Op Msg_type Msg_text +test.t1 check status OK +CHECK TABLE t1 EXTENDED; +Table Op Msg_type Msg_text +test.t1 check Warning InnoDB: The B-tree of index PRIMARY is corrupted. 
+test.t1 check error Corrupt +SELECT * FROM t1; +ERROR 42S02: Table 'test.t1' doesn't exist +DROP TABLE t1; +case 2: compressed table +CREATE TABLE t1 (id INT PRIMARY KEY, make_big CHAR(200), val LONGBLOB, INDEX idx1(val(50))) ROW_FORMAT=COMPRESSED; +INSERT INTO t1 (id,val) VALUES (1,REPEAT('a',1000000)); +INSERT INTO t1 (id,val) VALUES (2,REPEAT('b',1000000)); +INSERT INTO t1 (id,val) VALUES (3,REPEAT('c',1000000)); +INSERT INTO t1 (id,val) VALUES (4,REPEAT('d',1000000)); +INSERT INTO t1 (id,val) VALUES (5,REPEAT('e',1000000)); +INSERT INTO t1 (id,val) VALUES (6,REPEAT('f',1000000)); +SET DEBUG='+d, simulate_lob_corruption'; +CHECK TABLE t1; +Table Op Msg_type Msg_text +test.t1 check status OK +CHECK TABLE t1 EXTENDED; +Table Op Msg_type Msg_text +test.t1 check Warning InnoDB: The B-tree of index PRIMARY is corrupted. +test.t1 check error Corrupt +SELECT * FROM t1; +ERROR 42S02: Table 'test.t1' doesn't exist +DROP TABLE t1; diff --git a/mysql-test/suite/innodb/t/percona_extended_check_table_debug.test b/mysql-test/suite/innodb/t/percona_extended_check_table_debug.test new file mode 100644 index 000000000000..8a41d1d857ab --- /dev/null +++ b/mysql-test/suite/innodb/t/percona_extended_check_table_debug.test @@ -0,0 +1,51 @@ +--source include/have_debug.inc + +--echo # +--echo # PS-9638 - Enable CHECK TABLE EXTENDED to detect InnoDB LOB corruptions +--echo # +call mtr.add_suppression("\\[ERROR\\] .* Invalid record! 
External LOB first page cannot be shared between two records"); +call mtr.add_suppression("\\[ERROR\\] .* The external LOB first page is \\[page id: space=\\d+, page number=\\d+\\]"); +call mtr.add_suppression("\\[ERROR\\] .* The first occurence of the external LOB first page is in record : page_no: \\d+ with heap_no: \\d+"); +call mtr.add_suppression("\\[ERROR\\] .* The second occurence of the external LOB first page is in record: page_no: \\d+ with heap no: \\d+"); +call mtr.add_suppression("\\[ERROR\\] .* Apparent corruption in space \\d+ page \\d+ index `PRIMARY`"); +call mtr.add_suppression("\\[ERROR\\] .* In page \\d+ of index `PRIMARY` of table `test`.`t1`"); +call mtr.add_suppression("\\[Warning\\] .* Cannot open table test/t1Please refer to .*innodb-troubleshooting.html for how to resolve the issue."); + +CREATE TABLE t1 (id INT PRIMARY KEY, make_big CHAR(200), val LONGBLOB, INDEX idx1(val(50))); + +INSERT INTO t1 (id,val) VALUES (1,REPEAT('a',1000000)); +INSERT INTO t1 (id,val) VALUES (2,REPEAT('b',1000000)); +INSERT INTO t1 (id,val) VALUES (3,REPEAT('c',1000000)); +INSERT INTO t1 (id,val) VALUES (4,REPEAT('d',1000000)); +INSERT INTO t1 (id,val) VALUES (5,REPEAT('e',1000000)); +INSERT INTO t1 (id,val) VALUES (6,REPEAT('f',1000000)); + +SET DEBUG='+d, simulate_lob_corruption'; +CHECK TABLE t1; + +CHECK TABLE t1 EXTENDED; + +--error ER_NO_SUCH_TABLE +SELECT * FROM t1; + +DROP TABLE t1; + +--echo case 2: compressed table +CREATE TABLE t1 (id INT PRIMARY KEY, make_big CHAR(200), val LONGBLOB, INDEX idx1(val(50))) ROW_FORMAT=COMPRESSED; + +INSERT INTO t1 (id,val) VALUES (1,REPEAT('a',1000000)); +INSERT INTO t1 (id,val) VALUES (2,REPEAT('b',1000000)); +INSERT INTO t1 (id,val) VALUES (3,REPEAT('c',1000000)); +INSERT INTO t1 (id,val) VALUES (4,REPEAT('d',1000000)); +INSERT INTO t1 (id,val) VALUES (5,REPEAT('e',1000000)); +INSERT INTO t1 (id,val) VALUES (6,REPEAT('f',1000000)); + +SET DEBUG='+d, simulate_lob_corruption'; +CHECK TABLE t1; + +CHECK TABLE t1 
EXTENDED; + +--error ER_NO_SUCH_TABLE +SELECT * FROM t1; + +DROP TABLE t1; diff --git a/storage/innobase/fsp/fsp0fsp.cc b/storage/innobase/fsp/fsp0fsp.cc index 8fcb0cfdb37e..d9ef042cbcae 100644 --- a/storage/innobase/fsp/fsp0fsp.cc +++ b/storage/innobase/fsp/fsp0fsp.cc @@ -3566,11 +3566,13 @@ bool fseg_page_is_free(fseg_header_t *seg_header, /*!< in: segment header */ const page_size_t page_size(space->flags); - seg_inode = fseg_inode_get(seg_header, space_id, page_size, &mtr); + if (seg_header != nullptr) { + seg_inode = fseg_inode_get(seg_header, space_id, page_size, &mtr); - ut_a(seg_inode); - ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE); - ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE)); + ut_a(seg_inode); + ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE); + ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE)); + } descr = xdes_get_descriptor(space_id, page, page_size, &mtr); ut_a(descr); diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 2a5a7cb71f73..0c2b08bf0ada 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -206,6 +206,7 @@ this program; if not, write to the Free Software Foundation, Inc., #include "os0enc.h" #include "os0file.h" +#include #include #include #include @@ -18907,11 +18908,25 @@ int ha_innobase::check(THD *thd, /*!< in: user thread handle */ continue; } + /* true if user uses CHECK TABLE t1 EXTENDED */ + const bool is_extended = check_opt->flags & T_EXTEND; + if (!(check_opt->flags & T_QUICK) && !index->is_corrupted()) { /* Enlarge the fatal lock wait timeout during CHECK TABLE. 
*/ srv_fatal_semaphore_wait_extend.fetch_add(1); + if (is_extended && index->is_clustered()) { + // Setup the thread local map for clustered index only + thread_local_blob_map = new blob_ref_map(); + } + + auto blob_ref_clear_guard = create_scope_guard([]() { + if (!thread_local_blob_map) return; + delete thread_local_blob_map; + thread_local_blob_map = nullptr; + }); + bool valid = btr_validate_index(index, m_prebuilt->trx, false); /* Restore the fatal lock wait timeout after @@ -18925,7 +18940,16 @@ int ha_innobase::check(THD *thd, /*!< in: user thread handle */ "InnoDB: The B-tree of" " index %s is corrupted.", index->name()); - continue; + + // with extended mode, if clustered index is corrupted, it is marked + // as corrupted. We skip checking other indexes. The table is not + // repairable and user has to drop it + if (is_extended && index->is_clustered()) { + dict_set_corrupted(index); + break; + } else { + continue; + } } } diff --git a/storage/innobase/include/page0page.h b/storage/innobase/include/page0page.h index ac8e0d60fe03..f5c84092cd77 100644 --- a/storage/innobase/include/page0page.h +++ b/storage/innobase/include/page0page.h @@ -802,6 +802,24 @@ param[in] index index @return true if ok */ bool page_is_spatial_non_leaf(const rec_t *rec, dict_index_t *index); +/** A blob map to track the first page no of external LOB and its parent record +which is the <page_no, heap_no>. This is used to find duplicate external LOB +pages that are shared between two records. This can happen only on corruption +(cause unknown yet). 
CHECK TABLE t1 EXTENDED will use this map to report
+corruption and mark the table as corrupted */
+using blob_ref_map = std::unordered_map<page_no_t, std::pair<page_no_t, ulint>>;
+extern thread_local blob_ref_map *thread_local_blob_map;
+
+/** Validate that the external LOB's first page is not shared between records of
+a clustered index
+@param[in] rec physical record
+@param[in] index index of the table
+@param[in] offsets the record offset array
+@return true If OK else false if external LOB is found to be shared between two
+records, ie false on failure */
+bool page_rec_blob_validate(const rec_t *rec, const dict_index_t *index,
+                            const ulint *offsets);
+
 #include "page0page.ic"
 
 #endif
diff --git a/storage/innobase/page/page0page.cc b/storage/innobase/page/page0page.cc
index 39a171b7c431..02eb3aaade11 100644
--- a/storage/innobase/page/page0page.cc
+++ b/storage/innobase/page/page0page.cc
@@ -46,6 +46,14 @@ this program; if not, write to the Free Software Foundation, Inc.,
 #include "lock0lock.h"
 #include "srv0srv.h"
 #endif /* !UNIV_HOTBACKUP */
+#include "lob0lob.h"
+
+/** A blob map to track the first page no of external LOB and its parent record
+which is the <page_no, heap_no>. This is used to find duplicate external LOB
+pages that are shared between two records. This can happen only on corruption
+(cause unknown yet). 
CHECK TABLE t1 EXTENDED will use this map to report
+corruption and mark the table as corrupted */
+thread_local blob_ref_map *thread_local_blob_map = nullptr;
 
 /* THE INDEX PAGE
 ==============
@@ -1721,6 +1729,124 @@ bool page_rec_validate(
   return true;
 }
 
+/** Validate that the external LOB's first page is not shared between records of
+a clustered index. A no-op (returns true) unless thread_local_blob_map is set.
+@param[in] rec physical record
+@param[in] index index of the table
+@param[in] offsets the record offset array
+@return true if OK; false if an owned LOB's first page is marked free or is
+already referenced by another record */
+bool page_rec_blob_validate(const rec_t *rec, const dict_index_t *index,
+                            const ulint *offsets) {
+  // this means reference check is not enabled. Enabled only via
+  // CHECK TABLE path
+  if (thread_local_blob_map == nullptr) {
+    return true;
+  }
+
+  // if index is not PRIMARY, return true
+  if (!index->is_clustered()) {
+    return true;
+  }
+
+  // if page-level is not zero, return true because blob exists only on leaf
+  // level
+  const page_t *page = page_align(rec);
+  if (!page_is_leaf(page)) {
+    return true;
+  }
+
+  // if rec is not user record, blobs dont exist, return true
+  if (!page_rec_is_user_rec(rec)) {
+    return true;
+  }
+
+  // if rec doesn't have any external LOB, return true
+  if (!rec_offs_any_extern(offsets)) {
+    return true;
+  }
+
+  // if rec is delete-marked, return true: we cannot validate the blob because
+  // the blob pages of delete-marked records could already be freed
+  if (rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
+    return true;
+  }
+
+  // Only a record that owns its LOB is checked below. For each owned LOB,
+  // first verify that its first page is not marked free in the page bitmap,
+  // then verify that no earlier record already references the same page.
+
+  ulint n_fields = rec_offs_n_fields(offsets);
+
+  for (ulint i = 0; i < n_fields; i++) {
+    if (rec_offs_nth_extern(index, offsets, i)) {
+      // We do const_cast to remove constness because lob::ref_t doesn't have a
+      // variant that takes const record pointer
+      byte *field_ref = const_cast<byte *>(
+          lob::btr_rec_get_field_ref(index, rec, offsets, i));
+
+      lob::ref_t ref(field_ref);
+      if (!ref.is_owner() || ref.is_null() || ref.is_null_relaxed() ||
+          ref.is_being_modified()) {
+        continue;
+      }
+
+      if (ref.length() == 0) {
+        // LOB purged
+        continue;
+      }
+
+      space_id_t blob_space_id = ref.space_id();
+      page_no_t blob_page_no = ref.page_no();
+
+      page_id_t blob_page_id(blob_space_id, blob_page_no);
+      bool is_free = fseg_page_is_free(nullptr, blob_space_id, blob_page_no);
+      if (is_free) {
+        // This should not be possible. A record that owns the BLOB shouldn't
+        // have the first page marked as free in page bitmap
+        ib::error() << "Invalid record. The record's blob reference is marked"
+                    << " as free although the record owns it "
+                    << " page_no: " << page_get_page_no(page)
+                    << " heap_no: " << page_rec_get_heap_no(rec);
+        ib::error() << "BLOB reference that is marked free " << blob_page_id;
+        ut_ad(0);
+
+        return false;
+      }
+
+      DBUG_EXECUTE_IF(
+          "simulate_lob_corruption",
+          // introduce corruption after 5 external LOB entries
+          if (thread_local_blob_map->size() >= 5) {
+            // we introduce a fake entry in the map
+            (*thread_local_blob_map)[blob_page_no] = std::make_pair(
+                page_get_page_no(page) - 1, page_rec_get_heap_no(rec) - 1);
+          });
+
+      auto it = thread_local_blob_map->find(blob_page_no);
+      if (it == thread_local_blob_map->end()) {
+        (*thread_local_blob_map)[blob_page_no] =
+            std::make_pair(page_get_page_no(page), page_rec_get_heap_no(rec));
+      } else {
+        const auto &val = it->second;
+        ib::error() << "Invalid record! External LOB first page cannot be "
+                       "shared between "
+                       "two records";
+        ib::error() << "The external LOB first page is " << blob_page_id;
+        ib::error() << "The first occurence of the external LOB first page is "
+                       "in record : page_no: "
+                    << val.first << " with heap_no: " << val.second;
+        ib::error() << "The second occurence of the external LOB first page is "
+                       "in record: page_no: "
+                    << page_get_page_no(page)
+                    << " with heap no: " << page_rec_get_heap_no(rec);
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
 #ifndef UNIV_HOTBACKUP
 #ifdef UNIV_DEBUG
 /** Checks that the first directory slot points to the infimum record and
@@ -2234,6 +2360,10 @@ bool page_validate(const page_t *page, dict_index_t *index) {
       goto func_exit;
     }
 
+    if (!page_rec_blob_validate(rec, index, offsets)) {
+      goto func_exit;
+    }
+
     DBUG_EXECUTE_IF(
         "check_table_set_wrong_min_bit",
         if (page_rec_is_user_rec(rec) &&