diff --git a/mysql-test/suite/innodb/r/percona_extended_check_table_debug.result b/mysql-test/suite/innodb/r/percona_extended_check_table_debug.result new file mode 100644 index 000000000000..500a7b27c391 --- /dev/null +++ b/mysql-test/suite/innodb/r/percona_extended_check_table_debug.result @@ -0,0 +1,47 @@ +# +# PS-9638 - Enable CHECK TABLE EXTENDED to detect InnoDB LOB corruptions +# +call mtr.add_suppression("\\[ERROR\\] .* Invalid record! External LOB first page cannot be shared between two records"); +call mtr.add_suppression("\\[ERROR\\] .* The external LOB first page is \\[page id: space=\\d+, page number=\\d+\\]"); +call mtr.add_suppression("\\[ERROR\\] .* The first occurence of the external LOB first page is in record : page_no: \\d+ with heap_no: \\d+"); +call mtr.add_suppression("\\[ERROR\\] .* The second occurence of the external LOB first page is in record: page_no: \\d+ with heap no: \\d+"); +call mtr.add_suppression("\\[ERROR\\] .* Apparent corruption in space \\d+ page \\d+ index `PRIMARY`"); +call mtr.add_suppression("\\[ERROR\\] .* In page \\d+ of index `PRIMARY` of table `test`.`t1`"); +call mtr.add_suppression("\\[Warning\\] .* Cannot open table test/t1Please refer to .*innodb-troubleshooting.html for how to resolve the issue."); +CREATE TABLE t1 (id INT PRIMARY KEY, make_big CHAR(200), val LONGBLOB, INDEX idx1(val(50))); +INSERT INTO t1 (id,val) VALUES (1,REPEAT('a',1000000)); +INSERT INTO t1 (id,val) VALUES (2,REPEAT('b',1000000)); +INSERT INTO t1 (id,val) VALUES (3,REPEAT('c',1000000)); +INSERT INTO t1 (id,val) VALUES (4,REPEAT('d',1000000)); +INSERT INTO t1 (id,val) VALUES (5,REPEAT('e',1000000)); +INSERT INTO t1 (id,val) VALUES (6,REPEAT('f',1000000)); +SET DEBUG='+d, simulate_lob_corruption'; +CHECK TABLE t1; +Table Op Msg_type Msg_text +test.t1 check status OK +CHECK TABLE t1 EXTENDED; +Table Op Msg_type Msg_text +test.t1 check Warning InnoDB: The B-tree of index PRIMARY is corrupted. 
+test.t1 check error Corrupt +SELECT * FROM t1; +ERROR 42S02: Table 'test.t1' doesn't exist +DROP TABLE t1; +case 2: compressed table +CREATE TABLE t1 (id INT PRIMARY KEY, make_big CHAR(200), val LONGBLOB, INDEX idx1(val(50))) ROW_FORMAT=COMPRESSED; +INSERT INTO t1 (id,val) VALUES (1,REPEAT('a',1000000)); +INSERT INTO t1 (id,val) VALUES (2,REPEAT('b',1000000)); +INSERT INTO t1 (id,val) VALUES (3,REPEAT('c',1000000)); +INSERT INTO t1 (id,val) VALUES (4,REPEAT('d',1000000)); +INSERT INTO t1 (id,val) VALUES (5,REPEAT('e',1000000)); +INSERT INTO t1 (id,val) VALUES (6,REPEAT('f',1000000)); +SET DEBUG='+d, simulate_lob_corruption'; +CHECK TABLE t1; +Table Op Msg_type Msg_text +test.t1 check status OK +CHECK TABLE t1 EXTENDED; +Table Op Msg_type Msg_text +test.t1 check Warning InnoDB: The B-tree of index PRIMARY is corrupted. +test.t1 check error Corrupt +SELECT * FROM t1; +ERROR 42S02: Table 'test.t1' doesn't exist +DROP TABLE t1; diff --git a/mysql-test/suite/innodb/t/percona_extended_check_table_debug.test b/mysql-test/suite/innodb/t/percona_extended_check_table_debug.test new file mode 100644 index 000000000000..8a41d1d857ab --- /dev/null +++ b/mysql-test/suite/innodb/t/percona_extended_check_table_debug.test @@ -0,0 +1,51 @@ +--source include/have_debug.inc + +--echo # +--echo # PS-9638 - Enable CHECK TABLE EXTENDED to detect InnoDB LOB corruptions +--echo # +call mtr.add_suppression("\\[ERROR\\] .* Invalid record! 
External LOB first page cannot be shared between two records"); +call mtr.add_suppression("\\[ERROR\\] .* The external LOB first page is \\[page id: space=\\d+, page number=\\d+\\]"); +call mtr.add_suppression("\\[ERROR\\] .* The first occurence of the external LOB first page is in record : page_no: \\d+ with heap_no: \\d+"); +call mtr.add_suppression("\\[ERROR\\] .* The second occurence of the external LOB first page is in record: page_no: \\d+ with heap no: \\d+"); +call mtr.add_suppression("\\[ERROR\\] .* Apparent corruption in space \\d+ page \\d+ index `PRIMARY`"); +call mtr.add_suppression("\\[ERROR\\] .* In page \\d+ of index `PRIMARY` of table `test`.`t1`"); +call mtr.add_suppression("\\[Warning\\] .* Cannot open table test/t1Please refer to .*innodb-troubleshooting.html for how to resolve the issue."); + +CREATE TABLE t1 (id INT PRIMARY KEY, make_big CHAR(200), val LONGBLOB, INDEX idx1(val(50))); + +INSERT INTO t1 (id,val) VALUES (1,REPEAT('a',1000000)); +INSERT INTO t1 (id,val) VALUES (2,REPEAT('b',1000000)); +INSERT INTO t1 (id,val) VALUES (3,REPEAT('c',1000000)); +INSERT INTO t1 (id,val) VALUES (4,REPEAT('d',1000000)); +INSERT INTO t1 (id,val) VALUES (5,REPEAT('e',1000000)); +INSERT INTO t1 (id,val) VALUES (6,REPEAT('f',1000000)); + +SET DEBUG='+d, simulate_lob_corruption'; +CHECK TABLE t1; + +CHECK TABLE t1 EXTENDED; + +--error ER_NO_SUCH_TABLE +SELECT * FROM t1; + +DROP TABLE t1; + +--echo case 2: compressed table +CREATE TABLE t1 (id INT PRIMARY KEY, make_big CHAR(200), val LONGBLOB, INDEX idx1(val(50))) ROW_FORMAT=COMPRESSED; + +INSERT INTO t1 (id,val) VALUES (1,REPEAT('a',1000000)); +INSERT INTO t1 (id,val) VALUES (2,REPEAT('b',1000000)); +INSERT INTO t1 (id,val) VALUES (3,REPEAT('c',1000000)); +INSERT INTO t1 (id,val) VALUES (4,REPEAT('d',1000000)); +INSERT INTO t1 (id,val) VALUES (5,REPEAT('e',1000000)); +INSERT INTO t1 (id,val) VALUES (6,REPEAT('f',1000000)); + +SET DEBUG='+d, simulate_lob_corruption'; +CHECK TABLE t1; + +CHECK TABLE t1 
EXTENDED; + +--error ER_NO_SUCH_TABLE +SELECT * FROM t1; + +DROP TABLE t1; diff --git a/storage/innobase/fsp/fsp0fsp.cc b/storage/innobase/fsp/fsp0fsp.cc index e9596fd4a35d..7330fa078735 100644 --- a/storage/innobase/fsp/fsp0fsp.cc +++ b/storage/innobase/fsp/fsp0fsp.cc @@ -3627,11 +3627,13 @@ bool fseg_page_is_free(fseg_header_t *seg_header, /*!< in: segment header */ const page_size_t page_size(space->flags); - seg_inode = fseg_inode_get(seg_header, space_id, page_size, &mtr); + if (seg_header != nullptr) { + seg_inode = fseg_inode_get(seg_header, space_id, page_size, &mtr); - ut_a(seg_inode); - ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE); - ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE)); + ut_a(seg_inode); + ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE); + ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE)); + } descr = xdes_get_descriptor(space_id, page, page_size, &mtr); ut_a(descr); diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 869908940221..b17a645faac3 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -214,6 +214,7 @@ this program; if not, write to the Free Software Foundation, Inc., #include "os0enc.h" #include "os0file.h" +#include #include #include #include @@ -19081,11 +19082,25 @@ int ha_innobase::check(THD *thd, /*!< in: user thread handle */ continue; } + /* true if user uses CHECK TABLE t1 EXTENDED */ + const bool is_extended = check_opt->flags & T_EXTEND; + if (!(check_opt->flags & T_QUICK) && !index->is_corrupted()) { /* Enlarge the fatal lock wait timeout during CHECK TABLE. 
*/ srv_fatal_semaphore_wait_extend.fetch_add(1); + if (is_extended && index->is_clustered()) { + // Setup the thread local map for clustered index only + thread_local_blob_map = new blob_ref_map(); + } + + auto blob_ref_clear_guard = create_scope_guard([]() { + if (!thread_local_blob_map) return; + delete thread_local_blob_map; + thread_local_blob_map = nullptr; + }); + bool valid = btr_validate_index(index, m_prebuilt->trx, false); /* Restore the fatal lock wait timeout after @@ -19099,7 +19114,16 @@ int ha_innobase::check(THD *thd, /*!< in: user thread handle */ "InnoDB: The B-tree of" " index %s is corrupted.", index->name()); - continue; + + // with extended mode, if clustered index is corrupted, it is marked + // as corrupted. We skip checking other indexes. The table is not + // repairable and user has to drop it + if (is_extended && index->is_clustered()) { + dict_set_corrupted(index); + break; + } else { + continue; + } } } diff --git a/storage/innobase/include/page0page.h b/storage/innobase/include/page0page.h index d0211af98c7b..c8bdb1a73443 100644 --- a/storage/innobase/include/page0page.h +++ b/storage/innobase/include/page0page.h @@ -807,6 +807,24 @@ bool page_is_spatial_non_leaf(const rec_t *rec, dict_index_t *index); page_t *page_create_low(buf_block_t *block, ulint comp, page_type_t page_type); +/** A blob map to track the first page no of external LOB and its parent record +which is the <page_no, heap_no>. This is used to find duplicate external LOB +pages that are shared between two records. This can happen only on corruption +(cause unknown yet). 
CHECK TABLE t1 EXTENDED will use this map to report +corruption and mark the table as corrupted */ +using blob_ref_map = std::unordered_map<page_no_t, std::pair<page_no_t, ulint>>; +extern thread_local blob_ref_map *thread_local_blob_map; + +/** Validate that an owned external LOB's first page is neither marked free nor +shared between two records of a clustered index +@param[in] rec physical record +@param[in] index index of the table +@param[in] offsets the record offset array +@return true if OK or not applicable; false if an owned LOB's first page is +marked free or is shared between two records */ +bool page_rec_blob_validate(const rec_t *rec, const dict_index_t *index, + const ulint *offsets); + #include "page0page.ic" #endif diff --git a/storage/innobase/page/page0page.cc b/storage/innobase/page/page0page.cc index 60f64c0a7e29..054572e32068 100644 --- a/storage/innobase/page/page0page.cc +++ b/storage/innobase/page/page0page.cc @@ -46,6 +46,14 @@ this program; if not, write to the Free Software Foundation, Inc., #include "lock0lock.h" #include "srv0srv.h" #endif /* !UNIV_HOTBACKUP */ +#include "lob0lob.h" + +/** A blob map to track the first page no of external LOB and its parent record +which is the <page_no, heap_no>. This is used to find duplicate external LOB +pages that are shared between two records. This can happen only on corruption +(cause unknown yet). 
CHECK TABLE t1 EXTENDED will use this map to report +corruption and mark the table as corrupted */ +thread_local blob_ref_map *thread_local_blob_map = nullptr; /* THE INDEX PAGE ============== @@ -1721,6 +1729,124 @@ bool page_rec_validate( return true; } +/** Validate that the external LOB's first page is not shared between records of +a clustered index +@param[in] rec physical record +@param[in] index index of the table +@param[in] offsets the record offset array +@return true If OK else false if external LOB is found to be shared between two +records, ie false on failure */ +bool page_rec_blob_validate(const rec_t *rec, const dict_index_t *index, + const ulint *offsets) { + // this means reference check is not enabled. Enabled only via + // CHECK TABLE path + if (thread_local_blob_map == nullptr) { + return true; + } + + // if index is not PRIMARY, return true + if (!index->is_clustered()) { + return true; + } + + // if page-level is not zero, return true because blob exists only on leaf + // level + const page_t *page = page_align(rec); + if (!page_is_leaf(page)) { + return true; + } + + // if rec is not user record, blobs dont exist, return true + if (!page_rec_is_user_rec(rec)) { + return true; + } + + // if rec doesn't have any external LOB, return true + if (!rec_offs_any_extern(offsets)) { + return true; + } + + // if rec is deleted marked, return true, we cannot validate the blob. 
the + // blob pages in the deleted marked records could be freed + if (rec_get_deleted_flag(rec, rec_offs_comp(offsets))) { + return true; + } + + // if rec is not the owner of the blob, we cannot validate if blob page state + // now validate that the blob first page is not marked as free from page + // bitmap + + ulint n_fields = rec_offs_n_fields(offsets); + + for (ulint i = 0; i < n_fields; i++) { + if (rec_offs_nth_extern(index, offsets, i)) { + // We do const_cast to remove constness because lob::ref_t doesn't have a + // variant that takes const record pointer + byte *field_ref = const_cast( + lob::btr_rec_get_field_ref(index, rec, offsets, i)); + + lob::ref_t ref(field_ref); + if (!ref.is_owner() || ref.is_null() || ref.is_null_relaxed() || + ref.is_being_modified()) { + continue; + } + + if (ref.length() == 0) { + // LOB purged + continue; + } + + space_id_t blob_space_id = ref.space_id(); + page_no_t blob_page_no = ref.page_no(); + + page_id_t blob_page_id(blob_space_id, blob_page_no); + bool is_free = fseg_page_is_free(nullptr, blob_space_id, blob_page_no); + if (is_free) { + // This should not be possible. A record that owns the BLOB shouldn't + // have the first page marked as free in page bitmap + ut_ad(0); + ib::error() << "Invalid record. 
The record's blob reference is marked" + << " as free although the record owns it " + << " page_no: " << page_get_page_no(page) + << " heap_no: " << page_rec_get_heap_no(rec); + ib::error() << "BLOB reference that is marked free " << blob_page_id; + + return false; + } + + DBUG_EXECUTE_IF( + "simulate_lob_corruption", + // introduce corruption after 5 external LOB entries + if (thread_local_blob_map->size() >= 5) { + // we introduce a fake entry in the map + (*thread_local_blob_map)[blob_page_no] = std::make_pair( + page_get_page_no(page) - 1, page_rec_get_heap_no(rec) - 1); + }); + + auto it = thread_local_blob_map->find(blob_page_no); + if (it == thread_local_blob_map->end()) { + (*thread_local_blob_map)[blob_page_no] = + std::make_pair(page_get_page_no(page), page_rec_get_heap_no(rec)); + } else { + auto val = it->second; + ib::error() << "Invalid record! External LOB first page cannot be " + "shared between " + "two records"; + ib::error() << "The external LOB first page is " << blob_page_id; + ib::error() << "The first occurence of the external LOB first page is " + "in record : page_no: " + << val.first << " with heap_no: " << val.second; + ib::error() << "The second occurence of the external LOB first page is " + "in record: page_no: " + << page_get_page_no(page) + << " with heap no: " << page_rec_get_heap_no(rec); + return false; + } + } + } + return true; +} + #ifndef UNIV_HOTBACKUP #ifdef UNIV_DEBUG /** Checks that the first directory slot points to the infimum record and @@ -2235,6 +2361,10 @@ bool page_validate(const page_t *page, dict_index_t *index, goto func_exit; } + if (!page_rec_blob_validate(const_cast(rec), index, offsets)) { + goto func_exit; + } + DBUG_EXECUTE_IF( "check_table_set_wrong_min_bit", if (page_rec_is_user_rec(rec) &&