From b6eda03c1dfe73eb4cdbf3db798660dd50fb0ccb Mon Sep 17 00:00:00 2001 From: Roshan Jobanputra Date: Mon, 16 Sep 2024 16:43:45 -0400 Subject: [PATCH 01/52] Opt-in catalog migration for converting subsources to source tables --- src/adapter-types/src/dyncfgs.rs | 8 ++ src/adapter/src/catalog/migrate.rs | 166 ++++++++++++++++++++++++++++- 2 files changed, 169 insertions(+), 5 deletions(-) diff --git a/src/adapter-types/src/dyncfgs.rs b/src/adapter-types/src/dyncfgs.rs index 795a342f284a4..1ddd317c835fb 100644 --- a/src/adapter-types/src/dyncfgs.rs +++ b/src/adapter-types/src/dyncfgs.rs @@ -111,6 +111,13 @@ pub const ENABLE_EXPRESSION_CACHE: Config = Config::new( "Use a cache to store optimized expressions to help speed up start times.", ); +/// Whether to enable the migration to convert all sources to use tables. +pub const ENABLE_SOURCE_TABLE_MIGRATION: Config = Config::new( + "enable_source_table_migration", + false, + "Whether to enable the migration to convert all sources to use tables.", +); + /// Adds the full set of all compute `Config`s. pub fn all_dyncfgs(configs: ConfigSet) -> ConfigSet { configs @@ -128,4 +135,5 @@ pub fn all_dyncfgs(configs: ConfigSet) -> ConfigSet { .add(&DEFAULT_SINK_PARTITION_STRATEGY) .add(&ENABLE_CONTINUAL_TASK_BUILTINS) .add(&ENABLE_EXPRESSION_CACHE) + .add(&ENABLE_SOURCE_TABLE_MIGRATION) } diff --git a/src/adapter/src/catalog/migrate.rs b/src/adapter/src/catalog/migrate.rs index c62d768a838e6..888824f0d389a 100644 --- a/src/adapter/src/catalog/migrate.rs +++ b/src/adapter/src/catalog/migrate.rs @@ -10,7 +10,7 @@ use std::collections::BTreeMap; use mz_catalog::builtin::BuiltinTable; -use mz_catalog::durable::Transaction; +use mz_catalog::durable::{Transaction, Item}; use mz_catalog::memory::objects::StateUpdate; use mz_ore::collections::CollectionExt; use mz_ore::now::NowFn; @@ -31,13 +31,23 @@ where &'a mut Transaction<'_>, CatalogItemId, &'a mut Statement, + &'a Vec<(Item, Statement)>, ) -> Result<(), anyhow::Error>, { let mut updated_items = BTreeMap::new(); + let items_with_statements = tx + .get_items() + .map(|item| { + let stmt = mz_sql::parse::parse(&item.create_sql)?.into_element().ast; + Ok((item, stmt)) + }) + .collect::, anyhow::Error>>()?; - for mut item in tx.get_items() { + // Clone this vec to be referenced within the closure if needed + let items_with_statements_ref = items_with_statements.clone(); + for (mut item, mut stmt) in items_with_statements { let mut stmt = mz_sql::parse::parse(&item.create_sql)?.into_element().ast; - f(tx, item.id, &mut stmt)?; + f(tx, item.id, &mut stmt, &items_with_statements_ref)?; item.create_sql = stmt.to_ast_string_stable(); @@ -97,7 +107,7 @@ pub(crate) async fn migrate( catalog_version ); - rewrite_ast_items(tx, |_tx, _id, _stmt| { + rewrite_ast_items(tx, |tx, _id, stmt, all_items_and_statements| { // Add per-item AST migrations below. // // Each migration should be a function that takes `stmt` (the AST @@ -107,7 +117,21 @@ pub(crate) async fn migrate( // Migration functions may also take `tx` as input to stage // arbitrary changes to the catalog. - Ok(()) + // Special block for `ast_rewrite_sources_to_tables` migration + // since it requires a feature flag. 
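+ // (The `enable_source_table_migration` dyncfg added in this patch defaults to
+ // `false`, so this rewrite only runs when the flag is explicitly enabled.)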
+ if let Some(config_val) = tx.get_system_config("enable_source_table_migration") { + let enable_migration = config_val.parse::().map_err(|e| { + anyhow::anyhow!( + "could not parse enable_source_table_migration config value: {}", + e + ) + })?; + if enable_migration { + info!("migrate: enable_source_table_migration"); + ast_rewrite_sources_to_tables(stmt, all_items_and_statements)?; + } + } + Ok(()) })?; // Load items into catalog. We make sure to consolidate the old updates with the new updates to @@ -182,6 +206,138 @@ pub(crate) async fn migrate( // Please include the adapter team on any code reviews that add or edit // migrations. +/// Migrates all sources to use the new sources as tables model +/// +/// First we migrate existing `CREATE SUBSOURCE` statements, turning them into +/// `CREATE TABLE .. FROM SOURCE` statements. This covers existing Postgres, +/// MySQL, and multi-output (tpch, auction, marketing) load-generator subsources. +/// +/// Second we migrate existing `CREATE SOURCE` statements for these multi-output +/// sources to remove their subsource qualification option (e.g. FOR ALL TABLES..) +/// and any subsource-specific options (e.g. TEXT COLUMNS). +/// +/// Third we migrate existing `CREATE SOURCE` statements that refer to a single +/// output collection. This includes existing Kafka and single-output load-generator +/// subsources. These statements will generate an additional `CREATE TABLE .. FROM SOURCE` +/// statement that copies over all the export-specific options. This statement will be tied +/// to the existing statement's persist collection, but using a new GlobalID. The original +/// source statement will be updated to remove the export-specific options and will be +/// renamed to `_source` while keeping its same GlobalId. +/// +fn ast_rewrite_sources_to_tables( + stmt: &mut Statement, + all_items_and_statements: &Vec<(Item, Statement)>, +) -> Result<(), anyhow::Error> { + use mz_sql::ast::{ + CreateSubsourceOptionName, CreateTableFromSourceStatement, RawItemName, + TableFromSourceOption, WithOptionValue, + }; + + // Since some subsources have named-only references to their `of_source` and some have the + // global_id of their source, we first generate a reference mapping from all source names to + // items so we can correct the references in the `CREATE TABLE .. FROM SOURCE` + // statements to always use the ID of the source. + let source_name_to_item: BTreeMap<_, _> = all_items_and_statements + .iter() + .filter_map(|(item, statement)| match statement { + Statement::CreateSource(stmt) => Some((stmt.name.clone(), item)), + _ => None, + }) + .collect(); + + match stmt { + // Migrate each 'source export' `CREATE SUBSOURCE` statement to an equivalent + // `CREATE TABLE .. FROM SOURCE` statement. Note that we will not be migrating + // 'progress' `CREATE SUBSOURCE` statements as they are not currently relevant + // to the new table model. + Statement::CreateSubsource(subsource) if subsource.of_source.is_some() => { + let raw_source_name = subsource.of_source.as_ref().unwrap(); + let source = match raw_source_name { + RawItemName::Name(name) => { + // Convert the name reference to an ID reference. + let source_item = source_name_to_item.get(name).expect("source must exist"); + RawItemName::Id(source_item.id.to_string(), name.clone(), None) + } + RawItemName::Id(..) => raw_source_name.clone(), + }; + + // The external reference is a `with_option` on subsource statements but is a + // separate field on table statements. 
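+ // For example, a subsource created WITH (EXTERNAL REFERENCE = postgres.public.t1)
+ // carries that reference over to the new table's `external_reference` field.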
+ let external_reference_option = subsource + .with_options + .iter() + .find(|o| o.name == CreateSubsourceOptionName::ExternalReference) + .expect("subsources must have external reference"); + let external_reference = match &external_reference_option.value { + Some(WithOptionValue::UnresolvedItemName(name)) => name, + _ => unreachable!("external reference must be an unresolved item name"), + }; + + // The `details` option on both subsources and tables is identical, using the same + // ProtoSourceExportStatementDetails serialized value. + let existing_details = subsource + .with_options + .iter() + .find(|o| o.name == CreateSubsourceOptionName::Details) + .expect("subsources must have details"); + + let mut with_options = vec![TableFromSourceOption { + name: mz_sql::ast::TableFromSourceOptionName::Details, + value: existing_details.value.clone(), + }]; + + // TEXT COLUMNS and EXCLUDE COLUMNS options are stored on each subsource and can be copied + // directly to the table statement. Note that we also store a 'normalized' version of the + // text columns and exclude columns in the `with_options` field of the primary source statement + // for legacy reasons, but that is redundant info and will be removed below when we migrate + // the source statements. + if let Some(text_cols_option) = subsource + .with_options + .iter() + .find(|option| option.name == CreateSubsourceOptionName::TextColumns) + { + with_options.push(TableFromSourceOption { + name: mz_sql::ast::TableFromSourceOptionName::TextColumns, + value: text_cols_option.value.clone(), + }); + } + if let Some(exclude_cols_option) = subsource + .with_options + .iter() + .find(|option| option.name == CreateSubsourceOptionName::ExcludeColumns) + { + with_options.push(TableFromSourceOption { + name: mz_sql::ast::TableFromSourceOptionName::ExcludeColumns, + value: exclude_cols_option.value.clone(), + }); + } + + let table = CreateTableFromSourceStatement { + name: subsource.name.clone(), + constraints: subsource.constraints.clone(), + columns: mz_sql::ast::TableFromSourceColumns::Defined(subsource.columns.clone()), + if_not_exists: subsource.if_not_exists, + source, + external_reference: Some(external_reference.clone()), + with_options, + // Subsources don't have `envelope`, `include_metadata`, or `format` options. + envelope: None, + include_metadata: vec![], + format: None, + }; + + info!( + "migrate: converted subsource {:?} to table {:?}", + subsource, table + ); + *stmt = Statement::CreateTableFromSource(table); + } + _ => (), + } + + Ok(()) +} + // Durable migrations /// Migrations that run only on the durable catalog before any data is loaded into memory. 
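For intuition, the rewrite added in this patch converts a catalog entry pair roughly like the one below, so that the subsource becomes a table reading from the existing source (identifiers, column lists, and option rendering are illustrative rather than exact catalog SQL):

    CREATE SOURCE pg_src FROM POSTGRES CONNECTION pg_conn (PUBLICATION 'mz_source');
    CREATE SUBSOURCE t1 (a int4, b text) OF SOURCE pg_src
        WITH (EXTERNAL REFERENCE = postgres.public.t1, DETAILS = '...');

    -- the subsource is rewritten to --

    CREATE TABLE t1 (a int4, b text) FROM SOURCE pg_src (REFERENCE postgres.public.t1)
        WITH (DETAILS = '...');

The `CREATE SOURCE` statement itself is not touched by this commit; the follow-up commits below strip its now-redundant subsource options and handle single-output sources.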
From 7d9bc2601b15acacd3254324f6e83abf0ca4596d Mon Sep 17 00:00:00 2001 From: Roshan Jobanputra Date: Wed, 23 Oct 2024 12:57:26 -0400 Subject: [PATCH 02/52] Add migration logic for source statements --- src/adapter/src/catalog/migrate.rs | 295 ++++++++++++++++++++++++++--- 1 file changed, 273 insertions(+), 22 deletions(-) diff --git a/src/adapter/src/catalog/migrate.rs b/src/adapter/src/catalog/migrate.rs index 888824f0d389a..acf7fa6b5d47d 100644 --- a/src/adapter/src/catalog/migrate.rs +++ b/src/adapter/src/catalog/migrate.rs @@ -29,7 +29,7 @@ fn rewrite_ast_items(tx: &mut Transaction<'_>, mut f: F) -> Result<(), anyhow where F: for<'a> FnMut( &'a mut Transaction<'_>, - CatalogItemId, + &'a mut Item, &'a mut Statement, &'a Vec<(Item, Statement)>, ) -> Result<(), anyhow::Error>, @@ -47,7 +47,7 @@ where let items_with_statements_ref = items_with_statements.clone(); for (mut item, mut stmt) in items_with_statements { let mut stmt = mz_sql::parse::parse(&item.create_sql)?.into_element().ast; - f(tx, item.id, &mut stmt, &items_with_statements_ref)?; + f(tx, &mut item, &mut stmt, &items_with_statements_ref)?; item.create_sql = stmt.to_ast_string_stable(); @@ -107,7 +107,7 @@ pub(crate) async fn migrate( catalog_version ); - rewrite_ast_items(tx, |tx, _id, stmt, all_items_and_statements| { + rewrite_ast_items(tx, |tx, item, stmt, all_items_and_statements| { // Add per-item AST migrations below. // // Each migration should be a function that takes `stmt` (the AST @@ -128,7 +128,7 @@ pub(crate) async fn migrate( })?; if enable_migration { info!("migrate: enable_source_table_migration"); - ast_rewrite_sources_to_tables(stmt, all_items_and_statements)?; + ast_rewrite_sources_to_tables(tx, item, stmt, all_items_and_statements)?; } } Ok(()) @@ -225,18 +225,30 @@ pub(crate) async fn migrate( /// renamed to `_source` while keeping its same GlobalId. /// fn ast_rewrite_sources_to_tables( + tx: &mut Transaction<'_>, + item: &mut Item, stmt: &mut Statement, all_items_and_statements: &Vec<(Item, Statement)>, ) -> Result<(), anyhow::Error> { + use maplit::btreemap; + use maplit::btreeset; + use mz_persist_types::ShardId; + use mz_proto::RustType; use mz_sql::ast::{ - CreateSubsourceOptionName, CreateTableFromSourceStatement, RawItemName, - TableFromSourceOption, WithOptionValue, + CreateSourceConnection, CreateSourceOptionName, CreateSourceStatement, + CreateSubsourceOptionName, CreateTableFromSourceStatement, Ident, LoadGenerator, + MySqlConfigOptionName, PgConfigOptionName, RawItemName, TableFromSourceColumns, + TableFromSourceOption, TableFromSourceOptionName, Value, WithOptionValue, }; + use mz_storage_client::controller::StorageTxn; + use mz_storage_types::sources::load_generator::LoadGeneratorOutput; + use mz_storage_types::sources::SourceExportStatementDetails; + use prost::Message; // Since some subsources have named-only references to their `of_source` and some have the // global_id of their source, we first generate a reference mapping from all source names to - // items so we can correct the references in the `CREATE TABLE .. FROM SOURCE` - // statements to always use the ID of the source. + // items so we can ensure we always use an ID-based reference in the stored + // `CREATE TABLE .. FROM SOURCE` statements. let source_name_to_item: BTreeMap<_, _> = all_items_and_statements .iter() .filter_map(|(item, statement)| match statement { @@ -245,13 +257,34 @@ fn ast_rewrite_sources_to_tables( }) .collect(); + // Collect any existing `CREATE TABLE .. 
FROM SOURCE` statements by the source they reference. + let tables_by_source: BTreeMap<_, Vec<_>> = all_items_and_statements + .iter() + .filter_map(|(_, statement)| match statement { + Statement::CreateTableFromSource(stmt) => { + let source: GlobalId = match &stmt.source { + RawItemName::Name(_) => unreachable!("tables store source as ID"), + RawItemName::Id(id, _, _) => id.parse().expect("valid id"), + }; + Some((source, stmt)) + } + _ => None, + }) + .fold(BTreeMap::new(), |mut acc, (source, stmt)| { + acc.entry(source).or_insert_with(Vec::new).push(stmt); + acc + }); + match stmt { - // Migrate each 'source export' `CREATE SUBSOURCE` statement to an equivalent - // `CREATE TABLE .. FROM SOURCE` statement. Note that we will not be migrating - // 'progress' `CREATE SUBSOURCE` statements as they are not currently relevant - // to the new table model. - Statement::CreateSubsource(subsource) if subsource.of_source.is_some() => { - let raw_source_name = subsource.of_source.as_ref().unwrap(); + // Migrate each `CREATE SUBSOURCE` statement to an equivalent + // `CREATE TABLE .. FROM SOURCE` statement. + Statement::CreateSubsource(subsource) => { + let raw_source_name = match &subsource.of_source { + // We will not be migrating 'progress' `CREATE SUBSOURCE` statements as they + // are not currently relevant to the new table model. + None => return Ok(()), + Some(name) => name, + }; let source = match raw_source_name { RawItemName::Name(name) => { // Convert the name reference to an ID reference. @@ -282,22 +315,19 @@ fn ast_rewrite_sources_to_tables( .expect("subsources must have details"); let mut with_options = vec![TableFromSourceOption { - name: mz_sql::ast::TableFromSourceOptionName::Details, + name: TableFromSourceOptionName::Details, value: existing_details.value.clone(), }]; // TEXT COLUMNS and EXCLUDE COLUMNS options are stored on each subsource and can be copied - // directly to the table statement. Note that we also store a 'normalized' version of the - // text columns and exclude columns in the `with_options` field of the primary source statement - // for legacy reasons, but that is redundant info and will be removed below when we migrate - // the source statements. + // directly to the table statement. if let Some(text_cols_option) = subsource .with_options .iter() .find(|option| option.name == CreateSubsourceOptionName::TextColumns) { with_options.push(TableFromSourceOption { - name: mz_sql::ast::TableFromSourceOptionName::TextColumns, + name: TableFromSourceOptionName::TextColumns, value: text_cols_option.value.clone(), }); } @@ -307,7 +337,7 @@ fn ast_rewrite_sources_to_tables( .find(|option| option.name == CreateSubsourceOptionName::ExcludeColumns) { with_options.push(TableFromSourceOption { - name: mz_sql::ast::TableFromSourceOptionName::ExcludeColumns, + name: TableFromSourceOptionName::ExcludeColumns, value: exclude_cols_option.value.clone(), }); } @@ -327,11 +357,232 @@ fn ast_rewrite_sources_to_tables( }; info!( - "migrate: converted subsource {:?} to table {:?}", + "migrate: converted subsource {} to table {}", subsource, table ); *stmt = Statement::CreateTableFromSource(table); } + + // Postgres sources are multi-output sources whose subsources are + // migrated above. All we need to do is remove the subsource-related + // options from this statement since they are no longer relevant. + Statement::CreateSource(CreateSourceStatement { + connection: + CreateSourceConnection::Postgres { options, .. } + | CreateSourceConnection::Yugabyte { options, .. }, + .. 
+ }) => { + // This option storing text columns on the primary source statement is redundant + // with the option on subsource statements so can just be removed. + // This was kept for round-tripping of `CREATE SOURCE` statements that automatically + // generated subsources, which is no longer necessary. + if options + .iter() + .any(|o| matches!(o.name, PgConfigOptionName::TextColumns)) + { + options.retain(|o| !matches!(o.name, PgConfigOptionName::TextColumns)); + info!("migrate: converted postgres source {stmt} to remove subsource options"); + } + } + // MySQL sources are multi-output sources whose subsources are + // migrated above. All we need to do is remove the subsource-related + // options from this statement since they are no longer relevant. + Statement::CreateSource(CreateSourceStatement { + connection: CreateSourceConnection::MySql { options, .. }, + .. + }) => { + // These options storing text and exclude columns on the primary source statement + // are redundant with the options on subsource statements so can just be removed. + // They was kept for round-tripping of `CREATE SOURCE` statements that automatically + // generated subsources, which is no longer necessary. + if options.iter().any(|o| { + matches!( + o.name, + MySqlConfigOptionName::TextColumns | MySqlConfigOptionName::ExcludeColumns + ) + }) { + options.retain(|o| { + !matches!( + o.name, + MySqlConfigOptionName::TextColumns | MySqlConfigOptionName::ExcludeColumns + ) + }); + info!("migrate: converted mysql source {stmt} to remove subsource options"); + } + } + // Multi-output load generator sources whose subsources are already + // migrated above. There is no need to remove any options from this + // statement since they are not export-specific. + Statement::CreateSource(CreateSourceStatement { + connection: + CreateSourceConnection::LoadGenerator { + generator: + LoadGenerator::Auction | LoadGenerator::Marketing | LoadGenerator::Tpch, + .. + }, + .. + }) => {} + // Single-output sources that need to be migrated to tables. These sources currently output + // data to the primary collection of the source statement. We will create a new table + // statement for them and move all export-specific options over from the source statement, + // while moving the `CREATE SOURCE` statement to a new name and moving its shard to the + // new table statement. + Statement::CreateSource(CreateSourceStatement { + connection: + conn @ (CreateSourceConnection::Kafka { .. } + | CreateSourceConnection::LoadGenerator { + generator: + LoadGenerator::Clock + | LoadGenerator::Datums + | LoadGenerator::Counter + | LoadGenerator::KeyValue, + .. + }), + name, + col_names, + include_metadata, + format, + envelope, + with_options, + .. + }) => { + // To check if this is a source that has already been migrated we use a basic + // heuristic: if there is at least one existing table for the source, and if + // the envelope/format/include_metadata options are empty, we assume it's + // already been migrated. + if let Some(existing_tables) = tables_by_source.get(&item.id) { + if !existing_tables.is_empty() + && envelope.is_none() + && format.is_none() + && include_metadata.is_empty() + { + return Ok(()); + } + } + + // Use the current source name as the new table name, and rename the source to + // `_source`. This is intended to allow users to continue using + // queries that reference the source name, since they will now need to query the + // table instead. 
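+ // e.g. a Kafka source named `my_src` (hypothetical) keeps its GlobalId but is
+ // renamed to `my_src_source`, while the new table created below takes over both
+ // the original name `my_src` and the source's existing persist shard.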
+ let new_source_name_ident = Ident::new_unchecked( + name.0.last().expect("at least one ident").to_string() + "_source", + ); + let mut new_source_name = name.clone(); + *new_source_name.0.last_mut().expect("at least one ident") = new_source_name_ident; + let table_name = std::mem::replace(name, new_source_name.clone()); + + // Also update the name of the source 'item' + let mut table_item_name = item.name.clone() + "_source"; + std::mem::swap(&mut item.name, &mut table_item_name); + + // A reference to the source that will be included in the table statement + let source_ref = RawItemName::Id(item.id.to_string(), new_source_name, None); + + let columns = if col_names.is_empty() { + TableFromSourceColumns::NotSpecified + } else { + TableFromSourceColumns::Named(col_names.drain(..).collect()) + }; + + // All source tables must have a `details` option, which is a serialized proto + // describing any source-specific details for this table statement. + let details = match conn { + // For kafka sources this proto is currently empty. + CreateSourceConnection::Kafka { .. } => SourceExportStatementDetails::Kafka {}, + CreateSourceConnection::LoadGenerator { .. } => { + // Since these load generators are single-output we use the default output. + SourceExportStatementDetails::LoadGenerator { + output: LoadGeneratorOutput::Default, + } + } + _ => unreachable!("match determined above"), + }; + let mut table_with_options = vec![TableFromSourceOption { + name: TableFromSourceOptionName::Details, + value: Some(WithOptionValue::Value(Value::String(hex::encode( + details.into_proto().encode_to_vec(), + )))), + }]; + + // Move over the IgnoreKeys option if it exists. + let mut i = 0; + while i < with_options.len() { + if with_options[i].name == CreateSourceOptionName::IgnoreKeys { + let option = with_options.remove(i); + table_with_options.push(TableFromSourceOption { + name: TableFromSourceOptionName::IgnoreKeys, + value: option.value, + }); + } else { + i += 1; + } + } + + // Move over the Timeline option if it exists. + i = 0; + while i < with_options.len() { + if with_options[i].name == CreateSourceOptionName::Timeline { + let option = with_options.remove(i); + table_with_options.push(TableFromSourceOption { + name: TableFromSourceOptionName::Timeline, + value: option.value, + }); + } else { + i += 1; + } + } + + // The new table statement, stealing the export-specific options from the + // create source statement. + let table = CreateTableFromSourceStatement { + name: table_name, + constraints: vec![], + columns, + if_not_exists: false, + source: source_ref, + // external_reference is not required for single-output sources + external_reference: None, + with_options: table_with_options, + envelope: envelope.take(), + include_metadata: include_metadata.drain(..).collect(), + format: format.take(), + }; + + // Insert the new table statement into the catalog with a new id. + let ids = tx.allocate_user_item_ids(1)?; + let new_table_id = ids[0]; + tx.insert_user_item( + new_table_id, + item.schema_id.clone(), + &table_item_name, + table.to_ast_string_stable(), + item.owner_id.clone(), + item.privileges.clone(), + &Default::default(), + )?; + // We need to move the shard currently attached to the source statement to the + // table statement such that the existing data in the shard is preserved and can + // be queried on the new table statement. However, we need to keep the GlobalId of + // the source the same, to preserve existing references to that statement in + // external tools such as DBT and Terraform. 
We will insert a new shard for the source + // statement which will be automatically created after the migration is complete. + let new_source_shard = ShardId::new(); + let (source_id, existing_source_shard) = tx + .delete_collection_metadata(btreeset! {item.id}) + .pop() + .expect("shard should exist"); + tx.insert_collection_metadata(btreemap! { + new_table_id => existing_source_shard, + source_id => new_source_shard + })?; + + info!("migrate: updated source {stmt} and added table {table}"); + } + + // When we upgrade to > rust 1.81 we should use #[expect(unreachable_patterns)] + // to enforce that we have covered all CreateSourceStatement variants. + #[allow(unreachable_patterns)] + Statement::CreateSource(_) => {} _ => (), } From b4f9f8f0037ccb79db41937d475a5b8687b2ac61 Mon Sep 17 00:00:00 2001 From: Roshan Jobanputra Date: Wed, 23 Oct 2024 14:08:31 -0400 Subject: [PATCH 03/52] Rename the feature flag --- src/adapter-types/src/dyncfgs.rs | 6 ++++-- src/adapter/src/catalog/migrate.rs | 6 +++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/adapter-types/src/dyncfgs.rs b/src/adapter-types/src/dyncfgs.rs index 1ddd317c835fb..14ca83651565f 100644 --- a/src/adapter-types/src/dyncfgs.rs +++ b/src/adapter-types/src/dyncfgs.rs @@ -112,8 +112,9 @@ pub const ENABLE_EXPRESSION_CACHE: Config = Config::new( ); /// Whether to enable the migration to convert all sources to use tables. -pub const ENABLE_SOURCE_TABLE_MIGRATION: Config = Config::new( - "enable_source_table_migration", +/// TODO(#8678): This should also disable usage of the old source syntax. +pub const FORCE_SOURCE_TABLE_SYNTAX: Config = Config::new( + "force_source_table_syntax", false, "Whether to enable the migration to convert all sources to use tables.", ); @@ -136,4 +137,5 @@ pub fn all_dyncfgs(configs: ConfigSet) -> ConfigSet { .add(&ENABLE_CONTINUAL_TASK_BUILTINS) .add(&ENABLE_EXPRESSION_CACHE) .add(&ENABLE_SOURCE_TABLE_MIGRATION) + .add(&FORCE_SOURCE_TABLE_SYNTAX) } diff --git a/src/adapter/src/catalog/migrate.rs b/src/adapter/src/catalog/migrate.rs index acf7fa6b5d47d..abbf5653a42a5 100644 --- a/src/adapter/src/catalog/migrate.rs +++ b/src/adapter/src/catalog/migrate.rs @@ -119,15 +119,15 @@ pub(crate) async fn migrate( // Special block for `ast_rewrite_sources_to_tables` migration // since it requires a feature flag. 
- if let Some(config_val) = tx.get_system_config("enable_source_table_migration") { + if let Some(config_val) = tx.get_system_config("force_source_table_syntax") { let enable_migration = config_val.parse::().map_err(|e| { anyhow::anyhow!( - "could not parse enable_source_table_migration config value: {}", + "could not parse force_source_table_syntax config value: {}", e ) })?; if enable_migration { - info!("migrate: enable_source_table_migration"); + info!("migrate: force_source_table_syntax"); ast_rewrite_sources_to_tables(tx, item, stmt, all_items_and_statements)?; } } From ecb16af96039c571beb609ebd6397cc080964099 Mon Sep 17 00:00:00 2001 From: Roshan Jobanputra Date: Wed, 23 Oct 2024 16:24:18 -0400 Subject: [PATCH 04/52] Add new table to audit log --- src/adapter/src/catalog/migrate.rs | 40 ++++++++++++++++++++++++-- src/catalog/src/durable/transaction.rs | 7 +++++ 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/src/adapter/src/catalog/migrate.rs b/src/adapter/src/catalog/migrate.rs index abbf5653a42a5..155c42d8b478f 100644 --- a/src/adapter/src/catalog/migrate.rs +++ b/src/adapter/src/catalog/migrate.rs @@ -93,7 +93,7 @@ pub(crate) async fn migrate( tx: &mut Transaction<'_>, local_expr_cache: &mut LocalExpressionCache, item_updates: Vec, - _now: NowFn, + now: NowFn, _boot_ts: Timestamp, ) -> Result>, anyhow::Error> { let catalog_version = tx.get_catalog_content_version(); @@ -108,6 +108,7 @@ pub(crate) async fn migrate( ); rewrite_ast_items(tx, |tx, item, stmt, all_items_and_statements| { + let now = now.clone(); // Add per-item AST migrations below. // // Each migration should be a function that takes `stmt` (the AST @@ -128,7 +129,7 @@ pub(crate) async fn migrate( })?; if enable_migration { info!("migrate: force_source_table_syntax"); - ast_rewrite_sources_to_tables(tx, item, stmt, all_items_and_statements)?; + ast_rewrite_sources_to_tables(tx, now, item, stmt, all_items_and_statements)?; } } Ok(()) @@ -226,6 +227,7 @@ pub(crate) async fn migrate( /// fn ast_rewrite_sources_to_tables( tx: &mut Transaction<'_>, + now: NowFn, item: &mut Item, stmt: &mut Statement, all_items_and_statements: &Vec<(Item, Statement)>, @@ -576,6 +578,26 @@ fn ast_rewrite_sources_to_tables( source_id => new_source_shard })?; + let schema = tx.get_schema(&item.schema_id).expect("schema must exist"); + + add_to_audit_log( + tx, + mz_audit_log::EventType::Create, + mz_audit_log::ObjectType::Table, + mz_audit_log::EventDetails::IdFullNameV1(mz_audit_log::IdFullNameV1 { + id: new_table_id.to_string(), + name: mz_audit_log::FullNameV1 { + database: schema + .database_id + .map(|d| d.to_string()) + .unwrap_or_default(), + schema: schema.name, + item: table_item_name, + }, + }), + now(), + )?; + info!("migrate: updated source {stmt} and added table {table}"); } @@ -609,3 +631,17 @@ pub(crate) fn durable_migrate( // // Please include the adapter team on any code reviews that add or edit // migrations. 
+ +fn add_to_audit_log( + tx: &mut Transaction, + event_type: mz_audit_log::EventType, + object_type: mz_audit_log::ObjectType, + details: mz_audit_log::EventDetails, + occurred_at: mz_ore::now::EpochMillis, +) -> Result<(), anyhow::Error> { + let id = tx.get_and_increment_id(mz_catalog::durable::AUDIT_LOG_ID_ALLOC_KEY.to_string())?; + let event = + mz_audit_log::VersionedEvent::new(id, event_type, object_type, details, None, occurred_at); + tx.insert_audit_log_event(event); + Ok(()) +} diff --git a/src/catalog/src/durable/transaction.rs b/src/catalog/src/durable/transaction.rs index 544b6702f5721..1255333895ed1 100644 --- a/src/catalog/src/durable/transaction.rs +++ b/src/catalog/src/durable/transaction.rs @@ -2047,6 +2047,13 @@ impl<'a> Transaction<'a> { .map(|(k, v)| DurableType::from_key_value(k, v)) } + pub fn get_schema(&self, id: &SchemaId) -> Option { + let key = SchemaKey { id: *id }; + self.schemas + .get(&key) + .map(|v| DurableType::from_key_value(key, v.clone())) + } + pub fn get_introspection_source_indexes( &self, cluster_id: ClusterId, From d70ab1521d92049a1e0dfdf222c8cb3428fd3edb Mon Sep 17 00:00:00 2001 From: Roshan Jobanputra Date: Wed, 23 Oct 2024 16:37:47 -0400 Subject: [PATCH 05/52] Add platform check scenario to test migration --- .../checks/all_checks/source_tables.py | 1 - .../materialize/checks/all_checks/upsert.py | 56 +++++++++++++++++++ .../materialize/checks/scenarios_upgrade.py | 46 +++++++++++++++ 3 files changed, 102 insertions(+), 1 deletion(-) diff --git a/misc/python/materialize/checks/all_checks/source_tables.py b/misc/python/materialize/checks/all_checks/source_tables.py index 279ede6a4022e..c36bb1eb8af12 100644 --- a/misc/python/materialize/checks/all_checks/source_tables.py +++ b/misc/python/materialize/checks/all_checks/source_tables.py @@ -28,7 +28,6 @@ class TableFromPgSource(TableFromSourceBase): suffix = "tbl_from_pg_source" def initialize(self) -> Testdrive: - return Testdrive( self.generic_setup() + dedent( diff --git a/misc/python/materialize/checks/all_checks/upsert.py b/misc/python/materialize/checks/all_checks/upsert.py index dd7f92caf831e..739bc9d5effa9 100644 --- a/misc/python/materialize/checks/all_checks/upsert.py +++ b/misc/python/materialize/checks/all_checks/upsert.py @@ -164,3 +164,59 @@ def validate(self) -> Testdrive: """ ) ) + + +class UpsertLegacy(Check): + """ + An upsert source test that uses the legacy syntax to create the source + on all versions to ensure the source is properly migrated with the + ActivateSourceVersioningMigration scenario + """ + + def initialize(self) -> Testdrive: + return Testdrive( + schemas() + + dedent( + """ + $ kafka-create-topic topic=upsert-legacy-syntax + + $ kafka-ingest format=avro key-format=avro topic=upsert-legacy-syntax key-schema=${keyschema} schema=${schema} repeat=10000 + {"key1": "A${kafka-ingest.iteration}"} {"f1": "A${kafka-ingest.iteration}"} + + > CREATE SOURCE upsert_insert + FROM KAFKA CONNECTION kafka_conn (TOPIC 'testdrive-upsert-legacy-syntax-${testdrive.seed}') + FORMAT AVRO USING CONFLUENT SCHEMA REGISTRY CONNECTION csr_conn + ENVELOPE UPSERT + + > CREATE MATERIALIZED VIEW upsert_insert_view AS SELECT COUNT(DISTINCT key1 || ' ' || f1) FROM upsert_insert; + """ + ) + ) + + def manipulate(self) -> list[Testdrive]: + return [ + Testdrive(schemas() + dedent(s)) + for s in [ + """ + $ kafka-ingest format=avro key-format=avro topic=upsert-legacy-syntax key-schema=${keyschema} schema=${schema} repeat=10000 + {"key1": "A${kafka-ingest.iteration}"} {"f1": 
"A${kafka-ingest.iteration}"} + """, + """ + $ kafka-ingest format=avro key-format=avro topic=upsert-legacy-syntax key-schema=${keyschema} schema=${schema} repeat=10000 + {"key1": "A${kafka-ingest.iteration}"} {"f1": "A${kafka-ingest.iteration}"} + """, + ] + ] + + def validate(self) -> Testdrive: + return Testdrive( + dedent( + """ + > SELECT COUNT(*), COUNT(DISTINCT key1), COUNT(DISTINCT f1) FROM upsert_insert + 10000 10000 10000 + + > SELECT * FROM upsert_insert_view; + 10000 + """ + ) + ) diff --git a/misc/python/materialize/checks/scenarios_upgrade.py b/misc/python/materialize/checks/scenarios_upgrade.py index fe9a9bd26c470..5c404c4f0b9be 100644 --- a/misc/python/materialize/checks/scenarios_upgrade.py +++ b/misc/python/materialize/checks/scenarios_upgrade.py @@ -387,3 +387,49 @@ def actions(self) -> list[Action]: ), Validate(self), ] + + +class ActivateSourceVersioningMigration(Scenario): + """ + Starts MZ, initializes and manipulates, then forces the migration + of sources to the new table model (introducing Source Versioning). + """ + + def base_version(self) -> MzVersion: + return get_last_version() + + def actions(self) -> list[Action]: + print(f"Upgrading from tag {self.base_version()}") + return [ + StartMz( + self, + tag=self.base_version(), + ), + Initialize(self), + Manipulate(self, phase=1), + KillMz( + capture_logs=True + ), # We always use True here otherwise docker-compose will lose the pre-upgrade logs + StartMz( + self, + tag=None, + # Activate the `force_source_table_syntax` flag + # which should trigger the migration of sources + # using the old syntax to the new table model. + additional_system_parameter_defaults={ + "force_source_table_syntax": "true", + }, + ), + Manipulate(self, phase=2), + Validate(self), + # A second restart while already on the new version + KillMz(capture_logs=True), + StartMz( + self, + tag=None, + additional_system_parameter_defaults={ + "force_source_table_syntax": "true", + }, + ), + Validate(self), + ] From 35236b0cb8a8ef1bd41c99413e5053f2b622cec0 Mon Sep 17 00:00:00 2001 From: Roshan Jobanputra Date: Thu, 24 Oct 2024 10:00:47 -0400 Subject: [PATCH 06/52] Switch to using system vars instead of flags to allow console access to values m --- src/adapter-types/src/dyncfgs.rs | 9 --------- src/adapter/src/catalog/migrate.rs | 16 +++++----------- 2 files changed, 5 insertions(+), 20 deletions(-) diff --git a/src/adapter-types/src/dyncfgs.rs b/src/adapter-types/src/dyncfgs.rs index 14ca83651565f..0ab56c9b41144 100644 --- a/src/adapter-types/src/dyncfgs.rs +++ b/src/adapter-types/src/dyncfgs.rs @@ -111,14 +111,6 @@ pub const ENABLE_EXPRESSION_CACHE: Config = Config::new( "Use a cache to store optimized expressions to help speed up start times.", ); -/// Whether to enable the migration to convert all sources to use tables. -/// TODO(#8678): This should also disable usage of the old source syntax. -pub const FORCE_SOURCE_TABLE_SYNTAX: Config = Config::new( - "force_source_table_syntax", - false, - "Whether to enable the migration to convert all sources to use tables.", -); - /// Adds the full set of all compute `Config`s. 
pub fn all_dyncfgs(configs: ConfigSet) -> ConfigSet { configs @@ -137,5 +129,4 @@ pub fn all_dyncfgs(configs: ConfigSet) -> ConfigSet { .add(&ENABLE_CONTINUAL_TASK_BUILTINS) .add(&ENABLE_EXPRESSION_CACHE) .add(&ENABLE_SOURCE_TABLE_MIGRATION) - .add(&FORCE_SOURCE_TABLE_SYNTAX) } diff --git a/src/adapter/src/catalog/migrate.rs b/src/adapter/src/catalog/migrate.rs index 155c42d8b478f..923c733b9c395 100644 --- a/src/adapter/src/catalog/migrate.rs +++ b/src/adapter/src/catalog/migrate.rs @@ -107,6 +107,8 @@ pub(crate) async fn migrate( catalog_version ); + let enable_source_table_migration = state.system_config().force_source_table_syntax(); + rewrite_ast_items(tx, |tx, item, stmt, all_items_and_statements| { let now = now.clone(); // Add per-item AST migrations below. @@ -118,20 +120,12 @@ pub(crate) async fn migrate( // Migration functions may also take `tx` as input to stage // arbitrary changes to the catalog. - // Special block for `ast_rewrite_sources_to_tables` migration - // since it requires a feature flag. - if let Some(config_val) = tx.get_system_config("force_source_table_syntax") { - let enable_migration = config_val.parse::().map_err(|e| { - anyhow::anyhow!( - "could not parse force_source_table_syntax config value: {}", - e - ) - })?; - if enable_migration { + // Special block for `ast_rewrite_sources_to_tables` migration + // since it requires a feature flag. + if enable_source_table_migration { info!("migrate: force_source_table_syntax"); ast_rewrite_sources_to_tables(tx, now, item, stmt, all_items_and_statements)?; } - } Ok(()) })?; From 969e722ca2565718a76bc8abbde5fd5d54f2e2f8 Mon Sep 17 00:00:00 2001 From: Roshan Jobanputra Date: Thu, 24 Oct 2024 14:37:21 -0400 Subject: [PATCH 07/52] Fixes to migration based on testing --- src/adapter/src/catalog/migrate.rs | 916 +++++++++++------- src/adapter/src/catalog/open.rs | 30 +- src/sql-parser/src/ast/defs/ddl.rs | 18 + src/sql/src/names.rs | 39 + .../src/sources/load_generator.rs | 2 + 5 files changed, 624 insertions(+), 381 deletions(-) diff --git a/src/adapter/src/catalog/migrate.rs b/src/adapter/src/catalog/migrate.rs index 923c733b9c395..ff21c13ce89fb 100644 --- a/src/adapter/src/catalog/migrate.rs +++ b/src/adapter/src/catalog/migrate.rs @@ -11,11 +11,12 @@ use std::collections::BTreeMap; use mz_catalog::builtin::BuiltinTable; use mz_catalog::durable::{Transaction, Item}; -use mz_catalog::memory::objects::StateUpdate; +use mz_catalog::memory::objects::{StateUpdate,BootstrapStateUpdateKind}; use mz_ore::collections::CollectionExt; use mz_ore::now::NowFn; use mz_repr::{CatalogItemId, Timestamp}; use mz_sql::ast::display::AstDisplay; +use mz_sql::names::FullItemName; use mz_sql_parser::ast::{Raw, Statement}; use semver::Version; use tracing::info; @@ -29,26 +30,15 @@ fn rewrite_ast_items(tx: &mut Transaction<'_>, mut f: F) -> Result<(), anyhow where F: for<'a> FnMut( &'a mut Transaction<'_>, - &'a mut Item, + GlobalId, &'a mut Statement, - &'a Vec<(Item, Statement)>, ) -> Result<(), anyhow::Error>, { let mut updated_items = BTreeMap::new(); - let items_with_statements = tx - .get_items() - .map(|item| { - let stmt = mz_sql::parse::parse(&item.create_sql)?.into_element().ast; - Ok((item, stmt)) - }) - .collect::, anyhow::Error>>()?; - // Clone this vec to be referenced within the closure if needed - let items_with_statements_ref = items_with_statements.clone(); - for (mut item, mut stmt) in items_with_statements { + for mut item in tx.get_items() { let mut stmt = mz_sql::parse::parse(&item.create_sql)?.into_element().ast; - f(tx, 
&mut item, &mut stmt, &items_with_statements_ref)?; - + f(tx, item.id, &mut stmt).await?; item.create_sql = stmt.to_ast_string_stable(); updated_items.insert(item.id, item); @@ -85,6 +75,11 @@ where Ok(()) } +pub(crate) struct MigrateResult { + pub(crate) builtin_table_updates: Vec>, + pub(crate) post_item_updates: Vec<(BootstrapStateUpdateKind, Timestamp, i64)>, +} + /// Migrates all user items and loads them into `state`. /// /// Returns the builtin updates corresponding to all user items. @@ -95,7 +90,7 @@ pub(crate) async fn migrate( item_updates: Vec, now: NowFn, _boot_ts: Timestamp, -) -> Result>, anyhow::Error> { +) -> Result { let catalog_version = tx.get_catalog_content_version(); let catalog_version = match catalog_version { Some(v) => Version::parse(&v)?, @@ -107,7 +102,11 @@ pub(crate) async fn migrate( catalog_version ); - let enable_source_table_migration = state.system_config().force_source_table_syntax(); + // Special block for `ast_rewrite_sources_to_tables` migration + // since it requires a feature flag needs to update multiple AST items at once. + if state.system_config().force_source_table_syntax() { + ast_rewrite_sources_to_tables(tx, now)?; + } rewrite_ast_items(tx, |tx, item, stmt, all_items_and_statements| { let now = now.clone(); @@ -120,12 +119,6 @@ pub(crate) async fn migrate( // Migration functions may also take `tx` as input to stage // arbitrary changes to the catalog. - // Special block for `ast_rewrite_sources_to_tables` migration - // since it requires a feature flag. - if enable_source_table_migration { - info!("migrate: force_source_table_syntax"); - ast_rewrite_sources_to_tables(tx, now, item, stmt, all_items_and_statements)?; - } Ok(()) })?; @@ -137,6 +130,18 @@ pub(crate) async fn migrate( let op_item_updates = into_consolidatable_updates_startup(op_item_updates, commit_ts); item_updates.extend(op_item_updates); differential_dataflow::consolidation::consolidate_updates(&mut item_updates); + + // Since some migrations might introduce non-item 'post-item' updates, we sequester those + // so they can be applied with other post-item updates after migrations to avoid + // accumulating negative diffs. + let (post_item_updates, item_updates): (Vec<_>, Vec<_>) = item_updates + .into_iter() + // The only post-item update kind we currently generate is to + // update storage collection metadata. + .partition(|(kind, _, _)| { + matches!(kind, BootstrapStateUpdateKind::StorageCollectionMetadata(_)) + }); + let item_updates = item_updates .into_iter() .map(|(kind, ts, diff)| StateUpdate { @@ -188,7 +193,10 @@ pub(crate) async fn migrate( "migration from catalog version {:?} complete", catalog_version ); - Ok(ast_builtin_table_updates) + Ok(MigrateResult { + builtin_table_updates: ast_builtin_table_updates, + post_item_updates, + }) } // Add new migrations below their appropriate heading, and precede them with a @@ -208,23 +216,20 @@ pub(crate) async fn migrate( /// MySQL, and multi-output (tpch, auction, marketing) load-generator subsources. /// /// Second we migrate existing `CREATE SOURCE` statements for these multi-output -/// sources to remove their subsource qualification option (e.g. FOR ALL TABLES..) -/// and any subsource-specific options (e.g. TEXT COLUMNS). +/// sources to remove any subsource-specific options (e.g. TEXT COLUMNS). /// -/// Third we migrate existing `CREATE SOURCE` statements that refer to a single -/// output collection. This includes existing Kafka and single-output load-generator -/// subsources. 
These statements will generate an additional `CREATE TABLE .. FROM SOURCE` -/// statement that copies over all the export-specific options. This statement will be tied -/// to the existing statement's persist collection, but using a new GlobalID. The original -/// source statement will be updated to remove the export-specific options and will be -/// renamed to `_source` while keeping its same GlobalId. +/// Third we migrate existing single-output `CREATE SOURCE` statements. +/// This includes existing Kafka and single-output load-generator +/// subsources. This will generate an additional `CREATE TABLE .. FROM SOURCE` +/// statement that copies over all the export-specific options. This table will use +/// to the existing source statement's persist shard but use a new GlobalID. +/// The original source statement will be updated to remove the export-specific options, +/// renamed to `_source`, and use a new empty shard while keeping its +/// same GlobalId. /// fn ast_rewrite_sources_to_tables( tx: &mut Transaction<'_>, now: NowFn, - item: &mut Item, - stmt: &mut Statement, - all_items_and_statements: &Vec<(Item, Statement)>, ) -> Result<(), anyhow::Error> { use maplit::btreemap; use maplit::btreeset; @@ -232,374 +237,543 @@ fn ast_rewrite_sources_to_tables( use mz_proto::RustType; use mz_sql::ast::{ CreateSourceConnection, CreateSourceOptionName, CreateSourceStatement, - CreateSubsourceOptionName, CreateTableFromSourceStatement, Ident, LoadGenerator, - MySqlConfigOptionName, PgConfigOptionName, RawItemName, TableFromSourceColumns, - TableFromSourceOption, TableFromSourceOptionName, Value, WithOptionValue, + CreateSubsourceOptionName, CreateSubsourceStatement, CreateTableFromSourceStatement, Ident, + KafkaSourceConfigOptionName, LoadGenerator, MySqlConfigOptionName, PgConfigOptionName, + RawItemName, TableFromSourceColumns, TableFromSourceOption, TableFromSourceOptionName, + UnresolvedItemName, Value, WithOptionValue, }; use mz_storage_client::controller::StorageTxn; use mz_storage_types::sources::load_generator::LoadGeneratorOutput; use mz_storage_types::sources::SourceExportStatementDetails; use prost::Message; - // Since some subsources have named-only references to their `of_source` and some have the - // global_id of their source, we first generate a reference mapping from all source names to - // items so we can ensure we always use an ID-based reference in the stored - // `CREATE TABLE .. FROM SOURCE` statements. - let source_name_to_item: BTreeMap<_, _> = all_items_and_statements - .iter() - .filter_map(|(item, statement)| match statement { - Statement::CreateSource(stmt) => Some((stmt.name.clone(), item)), - _ => None, + let items_with_statements = tx + .get_items() + .map(|item| { + let stmt = mz_sql::parse::parse(&item.create_sql)?.into_element().ast; + Ok((item, stmt)) }) - .collect(); + .collect::, anyhow::Error>>()?; + let items_with_statements_copied = items_with_statements.clone(); + + // Any GlobalId that should be changed to a new GlobalId in any statements that + // reference it. This is necessary for ensuring downstream statements (e.g. + // mat views, indexes) that reference a single-output source (e.g. kafka) + // will now reference the corresponding new table, with the same data, instead. + let mut changed_ids = BTreeMap::new(); + + for (mut item, stmt) in items_with_statements { + match stmt { + // Migrate each `CREATE SUBSOURCE` statement to an equivalent + // `CREATE TABLE .. FROM SOURCE` statement. 
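+ // The rewritten statement always references the source by ID (rendered roughly
+ // as `FROM SOURCE [<source-id> AS pg_src]`), even when the subsource only stored
+ // a name-based `OF SOURCE` reference.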
+ Statement::CreateSubsource(CreateSubsourceStatement { + name, + columns, + constraints, + of_source, + if_not_exists, + mut with_options, + }) => { + let raw_source_name = match of_source { + // If `of_source` is None then this is a `progress` subsource which we + // are not migrating as they are not currently relevant to the new table model. + None => return Ok(()), + Some(name) => name, + }; + let source = match raw_source_name { + // Some legacy subsources have named-only references to their `of_source` + // so we ensure we always use an ID-based reference in the stored + // `CREATE TABLE .. FROM SOURCE` statements. + RawItemName::Name(name) => { + // Convert the name reference to an ID reference. + let (source_item, _) = items_with_statements_copied + .iter() + .find(|(_, statement)| match statement { + Statement::CreateSource(stmt) => stmt.name == name, + _ => false, + }) + .expect("source must exist"); + RawItemName::Id(source_item.id.to_string(), name, None) + } + RawItemName::Id(..) => raw_source_name, + }; - // Collect any existing `CREATE TABLE .. FROM SOURCE` statements by the source they reference. - let tables_by_source: BTreeMap<_, Vec<_>> = all_items_and_statements - .iter() - .filter_map(|(_, statement)| match statement { - Statement::CreateTableFromSource(stmt) => { - let source: GlobalId = match &stmt.source { - RawItemName::Name(_) => unreachable!("tables store source as ID"), - RawItemName::Id(id, _, _) => id.parse().expect("valid id"), + // The external reference is a `with_option` on subsource statements but is a + // separate field on table statements. + let mut i = 0; + let external_reference = loop { + if i >= with_options.len() { + panic!("subsource must have an external reference"); + } + if with_options[i].name == CreateSubsourceOptionName::ExternalReference { + let option = with_options.remove(i); + match option.value { + Some(WithOptionValue::UnresolvedItemName(name)) => break name, + _ => unreachable!("external reference must be an unresolved item name"), + }; + } else { + i += 1; + } + }; + let with_options = with_options + .into_iter() + .map(|option| { + match option.name { + CreateSubsourceOptionName::Details => TableFromSourceOption { + name: TableFromSourceOptionName::Details, + // The `details` option on both subsources and tables is identical, using the same + // ProtoSourceExportStatementDetails serialized value. + value: option.value, + }, + CreateSubsourceOptionName::TextColumns => TableFromSourceOption { + name: TableFromSourceOptionName::TextColumns, + value: option.value, + }, + CreateSubsourceOptionName::ExcludeColumns => TableFromSourceOption { + name: TableFromSourceOptionName::ExcludeColumns, + value: option.value, + }, + CreateSubsourceOptionName::Progress => { + panic!("progress option should not exist on this subsource") + } + CreateSubsourceOptionName::ExternalReference => { + // This option is handled separately above. + unreachable!() + } + } + }) + .collect::>(); + + let table = CreateTableFromSourceStatement { + name, + constraints, + columns: mz_sql::ast::TableFromSourceColumns::Defined(columns), + if_not_exists, + source, + external_reference: Some(external_reference.clone()), + with_options, + // Subsources don't have `envelope`, `include_metadata`, or `format` options. 
+ envelope: None, + include_metadata: vec![], + format: None, }; - Some((source, stmt)) - } - _ => None, - }) - .fold(BTreeMap::new(), |mut acc, (source, stmt)| { - acc.entry(source).or_insert_with(Vec::new).push(stmt); - acc - }); - match stmt { - // Migrate each `CREATE SUBSOURCE` statement to an equivalent - // `CREATE TABLE .. FROM SOURCE` statement. - Statement::CreateSubsource(subsource) => { - let raw_source_name = match &subsource.of_source { - // We will not be migrating 'progress' `CREATE SUBSOURCE` statements as they - // are not currently relevant to the new table model. - None => return Ok(()), - Some(name) => name, - }; - let source = match raw_source_name { - RawItemName::Name(name) => { - // Convert the name reference to an ID reference. - let source_item = source_name_to_item.get(name).expect("source must exist"); - RawItemName::Id(source_item.id.to_string(), name.clone(), None) - } - RawItemName::Id(..) => raw_source_name.clone(), - }; - - // The external reference is a `with_option` on subsource statements but is a - // separate field on table statements. - let external_reference_option = subsource - .with_options - .iter() - .find(|o| o.name == CreateSubsourceOptionName::ExternalReference) - .expect("subsources must have external reference"); - let external_reference = match &external_reference_option.value { - Some(WithOptionValue::UnresolvedItemName(name)) => name, - _ => unreachable!("external reference must be an unresolved item name"), - }; - - // The `details` option on both subsources and tables is identical, using the same - // ProtoSourceExportStatementDetails serialized value. - let existing_details = subsource - .with_options - .iter() - .find(|o| o.name == CreateSubsourceOptionName::Details) - .expect("subsources must have details"); - - let mut with_options = vec![TableFromSourceOption { - name: TableFromSourceOptionName::Details, - value: existing_details.value.clone(), - }]; - - // TEXT COLUMNS and EXCLUDE COLUMNS options are stored on each subsource and can be copied - // directly to the table statement. - if let Some(text_cols_option) = subsource - .with_options - .iter() - .find(|option| option.name == CreateSubsourceOptionName::TextColumns) - { - with_options.push(TableFromSourceOption { - name: TableFromSourceOptionName::TextColumns, - value: text_cols_option.value.clone(), - }); - } - if let Some(exclude_cols_option) = subsource - .with_options - .iter() - .find(|option| option.name == CreateSubsourceOptionName::ExcludeColumns) - { - with_options.push(TableFromSourceOption { - name: TableFromSourceOptionName::ExcludeColumns, - value: exclude_cols_option.value.clone(), - }); + info!( + "migrate: converted subsource {} to table {}", + item.create_sql, table + ); + item.create_sql = Statement::CreateTableFromSource(table).to_ast_string_stable(); + tx.update_item(item.id, item)?; } - let table = CreateTableFromSourceStatement { - name: subsource.name.clone(), - constraints: subsource.constraints.clone(), - columns: mz_sql::ast::TableFromSourceColumns::Defined(subsource.columns.clone()), - if_not_exists: subsource.if_not_exists, - source, - external_reference: Some(external_reference.clone()), + // Postgres sources are multi-output sources whose subsources are + // migrated above. All we need to do is remove the subsource-related + // options from this statement since they are no longer relevant. + Statement::CreateSource(CreateSourceStatement { + connection: + mut conn @ (CreateSourceConnection::Postgres { .. } + | CreateSourceConnection::Yugabyte { .. 
}), + name, + if_not_exists, + in_cluster, + include_metadata, + format, + envelope, + col_names, with_options, - // Subsources don't have `envelope`, `include_metadata`, or `format` options. - envelope: None, - include_metadata: vec![], - format: None, - }; - - info!( - "migrate: converted subsource {} to table {}", - subsource, table - ); - *stmt = Statement::CreateTableFromSource(table); - } - - // Postgres sources are multi-output sources whose subsources are - // migrated above. All we need to do is remove the subsource-related - // options from this statement since they are no longer relevant. - Statement::CreateSource(CreateSourceStatement { - connection: - CreateSourceConnection::Postgres { options, .. } - | CreateSourceConnection::Yugabyte { options, .. }, - .. - }) => { - // This option storing text columns on the primary source statement is redundant - // with the option on subsource statements so can just be removed. - // This was kept for round-tripping of `CREATE SOURCE` statements that automatically - // generated subsources, which is no longer necessary. - if options - .iter() - .any(|o| matches!(o.name, PgConfigOptionName::TextColumns)) - { - options.retain(|o| !matches!(o.name, PgConfigOptionName::TextColumns)); - info!("migrate: converted postgres source {stmt} to remove subsource options"); + key_constraint, + external_references, + progress_subsource, + }) => { + let options = match &mut conn { + CreateSourceConnection::Postgres { options, .. } => options, + CreateSourceConnection::Yugabyte { options, .. } => options, + _ => unreachable!("match determined above"), + }; + // This option storing text columns on the primary source statement is redundant + // with the option on subsource statements so can just be removed. + // This was kept for round-tripping of `CREATE SOURCE` statements that automatically + // generated subsources, which is no longer necessary. + if options + .iter() + .any(|o| matches!(o.name, PgConfigOptionName::TextColumns)) + { + options.retain(|o| !matches!(o.name, PgConfigOptionName::TextColumns)); + let stmt = Statement::CreateSource(CreateSourceStatement { + connection: conn, + name, + if_not_exists, + in_cluster, + include_metadata, + format, + envelope, + col_names, + with_options, + key_constraint, + external_references, + progress_subsource, + }); + item.create_sql = stmt.to_ast_string_stable(); + tx.update_item(item.id, item)?; + info!("migrate: converted postgres source {stmt} to remove subsource options"); + } } - } - // MySQL sources are multi-output sources whose subsources are - // migrated above. All we need to do is remove the subsource-related - // options from this statement since they are no longer relevant. - Statement::CreateSource(CreateSourceStatement { - connection: CreateSourceConnection::MySql { options, .. }, - .. - }) => { - // These options storing text and exclude columns on the primary source statement - // are redundant with the options on subsource statements so can just be removed. - // They was kept for round-tripping of `CREATE SOURCE` statements that automatically - // generated subsources, which is no longer necessary. - if options.iter().any(|o| { - matches!( - o.name, - MySqlConfigOptionName::TextColumns | MySqlConfigOptionName::ExcludeColumns - ) - }) { - options.retain(|o| { - !matches!( + // MySQL sources are multi-output sources whose subsources are + // migrated above. All we need to do is remove the subsource-related + // options from this statement since they are no longer relevant. 
+ Statement::CreateSource(CreateSourceStatement { + connection: mut conn @ CreateSourceConnection::MySql { .. }, + name, + if_not_exists, + in_cluster, + include_metadata, + format, + envelope, + col_names, + with_options, + key_constraint, + external_references, + progress_subsource, + .. + }) => { + let options = match &mut conn { + CreateSourceConnection::MySql { options, .. } => options, + _ => unreachable!("match determined above"), + }; + // These options storing text and exclude columns on the primary source statement + // are redundant with the options on subsource statements so can just be removed. + // They was kept for round-tripping of `CREATE SOURCE` statements that automatically + // generated subsources, which is no longer necessary. + if options.iter().any(|o| { + matches!( o.name, MySqlConfigOptionName::TextColumns | MySqlConfigOptionName::ExcludeColumns ) - }); - info!("migrate: converted mysql source {stmt} to remove subsource options"); + }) { + options.retain(|o| { + !matches!( + o.name, + MySqlConfigOptionName::TextColumns + | MySqlConfigOptionName::ExcludeColumns + ) + }); + let stmt = Statement::CreateSource(CreateSourceStatement { + connection: conn, + name, + if_not_exists, + in_cluster, + include_metadata, + format, + envelope, + col_names, + with_options, + key_constraint, + external_references, + progress_subsource, + }); + item.create_sql = stmt.to_ast_string_stable(); + tx.update_item(item.id, item)?; + info!("migrate: converted mysql source {stmt} to remove subsource options"); + } } - } - // Multi-output load generator sources whose subsources are already - // migrated above. There is no need to remove any options from this - // statement since they are not export-specific. - Statement::CreateSource(CreateSourceStatement { - connection: - CreateSourceConnection::LoadGenerator { - generator: - LoadGenerator::Auction | LoadGenerator::Marketing | LoadGenerator::Tpch, - .. - }, - .. - }) => {} - // Single-output sources that need to be migrated to tables. These sources currently output - // data to the primary collection of the source statement. We will create a new table - // statement for them and move all export-specific options over from the source statement, - // while moving the `CREATE SOURCE` statement to a new name and moving its shard to the - // new table statement. - Statement::CreateSource(CreateSourceStatement { - connection: - conn @ (CreateSourceConnection::Kafka { .. } - | CreateSourceConnection::LoadGenerator { - generator: - LoadGenerator::Clock - | LoadGenerator::Datums - | LoadGenerator::Counter - | LoadGenerator::KeyValue, - .. - }), - name, - col_names, - include_metadata, - format, - envelope, - with_options, - .. - }) => { - // To check if this is a source that has already been migrated we use a basic - // heuristic: if there is at least one existing table for the source, and if - // the envelope/format/include_metadata options are empty, we assume it's - // already been migrated. - if let Some(existing_tables) = tables_by_source.get(&item.id) { - if !existing_tables.is_empty() + // Multi-output load generator sources whose subsources are already + // migrated above. There is no need to remove any options from this + // statement since they are not export-specific. + Statement::CreateSource(CreateSourceStatement { + connection: + CreateSourceConnection::LoadGenerator { + generator: + LoadGenerator::Auction | LoadGenerator::Marketing | LoadGenerator::Tpch, + .. + }, + .. 
+ }) => {} + // Single-output sources that need to be migrated to tables. These sources currently output + // data to the primary collection of the source statement. We will create a new table + // statement for them and move all export-specific options over from the source statement, + // while moving the `CREATE SOURCE` statement to a new name and moving its shard to the + // new table statement. + Statement::CreateSource(CreateSourceStatement { + connection: + conn @ (CreateSourceConnection::Kafka { .. } + | CreateSourceConnection::LoadGenerator { + generator: + LoadGenerator::Clock + | LoadGenerator::Datums + | LoadGenerator::Counter + | LoadGenerator::KeyValue, + .. + }), + name, + col_names, + include_metadata, + format, + envelope, + mut with_options, + if_not_exists, + in_cluster, + progress_subsource, + external_references, + key_constraint, + }) => { + // To check if this is a source that has already been migrated we use a basic + // heuristic: if there is at least one existing table for the source, and if + // the envelope/format/include_metadata options are empty, we assume it's + // already been migrated. + let tables_for_source = + items_with_statements_copied + .iter() + .any(|(item, statement)| match statement { + Statement::CreateTableFromSource(stmt) => { + let source: GlobalId = match &stmt.source { + RawItemName::Name(_) => { + unreachable!("tables store source as ID") + } + RawItemName::Id(source_id, _, _) => { + source_id.parse().expect("valid id") + } + }; + source == item.id + } + _ => false, + }); + if tables_for_source && envelope.is_none() && format.is_none() && include_metadata.is_empty() { + info!("migrate: skipping already migrated source: {}", name); return Ok(()); } - } - // Use the current source name as the new table name, and rename the source to - // `_source`. This is intended to allow users to continue using - // queries that reference the source name, since they will now need to query the - // table instead. - let new_source_name_ident = Ident::new_unchecked( - name.0.last().expect("at least one ident").to_string() + "_source", - ); - let mut new_source_name = name.clone(); - *new_source_name.0.last_mut().expect("at least one ident") = new_source_name_ident; - let table_name = std::mem::replace(name, new_source_name.clone()); - - // Also update the name of the source 'item' - let mut table_item_name = item.name.clone() + "_source"; - std::mem::swap(&mut item.name, &mut table_item_name); - - // A reference to the source that will be included in the table statement - let source_ref = RawItemName::Id(item.id.to_string(), new_source_name, None); - - let columns = if col_names.is_empty() { - TableFromSourceColumns::NotSpecified - } else { - TableFromSourceColumns::Named(col_names.drain(..).collect()) - }; - - // All source tables must have a `details` option, which is a serialized proto - // describing any source-specific details for this table statement. - let details = match conn { - // For kafka sources this proto is currently empty. - CreateSourceConnection::Kafka { .. } => SourceExportStatementDetails::Kafka {}, - CreateSourceConnection::LoadGenerator { .. } => { - // Since these load generators are single-output we use the default output. - SourceExportStatementDetails::LoadGenerator { - output: LoadGeneratorOutput::Default, + // Use the current source name as the new table name, and rename the source to + // `_source`. 
This is intended to allow users to continue using + // queries that reference the source name, since they will now need to query the + // table instead. + let new_source_name_ident = Ident::new_unchecked( + name.0.last().expect("at least one ident").to_string() + "_source", + ); + let mut new_source_name = name.clone(); + *new_source_name.0.last_mut().expect("at least one ident") = new_source_name_ident; + + // Also update the name of the source 'item' + let mut table_item_name = item.name.clone() + "_source"; + std::mem::swap(&mut item.name, &mut table_item_name); + + // A reference to the source that will be included in the table statement + let source_ref = + RawItemName::Id(item.id.to_string(), new_source_name.clone(), None); + + let columns = if col_names.is_empty() { + TableFromSourceColumns::NotSpecified + } else { + TableFromSourceColumns::Named(col_names) + }; + + // All source tables must have a `details` option, which is a serialized proto + // describing any source-specific details for this table statement. + let details = match &conn { + // For kafka sources this proto is currently empty. + CreateSourceConnection::Kafka { .. } => SourceExportStatementDetails::Kafka {}, + CreateSourceConnection::LoadGenerator { .. } => { + // Since these load generators are single-output we use the default output. + SourceExportStatementDetails::LoadGenerator { + output: LoadGeneratorOutput::Default, + } + } + _ => unreachable!("match determined above"), + }; + let mut table_with_options = vec![TableFromSourceOption { + name: TableFromSourceOptionName::Details, + value: Some(WithOptionValue::Value(Value::String(hex::encode( + details.into_proto().encode_to_vec(), + )))), + }]; + + // Move over the IgnoreKeys option if it exists. + let mut i = 0; + while i < with_options.len() { + if with_options[i].name == CreateSourceOptionName::IgnoreKeys { + let option = with_options.remove(i); + table_with_options.push(TableFromSourceOption { + name: TableFromSourceOptionName::IgnoreKeys, + value: option.value, + }); + } else { + i += 1; } } - _ => unreachable!("match determined above"), - }; - let mut table_with_options = vec![TableFromSourceOption { - name: TableFromSourceOptionName::Details, - value: Some(WithOptionValue::Value(Value::String(hex::encode( - details.into_proto().encode_to_vec(), - )))), - }]; - - // Move over the IgnoreKeys option if it exists. - let mut i = 0; - while i < with_options.len() { - if with_options[i].name == CreateSourceOptionName::IgnoreKeys { - let option = with_options.remove(i); - table_with_options.push(TableFromSourceOption { - name: TableFromSourceOptionName::IgnoreKeys, - value: option.value, - }); - } else { - i += 1; + // Move over the Timeline option if it exists. + i = 0; + while i < with_options.len() { + if with_options[i].name == CreateSourceOptionName::Timeline { + let option = with_options.remove(i); + table_with_options.push(TableFromSourceOption { + name: TableFromSourceOptionName::Timeline, + value: option.value, + }); + } else { + i += 1; + } } - } - // Move over the Timeline option if it exists. - i = 0; - while i < with_options.len() { - if with_options[i].name == CreateSourceOptionName::Timeline { - let option = with_options.remove(i); - table_with_options.push(TableFromSourceOption { - name: TableFromSourceOptionName::Timeline, - value: option.value, - }); - } else { - i += 1; - } - } + // Generate the same external-reference that would have been generated + // during purification for single-output sources. 
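// A brief, hedged recap of the option handling above: the `Details` table
// option is the protobuf-serialized `SourceExportStatementDetails` for this
// export (empty for Kafka, the default output for these load generators),
// hex-encoded into a plain string value, presumably so that it round-trips
// cleanly through the stable AST string. `IgnoreKeys` and `Timeline` are the
// only source-level `WITH` options relocated onto the new table; whatever else
// remains in `with_options` stays behind on the renamed source statement.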
+ let external_reference = match &conn { + // For kafka sources this proto is currently empty. + CreateSourceConnection::Kafka { options, .. } => { + let topic_option = options + .iter() + .find(|o| matches!(o.name, KafkaSourceConfigOptionName::Topic)) + .expect("kafka sources must have a topic"); + let topic = match &topic_option.value { + Some(WithOptionValue::Value(Value::String(topic))) => topic, + _ => unreachable!("topic must be a string"), + }; + + Some(UnresolvedItemName::qualified(&[Ident::new(topic)?])) + } + CreateSourceConnection::LoadGenerator { generator, .. } => { + // Since these load generators are single-output the external reference + // uses the schema-name for both namespace and name. + let name = FullItemName { + database: mz_sql::names::RawDatabaseSpecifier::Name( + mz_storage_types::sources::load_generator::LOAD_GENERATOR_DATABASE_NAME + .to_owned(), + ), + schema: generator.schema_name().to_string(), + item: generator.schema_name().to_string(), + }; + Some(UnresolvedItemName::from(name)) + } + _ => unreachable!("match determined above"), + }; - // The new table statement, stealing the export-specific options from the - // create source statement. - let table = CreateTableFromSourceStatement { - name: table_name, - constraints: vec![], - columns, - if_not_exists: false, - source: source_ref, - // external_reference is not required for single-output sources - external_reference: None, - with_options: table_with_options, - envelope: envelope.take(), - include_metadata: include_metadata.drain(..).collect(), - format: format.take(), - }; - - // Insert the new table statement into the catalog with a new id. - let ids = tx.allocate_user_item_ids(1)?; - let new_table_id = ids[0]; - tx.insert_user_item( - new_table_id, - item.schema_id.clone(), - &table_item_name, - table.to_ast_string_stable(), - item.owner_id.clone(), - item.privileges.clone(), - &Default::default(), - )?; - // We need to move the shard currently attached to the source statement to the - // table statement such that the existing data in the shard is preserved and can - // be queried on the new table statement. However, we need to keep the GlobalId of - // the source the same, to preserve existing references to that statement in - // external tools such as DBT and Terraform. We will insert a new shard for the source - // statement which will be automatically created after the migration is complete. - let new_source_shard = ShardId::new(); - let (source_id, existing_source_shard) = tx - .delete_collection_metadata(btreeset! {item.id}) - .pop() - .expect("shard should exist"); - tx.insert_collection_metadata(btreemap! { - new_table_id => existing_source_shard, - source_id => new_source_shard - })?; - - let schema = tx.get_schema(&item.schema_id).expect("schema must exist"); - - add_to_audit_log( - tx, - mz_audit_log::EventType::Create, - mz_audit_log::ObjectType::Table, - mz_audit_log::EventDetails::IdFullNameV1(mz_audit_log::IdFullNameV1 { - id: new_table_id.to_string(), - name: mz_audit_log::FullNameV1 { - database: schema - .database_id - .map(|d| d.to_string()) - .unwrap_or_default(), - schema: schema.name, - item: table_item_name, - }, - }), - now(), - )?; + // The new table statement, stealing the name and the export-specific fields from + // the create source statement. 
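// A hedged, end-to-end illustration of what this arm produces for a Kafka
// source (names, the topic, and the exact rendering of clauses are assumptions
// for illustration only): a pre-migration
//
//   CREATE SOURCE kafka_src FROM KAFKA CONNECTION kafka_conn (TOPIC 'events')
//       FORMAT JSON ENVELOPE NONE;
//
// roughly becomes the pair
//
//   CREATE SOURCE kafka_src_source FROM KAFKA CONNECTION kafka_conn (TOPIC 'events');
//   CREATE TABLE kafka_src FROM SOURCE kafka_src_source (REFERENCE "events")
//       FORMAT JSON ENVELOPE NONE;
//
// where the table keeps the original name and the data-bearing persist shard,
// and the renamed source keeps its original GlobalId. (The real table
// statement references the source by catalog ID rather than by name, via the
// `source_ref` built above.)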
+ let table = CreateTableFromSourceStatement { + name, + constraints: vec![], + columns, + if_not_exists: false, + source: source_ref, + external_reference, + with_options: table_with_options, + envelope, + include_metadata, + format, + }; - info!("migrate: updated source {stmt} and added table {table}"); + // The source statement with a new name and many of its fields emptied + let source = CreateSourceStatement { + connection: conn, + name: new_source_name, + if_not_exists, + in_cluster, + include_metadata: vec![], + format: None, + envelope: None, + col_names: vec![], + with_options, + key_constraint, + external_references, + progress_subsource, + }; + + let source_id = item.id; + let schema_id = item.schema_id.clone(); + let schema = tx.get_schema(&item.schema_id).expect("schema must exist"); + + let owner_id = item.owner_id.clone(); + let privileges = item.privileges.clone(); + + // Update the source statement in the catalog first, since the name will + // otherwise conflict with the new table statement. + info!("migrate: updated source {} to {source}", item.create_sql); + item.create_sql = Statement::CreateSource(source).to_ast_string_stable(); + tx.update_item(item.id, item)?; + + // Insert the new table statement into the catalog with a new id. + let ids = tx.allocate_user_item_ids(1)?; + let new_table_id = ids[0]; + info!("migrate: added table {new_table_id}: {table}"); + tx.insert_user_item( + new_table_id, + schema_id, + &table_item_name, + table.to_ast_string_stable(), + owner_id, + privileges, + &Default::default(), + )?; + // We need to move the shard currently attached to the source statement to the + // table statement such that the existing data in the shard is preserved and can + // be queried on the new table statement. However, we need to keep the GlobalId of + // the source the same, to preserve existing references to that statement in + // external tools such as DBT and Terraform. We will insert a new shard for the source + // statement which will be automatically created after the migration is complete. + let new_source_shard = ShardId::new(); + let (source_id, existing_source_shard) = tx + .delete_collection_metadata(btreeset! {source_id}) + .pop() + .expect("shard should exist"); + tx.insert_collection_metadata(btreemap! { + new_table_id => existing_source_shard, + source_id => new_source_shard + })?; + + add_to_audit_log( + tx, + mz_audit_log::EventType::Create, + mz_audit_log::ObjectType::Table, + mz_audit_log::EventDetails::IdFullNameV1(mz_audit_log::IdFullNameV1 { + id: new_table_id.to_string(), + name: mz_audit_log::FullNameV1 { + database: schema + .database_id + .map(|d| d.to_string()) + .unwrap_or_default(), + schema: schema.name, + item: table_item_name, + }, + }), + now(), + )?; + + // We also need to update any other statements that reference the source to use the new + // table id/name instead. + changed_ids.insert(source_id, new_table_id); + } + + // When we upgrade to > rust 1.81 we should use #[expect(unreachable_patterns)] + // to enforce that we have covered all CreateSourceStatement variants. + #[allow(unreachable_patterns)] + Statement::CreateSource(_) => {} + _ => (), } + } - // When we upgrade to > rust 1.81 we should use #[expect(unreachable_patterns)] - // to enforce that we have covered all CreateSourceStatement variants. 
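// The collection-metadata swap above is what lets the new table keep serving
// the data already written by the source. A self-contained sketch of that
// bookkeeping, using a plain map and string IDs as stand-ins for the catalog
// transaction and real shard IDs (assumptions for illustration only):

fn hand_off_shard(
    collection_metadata: &mut std::collections::BTreeMap<String, String>,
    source_id: &str,
    new_table_id: &str,
    fresh_shard: String,
) {
    // The table inherits the shard that already holds the source's data...
    let existing_shard = collection_metadata
        .remove(source_id)
        .expect("source shard exists");
    collection_metadata.insert(new_table_id.to_string(), existing_shard);
    // ...while the source is pointed at a brand-new, empty shard.
    collection_metadata.insert(source_id.to_string(), fresh_shard);
}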
- #[allow(unreachable_patterns)] - Statement::CreateSource(_) => {} - _ => (), + let mut updated_items = BTreeMap::new(); + for (mut item, mut statement) in items_with_statements_copied { + match &statement { + // Don’t rewrite any of the statements we just migrated. + Statement::CreateSource(_) => {} + Statement::CreateSubsource(_) => {} + Statement::CreateTableFromSource(_) => {} + // We need to rewrite any statements that reference a source id to use the new + // table id instead, since any contained data in the source will now be in the table. + // This assumes the table has stolen the source's name, which is the case + // for all sources that were migrated. + _ => { + if mz_sql::names::modify_dependency_item_ids(&mut statement, &changed_ids) { + item.create_sql = statement.to_ast_string_stable(); + updated_items.insert(item.id, item); + } + } + } + } + if !updated_items.is_empty() { + tx.update_items(updated_items)?; } Ok(()) diff --git a/src/adapter/src/catalog/open.rs b/src/adapter/src/catalog/open.rs index eab051fd2057b..9f563aa3189ae 100644 --- a/src/adapter/src/catalog/open.rs +++ b/src/adapter/src/catalog/open.rs @@ -413,11 +413,7 @@ impl Catalog { | BootstrapStateUpdateKind::StorageCollectionMetadata(_) | BootstrapStateUpdateKind::SourceReferences(_) | BootstrapStateUpdateKind::UnfinalizedShard(_) => { - post_item_updates.push(StateUpdate { - kind: kind.into(), - ts, - diff, - }) + post_item_updates.push((kind, ts, diff)); } BootstrapStateUpdateKind::AuditLog(_) => { audit_log_updates.push(StateUpdate { @@ -501,7 +497,7 @@ impl Catalog { // Migrate item ASTs. let builtin_table_update = if !config.skip_migrations { - migrate::migrate( + let migrate_result = migrate::migrate( &mut state, &mut txn, &mut local_expr_cache, @@ -516,7 +512,15 @@ impl Catalog { this_version: config.build_info.version, cause: e.to_string(), }) - })? + })?; + if !migrate_result.post_item_updates.is_empty() { + // Include any post-item-updates generated by migrations, and then consolidate + // them to ensure diffs are all positive. + post_item_updates.extend(migrate_result.post_item_updates); + differential_dataflow::consolidation::consolidate_updates(&mut post_item_updates); + } + + migrate_result.builtin_table_updates } else { state .apply_updates_for_bootstrap(item_updates, &mut local_expr_cache) @@ -524,9 +528,15 @@ impl Catalog { }; builtin_table_updates.extend(builtin_table_update); - let builtin_table_update = state - .apply_updates_for_bootstrap(post_item_updates, &mut local_expr_cache) - .await; + let post_item_updates = post_item_updates + .into_iter() + .map(|(kind, ts, diff)| StateUpdate { + kind: kind.into(), + ts, + diff: diff.try_into().expect("valid diff"), + }) + .collect(); + let builtin_table_update = state.apply_updates_for_bootstrap(post_item_updates, &mut local_expr_cache).await; builtin_table_updates.extend(builtin_table_update); // We don't need to apply the audit logs in memory, yet apply can be expensive when the diff --git a/src/sql-parser/src/ast/defs/ddl.rs b/src/sql-parser/src/ast/defs/ddl.rs index 7bffdcc990681..573aa56f4dfce 100644 --- a/src/sql-parser/src/ast/defs/ddl.rs +++ b/src/sql-parser/src/ast/defs/ddl.rs @@ -1294,6 +1294,24 @@ impl AstDisplay for LoadGenerator { } impl_display!(LoadGenerator); +impl LoadGenerator { + /// Corresponds with the same mapping on the `LoadGenerator` enum defined in + /// src/storage-types/src/sources/load_generator.rs, but re-defined here for + /// cases where we only have the AST representation. 
This can be removed once + /// the `ast_rewrite_sources_to_tables` migration is removed. + pub fn schema_name(&self) -> &'static str { + match self { + LoadGenerator::Counter => "counter", + LoadGenerator::Clock => "clock", + LoadGenerator::Marketing => "marketing", + LoadGenerator::Auction => "auction", + LoadGenerator::Datums => "datums", + LoadGenerator::Tpch => "tpch", + LoadGenerator::KeyValue => "key_value", + } + } +} + #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum LoadGeneratorOptionName { ScaleFactor, diff --git a/src/sql/src/names.rs b/src/sql/src/names.rs index 7af22528fce85..2eb24322e83b1 100644 --- a/src/sql/src/names.rs +++ b/src/sql/src/names.rs @@ -24,6 +24,7 @@ use mz_repr::network_policy_id::NetworkPolicyId; use mz_repr::role_id::RoleId; use mz_repr::{CatalogItemId, GlobalId, RelationVersion}; use mz_repr::{ColumnName, RelationVersionSelector}; +use mz_sql_parser::ast::visit_mut::VisitMutNode; use mz_sql_parser::ast::{CreateContinualTaskStatement, Expr, RawNetworkPolicyName, Version}; use mz_sql_parser::ident; use proptest_derive::Arbitrary; @@ -2386,6 +2387,44 @@ where ResolvedIds::new(visitor.ids) } +#[derive(Debug)] +pub struct ItemDependencyModifier<'a> { + pub modified: bool, + pub id_map: &'a BTreeMap, +} + +impl<'ast, 'a> VisitMut<'ast, Raw> for ItemDependencyModifier<'a> { + fn visit_item_name_mut(&mut self, item_name: &mut RawItemName) { + if let RawItemName::Id(id, _, _) = item_name { + let parsed_id = id.parse::().unwrap(); + if let Some(new_id) = self.id_map.get(&parsed_id) { + *id = new_id.to_string(); + self.modified = true; + } + } + } +} + +/// Updates any references in the provided AST node that are keys in `id_map`. +/// If an id is found it will be updated to the value of the key in `id_map`. +/// This assumes the names of the reference(s) are unmodified (e.g. each pair of +/// ids refer to an item of the same name, whose id has changed). +pub fn modify_dependency_item_ids<'ast, N>( + node: &'ast mut N, + id_map: &BTreeMap, +) -> bool +where + N: VisitMutNode<'ast, Raw>, +{ + let mut modifier = ItemDependencyModifier { + id_map, + modified: false, + }; + node.visit_mut(&mut modifier); + + modifier.modified +} + // Used when displaying a view's source for human creation. If the name // specified is the same as the name in the catalog, we don't use the ID format. #[derive(Debug)] diff --git a/src/storage-types/src/sources/load_generator.rs b/src/storage-types/src/sources/load_generator.rs index 3d9dd471cd71a..0d6ea646992f1 100644 --- a/src/storage-types/src/sources/load_generator.rs +++ b/src/storage-types/src/sources/load_generator.rs @@ -196,6 +196,8 @@ pub enum LoadGenerator { pub const LOAD_GENERATOR_DATABASE_NAME: &str = "mz_load_generators"; impl LoadGenerator { + /// Must be kept in-sync with the same mapping on the `LoadGenerator` enum defined in + /// src/sql-parser/src/ast/defs/ddl.rs. pub fn schema_name(&self) -> &'static str { match self { LoadGenerator::Counter { .. 
} => "counter", From 0d0bf92d73d5a25233c7bc0304357a837d36d141 Mon Sep 17 00:00:00 2001 From: Roshan Jobanputra Date: Thu, 24 Oct 2024 14:37:35 -0400 Subject: [PATCH 08/52] Also test the migration in the legacy upgrade tests --- misc/python/materialize/mzcompose/__init__.py | 5 +- test/legacy-upgrade/mzcompose.py | 54 +++++++++++++++++-- 2 files changed, 53 insertions(+), 6 deletions(-) diff --git a/misc/python/materialize/mzcompose/__init__.py b/misc/python/materialize/mzcompose/__init__.py index 561e68719a92e..c54d577571f08 100644 --- a/misc/python/materialize/mzcompose/__init__.py +++ b/misc/python/materialize/mzcompose/__init__.py @@ -53,7 +53,9 @@ def get_default_system_parameters( - version: MzVersion | None = None, zero_downtime: bool = False + version: MzVersion | None = None, + zero_downtime: bool = False, + force_source_table_syntax: bool = False, ) -> dict[str, str]: """For upgrade tests we only want parameters set when all environmentd / clusterd processes have reached a specific version (or higher) @@ -125,6 +127,7 @@ def get_default_system_parameters( "persist_record_schema_id": ( "true" if version > MzVersion.parse_mz("v0.127.0-dev") else "false" ), + "force_source_table_syntax": "true" if force_source_table_syntax else "false", "persist_batch_columnar_format": "both_v2", "persist_batch_delete_enabled": "true", "persist_batch_structured_order": "true", diff --git a/test/legacy-upgrade/mzcompose.py b/test/legacy-upgrade/mzcompose.py index a839aabe7e23b..6f88e272d3011 100644 --- a/test/legacy-upgrade/mzcompose.py +++ b/test/legacy-upgrade/mzcompose.py @@ -116,20 +116,56 @@ def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None: else: if parallelism_count == 1 or parallelism_index == 0: test_upgrade_from_version( - c, f"{version}", priors, filter=args.filter, zero_downtime=True + c, + f"{version}", + priors, + filter=args.filter, + zero_downtime=True, + force_source_table_syntax=False, ) if parallelism_count == 1 or parallelism_index == 1: test_upgrade_from_version( - c, f"{version}", priors, filter=args.filter, zero_downtime=False + c, + f"{version}", + priors, + filter=args.filter, + zero_downtime=False, + force_source_table_syntax=False, + ) + test_upgrade_from_version( + c, + f"{version}", + priors, + filter=args.filter, + zero_downtime=False, + force_source_table_syntax=True, ) if parallelism_count == 1 or parallelism_index == 0: test_upgrade_from_version( - c, "current_source", priors=[], filter=args.filter, zero_downtime=True + c, + "current_source", + priors=[], + filter=args.filter, + zero_downtime=True, + force_source_table_syntax=False, ) if parallelism_count == 1 or parallelism_index == 1: test_upgrade_from_version( - c, "current_source", priors=[], filter=args.filter, zero_downtime=False + c, + "current_source", + priors=[], + filter=args.filter, + zero_downtime=False, + force_source_table_syntax=False, + ) + test_upgrade_from_version( + c, + "current_source", + priors=[], + filter=args.filter, + zero_downtime=False, + force_source_table_syntax=True, ) @@ -152,13 +188,14 @@ def test_upgrade_from_version( priors: list[MzVersion], filter: str, zero_downtime: bool, + force_source_table_syntax: bool, ) -> None: print( f"+++ Testing {'0dt upgrade' if zero_downtime else 'regular upgrade'} from Materialize {from_version} to current_source." 
) system_parameter_defaults = get_default_system_parameters( - zero_downtime=zero_downtime + zero_downtime=zero_downtime, ) deploy_generation = 0 @@ -288,6 +325,13 @@ def test_upgrade_from_version( c.rm(mz_service) print(f"{'0dt-' if zero_downtime else ''}Upgrading to final version") + system_parameter_defaults = get_default_system_parameters( + zero_downtime=zero_downtime, + # We can only force the syntax on the final version so that the migration to convert + # sources to the new model can be applied without preventing sources from being + # created in the old syntax on the older version. + force_source_table_syntax=force_source_table_syntax, + ) mz_to = Materialized( name=mz_service, options=list(mz_options.values()), From 947d5281cf8c749dc54e2ff5f83781f25a0c3c61 Mon Sep 17 00:00:00 2001 From: Rainer Niedermayr Date: Fri, 25 Oct 2024 11:28:38 +0200 Subject: [PATCH 09/52] platform checks: unique source name --- misc/python/materialize/checks/all_checks/upsert.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/misc/python/materialize/checks/all_checks/upsert.py b/misc/python/materialize/checks/all_checks/upsert.py index 739bc9d5effa9..543478f0a0fe8 100644 --- a/misc/python/materialize/checks/all_checks/upsert.py +++ b/misc/python/materialize/checks/all_checks/upsert.py @@ -183,12 +183,12 @@ def initialize(self) -> Testdrive: $ kafka-ingest format=avro key-format=avro topic=upsert-legacy-syntax key-schema=${keyschema} schema=${schema} repeat=10000 {"key1": "A${kafka-ingest.iteration}"} {"f1": "A${kafka-ingest.iteration}"} - > CREATE SOURCE upsert_insert + > CREATE SOURCE upsert_insert_legacy FROM KAFKA CONNECTION kafka_conn (TOPIC 'testdrive-upsert-legacy-syntax-${testdrive.seed}') FORMAT AVRO USING CONFLUENT SCHEMA REGISTRY CONNECTION csr_conn ENVELOPE UPSERT - > CREATE MATERIALIZED VIEW upsert_insert_view AS SELECT COUNT(DISTINCT key1 || ' ' || f1) FROM upsert_insert; + > CREATE MATERIALIZED VIEW upsert_insert_legacy_view AS SELECT COUNT(DISTINCT key1 || ' ' || f1) FROM upsert_insert_legacy; """ ) ) @@ -212,10 +212,10 @@ def validate(self) -> Testdrive: return Testdrive( dedent( """ - > SELECT COUNT(*), COUNT(DISTINCT key1), COUNT(DISTINCT f1) FROM upsert_insert + > SELECT COUNT(*), COUNT(DISTINCT key1), COUNT(DISTINCT f1) FROM upsert_insert_legacy 10000 10000 10000 - > SELECT * FROM upsert_insert_view; + > SELECT * FROM upsert_insert_legacy_view; 10000 """ ) From 06765d2f6f1086ad5cb2ed723bdc128938b44836 Mon Sep 17 00:00:00 2001 From: Roshan Jobanputra Date: Fri, 25 Oct 2024 10:23:56 -0400 Subject: [PATCH 10/52] Migration structure cleanup from feedback --- src/adapter/src/catalog/migrate.rs | 71 +++++++++++++----------------- 1 file changed, 31 insertions(+), 40 deletions(-) diff --git a/src/adapter/src/catalog/migrate.rs b/src/adapter/src/catalog/migrate.rs index ff21c13ce89fb..da264a06fa1c6 100644 --- a/src/adapter/src/catalog/migrate.rs +++ b/src/adapter/src/catalog/migrate.rs @@ -277,7 +277,7 @@ fn ast_rewrite_sources_to_tables( let raw_source_name = match of_source { // If `of_source` is None then this is a `progress` subsource which we // are not migrating as they are not currently relevant to the new table model. - None => return Ok(()), + None => continue, Some(name) => name, }; let source = match raw_source_name { @@ -300,21 +300,17 @@ fn ast_rewrite_sources_to_tables( // The external reference is a `with_option` on subsource statements but is a // separate field on table statements. 
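// The cleanup below swaps the hand-rolled index loop for the
// `Iterator::position` + `Vec::remove` idiom. A generic, self-contained sketch
// of that idiom, with plain string tuples standing in for the real option
// types (an assumed simplification):

fn take_option_by_name(
    options: &mut Vec<(String, String)>,
    name: &str,
) -> Option<(String, String)> {
    // Find the first option with a matching name and pull it out of the list.
    let index = options
        .iter()
        .position(|(option_name, _)| option_name == name)?;
    Some(options.remove(index))
}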
- let mut i = 0; - let external_reference = loop { - if i >= with_options.len() { - panic!("subsource must have an external reference"); - } - if with_options[i].name == CreateSubsourceOptionName::ExternalReference { - let option = with_options.remove(i); - match option.value { - Some(WithOptionValue::UnresolvedItemName(name)) => break name, - _ => unreachable!("external reference must be an unresolved item name"), - }; - } else { - i += 1; - } + let external_reference = match with_options + .iter() + .position(|opt| opt.name == CreateSubsourceOptionName::ExternalReference) + { + Some(i) => match with_options.remove(i).value { + Some(WithOptionValue::UnresolvedItemName(name)) => name, + _ => unreachable!("external reference must be an unresolved item name"), + }, + None => panic!("subsource must have an external reference"), }; + let with_options = with_options .into_iter() .map(|option| { @@ -337,8 +333,7 @@ fn ast_rewrite_sources_to_tables( panic!("progress option should not exist on this subsource") } CreateSubsourceOptionName::ExternalReference => { - // This option is handled separately above. - unreachable!() + unreachable!("This option is handled separately above.") } } }) @@ -591,30 +586,26 @@ fn ast_rewrite_sources_to_tables( }]; // Move over the IgnoreKeys option if it exists. - let mut i = 0; - while i < with_options.len() { - if with_options[i].name == CreateSourceOptionName::IgnoreKeys { - let option = with_options.remove(i); - table_with_options.push(TableFromSourceOption { - name: TableFromSourceOptionName::IgnoreKeys, - value: option.value, - }); - } else { - i += 1; - } - } + if let Some(i) = with_options + .iter() + .position(|opt| opt.name == CreateSourceOptionName::IgnoreKeys) + { + let option = with_options.remove(i); + table_with_options.push(TableFromSourceOption { + name: TableFromSourceOptionName::IgnoreKeys, + value: option.value, + }); + }; // Move over the Timeline option if it exists. - i = 0; - while i < with_options.len() { - if with_options[i].name == CreateSourceOptionName::Timeline { - let option = with_options.remove(i); - table_with_options.push(TableFromSourceOption { - name: TableFromSourceOptionName::Timeline, - value: option.value, - }); - } else { - i += 1; - } + if let Some(i) = with_options + .iter() + .position(|opt| opt.name == CreateSourceOptionName::Timeline) + { + let option = with_options.remove(i); + table_with_options.push(TableFromSourceOption { + name: TableFromSourceOptionName::Timeline, + value: option.value, + }); } // Generate the same external-reference that would have been generated From b652d13e11b5247adf41e33137fdf2419e830ce5 Mon Sep 17 00:00:00 2001 From: Roshan Jobanputra Date: Fri, 25 Oct 2024 12:13:42 -0400 Subject: [PATCH 11/52] Address more feedback; ensure new source name is unique --- src/adapter/src/catalog/migrate.rs | 47 +++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/src/adapter/src/catalog/migrate.rs b/src/adapter/src/catalog/migrate.rs index da264a06fa1c6..b23eb29500d76 100644 --- a/src/adapter/src/catalog/migrate.rs +++ b/src/adapter/src/catalog/migrate.rs @@ -256,6 +256,16 @@ fn ast_rewrite_sources_to_tables( .collect::, anyhow::Error>>()?; let items_with_statements_copied = items_with_statements.clone(); + let item_names_per_schema = items_with_statements_copied + .iter() + .map(|(item, _)| (item.schema_id.clone(), &item.name)) + .fold(BTreeMap::new(), |mut acc, (schema_id, name)| { + acc.entry(schema_id) + .or_insert_with(|| btreeset! 
{}) + .insert(name); + acc + }); + // Any GlobalId that should be changed to a new GlobalId in any statements that // reference it. This is necessary for ensuring downstream statements (e.g. // mat views, indexes) that reference a single-output source (e.g. kafka) @@ -538,22 +548,40 @@ fn ast_rewrite_sources_to_tables( && include_metadata.is_empty() { info!("migrate: skipping already migrated source: {}", name); - return Ok(()); + continue; } // Use the current source name as the new table name, and rename the source to // `_source`. This is intended to allow users to continue using // queries that reference the source name, since they will now need to query the // table instead. - let new_source_name_ident = Ident::new_unchecked( - name.0.last().expect("at least one ident").to_string() + "_source", - ); - let mut new_source_name = name.clone(); - *new_source_name.0.last_mut().expect("at least one ident") = new_source_name_ident; - // Also update the name of the source 'item' - let mut table_item_name = item.name.clone() + "_source"; - std::mem::swap(&mut item.name, &mut table_item_name); + // First find an unused name within the same schema to avoid conflicts. + let mut new_source_item_name = format!("{}_source", item.name); + let mut new_source_name_inner = + format!("{}_source", name.0.last().expect("at least one ident")); + let mut i = 0; + while item_names_per_schema + .get(&item.schema_id) + .expect("schema must exist") + .contains(&new_source_item_name) + { + new_source_item_name = format!("{}_source_{}", item.name, i); + new_source_name_inner = format!( + "{}_source_{}", + name.0.last().expect("at least one ident"), + i + ); + i += 1; + } + // We will use the original item name for the new table item. + let table_item_name = item.name.clone(); + + // Update the source item/statement to use the new name. + let mut new_source_name = name.clone(); + *new_source_name.0.last_mut().expect("at least one ident") = + Ident::new_unchecked(new_source_name_inner); + item.name = new_source_item_name; // A reference to the source that will be included in the table statement let source_ref = @@ -611,7 +639,6 @@ fn ast_rewrite_sources_to_tables( // Generate the same external-reference that would have been generated // during purification for single-output sources. let external_reference = match &conn { - // For kafka sources this proto is currently empty. CreateSourceConnection::Kafka { options, .. 
} => { let topic_option = options .iter() From c0898497ffef5c7ccf9bde0888140606c2dfe6fd Mon Sep 17 00:00:00 2001 From: Roshan Jobanputra Date: Fri, 25 Oct 2024 14:32:08 -0400 Subject: [PATCH 12/52] Fix legacy-upgrade checks --- test/legacy-upgrade/check-from-v0.27.0-compile-proto-sources.td | 2 -- test/legacy-upgrade/check-from-v0.45.0-kafka-progress.td | 2 -- 2 files changed, 4 deletions(-) diff --git a/test/legacy-upgrade/check-from-v0.27.0-compile-proto-sources.td b/test/legacy-upgrade/check-from-v0.27.0-compile-proto-sources.td index 4240d3a2229de..fcc08c2529d2d 100644 --- a/test/legacy-upgrade/check-from-v0.27.0-compile-proto-sources.td +++ b/test/legacy-upgrade/check-from-v0.27.0-compile-proto-sources.td @@ -24,5 +24,3 @@ c h a e - -> DROP SOURCE kafka_proto_source CASCADE; diff --git a/test/legacy-upgrade/check-from-v0.45.0-kafka-progress.td b/test/legacy-upgrade/check-from-v0.45.0-kafka-progress.td index 69af351dbbe33..9592f9055b7d5 100644 --- a/test/legacy-upgrade/check-from-v0.45.0-kafka-progress.td +++ b/test/legacy-upgrade/check-from-v0.45.0-kafka-progress.td @@ -18,5 +18,3 @@ $ set-regex match=\d+ replacement= $ set-regex match=testdrive-upgrade-kafka-source-.*?' replacement=' >[version<8100] SHOW CREATE SOURCE kafka_source materialize.public.kafka_source "CREATE SOURCE \"materialize\".\"public\".\"kafka_source\" FROM KAFKA CONNECTION \"materialize\".\"public\".\"kafka_conn\" (TOPIC = '') FORMAT AVRO USING SCHEMA '{ \"type\": \"record\", \"name\": \"cpx\", \"fields\": [ {\"name\": \"a\", \"type\": \"long\"}, {\"name\": \"b\", \"type\": \"long\"} ] }' ENVELOPE NONE EXPOSE PROGRESS AS \"materialize\".\"public\".\"kafka_source_progress\"" - -> DROP SOURCE kafka_source; From 491ac189d44ce67d35de2f3c8d3ac12f015cd19f Mon Sep 17 00:00:00 2001 From: Roshan Jobanputra Date: Fri, 25 Oct 2024 14:49:52 -0400 Subject: [PATCH 13/52] Fixes caused by rebase on main --- src/adapter/src/catalog/migrate.rs | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/adapter/src/catalog/migrate.rs b/src/adapter/src/catalog/migrate.rs index b23eb29500d76..2c645ec9e56d3 100644 --- a/src/adapter/src/catalog/migrate.rs +++ b/src/adapter/src/catalog/migrate.rs @@ -38,7 +38,8 @@ where for mut item in tx.get_items() { let mut stmt = mz_sql::parse::parse(&item.create_sql)?.into_element().ast; - f(tx, item.id, &mut stmt).await?; + // TODO(alter_table): Switch this to CatalogItemId. + f(tx, item.global_id, &mut stmt).await?; item.create_sql = stmt.to_ast_string_stable(); updated_items.insert(item.id, item); @@ -303,7 +304,7 @@ fn ast_rewrite_sources_to_tables( _ => false, }) .expect("source must exist"); - RawItemName::Id(source_item.id.to_string(), name, None) + RawItemName::Id(source_item.global_id.to_string(), name, None) } RawItemName::Id(..) 
=> raw_source_name, }; @@ -368,7 +369,7 @@ fn ast_rewrite_sources_to_tables( item.create_sql, table ); item.create_sql = Statement::CreateTableFromSource(table).to_ast_string_stable(); - tx.update_item(item.id, item)?; + tx.update_item(item.global_id, item)?; } // Postgres sources are multi-output sources whose subsources are @@ -419,7 +420,7 @@ fn ast_rewrite_sources_to_tables( progress_subsource, }); item.create_sql = stmt.to_ast_string_stable(); - tx.update_item(item.id, item)?; + tx.update_item(item.global_id, item)?; info!("migrate: converted postgres source {stmt} to remove subsource options"); } } @@ -477,7 +478,7 @@ fn ast_rewrite_sources_to_tables( progress_subsource, }); item.create_sql = stmt.to_ast_string_stable(); - tx.update_item(item.id, item)?; + tx.update_item(item.global_id, item)?; info!("migrate: converted mysql source {stmt} to remove subsource options"); } } @@ -538,7 +539,7 @@ fn ast_rewrite_sources_to_tables( source_id.parse().expect("valid id") } }; - source == item.id + source == item.global_id } _ => false, }); @@ -585,7 +586,7 @@ fn ast_rewrite_sources_to_tables( // A reference to the source that will be included in the table statement let source_ref = - RawItemName::Id(item.id.to_string(), new_source_name.clone(), None); + RawItemName::Id(item.global_id.to_string(), new_source_name.clone(), None); let columns = if col_names.is_empty() { TableFromSourceColumns::NotSpecified @@ -698,7 +699,7 @@ fn ast_rewrite_sources_to_tables( progress_subsource, }; - let source_id = item.id; + let source_id = item.global_id; let schema_id = item.schema_id.clone(); let schema = tx.get_schema(&item.schema_id).expect("schema must exist"); @@ -709,7 +710,7 @@ fn ast_rewrite_sources_to_tables( // otherwise conflict with the new table statement. info!("migrate: updated source {} to {source}", item.create_sql); item.create_sql = Statement::CreateSource(source).to_ast_string_stable(); - tx.update_item(item.id, item)?; + tx.update_item(item.global_id, item)?; // Insert the new table statement into the catalog with a new id. 
let ids = tx.allocate_user_item_ids(1)?; @@ -785,7 +786,7 @@ fn ast_rewrite_sources_to_tables( _ => { if mz_sql::names::modify_dependency_item_ids(&mut statement, &changed_ids) { item.create_sql = statement.to_ast_string_stable(); - updated_items.insert(item.id, item); + updated_items.insert(item.global_id, item); } } } From 8c983504cf381d7d54b806c67695c3aec147741e Mon Sep 17 00:00:00 2001 From: Rainer Niedermayr Date: Mon, 28 Oct 2024 11:24:18 +0100 Subject: [PATCH 14/52] ci: print source table migration issues --- misc/python/materialize/cli/ci_annotate_errors.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/misc/python/materialize/cli/ci_annotate_errors.py b/misc/python/materialize/cli/ci_annotate_errors.py index 53b7fc82d272b..84865cbbcaff4 100644 --- a/misc/python/materialize/cli/ci_annotate_errors.py +++ b/misc/python/materialize/cli/ci_annotate_errors.py @@ -101,6 +101,8 @@ | (FAIL|TIMEOUT)\s+\[\s*\d+\.\d+s\] # parallel-workload | worker_.*\ still\ running: [\s\S]* Threads\ have\ not\ stopped\ within\ 5\ minutes,\ exiting\ hard + # source-table migration + | source-table-migration\ issue ) .* $ """, From da5d73332286b45680eb23efd0c245fb4ad702d4 Mon Sep 17 00:00:00 2001 From: Rainer Niedermayr Date: Fri, 25 Oct 2024 14:10:37 +0200 Subject: [PATCH 15/52] migration tests: pg-cdc-old-syntax --- ci/nightly/pipeline.template.yml | 11 ++ test/pg-cdc-old-syntax/mzcompose.py | 132 +++++++++++++++++- .../subsource-resolution-duplicates.td | 3 - 3 files changed, 141 insertions(+), 5 deletions(-) diff --git a/ci/nightly/pipeline.template.yml b/ci/nightly/pipeline.template.yml index 01d43fc235446..0c71a2de11d54 100644 --- a/ci/nightly/pipeline.template.yml +++ b/ci/nightly/pipeline.template.yml @@ -646,6 +646,17 @@ steps: queue: hetzner-aarch64-4cpu-8gb # the mzbuild postgres version will be used, which depends on the Dockerfile specification + - id: pg-cdc-migration + label: Postgres CDC source-versioning migration tests + depends_on: build-aarch64 + timeout_in_minutes: 360 + plugins: + - ./ci/plugins/mzcompose: + composition: pg-cdc-old-syntax + run: migration + agents: + queue: hetzner-aarch64-4cpu-8gb + - id: pg-cdc-resumption-old-syntax label: Postgres CDC resumption tests (before source versioning) depends_on: build-aarch64 diff --git a/test/pg-cdc-old-syntax/mzcompose.py b/test/pg-cdc-old-syntax/mzcompose.py index f44e64c31615b..5560e259197ad 100644 --- a/test/pg-cdc-old-syntax/mzcompose.py +++ b/test/pg-cdc-old-syntax/mzcompose.py @@ -18,10 +18,16 @@ from pg8000 import Connection from materialize import buildkite +from materialize.mz_version import MzVersion from materialize.mzcompose.composition import Composition, WorkflowArgumentParser from materialize.mzcompose.service import Service, ServiceConfig from materialize.mzcompose.services.materialized import Materialized -from materialize.mzcompose.services.postgres import Postgres +from materialize.mzcompose.services.minio import Minio +from materialize.mzcompose.services.postgres import ( + METADATA_STORE, + CockroachOrPostgresMetadata, + Postgres, +) from materialize.mzcompose.services.test_certs import TestCerts from materialize.mzcompose.services.testdrive import Testdrive from materialize.mzcompose.services.toxiproxy import Toxiproxy @@ -88,8 +94,11 @@ def create_postgres( additional_system_parameter_defaults={ "log_filter": "mz_storage::source::postgres=trace,debug,info,warn,error" }, + external_minio=True, ), Testdrive(), + CockroachOrPostgresMetadata(), + Minio(setup_materialize=True), TestCerts(), Toxiproxy(), 
create_postgres(pg_version=None), @@ -323,7 +332,11 @@ def workflow_cdc(c: Composition, parser: WorkflowArgumentParser) -> None: def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None: workflows_with_internal_sharding = ["cdc"] sharded_workflows = workflows_with_internal_sharding + buildkite.shard_list( - [w for w in c.workflows if w not in workflows_with_internal_sharding], + [ + w + for w in c.workflows + if w not in workflows_with_internal_sharding and w != "migration" + ], lambda w: w, ) print( @@ -348,3 +361,118 @@ def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None: with c.test_case(name): c.workflow(name, *parser.args) + + +def workflow_migration(c: Composition, parser: WorkflowArgumentParser) -> None: + parser.add_argument( + "filter", + nargs="*", + default=["*.td"], + help="limit to only the files matching filter", + ) + args = parser.parse_args() + + matching_files = [] + for filter in args.filter: + matching_files.extend(glob.glob(filter, root_dir="test/pg-cdc-old-syntax")) + sharded_files: list[str] = sorted( + buildkite.shard_list(matching_files, lambda file: file) + ) + print(f"Files: {sharded_files}") + + ssl_ca = c.run("test-certs", "cat", "/secrets/ca.crt", capture=True).stdout + ssl_cert = c.run("test-certs", "cat", "/secrets/certuser.crt", capture=True).stdout + ssl_key = c.run("test-certs", "cat", "/secrets/certuser.key", capture=True).stdout + ssl_wrong_cert = c.run( + "test-certs", "cat", "/secrets/postgres.crt", capture=True + ).stdout + ssl_wrong_key = c.run( + "test-certs", "cat", "/secrets/postgres.key", capture=True + ).stdout + + pg_version = get_targeted_pg_version(parser) + + mz_old_image = "materialize/materialized:v0.122.0" + mz_new_image = None + + assert MzVersion.parse_cargo() < MzVersion.parse_mz( + "v0.130.0" + ), "migration test probably no longer needed" + + for file in sharded_files: + mz_old = Materialized( + name="materialized", + image=mz_old_image, + volumes_extra=["secrets:/share/secrets"], + external_metadata_store=True, + external_minio=True, + additional_system_parameter_defaults={ + "log_filter": "mz_storage::source::postgres=trace,debug,info,warn,error" + }, + ) + + mz_new = Materialized( + name="materialized", + image=mz_new_image, + volumes_extra=["secrets:/share/secrets"], + external_metadata_store=True, + external_minio=True, + additional_system_parameter_defaults={ + "log_filter": "mz_storage::source::postgres=trace,debug,info,warn,error", + "force_source_table_syntax": "true", + }, + ) + with c.override(mz_old, create_postgres(pg_version=pg_version)): + c.up("materialized", "test-certs", "postgres") + + print(f"Running {file} with mz_old") + + c.run_testdrive_files( + f"--var=ssl-ca={ssl_ca}", + f"--var=ssl-cert={ssl_cert}", + f"--var=ssl-key={ssl_key}", + f"--var=ssl-wrong-cert={ssl_wrong_cert}", + f"--var=ssl-wrong-key={ssl_wrong_key}", + f"--var=default-replica-size={Materialized.Size.DEFAULT_SIZE}-{Materialized.Size.DEFAULT_SIZE}", + f"--var=default-storage-size={Materialized.Size.DEFAULT_SIZE}-1", + "--no-reset", + file, + ) + c.kill("materialized", wait=True) + + with c.override(mz_new): + c.up("materialized") + + print("Running mz_new") + verify_sources(c, file) + + c.kill("materialized", wait=True) + c.kill("postgres", wait=True) + c.kill(METADATA_STORE, wait=True) + c.rm("materialized") + c.rm(METADATA_STORE) + c.rm("postgres") + c.rm_volumes("mzdata") + + +def verify_sources(c: Composition, file: str) -> None: + source_names = c.sql_query("SELECT name FROM mz_sources WHERE id 
LIKE 'u%';") + + print(f"Sources created in {file} are: {source_names}") + + for row in source_names: + verify_source(c, file, row[0]) + + +def verify_source(c: Composition, file: str, source_name: str) -> None: + try: + print(f"Checking source: {source_name}") + # must not crash + c.sql_query(f"SELECT count(*) FROM {source_name};") + + result = c.sql_query(f"SHOW CREATE SOURCE {source_name};") + sql = result[0][1] + assert "FOR TABLE" not in sql, f"FOR TABLE found in: {sql}" + assert "FOR ALL TABLES" not in sql, f"FOR ALL TABLES found in: {sql}" + except Exception as e: + print(f"source-table-migration issue in {file}: {str(e)}") diff --git a/test/pg-cdc-old-syntax/subsource-resolution-duplicates.td b/test/pg-cdc-old-syntax/subsource-resolution-duplicates.td index 3f510be4088b8..8f053fd552679 100644 --- a/test/pg-cdc-old-syntax/subsource-resolution-duplicates.td +++ b/test/pg-cdc-old-syntax/subsource-resolution-duplicates.td @@ -57,6 +57,3 @@ detail: subsources referencing table: x, y mz_source postgres quickstart "" mz_source_progress progress "" t subsource quickstart "" - -$ postgres-execute connection=postgres://postgres:postgres@postgres -DROP SCHEMA other CASCADE; From 9451a6b161c044b0f9be36ca0a807f94bf553c4b Mon Sep 17 00:00:00 2001 From: Rainer Niedermayr Date: Mon, 28 Oct 2024 11:35:03 +0100 Subject: [PATCH 16/52] migration tests: extract logic --- .../materialize/source_table_migration.py | 50 +++++++++++++++++++ test/pg-cdc-old-syntax/mzcompose.py | 42 +++------------- 2 files changed, 58 insertions(+), 34 deletions(-) create mode 100644 misc/python/materialize/source_table_migration.py diff --git a/misc/python/materialize/source_table_migration.py b/misc/python/materialize/source_table_migration.py new file mode 100644 index 0000000000000..ed420cbf172f8 --- /dev/null +++ b/misc/python/materialize/source_table_migration.py @@ -0,0 +1,50 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. 
+ +"""Utilities for testing the source table migration""" +from materialize.mz_version import MzVersion +from materialize.mzcompose.composition import Composition + + +def verify_sources_after_source_table_migration(c: Composition, file: str) -> None: + source_names = c.sql_query("SELECT name FROM mz_sources WHERE id LIKE 'u%';") + + print(f"Sources created in {file} are: {source_names}") + + for row in source_names: + _verify_source(c, file, row[0]) + + +def _verify_source(c: Composition, file: str, source_name: str) -> None: + try: + print(f"Checking source: {source_name}") + # must not crash + c.sql("SET statement_timeout = '20s'") + c.sql_query(f"SELECT count(*) FROM {source_name};") + + result = c.sql_query(f"SHOW CREATE SOURCE {source_name};") + sql = result[0][1] + assert "FOR TABLE" not in sql, f"FOR TABLE found in: {sql}" + assert "FOR ALL TABLES" not in sql, f"FOR ALL TABLES found in: {sql}" + except Exception as e: + print(f"source-table-migration issue in {file}: {str(e)}") + + +def check_source_table_migration_test_sensible() -> None: + assert MzVersion.parse_cargo() < MzVersion.parse_mz( + "v0.130.0" + ), "migration test probably no longer needed" + + +def get_old_image_for_source_table_migration_test() -> str: + return "materialize/materialized:v0.122.0" + + +def get_new_image_for_source_table_migration_test() -> str | None: + return None diff --git a/test/pg-cdc-old-syntax/mzcompose.py b/test/pg-cdc-old-syntax/mzcompose.py index 5560e259197ad..51ab5ed98d0e9 100644 --- a/test/pg-cdc-old-syntax/mzcompose.py +++ b/test/pg-cdc-old-syntax/mzcompose.py @@ -18,7 +18,6 @@ from pg8000 import Connection from materialize import buildkite -from materialize.mz_version import MzVersion from materialize.mzcompose.composition import Composition, WorkflowArgumentParser from materialize.mzcompose.service import Service, ServiceConfig from materialize.mzcompose.services.materialized import Materialized @@ -31,6 +30,11 @@ from materialize.mzcompose.services.test_certs import TestCerts from materialize.mzcompose.services.testdrive import Testdrive from materialize.mzcompose.services.toxiproxy import Toxiproxy +from materialize.source_table_migration import ( + get_new_image_for_source_table_migration_test, + get_old_image_for_source_table_migration_test, + verify_sources_after_source_table_migration, +) # Set the max slot WAL keep size to 10MB DEFAULT_PG_EXTRA_COMMAND = ["-c", "max_slot_wal_keep_size=10"] @@ -392,17 +396,10 @@ def workflow_migration(c: Composition, parser: WorkflowArgumentParser) -> None: pg_version = get_targeted_pg_version(parser) - mz_old_image = "materialize/materialized:v0.122.0" - mz_new_image = None - - assert MzVersion.parse_cargo() < MzVersion.parse_mz( - "v0.130.0" - ), "migration test probably no longer needed" - for file in sharded_files: mz_old = Materialized( name="materialized", - image=mz_old_image, + image=get_old_image_for_source_table_migration_test(), volumes_extra=["secrets:/share/secrets"], external_metadata_store=True, external_minio=True, @@ -413,7 +410,7 @@ def workflow_migration(c: Composition, parser: WorkflowArgumentParser) -> None: mz_new = Materialized( name="materialized", - image=mz_new_image, + image=get_new_image_for_source_table_migration_test(), volumes_extra=["secrets:/share/secrets"], external_metadata_store=True, external_minio=True, @@ -444,7 +441,7 @@ def workflow_migration(c: Composition, parser: WorkflowArgumentParser) -> None: c.up("materialized") print("Running mz_new") - verify_sources(c, file) + 
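# At this point the catalog written by the old image is restarted under the new
# image, and the shared helper below checks every user source: it must still
# answer a simple count query, and its SHOW CREATE SOURCE output must no longer
# contain subsource-style syntax. To exercise this flow locally, an invocation
# along these lines is assumed to work from the repository root:
#
#   bin/mzcompose --find pg-cdc-old-syntax run migration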
verify_sources_after_source_table_migration(c, file) c.kill("materialized", wait=True) c.kill("postgres", wait=True) @@ -453,26 +450,3 @@ def workflow_migration(c: Composition, parser: WorkflowArgumentParser) -> None: c.rm(METADATA_STORE) c.rm("postgres") c.rm_volumes("mzdata") - - -def verify_sources(c: Composition, file: str) -> None: - source_names = c.sql_query("SELECT name FROM mz_sources WHERE id LIKE 'u%';") - - print(f"Sources created in {file} are: {source_names}") - - for row in source_names: - verify_source(c, file, row[0]) - - -def verify_source(c: Composition, file: str, source_name: str) -> None: - try: - print(f"Checking source: {source_name}") - # must not crash - c.sql_query(f"SELECT count(*) FROM {source_name};") - - result = c.sql_query(f"SHOW CREATE SOURCE {source_name};") - sql = result[0][1] - assert "FOR TABLE" not in sql, f"FOR TABLE found in: {sql}" - assert "FOR ALL TABLES" not in sql, f"FOR ALL TABLES found in: {sql}" - except Exception as e: - print(f"source-table-migration issue in {file}: {str(e)}") From cfd9da3404058cce4293469c06da7abab02ba007 Mon Sep 17 00:00:00 2001 From: Rainer Niedermayr Date: Mon, 28 Oct 2024 16:29:49 +0100 Subject: [PATCH 17/52] migration tests: improve verification --- .../materialize/source_table_migration.py | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/misc/python/materialize/source_table_migration.py b/misc/python/materialize/source_table_migration.py index ed420cbf172f8..21ecc1ee4d7c1 100644 --- a/misc/python/materialize/source_table_migration.py +++ b/misc/python/materialize/source_table_migration.py @@ -12,16 +12,23 @@ from materialize.mzcompose.composition import Composition -def verify_sources_after_source_table_migration(c: Composition, file: str) -> None: - source_names = c.sql_query("SELECT name FROM mz_sources WHERE id LIKE 'u%';") +def verify_sources_after_source_table_migration( + c: Composition, file: str, fail: bool = False +) -> None: + source_names_rows = c.sql_query( + "SELECT sm.name || '.' 
|| src.name FROM mz_sources src INNER JOIN mz_schemas sm ON src.schema_id = sm.id WHERE src.id LIKE 'u%';" + ) + source_names = [row[0] for row in source_names_rows] print(f"Sources created in {file} are: {source_names}") - for row in source_names: - _verify_source(c, file, row[0]) + for source_name in source_names: + _verify_source(c, file, source_name, fail=fail) -def _verify_source(c: Composition, file: str, source_name: str) -> None: +def _verify_source( + c: Composition, file: str, source_name: str, fail: bool = False +) -> None: try: print(f"Checking source: {source_name}") # must not crash @@ -32,9 +39,14 @@ def _verify_source(c: Composition, file: str, source_name: str) -> None: sql = result[0][1] assert "FOR TABLE" not in sql, f"FOR TABLE found in: {sql}" assert "FOR ALL TABLES" not in sql, f"FOR ALL TABLES found in: {sql}" + assert "CREATE SUBSOURCE" not in sql, f"FOR ALL TABLES found in: {sql}" + print("OK.") except Exception as e: print(f"source-table-migration issue in {file}: {str(e)}") + if fail: + raise e + def check_source_table_migration_test_sensible() -> None: assert MzVersion.parse_cargo() < MzVersion.parse_mz( From b737c30f26e188ec55ae70add164b42fa4982174 Mon Sep 17 00:00:00 2001 From: Rainer Niedermayr Date: Mon, 28 Oct 2024 13:47:06 +0100 Subject: [PATCH 18/52] migration tests: mysql-cdc-old-syntax --- ci/nightly/pipeline.template.yml | 11 ++ test/mysql-cdc-old-syntax/mzcompose.py | 106 +++++++++++++++++- .../subsource-resolution-duplicates.td | 3 - 3 files changed, 116 insertions(+), 4 deletions(-) diff --git a/ci/nightly/pipeline.template.yml b/ci/nightly/pipeline.template.yml index 0c71a2de11d54..d55b219a764f8 100644 --- a/ci/nightly/pipeline.template.yml +++ b/ci/nightly/pipeline.template.yml @@ -615,6 +615,17 @@ steps: agents: queue: hetzner-aarch64-4cpu-8gb + - id: mysql-cdc-migration + label: MySQL CDC source-versioning migration tests + depends_on: build-aarch64 + timeout_in_minutes: 360 + plugins: + - ./ci/plugins/mzcompose: + composition: mysql-cdc-old-syntax + run: migration + agents: + queue: hetzner-aarch64-4cpu-8gb + - id: mysql-cdc-resumption-old-syntax label: MySQL CDC resumption tests (before source versioning) depends_on: build-aarch64 diff --git a/test/mysql-cdc-old-syntax/mzcompose.py b/test/mysql-cdc-old-syntax/mzcompose.py index 32d78456c45c2..46986c98c3190 100644 --- a/test/mysql-cdc-old-syntax/mzcompose.py +++ b/test/mysql-cdc-old-syntax/mzcompose.py @@ -22,9 +22,19 @@ ) from materialize.mzcompose.composition import Composition, WorkflowArgumentParser from materialize.mzcompose.services.materialized import Materialized +from materialize.mzcompose.services.minio import Minio from materialize.mzcompose.services.mysql import MySql +from materialize.mzcompose.services.postgres import ( + METADATA_STORE, + CockroachOrPostgresMetadata, +) from materialize.mzcompose.services.test_certs import TestCerts from materialize.mzcompose.services.testdrive import Testdrive +from materialize.source_table_migration import ( + get_new_image_for_source_table_migration_test, + get_old_image_for_source_table_migration_test, + verify_sources_after_source_table_migration, +) def create_mysql(mysql_version: str) -> MySql: @@ -46,6 +56,7 @@ def create_mysql_replica(mysql_version: str) -> MySql: SERVICES = [ Materialized( + external_minio=True, additional_system_parameter_defaults={ "log_filter": "mz_storage::source::mysql=trace,info" }, @@ -53,6 +64,8 @@ def create_mysql_replica(mysql_version: str) -> MySql: create_mysql(MySql.DEFAULT_VERSION), 
create_mysql_replica(MySql.DEFAULT_VERSION), TestCerts(), + CockroachOrPostgresMetadata(), + Minio(setup_materialize=True), Testdrive(default_timeout="60s"), ] @@ -72,7 +85,11 @@ def get_targeted_mysql_version(parser: WorkflowArgumentParser) -> str: def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None: workflows_with_internal_sharding = ["cdc"] sharded_workflows = workflows_with_internal_sharding + buildkite.shard_list( - [w for w in c.workflows if w not in workflows_with_internal_sharding], + [ + w + for w in c.workflows + if w not in workflows_with_internal_sharding and w != "migration" + ], lambda w: w, ) print( @@ -273,3 +290,90 @@ def do_inserts(c: Composition): """ ), ) + + +def workflow_migration(c: Composition, parser: WorkflowArgumentParser) -> None: + parser.add_argument( + "filter", + nargs="*", + default=["*.td"], + help="limit to only the files matching filter", + ) + args = parser.parse_args() + + matching_files = [] + for filter in args.filter: + matching_files.extend(glob.glob(filter, root_dir="test/mysql-cdc-old-syntax")) + + # TODO: database-issues#8706 + matching_files = [file for file in matching_files if file != "alter-source.td"] + + sharded_files: list[str] = sorted( + buildkite.shard_list(matching_files, lambda file: file) + ) + print(f"Files: {sharded_files}") + + mysql_version = get_targeted_mysql_version(parser) + + for file in sharded_files: + + mz_old = Materialized( + name="materialized", + image=get_old_image_for_source_table_migration_test(), + external_metadata_store=True, + external_minio=True, + additional_system_parameter_defaults={ + "log_filter": "mz_storage::source::mysql=trace,info" + }, + ) + + mz_new = Materialized( + name="materialized", + image=get_new_image_for_source_table_migration_test(), + external_metadata_store=True, + external_minio=True, + additional_system_parameter_defaults={ + "log_filter": "mz_storage::source::mysql=trace,info" + }, + ) + + with c.override(mz_old, create_mysql(mysql_version)): + c.up("materialized", "mysql") + + print(f"Running {file} with mz_old") + + valid_ssl_context = retrieve_ssl_context_for_mysql(c) + wrong_ssl_context = retrieve_invalid_ssl_context_for_mysql(c) + + c.sources_and_sinks_ignored_from_validation.add("drop_table") + + c.run_testdrive_files( + f"--var=ssl-ca={valid_ssl_context.ca}", + f"--var=ssl-client-cert={valid_ssl_context.client_cert}", + f"--var=ssl-client-key={valid_ssl_context.client_key}", + f"--var=ssl-wrong-ca={wrong_ssl_context.ca}", + f"--var=ssl-wrong-client-cert={wrong_ssl_context.client_cert}", + f"--var=ssl-wrong-client-key={wrong_ssl_context.client_key}", + f"--var=mysql-root-password={MySql.DEFAULT_ROOT_PASSWORD}", + "--var=mysql-user-password=us3rp4ssw0rd", + f"--var=default-replica-size={Materialized.Size.DEFAULT_SIZE}-{Materialized.Size.DEFAULT_SIZE}", + f"--var=default-storage-size={Materialized.Size.DEFAULT_SIZE}-1", + "--no-reset", + file, + ) + + c.kill("materialized", wait=True) + + with c.override(mz_new): + c.up("materialized") + + print("Running mz_new") + verify_sources_after_source_table_migration(c, file) + + c.kill("materialized", wait=True) + c.kill("mysql", wait=True) + c.kill(METADATA_STORE, wait=True) + c.rm("materialized") + c.rm(METADATA_STORE) + c.rm("mysql") + c.rm_volumes("mzdata") diff --git a/test/mysql-cdc-old-syntax/subsource-resolution-duplicates.td b/test/mysql-cdc-old-syntax/subsource-resolution-duplicates.td index 1b5662cc78956..c762d583849e5 100644 --- a/test/mysql-cdc-old-syntax/subsource-resolution-duplicates.td +++ 
b/test/mysql-cdc-old-syntax/subsource-resolution-duplicates.td @@ -53,6 +53,3 @@ detail: subsources referencing table: x, y mz_source mysql quickstart "" mz_source_progress progress "" t subsource quickstart "" - -$ mysql-execute name=mysql -DROP DATABASE other; From e8fd48be0bc56a8501904d57f445c6716f4dd7ff Mon Sep 17 00:00:00 2001 From: Rainer Niedermayr Date: Mon, 28 Oct 2024 14:10:55 +0100 Subject: [PATCH 19/52] migration tests: testdrive-old-kafka-syntax --- ci/nightly/pipeline.template.yml | 11 + .../mzcompose.py | 227 +++++++++++++++++- 2 files changed, 235 insertions(+), 3 deletions(-) diff --git a/ci/nightly/pipeline.template.yml b/ci/nightly/pipeline.template.yml index d55b219a764f8..d8fff84b85c28 100644 --- a/ci/nightly/pipeline.template.yml +++ b/ci/nightly/pipeline.template.yml @@ -699,6 +699,17 @@ steps: agents: queue: hetzner-aarch64-8cpu-16gb + - id: testdrive-kafka-migration + label: "Testdrive %N migration tests" + depends_on: build-aarch64 + timeout_in_minutes: 180 + parallelism: 8 + plugins: + - ./ci/plugins/mzcompose: + composition: testdrive-old-kafka-src-syntax + run: migration + agents: + queue: hetzner-aarch64-8cpu-16gb - group: AWS key: aws diff --git a/test/testdrive-old-kafka-src-syntax/mzcompose.py b/test/testdrive-old-kafka-src-syntax/mzcompose.py index 77fd29709bf9d..517e71d961e69 100644 --- a/test/testdrive-old-kafka-src-syntax/mzcompose.py +++ b/test/testdrive-old-kafka-src-syntax/mzcompose.py @@ -12,10 +12,10 @@ the expected-result/actual-result (aka golden testing) paradigm. A query is retried until it produces the desired result. """ - +import glob from pathlib import Path -from materialize import ci_util +from materialize import ci_util, spawn from materialize.mzcompose import get_default_system_parameters from materialize.mzcompose.composition import Composition, WorkflowArgumentParser from materialize.mzcompose.services.azure import Azurite @@ -24,11 +24,20 @@ from materialize.mzcompose.services.materialized import Materialized from materialize.mzcompose.services.minio import Minio from materialize.mzcompose.services.mysql import MySql -from materialize.mzcompose.services.postgres import Postgres +from materialize.mzcompose.services.postgres import ( + METADATA_STORE, + CockroachOrPostgresMetadata, + Postgres, +) from materialize.mzcompose.services.redpanda import Redpanda from materialize.mzcompose.services.schema_registry import SchemaRegistry from materialize.mzcompose.services.testdrive import Testdrive from materialize.mzcompose.services.zookeeper import Zookeeper +from materialize.source_table_migration import ( + get_new_image_for_source_table_migration_test, + get_old_image_for_source_table_migration_test, + verify_sources_after_source_table_migration, +) SERVICES = [ Zookeeper(), @@ -40,12 +49,25 @@ Minio(setup_materialize=True, additional_directories=["copytos3"]), Azurite(), Materialized(external_blob_store=True), + CockroachOrPostgresMetadata(), FivetranDestination(volumes_extra=["tmp:/share/tmp"]), Testdrive(external_blob_store=True), ] def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None: + for name in c.workflows: + if name == "default": + continue + + if name == "migration": + continue + + with c.test_case(name): + c.workflow(name) + + +def workflow_kafka(c: Composition, parser: WorkflowArgumentParser) -> None: """Run testdrive.""" parser.add_argument( "--redpanda", @@ -237,3 +259,202 @@ def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None: ci_util.upload_junit_report( "testdrive", 
Path(__file__).parent / junit_report ) + + +def workflow_migration(c: Composition, parser: WorkflowArgumentParser) -> None: + """Run testdrive.""" + parser.add_argument( + "--redpanda", + action="store_true", + help="run against Redpanda instead of the Confluent Platform", + ) + parser.add_argument( + "--aws-region", + help="run against the specified AWS region instead of localstack", + ) + parser.add_argument( + "--kafka-default-partitions", + type=int, + metavar="N", + help="set the default number of kafka partitions per topic", + ) + parser.add_argument( + "--system-param", + type=str, + action="append", + nargs="*", + help="System parameters to set in Materialize, i.e. what you would set with `ALTER SYSTEM SET`", + ) + + parser.add_argument("--replicas", type=int, default=1, help="use multiple replicas") + + parser.add_argument( + "--default-timeout", + type=str, + help="set the default timeout for Testdrive", + ) + parser.add_argument( + "files", + nargs="*", + default=["*.td"], + help="run against the specified files", + ) + + (args, _) = parser.parse_known_args() + + matching_files = [] + for filter in args.files: + matching_files.extend( + glob.glob(filter, root_dir="test/testdrive-old-kafka-src-syntax") + ) + matching_files = [file for file in matching_files if file != "session.td"] + + dependencies = [ + "fivetran-destination", + "minio", + "materialized", + "postgres", + "mysql", + ] + + if args.redpanda: + kafka_deps = ["redpanda"] + else: + kafka_deps = ["zookeeper", "kafka", "schema-registry"] + + dependencies += kafka_deps + + testdrive = Testdrive( + forward_buildkite_shard=True, + kafka_default_partitions=args.kafka_default_partitions, + aws_region=args.aws_region, + # validate_catalog_store=True, + default_timeout=args.default_timeout, + volumes_extra=["mzdata:/mzdata"], + external_minio=True, + fivetran_destination=True, + fivetran_destination_files_path="/share/tmp", + entrypoint_extra=[f"--var=uses-redpanda={args.redpanda}"], + ) + + sysparams = args.system_param + if not args.system_param: + sysparams = [] + + additional_system_parameter_defaults = {} + for val in sysparams: + x = val[0].split("=", maxsplit=1) + assert len(x) == 2, f"--system-param '{val}' should be the format =" + key = x[0] + val = x[1] + + additional_system_parameter_defaults[key] = val + + mz_old = Materialized( + default_size=Materialized.Size.DEFAULT_SIZE, + image=get_old_image_for_source_table_migration_test(), + external_metadata_store=True, + external_minio=True, + additional_system_parameter_defaults=additional_system_parameter_defaults, + ) + mz_new = Materialized( + default_size=Materialized.Size.DEFAULT_SIZE, + image=get_new_image_for_source_table_migration_test(), + external_metadata_store=True, + external_minio=True, + additional_system_parameter_defaults=additional_system_parameter_defaults, + ) + + for file in matching_files: + with c.override(testdrive, mz_old): + c.up(*dependencies) + + c.sql( + "ALTER SYSTEM SET max_clusters = 50;", + port=6877, + user="mz_system", + ) + + non_default_testdrive_vars = [] + + if args.replicas > 1: + c.sql("DROP CLUSTER quickstart CASCADE", user="mz_system", port=6877) + # Make sure a replica named 'r1' always exists + replica_names = [ + "r1" if replica_id == 0 else f"replica{replica_id}" + for replica_id in range(0, args.replicas) + ] + replica_string = ",".join( + f"{replica_name} (SIZE '{mz_old.default_replica_size}')" + for replica_name in replica_names + ) + c.sql( + f"CREATE CLUSTER quickstart REPLICAS ({replica_string})", + user="mz_system", 
+ port=6877, + ) + + # Note that any command that outputs SHOW CLUSTERS will have output + # that depends on the number of replicas testdrive has. This means + # it might be easier to skip certain tests if the number of replicas + # is > 1. + c.sql( + f""" + CREATE CLUSTER testdrive_single_replica_cluster SIZE = '{mz_old.default_replica_size}'; + GRANT ALL PRIVILEGES ON CLUSTER testdrive_single_replica_cluster TO materialize; + """, + user="mz_system", + port=6877, + ) + + non_default_testdrive_vars.append(f"--var=replicas={args.replicas}") + non_default_testdrive_vars.append( + "--var=single-replica-cluster=testdrive_single_replica_cluster" + ) + + non_default_testdrive_vars.append( + f"--var=default-replica-size={mz_old.default_replica_size}" + ) + non_default_testdrive_vars.append( + f"--var=default-storage-size={mz_old.default_storage_size}" + ) + + print(f"Running {file} with mz_old") + + c.run_testdrive_files( + *non_default_testdrive_vars, + "--no-reset", + file, + ) + + c.kill("materialized", wait=True) + + with c.override(mz_new): + c.up("materialized") + + print("Running mz_new") + verify_sources_after_source_table_migration(c, file) + + c.kill("materialized", wait=True) + c.kill("postgres", wait=True) + c.kill("mysql", wait=True) + c.kill(METADATA_STORE, wait=True) + + for dep in kafka_deps: + c.kill(dep, wait=True) + + for dep in kafka_deps: + c.rm(dep) + + c.rm("materialized") + c.rm(METADATA_STORE) + c.rm("postgres") + c.rm("mysql") + + # remove the testdrive container which uses the mzdata volume + testdrive_container_id = spawn.capture( + ["docker", "ps", "-a", "--filter", f"volume={c.name}_mzdata", "-q"] + ).strip() + spawn.runv(["docker", "rm", testdrive_container_id]) + + c.rm_volumes("mzdata", force=True) From 2bba8bda74f2d9d3e20e75b2a2b5ca0e6c70fe2b Mon Sep 17 00:00:00 2001 From: Rainer Niedermayr Date: Tue, 29 Oct 2024 09:32:42 +0100 Subject: [PATCH 20/52] migration tests: improve output --- misc/python/materialize/source_table_migration.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/misc/python/materialize/source_table_migration.py b/misc/python/materialize/source_table_migration.py index 21ecc1ee4d7c1..ffafc1d7c3dc4 100644 --- a/misc/python/materialize/source_table_migration.py +++ b/misc/python/materialize/source_table_migration.py @@ -22,6 +22,8 @@ def verify_sources_after_source_table_migration( print(f"Sources created in {file} are: {source_names}") + c.sql("SET statement_timeout = '20s'") + for source_name in source_names: _verify_source(c, file, source_name, fail=fail) @@ -31,11 +33,15 @@ def _verify_source( ) -> None: try: print(f"Checking source: {source_name}") + # must not crash - c.sql("SET statement_timeout = '20s'") - c.sql_query(f"SELECT count(*) FROM {source_name};") + statement = f"SELECT count(*) FROM {source_name};" + print(statement) + c.sql_query(statement) - result = c.sql_query(f"SHOW CREATE SOURCE {source_name};") + statement = f"SHOW CREATE SOURCE {source_name};" + print(statement) + result = c.sql_query(statement) sql = result[0][1] assert "FOR TABLE" not in sql, f"FOR TABLE found in: {sql}" assert "FOR ALL TABLES" not in sql, f"FOR ALL TABLES found in: {sql}" From 27652cd59f01e7b0a1c82671fcad43d5a11d20f1 Mon Sep 17 00:00:00 2001 From: Rainer Niedermayr Date: Tue, 29 Oct 2024 14:15:53 +0100 Subject: [PATCH 21/52] migration tests: fixes --- test/mysql-cdc-old-syntax/mzcompose.py | 6 ++---- test/testdrive-old-kafka-src-syntax/mzcompose.py | 2 ++ 2 files changed, 4 insertions(+), 4 deletions(-) diff --git 
a/test/mysql-cdc-old-syntax/mzcompose.py b/test/mysql-cdc-old-syntax/mzcompose.py index 46986c98c3190..308cd4a9c2856 100644 --- a/test/mysql-cdc-old-syntax/mzcompose.py +++ b/test/mysql-cdc-old-syntax/mzcompose.py @@ -305,9 +305,6 @@ def workflow_migration(c: Composition, parser: WorkflowArgumentParser) -> None: for filter in args.filter: matching_files.extend(glob.glob(filter, root_dir="test/mysql-cdc-old-syntax")) - # TODO: database-issues#8706 - matching_files = [file for file in matching_files if file != "alter-source.td"] - sharded_files: list[str] = sorted( buildkite.shard_list(matching_files, lambda file: file) ) @@ -333,7 +330,8 @@ def workflow_migration(c: Composition, parser: WorkflowArgumentParser) -> None: external_metadata_store=True, external_minio=True, additional_system_parameter_defaults={ - "log_filter": "mz_storage::source::mysql=trace,info" + "log_filter": "mz_storage::source::mysql=trace,info", + "force_source_table_syntax": "true", }, ) diff --git a/test/testdrive-old-kafka-src-syntax/mzcompose.py b/test/testdrive-old-kafka-src-syntax/mzcompose.py index 517e71d961e69..075ddd07ea6e5 100644 --- a/test/testdrive-old-kafka-src-syntax/mzcompose.py +++ b/test/testdrive-old-kafka-src-syntax/mzcompose.py @@ -350,6 +350,8 @@ def workflow_migration(c: Composition, parser: WorkflowArgumentParser) -> None: additional_system_parameter_defaults[key] = val + additional_system_parameter_defaults["force_source_table_syntax"] = "true" + mz_old = Materialized( default_size=Materialized.Size.DEFAULT_SIZE, image=get_old_image_for_source_table_migration_test(), From bb49cf5e11caed3e7f1c5715d0a5ec8b685f7850 Mon Sep 17 00:00:00 2001 From: Rainer Niedermayr Date: Tue, 29 Oct 2024 16:12:33 +0100 Subject: [PATCH 22/52] migration tests: fixes --- misc/python/materialize/source_table_migration.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/misc/python/materialize/source_table_migration.py b/misc/python/materialize/source_table_migration.py index ffafc1d7c3dc4..423443da73fb2 100644 --- a/misc/python/materialize/source_table_migration.py +++ b/misc/python/materialize/source_table_migration.py @@ -45,7 +45,10 @@ def _verify_source( sql = result[0][1] assert "FOR TABLE" not in sql, f"FOR TABLE found in: {sql}" assert "FOR ALL TABLES" not in sql, f"FOR ALL TABLES found in: {sql}" - assert "CREATE SUBSOURCE" not in sql, f"FOR ALL TABLES found in: {sql}" + + if not source_name.endswith("_progress"): + assert "CREATE SUBSOURCE" not in sql, f"CREATE SUBSOURCE found in: {sql}" + print("OK.") except Exception as e: print(f"source-table-migration issue in {file}: {str(e)}") From acdb421ddcba760dfc23023b3e414bbecaeaa049 Mon Sep 17 00:00:00 2001 From: Roshan Jobanputra Date: Tue, 29 Oct 2024 15:17:24 -0400 Subject: [PATCH 23/52] Fix for mysql source being restarted after new table added --- src/storage/src/source/mysql/replication/events.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/storage/src/source/mysql/replication/events.rs b/src/storage/src/source/mysql/replication/events.rs index 14a628e8db25a..56f0016a3faa4 100644 --- a/src/storage/src/source/mysql/replication/events.rs +++ b/src/storage/src/source/mysql/replication/events.rs @@ -197,6 +197,11 @@ pub(super) async fn handle_query_event( (Some("commit"), None) => { is_complete_event = true; } + // Detect `CREATE TABLE ` statements which don't affect existing tables but do + // signify a complete event (e.g. 
for the purposes of advancing the GTID) + (Some("create"), Some("table")) => { + is_complete_event = true; + } _ => {} } From 62d7ef70470cbdc09af1a83b169aeb882f29ac02 Mon Sep 17 00:00:00 2001 From: Roshan Jobanputra Date: Tue, 29 Oct 2024 17:54:37 -0400 Subject: [PATCH 24/52] Avoid needing to rewrite ids of dependent statements by changing the sorting of item updates --- src/adapter/src/catalog/apply.rs | 79 ++++++++++++++++++++---------- src/adapter/src/catalog/migrate.rs | 1 + src/adapter/src/coord.rs | 23 --------- src/sql/src/names.rs | 26 ++++++++++ 4 files changed, 81 insertions(+), 48 deletions(-) diff --git a/src/adapter/src/catalog/apply.rs b/src/adapter/src/catalog/apply.rs index 9db55bc11d605..8157d7098c16a 100644 --- a/src/adapter/src/catalog/apply.rs +++ b/src/adapter/src/catalog/apply.rs @@ -25,7 +25,7 @@ use mz_catalog::builtin::{ use mz_catalog::durable::objects::{ ClusterKey, DatabaseKey, DurableType, ItemKey, NetworkPolicyKey, RoleKey, SchemaKey, }; -use mz_catalog::durable::{CatalogError, DurableCatalogError, SystemObjectMapping}; +use mz_catalog::durable::{CatalogError, SystemObjectMapping}; use mz_catalog::memory::error::{Error, ErrorKind}; use mz_catalog::memory::objects::{ CatalogEntry, CatalogItem, Cluster, ClusterReplica, DataSourceDesc, Database, Func, Index, Log, @@ -37,6 +37,7 @@ use mz_compute_types::config::ComputeReplicaConfig; use mz_controller::clusters::{ReplicaConfig, ReplicaLogging}; use mz_controller_types::ClusterId; use mz_expr::MirScalarExpr; +use mz_ore::collections::CollectionExt; use mz_ore::tracing::OpenTelemetryContext; use mz_ore::{instrument, soft_assert_no_log}; use mz_pgrepr::oid::INVALID_OID; @@ -54,7 +55,7 @@ use mz_sql::session::vars::{VarError, VarInput}; use mz_sql::{plan, rbac}; use mz_sql_parser::ast::Expr; use mz_storage_types::sources::Timeline; -use tracing::{error, info_span, warn, Instrument}; +use tracing::{info_span, warn, Instrument}; use crate::catalog::state::LocalExpressionCache; use crate::catalog::{BuiltinTableUpdate, CatalogState}; @@ -1029,17 +1030,7 @@ impl CatalogState { } } }; - // We allow sinks to break this invariant due to a know issue with `ALTER SINK`. - // https://github.com/MaterializeInc/materialize/pull/28708. - if !entry.is_sink() && entry.uses().iter().any(|id| *id > entry.id) { - let msg = format!( - "item cannot depend on items with larger GlobalIds, item: {:?}, dependencies: {:?}", - entry, - entry.uses() - ); - error!("internal catalog errr: {msg}"); - return Err(CatalogError::Durable(DurableCatalogError::Internal(msg))); - } + self.insert_entry(entry); } StateDiff::Retraction => { @@ -1894,25 +1885,63 @@ fn sort_updates_inner(updates: Vec) -> Vec { } } - /// Sort item updates by [`CatalogItemId`]. + /// Sort item updates by parsing statements to identify any id-based dependencies within + /// this set of updates and then performing a topological sort. fn sort_item_updates( item_updates: Vec<(mz_catalog::durable::Item, Timestamp, StateDiff)>, ) -> VecDeque<(mz_catalog::durable::Item, Timestamp, StateDiff)> { - item_updates + let mut items_with_dependencies = item_updates .into_iter() - // HACK: due to `ALTER SINK`, sinks can appear before the objects they - // depend upon. Fortunately, because sinks can never have dependencies - // and can never depend upon one another, to fix the topological sort, - // we can just always move sinks to the end. 
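// A minimal sketch (hypothetical user IDs, not taken from any real catalog) of
// why the plain ID-based ordering removed here no longer suffices: after the
// source-to-table migration, a pre-existing view's reference can be rewritten
// to point at a freshly allocated table whose GlobalId is larger than the
// view's own.
//
//     u1: CREATE SOURCE kafka_src ...                       (kept)
//     u2: CREATE VIEW v AS SELECT * FROM [u1 ...]           (rewritten to [u9 ...])
//     u9: CREATE TABLE kafka_src (...) FROM SOURCE [u1 ...] (newly allocated)
//
// Loading in ID order would apply u2 before its dependency u9 exists, so the
// replacement below orders item updates by their parsed dependencies instead.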
- .sorted_by_key(|(item, _ts, _diff)| { - if item.create_sql.starts_with("CREATE SINK") { - CatalogItemId::User(u64::MAX) - } else { - item.id - } + .map(|(item, ts, diff)| { + let parsed = mz_sql::parse::parse(&item.create_sql) + .expect("failed to parse persisted SQL") + .into_element() + .ast; + let deps = mz_sql::names::raw_item_dependency_ids(&parsed); + + (item.global_id, (deps, (item, ts, diff))) }) + .collect::>(); + let mut visited = BTreeSet::new(); + let mut sorted = Vec::new(); + fn dfs( + id: GlobalId, + visited: &mut BTreeSet, + sorted: &mut Vec, + items_with_dependencies: &BTreeMap< + GlobalId, + ( + BTreeSet, + (mz_catalog::durable::Item, Timestamp, StateDiff), + ), + >, + ) { + visited.insert(id); + let deps = items_with_dependencies + .get(&id) + .map(|(deps, _)| deps) + .expect("item should be in the map"); + for dep in deps { + // We only want to visit dependencies that are in the current set of updates. + if !visited.contains(dep) && items_with_dependencies.contains_key(dep) { + dfs(*dep, visited, sorted, items_with_dependencies); + } + } + sorted.push(id); + } + for id in items_with_dependencies.keys() { + if !visited.contains(id) { + dfs(*id, &mut visited, &mut sorted, &items_with_dependencies); + } + } + // return the values from items_with_dependencies in the order of sorted + sorted + .into_iter() + .filter_map(|id| items_with_dependencies.remove(&id)) + .map(|item| item.1) .collect() } + let item_retractions = sort_item_updates(item_retractions); let item_additions = sort_item_updates(item_additions); diff --git a/src/adapter/src/catalog/migrate.rs b/src/adapter/src/catalog/migrate.rs index 2c645ec9e56d3..196e18250a0f5 100644 --- a/src/adapter/src/catalog/migrate.rs +++ b/src/adapter/src/catalog/migrate.rs @@ -785,6 +785,7 @@ fn ast_rewrite_sources_to_tables( // for all sources that were migrated. _ => { if mz_sql::names::modify_dependency_item_ids(&mut statement, &changed_ids) { + info!("migrate: updated dependency reference in statement {statement}"); item.create_sql = statement.to_ast_string_stable(); updated_items.insert(item.global_id, item); } diff --git a/src/adapter/src/coord.rs b/src/adapter/src/coord.rs index 2f26cff656e02..039365243c075 100644 --- a/src/adapter/src/coord.rs +++ b/src/adapter/src/coord.rs @@ -1943,29 +1943,6 @@ impl Coordinator { let mut privatelink_connections = BTreeMap::new(); for entry in &entries { - // TODO(database-issues#7922): we should move this invariant into `CatalogEntry`. - mz_ore::soft_assert_or_log!( - // We only expect user objects to objects obey this invariant. - // System objects, for instance, can depend on other system - // objects that belong to a schema that is simply loaded first. - // To meaningfully resolve this, we could need more careful - // loading order or more complex IDs, neither of which seem very - // beneficial. - // - // HACK: sinks are permitted to depend on items with larger IDs, - // due to `ALTER SINK`. 
- !entry.id().is_user() - || entry.is_sink() - || entry - .uses() - .iter() - .all(|dependency_id| *dependency_id <= entry.id), - "entries should only use to items with lesser `GlobalId`s, but \ - {:?} uses {:?}", - entry.id, - entry.uses() - ); - debug!( "coordinator init: installing {} {}", entry.item().typ(), diff --git a/src/sql/src/names.rs b/src/sql/src/names.rs index 2eb24322e83b1..658cc93bba9e4 100644 --- a/src/sql/src/names.rs +++ b/src/sql/src/names.rs @@ -2387,6 +2387,32 @@ where ResolvedIds::new(visitor.ids) } +#[derive(Debug)] +pub struct RawItemDependencyIds { + pub ids: BTreeSet, +} + +impl<'ast> Visit<'ast, Raw> for RawItemDependencyIds { + fn visit_item_name(&mut self, item_name: &RawItemName) { + if let RawItemName::Id(id, _, _) = item_name { + let parsed_id = id.parse::().unwrap(); + self.ids.insert(parsed_id); + } + } +} + +/// Collect any ID-based dependencies of the provided raw AST node. +pub fn raw_item_dependency_ids<'ast, N>(node: &'ast N) -> BTreeSet +where + N: VisitNode<'ast, Raw>, +{ + let mut deps = RawItemDependencyIds { + ids: BTreeSet::new(), + }; + node.visit(&mut deps); + deps.ids +} + #[derive(Debug)] pub struct ItemDependencyModifier<'a> { pub modified: bool, From 1bcc902daaa891fac4e7c114f6f3ffe3b24e186c Mon Sep 17 00:00:00 2001 From: Joseph Koshakow Date: Thu, 14 Nov 2024 11:35:44 -0500 Subject: [PATCH 25/52] Fix merge skew --- src/adapter/src/catalog/migrate.rs | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/adapter/src/catalog/migrate.rs b/src/adapter/src/catalog/migrate.rs index 196e18250a0f5..85b25f474f01c 100644 --- a/src/adapter/src/catalog/migrate.rs +++ b/src/adapter/src/catalog/migrate.rs @@ -14,13 +14,14 @@ use mz_catalog::durable::{Transaction, Item}; use mz_catalog::memory::objects::{StateUpdate,BootstrapStateUpdateKind}; use mz_ore::collections::CollectionExt; use mz_ore::now::NowFn; -use mz_repr::{CatalogItemId, Timestamp}; +use mz_repr::{CatalogItemId, GlobalId, Timestamp}; use mz_sql::ast::display::AstDisplay; use mz_sql::names::FullItemName; use mz_sql_parser::ast::{Raw, Statement}; use semver::Version; use tracing::info; use uuid::Uuid; + // DO NOT add any more imports from `crate` outside of `crate::catalog`. use crate::catalog::open::into_consolidatable_updates_startup; use crate::catalog::state::LocalExpressionCache; @@ -369,7 +370,7 @@ fn ast_rewrite_sources_to_tables( item.create_sql, table ); item.create_sql = Statement::CreateTableFromSource(table).to_ast_string_stable(); - tx.update_item(item.global_id, item)?; + tx.update_item(item.id, item)?; } // Postgres sources are multi-output sources whose subsources are @@ -420,7 +421,7 @@ fn ast_rewrite_sources_to_tables( progress_subsource, }); item.create_sql = stmt.to_ast_string_stable(); - tx.update_item(item.global_id, item)?; + tx.update_item(item.id, item)?; info!("migrate: converted postgres source {stmt} to remove subsource options"); } } @@ -478,7 +479,7 @@ fn ast_rewrite_sources_to_tables( progress_subsource, }); item.create_sql = stmt.to_ast_string_stable(); - tx.update_item(item.global_id, item)?; + tx.update_item(item.id, item)?; info!("migrate: converted mysql source {stmt} to remove subsource options"); } } @@ -705,25 +706,28 @@ fn ast_rewrite_sources_to_tables( let owner_id = item.owner_id.clone(); let privileges = item.privileges.clone(); + let extra_versions = item.extra_versions.clone(); // Update the source statement in the catalog first, since the name will // otherwise conflict with the new table statement. 
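// Roughly, the steps below perform the following shuffle (names, IDs, and shard
// labels are hypothetical; the real values come from the catalog being migrated):
//
//     before:  u1 "kafka_src"         CREATE SOURCE ...                  shard s1
//     after:   u1 "kafka_src_source"  CREATE SOURCE ...                  fresh shard
//              u9 "kafka_src"         CREATE TABLE .. FROM SOURCE [u1]   shard s1
//
// That is, the new table takes over both the original name and the existing
// persist shard, while the renamed source keeps its GlobalId and continues with
// an empty shard.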
info!("migrate: updated source {} to {source}", item.create_sql); item.create_sql = Statement::CreateSource(source).to_ast_string_stable(); - tx.update_item(item.global_id, item)?; + tx.update_item(item.id, item)?; // Insert the new table statement into the catalog with a new id. let ids = tx.allocate_user_item_ids(1)?; - let new_table_id = ids[0]; + let (new_table_id, new_table_global_id) = ids[0]; info!("migrate: added table {new_table_id}: {table}"); tx.insert_user_item( new_table_id, + new_table_global_id, schema_id, &table_item_name, table.to_ast_string_stable(), owner_id, privileges, &Default::default(), + extra_versions, )?; // We need to move the shard currently attached to the source statement to the // table statement such that the existing data in the shard is preserved and can @@ -737,7 +741,7 @@ fn ast_rewrite_sources_to_tables( .pop() .expect("shard should exist"); tx.insert_collection_metadata(btreemap! { - new_table_id => existing_source_shard, + new_table_global_id => existing_source_shard, source_id => new_source_shard })?; @@ -761,7 +765,7 @@ fn ast_rewrite_sources_to_tables( // We also need to update any other statements that reference the source to use the new // table id/name instead. - changed_ids.insert(source_id, new_table_id); + changed_ids.insert(source_id, new_table_global_id); } // When we upgrade to > rust 1.81 we should use #[expect(unreachable_patterns)] @@ -787,7 +791,7 @@ fn ast_rewrite_sources_to_tables( if mz_sql::names::modify_dependency_item_ids(&mut statement, &changed_ids) { info!("migrate: updated dependency reference in statement {statement}"); item.create_sql = statement.to_ast_string_stable(); - updated_items.insert(item.global_id, item); + updated_items.insert(item.id, item); } } } From 30aa14b5f698db0257046b9bb3fe5d852858b911 Mon Sep 17 00:00:00 2001 From: Joseph Koshakow Date: Thu, 14 Nov 2024 14:17:41 -0500 Subject: [PATCH 26/52] Fix dependency tracking --- src/adapter/src/catalog/apply.rs | 60 ++++++++++++++++++++++++++------ src/sql/src/names.rs | 29 +++++++++------ 2 files changed, 69 insertions(+), 20 deletions(-) diff --git a/src/adapter/src/catalog/apply.rs b/src/adapter/src/catalog/apply.rs index 8157d7098c16a..07e69c12c226b 100644 --- a/src/adapter/src/catalog/apply.rs +++ b/src/adapter/src/catalog/apply.rs @@ -37,7 +37,7 @@ use mz_compute_types::config::ComputeReplicaConfig; use mz_controller::clusters::{ReplicaConfig, ReplicaLogging}; use mz_controller_types::ClusterId; use mz_expr::MirScalarExpr; -use mz_ore::collections::CollectionExt; +use mz_ore::collections::{CollectionExt, HashMap}; use mz_ore::tracing::OpenTelemetryContext; use mz_ore::{instrument, soft_assert_no_log}; use mz_pgrepr::oid::INVALID_OID; @@ -101,7 +101,7 @@ impl CatalogState { local_expression_cache: &mut LocalExpressionCache, ) -> Vec> { let mut builtin_table_updates = Vec::with_capacity(updates.len()); - let updates = sort_updates(updates); + let updates = sort_updates(updates, &self); let mut groups: Vec> = Vec::new(); for (_, updates) in &updates.into_iter().group_by(|update| update.ts) { @@ -143,7 +143,7 @@ impl CatalogState { updates: Vec, ) -> Result>, CatalogError> { let mut builtin_table_updates = Vec::with_capacity(updates.len()); - let updates = sort_updates(updates); + let updates = sort_updates(updates, self); for (_, updates) in &updates.into_iter().group_by(|update| update.ts) { let mut retractions = InProgressRetractions::default(); @@ -1740,12 +1740,12 @@ impl CatalogState { } /// Sort [`StateUpdate`]s in timestamp then dependency order -fn 
sort_updates(mut updates: Vec) -> Vec { +fn sort_updates(mut updates: Vec, state: &CatalogState) -> Vec { let mut sorted_updates = Vec::with_capacity(updates.len()); updates.sort_by_key(|update| update.ts); for (_, updates) in &updates.into_iter().group_by(|update| update.ts) { - let sorted_ts_updates = sort_updates_inner(updates.collect()); + let sorted_ts_updates = sort_updates_inner(updates.collect(), state); sorted_updates.extend(sorted_ts_updates); } @@ -1753,7 +1753,7 @@ fn sort_updates(mut updates: Vec) -> Vec { } /// Sort [`StateUpdate`]s in dependency order for a single timestamp. -fn sort_updates_inner(updates: Vec) -> Vec { +fn sort_updates_inner(updates: Vec, state: &CatalogState) -> Vec { fn push_update( update: T, diff: StateDiff, @@ -1889,7 +1889,16 @@ fn sort_updates_inner(updates: Vec) -> Vec { /// this set of updates and then performing a topological sort. fn sort_item_updates( item_updates: Vec<(mz_catalog::durable::Item, Timestamp, StateDiff)>, + state: &CatalogState, ) -> VecDeque<(mz_catalog::durable::Item, Timestamp, StateDiff)> { + let mut item_id_lookup: HashMap<_, HashMap<_, _>> = HashMap::new(); + for (item, _, _) in &item_updates { + item_id_lookup + .entry(item.schema_id) + .or_insert_with(|| HashMap::new()) + .insert(item.name.clone(), item.global_id); + } + let mut items_with_dependencies = item_updates .into_iter() .map(|(item, ts, diff)| { @@ -1897,9 +1906,40 @@ fn sort_updates_inner(updates: Vec) -> Vec { .expect("failed to parse persisted SQL") .into_element() .ast; - let deps = mz_sql::names::raw_item_dependency_ids(&parsed); + let (mut id_deps, name_deps) = mz_sql::names::raw_item_dependency_ids(&parsed); + + // Convert named deps into ID deps. Ideally this is empty and all dependencies are + // specified by ID. However, this is not the case and require some changes and a + // migration to fix. + for name in name_deps { + let (db, schema, item) = match name.0.len() { + 3 => ( + Some(name.0[0].as_str()), + name.0[1].as_str(), + name.0[2].as_str(), + ), + 2 => (None, name.0[1].as_str(), name.0[2].as_str()), + _ => panic!("Invalid item name: {name:?}"), + }; + let schema = state + .resolve_schema(None, db, schema, &SYSTEM_CONN_ID) + .expect("schema must be loaded before an item"); + // If `name` is not also being applied in this batch then the relative order of + // `item` and `name` doesn't matter, so we can ignore it. + let schema_id = match schema.id { + SchemaSpecifier::Id(id) => id, + SchemaSpecifier::Temporary => { + panic!("temporary item {name:?} persisted as dependency of {item:?}") + } + }; + if let Some(ids) = item_id_lookup.get(&schema_id) { + if let Some(id) = ids.get(item) { + id_deps.insert(*id); + } + } + } - (item.global_id, (deps, (item, ts, diff))) + (item.global_id, (id_deps, (item, ts, diff))) }) .collect::>(); let mut visited = BTreeSet::new(); @@ -1942,8 +1982,8 @@ fn sort_updates_inner(updates: Vec) -> Vec { .collect() } - let item_retractions = sort_item_updates(item_retractions); - let item_additions = sort_item_updates(item_additions); + let item_retractions = sort_item_updates(item_retractions, state); + let item_additions = sort_item_updates(item_additions, state); /// Sort temporary item updates by GlobalId. 
fn sort_temp_item_updates( diff --git a/src/sql/src/names.rs b/src/sql/src/names.rs index 658cc93bba9e4..2221d9257dd05 100644 --- a/src/sql/src/names.rs +++ b/src/sql/src/names.rs @@ -2388,29 +2388,38 @@ where } #[derive(Debug)] -pub struct RawItemDependencyIds { - pub ids: BTreeSet, +struct RawItemDependencyIds<'a> { + ids: BTreeSet, + names: BTreeSet<&'a UnresolvedItemName>, } -impl<'ast> Visit<'ast, Raw> for RawItemDependencyIds { - fn visit_item_name(&mut self, item_name: &RawItemName) { - if let RawItemName::Id(id, _, _) = item_name { - let parsed_id = id.parse::().unwrap(); - self.ids.insert(parsed_id); +impl<'ast> Visit<'ast, Raw> for RawItemDependencyIds<'ast> { + fn visit_item_name(&mut self, item_name: &'ast RawItemName) { + match item_name { + RawItemName::Name(name) => { + self.names.insert(name); + } + RawItemName::Id(id, _, _) => { + let parsed_id = id.parse::().unwrap(); + self.ids.insert(parsed_id); + } } } } -/// Collect any ID-based dependencies of the provided raw AST node. -pub fn raw_item_dependency_ids<'ast, N>(node: &'ast N) -> BTreeSet +/// Collect any dependencies of the provided raw AST node. +pub fn raw_item_dependency_ids<'ast, N>( + node: &'ast N, +) -> (BTreeSet, BTreeSet<&'ast UnresolvedItemName>) where N: VisitNode<'ast, Raw>, { let mut deps = RawItemDependencyIds { ids: BTreeSet::new(), + names: BTreeSet::new(), }; node.visit(&mut deps); - deps.ids + (deps.ids, deps.names) } #[derive(Debug)] From cc0a868f6658827f15330e7e0005efc865e597d5 Mon Sep 17 00:00:00 2001 From: Joseph Koshakow Date: Thu, 14 Nov 2024 14:18:24 -0500 Subject: [PATCH 27/52] Fix lint --- src/adapter/src/catalog/open.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/adapter/src/catalog/open.rs b/src/adapter/src/catalog/open.rs index 9f563aa3189ae..61131977252af 100644 --- a/src/adapter/src/catalog/open.rs +++ b/src/adapter/src/catalog/open.rs @@ -536,7 +536,9 @@ impl Catalog { diff: diff.try_into().expect("valid diff"), }) .collect(); - let builtin_table_update = state.apply_updates_for_bootstrap(post_item_updates, &mut local_expr_cache).await; + let builtin_table_update = state + .apply_updates_for_bootstrap(post_item_updates, &mut local_expr_cache) + .await; builtin_table_updates.extend(builtin_table_update); // We don't need to apply the audit logs in memory, yet apply can be expensive when the From 635e9883b08ebdecf5070101238fd98af2604bdb Mon Sep 17 00:00:00 2001 From: Joseph Koshakow Date: Thu, 14 Nov 2024 14:54:23 -0500 Subject: [PATCH 28/52] Fix some issues --- src/adapter/src/catalog/apply.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/adapter/src/catalog/apply.rs b/src/adapter/src/catalog/apply.rs index 07e69c12c226b..a8ba5800d7b19 100644 --- a/src/adapter/src/catalog/apply.rs +++ b/src/adapter/src/catalog/apply.rs @@ -101,7 +101,7 @@ impl CatalogState { local_expression_cache: &mut LocalExpressionCache, ) -> Vec> { let mut builtin_table_updates = Vec::with_capacity(updates.len()); - let updates = sort_updates(updates, &self); + let updates = sort_updates(updates, self); let mut groups: Vec> = Vec::new(); for (_, updates) in &updates.into_iter().group_by(|update| update.ts) { @@ -1895,7 +1895,7 @@ fn sort_updates_inner(updates: Vec, state: &CatalogState) -> Vec, state: &CatalogState) -> Vec (None, name.0[1].as_str(), name.0[2].as_str()), + 2 => (None, name.0[0].as_str(), name.0[1].as_str()), _ => panic!("Invalid item name: {name:?}"), }; let schema = state From 7e0428659266daafb9b45c5107043c4f3cee8d68 Mon Sep 17 00:00:00 
2001 From: Joseph Koshakow Date: Thu, 14 Nov 2024 16:50:25 -0500 Subject: [PATCH 29/52] Fix dependency tracking --- src/adapter/src/catalog/apply.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/adapter/src/catalog/apply.rs b/src/adapter/src/catalog/apply.rs index a8ba5800d7b19..e33c734eecfb5 100644 --- a/src/adapter/src/catalog/apply.rs +++ b/src/adapter/src/catalog/apply.rs @@ -1919,7 +1919,8 @@ fn sort_updates_inner(updates: Vec, state: &CatalogState) -> Vec (None, name.0[0].as_str(), name.0[1].as_str()), - _ => panic!("Invalid item name: {name:?}"), + // This must be a CTE. + _ => continue, }; let schema = state .resolve_schema(None, db, schema, &SYSTEM_CONN_ID) From 21b4d51234079f419af663a04f229e189f8da7b3 Mon Sep 17 00:00:00 2001 From: Joseph Koshakow Date: Fri, 15 Nov 2024 10:01:41 -0500 Subject: [PATCH 30/52] Fix merge skew --- src/adapter/src/catalog/open.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/adapter/src/catalog/open.rs b/src/adapter/src/catalog/open.rs index 61131977252af..926cebb6bd709 100644 --- a/src/adapter/src/catalog/open.rs +++ b/src/adapter/src/catalog/open.rs @@ -379,7 +379,6 @@ impl Catalog { let mut post_item_updates = Vec::new(); let mut audit_log_updates = Vec::new(); for (kind, ts, diff) in updates { - let diff = diff.try_into().expect("valid diff"); match kind { BootstrapStateUpdateKind::Role(_) | BootstrapStateUpdateKind::Database(_) @@ -393,7 +392,7 @@ impl Catalog { pre_item_updates.push(StateUpdate { kind: kind.into(), ts, - diff, + diff: diff.try_into().expect("valid diff"), }) } BootstrapStateUpdateKind::IntrospectionSourceIndex(_) @@ -401,13 +400,13 @@ impl Catalog { system_item_updates.push(StateUpdate { kind: kind.into(), ts, - diff, + diff: diff.try_into().expect("valid diff"), }) } BootstrapStateUpdateKind::Item(_) => item_updates.push(StateUpdate { kind: kind.into(), ts, - diff, + diff: diff.try_into().expect("valid diff"), }), BootstrapStateUpdateKind::Comment(_) | BootstrapStateUpdateKind::StorageCollectionMetadata(_) @@ -419,7 +418,7 @@ impl Catalog { audit_log_updates.push(StateUpdate { kind: kind.into(), ts, - diff, + diff: diff.try_into().expect("valid diff"), }); } } From a6c67e83d6371032970e60f33b403ac6cd41f7d7 Mon Sep 17 00:00:00 2001 From: Joseph Koshakow Date: Fri, 15 Nov 2024 12:11:40 -0500 Subject: [PATCH 31/52] Update test versions --- misc/python/materialize/source_table_migration.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/misc/python/materialize/source_table_migration.py b/misc/python/materialize/source_table_migration.py index 423443da73fb2..0f56806d2d9ba 100644 --- a/misc/python/materialize/source_table_migration.py +++ b/misc/python/materialize/source_table_migration.py @@ -59,12 +59,12 @@ def _verify_source( def check_source_table_migration_test_sensible() -> None: assert MzVersion.parse_cargo() < MzVersion.parse_mz( - "v0.130.0" + "v0.133.0" ), "migration test probably no longer needed" def get_old_image_for_source_table_migration_test() -> str: - return "materialize/materialized:v0.122.0" + return "materialize/materialized:v0.125.0" def get_new_image_for_source_table_migration_test() -> str | None: From 3144e921bb978ac5ad18dc7c9b8401ae6cbec3db Mon Sep 17 00:00:00 2001 From: Joseph Koshakow Date: Thu, 12 Dec 2024 16:05:38 -0500 Subject: [PATCH 32/52] More merge skew fixes --- .../materialize/checks/all_checks/source_tables.py | 1 + src/adapter-types/src/dyncfgs.rs | 1 - src/adapter/src/catalog/apply.rs | 2 +- 
src/adapter/src/catalog/migrate.rs | 11 +++++------ 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/misc/python/materialize/checks/all_checks/source_tables.py b/misc/python/materialize/checks/all_checks/source_tables.py index c36bb1eb8af12..279ede6a4022e 100644 --- a/misc/python/materialize/checks/all_checks/source_tables.py +++ b/misc/python/materialize/checks/all_checks/source_tables.py @@ -28,6 +28,7 @@ class TableFromPgSource(TableFromSourceBase): suffix = "tbl_from_pg_source" def initialize(self) -> Testdrive: + return Testdrive( self.generic_setup() + dedent( diff --git a/src/adapter-types/src/dyncfgs.rs b/src/adapter-types/src/dyncfgs.rs index 0ab56c9b41144..795a342f284a4 100644 --- a/src/adapter-types/src/dyncfgs.rs +++ b/src/adapter-types/src/dyncfgs.rs @@ -128,5 +128,4 @@ pub fn all_dyncfgs(configs: ConfigSet) -> ConfigSet { .add(&DEFAULT_SINK_PARTITION_STRATEGY) .add(&ENABLE_CONTINUAL_TASK_BUILTINS) .add(&ENABLE_EXPRESSION_CACHE) - .add(&ENABLE_SOURCE_TABLE_MIGRATION) } diff --git a/src/adapter/src/catalog/apply.rs b/src/adapter/src/catalog/apply.rs index e33c734eecfb5..f8b2e39b9c487 100644 --- a/src/adapter/src/catalog/apply.rs +++ b/src/adapter/src/catalog/apply.rs @@ -25,7 +25,7 @@ use mz_catalog::builtin::{ use mz_catalog::durable::objects::{ ClusterKey, DatabaseKey, DurableType, ItemKey, NetworkPolicyKey, RoleKey, SchemaKey, }; -use mz_catalog::durable::{CatalogError, SystemObjectMapping}; +use mz_catalog::durable::{CatalogError, SystemObjectMapping}; use mz_catalog::memory::error::{Error, ErrorKind}; use mz_catalog::memory::objects::{ CatalogEntry, CatalogItem, Cluster, ClusterReplica, DataSourceDesc, Database, Func, Index, Log, diff --git a/src/adapter/src/catalog/migrate.rs b/src/adapter/src/catalog/migrate.rs index 85b25f474f01c..bfadeaa3cd2bb 100644 --- a/src/adapter/src/catalog/migrate.rs +++ b/src/adapter/src/catalog/migrate.rs @@ -31,7 +31,7 @@ fn rewrite_ast_items(tx: &mut Transaction<'_>, mut f: F) -> Result<(), anyhow where F: for<'a> FnMut( &'a mut Transaction<'_>, - GlobalId, + CatalogItemId, &'a mut Statement, ) -> Result<(), anyhow::Error>, { @@ -39,8 +39,8 @@ where for mut item in tx.get_items() { let mut stmt = mz_sql::parse::parse(&item.create_sql)?.into_element().ast; - // TODO(alter_table): Switch this to CatalogItemId. - f(tx, item.global_id, &mut stmt).await?; + f(tx, item.id, &mut stmt)?; + item.create_sql = stmt.to_ast_string_stable(); updated_items.insert(item.id, item); @@ -110,8 +110,7 @@ pub(crate) async fn migrate( ast_rewrite_sources_to_tables(tx, now)?; } - rewrite_ast_items(tx, |tx, item, stmt, all_items_and_statements| { - let now = now.clone(); + rewrite_ast_items(tx, |_tx, _id, _stmt| { // Add per-item AST migrations below. // // Each migration should be a function that takes `stmt` (the AST @@ -121,7 +120,7 @@ pub(crate) async fn migrate( // Migration functions may also take `tx` as input to stage // arbitrary changes to the catalog. - Ok(()) + Ok(()) })?; // Load items into catalog. 
We make sure to consolidate the old updates with the new updates to From 01619dc65d5324dbe1ae6acfde9fe2b6fd85b2a7 Mon Sep 17 00:00:00 2001 From: Joseph Koshakow Date: Mon, 16 Dec 2024 14:06:13 -0500 Subject: [PATCH 33/52] Update item sorting to only sort within item groups --- src/adapter/src/catalog/apply.rs | 209 ++++++++++++++--------------- src/catalog/src/durable/objects.rs | 60 +++++---- src/sql/src/names.rs | 35 ----- 3 files changed, 139 insertions(+), 165 deletions(-) diff --git a/src/adapter/src/catalog/apply.rs b/src/adapter/src/catalog/apply.rs index f8b2e39b9c487..7be37cb11650a 100644 --- a/src/adapter/src/catalog/apply.rs +++ b/src/adapter/src/catalog/apply.rs @@ -37,7 +37,6 @@ use mz_compute_types::config::ComputeReplicaConfig; use mz_controller::clusters::{ReplicaConfig, ReplicaLogging}; use mz_controller_types::ClusterId; use mz_expr::MirScalarExpr; -use mz_ore::collections::{CollectionExt, HashMap}; use mz_ore::tracing::OpenTelemetryContext; use mz_ore::{instrument, soft_assert_no_log}; use mz_pgrepr::oid::INVALID_OID; @@ -101,7 +100,7 @@ impl CatalogState { local_expression_cache: &mut LocalExpressionCache, ) -> Vec> { let mut builtin_table_updates = Vec::with_capacity(updates.len()); - let updates = sort_updates(updates, self); + let updates = sort_updates(updates); let mut groups: Vec> = Vec::new(); for (_, updates) in &updates.into_iter().group_by(|update| update.ts) { @@ -143,7 +142,7 @@ impl CatalogState { updates: Vec, ) -> Result>, CatalogError> { let mut builtin_table_updates = Vec::with_capacity(updates.len()); - let updates = sort_updates(updates, self); + let updates = sort_updates(updates); for (_, updates) in &updates.into_iter().group_by(|update| update.ts) { let mut retractions = InProgressRetractions::default(); @@ -1740,12 +1739,12 @@ impl CatalogState { } /// Sort [`StateUpdate`]s in timestamp then dependency order -fn sort_updates(mut updates: Vec, state: &CatalogState) -> Vec { +fn sort_updates(mut updates: Vec) -> Vec { let mut sorted_updates = Vec::with_capacity(updates.len()); updates.sort_by_key(|update| update.ts); for (_, updates) in &updates.into_iter().group_by(|update| update.ts) { - let sorted_ts_updates = sort_updates_inner(updates.collect(), state); + let sorted_ts_updates = sort_updates_inner(updates.collect()); sorted_updates.extend(sorted_ts_updates); } @@ -1753,7 +1752,7 @@ fn sort_updates(mut updates: Vec, state: &CatalogState) -> Vec, state: &CatalogState) -> Vec { +fn sort_updates_inner(updates: Vec) -> Vec { fn push_update( update: T, diff: StateDiff, @@ -1889,117 +1888,117 @@ fn sort_updates_inner(updates: Vec, state: &CatalogState) -> Vec, - state: &CatalogState, ) -> VecDeque<(mz_catalog::durable::Item, Timestamp, StateDiff)> { - let mut item_id_lookup: HashMap<_, HashMap<_, _>> = HashMap::new(); - for (item, _, _) in &item_updates { - item_id_lookup - .entry(item.schema_id) - .or_insert_with(HashMap::new) - .insert(item.name.clone(), item.global_id); - } - - let mut items_with_dependencies = item_updates - .into_iter() - .map(|(item, ts, diff)| { - let parsed = mz_sql::parse::parse(&item.create_sql) - .expect("failed to parse persisted SQL") - .into_element() - .ast; - let (mut id_deps, name_deps) = mz_sql::names::raw_item_dependency_ids(&parsed); - - // Convert named deps into ID deps. Ideally this is empty and all dependencies are - // specified by ID. However, this is not the case and require some changes and a - // migration to fix. 
- for name in name_deps { - let (db, schema, item) = match name.0.len() { - 3 => ( - Some(name.0[0].as_str()), - name.0[1].as_str(), - name.0[2].as_str(), - ), - 2 => (None, name.0[0].as_str(), name.0[1].as_str()), - // This must be a CTE. - _ => continue, - }; - let schema = state - .resolve_schema(None, db, schema, &SYSTEM_CONN_ID) - .expect("schema must be loaded before an item"); - // If `name` is not also being applied in this batch then the relative order of - // `item` and `name` doesn't matter, so we can ignore it. - let schema_id = match schema.id { - SchemaSpecifier::Id(id) => id, - SchemaSpecifier::Temporary => { - panic!("temporary item {name:?} persisted as dependency of {item:?}") - } - }; - if let Some(ids) = item_id_lookup.get(&schema_id) { - if let Some(id) = ids.get(item) { - id_deps.insert(*id); - } - } - } - - (item.global_id, (id_deps, (item, ts, diff))) - }) - .collect::>(); - let mut visited = BTreeSet::new(); - let mut sorted = Vec::new(); - fn dfs( - id: GlobalId, - visited: &mut BTreeSet, - sorted: &mut Vec, - items_with_dependencies: &BTreeMap< - GlobalId, - ( - BTreeSet, - (mz_catalog::durable::Item, Timestamp, StateDiff), - ), - >, - ) { - visited.insert(id); - let deps = items_with_dependencies - .get(&id) - .map(|(deps, _)| deps) - .expect("item should be in the map"); - for dep in deps { - // We only want to visit dependencies that are in the current set of updates. - if !visited.contains(dep) && items_with_dependencies.contains_key(dep) { - dfs(*dep, visited, sorted, items_with_dependencies); - } + // Partition items into groups s.t. each item in one group has a predefined order with all + // items in other groups. For example, all sinks are ordered greater than all tables. + let mut types = Vec::new(); + let mut secrets = Vec::new(); + let mut connections = Vec::new(); + let mut sources = Vec::new(); + let mut tables = Vec::new(); + let mut derived_items = Vec::new(); + let mut sinks = Vec::new(); + let mut continual_tasks = Vec::new(); + + for update in item_updates { + match update.0.item_type() { + CatalogItemType::Type => types.push(update), + CatalogItemType::Secret => secrets.push(update), + CatalogItemType::Connection => connections.push(update), + CatalogItemType::Source => sources.push(update), + CatalogItemType::Table => tables.push(update), + CatalogItemType::View + | CatalogItemType::MaterializedView + | CatalogItemType::Index + | CatalogItemType::Func => derived_items.push(update), + CatalogItemType::Sink => sinks.push(update), + CatalogItemType::ContinualTask => continual_tasks.push(update), } - sorted.push(id); } - for id in items_with_dependencies.keys() { - if !visited.contains(id) { - dfs(*id, &mut visited, &mut sorted, &items_with_dependencies); - } + + // Within each group, sort by ID. 
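// (Within-group ID order is presumably still safe because the migration only
// redirects dependencies across these groups, e.g. a view now pointing at a
// newly allocated table, while same-type dependencies, such as views built on
// other views, continue to respect creation order and therefore ID order.)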
+ for group in [ + &mut types, + &mut secrets, + &mut connections, + &mut sources, + &mut tables, + &mut derived_items, + &mut sinks, + &mut continual_tasks, + ] { + group.sort_by_key(|(item, _, _)| item.id); } - // return the values from items_with_dependencies in the order of sorted - sorted - .into_iter() - .filter_map(|id| items_with_dependencies.remove(&id)) - .map(|item| item.1) + + iter::empty() + .chain(types) + .chain(secrets) + .chain(connections) + .chain(sources) + .chain(tables) + .chain(derived_items) + .chain(sinks) + .chain(continual_tasks) .collect() } - let item_retractions = sort_item_updates(item_retractions, state); - let item_additions = sort_item_updates(item_additions, state); + let item_retractions = sort_item_updates(item_retractions); + let item_additions = sort_item_updates(item_additions); /// Sort temporary item updates by GlobalId. fn sort_temp_item_updates( temp_item_updates: Vec<(TemporaryItem, Timestamp, StateDiff)>, ) -> VecDeque<(TemporaryItem, Timestamp, StateDiff)> { - temp_item_updates - .into_iter() - // HACK: due to `ALTER SINK`, sinks can appear before the objects they - // depend upon. Fortunately, because sinks can never have dependencies - // and can never depend upon one another, to fix the topological sort, - // we can just always move sinks to the end. - .sorted_by_key(|(item, _ts, _diff)| match item.item.typ() { - CatalogItemType::Sink => CatalogItemId::User(u64::MAX), - _ => item.id, - }) + // Partition items into groups s.t. each item in one group has a predefined order with all + // items in other groups. For example, all sinks are ordered greater than all tables. + let mut types = Vec::new(); + let mut secrets = Vec::new(); + let mut connections = Vec::new(); + let mut sources = Vec::new(); + let mut tables = Vec::new(); + let mut derived_items = Vec::new(); + let mut sinks = Vec::new(); + let mut continual_tasks = Vec::new(); + + for update in temp_item_updates { + match update.0.item.typ() { + CatalogItemType::Type => types.push(update), + CatalogItemType::Secret => secrets.push(update), + CatalogItemType::Connection => connections.push(update), + CatalogItemType::Source => sources.push(update), + CatalogItemType::Table => tables.push(update), + CatalogItemType::View + | CatalogItemType::MaterializedView + | CatalogItemType::Index + | CatalogItemType::Func => derived_items.push(update), + CatalogItemType::Sink => sinks.push(update), + CatalogItemType::ContinualTask => continual_tasks.push(update), + } + } + + // Within each group, sort by ID. 
+ for group in [ + &mut types, + &mut secrets, + &mut connections, + &mut sources, + &mut tables, + &mut derived_items, + &mut sinks, + &mut continual_tasks, + ] { + group.sort_by_key(|(item, _, _)| item.id); + } + + iter::empty() + .chain(types) + .chain(secrets) + .chain(connections) + .chain(sources) + .chain(tables) + .chain(derived_items) + .chain(sinks) + .chain(continual_tasks) .collect() } let temp_item_retractions = sort_temp_item_updates(temp_item_retractions); diff --git a/src/catalog/src/durable/objects.rs b/src/catalog/src/durable/objects.rs index e6bb869ce84fb..1d0002461d61f 100644 --- a/src/catalog/src/durable/objects.rs +++ b/src/catalog/src/durable/objects.rs @@ -509,6 +509,12 @@ pub struct Item { pub extra_versions: BTreeMap, } +impl Item { + pub fn item_type(&self) -> CatalogItemType { + item_type(&self.create_sql) + } +} + impl DurableType for Item { type Key = ItemKey; type Value = ItemValue; @@ -1289,32 +1295,36 @@ pub struct ItemValue { } impl ItemValue { - pub(crate) fn item_type(&self) -> CatalogItemType { - // NOTE(benesch): the implementation of this method is hideous, but is - // there a better alternative? Storing the object type alongside the - // `create_sql` would introduce the possibility of skew. - let mut tokens = self.create_sql.split_whitespace(); - assert_eq!(tokens.next(), Some("CREATE")); - match tokens.next() { - Some("TABLE") => CatalogItemType::Table, - Some("SOURCE") | Some("SUBSOURCE") => CatalogItemType::Source, - Some("SINK") => CatalogItemType::Sink, - Some("VIEW") => CatalogItemType::View, - Some("MATERIALIZED") => { - assert_eq!(tokens.next(), Some("VIEW")); - CatalogItemType::MaterializedView - } - Some("CONTINUAL") => { - assert_eq!(tokens.next(), Some("TASK")); - CatalogItemType::ContinualTask - } - Some("INDEX") => CatalogItemType::Index, - Some("TYPE") => CatalogItemType::Type, - Some("FUNCTION") => CatalogItemType::Func, - Some("SECRET") => CatalogItemType::Secret, - Some("CONNECTION") => CatalogItemType::Connection, - _ => panic!("unexpected create sql: {}", self.create_sql), + pub fn item_type(&self) -> CatalogItemType { + item_type(&self.create_sql) + } +} + +fn item_type(create_sql: &str) -> CatalogItemType { + // NOTE(benesch): the implementation of this method is hideous, but is + // there a better alternative? Storing the object type alongside the + // `create_sql` would introduce the possibility of skew. 
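// For illustration, the prefix matching below classifies statements such as the
// following (hypothetical SQL; only the leading keywords are inspected):
//
//     item_type("CREATE MATERIALIZED VIEW v AS SELECT 1")   // => MaterializedView
//     item_type("CREATE SUBSOURCE s (a int)")               // => Source
//     item_type("CREATE CONTINUAL TASK ct ...")             // => ContinualTask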
+ let mut tokens = create_sql.split_whitespace(); + assert_eq!(tokens.next(), Some("CREATE")); + match tokens.next() { + Some("TABLE") => CatalogItemType::Table, + Some("SOURCE") | Some("SUBSOURCE") => CatalogItemType::Source, + Some("SINK") => CatalogItemType::Sink, + Some("VIEW") => CatalogItemType::View, + Some("MATERIALIZED") => { + assert_eq!(tokens.next(), Some("VIEW")); + CatalogItemType::MaterializedView + } + Some("CONTINUAL") => { + assert_eq!(tokens.next(), Some("TASK")); + CatalogItemType::ContinualTask } + Some("INDEX") => CatalogItemType::Index, + Some("TYPE") => CatalogItemType::Type, + Some("FUNCTION") => CatalogItemType::Func, + Some("SECRET") => CatalogItemType::Secret, + Some("CONNECTION") => CatalogItemType::Connection, + _ => panic!("unexpected create sql: {}", create_sql), } } diff --git a/src/sql/src/names.rs b/src/sql/src/names.rs index 2221d9257dd05..2eb24322e83b1 100644 --- a/src/sql/src/names.rs +++ b/src/sql/src/names.rs @@ -2387,41 +2387,6 @@ where ResolvedIds::new(visitor.ids) } -#[derive(Debug)] -struct RawItemDependencyIds<'a> { - ids: BTreeSet, - names: BTreeSet<&'a UnresolvedItemName>, -} - -impl<'ast> Visit<'ast, Raw> for RawItemDependencyIds<'ast> { - fn visit_item_name(&mut self, item_name: &'ast RawItemName) { - match item_name { - RawItemName::Name(name) => { - self.names.insert(name); - } - RawItemName::Id(id, _, _) => { - let parsed_id = id.parse::().unwrap(); - self.ids.insert(parsed_id); - } - } - } -} - -/// Collect any dependencies of the provided raw AST node. -pub fn raw_item_dependency_ids<'ast, N>( - node: &'ast N, -) -> (BTreeSet, BTreeSet<&'ast UnresolvedItemName>) -where - N: VisitNode<'ast, Raw>, -{ - let mut deps = RawItemDependencyIds { - ids: BTreeSet::new(), - names: BTreeSet::new(), - }; - node.visit(&mut deps); - (deps.ids, deps.names) -} - #[derive(Debug)] pub struct ItemDependencyModifier<'a> { pub modified: bool, From ec3842d02fd07a5d21c74eb02740dc301211a1c9 Mon Sep 17 00:00:00 2001 From: Joseph Koshakow Date: Tue, 17 Dec 2024 12:12:01 -0500 Subject: [PATCH 34/52] Experiment for migrate --- src/adapter/src/catalog/open.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/adapter/src/catalog/open.rs b/src/adapter/src/catalog/open.rs index 926cebb6bd709..a8493292c1528 100644 --- a/src/adapter/src/catalog/open.rs +++ b/src/adapter/src/catalog/open.rs @@ -516,6 +516,11 @@ impl Catalog { // Include any post-item-updates generated by migrations, and then consolidate // them to ensure diffs are all positive. 
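// A minimal sketch of why the timestamps are normalized below before calling
// `consolidate_updates`, assuming plain `(data, ts, diff)` triples: consolidation
// only cancels entries that agree on both the data and the timestamp.
//
//     let mut updates = vec![("item", 1u64, 1i64), ("item", 2u64, -1i64)];
//     differential_dataflow::consolidation::consolidate_updates(&mut updates);
//     assert_eq!(updates.len(), 2);    // different timestamps: nothing cancels
//     let max_ts = updates.iter().map(|(_, ts, _)| *ts).max().unwrap();
//     updates.iter_mut().for_each(|(_, ts, _)| *ts = max_ts);
//     differential_dataflow::consolidation::consolidate_updates(&mut updates);
//     assert!(updates.is_empty());     // same timestamp: the pair cancels out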
post_item_updates.extend(migrate_result.post_item_updates); + if let Some(max_ts) = post_item_updates.iter().map(|(_, ts, _)| ts).max().cloned() { + for (_, ts, _) in &mut post_item_updates { + *ts = max_ts; + } + } differential_dataflow::consolidation::consolidate_updates(&mut post_item_updates); } From 622e63ece42251e3212fc42250a59cee01232411 Mon Sep 17 00:00:00 2001 From: Joseph Koshakow Date: Wed, 18 Dec 2024 11:14:41 -0500 Subject: [PATCH 35/52] Fix migration idempotency --- src/adapter/src/catalog/migrate.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/adapter/src/catalog/migrate.rs b/src/adapter/src/catalog/migrate.rs index bfadeaa3cd2bb..f03611ddc90ef 100644 --- a/src/adapter/src/catalog/migrate.rs +++ b/src/adapter/src/catalog/migrate.rs @@ -529,7 +529,7 @@ fn ast_rewrite_sources_to_tables( let tables_for_source = items_with_statements_copied .iter() - .any(|(item, statement)| match statement { + .any(|(_, statement)| match statement { Statement::CreateTableFromSource(stmt) => { let source: GlobalId = match &stmt.source { RawItemName::Name(_) => { From 5a1135a5685f7d33dcc38629add495fa7c33a6dd Mon Sep 17 00:00:00 2001 From: Joseph Koshakow Date: Wed, 18 Dec 2024 11:24:38 -0500 Subject: [PATCH 36/52] Fixup --- src/adapter/src/catalog/apply.rs | 16 ++++++++++++---- src/adapter/src/catalog/open.rs | 1 + 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/adapter/src/catalog/apply.rs b/src/adapter/src/catalog/apply.rs index 7be37cb11650a..cd6f3fea66a74 100644 --- a/src/adapter/src/catalog/apply.rs +++ b/src/adapter/src/catalog/apply.rs @@ -1892,6 +1892,8 @@ fn sort_updates_inner(updates: Vec) -> Vec { // Partition items into groups s.t. each item in one group has a predefined order with all // items in other groups. For example, all sinks are ordered greater than all tables. let mut types = Vec::new(); + // N.B. Functions can depend on system tables, but not user tables. + let mut funcs = Vec::new(); let mut secrets = Vec::new(); let mut connections = Vec::new(); let mut sources = Vec::new(); @@ -1903,14 +1905,14 @@ fn sort_updates_inner(updates: Vec) -> Vec { for update in item_updates { match update.0.item_type() { CatalogItemType::Type => types.push(update), + CatalogItemType::Func => funcs.push(update), CatalogItemType::Secret => secrets.push(update), CatalogItemType::Connection => connections.push(update), CatalogItemType::Source => sources.push(update), CatalogItemType::Table => tables.push(update), CatalogItemType::View | CatalogItemType::MaterializedView - | CatalogItemType::Index - | CatalogItemType::Func => derived_items.push(update), + | CatalogItemType::Index => derived_items.push(update), CatalogItemType::Sink => sinks.push(update), CatalogItemType::ContinualTask => continual_tasks.push(update), } @@ -1919,6 +1921,7 @@ fn sort_updates_inner(updates: Vec) -> Vec { // Within each group, sort by ID. for group in [ &mut types, + &mut funcs, &mut secrets, &mut connections, &mut sources, @@ -1932,6 +1935,7 @@ fn sort_updates_inner(updates: Vec) -> Vec { iter::empty() .chain(types) + .chain(funcs) .chain(secrets) .chain(connections) .chain(sources) @@ -1952,6 +1956,8 @@ fn sort_updates_inner(updates: Vec) -> Vec { // Partition items into groups s.t. each item in one group has a predefined order with all // items in other groups. For example, all sinks are ordered greater than all tables. let mut types = Vec::new(); + // N.B. Functions can depend on system tables, but not user tables. 
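// (Presumably this is why user-defined functions can be ordered immediately
// after types and ahead of all user sources and tables: any table a function
// references is a system object, which is loaded before user items entirely.)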
+ let mut funcs = Vec::new(); let mut secrets = Vec::new(); let mut connections = Vec::new(); let mut sources = Vec::new(); @@ -1963,14 +1969,14 @@ fn sort_updates_inner(updates: Vec) -> Vec { for update in temp_item_updates { match update.0.item.typ() { CatalogItemType::Type => types.push(update), + CatalogItemType::Func => funcs.push(update), CatalogItemType::Secret => secrets.push(update), CatalogItemType::Connection => connections.push(update), CatalogItemType::Source => sources.push(update), CatalogItemType::Table => tables.push(update), CatalogItemType::View | CatalogItemType::MaterializedView - | CatalogItemType::Index - | CatalogItemType::Func => derived_items.push(update), + | CatalogItemType::Index => derived_items.push(update), CatalogItemType::Sink => sinks.push(update), CatalogItemType::ContinualTask => continual_tasks.push(update), } @@ -1979,6 +1985,7 @@ fn sort_updates_inner(updates: Vec) -> Vec { // Within each group, sort by ID. for group in [ &mut types, + &mut funcs, &mut secrets, &mut connections, &mut sources, @@ -1992,6 +1999,7 @@ fn sort_updates_inner(updates: Vec) -> Vec { iter::empty() .chain(types) + .chain(funcs) .chain(secrets) .chain(connections) .chain(sources) diff --git a/src/adapter/src/catalog/open.rs b/src/adapter/src/catalog/open.rs index a8493292c1528..7a2e0c08c3a1e 100644 --- a/src/adapter/src/catalog/open.rs +++ b/src/adapter/src/catalog/open.rs @@ -516,6 +516,7 @@ impl Catalog { // Include any post-item-updates generated by migrations, and then consolidate // them to ensure diffs are all positive. post_item_updates.extend(migrate_result.post_item_updates); + // Push everything to the same timestamp so it consolidates cleanly. if let Some(max_ts) = post_item_updates.iter().map(|(_, ts, _)| ts).max().cloned() { for (_, ts, _) in &mut post_item_updates { *ts = max_ts; From a79183a2f535eaaecadf01873e6df2d879a8cb7b Mon Sep 17 00:00:00 2001 From: Joseph Koshakow Date: Thu, 2 Jan 2025 13:09:23 -0500 Subject: [PATCH 37/52] resolve merge conflicts --- src/adapter/src/catalog/migrate.rs | 35 +++++------------------------- 1 file changed, 6 insertions(+), 29 deletions(-) diff --git a/src/adapter/src/catalog/migrate.rs b/src/adapter/src/catalog/migrate.rs index f03611ddc90ef..4c3e2668b9b4f 100644 --- a/src/adapter/src/catalog/migrate.rs +++ b/src/adapter/src/catalog/migrate.rs @@ -10,8 +10,8 @@ use std::collections::BTreeMap; use mz_catalog::builtin::BuiltinTable; -use mz_catalog::durable::{Transaction, Item}; -use mz_catalog::memory::objects::{StateUpdate,BootstrapStateUpdateKind}; +use mz_catalog::durable::Transaction; +use mz_catalog::memory::objects::{BootstrapStateUpdateKind, StateUpdate}; use mz_ore::collections::CollectionExt; use mz_ore::now::NowFn; use mz_repr::{CatalogItemId, GlobalId, Timestamp}; @@ -237,8 +237,8 @@ fn ast_rewrite_sources_to_tables( use mz_persist_types::ShardId; use mz_proto::RustType; use mz_sql::ast::{ - CreateSourceConnection, CreateSourceOptionName, CreateSourceStatement, - CreateSubsourceOptionName, CreateSubsourceStatement, CreateTableFromSourceStatement, Ident, + CreateSourceConnection, CreateSourceStatement, CreateSubsourceOptionName, + CreateSubsourceStatement, CreateTableFromSourceStatement, Ident, KafkaSourceConfigOptionName, LoadGenerator, MySqlConfigOptionName, PgConfigOptionName, RawItemName, TableFromSourceColumns, TableFromSourceOption, TableFromSourceOptionName, UnresolvedItemName, Value, WithOptionValue, @@ -515,7 +515,7 @@ fn ast_rewrite_sources_to_tables( include_metadata, format, envelope, - mut with_options, + 
with_options, if_not_exists, in_cluster, progress_subsource, @@ -607,36 +607,13 @@ fn ast_rewrite_sources_to_tables( } _ => unreachable!("match determined above"), }; - let mut table_with_options = vec![TableFromSourceOption { + let table_with_options = vec![TableFromSourceOption { name: TableFromSourceOptionName::Details, value: Some(WithOptionValue::Value(Value::String(hex::encode( details.into_proto().encode_to_vec(), )))), }]; - // Move over the IgnoreKeys option if it exists. - if let Some(i) = with_options - .iter() - .position(|opt| opt.name == CreateSourceOptionName::IgnoreKeys) - { - let option = with_options.remove(i); - table_with_options.push(TableFromSourceOption { - name: TableFromSourceOptionName::IgnoreKeys, - value: option.value, - }); - }; - // Move over the Timeline option if it exists. - if let Some(i) = with_options - .iter() - .position(|opt| opt.name == CreateSourceOptionName::Timeline) - { - let option = with_options.remove(i); - table_with_options.push(TableFromSourceOption { - name: TableFromSourceOptionName::Timeline, - value: option.value, - }); - } - // Generate the same external-reference that would have been generated // during purification for single-output sources. let external_reference = match &conn { From a81d461abb9e7adea6aaa6fcc2e84bde4bb03528 Mon Sep 17 00:00:00 2001 From: Joseph Koshakow Date: Tue, 14 Jan 2025 09:53:59 -0500 Subject: [PATCH 38/52] Update versions --- misc/python/materialize/mzcompose/__init__.py | 2 +- misc/python/materialize/source_table_migration.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/misc/python/materialize/mzcompose/__init__.py b/misc/python/materialize/mzcompose/__init__.py index c54d577571f08..bb4fe0440da38 100644 --- a/misc/python/materialize/mzcompose/__init__.py +++ b/misc/python/materialize/mzcompose/__init__.py @@ -91,7 +91,7 @@ def get_default_system_parameters( "enable_0dt_deployment": "true" if zero_downtime else "false", "enable_0dt_deployment_panic_after_timeout": "true", "enable_0dt_deployment_sources": ( - "true" if version >= MzVersion.parse_mz("v0.125.0-dev") else "false" + "true" if version >= MzVersion.parse_mz("v0.130.0-dev") else "false" ), "enable_alter_swap": "true", "enable_columnation_lgalloc": "true", diff --git a/misc/python/materialize/source_table_migration.py b/misc/python/materialize/source_table_migration.py index 0f56806d2d9ba..45add7e76fd73 100644 --- a/misc/python/materialize/source_table_migration.py +++ b/misc/python/materialize/source_table_migration.py @@ -59,12 +59,12 @@ def _verify_source( def check_source_table_migration_test_sensible() -> None: assert MzVersion.parse_cargo() < MzVersion.parse_mz( - "v0.133.0" + "v0.137.0" ), "migration test probably no longer needed" def get_old_image_for_source_table_migration_test() -> str: - return "materialize/materialized:v0.125.0" + return "materialize/materialized:v0.129.0" def get_new_image_for_source_table_migration_test() -> str | None: From 67aec327d44d8d4a93d31daee1216ed7db4dadab Mon Sep 17 00:00:00 2001 From: Joseph Koshakow Date: Thu, 16 Jan 2025 13:35:57 -0500 Subject: [PATCH 39/52] Fix more merge skew --- Cargo.lock | 3 +++ misc/python/materialize/mzcompose/__init__.py | 2 +- .../materialize/source_table_migration.py | 4 +-- src/adapter/BUILD.bazel | 6 +++++ src/adapter/Cargo.toml | 3 +++ src/adapter/src/catalog/apply.rs | 5 ++-- src/adapter/src/catalog/migrate.rs | 27 ++++++++++--------- src/sql/src/names.rs | 6 ++--- test/mysql-cdc-old-syntax/mzcompose.py | 6 ++--- test/pg-cdc-old-syntax/mzcompose.py | 
6 ++--- .../mzcompose.py | 6 ++--- 11 files changed, 43 insertions(+), 31 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1e3c27b91b8f0..cb41f3cca09d9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4583,6 +4583,7 @@ dependencies = [ "fail", "futures", "governor", + "hex", "http 1.1.0", "ipnet", "itertools 0.12.1", @@ -4611,6 +4612,7 @@ dependencies = [ "mz-pgrepr", "mz-pgwire-common", "mz-postgres-util", + "mz-proto", "mz-repr", "mz-rocksdb-types", "mz-secrets", @@ -4626,6 +4628,7 @@ dependencies = [ "mz-transform", "opentelemetry", "prometheus", + "prost", "qcell", "rand 0.8.5", "rand_chacha 0.3.0", diff --git a/misc/python/materialize/mzcompose/__init__.py b/misc/python/materialize/mzcompose/__init__.py index bb4fe0440da38..bc66525fcdd7c 100644 --- a/misc/python/materialize/mzcompose/__init__.py +++ b/misc/python/materialize/mzcompose/__init__.py @@ -91,7 +91,7 @@ def get_default_system_parameters( "enable_0dt_deployment": "true" if zero_downtime else "false", "enable_0dt_deployment_panic_after_timeout": "true", "enable_0dt_deployment_sources": ( - "true" if version >= MzVersion.parse_mz("v0.130.0-dev") else "false" + "true" if version >= MzVersion.parse_mz("v0.132.0-dev") else "false" ), "enable_alter_swap": "true", "enable_columnation_lgalloc": "true", diff --git a/misc/python/materialize/source_table_migration.py b/misc/python/materialize/source_table_migration.py index 45add7e76fd73..e717dc0aedb2b 100644 --- a/misc/python/materialize/source_table_migration.py +++ b/misc/python/materialize/source_table_migration.py @@ -59,12 +59,12 @@ def _verify_source( def check_source_table_migration_test_sensible() -> None: assert MzVersion.parse_cargo() < MzVersion.parse_mz( - "v0.137.0" + "v0.139.0" ), "migration test probably no longer needed" def get_old_image_for_source_table_migration_test() -> str: - return "materialize/materialized:v0.129.0" + return "materialize/materialized:v0.131.0" def get_new_image_for_source_table_migration_test() -> str | None: diff --git a/src/adapter/BUILD.bazel b/src/adapter/BUILD.bazel index 65920e5ef9811..f833c66a2fd83 100644 --- a/src/adapter/BUILD.bazel +++ b/src/adapter/BUILD.bazel @@ -53,6 +53,7 @@ rust_library( "//src/pgrepr:mz_pgrepr", "//src/pgwire-common:mz_pgwire_common", "//src/postgres-util:mz_postgres_util", + "//src/proto:mz_proto", "//src/repr:mz_repr", "//src/rocksdb-types:mz_rocksdb_types", "//src/secrets:mz_secrets", @@ -119,6 +120,7 @@ rust_test( "//src/pgrepr:mz_pgrepr", "//src/pgwire-common:mz_pgwire_common", "//src/postgres-util:mz_postgres_util", + "//src/proto:mz_proto", "//src/repr:mz_repr", "//src/rocksdb-types:mz_rocksdb_types", "//src/secrets:mz_secrets", @@ -165,6 +167,7 @@ rust_doc_test( "//src/pgrepr:mz_pgrepr", "//src/pgwire-common:mz_pgwire_common", "//src/postgres-util:mz_postgres_util", + "//src/proto:mz_proto", "//src/repr:mz_repr", "//src/rocksdb-types:mz_rocksdb_types", "//src/secrets:mz_secrets", @@ -231,6 +234,7 @@ rust_test( "//src/pgrepr:mz_pgrepr", "//src/pgwire-common:mz_pgwire_common", "//src/postgres-util:mz_postgres_util", + "//src/proto:mz_proto", "//src/repr:mz_repr", "//src/rocksdb-types:mz_rocksdb_types", "//src/secrets:mz_secrets", @@ -297,6 +301,7 @@ rust_test( "//src/pgrepr:mz_pgrepr", "//src/pgwire-common:mz_pgwire_common", "//src/postgres-util:mz_postgres_util", + "//src/proto:mz_proto", "//src/repr:mz_repr", "//src/rocksdb-types:mz_rocksdb_types", "//src/secrets:mz_secrets", @@ -363,6 +368,7 @@ rust_test( "//src/pgrepr:mz_pgrepr", "//src/pgwire-common:mz_pgwire_common", 
"//src/postgres-util:mz_postgres_util", + "//src/proto:mz_proto", "//src/repr:mz_repr", "//src/rocksdb-types:mz_rocksdb_types", "//src/secrets:mz_secrets", diff --git a/src/adapter/Cargo.toml b/src/adapter/Cargo.toml index a4298b2fe1703..d84638fa3f525 100644 --- a/src/adapter/Cargo.toml +++ b/src/adapter/Cargo.toml @@ -23,6 +23,7 @@ enum-kinds = "0.5.1" fail = { version = "0.5.1", features = ["failpoints"] } futures = "0.3.25" governor = "0.6.0" +hex = "0.4.3" http = "1.1.0" ipnet = "2.5.0" itertools = "0.12.1" @@ -53,6 +54,7 @@ mz-pgcopy = { path = "../pgcopy" } mz-pgrepr = { path = "../pgrepr" } mz-pgwire-common = { path = "../pgwire-common" } mz-postgres-util = { path = "../postgres-util" } +mz-proto = { path = "../proto" } mz-repr = { path = "../repr", features = ["tracing_"] } mz-rocksdb-types = { path = "../rocksdb-types" } mz-secrets = { path = "../secrets" } @@ -68,6 +70,7 @@ mz-transform = { path = "../transform" } mz-timestamp-oracle = { path = "../timestamp-oracle" } opentelemetry = { version = "0.24.0", features = ["trace"] } prometheus = { version = "0.13.3", default-features = false } +prost = { version = "0.13.2", features = ["no-recursion-limit"] } qcell = "0.5" rand = "0.8.5" rand_chacha = "0.3" diff --git a/src/adapter/src/catalog/apply.rs b/src/adapter/src/catalog/apply.rs index cd6f3fea66a74..5add00b0caf40 100644 --- a/src/adapter/src/catalog/apply.rs +++ b/src/adapter/src/catalog/apply.rs @@ -1884,8 +1884,7 @@ fn sort_updates_inner(updates: Vec) -> Vec { } } - /// Sort item updates by parsing statements to identify any id-based dependencies within - /// this set of updates and then performing a topological sort. + /// Sort item updates by dependency. fn sort_item_updates( item_updates: Vec<(mz_catalog::durable::Item, Timestamp, StateDiff)>, ) -> VecDeque<(mz_catalog::durable::Item, Timestamp, StateDiff)> { @@ -1949,7 +1948,7 @@ fn sort_updates_inner(updates: Vec) -> Vec { let item_retractions = sort_item_updates(item_retractions); let item_additions = sort_item_updates(item_additions); - /// Sort temporary item updates by GlobalId. + /// Sort temporary item updates by dependency. fn sort_temp_item_updates( temp_item_updates: Vec<(TemporaryItem, Timestamp, StateDiff)>, ) -> VecDeque<(TemporaryItem, Timestamp, StateDiff)> { diff --git a/src/adapter/src/catalog/migrate.rs b/src/adapter/src/catalog/migrate.rs index 4c3e2668b9b4f..a4da5729d29f8 100644 --- a/src/adapter/src/catalog/migrate.rs +++ b/src/adapter/src/catalog/migrate.rs @@ -14,7 +14,7 @@ use mz_catalog::durable::Transaction; use mz_catalog::memory::objects::{BootstrapStateUpdateKind, StateUpdate}; use mz_ore::collections::CollectionExt; use mz_ore::now::NowFn; -use mz_repr::{CatalogItemId, GlobalId, Timestamp}; +use mz_repr::{CatalogItemId, Timestamp}; use mz_sql::ast::display::AstDisplay; use mz_sql::names::FullItemName; use mz_sql_parser::ast::{Raw, Statement}; @@ -267,7 +267,7 @@ fn ast_rewrite_sources_to_tables( acc }); - // Any GlobalId that should be changed to a new GlobalId in any statements that + // Any CatalogItemId that should be changed to a new CatalogItemId in any statements that // reference it. This is necessary for ensuring downstream statements (e.g. // mat views, indexes) that reference a single-output source (e.g. kafka) // will now reference the corresponding new table, with the same data, instead. 
@@ -276,7 +276,7 @@ fn ast_rewrite_sources_to_tables( for (mut item, stmt) in items_with_statements { match stmt { // Migrate each `CREATE SUBSOURCE` statement to an equivalent - // `CREATE TABLE .. FROM SOURCE` statement. + // `CREATE TABLE ... FROM SOURCE` statement. Statement::CreateSubsource(CreateSubsourceStatement { name, columns, @@ -294,7 +294,7 @@ fn ast_rewrite_sources_to_tables( let source = match raw_source_name { // Some legacy subsources have named-only references to their `of_source` // so we ensure we always use an ID-based reference in the stored - // `CREATE TABLE .. FROM SOURCE` statements. + // `CREATE TABLE ... FROM SOURCE` statements. RawItemName::Name(name) => { // Convert the name reference to an ID reference. let (source_item, _) = items_with_statements_copied @@ -304,7 +304,7 @@ fn ast_rewrite_sources_to_tables( _ => false, }) .expect("source must exist"); - RawItemName::Id(source_item.global_id.to_string(), name, None) + RawItemName::Id(source_item.id.to_string(), name, None) } RawItemName::Id(..) => raw_source_name, }; @@ -531,7 +531,7 @@ fn ast_rewrite_sources_to_tables( .iter() .any(|(_, statement)| match statement { Statement::CreateTableFromSource(stmt) => { - let source: GlobalId = match &stmt.source { + let source: CatalogItemId = match &stmt.source { RawItemName::Name(_) => { unreachable!("tables store source as ID") } @@ -539,7 +539,7 @@ fn ast_rewrite_sources_to_tables( source_id.parse().expect("valid id") } }; - source == item.global_id + source == item.id } _ => false, }); @@ -586,7 +586,7 @@ fn ast_rewrite_sources_to_tables( // A reference to the source that will be included in the table statement let source_ref = - RawItemName::Id(item.global_id.to_string(), new_source_name.clone(), None); + RawItemName::Id(item.id.to_string(), new_source_name.clone(), None); let columns = if col_names.is_empty() { TableFromSourceColumns::NotSpecified @@ -676,7 +676,8 @@ fn ast_rewrite_sources_to_tables( progress_subsource, }; - let source_id = item.global_id; + let source_id = item.id; + let source_global_id = item.global_id; let schema_id = item.schema_id.clone(); let schema = tx.get_schema(&item.schema_id).expect("schema must exist"); @@ -712,13 +713,13 @@ fn ast_rewrite_sources_to_tables( // external tools such as DBT and Terraform. We will insert a new shard for the source // statement which will be automatically created after the migration is complete. let new_source_shard = ShardId::new(); - let (source_id, existing_source_shard) = tx - .delete_collection_metadata(btreeset! {source_id}) + let (source_global_id, existing_source_shard) = tx + .delete_collection_metadata(btreeset! {source_global_id}) .pop() .expect("shard should exist"); tx.insert_collection_metadata(btreemap! { new_table_global_id => existing_source_shard, - source_id => new_source_shard + source_global_id => new_source_shard })?; add_to_audit_log( @@ -741,7 +742,7 @@ fn ast_rewrite_sources_to_tables( // We also need to update any other statements that reference the source to use the new // table id/name instead. 
- changed_ids.insert(source_id, new_table_global_id); + changed_ids.insert(source_id, new_table_id); } // When we upgrade to > rust 1.81 we should use #[expect(unreachable_patterns)] diff --git a/src/sql/src/names.rs b/src/sql/src/names.rs index 2eb24322e83b1..1401f83872040 100644 --- a/src/sql/src/names.rs +++ b/src/sql/src/names.rs @@ -2390,13 +2390,13 @@ where #[derive(Debug)] pub struct ItemDependencyModifier<'a> { pub modified: bool, - pub id_map: &'a BTreeMap, + pub id_map: &'a BTreeMap, } impl<'ast, 'a> VisitMut<'ast, Raw> for ItemDependencyModifier<'a> { fn visit_item_name_mut(&mut self, item_name: &mut RawItemName) { if let RawItemName::Id(id, _, _) = item_name { - let parsed_id = id.parse::().unwrap(); + let parsed_id = id.parse::().unwrap(); if let Some(new_id) = self.id_map.get(&parsed_id) { *id = new_id.to_string(); self.modified = true; @@ -2411,7 +2411,7 @@ impl<'ast, 'a> VisitMut<'ast, Raw> for ItemDependencyModifier<'a> { /// ids refer to an item of the same name, whose id has changed). pub fn modify_dependency_item_ids<'ast, N>( node: &'ast mut N, - id_map: &BTreeMap, + id_map: &BTreeMap, ) -> bool where N: VisitMutNode<'ast, Raw>, diff --git a/test/mysql-cdc-old-syntax/mzcompose.py b/test/mysql-cdc-old-syntax/mzcompose.py index 308cd4a9c2856..e472f1162fb22 100644 --- a/test/mysql-cdc-old-syntax/mzcompose.py +++ b/test/mysql-cdc-old-syntax/mzcompose.py @@ -56,7 +56,7 @@ def create_mysql_replica(mysql_version: str) -> MySql: SERVICES = [ Materialized( - external_minio=True, + external_blob_store=True, additional_system_parameter_defaults={ "log_filter": "mz_storage::source::mysql=trace,info" }, @@ -318,7 +318,7 @@ def workflow_migration(c: Composition, parser: WorkflowArgumentParser) -> None: name="materialized", image=get_old_image_for_source_table_migration_test(), external_metadata_store=True, - external_minio=True, + external_blob_store=True, additional_system_parameter_defaults={ "log_filter": "mz_storage::source::mysql=trace,info" }, @@ -328,7 +328,7 @@ def workflow_migration(c: Composition, parser: WorkflowArgumentParser) -> None: name="materialized", image=get_new_image_for_source_table_migration_test(), external_metadata_store=True, - external_minio=True, + external_blob_store=True, additional_system_parameter_defaults={ "log_filter": "mz_storage::source::mysql=trace,info", "force_source_table_syntax": "true", diff --git a/test/pg-cdc-old-syntax/mzcompose.py b/test/pg-cdc-old-syntax/mzcompose.py index 51ab5ed98d0e9..9f5ad1f7f7e9b 100644 --- a/test/pg-cdc-old-syntax/mzcompose.py +++ b/test/pg-cdc-old-syntax/mzcompose.py @@ -98,7 +98,7 @@ def create_postgres( additional_system_parameter_defaults={ "log_filter": "mz_storage::source::postgres=trace,debug,info,warn,error" }, - external_minio=True, + external_blob_store=True, ), Testdrive(), CockroachOrPostgresMetadata(), @@ -402,7 +402,7 @@ def workflow_migration(c: Composition, parser: WorkflowArgumentParser) -> None: image=get_old_image_for_source_table_migration_test(), volumes_extra=["secrets:/share/secrets"], external_metadata_store=True, - external_minio=True, + external_blob_store=True, additional_system_parameter_defaults={ "log_filter": "mz_storage::source::postgres=trace,debug,info,warn,error" }, @@ -413,7 +413,7 @@ def workflow_migration(c: Composition, parser: WorkflowArgumentParser) -> None: image=get_new_image_for_source_table_migration_test(), volumes_extra=["secrets:/share/secrets"], external_metadata_store=True, - external_minio=True, + external_blob_store=True, additional_system_parameter_defaults={ 
"log_filter": "mz_storage::source::postgres=trace,debug,info,warn,error", "force_source_table_syntax": "true", diff --git a/test/testdrive-old-kafka-src-syntax/mzcompose.py b/test/testdrive-old-kafka-src-syntax/mzcompose.py index 075ddd07ea6e5..c9a11b1462a51 100644 --- a/test/testdrive-old-kafka-src-syntax/mzcompose.py +++ b/test/testdrive-old-kafka-src-syntax/mzcompose.py @@ -331,7 +331,7 @@ def workflow_migration(c: Composition, parser: WorkflowArgumentParser) -> None: # validate_catalog_store=True, default_timeout=args.default_timeout, volumes_extra=["mzdata:/mzdata"], - external_minio=True, + external_blob_store=True, fivetran_destination=True, fivetran_destination_files_path="/share/tmp", entrypoint_extra=[f"--var=uses-redpanda={args.redpanda}"], @@ -356,14 +356,14 @@ def workflow_migration(c: Composition, parser: WorkflowArgumentParser) -> None: default_size=Materialized.Size.DEFAULT_SIZE, image=get_old_image_for_source_table_migration_test(), external_metadata_store=True, - external_minio=True, + external_blob_store=True, additional_system_parameter_defaults=additional_system_parameter_defaults, ) mz_new = Materialized( default_size=Materialized.Size.DEFAULT_SIZE, image=get_new_image_for_source_table_migration_test(), external_metadata_store=True, - external_minio=True, + external_blob_store=True, additional_system_parameter_defaults=additional_system_parameter_defaults, ) From a3f06d29deddcaa4f39affeadb2353fcd3849741 Mon Sep 17 00:00:00 2001 From: Joseph Koshakow Date: Tue, 28 Jan 2025 11:17:39 -0500 Subject: [PATCH 40/52] Remove prints --- misc/python/materialize/source_table_migration.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/misc/python/materialize/source_table_migration.py b/misc/python/materialize/source_table_migration.py index e717dc0aedb2b..474665977698f 100644 --- a/misc/python/materialize/source_table_migration.py +++ b/misc/python/materialize/source_table_migration.py @@ -36,11 +36,9 @@ def _verify_source( # must not crash statement = f"SELECT count(*) FROM {source_name};" - print(statement) c.sql_query(statement) statement = f"SHOW CREATE SOURCE {source_name};" - print(statement) result = c.sql_query(statement) sql = result[0][1] assert "FOR TABLE" not in sql, f"FOR TABLE found in: {sql}" From 7650c3997231a82e6b0bac2a473e8a9c9c2f3415 Mon Sep 17 00:00:00 2001 From: Joseph Koshakow Date: Tue, 28 Jan 2025 11:53:26 -0500 Subject: [PATCH 41/52] fixup --- src/adapter/src/catalog/apply.rs | 17 +++++++++++++++++ src/adapter/src/catalog/migrate.rs | 4 +--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/src/adapter/src/catalog/apply.rs b/src/adapter/src/catalog/apply.rs index 5add00b0caf40..521afcedb6525 100644 --- a/src/adapter/src/catalog/apply.rs +++ b/src/adapter/src/catalog/apply.rs @@ -1885,6 +1885,20 @@ fn sort_updates_inner(updates: Vec) -> Vec { } /// Sort item updates by dependency. + /// + /// First we group items into groups that are totally ordered by dependency. For example, when + /// sorting all items by dependency we know that all tables can come after all sources, because + /// a source can never depend on a table. Within these groups, the ID order matches the + /// dependency order. + /// + /// It used to be the case that the ID order of ALL items matched the dependency order. However, + /// certain migrations shuffled item IDs around s.t. this was no longer true. A much better + /// approach would be to investigate each item, discover their exact dependencies, and then + /// perform a topological sort. 
This is non-trivial because we only have the CREATE SQL of each + /// item here. Within the SQL the dependent items are sometimes referred to by ID and sometimes + /// referred to by name. + /// + /// The logic of this function should match [`sort_temp_item_updates`]. fn sort_item_updates( item_updates: Vec<(mz_catalog::durable::Item, Timestamp, StateDiff)>, ) -> VecDeque<(mz_catalog::durable::Item, Timestamp, StateDiff)> { @@ -1892,6 +1906,7 @@ fn sort_updates_inner(updates: Vec) -> Vec { // items in other groups. For example, all sinks are ordered greater than all tables. let mut types = Vec::new(); // N.B. Functions can depend on system tables, but not user tables. + // TODO(udf): This will change when UDFs are supported. let mut funcs = Vec::new(); let mut secrets = Vec::new(); let mut connections = Vec::new(); @@ -1949,6 +1964,8 @@ fn sort_updates_inner(updates: Vec) -> Vec { let item_additions = sort_item_updates(item_additions); /// Sort temporary item updates by dependency. + /// + /// The logic of this function should match [`sort_item_updates`]. fn sort_temp_item_updates( temp_item_updates: Vec<(TemporaryItem, Timestamp, StateDiff)>, ) -> VecDeque<(TemporaryItem, Timestamp, StateDiff)> { diff --git a/src/adapter/src/catalog/migrate.rs b/src/adapter/src/catalog/migrate.rs index a4da5729d29f8..e40cdc370531c 100644 --- a/src/adapter/src/catalog/migrate.rs +++ b/src/adapter/src/catalog/migrate.rs @@ -745,9 +745,7 @@ fn ast_rewrite_sources_to_tables( changed_ids.insert(source_id, new_table_id); } - // When we upgrade to > rust 1.81 we should use #[expect(unreachable_patterns)] - // to enforce that we have covered all CreateSourceStatement variants. - #[allow(unreachable_patterns)] + #[expect(unreachable_patterns)] Statement::CreateSource(_) => {} _ => (), } From e6c9316e67894c4e71d3efe5d75b397e85f83a9a Mon Sep 17 00:00:00 2001 From: Joseph Koshakow Date: Tue, 28 Jan 2025 13:43:32 -0500 Subject: [PATCH 42/52] Fixup --- src/adapter/src/catalog/migrate.rs | 44 ++++++++++++++++-------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/src/adapter/src/catalog/migrate.rs b/src/adapter/src/catalog/migrate.rs index e94951f3d8edf..c98bfa4d12efe 100644 --- a/src/adapter/src/catalog/migrate.rs +++ b/src/adapter/src/catalog/migrate.rs @@ -19,7 +19,7 @@ use mz_persist_types::ShardId; use mz_repr::{CatalogItemId, Timestamp}; use mz_sql::ast::display::AstDisplay; use mz_sql::names::FullItemName; -use mz_sql_parser::ast::{Raw, Statement}; +use mz_sql_parser::ast::{IdentError, Raw, Statement}; use mz_storage_client::controller::StorageTxn; use semver::Version; use tracing::info; @@ -560,32 +560,36 @@ fn ast_rewrite_sources_to_tables( // queries that reference the source name, since they will now need to query the // table instead. + assert_eq!( + item.name, + name.0 + .last() + .expect("at least one ident") + .to_ast_string_stable() + ); // First find an unused name within the same schema to avoid conflicts. 
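(Not part of the patch — a rough sketch of the renaming step that the hunk below introduces, assuming `Ident::try_generate_name` behaves like the hand-rolled loop it replaces: it keeps trying the base name plus the `_source` suffix and a numeric disambiguator until the supplied predicate reports the candidate as unused. The `taken` set and the helper name `pick_source_name` are illustrative assumptions.)

    use std::collections::BTreeSet;

    use mz_sql_parser::ast::{Ident, IdentError};

    fn pick_source_name(item_name: &str, taken: &BTreeSet<String>) -> Result<Ident, IdentError> {
        // A candidate is acceptable only if no sibling item in the schema already uses it.
        let is_valid =
            |candidate: &Ident| Ok::<_, IdentError>(!taken.contains(&candidate.to_string()));
        // E.g. "orders" -> "orders_source", or a numbered variant if that name is taken too.
        Ident::try_generate_name(item_name.to_string(), "_source", is_valid)
    }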
- let mut new_source_item_name = format!("{}_source", item.name); - let mut new_source_name_inner = - format!("{}_source", name.0.last().expect("at least one ident")); - let mut i = 0; - while item_names_per_schema - .get(&item.schema_id) - .expect("schema must exist") - .contains(&new_source_item_name) - { - new_source_item_name = format!("{}_source_{}", item.name, i); - new_source_name_inner = format!( - "{}_source_{}", - name.0.last().expect("at least one ident"), - i - ); - i += 1; - } + let is_valid = |new_source_ident: &Ident| { + if item_names_per_schema + .get(&item.schema_id) + .expect("schema must exist") + .contains(&new_source_ident.to_ast_string_stable()) + { + Ok::<_, IdentError>(false) + } else { + Ok(true) + } + }; + let new_source_ident = + Ident::try_generate_name(item.name.clone(), "_source", is_valid)?; + // We will use the original item name for the new table item. let table_item_name = item.name.clone(); // Update the source item/statement to use the new name. let mut new_source_name = name.clone(); *new_source_name.0.last_mut().expect("at least one ident") = - Ident::new_unchecked(new_source_name_inner); - item.name = new_source_item_name; + new_source_ident.clone(); + item.name = new_source_ident.to_ast_string_stable(); // A reference to the source that will be included in the table statement let source_ref = From 7acae30b14933c80d8033e9cc7c29acae4bbe5be Mon Sep 17 00:00:00 2001 From: Joseph Koshakow Date: Tue, 28 Jan 2025 14:01:58 -0500 Subject: [PATCH 43/52] Fix migration test --- test/testdrive-old-kafka-src-syntax/mzcompose.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/testdrive-old-kafka-src-syntax/mzcompose.py b/test/testdrive-old-kafka-src-syntax/mzcompose.py index c9a11b1462a51..38169c32a89df 100644 --- a/test/testdrive-old-kafka-src-syntax/mzcompose.py +++ b/test/testdrive-old-kafka-src-syntax/mzcompose.py @@ -328,7 +328,6 @@ def workflow_migration(c: Composition, parser: WorkflowArgumentParser) -> None: forward_buildkite_shard=True, kafka_default_partitions=args.kafka_default_partitions, aws_region=args.aws_region, - # validate_catalog_store=True, default_timeout=args.default_timeout, volumes_extra=["mzdata:/mzdata"], external_blob_store=True, @@ -350,8 +349,6 @@ def workflow_migration(c: Composition, parser: WorkflowArgumentParser) -> None: additional_system_parameter_defaults[key] = val - additional_system_parameter_defaults["force_source_table_syntax"] = "true" - mz_old = Materialized( default_size=Materialized.Size.DEFAULT_SIZE, image=get_old_image_for_source_table_migration_test(), @@ -359,6 +356,9 @@ def workflow_migration(c: Composition, parser: WorkflowArgumentParser) -> None: external_blob_store=True, additional_system_parameter_defaults=additional_system_parameter_defaults, ) + + additional_system_parameter_defaults["force_source_table_syntax"] = "true" + mz_new = Materialized( default_size=Materialized.Size.DEFAULT_SIZE, image=get_new_image_for_source_table_migration_test(), From 84360a91b14a025f75a0fda6b6abf63682021d46 Mon Sep 17 00:00:00 2001 From: Joseph Koshakow Date: Tue, 28 Jan 2025 14:12:12 -0500 Subject: [PATCH 44/52] Update test/testdrive-old-kafka-src-syntax/mzcompose.py Co-authored-by: Dennis Felsing --- test/testdrive-old-kafka-src-syntax/mzcompose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/testdrive-old-kafka-src-syntax/mzcompose.py b/test/testdrive-old-kafka-src-syntax/mzcompose.py index 38169c32a89df..899a355644a99 100644 --- 
a/test/testdrive-old-kafka-src-syntax/mzcompose.py +++ b/test/testdrive-old-kafka-src-syntax/mzcompose.py @@ -354,7 +354,7 @@ def workflow_migration(c: Composition, parser: WorkflowArgumentParser) -> None: image=get_old_image_for_source_table_migration_test(), external_metadata_store=True, external_blob_store=True, - additional_system_parameter_defaults=additional_system_parameter_defaults, + additional_system_parameter_defaults=dict(additional_system_parameter_defaults), ) additional_system_parameter_defaults["force_source_table_syntax"] = "true" From 583408ad2ff85ee91a83187d1361fcae7b11e667 Mon Sep 17 00:00:00 2001 From: Joseph Koshakow Date: Tue, 28 Jan 2025 14:34:05 -0500 Subject: [PATCH 45/52] Fixup --- src/adapter/src/catalog/migrate.rs | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/adapter/src/catalog/migrate.rs b/src/adapter/src/catalog/migrate.rs index c98bfa4d12efe..33823c4e7f816 100644 --- a/src/adapter/src/catalog/migrate.rs +++ b/src/adapter/src/catalog/migrate.rs @@ -562,17 +562,14 @@ fn ast_rewrite_sources_to_tables( assert_eq!( item.name, - name.0 - .last() - .expect("at least one ident") - .to_ast_string_stable() + name.0.last().expect("at least one ident").to_string() ); // First find an unused name within the same schema to avoid conflicts. let is_valid = |new_source_ident: &Ident| { if item_names_per_schema .get(&item.schema_id) .expect("schema must exist") - .contains(&new_source_ident.to_ast_string_stable()) + .contains(&new_source_ident.to_string()) { Ok::<_, IdentError>(false) } else { @@ -589,7 +586,7 @@ fn ast_rewrite_sources_to_tables( let mut new_source_name = name.clone(); *new_source_name.0.last_mut().expect("at least one ident") = new_source_ident.clone(); - item.name = new_source_ident.to_ast_string_stable(); + item.name = new_source_ident.to_string(); // A reference to the source that will be included in the table statement let source_ref = From 6cdddc995ce4fe29cd64348ddd6de1b1c6da88ab Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Wed, 29 Jan 2025 16:57:20 +0000 Subject: [PATCH 46/52] Fix and work around test failures Based on https://buildkite.com/materialize/nightly/builds/10996 --- ci/nightly/pipeline.template.yml | 3 +-- .../lint-main/checks/check-mzcompose-files.sh | 1 + test/legacy-upgrade/mzcompose.py | 14 ++++++++++++++ test/mysql-cdc-old-syntax/mzcompose.py | 2 +- test/pg-cdc-old-syntax/mzcompose.py | 2 +- .../avro-resolution-no-publish-writer.td | 5 +++-- .../mzcompose.py | 19 +++++-------------- 7 files changed, 26 insertions(+), 20 deletions(-) diff --git a/ci/nightly/pipeline.template.yml b/ci/nightly/pipeline.template.yml index 7498277861dc4..9b8bd6bbc3041 100644 --- a/ci/nightly/pipeline.template.yml +++ b/ci/nightly/pipeline.template.yml @@ -700,10 +700,9 @@ steps: queue: hetzner-aarch64-8cpu-16gb - id: testdrive-kafka-migration - label: "Testdrive %N migration tests" + label: "Testdrive (before Kafka source versioning) migration tests" depends_on: build-aarch64 timeout_in_minutes: 180 - parallelism: 8 plugins: - ./ci/plugins/mzcompose: composition: testdrive-old-kafka-src-syntax diff --git a/ci/test/lint-main/checks/check-mzcompose-files.sh b/ci/test/lint-main/checks/check-mzcompose-files.sh index 0031f92c3dc3d..0ee2e916c074f 100755 --- a/ci/test/lint-main/checks/check-mzcompose-files.sh +++ b/ci/test/lint-main/checks/check-mzcompose-files.sh @@ -45,6 +45,7 @@ check_default_workflow_references_others() { -not -wholename "./test/canary-environment/mzcompose.py" `# Only run manually` \ -not -wholename 
"./test/ssh-connection/mzcompose.py" `# Handled differently` \ -not -wholename "./test/scalability/mzcompose.py" `# Other workflows are for manual usage` \ + -not -wholename "./test/testdrive-old-kafka-src-syntax/mzcompose.py" `# Other workflow is run separately` \ ) for file in "${MZCOMPOSE_TEST_FILES[@]}"; do diff --git a/test/legacy-upgrade/mzcompose.py b/test/legacy-upgrade/mzcompose.py index 1da734a1fc9e9..9b4eee9404ca4 100644 --- a/test/legacy-upgrade/mzcompose.py +++ b/test/legacy-upgrade/mzcompose.py @@ -152,6 +152,19 @@ def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None: zero_downtime=True, force_source_table_syntax=False, ) + if args.lts_upgrade: + # Direct upgrade from latest LTS version without any inbetween versions + version = LTS_VERSIONS[-1] + priors = [v for v in all_versions if v <= version] + test_upgrade_from_version( + c, + f"{version}", + priors, + filter=args.filter, + zero_downtime=False, + force_source_table_syntax=True, + lts_upgrade=True, + ) if parallelism_count == 1 or parallelism_index == 1: test_upgrade_from_version( c, @@ -179,6 +192,7 @@ def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None: priors, filter=args.filter, zero_downtime=False, + force_source_table_syntax=False, lts_upgrade=True, ) diff --git a/test/mysql-cdc-old-syntax/mzcompose.py b/test/mysql-cdc-old-syntax/mzcompose.py index e472f1162fb22..c92ec8e0a1ebc 100644 --- a/test/mysql-cdc-old-syntax/mzcompose.py +++ b/test/mysql-cdc-old-syntax/mzcompose.py @@ -96,7 +96,7 @@ def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None: f"Workflows in shard with index {buildkite.get_parallelism_index()}: {sharded_workflows}" ) for name in sharded_workflows: - if name == "default": + if name in ("default", "migration"): continue with c.test_case(name): diff --git a/test/pg-cdc-old-syntax/mzcompose.py b/test/pg-cdc-old-syntax/mzcompose.py index 9f5ad1f7f7e9b..43a8ea2933ccc 100644 --- a/test/pg-cdc-old-syntax/mzcompose.py +++ b/test/pg-cdc-old-syntax/mzcompose.py @@ -347,7 +347,7 @@ def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None: f"Workflows in shard with index {buildkite.get_parallelism_index()}: {sharded_workflows}" ) for name in sharded_workflows: - if name == "default": + if name in ("default", "migration"): continue # TODO: Flaky, reenable when database-issues#7611 is fixed diff --git a/test/testdrive-old-kafka-src-syntax/avro-resolution-no-publish-writer.td b/test/testdrive-old-kafka-src-syntax/avro-resolution-no-publish-writer.td index e0bb6d626c03c..19ce5236fcf66 100644 --- a/test/testdrive-old-kafka-src-syntax/avro-resolution-no-publish-writer.td +++ b/test/testdrive-old-kafka-src-syntax/avro-resolution-no-publish-writer.td @@ -43,5 +43,6 @@ GRANT CREATE, USAGE ON SCHEMA public TO materialize $ kafka-ingest format=bytes topic=resolution-no-publish-writer timestamp=1 \\x00\x00\x00\x00\x01\xf6\x01 -! SELECT * FROM resolution_no_publish_writer; -contains:to resolve +# TODO: Reenable when https://github.com/MaterializeInc/database-issues/issues/8933 is fixed +# ! 
SELECT * FROM resolution_no_publish_writer; +# contains:to resolve diff --git a/test/testdrive-old-kafka-src-syntax/mzcompose.py b/test/testdrive-old-kafka-src-syntax/mzcompose.py index 899a355644a99..a39c3bb16e35e 100644 --- a/test/testdrive-old-kafka-src-syntax/mzcompose.py +++ b/test/testdrive-old-kafka-src-syntax/mzcompose.py @@ -35,7 +35,6 @@ from materialize.mzcompose.services.zookeeper import Zookeeper from materialize.source_table_migration import ( get_new_image_for_source_table_migration_test, - get_old_image_for_source_table_migration_test, verify_sources_after_source_table_migration, ) @@ -56,18 +55,6 @@ def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None: - for name in c.workflows: - if name == "default": - continue - - if name == "migration": - continue - - with c.test_case(name): - c.workflow(name) - - -def workflow_kafka(c: Composition, parser: WorkflowArgumentParser) -> None: """Run testdrive.""" parser.add_argument( "--redpanda", @@ -351,13 +338,17 @@ def workflow_migration(c: Composition, parser: WorkflowArgumentParser) -> None: mz_old = Materialized( default_size=Materialized.Size.DEFAULT_SIZE, - image=get_old_image_for_source_table_migration_test(), + image=get_new_image_for_source_table_migration_test(), external_metadata_store=True, external_blob_store=True, additional_system_parameter_defaults=dict(additional_system_parameter_defaults), ) + print(additional_system_parameter_defaults) + x = dict(additional_system_parameter_defaults) additional_system_parameter_defaults["force_source_table_syntax"] = "true" + print(additional_system_parameter_defaults) + print(x) mz_new = Materialized( default_size=Materialized.Size.DEFAULT_SIZE, From 853f226993bb36fb0d01302b8f8e5caf67a15ea3 Mon Sep 17 00:00:00 2001 From: Joseph Koshakow Date: Wed, 29 Jan 2025 14:21:22 -0500 Subject: [PATCH 47/52] Try and fix test --- test/mysql-cdc-old-syntax/table-in-mysql-schema.td | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/test/mysql-cdc-old-syntax/table-in-mysql-schema.td b/test/mysql-cdc-old-syntax/table-in-mysql-schema.td index 87490c15e78c6..24f1ec8836941 100644 --- a/test/mysql-cdc-old-syntax/table-in-mysql-schema.td +++ b/test/mysql-cdc-old-syntax/table-in-mysql-schema.td @@ -29,11 +29,13 @@ $ mysql-connect name=mysql url=mysql://root@mysql password=${arg.mysql-root-pass $ mysql-execute name=mysql DROP DATABASE IF EXISTS public; DROP DATABASE IF EXISTS another_schema; +DROP DATABASE IF EXISTS other; +DROP DATABASE IF EXISTS mysql; +CREATE DATABASE mysql; USE mysql; - # Insert data pre-snapshot -CREATE TABLE t_in_mysql (f1 INT); -INSERT INTO t_in_mysql VALUES (1), (2); +CREATE TABLE mysql.t_in_mysql (f1 INT); +INSERT INTO mysql.t_in_mysql VALUES (1), (2); ! 
CREATE SOURCE mz_source FROM MYSQL CONNECTION mysql_conn From 7b93cb3909cc435ad0365193afcca480304cbaa9 Mon Sep 17 00:00:00 2001 From: Joseph Koshakow Date: Wed, 29 Jan 2025 14:35:50 -0500 Subject: [PATCH 48/52] Try and fix test --- test/mysql-cdc-old-syntax/table-in-mysql-schema.td | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/mysql-cdc-old-syntax/table-in-mysql-schema.td b/test/mysql-cdc-old-syntax/table-in-mysql-schema.td index 24f1ec8836941..e7e992a9d4824 100644 --- a/test/mysql-cdc-old-syntax/table-in-mysql-schema.td +++ b/test/mysql-cdc-old-syntax/table-in-mysql-schema.td @@ -30,8 +30,7 @@ $ mysql-execute name=mysql DROP DATABASE IF EXISTS public; DROP DATABASE IF EXISTS another_schema; DROP DATABASE IF EXISTS other; -DROP DATABASE IF EXISTS mysql; -CREATE DATABASE mysql; +CREATE DATABASE IF NOT EXISTS mysql; USE mysql; # Insert data pre-snapshot CREATE TABLE mysql.t_in_mysql (f1 INT); From c033fe10cf2ef168cb90315bc0e78eccdf815ca0 Mon Sep 17 00:00:00 2001 From: Joseph Koshakow Date: Wed, 29 Jan 2025 15:25:49 -0500 Subject: [PATCH 49/52] Fix kafka tests --- .../testdrive-old-kafka-src-syntax/indexes.td | 107 +++++++++++++++++- 1 file changed, 106 insertions(+), 1 deletion(-) diff --git a/test/testdrive-old-kafka-src-syntax/indexes.td b/test/testdrive-old-kafka-src-syntax/indexes.td index afe535df1b2a9..5d2c2aa87ec64 100644 --- a/test/testdrive-old-kafka-src-syntax/indexes.td +++ b/test/testdrive-old-kafka-src-syntax/indexes.td @@ -7,6 +7,9 @@ # the Business Source License, use of this software will be governed # by the Apache License, Version 2.0. +$ set-sql-timeout duration=1s +$ set-max-tries max-tries=5 + $ set-arg-default single-replica-cluster=quickstart $ set-regex match=cluster1|quickstart replacement= @@ -279,7 +282,109 @@ bar_ind bar {a} "" $ postgres-execute connection=postgres://mz_system@${testdrive.materialize-internal-sql-addr}/materialize ALTER SYSTEM SET enable_rbac_checks TO true -> SHOW INDEXES IN CLUSTER mz_catalog_server +>[version<=13100] SHOW INDEXES IN CLUSTER mz_catalog_server +mz_active_peeks_per_worker_s2_primary_idx mz_active_peeks_per_worker mz_catalog_server {id,worker_id} "" +mz_arrangement_batches_raw_s2_primary_idx mz_arrangement_batches_raw mz_catalog_server {operator_id,worker_id} "" +mz_arrangement_records_raw_s2_primary_idx mz_arrangement_records_raw mz_catalog_server {operator_id,worker_id} "" +mz_arrangement_sharing_raw_s2_primary_idx mz_arrangement_sharing_raw mz_catalog_server {operator_id,worker_id} "" +mz_arrangement_heap_capacity_raw_s2_primary_idx mz_arrangement_heap_capacity_raw mz_catalog_server {operator_id,worker_id} "" +mz_arrangement_heap_allocations_raw_s2_primary_idx mz_arrangement_heap_allocations_raw mz_catalog_server {operator_id,worker_id} "" +mz_arrangement_heap_size_raw_s2_primary_idx mz_arrangement_heap_size_raw mz_catalog_server {operator_id,worker_id} "" +mz_arrangement_batcher_allocations_raw_s2_primary_idx mz_arrangement_batcher_allocations_raw mz_catalog_server {operator_id,worker_id} "" +mz_arrangement_batcher_capacity_raw_s2_primary_idx mz_arrangement_batcher_capacity_raw mz_catalog_server {operator_id,worker_id} "" +mz_arrangement_batcher_records_raw_s2_primary_idx mz_arrangement_batcher_records_raw mz_catalog_server {operator_id,worker_id} "" +mz_arrangement_batcher_size_raw_s2_primary_idx mz_arrangement_batcher_size_raw mz_catalog_server {operator_id,worker_id} "" +mz_cluster_deployment_lineage_ind mz_cluster_deployment_lineage mz_catalog_server {cluster_id} "" 
+mz_cluster_replica_history_ind mz_cluster_replica_history mz_catalog_server {dropped_at} "" +mz_cluster_replica_name_history_ind mz_cluster_replica_name_history mz_catalog_server {id} "" +mz_cluster_replica_metrics_ind mz_cluster_replica_metrics mz_catalog_server {replica_id} "" +mz_cluster_replica_metrics_history_ind mz_cluster_replica_metrics_history mz_catalog_server {replica_id} "" +mz_cluster_replica_sizes_ind mz_cluster_replica_sizes mz_catalog_server {size} "" +mz_cluster_replica_statuses_ind mz_cluster_replica_statuses mz_catalog_server {replica_id} "" +mz_cluster_replica_status_history_ind mz_cluster_replica_status_history mz_catalog_server {replica_id} "" +mz_cluster_replicas_ind mz_cluster_replicas mz_catalog_server {id} "" +mz_clusters_ind mz_clusters mz_catalog_server {id} "" +mz_columns_ind mz_columns mz_catalog_server {name} "" +mz_comments_ind mz_comments mz_catalog_server {id} "" +mz_compute_dependencies_ind mz_compute_dependencies mz_catalog_server {dependency_id} "" +mz_compute_dataflow_global_ids_per_worker_s2_primary_idx mz_compute_dataflow_global_ids_per_worker mz_catalog_server {id,worker_id} "" +mz_compute_error_counts_raw_s2_primary_idx mz_compute_error_counts_raw mz_catalog_server {export_id,worker_id} "" +mz_compute_exports_per_worker_s2_primary_idx mz_compute_exports_per_worker mz_catalog_server {export_id,worker_id} "" +mz_compute_frontiers_per_worker_s2_primary_idx mz_compute_frontiers_per_worker mz_catalog_server {export_id,worker_id} "" +mz_compute_hydration_times_per_worker_s2_primary_idx mz_compute_hydration_times_per_worker mz_catalog_server {export_id,worker_id} "" +mz_compute_import_frontiers_per_worker_s2_primary_idx mz_compute_import_frontiers_per_worker mz_catalog_server {export_id,import_id,worker_id} "" +mz_compute_lir_mapping_per_worker_s2_primary_idx mz_compute_lir_mapping_per_worker mz_catalog_server {global_id,lir_id,worker_id} "" +mz_compute_operator_durations_histogram_raw_s2_primary_idx mz_compute_operator_durations_histogram_raw mz_catalog_server {id,worker_id,duration_ns} "" +mz_connections_ind mz_connections mz_catalog_server {schema_id} "" +mz_console_cluster_utilization_overview_ind mz_console_cluster_utilization_overview mz_catalog_server {cluster_id} "" +mz_continual_tasks_ind mz_continual_tasks mz_catalog_server {id} "" +mz_databases_ind mz_databases mz_catalog_server {name} "" +mz_dataflow_addresses_per_worker_s2_primary_idx mz_dataflow_addresses_per_worker mz_catalog_server {id,worker_id} "" +mz_dataflow_channels_per_worker_s2_primary_idx mz_dataflow_channels_per_worker mz_catalog_server {id,worker_id} "" +mz_dataflow_operator_reachability_raw_s2_primary_idx mz_dataflow_operator_reachability_raw mz_catalog_server {address,port,worker_id,update_type,time} "" +mz_dataflow_operators_per_worker_s2_primary_idx mz_dataflow_operators_per_worker mz_catalog_server {id,worker_id} "" +mz_dataflow_shutdown_durations_histogram_raw_s2_primary_idx mz_dataflow_shutdown_durations_histogram_raw mz_catalog_server {worker_id,duration_ns} "" +mz_frontiers_ind mz_frontiers mz_catalog_server {object_id} "" +mz_indexes_ind mz_indexes mz_catalog_server {id} "" +mz_kafka_sources_ind mz_kafka_sources mz_catalog_server {id} "" +mz_materialized_views_ind mz_materialized_views mz_catalog_server {id} "" +mz_message_batch_counts_received_raw_s2_primary_idx mz_message_batch_counts_received_raw mz_catalog_server {channel_id,from_worker_id,to_worker_id} "" +mz_message_batch_counts_sent_raw_s2_primary_idx mz_message_batch_counts_sent_raw mz_catalog_server 
{channel_id,from_worker_id,to_worker_id} "" +mz_message_counts_received_raw_s2_primary_idx mz_message_counts_received_raw mz_catalog_server {channel_id,from_worker_id,to_worker_id} "" +mz_message_counts_sent_raw_s2_primary_idx mz_message_counts_sent_raw mz_catalog_server {channel_id,from_worker_id,to_worker_id} "" +mz_object_dependencies_ind mz_object_dependencies mz_catalog_server {object_id} "" +mz_object_history_ind mz_object_history mz_catalog_server {id} "" +mz_object_lifetimes_ind mz_object_lifetimes mz_catalog_server {id} "" +mz_object_transitive_dependencies_ind mz_object_transitive_dependencies mz_catalog_server {object_id} "" +mz_objects_ind mz_objects mz_catalog_server {schema_id} "" +mz_notices_ind mz_notices mz_catalog_server {id} "" +mz_peek_durations_histogram_raw_s2_primary_idx mz_peek_durations_histogram_raw mz_catalog_server {worker_id,type,duration_ns} "" +mz_recent_activity_log_thinned_ind mz_recent_activity_log_thinned mz_catalog_server {sql_hash} "" +mz_recent_storage_usage_ind mz_recent_storage_usage mz_catalog_server {object_id} "" +mz_roles_ind mz_roles mz_catalog_server {id} "" +mz_scheduling_elapsed_raw_s2_primary_idx mz_scheduling_elapsed_raw mz_catalog_server {id,worker_id} "" +mz_scheduling_parks_histogram_raw_s2_primary_idx mz_scheduling_parks_histogram_raw mz_catalog_server {worker_id,slept_for_ns,requested_ns} "" +mz_schemas_ind mz_schemas mz_catalog_server {database_id} "" +mz_secrets_ind mz_secrets mz_catalog_server {name} "" +mz_show_all_objects_ind mz_show_all_objects mz_catalog_server {schema_id} "" +mz_show_cluster_replicas_ind mz_show_cluster_replicas mz_catalog_server {cluster} "" +mz_show_clusters_ind mz_show_clusters mz_catalog_server {name} "" +mz_show_columns_ind mz_show_columns mz_catalog_server {id} "" +mz_show_connections_ind mz_show_connections mz_catalog_server {schema_id} "" +mz_show_databases_ind mz_show_databases mz_catalog_server {name} "" +mz_show_indexes_ind mz_show_indexes mz_catalog_server {schema_id} "" +mz_show_materialized_views_ind mz_show_materialized_views mz_catalog_server {schema_id} "" +mz_show_roles_ind mz_show_roles mz_catalog_server {name} "" +mz_show_schemas_ind mz_show_schemas mz_catalog_server {database_id} "" +mz_show_secrets_ind mz_show_secrets mz_catalog_server {schema_id} "" +mz_show_sinks_ind mz_show_sinks mz_catalog_server {schema_id} "" +mz_show_sources_ind mz_show_sources mz_catalog_server {schema_id} "" +mz_show_tables_ind mz_show_tables mz_catalog_server {schema_id} "" +mz_show_types_ind mz_show_types mz_catalog_server {schema_id} "" +mz_show_views_ind mz_show_views mz_catalog_server {schema_id} "" +mz_sink_statistics_ind mz_sink_statistics mz_catalog_server {id} "" +mz_sink_status_history_ind mz_sink_status_history mz_catalog_server {sink_id} "" +mz_source_statistics_with_history_ind mz_source_statistics_with_history mz_catalog_server {id} "" +mz_sink_statuses_ind mz_sink_statuses mz_catalog_server {id} "" +mz_sinks_ind mz_sinks mz_catalog_server {id} "" +mz_source_statistics_ind mz_source_statistics mz_catalog_server {id} "" +mz_source_status_history_ind mz_source_status_history mz_catalog_server {source_id} "" +mz_source_statuses_ind mz_source_statuses mz_catalog_server {id} "" +mz_sources_ind mz_sources mz_catalog_server {id} "" +mz_tables_ind mz_tables mz_catalog_server {schema_id} "" +mz_types_ind mz_types mz_catalog_server {schema_id} "" +mz_recent_sql_text_ind mz_recent_sql_text mz_catalog_server {sql_hash} "" +mz_views_ind mz_views mz_catalog_server {schema_id} "" 
+mz_wallclock_global_lag_recent_history_ind mz_wallclock_global_lag_recent_history mz_catalog_server {object_id} "" +mz_webhook_sources_ind mz_webhook_sources mz_catalog_server {id} "" +pg_attrdef_all_databases_ind pg_attrdef_all_databases mz_catalog_server {oid,adrelid,adnum,adbin,adsrc} "" +pg_attribute_all_databases_ind pg_attribute_all_databases mz_catalog_server {attrelid,attname,atttypid,attlen,attnum,atttypmod,attnotnull,atthasdef,attidentity,attgenerated,attisdropped,attcollation,database_name,pg_type_database_name} "" +pg_class_all_databases_ind pg_class_all_databases mz_catalog_server {relname} "" +pg_description_all_databases_ind pg_description_all_databases mz_catalog_server {objoid,classoid,objsubid,description,oid_database_name,class_database_name} "" +pg_namespace_all_databases_ind pg_namespace_all_databases mz_catalog_server {nspname} "" +pg_type_all_databases_ind pg_type_all_databases mz_catalog_server {oid} "" + +>[version>13100] SHOW INDEXES IN CLUSTER mz_catalog_server mz_active_peeks_per_worker_s2_primary_idx mz_active_peeks_per_worker mz_catalog_server {id,worker_id} "" mz_arrangement_batches_raw_s2_primary_idx mz_arrangement_batches_raw mz_catalog_server {operator_id,worker_id} "" mz_arrangement_records_raw_s2_primary_idx mz_arrangement_records_raw mz_catalog_server {operator_id,worker_id} "" From 3420277d0ad14489503d1e3ff5d40b6bcd2c1e8a Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Thu, 30 Jan 2025 08:12:51 +0000 Subject: [PATCH 50/52] ci: Larger agent for td migration test --- ci/nightly/pipeline.template.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/nightly/pipeline.template.yml b/ci/nightly/pipeline.template.yml index b8aeb8c1d6bc5..2d418175e193f 100644 --- a/ci/nightly/pipeline.template.yml +++ b/ci/nightly/pipeline.template.yml @@ -708,7 +708,7 @@ steps: composition: testdrive-old-kafka-src-syntax run: migration agents: - queue: hetzner-aarch64-8cpu-16gb + queue: hetzner-aarch64-16cpu-32gb - group: AWS key: aws From e2b56b47c7d56e4fd309dd4dc397931c2f2e1be9 Mon Sep 17 00:00:00 2001 From: Joseph Koshakow Date: Thu, 30 Jan 2025 10:06:09 -0500 Subject: [PATCH 51/52] Fix mz config --- .../mzcompose.py | 56 ++++++++++++++----- 1 file changed, 41 insertions(+), 15 deletions(-) diff --git a/test/testdrive-old-kafka-src-syntax/mzcompose.py b/test/testdrive-old-kafka-src-syntax/mzcompose.py index 31ee0363afc7c..0f8565324e8c8 100644 --- a/test/testdrive-old-kafka-src-syntax/mzcompose.py +++ b/test/testdrive-old-kafka-src-syntax/mzcompose.py @@ -281,6 +281,17 @@ def workflow_migration(c: Composition, parser: WorkflowArgumentParser) -> None: type=str, help="set the default timeout for Testdrive", ) + + parser.add_argument( + "--rewrite-results", + action="store_true", + help="Rewrite results, disables junit reports", + ) + + parser.add_argument( + "--azurite", action="store_true", help="Use Azurite as blob store instead of S3" + ) + parser.add_argument( "files", nargs="*", @@ -312,18 +323,6 @@ def workflow_migration(c: Composition, parser: WorkflowArgumentParser) -> None: dependencies += kafka_deps - testdrive = Testdrive( - forward_buildkite_shard=True, - kafka_default_partitions=args.kafka_default_partitions, - aws_region=args.aws_region, - default_timeout=args.default_timeout, - volumes_extra=["mzdata:/mzdata"], - external_blob_store=True, - fivetran_destination=True, - fivetran_destination_files_path="/share/tmp", - entrypoint_extra=[f"--var=uses-redpanda={args.redpanda}"], - ) - sysparams = args.system_param if not args.system_param: 
sysparams = [] @@ -337,14 +336,41 @@ def workflow_migration(c: Composition, parser: WorkflowArgumentParser) -> None: additional_system_parameter_defaults[key] = val + leaves_tombstones = ( + "true" + if additional_system_parameter_defaults.get( + "storage_use_continual_feedback_upsert", + get_default_system_parameters()["storage_use_continual_feedback_upsert"], + ) + == "false" + else "false" + ) + mz_old = Materialized( default_size=Materialized.Size.DEFAULT_SIZE, - image=get_old_image_for_source_table_migration_test(), - external_metadata_store=True, + image=get_new_image_for_source_table_migration_test(), external_blob_store=True, + blob_store_is_azure=args.azurite, additional_system_parameter_defaults=dict(additional_system_parameter_defaults), ) + testdrive = Testdrive( + forward_buildkite_shard=True, + kafka_default_partitions=args.kafka_default_partitions, + aws_region=args.aws_region, + validate_catalog_store=True, + default_timeout=args.default_timeout, + volumes_extra=["mzdata:/mzdata"], + external_blob_store=True, + blob_store_is_azure=args.azurite, + fivetran_destination=True, + fivetran_destination_files_path="/share/tmp", + entrypoint_extra=[ + f"--var=uses-redpanda={args.redpanda}", + f"--var=leaves-tombstones={leaves_tombstones}", + ], + ) + print(additional_system_parameter_defaults) x = dict(additional_system_parameter_defaults) additional_system_parameter_defaults["force_source_table_syntax"] = "true" @@ -354,8 +380,8 @@ def workflow_migration(c: Composition, parser: WorkflowArgumentParser) -> None: mz_new = Materialized( default_size=Materialized.Size.DEFAULT_SIZE, image=get_new_image_for_source_table_migration_test(), - external_metadata_store=True, external_blob_store=True, + blob_store_is_azure=args.azurite, additional_system_parameter_defaults=additional_system_parameter_defaults, ) From 1286010677cfcbfe27cf361b41719827aa65f1e8 Mon Sep 17 00:00:00 2001 From: Joseph Koshakow Date: Thu, 30 Jan 2025 10:27:04 -0500 Subject: [PATCH 52/52] Fix kafka tests --- test/testdrive-old-kafka-src-syntax/mzcompose.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/test/testdrive-old-kafka-src-syntax/mzcompose.py b/test/testdrive-old-kafka-src-syntax/mzcompose.py index 0f8565324e8c8..3491e6b711218 100644 --- a/test/testdrive-old-kafka-src-syntax/mzcompose.py +++ b/test/testdrive-old-kafka-src-syntax/mzcompose.py @@ -342,13 +342,13 @@ def workflow_migration(c: Composition, parser: WorkflowArgumentParser) -> None: "storage_use_continual_feedback_upsert", get_default_system_parameters()["storage_use_continual_feedback_upsert"], ) - == "false" + == "false" else "false" ) mz_old = Materialized( default_size=Materialized.Size.DEFAULT_SIZE, - image=get_new_image_for_source_table_migration_test(), + image=get_old_image_for_source_table_migration_test(), external_blob_store=True, blob_store_is_azure=args.azurite, additional_system_parameter_defaults=dict(additional_system_parameter_defaults), @@ -358,7 +358,6 @@ def workflow_migration(c: Composition, parser: WorkflowArgumentParser) -> None: forward_buildkite_shard=True, kafka_default_partitions=args.kafka_default_partitions, aws_region=args.aws_region, - validate_catalog_store=True, default_timeout=args.default_timeout, volumes_extra=["mzdata:/mzdata"], external_blob_store=True,