diff --git a/docs/changelog/125764.yaml b/docs/changelog/125764.yaml new file mode 100644 index 0000000000000..8f85645c1425c --- /dev/null +++ b/docs/changelog/125764.yaml @@ -0,0 +1,8 @@ +pr: 125764 +summary: Fix `ReplaceMissingFieldsWithNull` +area: ES|QL +type: bug +issues: + - 126036 + - 121754 + - 126030 diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/enrich.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/enrich.csv-spec index 25b114b5d1daf..9bfb08eb82b45 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/enrich.csv-spec +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/enrich.csv-spec @@ -643,3 +643,21 @@ FROM airports abbrev:k | city_name:k | city_location:geo_point | country:k | location:geo_point | name:text | region:text | boundary_wkt_length:i IDR | Indore | POINT(75.8472 22.7167) | India | POINT(75.8092915005895 22.727749187571) | Devi Ahilyabai Holkar Int'l | Indore City | 231 ; + +// Regression test for https://github.com/elastic/elasticsearch/issues/126030 +// We had wrong layouts from ReplaceMissingFieldsWithNull in case of indices that had relevant fields for the query, +// but were **missing the field we enrich on**. +fieldsInOtherIndicesBug +required_capability: enrich_load +required_capability: fix_replace_missing_field_with_null_duplicate_name_id_in_layout + +from * +| keep author.keyword, book_no, scalerank, street, bytes_in, @timestamp, abbrev, city_location, distance, description, birth_date, language_code, intersects, client_ip, event_duration, version +| enrich languages_policy on author.keyword +| sort book_no +| limit 1 +; + +author.keyword:keyword|book_no:keyword|scalerank:integer|street:keyword|bytes_in:ul|@timestamp:unsupported|abbrev:keyword|city_location:geo_point|distance:double|description:unsupported|birth_date:date|language_code:integer|intersects:boolean|client_ip:unsupported|event_duration:long|version:version|language_name:keyword +Fyodor Dostoevsky |1211 |null |null |null |null |null |null |null |null |null |null |null |null |null |null |null +; diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/lookup-join.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/lookup-join.csv-spec index 8f025278cab1d..6f5d70cccddeb 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/lookup-join.csv-spec +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/lookup-join.csv-spec @@ -1453,6 +1453,10 @@ emp_no:integer | language_code:integer | language_name:keyword 10093 | 3 | Spanish ; +############################################### +# Bugfixes +############################################### + multipleBatchesWithSort required_capability: join_lookup_v12 required_capability: remove_redundant_sort @@ -1539,3 +1543,21 @@ from * m:integer |birth_date:datetime null |1952-02-27T00:00:00.000Z ; + +// Regression test for https://github.com/elastic/elasticsearch/issues/126030 +// We had wrong layouts from ReplaceMissingFieldsWithNull + +enrichLookupStatsBug +required_capability: join_lookup_v12 +required_capability: fix_replace_missing_field_with_null_duplicate_name_id_in_layout + +from * +| enrich languages_policy on cluster +| rename languages.byte as language_code +| lookup join languages_lookup on language_code +| stats salary_change.long = max(ratings), foo = max(num) +; + +salary_change.long:double|foo:long +5.0 |1698069301543123456 +; diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java index 25b028dcf1a78..d7e0709226ea6 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java @@ -932,7 +932,12 @@ public enum Cap { /** * Support for sorting when aggregate_metric_doubles are present */ - AGGREGATE_METRIC_DOUBLE_SORTING(AGGREGATE_METRIC_DOUBLE_FEATURE_FLAG); + AGGREGATE_METRIC_DOUBLE_SORTING(AGGREGATE_METRIC_DOUBLE_FEATURE_FLAG), + + /** + * Supercedes {@link Cap#MAKE_NUMBER_OF_CHANNELS_CONSISTENT_WITH_LAYOUT}. + */ + FIX_REPLACE_MISSING_FIELD_WITH_NULL_DUPLICATE_NAME_ID_IN_LAYOUT; private final boolean enabled; diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/local/ReplaceMissingFieldWithNull.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/local/ReplaceMissingFieldWithNull.java index 5aa8cab025419..236d0863f7ff9 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/local/ReplaceMissingFieldWithNull.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/local/ReplaceMissingFieldWithNull.java @@ -10,6 +10,7 @@ import org.elasticsearch.common.util.Maps; import org.elasticsearch.index.IndexMode; import org.elasticsearch.xpack.esql.core.expression.Alias; +import org.elasticsearch.xpack.esql.core.expression.Attribute; import org.elasticsearch.xpack.esql.core.expression.AttributeSet; import org.elasticsearch.xpack.esql.core.expression.FieldAttribute; import org.elasticsearch.xpack.esql.core.expression.Literal; @@ -17,7 +18,6 @@ import org.elasticsearch.xpack.esql.core.type.DataType; import org.elasticsearch.xpack.esql.core.type.PotentiallyUnmappedKeywordEsField; import org.elasticsearch.xpack.esql.optimizer.LocalLogicalOptimizerContext; -import org.elasticsearch.xpack.esql.plan.logical.Aggregate; import org.elasticsearch.xpack.esql.plan.logical.EsRelation; import org.elasticsearch.xpack.esql.plan.logical.Eval; import org.elasticsearch.xpack.esql.plan.logical.Filter; @@ -26,14 +26,12 @@ import org.elasticsearch.xpack.esql.plan.logical.Project; import org.elasticsearch.xpack.esql.plan.logical.RegexExtract; import org.elasticsearch.xpack.esql.plan.logical.TopN; -import org.elasticsearch.xpack.esql.plan.logical.join.Join; -import org.elasticsearch.xpack.esql.plan.logical.local.LocalRelation; import org.elasticsearch.xpack.esql.rule.ParameterizedRule; -import org.elasticsearch.xpack.esql.stats.SearchStats; import java.util.ArrayList; import java.util.List; import java.util.Map; +import java.util.function.Predicate; /** * Look for any fields used in the plan that are missing locally and replace them with null. @@ -45,82 +43,83 @@ public class ReplaceMissingFieldWithNull extends ParameterizedRule<LogicalPlan, public LogicalPlan apply(LogicalPlan plan, LocalLogicalOptimizerContext localLogicalOptimizerContext) { var lookupFieldsBuilder = AttributeSet.builder(); plan.forEachUp(EsRelation.class, esRelation -> { + // Looking only for indices in LOOKUP mode is correct: during parsing, we assign the expected mode and even if a lookup index + // is used in the FROM command, it will not be marked with LOOKUP mode there - but STANDARD. + // It seems like we could instead just look for JOINs and walk down their right hand side to find lookup fields - but this does + // not work as this rule also gets called just on the right hand side of a JOIN, which means that we don't always know that + // we're inside the right (or left) branch of a JOIN node. (See PlannerUtils.localPlan - this looks for FragmentExecs and + // performs local logical optimization of the fragments; the right hand side of a LookupJoinExec can be a FragmentExec.) if (esRelation.indexMode() == IndexMode.LOOKUP) { lookupFieldsBuilder.addAll(esRelation.output()); } }); + AttributeSet lookupFields = lookupFieldsBuilder.build(); - return plan.transformUp(p -> missingToNull(p, localLogicalOptimizerContext.searchStats(), lookupFieldsBuilder.build())); - } - - private LogicalPlan missingToNull(LogicalPlan plan, SearchStats stats, AttributeSet lookupFields) { - if (plan instanceof EsRelation || plan instanceof LocalRelation) { - return plan; - } + // Do not use the attribute name, this can deviate from the field name for union types; use fieldName() instead. + // Also retain fields from lookup indices because we do not have stats for these. + Predicate<FieldAttribute> shouldBeRetained = f -> f.field() instanceof PotentiallyUnmappedKeywordEsField + || (localLogicalOptimizerContext.searchStats().exists(f.fieldName()) || lookupFields.contains(f)); - if (plan instanceof Aggregate a) { - // don't do anything (for now) - return a; - } - // keep the aliased name - else if (plan instanceof Project project) { - var projections = project.projections(); - List<NamedExpression> newProjections = new ArrayList<>(projections.size()); - Map<DataType, Alias> nullLiteral = Maps.newLinkedHashMapWithExpectedSize(DataType.types().size()); - AttributeSet joinAttributes = joinAttributes(project); + return plan.transformUp(p -> missingToNull(p, shouldBeRetained)); + } - for (NamedExpression projection : projections) { - // Do not use the attribute name, this can deviate from the field name for union types. - if (projection instanceof FieldAttribute f - && stats.exists(f.fieldName()) == false - && joinAttributes.contains(f) == false - && f.field() instanceof PotentiallyUnmappedKeywordEsField == false) { - // TODO: Should do a searchStats lookup for join attributes instead of just ignoring them here - // See TransportSearchShardsAction + private LogicalPlan missingToNull(LogicalPlan plan, Predicate<FieldAttribute> shouldBeRetained) { + if (plan instanceof EsRelation relation) { + // Remove missing fields from the EsRelation because this is not where we will obtain them from; replace them by an Eval right + // after, instead. This allows us to safely re-use the attribute ids of the corresponding FieldAttributes. + // This means that an EsRelation[field1, field2, field3] where field1 and field 3 are missing will be replaced by + // Project[field1, field2, field3] <- keeps the ordering intact + // \_Eval[field1 = null, field3 = null] + // \_EsRelation[field2] + List<Attribute> relationOutput = relation.output(); + Map<DataType, Alias> nullLiterals = Maps.newLinkedHashMapWithExpectedSize(DataType.types().size()); + List<NamedExpression> newProjections = new ArrayList<>(relationOutput.size()); + for (int i = 0, size = relationOutput.size(); i < size; i++) { + Attribute attr = relationOutput.get(i); + NamedExpression projection; + if (attr instanceof FieldAttribute f && (shouldBeRetained.test(f) == false)) { DataType dt = f.dataType(); - Alias nullAlias = nullLiteral.get(f.dataType()); + Alias nullAlias = nullLiterals.get(dt); // save the first field as null (per datatype) if (nullAlias == null) { + // Keep the same id so downstream query plans don't need updating + // NOTE: THIS IS BRITTLE AND CAN LEAD TO BUGS. + // In case some optimizer rule or so inserts a plan node that requires the field BEFORE the Eval that we're adding + // on top of the EsRelation, this can trigger a field extraction in the physical optimizer phase, causing wrong + // layouts due to a duplicate name id. + // If someone reaches here AGAIN when debugging e.g. ClassCastExceptions NPEs from wrong layouts, we should probably + // give up on this approach and instead insert EvalExecs in InsertFieldExtraction. Alias alias = new Alias(f.source(), f.name(), Literal.of(f, null), f.id()); - nullLiteral.put(dt, alias); + nullLiterals.put(dt, alias); projection = alias.toAttribute(); } - // otherwise point to it + // otherwise point to it since this avoids creating field copies else { - // since avoids creating field copies projection = new Alias(f.source(), f.name(), nullAlias.toAttribute(), f.id()); } + } else { + projection = attr; } - newProjections.add(projection); } - // add the first found field as null - if (nullLiteral.size() > 0) { - plan = new Eval(project.source(), project.child(), new ArrayList<>(nullLiteral.values())); - plan = new Project(project.source(), plan, newProjections); + + if (nullLiterals.size() == 0) { + return plan; } - } else if (plan instanceof Eval + + Eval eval = new Eval(plan.source(), relation, new ArrayList<>(nullLiterals.values())); + // This projection is redundant if there's another projection downstream (and no commands depend on the order until we hit it). + return new Project(plan.source(), eval, newProjections); + } + + if (plan instanceof Eval || plan instanceof Filter || plan instanceof OrderBy || plan instanceof RegexExtract || plan instanceof TopN) { - plan = plan.transformExpressionsOnlyUp( - FieldAttribute.class, - // Do not use the attribute name, this can deviate from the field name for union types. - // Also skip fields from lookup indices because we do not have stats for these. - // TODO: We do have stats for lookup indices in case they are being used in the FROM clause; this can be refined. - f -> f.field() instanceof PotentiallyUnmappedKeywordEsField || (stats.exists(f.fieldName()) || lookupFields.contains(f)) - ? f - : Literal.of(f, null) - ); - } + return plan.transformExpressionsOnlyUp(FieldAttribute.class, f -> shouldBeRetained.test(f) ? f : Literal.of(f, null)); + } return plan; } - - private static AttributeSet joinAttributes(Project project) { - var attributesBuilder = AttributeSet.builder(); - project.forEachDown(Join.class, j -> j.right().forEachDown(EsRelation.class, p -> attributesBuilder.addAll(p.output()))); - return attributesBuilder.build(); - } } diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/planner/Layout.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/planner/Layout.java index a707bb1c6c000..b0e10ca7975ca 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/planner/Layout.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/planner/Layout.java @@ -107,26 +107,13 @@ public Layout build() { Map<NameId, ChannelAndType> layout = new HashMap<>(); int numberOfChannels = 0; for (ChannelSet set : channels) { - boolean createNewChannel = true; - int channel = 0; + int channel = numberOfChannels++; for (NameId id : set.nameIds) { - if (layout.containsKey(id)) { - // If a NameId already exists in the map, do not increase the numberOfChannels, it can cause inverse() to create - // a null in the list of channels, and NullPointerException when build() is called. - // TODO avoid adding duplicated attributes with the same id in the plan, ReplaceMissingFieldWithNull may add nulls - // with the same ids as the missing field ids. - continue; - } - if (createNewChannel) { - channel = numberOfChannels++; - createNewChannel = false; - } + // Duplicate name ids would mean that have 2 channels that are declared under the same id. That makes no sense - which + // channel should subsequent operators use, then, when they want to refer to this id? + assert (layout.containsKey(id) == false) : "Duplicate name ids are not allowed in layouts"; ChannelAndType next = new ChannelAndType(channel, set.type); - ChannelAndType prev = layout.put(id, next); - // Do allow multiple name to point to the same channel - see https://github.com/elastic/elasticsearch/pull/100238 - // if (prev != null) { - // throw new IllegalArgumentException("Name [" + id + "] is on two channels [" + prev + "] and [" + next + "]"); - // } + layout.put(id, next); } } return new DefaultLayout(Collections.unmodifiableMap(layout), numberOfChannels); diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/LocalLogicalPlanOptimizerTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/LocalLogicalPlanOptimizerTests.java index 98f3d1d2d8d8e..6903e5dfce35d 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/LocalLogicalPlanOptimizerTests.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/LocalLogicalPlanOptimizerTests.java @@ -22,7 +22,6 @@ import org.elasticsearch.xpack.esql.core.expression.FieldAttribute; import org.elasticsearch.xpack.esql.core.expression.FoldContext; import org.elasticsearch.xpack.esql.core.expression.Literal; -import org.elasticsearch.xpack.esql.core.expression.ReferenceAttribute; import org.elasticsearch.xpack.esql.core.tree.NodeInfo; import org.elasticsearch.xpack.esql.core.tree.Source; import org.elasticsearch.xpack.esql.core.type.DataType; @@ -78,6 +77,7 @@ import static org.hamcrest.Matchers.contains; import static org.hamcrest.Matchers.hasSize; import static org.hamcrest.Matchers.is; +import static org.hamcrest.Matchers.not; import static org.hamcrest.Matchers.nullValue; //@TestLogging(value = "org.elasticsearch.xpack.esql:TRACE", reason = "debug") @@ -141,7 +141,7 @@ public void testMissingFieldInFilterString() { /** * Expects - * Project[[last_name{r}#6]] + * Project[[last_name{f}#6]] * \_Eval[[null[KEYWORD] AS last_name]] * \_Limit[10000[INTEGER]] * \_EsRelation[test][_meta_field{f}#8, emp_no{f}#2, first_name{f}#3, gen..] @@ -158,7 +158,7 @@ public void testMissingFieldInProject() { var project = as(localPlan, Project.class); var projections = project.projections(); assertThat(Expressions.names(projections), contains("last_name")); - as(projections.get(0), ReferenceAttribute.class); + as(projections.get(0), FieldAttribute.class); var eval = as(project.child(), Eval.class); assertThat(Expressions.names(eval.fields()), contains("last_name")); var alias = as(eval.fields().get(0), Alias.class); @@ -168,6 +168,7 @@ public void testMissingFieldInProject() { var limit = as(eval.child(), Limit.class); var source = as(limit.child(), EsRelation.class); + assertThat(Expressions.names(source.output()), not(contains("last_name"))); } /** @@ -192,6 +193,7 @@ public void testMissingFieldInSort() { var limit = as(project.child(), Limit.class); var source = as(limit.child(), EsRelation.class); + assertThat(Expressions.names(source.output()), not(contains("last_name"))); } /** @@ -199,8 +201,11 @@ public void testMissingFieldInSort() { * EsqlProject[[first_name{f}#7, last_name{r}#17]] * \_Limit[1000[INTEGER],true] * \_MvExpand[last_name{f}#10,last_name{r}#17] - * \_Limit[1000[INTEGER],false] - * \_EsRelation[test][_meta_field{f}#12, emp_no{f}#6, first_name{f}#7, ge..] + * \_Project[[_meta_field{f}#12, emp_no{f}#6, first_name{f}#7, gender{f}#8, hire_date{f}#13, job{f}#14, job.raw{f}#15, lang + * uages{f}#9, last_name{r}#10, long_noidx{f}#16, salary{f}#11]] + * \_Eval[[null[KEYWORD] AS last_name]] + * \_Limit[1000[INTEGER],false] + * \_EsRelation[test][_meta_field{f}#12, emp_no{f}#6, first_name{f}#7, ge..] */ public void testMissingFieldInMvExpand() { var plan = plan(""" @@ -212,14 +217,23 @@ public void testMissingFieldInMvExpand() { var testStats = statsForMissingField("last_name"); var localPlan = localPlan(plan, testStats); + // It'd be much better if this project was pushed down past the MvExpand, because MvExpand's cost scales with the number of + // involved attributes/columns. var project = as(localPlan, EsqlProject.class); var projections = project.projections(); assertThat(Expressions.names(projections), contains("first_name", "last_name")); var limit1 = asLimit(project.child(), 1000, true); var mvExpand = as(limit1.child(), MvExpand.class); - var limit2 = asLimit(mvExpand.child(), 1000, false); - as(limit2.child(), EsRelation.class); + var project2 = as(mvExpand.child(), Project.class); + var eval = as(project2.child(), Eval.class); + assertEquals(eval.fields().size(), 1); + var lastName = eval.fields().get(0); + assertEquals(lastName.name(), "last_name"); + assertEquals(lastName.child(), new Literal(EMPTY, null, DataType.KEYWORD)); + var limit2 = asLimit(eval.child(), 1000, false); + var relation = as(limit2.child(), EsRelation.class); + assertThat(Expressions.names(relation.output()), not(contains("last_name"))); } public static class MockFieldAttributeCommand extends UnaryPlan { @@ -275,6 +289,39 @@ public void testMissingFieldInNewCommand() { ), testStats ); + + var plan = plan(""" + from test + """); + var initialRelation = plan.collectLeaves().get(0); + FieldAttribute lastName = null; + for (Attribute attr : initialRelation.output()) { + if (attr.name().equals("last_name")) { + lastName = (FieldAttribute) attr; + } + } + + // Expects + // MockFieldAttributeCommand[last_name{f}#7] + // \_Project[[_meta_field{f}#9, emp_no{f}#3, first_name{f}#4, gender{f}#5, hire_date{f}#10, job{f}#11, job.raw{f}#12, langu + // ages{f}#6, last_name{r}#7, long_noidx{f}#13, salary{f}#8]] + // \_Eval[[null[KEYWORD] AS last_name]] + // \_Limit[1000[INTEGER],false] + // \_EsRelation[test][_meta_field{f}#9, emp_no{f}#3, first_name{f}#4, gen..] + LogicalPlan localPlan = localPlan(new MockFieldAttributeCommand(EMPTY, plan, lastName), testStats); + + var mockCommand = as(localPlan, MockFieldAttributeCommand.class); + var project = as(mockCommand.child(), Project.class); + var eval = as(project.child(), Eval.class); + var limit = asLimit(eval.child(), 1000); + var relation = as(limit.child(), EsRelation.class); + + assertThat(Expressions.names(eval.fields()), contains("last_name")); + var literal = as(eval.fields().get(0), Alias.class); + assertEquals(literal.child(), new Literal(EMPTY, null, DataType.KEYWORD)); + assertThat(Expressions.names(relation.output()), not(contains("last_name"))); + + assertEquals(Expressions.names(initialRelation.output()), Expressions.names(project.output())); } /** diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/LocalPhysicalPlanOptimizerTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/LocalPhysicalPlanOptimizerTests.java index e443fd7a4e9a4..ba996519b307f 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/LocalPhysicalPlanOptimizerTests.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/LocalPhysicalPlanOptimizerTests.java @@ -38,8 +38,8 @@ import org.elasticsearch.xpack.esql.analysis.Verifier; import org.elasticsearch.xpack.esql.core.expression.Alias; import org.elasticsearch.xpack.esql.core.expression.Expressions; +import org.elasticsearch.xpack.esql.core.expression.FieldAttribute; import org.elasticsearch.xpack.esql.core.expression.Literal; -import org.elasticsearch.xpack.esql.core.expression.ReferenceAttribute; import org.elasticsearch.xpack.esql.core.tree.Source; import org.elasticsearch.xpack.esql.core.type.DataType; import org.elasticsearch.xpack.esql.core.type.EsField; @@ -1340,9 +1340,9 @@ public void testMissingFieldsDoNotGetExtracted() { ) ); // emp_no - assertThat(projections.get(1), instanceOf(ReferenceAttribute.class)); + assertThat(projections.get(1), instanceOf(FieldAttribute.class)); // first_name - assertThat(projections.get(2), instanceOf(ReferenceAttribute.class)); + assertThat(projections.get(2), instanceOf(FieldAttribute.class)); // last_name --> first_name var nullAlias = Alias.unwrap(projections.get(8));