Skip to content

[8.17] ESQL: ESQL: Fix ReplaceMissingFieldsWithNull (#125764) #126187

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions docs/changelog/125764.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
pr: 125764
summary: Fix `ReplaceMissingFieldsWithNull`
area: ES|QL
type: bug
issues:
- 126036
- 121754
- 126030
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
import org.elasticsearch.xpack.esql.parser.QueryParam;
import org.elasticsearch.xpack.esql.plan.logical.Enrich;
import org.elasticsearch.xpack.esql.plan.logical.EsRelation;
import org.elasticsearch.xpack.esql.plan.logical.Limit;
import org.elasticsearch.xpack.esql.plan.logical.LogicalPlan;
import org.elasticsearch.xpack.esql.plan.logical.local.LocalRelation;
import org.elasticsearch.xpack.esql.plan.logical.local.LocalSupplier;
Expand Down Expand Up @@ -132,6 +133,7 @@
import static org.elasticsearch.xpack.esql.parser.ParserUtils.ParamClassification.PATTERN;
import static org.elasticsearch.xpack.esql.parser.ParserUtils.ParamClassification.VALUE;
import static org.hamcrest.Matchers.instanceOf;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;

Expand Down Expand Up @@ -393,6 +395,14 @@ public static <T> T as(Object node, Class<T> type) {
return type.cast(node);
}

public static Limit asLimit(Object node, Integer limitLiteral) {
Limit limit = as(node, Limit.class);
if (limitLiteral != null) {
assertEquals(as(limit.limit(), Literal.class).value(), limitLiteral);
}
return limit;
}

public static Map<String, EsField> loadMapping(String name) {
return LoadMapping.loadMapping(name);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -644,3 +644,21 @@ FROM airports
abbrev:k | city_name:k | city_location:geo_point | country:k | location:geo_point | name:text | region:text | boundary_wkt_length:i
IDR | Indore | POINT(75.8472 22.7167) | India | POINT(75.8092915005895 22.727749187571) | Devi Ahilyabai Holkar Int'l | Indore City | 231
;

// Regression test for https://github.com/elastic/elasticsearch/issues/126030
// We had wrong layouts from ReplaceMissingFieldsWithNull in case of indices that had relevant fields for the query,
// but were **missing the field we enrich on**.
fieldsInOtherIndicesBug
required_capability: enrich_load
required_capability: fix_replace_missing_field_with_null_duplicate_name_id_in_layout

from *
| keep author.keyword, book_no, scalerank, street, bytes_in, @timestamp, abbrev, city_location, distance, description, birth_date, language_code, intersects, client_ip, event_duration, version
| enrich languages_policy on author.keyword
| sort book_no
| limit 1
;

author.keyword:keyword|book_no:keyword|scalerank:integer|street:keyword|bytes_in:ul|@timestamp:null|abbrev:keyword|city_location:geo_point|distance:double|description:null|birth_date:date|language_code:keyword|intersects:boolean|client_ip:null|event_duration:long|version:version|language_name:keyword
Fyodor Dostoevsky |1211 |null |null |null |null |null |null |null |null |null |null |null |null |null |null |null
;
Original file line number Diff line number Diff line change
Expand Up @@ -524,7 +524,12 @@ public enum Cap {
/**
* Fix for regex folding with case-insensitive pattern https://github.com/elastic/elasticsearch/issues/118371
*/
FIXED_REGEX_FOLD;
FIXED_REGEX_FOLD,

/**
* Avoid duplicated channels with the same name id when executing ESQL queries.
*/
FIX_REPLACE_MISSING_FIELD_WITH_NULL_DUPLICATE_NAME_ID_IN_LAYOUT;

private final boolean enabled;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,15 @@
package org.elasticsearch.xpack.esql.optimizer.rules.logical.local;

import org.elasticsearch.common.util.Maps;
import org.elasticsearch.index.IndexMode;
import org.elasticsearch.xpack.esql.core.expression.Alias;
import org.elasticsearch.xpack.esql.core.expression.Attribute;
import org.elasticsearch.xpack.esql.core.expression.AttributeSet;
import org.elasticsearch.xpack.esql.core.expression.FieldAttribute;
import org.elasticsearch.xpack.esql.core.expression.Literal;
import org.elasticsearch.xpack.esql.core.expression.NamedExpression;
import org.elasticsearch.xpack.esql.core.type.DataType;
import org.elasticsearch.xpack.esql.optimizer.LocalLogicalOptimizerContext;
import org.elasticsearch.xpack.esql.plan.logical.Aggregate;
import org.elasticsearch.xpack.esql.plan.logical.EsRelation;
import org.elasticsearch.xpack.esql.plan.logical.Eval;
import org.elasticsearch.xpack.esql.plan.logical.Filter;
Expand All @@ -23,13 +25,12 @@
import org.elasticsearch.xpack.esql.plan.logical.Project;
import org.elasticsearch.xpack.esql.plan.logical.RegexExtract;
import org.elasticsearch.xpack.esql.plan.logical.TopN;
import org.elasticsearch.xpack.esql.plan.logical.local.LocalRelation;
import org.elasticsearch.xpack.esql.rule.ParameterizedRule;
import org.elasticsearch.xpack.esql.stats.SearchStats;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.function.Predicate;

/**
* Look for any fields used in the plan that are missing locally and replace them with null.
Expand All @@ -39,60 +40,84 @@ public class ReplaceMissingFieldWithNull extends ParameterizedRule<LogicalPlan,

@Override
public LogicalPlan apply(LogicalPlan plan, LocalLogicalOptimizerContext localLogicalOptimizerContext) {
return plan.transformUp(p -> missingToNull(p, localLogicalOptimizerContext.searchStats()));
}
// Fields from lookup indices don't need to be present on the node, and our search stats don't include them, anyway. Ignore them.
AttributeSet lookupFields = new AttributeSet();
plan.forEachUp(EsRelation.class, esRelation -> {
// Looking only for indices in LOOKUP mode is correct: during parsing, we assign the expected mode and even if a lookup index
// is used in the FROM command, it will not be marked with LOOKUP mode there - but STANDARD.
// It seems like we could instead just look for JOINs and walk down their right hand side to find lookup fields - but this does
// not work as this rule also gets called just on the right hand side of a JOIN, which means that we don't always know that
// we're inside the right (or left) branch of a JOIN node. (See PlannerUtils.localPlan - this looks for FragmentExecs and
// performs local logical optimization of the fragments; the right hand side of a LookupJoinExec can be a FragmentExec.)
if (esRelation.indexMode() == IndexMode.LOOKUP) {
lookupFields.addAll(esRelation.output());
}
});

private LogicalPlan missingToNull(LogicalPlan plan, SearchStats stats) {
if (plan instanceof EsRelation || plan instanceof LocalRelation) {
return plan;
}
// Do not use the attribute name, this can deviate from the field name for union types; use fieldName() instead.
// Also retain fields from lookup indices because we do not have stats for these.
Predicate<FieldAttribute> shouldBeRetained = f -> (localLogicalOptimizerContext.searchStats().exists(f.fieldName())
|| lookupFields.contains(f));

if (plan instanceof Aggregate a) {
// don't do anything (for now)
return a;
}
// keep the aliased name
else if (plan instanceof Project project) {
var projections = project.projections();
List<NamedExpression> newProjections = new ArrayList<>(projections.size());
Map<DataType, Alias> nullLiteral = Maps.newLinkedHashMapWithExpectedSize(DataType.types().size());
return plan.transformUp(p -> missingToNull(p, shouldBeRetained));
}

for (NamedExpression projection : projections) {
// Do not use the attribute name, this can deviate from the field name for union types.
if (projection instanceof FieldAttribute f && stats.exists(f.fieldName()) == false) {
private LogicalPlan missingToNull(LogicalPlan plan, Predicate<FieldAttribute> shouldBeRetained) {
if (plan instanceof EsRelation relation) {
// Remove missing fields from the EsRelation because this is not where we will obtain them from; replace them by an Eval right
// after, instead. This allows us to safely re-use the attribute ids of the corresponding FieldAttributes.
// This means that an EsRelation[field1, field2, field3] where field1 and field 3 are missing will be replaced by
// Project[field1, field2, field3] <- keeps the ordering intact
// \_Eval[field1 = null, field3 = null]
// \_EsRelation[field2]
List<Attribute> relationOutput = relation.output();
Map<DataType, Alias> nullLiterals = Maps.newLinkedHashMapWithExpectedSize(DataType.types().size());
List<NamedExpression> newProjections = new ArrayList<>(relationOutput.size());
for (int i = 0, size = relationOutput.size(); i < size; i++) {
Attribute attr = relationOutput.get(i);
NamedExpression projection;
if (attr instanceof FieldAttribute f && (shouldBeRetained.test(f) == false)) {
DataType dt = f.dataType();
Alias nullAlias = nullLiteral.get(f.dataType());
Alias nullAlias = nullLiterals.get(dt);
// save the first field as null (per datatype)
if (nullAlias == null) {
// Keep the same id so downstream query plans don't need updating
// NOTE: THIS IS BRITTLE AND CAN LEAD TO BUGS.
// In case some optimizer rule or so inserts a plan node that requires the field BEFORE the Eval that we're adding
// on top of the EsRelation, this can trigger a field extraction in the physical optimizer phase, causing wrong
// layouts due to a duplicate name id.
// If someone reaches here AGAIN when debugging e.g. ClassCastExceptions NPEs from wrong layouts, we should probably
// give up on this approach and instead insert EvalExecs in InsertFieldExtraction.
Alias alias = new Alias(f.source(), f.name(), Literal.of(f, null), f.id());
nullLiteral.put(dt, alias);
nullLiterals.put(dt, alias);
projection = alias.toAttribute();
}
// otherwise point to it
// otherwise point to it since this avoids creating field copies
else {
// since avoids creating field copies
projection = new Alias(f.source(), f.name(), nullAlias.toAttribute(), f.id());
}
} else {
projection = attr;
}

newProjections.add(projection);
}
// add the first found field as null
if (nullLiteral.size() > 0) {
plan = new Eval(project.source(), project.child(), new ArrayList<>(nullLiteral.values()));
plan = new Project(project.source(), plan, newProjections);

if (nullLiterals.size() == 0) {
return plan;
}
} else if (plan instanceof Eval

Eval eval = new Eval(plan.source(), relation, new ArrayList<>(nullLiterals.values()));
// This projection is redundant if there's another projection downstream (and no commands depend on the order until we hit it).
return new Project(plan.source(), eval, newProjections);
}

if (plan instanceof Eval
|| plan instanceof Filter
|| plan instanceof OrderBy
|| plan instanceof RegexExtract
|| plan instanceof TopN) {
plan = plan.transformExpressionsOnlyUp(
FieldAttribute.class,
// Do not use the attribute name, this can deviate from the field name for union types.
f -> stats.exists(f.fieldName()) ? f : Literal.of(f, null)
);
}
return plan.transformExpressionsOnlyUp(FieldAttribute.class, f -> shouldBeRetained.test(f) ? f : Literal.of(f, null));
}

return plan;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,12 +109,11 @@ public Layout build() {
for (ChannelSet set : channels) {
int channel = numberOfChannels++;
for (NameId id : set.nameIds) {
// Duplicate name ids would mean that have 2 channels that are declared under the same id. That makes no sense - which
// channel should subsequent operators use, then, when they want to refer to this id?
assert (layout.containsKey(id) == false) : "Duplicate name ids are not allowed in layouts";
ChannelAndType next = new ChannelAndType(channel, set.type);
ChannelAndType prev = layout.put(id, next);
// Do allow multiple name to point to the same channel - see https://github.com/elastic/elasticsearch/pull/100238
// if (prev != null) {
// throw new IllegalArgumentException("Name [" + id + "] is on two channels [" + prev + "] and [" + next + "]");
// }
layout.put(id, next);
}
}
return new DefaultLayout(Collections.unmodifiableMap(layout), numberOfChannels);
Expand Down
Loading