From 4096eb907d75707a45e7e8843fdda4fa9366472f Mon Sep 17 00:00:00 2001 From: linzhenqi Date: Fri, 5 Sep 2025 17:02:06 +0800 Subject: [PATCH 1/8] [Feature](func) Support function soundex --- be/src/vec/functions/function_soundex.cpp | 117 +++++++++++++++++ .../vec/functions/simple_function_factory.h | 2 + .../vec/function/function_soundex_test.cpp | 84 ++++++++++++ .../doris/catalog/BuiltinScalarFunctions.java | 2 + .../expressions/functions/scalar/Soundex.java | 69 ++++++++++ .../visitor/ScalarFunctionVisitor.java | 5 + .../string_functions/test_string_function.out | 120 ++++++++++++++++++ .../test_string_function.groovy | 41 ++++++ 8 files changed, 440 insertions(+) create mode 100644 be/src/vec/functions/function_soundex.cpp create mode 100644 be/test/vec/function/function_soundex_test.cpp create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/Soundex.java diff --git a/be/src/vec/functions/function_soundex.cpp b/be/src/vec/functions/function_soundex.cpp new file mode 100644 index 00000000000000..e280a6938d3804 --- /dev/null +++ b/be/src/vec/functions/function_soundex.cpp @@ -0,0 +1,117 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "vec/columns/column_string.h" +#include "vec/data_types/data_type_string.h" +#include "vec/functions/function.h" +#include "vec/functions/simple_function_factory.h" + +namespace doris::vectorized { +#include "common/compile_check_begin.h" + +class FunctionSoundex : public IFunction { +public: + static constexpr auto name = "soundex"; + + static FunctionPtr create() { return std::make_shared(); } + + String get_name() const override { return name; } + + size_t get_number_of_arguments() const override { return 1; } + + DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { + return std::make_shared(); + } + + Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, + uint32_t result, size_t input_rows_count) const override { + const ColumnPtr col_ptr = block.get_by_position(arguments[0]).column; + + auto res_column = ColumnString::create(); + for (size_t i = 0; i < input_rows_count; ++i) { + StringRef ref = col_ptr->get_data_at(i); + std::string word = std::string(ref.data, ref.size); + std::string soundex_code = calculate_soundex(word); + res_column->insert_data(soundex_code.c_str(), soundex_code.length()); + } + + block.replace_by_position(result, std::move(res_column)); + return Status::OK(); + } + +private: + std::string calculate_soundex(const std::string& word) const { + if (word.empty()) { + return ""; + } + + std::string result(4, '\0'); + char pre_code = '\0'; + for (char c : word) { + if (!std::isalpha(c)) { + continue; + } + + c = static_cast(std::toupper(c)); + char code = get_soundex_code(c); + if (result.empty()) { + result += c; + } else if (std::isdigit(code) && code != pre_code) { + result += code; + if (result.size() == 4) { + return result; + } + } + + if (code != 'N') { + pre_code = code; + } + } + + while (!result.empty() && result.size() < 4) { + result += '0'; + } + + return result; + } + + char get_soundex_code(char c) const { + /** 1. If a vowel (A, E, I, O, U) separates two consonants that have the same soundex code + * the consonant to the right of the vowel is coded. Here we use 'V' to represent vowels. + * eg : **Tymczak** is coded as T-522 (T, 5 for the M, 2 for the C, Z ignored , 2 for the K). + * Since the vowel "A" separates the Z and K, the K is coded. + * + * 2. If "H" or "W" separate two consonants that have the same soundex code, the consonant to the right of the vowel is NOT coded. + * Here we use 'N' to represent these two characters. + * eg : Ashcraft is coded A-261 (A, 2 for the S, C ignored, 6 for the R, 1 for the F). It is not coded A-226. + */ + static constexpr char soundex_table[26] = {'V', '1', '2', '3', 'V', '1', '2', 'N', 'V', + '2', '2', '4', '5', '5', 'V', '1', '2', '6', + '2', '3', 'V', '1', 'N', '2', 'V', '2'}; + + return soundex_table[c - 'A']; + } +}; + +void register_function_soundex(SimpleFunctionFactory& factory) { + factory.register_function(); +} + +#include "common/compile_check_end.h" +} // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/functions/simple_function_factory.h b/be/src/vec/functions/simple_function_factory.h index bd08c7af81f61a..5339e90b828e8d 100644 --- a/be/src/vec/functions/simple_function_factory.h +++ b/be/src/vec/functions/simple_function_factory.h @@ -119,6 +119,7 @@ void register_function_dict_get_many(SimpleFunctionFactory& factory); void register_function_ai(SimpleFunctionFactory& factory); void register_function_score(SimpleFunctionFactory& factory); void register_function_variant_type(SimpleFunctionFactory& factory); +void register_function_soundex(SimpleFunctionFactory& factory); #if defined(BE_TEST) && !defined(BE_BENCHMARK) void register_function_throw_exception(SimpleFunctionFactory& factory); @@ -336,6 +337,7 @@ class SimpleFunctionFactory { register_function_dict_get_many(instance); register_function_ai(instance); register_function_score(instance); + register_function_soundex(instance); #if defined(BE_TEST) && !defined(BE_BENCHMARK) register_function_throw_exception(instance); #endif diff --git a/be/test/vec/function/function_soundex_test.cpp b/be/test/vec/function/function_soundex_test.cpp new file mode 100644 index 00000000000000..e6405e2b6f2a5f --- /dev/null +++ b/be/test/vec/function/function_soundex_test.cpp @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include "function_test_util.h" +#include "vec/core/types.h" +#include "vec/data_types/data_type_string.h" + +namespace doris::vectorized { + +TEST(SoundexFunctionTest, soundex_basic_test) { + std::string func_name = "soundex"; + + { + InputTypeSet input_types = {PrimitiveType::TYPE_VARCHAR}; + + DataSet data_set = { + {{std::string("Doris")}, std::string("D620")}, + {{std::string("Robert")}, std::string("R163")}, + {{std::string("Rupert")}, std::string("R163")}, + {{std::string("Smith")}, std::string("S530")}, + {{std::string("Smyth")}, std::string("S530")}, + {{std::string("Johnson")}, std::string("J525")}, + {{std::string("Jackson")}, std::string("J250")}, + {{std::string("Ashcraft")}, std::string("A261")}, + {{std::string("Ashcroft")}, std::string("A261")}, + {{std::string("Washington")}, std::string("W252")}, + {{std::string("Lee")}, std::string("L000")}, + {{std::string("Gutierrez")}, std::string("G362")}, + {{std::string("Pfister")}, std::string("P236")}, + {{std::string("Honeyman")}, std::string("H555")}, + {{std::string("Lloyd")}, std::string("L300")}, + {{std::string("Tymczak")}, std::string("T522")}, + + {{std::string("A")}, std::string("A000")}, + {{std::string("B")}, std::string("B000")}, + {{std::string("Z")}, std::string("Z000")}, + + {{std::string("robert")}, std::string("R163")}, + {{std::string("ROBERT")}, std::string("R163")}, + {{std::string("RoBerT")}, std::string("R163")}, + + {{std::string("R@bert")}, std::string("R163")}, + {{std::string("Rob3rt")}, std::string("R163")}, + {{std::string("Rob-ert")}, std::string("R163")}, + {{std::string("123Robert")}, std::string("R163")}, + {{std::string("123")}, std::string("")}, + {{std::string("@#$")}, std::string("")}, + {{std::string(" ")}, std::string("")}, + {{std::string("")}, std::string("")}, + {{std::string("Ab_+ %*^cdefghijklmnopqrstuvwxyz")}, std::string("A123")}, + + {{std::string("Euler")}, std::string("E460")}, + {{std::string("Gauss")}, std::string("G200")}, + {{std::string("Hilbert")}, std::string("H416")}, + {{std::string("Knuth")}, std::string("K530")}, + {{std::string("Lloyd")}, std::string("L300")}, + {{std::string("Lukasiewicz")}, std::string("L222")}, + + {{std::string("Huang")}, std::string("H520")}, + {{std::string("Zhang")}, std::string("Z520")}, + {{std::string("Wang")}, std::string("W520")}}; + + static_cast(check_function(func_name, input_types, data_set)); + } +} + +} // namespace doris::vectorized \ No newline at end of file diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java index 2116a6d3821752..f2df69bb48b1cb 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java @@ -415,6 +415,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.Sm3sum; import org.apache.doris.nereids.trees.expressions.functions.scalar.Sm4Decrypt; import org.apache.doris.nereids.trees.expressions.functions.scalar.Sm4Encrypt; +import org.apache.doris.nereids.trees.expressions.functions.scalar.Soundex; import org.apache.doris.nereids.trees.expressions.functions.scalar.Space; import org.apache.doris.nereids.trees.expressions.functions.scalar.SplitByChar; import org.apache.doris.nereids.trees.expressions.functions.scalar.SplitByRegexp; @@ -918,6 +919,7 @@ public class BuiltinScalarFunctions implements FunctionHelper { scalar(Sm3sum.class, "sm3sum"), scalar(Sm4Decrypt.class, "sm4_decrypt"), scalar(Sm4Encrypt.class, "sm4_encrypt"), + scalar(Soundex.class, "soundex"), scalar(Space.class, "space"), scalar(SplitByChar.class, "split_by_char"), scalar(SplitByRegexp.class, "split_by_regexp"), diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/Soundex.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/Soundex.java new file mode 100644 index 00000000000000..2e7b3bfa6780dd --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/Soundex.java @@ -0,0 +1,69 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.trees.expressions.functions.scalar; + +import org.apache.doris.catalog.FunctionSignature; +import org.apache.doris.nereids.trees.expressions.Expression; +import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature; +import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable; +import org.apache.doris.nereids.trees.expressions.shape.UnaryExpression; +import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; +import org.apache.doris.nereids.types.VarcharType; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; + +import java.util.List; + +/** + * Scalar function 'Soundex' + */ +public class Soundex extends ScalarFunction + implements UnaryExpression, ExplicitlyCastableSignature, PropagateNullable { + public static final List SIGNATURES = ImmutableList.of( + FunctionSignature.ret(VarcharType.SYSTEM_DEFAULT).args(VarcharType.SYSTEM_DEFAULT) + ); + + /** + * constructor with 1 argument. + */ + public Soundex(Expression arg) { + super("soundex", arg); + } + + /** constructor for withChildren and reuse signature */ + private Soundex(ScalarFunctionParams functionParams) { + super(functionParams); + } + + @Override + public Soundex withChildren(List children) { + Preconditions.checkArgument(children.size() == 1); + return new Soundex(getFunctionParams(children)); + } + + @Override + public List getSignatures() { + return SIGNATURES; + } + + @Override + public R accept(ExpressionVisitor visitor, C context) { + return visitor.visitSoundex(this, context); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java index 7ed4adb87ea94b..ab2a2d054f92a0 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java @@ -417,6 +417,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.Sm3sum; import org.apache.doris.nereids.trees.expressions.functions.scalar.Sm4Decrypt; import org.apache.doris.nereids.trees.expressions.functions.scalar.Sm4Encrypt; +import org.apache.doris.nereids.trees.expressions.functions.scalar.Soundex; import org.apache.doris.nereids.trees.expressions.functions.scalar.Space; import org.apache.doris.nereids.trees.expressions.functions.scalar.SplitByChar; import org.apache.doris.nereids.trees.expressions.functions.scalar.SplitByRegexp; @@ -2030,6 +2031,10 @@ default R visitSm4Encrypt(Sm4Encrypt sm4Encrypt, C context) { return visitScalarFunction(sm4Encrypt, context); } + default R visitSoundex(Soundex soundex, C context) { + return visitScalarFunction(soundex, context); + } + default R visitSpace(Space space, C context) { return visitScalarFunction(space, context); } diff --git a/regression-test/data/nereids_p0/sql_functions/string_functions/test_string_function.out b/regression-test/data/nereids_p0/sql_functions/string_functions/test_string_function.out index e3d169bd04f425..8a904ecb19662b 100644 --- a/regression-test/data/nereids_p0/sql_functions/string_functions/test_string_function.out +++ b/regression-test/data/nereids_p0/sql_functions/string_functions/test_string_function.out @@ -446,6 +446,126 @@ d***is -- !sub_replace_utf8_sql10 -- \N +-- !soundex -- +D620 + +-- !soundex -- +R163 + +-- !soundex -- +R163 + +-- !soundex -- +S530 + +-- !soundex -- +S530 + +-- !soundex -- +J525 + +-- !soundex -- +J250 + +-- !soundex -- +A261 + +-- !soundex -- +A261 + +-- !soundex -- +W252 + +-- !soundex -- +L000 + +-- !soundex -- +G362 + +-- !soundex -- +P236 + +-- !soundex -- +H555 + +-- !soundex -- +L300 + +-- !soundex -- +T522 + +-- !soundex -- +A000 + +-- !soundex -- +B000 + +-- !soundex -- +Z000 + +-- !soundex -- +R163 + +-- !soundex -- +R163 + +-- !soundex -- +R163 + +-- !soundex -- +R163 + +-- !soundex -- +R163 + +-- !soundex -- +R163 + +-- !soundex -- +R163 + +-- !soundex -- + + +-- !soundex -- + + +-- !soundex -- + + +-- !soundex -- + + +-- !soundex -- +A123 + +-- !soundex -- +E460 + +-- !soundex -- +G200 + +-- !soundex -- +H416 + +-- !soundex -- +K530 + +-- !soundex -- +L300 + +-- !soundex -- +L222 + +-- !soundex -- +H520 + +-- !soundex -- +Z520 + +-- !soundex -- +W520 + -- !sql -- 1 7 2 7 diff --git a/regression-test/suites/nereids_p0/sql_functions/string_functions/test_string_function.groovy b/regression-test/suites/nereids_p0/sql_functions/string_functions/test_string_function.groovy index 8b4fc9d23388c9..92e9329336fef4 100644 --- a/regression-test/suites/nereids_p0/sql_functions/string_functions/test_string_function.groovy +++ b/regression-test/suites/nereids_p0/sql_functions/string_functions/test_string_function.groovy @@ -214,6 +214,47 @@ suite("test_string_function") { qt_sub_replace_utf8_sql9 " select sub_replace('你好世界','大家',4);" qt_sub_replace_utf8_sql10 " select sub_replace('你好世界','大家',-1);" + qt_soundex """SELECT SOUNDEX('Doris');""" + qt_soundex """SELECT SOUNDEX('Robert');""" + qt_soundex """SELECT SOUNDEX('Rupert');""" + qt_soundex """SELECT SOUNDEX('Smith');""" + qt_soundex """SELECT SOUNDEX('Smyth');""" + qt_soundex """SELECT SOUNDEX('Johnson');""" + qt_soundex """SELECT SOUNDEX('Jackson');""" + qt_soundex """SELECT SOUNDEX('Ashcraft');""" + qt_soundex """SELECT SOUNDEX('Ashcroft');""" + qt_soundex """SELECT SOUNDEX('Washington');""" + qt_soundex """SELECT SOUNDEX('Lee');""" + qt_soundex """SELECT SOUNDEX('Gutierrez');""" + qt_soundex """SELECT SOUNDEX('Pfister');""" + qt_soundex """SELECT SOUNDEX('Honeyman');""" + qt_soundex """SELECT SOUNDEX('Lloyd');""" + qt_soundex """SELECT SOUNDEX('Tymczak');""" + qt_soundex """SELECT SOUNDEX('A');""" + qt_soundex """SELECT SOUNDEX('B');""" + qt_soundex """SELECT SOUNDEX('Z');""" + qt_soundex """SELECT SOUNDEX('robert');""" + qt_soundex """SELECT SOUNDEX('ROBERT');""" + qt_soundex """SELECT SOUNDEX('RoBerT');""" + qt_soundex """SELECT SOUNDEX('R@bert');""" + qt_soundex """SELECT SOUNDEX('Rob3rt');""" + qt_soundex """SELECT SOUNDEX('Rob-ert');""" + qt_soundex """SELECT SOUNDEX('123Robert');""" + qt_soundex """SELECT SOUNDEX('123');""" + qt_soundex """SELECT SOUNDEX('~!@#%^&*-+');""" + qt_soundex """SELECT SOUNDEX(' ');""" + qt_soundex """SELECT SOUNDEX('');""" + qt_soundex """SELECT SOUNDEX('Ab_+ %*^cdefghijklmnopqrstuvwxyz');""" + qt_soundex """SELECT SOUNDEX('Euler');""" + qt_soundex """SELECT SOUNDEX('Gauss');""" + qt_soundex """SELECT SOUNDEX('Hilbert');""" + qt_soundex """SELECT SOUNDEX('Knuth');""" + qt_soundex """SELECT SOUNDEX('Lloyd');""" + qt_soundex """SELECT SOUNDEX('Lukasiewicz');""" + qt_soundex """SELECT SOUNDEX('Huang');""" + qt_soundex """SELECT SOUNDEX('Zhang');""" + qt_soundex """SELECT SOUNDEX('Wang');""" + sql """ From 8b4e83561a3c9c804e4cc80123605313644acc42 Mon Sep 17 00:00:00 2001 From: linzhenqi Date: Sat, 6 Sep 2025 00:02:48 +0800 Subject: [PATCH 2/8] fix --- be/src/vec/functions/function_soundex.cpp | 59 ++++++++++--------- .../string_functions/test_string_function.out | 3 + .../test_string_function.groovy | 1 + 3 files changed, 34 insertions(+), 29 deletions(-) diff --git a/be/src/vec/functions/function_soundex.cpp b/be/src/vec/functions/function_soundex.cpp index e280a6938d3804..eec1bffb92cf25 100644 --- a/be/src/vec/functions/function_soundex.cpp +++ b/be/src/vec/functions/function_soundex.cpp @@ -17,6 +17,7 @@ #include +#include "common/status.h" #include "vec/columns/column_string.h" #include "vec/data_types/data_type_string.h" #include "vec/functions/function.h" @@ -46,8 +47,7 @@ class FunctionSoundex : public IFunction { auto res_column = ColumnString::create(); for (size_t i = 0; i < input_rows_count; ++i) { StringRef ref = col_ptr->get_data_at(i); - std::string word = std::string(ref.data, ref.size); - std::string soundex_code = calculate_soundex(word); + std::string soundex_code = calculate_soundex(ref); res_column->insert_data(soundex_code.c_str(), soundex_code.length()); } @@ -56,30 +56,35 @@ class FunctionSoundex : public IFunction { } private: - std::string calculate_soundex(const std::string& word) const { - if (word.empty()) { + std::string calculate_soundex(const StringRef& ref) const { + if (ref.empty()) { return ""; } - std::string result(4, '\0'); + std::string result; + result.reserve(4); char pre_code = '\0'; - for (char c : word) { + for (char c : ref.to_string()) { + if (c > 0x7f) { + throw Exception(ErrorCode::INVALID_ARGUMENT, "soundex only supports ASCII"); + } + if (!std::isalpha(c)) { continue; } c = static_cast(std::toupper(c)); - char code = get_soundex_code(c); if (result.empty()) { result += c; - } else if (std::isdigit(code) && code != pre_code) { - result += code; - if (result.size() == 4) { - return result; + pre_code = (SOUNDEX_TABLE[c - 'A'] == 'N') ? '\0' : SOUNDEX_TABLE[c - 'A']; + } else if (char code = SOUNDEX_TABLE[c - 'A']; code != 'N') { + if (code != 'V' && code != pre_code) { + result += code; + if (result.size() == 4) { + return result; + } } - } - if (code != 'N') { pre_code = code; } } @@ -91,22 +96,18 @@ class FunctionSoundex : public IFunction { return result; } - char get_soundex_code(char c) const { - /** 1. If a vowel (A, E, I, O, U) separates two consonants that have the same soundex code - * the consonant to the right of the vowel is coded. Here we use 'V' to represent vowels. - * eg : **Tymczak** is coded as T-522 (T, 5 for the M, 2 for the C, Z ignored , 2 for the K). - * Since the vowel "A" separates the Z and K, the K is coded. - * - * 2. If "H" or "W" separate two consonants that have the same soundex code, the consonant to the right of the vowel is NOT coded. - * Here we use 'N' to represent these two characters. - * eg : Ashcraft is coded A-261 (A, 2 for the S, C ignored, 6 for the R, 1 for the F). It is not coded A-226. - */ - static constexpr char soundex_table[26] = {'V', '1', '2', '3', 'V', '1', '2', 'N', 'V', - '2', '2', '4', '5', '5', 'V', '1', '2', '6', - '2', '3', 'V', '1', 'N', '2', 'V', '2'}; - - return soundex_table[c - 'A']; - } + /** 1. If a vowel (A, E, I, O, U) separates two consonants that have the same soundex code + * the consonant to the right of the vowel is coded. Here we use 'V' to represent vowels. + * eg : **Tymczak** is coded as T-522 (T, 5 for the M, 2 for the C, Z ignored , 2 for the K). + * Since the vowel "A" separates the Z and K, the K is coded. + * + * 2. If "H" or "W" separate two consonants that have the same soundex code, the consonant to the right of the vowel is NOT coded. + * Here we use 'N' to represent these two characters. + * eg : **Ashcraft** is coded A-261 (A, 2 for the S, C ignored, 6 for the R, 1 for the F). It is not coded A-226. + */ + static constexpr char SOUNDEX_TABLE[26] = {'V', '1', '2', '3', 'V', '1', '2', 'N', 'V', + '2', '2', '4', '5', '5', 'V', '1', '2', '6', + '2', '3', 'V', '1', 'N', '2', 'V', '2'}; }; void register_function_soundex(SimpleFunctionFactory& factory) { diff --git a/regression-test/data/nereids_p0/sql_functions/string_functions/test_string_function.out b/regression-test/data/nereids_p0/sql_functions/string_functions/test_string_function.out index 8a904ecb19662b..28734490064b53 100644 --- a/regression-test/data/nereids_p0/sql_functions/string_functions/test_string_function.out +++ b/regression-test/data/nereids_p0/sql_functions/string_functions/test_string_function.out @@ -566,6 +566,9 @@ Z520 -- !soundex -- W520 +-- !soundex -- +\N + -- !sql -- 1 7 2 7 diff --git a/regression-test/suites/nereids_p0/sql_functions/string_functions/test_string_function.groovy b/regression-test/suites/nereids_p0/sql_functions/string_functions/test_string_function.groovy index 92e9329336fef4..cf599b5a36597e 100644 --- a/regression-test/suites/nereids_p0/sql_functions/string_functions/test_string_function.groovy +++ b/regression-test/suites/nereids_p0/sql_functions/string_functions/test_string_function.groovy @@ -254,6 +254,7 @@ suite("test_string_function") { qt_soundex """SELECT SOUNDEX('Huang');""" qt_soundex """SELECT SOUNDEX('Zhang');""" qt_soundex """SELECT SOUNDEX('Wang');""" + qt_soundex """SELECT SOUNDEX(NULL);""" From e2cc5ed5a4f518081f0d09b048714c8357982eaf Mon Sep 17 00:00:00 2001 From: linzhenqi Date: Sat, 6 Sep 2025 11:55:50 +0800 Subject: [PATCH 3/8] mv ut to string_test && add chinese test --- be/src/vec/functions/function_soundex.cpp | 5 +- .../vec/function/function_soundex_test.cpp | 84 ------------------- be/test/vec/function/function_string_test.cpp | 68 +++++++++++++++ 3 files changed, 71 insertions(+), 86 deletions(-) delete mode 100644 be/test/vec/function/function_soundex_test.cpp diff --git a/be/src/vec/functions/function_soundex.cpp b/be/src/vec/functions/function_soundex.cpp index eec1bffb92cf25..9cf31c087d3b2e 100644 --- a/be/src/vec/functions/function_soundex.cpp +++ b/be/src/vec/functions/function_soundex.cpp @@ -64,11 +64,12 @@ class FunctionSoundex : public IFunction { std::string result; result.reserve(4); char pre_code = '\0'; - for (char c : ref.to_string()) { + for (size_t i = 0; i < ref.size; ++i) { + auto c = static_cast(ref.data[i]); + if (c > 0x7f) { throw Exception(ErrorCode::INVALID_ARGUMENT, "soundex only supports ASCII"); } - if (!std::isalpha(c)) { continue; } diff --git a/be/test/vec/function/function_soundex_test.cpp b/be/test/vec/function/function_soundex_test.cpp deleted file mode 100644 index e6405e2b6f2a5f..00000000000000 --- a/be/test/vec/function/function_soundex_test.cpp +++ /dev/null @@ -1,84 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include -#include - -#include "function_test_util.h" -#include "vec/core/types.h" -#include "vec/data_types/data_type_string.h" - -namespace doris::vectorized { - -TEST(SoundexFunctionTest, soundex_basic_test) { - std::string func_name = "soundex"; - - { - InputTypeSet input_types = {PrimitiveType::TYPE_VARCHAR}; - - DataSet data_set = { - {{std::string("Doris")}, std::string("D620")}, - {{std::string("Robert")}, std::string("R163")}, - {{std::string("Rupert")}, std::string("R163")}, - {{std::string("Smith")}, std::string("S530")}, - {{std::string("Smyth")}, std::string("S530")}, - {{std::string("Johnson")}, std::string("J525")}, - {{std::string("Jackson")}, std::string("J250")}, - {{std::string("Ashcraft")}, std::string("A261")}, - {{std::string("Ashcroft")}, std::string("A261")}, - {{std::string("Washington")}, std::string("W252")}, - {{std::string("Lee")}, std::string("L000")}, - {{std::string("Gutierrez")}, std::string("G362")}, - {{std::string("Pfister")}, std::string("P236")}, - {{std::string("Honeyman")}, std::string("H555")}, - {{std::string("Lloyd")}, std::string("L300")}, - {{std::string("Tymczak")}, std::string("T522")}, - - {{std::string("A")}, std::string("A000")}, - {{std::string("B")}, std::string("B000")}, - {{std::string("Z")}, std::string("Z000")}, - - {{std::string("robert")}, std::string("R163")}, - {{std::string("ROBERT")}, std::string("R163")}, - {{std::string("RoBerT")}, std::string("R163")}, - - {{std::string("R@bert")}, std::string("R163")}, - {{std::string("Rob3rt")}, std::string("R163")}, - {{std::string("Rob-ert")}, std::string("R163")}, - {{std::string("123Robert")}, std::string("R163")}, - {{std::string("123")}, std::string("")}, - {{std::string("@#$")}, std::string("")}, - {{std::string(" ")}, std::string("")}, - {{std::string("")}, std::string("")}, - {{std::string("Ab_+ %*^cdefghijklmnopqrstuvwxyz")}, std::string("A123")}, - - {{std::string("Euler")}, std::string("E460")}, - {{std::string("Gauss")}, std::string("G200")}, - {{std::string("Hilbert")}, std::string("H416")}, - {{std::string("Knuth")}, std::string("K530")}, - {{std::string("Lloyd")}, std::string("L300")}, - {{std::string("Lukasiewicz")}, std::string("L222")}, - - {{std::string("Huang")}, std::string("H520")}, - {{std::string("Zhang")}, std::string("Z520")}, - {{std::string("Wang")}, std::string("W520")}}; - - static_cast(check_function(func_name, input_types, data_set)); - } -} - -} // namespace doris::vectorized \ No newline at end of file diff --git a/be/test/vec/function/function_string_test.cpp b/be/test/vec/function/function_string_test.cpp index 4ee28fc8de0d76..27f59fdef702c6 100644 --- a/be/test/vec/function/function_string_test.cpp +++ b/be/test/vec/function/function_string_test.cpp @@ -3607,4 +3607,72 @@ TEST(function_string_test, function_count_substring_test) { check_function_all_arg_comb(func_name, input_types, data_set); } } + +TEST(function_string_test, soundex_test) { + std::string func_name = "soundex"; + + { + InputTypeSet input_types = {PrimitiveType::TYPE_VARCHAR}; + + DataSet data_set = { + {{std::string("Doris")}, std::string("D620")}, + {{std::string("Robert")}, std::string("R163")}, + {{std::string("Rupert")}, std::string("R163")}, + {{std::string("Smith")}, std::string("S530")}, + {{std::string("Smyth")}, std::string("S530")}, + {{std::string("Johnson")}, std::string("J525")}, + {{std::string("Jackson")}, std::string("J250")}, + {{std::string("Ashcraft")}, std::string("A261")}, + {{std::string("Ashcroft")}, std::string("A261")}, + {{std::string("Washington")}, std::string("W252")}, + {{std::string("Lee")}, std::string("L000")}, + {{std::string("Gutierrez")}, std::string("G362")}, + {{std::string("Pfister")}, std::string("P236")}, + {{std::string("Honeyman")}, std::string("H555")}, + {{std::string("Lloyd")}, std::string("L300")}, + {{std::string("Tymczak")}, std::string("T522")}, + + {{std::string("A")}, std::string("A000")}, + {{std::string("B")}, std::string("B000")}, + {{std::string("Z")}, std::string("Z000")}, + + {{std::string("robert")}, std::string("R163")}, + {{std::string("ROBERT")}, std::string("R163")}, + {{std::string("RoBerT")}, std::string("R163")}, + + {{std::string("R@bert")}, std::string("R163")}, + {{std::string("Rob3rt")}, std::string("R163")}, + {{std::string("Rob-ert")}, std::string("R163")}, + {{std::string("123Robert")}, std::string("R163")}, + {{std::string("123")}, std::string("")}, + {{std::string("@#$")}, std::string("")}, + {{std::string(" ")}, std::string("")}, + {{std::string("")}, std::string("")}, + {{std::string("Ab_+ %*^cdefghijklmnopqrstuvwxyz")}, std::string("A123")}, + + {{std::string("Euler")}, std::string("E460")}, + {{std::string("Gauss")}, std::string("G200")}, + {{std::string("Hilbert")}, std::string("H416")}, + {{std::string("Knuth")}, std::string("K530")}, + {{std::string("Lloyd")}, std::string("L300")}, + {{std::string("Lukasiewicz")}, std::string("L222")}, + + {{std::string("Huang")}, std::string("H520")}, + {{std::string("Zhang")}, std::string("Z520")}, + {{std::string("Wang")}, std::string("W520")}}; + + static_cast(check_function(func_name, input_types, data_set)); + } + + { + InputTypeSet input_types = {PrimitiveType::TYPE_VARCHAR}; + + DataSet data_set = {{{std::string("中文测试")}, std::string("")}, + {{std::string("abc 你好")}, std::string("")}}; + + static_cast(check_function(func_name, input_types, data_set, -1, + -1, true)); + } +} + } // namespace doris::vectorized From 21e73c97f74553b51ad891ec1d427f1133f528ea Mon Sep 17 00:00:00 2001 From: linzhenqi Date: Sat, 6 Sep 2025 22:01:19 +0800 Subject: [PATCH 4/8] add nonASCII test --- be/test/vec/function/function_string_test.cpp | 1 + .../string_functions/test_string_function.out | 3 +++ .../string_functions/test_string_function.groovy | 11 +++++++++++ 3 files changed, 15 insertions(+) diff --git a/be/test/vec/function/function_string_test.cpp b/be/test/vec/function/function_string_test.cpp index 27f59fdef702c6..d4c06c5b86096f 100644 --- a/be/test/vec/function/function_string_test.cpp +++ b/be/test/vec/function/function_string_test.cpp @@ -3616,6 +3616,7 @@ TEST(function_string_test, soundex_test) { DataSet data_set = { {{std::string("Doris")}, std::string("D620")}, + {{std::string("ApacheDoris中文测试")}, std::string("A123")}, {{std::string("Robert")}, std::string("R163")}, {{std::string("Rupert")}, std::string("R163")}, {{std::string("Smith")}, std::string("S530")}, diff --git a/regression-test/data/nereids_p0/sql_functions/string_functions/test_string_function.out b/regression-test/data/nereids_p0/sql_functions/string_functions/test_string_function.out index 28734490064b53..51798c696294d0 100644 --- a/regression-test/data/nereids_p0/sql_functions/string_functions/test_string_function.out +++ b/regression-test/data/nereids_p0/sql_functions/string_functions/test_string_function.out @@ -569,6 +569,9 @@ W520 -- !soundex -- \N +-- !soundex -- +A123 + -- !sql -- 1 7 2 7 diff --git a/regression-test/suites/nereids_p0/sql_functions/string_functions/test_string_function.groovy b/regression-test/suites/nereids_p0/sql_functions/string_functions/test_string_function.groovy index cf599b5a36597e..ee0210ce022695 100644 --- a/regression-test/suites/nereids_p0/sql_functions/string_functions/test_string_function.groovy +++ b/regression-test/suites/nereids_p0/sql_functions/string_functions/test_string_function.groovy @@ -256,6 +256,17 @@ suite("test_string_function") { qt_soundex """SELECT SOUNDEX('Wang');""" qt_soundex """SELECT SOUNDEX(NULL);""" + // non-ASCII test for soundex + qt_soundex """SELECT SOUNDEX('ApacheDoris非 ASCII 测试');""" + test{ + sql """SELECT SOUNDEX('非 ASCII 测试');""" + exception "soundex only supports ASCII" + } + test{ + sql """SELECT SOUNDEX('Doris中文测试');""" + exception "soundex only supports ASCII" + } + sql """ From e5937a3d42cef70fcfaed9848c476ab33e8e7360 Mon Sep 17 00:00:00 2001 From: linzhenqi Date: Sun, 7 Sep 2025 14:27:22 +0800 Subject: [PATCH 5/8] fe fold --- be/src/vec/functions/function_soundex.cpp | 21 +++++--- .../executable/StringArithmetic.java | 53 +++++++++++++++++++ 2 files changed, 66 insertions(+), 8 deletions(-) diff --git a/be/src/vec/functions/function_soundex.cpp b/be/src/vec/functions/function_soundex.cpp index 9cf31c087d3b2e..c22af95838d414 100644 --- a/be/src/vec/functions/function_soundex.cpp +++ b/be/src/vec/functions/function_soundex.cpp @@ -45,10 +45,10 @@ class FunctionSoundex : public IFunction { const ColumnPtr col_ptr = block.get_by_position(arguments[0]).column; auto res_column = ColumnString::create(); + res_column->reserve(input_rows_count); for (size_t i = 0; i < input_rows_count; ++i) { StringRef ref = col_ptr->get_data_at(i); - std::string soundex_code = calculate_soundex(ref); - res_column->insert_data(soundex_code.c_str(), soundex_code.length()); + RETURN_IF_ERROR(calculate_soundex_and_insert(ref, res_column.get(), i)); } block.replace_by_position(result, std::move(res_column)); @@ -56,9 +56,11 @@ class FunctionSoundex : public IFunction { } private: - std::string calculate_soundex(const StringRef& ref) const { - if (ref.empty()) { - return ""; + Status calculate_soundex_and_insert(const StringRef& ref, ColumnString* res_column, + const size_t row) const { + if (ref.size == 0) { + res_column->insert_data("", 0); + return Status::OK(); } std::string result; @@ -68,7 +70,8 @@ class FunctionSoundex : public IFunction { auto c = static_cast(ref.data[i]); if (c > 0x7f) { - throw Exception(ErrorCode::INVALID_ARGUMENT, "soundex only supports ASCII"); + return Status::InvalidArgument("soundex only supports ASCII, but got: {}", + ref.data[i]); } if (!std::isalpha(c)) { continue; @@ -82,7 +85,8 @@ class FunctionSoundex : public IFunction { if (code != 'V' && code != pre_code) { result += code; if (result.size() == 4) { - return result; + res_column->insert_data(result.c_str(), result.length()); + return Status::OK(); } } @@ -94,7 +98,8 @@ class FunctionSoundex : public IFunction { result += '0'; } - return result; + res_column->insert_data(result.c_str(), result.length()); + return Status::OK(); } /** 1. If a vowel (A, E, I, O, U) separates two consonants that have the same soundex code diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/executable/StringArithmetic.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/executable/StringArithmetic.java index a592e71a9a1e36..9955b510a7aafd 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/executable/StringArithmetic.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/executable/StringArithmetic.java @@ -1059,4 +1059,57 @@ public static Expression replaceEmpty(StringLikeLiteral first, StringLikeLiteral } return castStringLikeLiteral(first, first.getValue().replace(second.getValue(), third.getValue())); } + + /** + * Executable arithmetic functions soundex + */ + @ExecFunction(name = "soundex") + public static Expression soundex(StringLikeLiteral first) { + char[] soundexTable = { + 'V', '1', '2', '3', 'V', '1', '2', 'N', 'V', + '2', '2', '4', '5', '5', 'V', '1', '2', '6', + '2', '3', 'V', '1', 'N', '2', 'V', '2' + }; + + String result = ""; + if (!first.getValue().isEmpty()) { + char preCode = '\0'; + + for (int i = 0; i < first.getValue().length(); i++) { + char c = first.getValue().charAt(i); + + if (c > 0x7f) { + throw new RuntimeException("soundex only supports ASCII, but got: " + c); + } + if (!Character.isLetter(c)) { + continue; + } + + c = Character.toUpperCase(c); + if (result.isEmpty()) { + result += c; + preCode = (soundexTable[c - 'A'] == 'N') ? '\0' : soundexTable[c - 'A']; + } else { + char code = soundexTable[c - 'A']; + if (code != 'N') { + if (code != 'V' && code != preCode) { + result += code; + if (result.length() == 4) { + break; + } + } + preCode = code; + } + } + } + + if (result.length() > 0) { + while (result.length() < 4) { + result += '0'; + } + } + } + + return castStringLikeLiteral(first, result); + } } From 42de590bbf3c67c5b61d75614afff5cf7004b19e Mon Sep 17 00:00:00 2001 From: linzhenqi Date: Mon, 8 Sep 2025 13:14:01 +0800 Subject: [PATCH 6/8] 1 --- be/src/vec/functions/function_soundex.cpp | 35 ++++++++++++++--------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/be/src/vec/functions/function_soundex.cpp b/be/src/vec/functions/function_soundex.cpp index c22af95838d414..80e775dec36c8c 100644 --- a/be/src/vec/functions/function_soundex.cpp +++ b/be/src/vec/functions/function_soundex.cpp @@ -46,9 +46,13 @@ class FunctionSoundex : public IFunction { auto res_column = ColumnString::create(); res_column->reserve(input_rows_count); + auto& res_data = res_column->get_chars(); + auto& res_offsets = res_column->get_offsets(); + res_data.reserve(input_rows_count * CODE_SIZE); + res_offsets.resize(input_rows_count); for (size_t i = 0; i < input_rows_count; ++i) { StringRef ref = col_ptr->get_data_at(i); - RETURN_IF_ERROR(calculate_soundex_and_insert(ref, res_column.get(), i)); + RETURN_IF_ERROR(calculate_soundex_and_insert(ref, res_data, res_offsets, i)); } block.replace_by_position(result, std::move(res_column)); @@ -56,15 +60,16 @@ class FunctionSoundex : public IFunction { } private: - Status calculate_soundex_and_insert(const StringRef& ref, ColumnString* res_column, - const size_t row) const { + Status calculate_soundex_and_insert(const StringRef& ref, ColumnString::Chars& chars, + ColumnString::Offsets& offsets, const size_t row) const { + uint32_t row_start = (row == 0) ? 0 : offsets[row - 1]; + uint32_t expect_end = row_start + CODE_SIZE; + if (ref.size == 0) { - res_column->insert_data("", 0); + offsets[row] = row_start; return Status::OK(); } - std::string result; - result.reserve(4); char pre_code = '\0'; for (size_t i = 0; i < ref.size; ++i) { auto c = static_cast(ref.data[i]); @@ -78,14 +83,14 @@ class FunctionSoundex : public IFunction { } c = static_cast(std::toupper(c)); - if (result.empty()) { - result += c; + if (chars.size() == row_start) { + chars.push_back(c); pre_code = (SOUNDEX_TABLE[c - 'A'] == 'N') ? '\0' : SOUNDEX_TABLE[c - 'A']; } else if (char code = SOUNDEX_TABLE[c - 'A']; code != 'N') { if (code != 'V' && code != pre_code) { - result += code; - if (result.size() == 4) { - res_column->insert_data(result.c_str(), result.length()); + chars.push_back(code); + if (chars.size() == expect_end) { + offsets[row] = static_cast(chars.size()); return Status::OK(); } } @@ -94,11 +99,11 @@ class FunctionSoundex : public IFunction { } } - while (!result.empty() && result.size() < 4) { - result += '0'; + while (chars.size() != row_start && chars.size() < expect_end) { + chars.push_back('0'); } + offsets[row] = static_cast(chars.size()); - res_column->insert_data(result.c_str(), result.length()); return Status::OK(); } @@ -114,6 +119,8 @@ class FunctionSoundex : public IFunction { static constexpr char SOUNDEX_TABLE[26] = {'V', '1', '2', '3', 'V', '1', '2', 'N', 'V', '2', '2', '4', '5', '5', 'V', '1', '2', '6', '2', '3', 'V', '1', 'N', '2', 'V', '2'}; + + static constexpr uint8_t CODE_SIZE = 4; }; void register_function_soundex(SimpleFunctionFactory& factory) { From 006038ec23502017008434e9b99c1f57174c187c Mon Sep 17 00:00:00 2001 From: linzhenqi Date: Mon, 8 Sep 2025 15:49:40 +0800 Subject: [PATCH 7/8] add fold constant test --- .../executable/StringArithmetic.java | 4 ++- .../rules/expression/FoldConstantTest.java | 26 +++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/executable/StringArithmetic.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/executable/StringArithmetic.java index 9955b510a7aafd..afedc7a3fd18dd 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/executable/StringArithmetic.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/executable/StringArithmetic.java @@ -18,6 +18,7 @@ package org.apache.doris.nereids.trees.expressions.functions.executable; import org.apache.doris.nereids.exceptions.AnalysisException; +import org.apache.doris.nereids.exceptions.NotSupportedException; import org.apache.doris.nereids.trees.expressions.ExecFunction; import org.apache.doris.nereids.trees.expressions.Expression; import org.apache.doris.nereids.trees.expressions.literal.ArrayLiteral; @@ -1079,8 +1080,9 @@ public static Expression soundex(StringLikeLiteral first) { char c = first.getValue().charAt(i); if (c > 0x7f) { - throw new RuntimeException("soundex only supports ASCII, but got: " + c); + throw new NotSupportedException("soundex only supports ASCII, but got: " + c); } + if (!Character.isLetter(c)) { continue; } diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/rules/expression/FoldConstantTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/rules/expression/FoldConstantTest.java index bb12026c20be46..81f0c31b92f89d 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/rules/expression/FoldConstantTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/rules/expression/FoldConstantTest.java @@ -21,6 +21,7 @@ import org.apache.doris.common.Config; import org.apache.doris.nereids.analyzer.UnboundRelation; import org.apache.doris.nereids.exceptions.AnalysisException; +import org.apache.doris.nereids.exceptions.NotSupportedException; import org.apache.doris.nereids.parser.NereidsParser; import org.apache.doris.nereids.rules.analysis.ExpressionAnalyzer; import org.apache.doris.nereids.rules.expression.rules.FoldConstantRule; @@ -90,6 +91,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.Sign; import org.apache.doris.nereids.trees.expressions.functions.scalar.Sin; import org.apache.doris.nereids.trees.expressions.functions.scalar.Sinh; +import org.apache.doris.nereids.trees.expressions.functions.scalar.Soundex; import org.apache.doris.nereids.trees.expressions.functions.scalar.Sqrt; import org.apache.doris.nereids.trees.expressions.functions.scalar.StrToDate; import org.apache.doris.nereids.trees.expressions.functions.scalar.Substring; @@ -555,6 +557,30 @@ void testFoldString() { ); rewritten = executor.rewrite(replace, context); Assertions.assertEquals(new StringLiteral("default"), rewritten); + + Soundex soundex = new Soundex(StringLiteral.of("Ashcraft")); + rewritten = executor.rewrite(soundex, context); + Assertions.assertEquals(new StringLiteral("A261"), rewritten); + soundex = new Soundex(StringLiteral.of("Robert")); + rewritten = executor.rewrite(soundex, context); + Assertions.assertEquals(new StringLiteral("R163"), rewritten); + soundex = new Soundex(StringLiteral.of("R@bert")); + rewritten = executor.rewrite(soundex, context); + Assertions.assertEquals(new StringLiteral("R163"), rewritten); + soundex = new Soundex(StringLiteral.of("Honeyman")); + rewritten = executor.rewrite(soundex, context); + Assertions.assertEquals(new StringLiteral("H555"), rewritten); + soundex = new Soundex(StringLiteral.of("Apache Doris你好")); + rewritten = executor.rewrite(soundex, context); + Assertions.assertEquals(new StringLiteral("A123"), rewritten); + soundex = new Soundex(StringLiteral.of("")); + rewritten = executor.rewrite(soundex, context); + Assertions.assertEquals(new StringLiteral(""), rewritten); + + Assertions.assertThrows(NotSupportedException.class, () -> { + Soundex soundexThrow = new Soundex(new StringLiteral("Doris你好")); + executor.rewrite(soundexThrow, context); + }, "soundex only supports ASCII"); } @Test From c62f342b083aa5400953e6dddcc218861c249e55 Mon Sep 17 00:00:00 2001 From: linzhenqi Date: Mon, 8 Sep 2025 21:11:58 +0800 Subject: [PATCH 8/8] add fold constant regression test --- .../fold_constant_string_arithmatic.groovy | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/regression-test/suites/nereids_p0/expression/fold_constant/fold_constant_string_arithmatic.groovy b/regression-test/suites/nereids_p0/expression/fold_constant/fold_constant_string_arithmatic.groovy index 374f69bb58b502..68db295f66cefe 100644 --- a/regression-test/suites/nereids_p0/expression/fold_constant/fold_constant_string_arithmatic.groovy +++ b/regression-test/suites/nereids_p0/expression/fold_constant/fold_constant_string_arithmatic.groovy @@ -1851,5 +1851,48 @@ suite("fold_constant_string_arithmatic") { testFoldConst("select cast(cast('inf' as double) as string)") testFoldConst("select cast(cast('-inf' as double) as string)") */ + + // SOUNDEX + testFoldConst("SELECT SOUNDEX('Doris')") + testFoldConst("SELECT SOUNDEX('Robert')") + testFoldConst("SELECT SOUNDEX('Rupert')") + testFoldConst("SELECT SOUNDEX('Smith')") + testFoldConst("SELECT SOUNDEX('Smyth')") + testFoldConst("SELECT SOUNDEX('Johnson')") + testFoldConst("SELECT SOUNDEX('Jackson')") + testFoldConst("SELECT SOUNDEX('Ashcraft')") + testFoldConst("SELECT SOUNDEX('Ashcroft')") + testFoldConst("SELECT SOUNDEX('Washington')") + testFoldConst("SELECT SOUNDEX('Lee')") + testFoldConst("SELECT SOUNDEX('Gutierrez')") + testFoldConst("SELECT SOUNDEX('Pfister')") + testFoldConst("SELECT SOUNDEX('Honeyman')") + testFoldConst("SELECT SOUNDEX('Lloyd')") + testFoldConst("SELECT SOUNDEX('Tymczak')") + testFoldConst("SELECT SOUNDEX('A')") + testFoldConst("SELECT SOUNDEX('B')") + testFoldConst("SELECT SOUNDEX('Z')") + testFoldConst("SELECT SOUNDEX('robert')") + testFoldConst("SELECT SOUNDEX('ROBERT')") + testFoldConst("SELECT SOUNDEX('RoBerT')") + testFoldConst("SELECT SOUNDEX('R@bert')") + testFoldConst("SELECT SOUNDEX('Rob3rt')") + testFoldConst("SELECT SOUNDEX('Rob-ert')") + testFoldConst("SELECT SOUNDEX('123Robert')") + testFoldConst("SELECT SOUNDEX('123')") + testFoldConst("SELECT SOUNDEX('~!@#%^&*-+')") + testFoldConst("SELECT SOUNDEX(' ')") + testFoldConst("SELECT SOUNDEX('')") + testFoldConst("SELECT SOUNDEX('Ab_+ %*^cdefghijklmnopqrstuvwxyz')") + testFoldConst("SELECT SOUNDEX('Euler')") + testFoldConst("SELECT SOUNDEX('Gauss')") + testFoldConst("SELECT SOUNDEX('Hilbert')") + testFoldConst("SELECT SOUNDEX('Knuth')") + testFoldConst("SELECT SOUNDEX('Lloyd')") + testFoldConst("SELECT SOUNDEX('Lukasiewicz')") + testFoldConst("SELECT SOUNDEX('Huang')") + testFoldConst("SELECT SOUNDEX('Zhang')") + testFoldConst("SELECT SOUNDEX('Wang')") + testFoldConst("SELECT SOUNDEX(NULL)") }