Skip to content

Commit 97eded5

Browse files
authored
[Feature](func) Support function soundex (#55731)
The `SOUNDEX` function computes the [American Soundex](https://en.wikipedia.org/wiki/Soundex) value, which consists of the first letter followed by a three-digit sound code that represents the English pronunciation of the input string. - Example: ```sql SELECT SOUNDEX('Doris'); ``` ```text +------------------+ | SOUNDEX('Doris') | +------------------+ | D620 | +------------------+ ```
1 parent a7e4570 commit 97eded5

File tree

11 files changed

+581
-0
lines changed

11 files changed

+581
-0
lines changed
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#include <cctype>
19+
20+
#include "common/status.h"
21+
#include "vec/columns/column_string.h"
22+
#include "vec/data_types/data_type_string.h"
23+
#include "vec/functions/function.h"
24+
#include "vec/functions/simple_function_factory.h"
25+
26+
namespace doris::vectorized {
27+
#include "common/compile_check_begin.h"
28+
29+
class FunctionSoundex : public IFunction {
30+
public:
31+
static constexpr auto name = "soundex";
32+
33+
static FunctionPtr create() { return std::make_shared<FunctionSoundex>(); }
34+
35+
String get_name() const override { return name; }
36+
37+
size_t get_number_of_arguments() const override { return 1; }
38+
39+
DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
40+
return std::make_shared<DataTypeString>();
41+
}
42+
43+
Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
44+
uint32_t result, size_t input_rows_count) const override {
45+
const ColumnPtr col_ptr = block.get_by_position(arguments[0]).column;
46+
47+
auto res_column = ColumnString::create();
48+
res_column->reserve(input_rows_count);
49+
auto& res_data = res_column->get_chars();
50+
auto& res_offsets = res_column->get_offsets();
51+
res_data.reserve(input_rows_count * CODE_SIZE);
52+
res_offsets.resize(input_rows_count);
53+
for (size_t i = 0; i < input_rows_count; ++i) {
54+
StringRef ref = col_ptr->get_data_at(i);
55+
RETURN_IF_ERROR(calculate_soundex_and_insert(ref, res_data, res_offsets, i));
56+
}
57+
58+
block.replace_by_position(result, std::move(res_column));
59+
return Status::OK();
60+
}
61+
62+
private:
63+
Status calculate_soundex_and_insert(const StringRef& ref, ColumnString::Chars& chars,
64+
ColumnString::Offsets& offsets, const size_t row) const {
65+
uint32_t row_start = (row == 0) ? 0 : offsets[row - 1];
66+
uint32_t expect_end = row_start + CODE_SIZE;
67+
68+
if (ref.size == 0) {
69+
offsets[row] = row_start;
70+
return Status::OK();
71+
}
72+
73+
char pre_code = '\0';
74+
for (size_t i = 0; i < ref.size; ++i) {
75+
auto c = static_cast<unsigned char>(ref.data[i]);
76+
77+
if (c > 0x7f) {
78+
return Status::InvalidArgument("soundex only supports ASCII, but got: {}",
79+
ref.data[i]);
80+
}
81+
if (!std::isalpha(c)) {
82+
continue;
83+
}
84+
85+
c = static_cast<char>(std::toupper(c));
86+
if (chars.size() == row_start) {
87+
chars.push_back(c);
88+
pre_code = (SOUNDEX_TABLE[c - 'A'] == 'N') ? '\0' : SOUNDEX_TABLE[c - 'A'];
89+
} else if (char code = SOUNDEX_TABLE[c - 'A']; code != 'N') {
90+
if (code != 'V' && code != pre_code) {
91+
chars.push_back(code);
92+
if (chars.size() == expect_end) {
93+
offsets[row] = static_cast<ColumnString::Offset>(chars.size());
94+
return Status::OK();
95+
}
96+
}
97+
98+
pre_code = code;
99+
}
100+
}
101+
102+
while (chars.size() != row_start && chars.size() < expect_end) {
103+
chars.push_back('0');
104+
}
105+
offsets[row] = static_cast<ColumnString::Offset>(chars.size());
106+
107+
return Status::OK();
108+
}
109+
110+
/** 1. If a vowel (A, E, I, O, U) separates two consonants that have the same soundex code
111+
* the consonant to the right of the vowel is coded. Here we use 'V' to represent vowels.
112+
* eg : **Tymczak** is coded as T-522 (T, 5 for the M, 2 for the C, Z ignored , 2 for the K).
113+
* Since the vowel "A" separates the Z and K, the K is coded.
114+
*
115+
* 2. If "H" or "W" separate two consonants that have the same soundex code, the consonant to the right of the vowel is NOT coded.
116+
* Here we use 'N' to represent these two characters.
117+
* eg : **Ashcraft** is coded A-261 (A, 2 for the S, C ignored, 6 for the R, 1 for the F). It is not coded A-226.
118+
*/
119+
static constexpr char SOUNDEX_TABLE[26] = {'V', '1', '2', '3', 'V', '1', '2', 'N', 'V',
120+
'2', '2', '4', '5', '5', 'V', '1', '2', '6',
121+
'2', '3', 'V', '1', 'N', '2', 'V', '2'};
122+
123+
static constexpr uint8_t CODE_SIZE = 4;
124+
};
125+
126+
// Registers FunctionSoundex with the given function factory.
void register_function_soundex(SimpleFunctionFactory& factory) {
127+
factory.register_function<FunctionSoundex>();
128+
}
129+
130+
#include "common/compile_check_end.h"
131+
} // namespace doris::vectorized

be/src/vec/functions/simple_function_factory.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ void register_function_dict_get_many(SimpleFunctionFactory& factory);
120120
void register_function_ai(SimpleFunctionFactory& factory);
121121
void register_function_score(SimpleFunctionFactory& factory);
122122
void register_function_variant_type(SimpleFunctionFactory& factory);
123+
void register_function_soundex(SimpleFunctionFactory& factory);
123124

124125
#if defined(BE_TEST) && !defined(BE_BENCHMARK)
125126
void register_function_throw_exception(SimpleFunctionFactory& factory);
@@ -338,6 +339,7 @@ class SimpleFunctionFactory {
338339
register_function_dict_get_many(instance);
339340
register_function_ai(instance);
340341
register_function_score(instance);
342+
register_function_soundex(instance);
341343
#if defined(BE_TEST) && !defined(BE_BENCHMARK)
342344
register_function_throw_exception(instance);
343345
#endif

be/test/vec/function/function_string_test.cpp

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3607,4 +3607,73 @@ TEST(function_string_test, function_count_substring_test) {
36073607
check_function_all_arg_comb<DataTypeInt32, true>(func_name, input_types, data_set);
36083608
}
36093609
}
3610+
3611+
// Unit test for the `soundex` string function.
TEST(function_string_test, soundex_test) {
    std::string func_name = "soundex";

    // Inputs that are expected to evaluate successfully.
    {
        InputTypeSet input_types = {PrimitiveType::TYPE_VARCHAR};

        DataSet data_set = {
                // Classic soundex examples.
                {{std::string("Doris")}, std::string("D620")},
                // The expected code is complete before the non-ASCII part is reached.
                {{std::string("ApacheDoris中文测试")}, std::string("A123")},
                {{std::string("Robert")}, std::string("R163")},
                {{std::string("Rupert")}, std::string("R163")},
                {{std::string("Smith")}, std::string("S530")},
                {{std::string("Smyth")}, std::string("S530")},
                {{std::string("Johnson")}, std::string("J525")},
                {{std::string("Jackson")}, std::string("J250")},
                {{std::string("Ashcraft")}, std::string("A261")},
                {{std::string("Ashcroft")}, std::string("A261")},
                {{std::string("Washington")}, std::string("W252")},
                {{std::string("Lee")}, std::string("L000")},
                {{std::string("Gutierrez")}, std::string("G362")},
                {{std::string("Pfister")}, std::string("P236")},
                {{std::string("Honeyman")}, std::string("H555")},
                {{std::string("Lloyd")}, std::string("L300")},
                {{std::string("Tymczak")}, std::string("T522")},

                // Single letters are padded with zeros.
                {{std::string("A")}, std::string("A000")},
                {{std::string("B")}, std::string("B000")},
                {{std::string("Z")}, std::string("Z000")},

                // Letter case does not change the result.
                {{std::string("robert")}, std::string("R163")},
                {{std::string("ROBERT")}, std::string("R163")},
                {{std::string("RoBerT")}, std::string("R163")},

                // Non-letter ASCII characters are skipped; no letters at all gives "".
                {{std::string("R@bert")}, std::string("R163")},
                {{std::string("Rob3rt")}, std::string("R163")},
                {{std::string("Rob-ert")}, std::string("R163")},
                {{std::string("123Robert")}, std::string("R163")},
                {{std::string("123")}, std::string("")},
                {{std::string("@#$")}, std::string("")},
                {{std::string(" ")}, std::string("")},
                {{std::string("")}, std::string("")},
                {{std::string("Ab_+ %*^cdefghijklmnopqrstuvwxyz")}, std::string("A123")},

                // More surnames.
                {{std::string("Euler")}, std::string("E460")},
                {{std::string("Gauss")}, std::string("G200")},
                {{std::string("Hilbert")}, std::string("H416")},
                {{std::string("Knuth")}, std::string("K530")},
                {{std::string("Lukasiewicz")}, std::string("L222")},

                {{std::string("Huang")}, std::string("H520")},
                {{std::string("Zhang")}, std::string("Z520")},
                {{std::string("Wang")}, std::string("W520")}};

        static_cast<void>(check_function<DataTypeString, true>(func_name, input_types, data_set));
    }

    // Inputs with a non-ASCII character before the code is complete.
    // NOTE(review): the trailing `-1, -1, true` arguments presumably tell check_function
    // that execution is expected to fail -- confirm against its declaration.
    {
        InputTypeSet input_types = {PrimitiveType::TYPE_VARCHAR};

        DataSet data_set = {{{std::string("中文测试")}, std::string("")},
                            {{std::string("abc 你好")}, std::string("")}};

        static_cast<void>(check_function<DataTypeString, true>(func_name, input_types, data_set, -1,
                                                               -1, true));
    }
}
3678+
36103679
} // namespace doris::vectorized

fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -415,6 +415,7 @@
415415
import org.apache.doris.nereids.trees.expressions.functions.scalar.Sm3sum;
416416
import org.apache.doris.nereids.trees.expressions.functions.scalar.Sm4Decrypt;
417417
import org.apache.doris.nereids.trees.expressions.functions.scalar.Sm4Encrypt;
418+
import org.apache.doris.nereids.trees.expressions.functions.scalar.Soundex;
418419
import org.apache.doris.nereids.trees.expressions.functions.scalar.Space;
419420
import org.apache.doris.nereids.trees.expressions.functions.scalar.SplitByChar;
420421
import org.apache.doris.nereids.trees.expressions.functions.scalar.SplitByRegexp;
@@ -919,6 +920,7 @@ public class BuiltinScalarFunctions implements FunctionHelper {
919920
scalar(Sm3sum.class, "sm3sum"),
920921
scalar(Sm4Decrypt.class, "sm4_decrypt"),
921922
scalar(Sm4Encrypt.class, "sm4_encrypt"),
923+
scalar(Soundex.class, "soundex"),
922924
scalar(Space.class, "space"),
923925
scalar(SplitByChar.class, "split_by_char"),
924926
scalar(SplitByRegexp.class, "split_by_regexp"),

fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/executable/StringArithmetic.java

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
package org.apache.doris.nereids.trees.expressions.functions.executable;
1919

2020
import org.apache.doris.nereids.exceptions.AnalysisException;
21+
import org.apache.doris.nereids.exceptions.NotSupportedException;
2122
import org.apache.doris.nereids.trees.expressions.ExecFunction;
2223
import org.apache.doris.nereids.trees.expressions.Expression;
2324
import org.apache.doris.nereids.trees.expressions.literal.ArrayLiteral;
@@ -1059,4 +1060,58 @@ public static Expression replaceEmpty(StringLikeLiteral first, StringLikeLiteral
10591060
}
10601061
return castStringLikeLiteral(first, first.getValue().replace(second.getValue(), third.getValue()));
10611062
}
1063+
1064+
/**
1065+
* Executable arithmetic functions soundex
1066+
*/
1067+
@ExecFunction(name = "soundex")
1068+
public static Expression soundex(StringLikeLiteral first) {
1069+
char[] soundexTable = {
1070+
'V', '1', '2', '3', 'V', '1', '2', 'N', 'V',
1071+
'2', '2', '4', '5', '5', 'V', '1', '2', '6',
1072+
'2', '3', 'V', '1', 'N', '2', 'V', '2'
1073+
};
1074+
1075+
String result = "";
1076+
if (!first.getValue().isEmpty()) {
1077+
char preCode = '\0';
1078+
1079+
for (int i = 0; i < first.getValue().length(); i++) {
1080+
char c = first.getValue().charAt(i);
1081+
1082+
if (c > 0x7f) {
1083+
throw new NotSupportedException("soundex only supports ASCII, but got: " + c);
1084+
}
1085+
1086+
if (!Character.isLetter(c)) {
1087+
continue;
1088+
}
1089+
1090+
c = Character.toUpperCase(c);
1091+
if (result.isEmpty()) {
1092+
result += c;
1093+
preCode = (soundexTable[c - 'A'] == 'N') ? '\0' : soundexTable[c - 'A'];
1094+
} else {
1095+
char code = soundexTable[c - 'A'];
1096+
if (code != 'N') {
1097+
if (code != 'V' && code != preCode) {
1098+
result += code;
1099+
if (result.length() == 4) {
1100+
break;
1101+
}
1102+
}
1103+
preCode = code;
1104+
}
1105+
}
1106+
}
1107+
1108+
if (result.length() > 0) {
1109+
while (result.length() < 4) {
1110+
result += '0';
1111+
}
1112+
}
1113+
}
1114+
1115+
return castStringLikeLiteral(first, result);
1116+
}
10621117
}
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
package org.apache.doris.nereids.trees.expressions.functions.scalar;
19+
20+
import org.apache.doris.catalog.FunctionSignature;
21+
import org.apache.doris.nereids.trees.expressions.Expression;
22+
import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
23+
import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable;
24+
import org.apache.doris.nereids.trees.expressions.shape.UnaryExpression;
25+
import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
26+
import org.apache.doris.nereids.types.VarcharType;
27+
28+
import com.google.common.base.Preconditions;
29+
import com.google.common.collect.ImmutableList;
30+
31+
import java.util.List;
32+
33+
/**
34+
* Scalar function 'Soundex'
35+
*/
36+
public class Soundex extends ScalarFunction
37+
implements UnaryExpression, ExplicitlyCastableSignature, PropagateNullable {
38+
public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
39+
FunctionSignature.ret(VarcharType.SYSTEM_DEFAULT).args(VarcharType.SYSTEM_DEFAULT)
40+
);
41+
42+
/**
43+
* constructor with 1 argument.
44+
*/
45+
public Soundex(Expression arg) {
46+
super("soundex", arg);
47+
}
48+
49+
/** constructor for withChildren and reuse signature */
50+
private Soundex(ScalarFunctionParams functionParams) {
51+
super(functionParams);
52+
}
53+
54+
@Override
55+
public Soundex withChildren(List<Expression> children) {
56+
Preconditions.checkArgument(children.size() == 1);
57+
return new Soundex(getFunctionParams(children));
58+
}
59+
60+
@Override
61+
public List<FunctionSignature> getSignatures() {
62+
return SIGNATURES;
63+
}
64+
65+
@Override
66+
public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
67+
return visitor.visitSoundex(this, context);
68+
}
69+
}

fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -417,6 +417,7 @@
417417
import org.apache.doris.nereids.trees.expressions.functions.scalar.Sm3sum;
418418
import org.apache.doris.nereids.trees.expressions.functions.scalar.Sm4Decrypt;
419419
import org.apache.doris.nereids.trees.expressions.functions.scalar.Sm4Encrypt;
420+
import org.apache.doris.nereids.trees.expressions.functions.scalar.Soundex;
420421
import org.apache.doris.nereids.trees.expressions.functions.scalar.Space;
421422
import org.apache.doris.nereids.trees.expressions.functions.scalar.SplitByChar;
422423
import org.apache.doris.nereids.trees.expressions.functions.scalar.SplitByRegexp;
@@ -2031,6 +2032,10 @@ default R visitSm4Encrypt(Sm4Encrypt sm4Encrypt, C context) {
20312032
return visitScalarFunction(sm4Encrypt, context);
20322033
}
20332034

2035+
/** Visits a Soundex expression; by default delegates to visitScalarFunction. */
default R visitSoundex(Soundex soundex, C context) {
2036+
return visitScalarFunction(soundex, context);
2037+
}
2038+
20342039
default R visitSpace(Space space, C context) {
20352040
return visitScalarFunction(space, context);
20362041
}

0 commit comments

Comments
 (0)