diff --git a/CMake/Findpcre2.cmake b/CMake/Findpcre2.cmake
new file mode 100644
index 00000000000..c72b98e2cf6
--- /dev/null
+++ b/CMake/Findpcre2.cmake
@@ -0,0 +1,70 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Locate a system PCRE2 installation (8-bit code unit width) and expose it
+# under the canonical target name `pcre2-8::pcre2-8` used by the
+# velox/external/regex_compat module.
+#
+# Lookup order: PCRE2's own CMake package config, then a `pcre2-8::pcre2-8`
+# target that already exists, then pkg-config. Every success path sets
+# `pcre2_FOUND` so the result is visible under this module's own name.
+
+find_package(PCRE2 QUIET CONFIG COMPONENTS 8BIT)
+if(PCRE2_FOUND)
+  if(NOT TARGET pcre2-8::pcre2-8 AND TARGET PCRE2::8BIT)
+    add_library(pcre2-8::pcre2-8 ALIAS PCRE2::8BIT)
+  endif()
+  # Report success only when the canonical target exists; a config package
+  # that does not export PCRE2::8BIT falls through to the lookups below.
+  if(TARGET pcre2-8::pcre2-8)
+    set(pcre2_FOUND TRUE)
+    message(STATUS "Found PCRE2 via CMake.")
+    return()
+  endif()
+endif()
+
+if(TARGET pcre2-8::pcre2-8)
+  set(pcre2_FOUND TRUE)
+  message(STATUS "PCRE2 target already defined.")
+  return()
+endif()
+
+# pkg-config is only a fallback, so its absence must not abort the configure
+# step; the REQUIRED/QUIET handling at the bottom decides how to report.
+find_package(PkgConfig QUIET)
+if(PKG_CONFIG_FOUND)
+  pkg_check_modules(PCRE2_8 QUIET libpcre2-8)
+endif()
+if(PCRE2_8_FOUND)
+  add_library(pcre2-8::pcre2-8 INTERFACE IMPORTED)
+  set_property(
+    TARGET pcre2-8::pcre2-8
+    PROPERTY INTERFACE_INCLUDE_DIRECTORIES "${PCRE2_8_INCLUDE_DIRS}"
+  )
+  set_property(TARGET pcre2-8::pcre2-8 PROPERTY INTERFACE_LINK_LIBRARIES "${PCRE2_8_LDFLAGS}")
+  set_property(
+    TARGET pcre2-8::pcre2-8
+    PROPERTY INTERFACE_COMPILE_DEFINITIONS "PCRE2_CODE_UNIT_WIDTH=8"
+  )
+  set(pcre2_FOUND TRUE)
+  message(STATUS "Found PCRE2 via pkg-config.")
+  return()
+endif()
+
+set(pcre2_FOUND FALSE)
+if(pcre2_FIND_REQUIRED)
+  message(FATAL_ERROR "Failed to find PCRE2.")
+elseif(NOT pcre2_FIND_QUIETLY)
+  message(WARNING "Failed to find PCRE2.")
+endif()
diff --git a/CMake/resolve_dependency_modules/pcre2.cmake b/CMake/resolve_dependency_modules/pcre2.cmake
new file mode 100644
index 00000000000..7053727a98d
--- /dev/null
+++ b/CMake/resolve_dependency_modules/pcre2.cmake
@@ -0,0 +1,67 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+include_guard(GLOBAL)
+
+if(DEFINED ENV{VELOX_PCRE2_URL})
+  set(VELOX_PCRE2_SOURCE_URL "$ENV{VELOX_PCRE2_URL}")
+  # The checksum pinned below only matches the default tarball. An overridden
+  # URL is verified only if VELOX_PCRE2_SHA256 is exported alongside it.
+  if(DEFINED ENV{VELOX_PCRE2_SHA256})
+    set(VELOX_PCRE2_BUILD_SHA256_CHECKSUM "$ENV{VELOX_PCRE2_SHA256}")
+  endif()
+else()
+  set(VELOX_PCRE2_VERSION 10.47)
+  set(
+    VELOX_PCRE2_SOURCE_URL
+    "https://github.com/PCRE2Project/pcre2/releases/download/pcre2-${VELOX_PCRE2_VERSION}/pcre2-${VELOX_PCRE2_VERSION}.tar.gz"
+  )
+  set(
+    VELOX_PCRE2_BUILD_SHA256_CHECKSUM
+    c08ae2388ef333e8403e670ad70c0a11f1eed021fd88308d7e02f596fcd9dc16
+  )
+endif()
+
+# An empty `URL_HASH SHA256=` is rejected at configure time, so pass the hash
+# argument only when a checksum is actually known.
+set(_velox_pcre2_url_hash)
+if(NOT "${VELOX_PCRE2_BUILD_SHA256_CHECKSUM}" STREQUAL "")
+  set(_velox_pcre2_url_hash URL_HASH "SHA256=${VELOX_PCRE2_BUILD_SHA256_CHECKSUM}")
+endif()
+
+message(STATUS "Building PCRE2 ${VELOX_PCRE2_VERSION} from source")
+FetchContent_Declare(
+  pcre2
+  URL ${VELOX_PCRE2_SOURCE_URL}
+  ${_velox_pcre2_url_hash}
+)
+
+set(PCRE2_BUILD_PCRE2_8 ON CACHE BOOL "" FORCE)
+set(PCRE2_BUILD_PCRE2_16 OFF CACHE BOOL "" FORCE)
+set(PCRE2_BUILD_PCRE2_32 OFF CACHE BOOL "" FORCE)
+set(PCRE2_SUPPORT_JIT ON CACHE BOOL "" FORCE)
+set(PCRE2_BUILD_TESTS OFF CACHE BOOL "" FORCE)
+set(PCRE2_BUILD_PCRE2GREP OFF CACHE BOOL "" FORCE)
+set(PCRE2_SUPPORT_UNICODE ON CACHE BOOL "" FORCE)
+set(PCRE2_STATIC_PIC ON CACHE BOOL "" FORCE)
+
+FetchContent_MakeAvailable(pcre2)
+
+# Normalise the target name so consumers always link
`pcre2-8::pcre2-8`.
+if(TARGET pcre2-8-static AND NOT TARGET pcre2-8::pcre2-8)
+  add_library(pcre2-8::pcre2-8 ALIAS pcre2-8-static)
+elseif(TARGET pcre2-8 AND NOT TARGET pcre2-8::pcre2-8)
+  add_library(pcre2-8::pcre2-8 ALIAS pcre2-8)
+endif()
+
+unset(BUILD_TESTING CACHE)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 36c8d6c9ea1..5b394cc83de 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -142,6 +142,16 @@ option(VELOX_ENABLE_TPCDS_CONNECTOR "Build TPC-DS connector." ON)
 option(VELOX_ENABLE_PRESTO_FUNCTIONS "Build Presto SQL functions." ON)
 option(VELOX_ENABLE_SPARK_FUNCTIONS "Build Spark SQL functions." ON)
 option(VELOX_ENABLE_ICEBERG_FUNCTIONS "Build Iceberg functions." ON)
+option(
+  VELOX_ENABLE_REGEX_COMPAT_TESTS
+  "Build the PCRE2 vs RE2 Java-regex compatibility test suite (pulls in PCRE2 dep)."
+  OFF
+)
+option(
+  VELOX_ENABLE_REGEX_COMPAT_JAVA_BACKEND
+  "Within the regex-compat test suite, also exercise an embedded-JVM Java backend as a third backend / oracle. Requires JDK on the build host. If JNI cannot be found, this option is auto-disabled with a warning. Only consulted when VELOX_ENABLE_REGEX_COMPAT_TESTS=ON."
+  ON
+)
 option(VELOX_ENABLE_EXPRESSION "Build expression." ON)
 option(
   VELOX_ENABLE_EXAMPLES
@@ -626,6 +636,30 @@ endif()
 velox_set_source(re2)
 velox_resolve_dependency(re2)
 
+if(VELOX_ENABLE_REGEX_COMPAT_TESTS)
+  velox_set_source(pcre2)
+  velox_resolve_dependency(pcre2)
+
+  if(VELOX_ENABLE_REGEX_COMPAT_JAVA_BACKEND)
+    # Probe for a JDK so the test suite can embed a JVM as a third (oracle)
+    # backend. This is the only place in upstream Velox that touches JNI, and
+    # it is fully opt-in (gated by the regex-compat option above). If JNI is
+    # not found we warn and force the option OFF — the suite still builds with
+    # the PCRE2 + RE2 backends only.
+    find_package(JNI QUIET)
+    if(JNI_FOUND)
+      message(STATUS "Regex-compat: enabling embedded-JVM Java backend (JNI: ${JNI_INCLUDE_DIRS})")
+    else()
+      message(
+        WARNING
+        "Regex-compat: JNI not found, disabling Java backend. "
+        "Install a JDK or pass -DVELOX_ENABLE_REGEX_COMPAT_JAVA_BACKEND=OFF to silence."
+      )
+      set(VELOX_ENABLE_REGEX_COMPAT_JAVA_BACKEND OFF CACHE BOOL "" FORCE)
+    endif()
+  endif()
+endif()
+
 if(${VELOX_BUILD_PYTHON_PACKAGE})
   find_package(Python 3.9 COMPONENTS Interpreter Development.Module REQUIRED)
   velox_set_source(pybind11)
diff --git a/velox/CMakeLists.txt b/velox/CMakeLists.txt
index f15492a2e11..9d0e19edec4 100644
--- a/velox/CMakeLists.txt
+++ b/velox/CMakeLists.txt
@@ -26,6 +26,9 @@ add_subdirectory(external/date)
 add_subdirectory(external/tzdb)
 add_subdirectory(external/md5)
 add_subdirectory(external/hdfs)
+if(VELOX_ENABLE_REGEX_COMPAT_TESTS)
+  add_subdirectory(external/regex_compat)
+endif()
 
 #
 # examples depend on expression
diff --git a/velox/external/regex_compat/CMakeLists.txt b/velox/external/regex_compat/CMakeLists.txt
new file mode 100644
index 00000000000..a35dc333d9b
--- /dev/null
+++ b/velox/external/regex_compat/CMakeLists.txt
@@ -0,0 +1,42 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Only entered when VELOX_ENABLE_REGEX_COMPAT_TESTS=ON.
+
+# Sources and link dependencies shared by every configuration: the RE2 and
+# PCRE2 backends.
+set(_REGEX_COMPAT_SRC Re2Regex.cpp Pcre2Regex.cpp)
+set(_REGEX_COMPAT_LIBS re2::re2 pcre2-8::pcre2-8)
+
+# The embedded-JVM backend adds its own sources plus the JNI libraries that
+# find_package(JNI) in the top-level CMakeLists.txt located.
+# NOTE(review): JvmFixture.cpp uses ::testing::Environment, but no GTest
+# target is linked below — confirm the dependency is provided transitively.
+if(VELOX_ENABLE_REGEX_COMPAT_JAVA_BACKEND)
+  list(APPEND _REGEX_COMPAT_SRC JvmFixture.cpp JavaRegex.cpp)
+  list(APPEND _REGEX_COMPAT_LIBS ${JNI_LIBRARIES})
+endif()
+
+velox_add_library(velox_regex_compat ${_REGEX_COMPAT_SRC})
+
+velox_link_libraries(velox_regex_compat
+  PUBLIC ${_REGEX_COMPAT_LIBS}
+  PRIVATE velox_functions_lib velox_java_pcre2_translator)
+
+# VELOX_REGEX_COMPAT_HAS_JAVA is always defined (1 or 0) and PUBLIC, so this
+# library and its consumers can both test it with a plain `#if`.
+if(VELOX_ENABLE_REGEX_COMPAT_JAVA_BACKEND)
+  velox_include_directories(velox_regex_compat PUBLIC ${JNI_INCLUDE_DIRS})
+  velox_compile_definitions(velox_regex_compat
+    PUBLIC VELOX_REGEX_COMPAT_HAS_JAVA=1)
+else()
+  velox_compile_definitions(velox_regex_compat
+    PUBLIC VELOX_REGEX_COMPAT_HAS_JAVA=0)
+endif()
+
+if(${VELOX_BUILD_TESTING})
+  add_subdirectory(tests)
+endif()
diff --git a/velox/external/regex_compat/JavaRegex.cpp b/velox/external/regex_compat/JavaRegex.cpp
new file mode 100644
index 00000000000..eb2d18a8b6e
--- /dev/null
+++ b/velox/external/regex_compat/JavaRegex.cpp
@@ -0,0 +1,551 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +#include "velox/external/regex_compat/JavaRegex.h" + +#if VELOX_REGEX_COMPAT_HAS_JAVA + +#include "velox/external/regex_compat/JvmFixture.h" + +#include +#include +#include + +namespace facebook::velox::regex_compat { +namespace { + +// java.util.regex.Pattern flag bit constants (must match the JDK). +constexpr jint kJavaCaseInsensitive = 0x02; +constexpr jint kJavaMultiline = 0x08; +constexpr jint kJavaDotall = 0x20; +constexpr jint kJavaUnicodeCase = 0x40; + +struct JavaIds { + // Global refs to class objects so they survive across JNI local-ref frames. + jclass patternCls = nullptr; + jclass matcherCls = nullptr; + jclass stringCls = nullptr; + jclass mapCls = nullptr; + jclass setCls = nullptr; + jclass iteratorCls = nullptr; + jclass entryCls = nullptr; + jclass integerCls = nullptr; + + jmethodID compileMethod = nullptr; // static Pattern.compile(String, int) + jmethodID matcherMethod = nullptr; // Pattern.matcher(CharSequence) + jmethodID namedGroupsMethod = nullptr; // Pattern.namedGroups() (JDK 20+) + + jmethodID findMethod = nullptr; // Matcher.find(int) + jmethodID findNoArgMethod = nullptr; // Matcher.find() + jmethodID matchesMethod = nullptr; // Matcher.matches() + jmethodID lookingAtMethod = nullptr; // Matcher.lookingAt() + jmethodID startMethod = nullptr; // Matcher.start(int) + jmethodID endMethod = nullptr; // Matcher.end(int) + jmethodID groupCountMethod = nullptr; // Matcher.groupCount() + jmethodID replaceAllMethod = nullptr; // Matcher.replaceAll(String) + jmethodID regionMethod = nullptr; // Matcher.region(int, int) + jmethodID useAnchoringMethod = nullptr; // Matcher.useAnchoringBounds(boolean) + + jmethodID mapEntrySetMethod = nullptr; // Map.entrySet() + jmethodID setIteratorMethod = nullptr; // Set.iterator() + jmethodID iteratorHasNextMethod = nullptr; // Iterator.hasNext() + jmethodID iteratorNextMethod = nullptr; // Iterator.next() + jmethodID entryGetKeyMethod = nullptr; // Map.Entry.getKey() + jmethodID entryGetValueMethod = 
nullptr; // Map.Entry.getValue() + jmethodID integerIntValueMethod = nullptr; // Integer.intValue() +}; + +std::once_flag g_idsOnce; +JavaIds g_ids; + +jclass globalClassRef(JNIEnv* env, const char* name) { + jclass local = env->FindClass(name); + if (!local) { + throw std::runtime_error( + std::string("FindClass failed for ") + name); + } + jclass global = static_cast(env->NewGlobalRef(local)); + env->DeleteLocalRef(local); + return global; +} + +void initIds(JNIEnv* env) { + g_ids.patternCls = globalClassRef(env, "java/util/regex/Pattern"); + g_ids.matcherCls = globalClassRef(env, "java/util/regex/Matcher"); + g_ids.stringCls = globalClassRef(env, "java/lang/String"); + g_ids.mapCls = globalClassRef(env, "java/util/Map"); + g_ids.setCls = globalClassRef(env, "java/util/Set"); + g_ids.iteratorCls = globalClassRef(env, "java/util/Iterator"); + g_ids.entryCls = globalClassRef(env, "java/util/Map$Entry"); + g_ids.integerCls = globalClassRef(env, "java/lang/Integer"); + + g_ids.compileMethod = env->GetStaticMethodID( + g_ids.patternCls, + "compile", + "(Ljava/lang/String;I)Ljava/util/regex/Pattern;"); + g_ids.matcherMethod = env->GetMethodID( + g_ids.patternCls, + "matcher", + "(Ljava/lang/CharSequence;)Ljava/util/regex/Matcher;"); + // Pattern.namedGroups() is JDK 20+; treat as optional. 
+ g_ids.namedGroupsMethod = + env->GetMethodID(g_ids.patternCls, "namedGroups", "()Ljava/util/Map;"); + if (env->ExceptionCheck()) { + env->ExceptionClear(); + g_ids.namedGroupsMethod = nullptr; + } + + g_ids.findMethod = env->GetMethodID(g_ids.matcherCls, "find", "(I)Z"); + g_ids.findNoArgMethod = env->GetMethodID(g_ids.matcherCls, "find", "()Z"); + g_ids.matchesMethod = env->GetMethodID(g_ids.matcherCls, "matches", "()Z"); + g_ids.lookingAtMethod = + env->GetMethodID(g_ids.matcherCls, "lookingAt", "()Z"); + g_ids.startMethod = env->GetMethodID(g_ids.matcherCls, "start", "(I)I"); + g_ids.endMethod = env->GetMethodID(g_ids.matcherCls, "end", "(I)I"); + g_ids.groupCountMethod = + env->GetMethodID(g_ids.matcherCls, "groupCount", "()I"); + g_ids.replaceAllMethod = env->GetMethodID( + g_ids.matcherCls, "replaceAll", "(Ljava/lang/String;)Ljava/lang/String;"); + g_ids.regionMethod = + env->GetMethodID(g_ids.matcherCls, "region", "(II)Ljava/util/regex/Matcher;"); + g_ids.useAnchoringMethod = env->GetMethodID( + g_ids.matcherCls, + "useAnchoringBounds", + "(Z)Ljava/util/regex/Matcher;"); + + g_ids.mapEntrySetMethod = + env->GetMethodID(g_ids.mapCls, "entrySet", "()Ljava/util/Set;"); + g_ids.setIteratorMethod = + env->GetMethodID(g_ids.setCls, "iterator", "()Ljava/util/Iterator;"); + g_ids.iteratorHasNextMethod = + env->GetMethodID(g_ids.iteratorCls, "hasNext", "()Z"); + g_ids.iteratorNextMethod = + env->GetMethodID(g_ids.iteratorCls, "next", "()Ljava/lang/Object;"); + g_ids.entryGetKeyMethod = + env->GetMethodID(g_ids.entryCls, "getKey", "()Ljava/lang/Object;"); + g_ids.entryGetValueMethod = + env->GetMethodID(g_ids.entryCls, "getValue", "()Ljava/lang/Object;"); + g_ids.integerIntValueMethod = + env->GetMethodID(g_ids.integerCls, "intValue", "()I"); +} + +jint toJavaFlags(const Options& o) { + jint f = 0; + if (!o.caseSensitive) { + f |= kJavaCaseInsensitive | kJavaUnicodeCase; + } + if (o.dotNl) { + f |= kJavaDotall; + } + if (!o.oneLine) { + f |= kJavaMultiline; + } + 
return f; +} + +// Convert a Java `String` index (a UTF-16 code-unit offset) into a byte +// offset in the given UTF-8 source. Used to translate Matcher.start()/end() +// results — which are Java char indices — back into byte offsets in our +// std::string_view input. Returns std::string_view::npos on bad input or +// out-of-range index. +std::size_t javaCharOffsetToByteOffset( + std::string_view utf8, + int javaCharOffset) { + if (javaCharOffset < 0) { + return std::string_view::npos; + } + int chars = 0; + for (std::size_t i = 0; i < utf8.size();) { + if (chars == javaCharOffset) { + return i; + } + const unsigned char c = static_cast(utf8[i]); + if (c < 0x80) { + i += 1; + chars += 1; + } else if (c < 0xC0) { + // Stray continuation byte — advance to avoid an infinite loop. + i += 1; + chars += 1; + } else if (c < 0xE0) { + i += 2; + chars += 1; + } else if (c < 0xF0) { + i += 3; + chars += 1; + } else { + // 4-byte UTF-8 = U+10000..U+10FFFF, encoded as a UTF-16 surrogate + // pair (2 code units) in Java. + i += 4; + chars += 2; + } + } + return chars == javaCharOffset ? utf8.size() : std::string_view::npos; +} + +// Inverse of the above: given a UTF-8 byte offset, return the equivalent +// Java UTF-16 char offset. Used when we have to hand a byte offset (used +// by the caller / JavaMatcherAdapter cursor) over to Java's Matcher.region(). +int byteOffsetToJavaCharOffset( + std::string_view utf8, + std::size_t byteOffset) { + int chars = 0; + std::size_t i = 0; + while (i < utf8.size() && i < byteOffset) { + const unsigned char c = static_cast(utf8[i]); + if (c < 0x80) { + i += 1; + chars += 1; + } else if (c < 0xC0) { + i += 1; + chars += 1; + } else if (c < 0xE0) { + i += 2; + chars += 1; + } else if (c < 0xF0) { + i += 3; + chars += 1; + } else { + i += 4; + chars += 2; + } + } + return chars; +} + +// Convert a std::string_view (UTF-8) to a JNI jstring. Owned by caller — +// must DeleteLocalRef after use. 
+// +// NewStringUTF interprets its input as JNI's "modified UTF-8" — bytes >= 0x80 +// are taken to be the first byte of a 2-byte sequence (essentially +// Latin-1-ish), which mangles real 3- and 4-byte UTF-8 sequences. To +// faithfully round-trip UTF-8 we transcode to UTF-16 here and use +// NewString(jchar*, jsize) instead. +jstring toJString(JNIEnv* env, std::string_view sv) { + std::vector u16; + u16.reserve(sv.size()); + for (std::size_t i = 0; i < sv.size();) { + const unsigned char c = static_cast(sv[i]); + std::uint32_t cp = 0; + std::size_t step = 1; + if (c < 0x80) { + cp = c; + step = 1; + } else if (c < 0xC0) { + // Stray continuation; emit replacement to keep length sane. + u16.push_back(0xFFFD); + ++i; + continue; + } else if (c < 0xE0 && i + 1 < sv.size()) { + cp = ((c & 0x1F) << 6) | + (static_cast(sv[i + 1]) & 0x3F); + step = 2; + } else if (c < 0xF0 && i + 2 < sv.size()) { + cp = ((c & 0x0F) << 12) | + ((static_cast(sv[i + 1]) & 0x3F) << 6) | + (static_cast(sv[i + 2]) & 0x3F); + step = 3; + } else if (i + 3 < sv.size()) { + cp = ((c & 0x07) << 18) | + ((static_cast(sv[i + 1]) & 0x3F) << 12) | + ((static_cast(sv[i + 2]) & 0x3F) << 6) | + (static_cast(sv[i + 3]) & 0x3F); + step = 4; + } else { + u16.push_back(0xFFFD); + ++i; + continue; + } + if (cp <= 0xFFFF) { + u16.push_back(static_cast(cp)); + } else { + cp -= 0x10000; + u16.push_back(static_cast(0xD800 | (cp >> 10))); + u16.push_back(static_cast(0xDC00 | (cp & 0x3FF))); + } + i += step; + } + return env->NewString(u16.data(), static_cast(u16.size())); +} + +// Read a jstring into a std::string (UTF-8). Caller still owns the jstring. +// We use GetStringChars (UTF-16) and transcode to UTF-8 ourselves to avoid +// GetStringUTFChars's "modified UTF-8" which can't represent supplementary +// chars in their 4-byte UTF-8 form. 
+std::string fromJString(JNIEnv* env, jstring s) { + if (!s) { + return {}; + } + const jsize len = env->GetStringLength(s); + const jchar* u16 = env->GetStringChars(s, nullptr); + std::string out; + out.reserve(static_cast(len)); + for (jsize i = 0; i < len; ++i) { + std::uint32_t cp = u16[i]; + if (cp >= 0xD800 && cp <= 0xDBFF && i + 1 < len) { + const std::uint32_t lo = u16[i + 1]; + if (lo >= 0xDC00 && lo <= 0xDFFF) { + cp = 0x10000 + (((cp - 0xD800) << 10) | (lo - 0xDC00)); + ++i; + } + } + if (cp < 0x80) { + out.push_back(static_cast(cp)); + } else if (cp < 0x800) { + out.push_back(static_cast(0xC0 | (cp >> 6))); + out.push_back(static_cast(0x80 | (cp & 0x3F))); + } else if (cp < 0x10000) { + out.push_back(static_cast(0xE0 | (cp >> 12))); + out.push_back(static_cast(0x80 | ((cp >> 6) & 0x3F))); + out.push_back(static_cast(0x80 | (cp & 0x3F))); + } else { + out.push_back(static_cast(0xF0 | (cp >> 18))); + out.push_back(static_cast(0x80 | ((cp >> 12) & 0x3F))); + out.push_back(static_cast(0x80 | ((cp >> 6) & 0x3F))); + out.push_back(static_cast(0x80 | (cp & 0x3F))); + } + } + env->ReleaseStringChars(s, u16); + return out; +} + +bool checkAndClearException(JNIEnv* env, std::string* outError) { + if (!env->ExceptionCheck()) { + return false; + } + if (outError) { + *outError = "Java exception thrown (cleared)"; + } + env->ExceptionClear(); + return true; +} + +void populateNamedFromPattern( + JNIEnv* env, + jobject pattern, + std::map* out) { + if (!g_ids.namedGroupsMethod) { + return; + } + jobject map = env->CallObjectMethod(pattern, g_ids.namedGroupsMethod); + if (env->ExceptionCheck()) { + env->ExceptionClear(); + return; + } + if (!map) { + return; + } + jobject set = env->CallObjectMethod(map, g_ids.mapEntrySetMethod); + jobject it = env->CallObjectMethod(set, g_ids.setIteratorMethod); + while (env->CallBooleanMethod(it, g_ids.iteratorHasNextMethod)) { + jobject entry = env->CallObjectMethod(it, g_ids.iteratorNextMethod); + jstring key = static_cast( + 
env->CallObjectMethod(entry, g_ids.entryGetKeyMethod)); + jobject value = env->CallObjectMethod(entry, g_ids.entryGetValueMethod); + jint idx = env->CallIntMethod(value, g_ids.integerIntValueMethod); + out->emplace(fromJString(env, key), static_cast(idx)); + env->DeleteLocalRef(key); + env->DeleteLocalRef(value); + env->DeleteLocalRef(entry); + } + env->DeleteLocalRef(it); + env->DeleteLocalRef(set); + env->DeleteLocalRef(map); +} + +} // namespace + +JavaRegex::JavaRegex(std::string_view javaPattern, Options opt) { + auto* env = JvmFixture::instance().env(); + std::call_once(g_idsOnce, [&]() { initIds(env); }); + + jstring jPat = toJString(env, javaPattern); + jobject pObj = env->CallStaticObjectMethod( + g_ids.patternCls, g_ids.compileMethod, jPat, toJavaFlags(opt)); + env->DeleteLocalRef(jPat); + + if (env->ExceptionCheck()) { + env->ExceptionClear(); + error_ = "Java PatternSyntaxException: " + std::string(javaPattern); + return; + } + pattern_ = env->NewGlobalRef(pObj); + env->DeleteLocalRef(pObj); + + // groupCount via a throwaway empty matcher. 
+ jstring emptyStr = toJString(env, ""); + jobject tmpMatcher = env->CallObjectMethod( + pattern_, g_ids.matcherMethod, emptyStr); + env->DeleteLocalRef(emptyStr); + captureCount_ = env->CallIntMethod(tmpMatcher, g_ids.groupCountMethod); + env->DeleteLocalRef(tmpMatcher); + + populateNamedFromPattern(env, pattern_, &named_); +} + +JavaRegex::~JavaRegex() { + if (pattern_) { + JvmFixture::instance().env()->DeleteGlobalRef(pattern_); + } +} + +bool JavaRegex::ok() const { + return pattern_ != nullptr; +} +const std::string& JavaRegex::error() const { + return error_; +} +int JavaRegex::NumberOfCapturingGroups() const { + return captureCount_; +} +const std::map& JavaRegex::NamedCapturingGroups() const { + return named_; +} + +bool JavaRegex::Match( + std::string_view input, + std::size_t startpos, + std::size_t endpos, + Anchor anchor, + std::string_view* submatch, + int nsubmatch) const { + if (!pattern_) { + return false; + } + auto* env = JvmFixture::instance().env(); + + // Java's Matcher operates on a CharSequence we hand it; clip input to + // [0, endpos) by materialising that prefix. Then use region() so the + // engine treats [startpos, endpos) as the searchable window. + const std::string buf(input.substr(0, endpos)); + jstring jin = toJString(env, buf); + jobject m = env->CallObjectMethod(pattern_, g_ids.matcherMethod, jin); + env->DeleteLocalRef(jin); + + // Set region so anchors line up with [startpos, endpos). + // Java's Matcher.region(start, end) takes UTF-16 char offsets, not bytes — + // translate from our byte-offset parameters first. 
+ const jint regionStart = static_cast( + byteOffsetToJavaCharOffset(input, startpos)); + const jint regionEnd = static_cast( + byteOffsetToJavaCharOffset(input, endpos)); + jobject mRegion = env->CallObjectMethod( + m, g_ids.regionMethod, regionStart, regionEnd); + env->DeleteLocalRef(mRegion); + + jboolean matched = JNI_FALSE; + switch (anchor) { + case Anchor::kUnanchored: + matched = env->CallBooleanMethod(m, g_ids.findNoArgMethod); + break; + case Anchor::kAnchorStart: + matched = env->CallBooleanMethod(m, g_ids.lookingAtMethod); + break; + case Anchor::kAnchorBoth: + matched = env->CallBooleanMethod(m, g_ids.matchesMethod); + break; + } + + if (!matched) { + env->DeleteLocalRef(m); + return false; + } + + // Extract submatches: Matcher.start(i)/end(i) return UTF-16 char offsets + // into the original CharSequence (= our `buf` = a prefix of `input`). + // Translate each Java char offset back to a byte offset in `input` so + // string_view substr arithmetic works for non-ASCII input. 
+ for (int i = 0; i < nsubmatch; ++i) { + jint s = env->CallIntMethod(m, g_ids.startMethod, i); + if (env->ExceptionCheck()) { + env->ExceptionClear(); + submatch[i] = std::string_view{}; + continue; + } + jint e = env->CallIntMethod(m, g_ids.endMethod, i); + if (s < 0) { + submatch[i] = std::string_view{}; + continue; + } + const std::size_t sByte = javaCharOffsetToByteOffset(input, s); + const std::size_t eByte = javaCharOffsetToByteOffset(input, e); + if (sByte == std::string_view::npos || eByte == std::string_view::npos || + eByte < sByte) { + submatch[i] = std::string_view{}; + } else { + submatch[i] = input.substr(sByte, eByte - sByte); + } + } + + env->DeleteLocalRef(m); + return true; +} + +bool JavaRegex::FullMatch(std::string_view input, const JavaRegex& re) { + std::string_view sub[1]; + return re.Match(input, 0, input.size(), Anchor::kAnchorBoth, sub, 1); +} + +bool JavaRegex::PartialMatch(std::string_view input, const JavaRegex& re) { + std::string_view sub[1]; + return re.Match(input, 0, input.size(), Anchor::kUnanchored, sub, 1); +} + +int JavaRegex::GlobalReplace( + std::string* str, + const JavaRegex& re, + std::string_view javaReplacement) { + if (!re.ok() || str == nullptr) { + return 0; + } + auto* env = JvmFixture::instance().env(); + + // Build a Matcher on the input and call replaceAll(repl). Matcher.replaceAll + // is the canonical Java semantics — accepts $N / ${name} natively, returns + // the result as a String. We have no way to recover the *count* of + // replacements done through the public API without manual find()-loop, so + // we approximate: count matches first, then replaceAll. (Tests use exact + // count assertions, so this matters.) + jstring jin = toJString(env, *str); + jobject m = env->CallObjectMethod(re.pattern_, g_ids.matcherMethod, jin); + + // First: count matches by walking find(). 
+ int count = 0; + while (env->CallBooleanMethod(m, g_ids.findNoArgMethod)) { + ++count; + } + + // Second: reset matcher (recreate it — replaceAll re-walks anyway). + env->DeleteLocalRef(m); + m = env->CallObjectMethod(re.pattern_, g_ids.matcherMethod, jin); + jstring jRepl = toJString(env, javaReplacement); + jstring jOut = static_cast( + env->CallObjectMethod(m, g_ids.replaceAllMethod, jRepl)); + env->DeleteLocalRef(jRepl); + env->DeleteLocalRef(m); + env->DeleteLocalRef(jin); + + if (env->ExceptionCheck()) { + env->ExceptionClear(); + return 0; + } + *str = fromJString(env, jOut); + env->DeleteLocalRef(jOut); + return count; +} + +} // namespace facebook::velox::regex_compat + +#endif // VELOX_REGEX_COMPAT_HAS_JAVA diff --git a/velox/external/regex_compat/JavaRegex.h b/velox/external/regex_compat/JavaRegex.h new file mode 100644 index 00000000000..a5ba77137c9 --- /dev/null +++ b/velox/external/regex_compat/JavaRegex.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +// This header is only meaningful when the Java backend is enabled. Clang-tidy +// scans changed headers in isolation and cannot find on hosts without +// a JDK, so guard the entire content rather than relying on every consumer to +// gate the include. 
+#if VELOX_REGEX_COMPAT_HAS_JAVA + +#include +#include +#include + +#include + +#include "velox/external/regex_compat/RegexTypes.h" + +namespace facebook::velox::regex_compat { + +/// `java.util.regex` backend in the regex-compat test suite, via an embedded +/// JVM (see JvmFixture). Public method names and signatures mirror +/// `re2::RE2`'s subset that Velox uses. +/// +/// Internally each `Match` / `GlobalReplace` call creates a fresh +/// `java.util.regex.Matcher` via the cached `jobject pattern_` and invokes +/// the JDK's regex engine. Pattern + replacement input is the canonical +/// Java syntax (this is the native source of truth for the other two +/// backends' translation correctness). +class JavaRegex { + public: + explicit JavaRegex(std::string_view javaPattern, Options opt = {}); + ~JavaRegex(); + + JavaRegex(const JavaRegex&) = delete; + JavaRegex& operator=(const JavaRegex&) = delete; + + bool ok() const; + const std::string& error() const; + int NumberOfCapturingGroups() const; + const std::map& NamedCapturingGroups() const; + + bool Match( + std::string_view input, + std::size_t startpos, + std::size_t endpos, + Anchor anchor, + std::string_view* submatch, + int nsubmatch) const; + + static bool FullMatch(std::string_view input, const JavaRegex& re); + static bool PartialMatch(std::string_view input, const JavaRegex& re); + + static int GlobalReplace( + std::string* str, + const JavaRegex& re, + std::string_view javaReplacement); + + private: + // Pinned global reference to java.util.regex.Pattern instance. 
+ jobject pattern_ = nullptr; + std::string error_; + int captureCount_ = 0; + std::map named_; +}; + +} // namespace facebook::velox::regex_compat + +#endif // VELOX_REGEX_COMPAT_HAS_JAVA diff --git a/velox/external/regex_compat/JvmFixture.cpp b/velox/external/regex_compat/JvmFixture.cpp new file mode 100644 index 00000000000..9c77bbae8ce --- /dev/null +++ b/velox/external/regex_compat/JvmFixture.cpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "velox/external/regex_compat/JvmFixture.h" + +#if VELOX_REGEX_COMPAT_HAS_JAVA + +#include + +#include +#include +#include + +namespace facebook::velox::regex_compat { +namespace { + +class JvmGlobalEnv : public ::testing::Environment { + public: + void SetUp() override { + // Force JVM construction now (before any test runs). + JvmFixture::instance(); + } + // No TearDown: JNI forbids JVM destroy + recreate in the same process. 
+}; + +} // namespace + +JvmFixture::JvmFixture() { + JavaVMInitArgs args{}; + args.version = JNI_VERSION_1_8; + args.ignoreUnrecognized = JNI_FALSE; + args.nOptions = 0; + args.options = nullptr; + + const jint rc = + JNI_CreateJavaVM(&jvm_, reinterpret_cast(&env_), &args); + if (rc != JNI_OK) { + std::ostringstream os; + os << "JvmFixture: JNI_CreateJavaVM failed with code " << rc; + throw std::runtime_error(os.str()); + } +} + +JvmFixture& JvmFixture::instance() { + // Function-local static guarantees thread-safe one-time construction + // (C++11+) and avoids static-init order issues. + static JvmFixture inst; + return inst; +} + +void JvmFixture::Register() { + ::testing::AddGlobalTestEnvironment(new JvmGlobalEnv); +} + +} // namespace facebook::velox::regex_compat + +#endif // VELOX_REGEX_COMPAT_HAS_JAVA diff --git a/velox/external/regex_compat/JvmFixture.h b/velox/external/regex_compat/JvmFixture.h new file mode 100644 index 00000000000..c2b4bfa812e --- /dev/null +++ b/velox/external/regex_compat/JvmFixture.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +// Guarded the same way as JavaRegex.h: clang-tidy scans diff-changed headers +// in isolation and cannot find on hosts without a JDK. 
+#if VELOX_REGEX_COMPAT_HAS_JAVA + +#include + +namespace facebook::velox::regex_compat { + +/// Process-singleton embedded JVM used by the regex-compat test suite's +/// JavaRegex backend. Boots the JVM on first `instance()` call via +/// `JNI_CreateJavaVM` and keeps it alive for the lifetime of the process — +/// JNI forbids destroy+recreate in the same process, so we never tear down. +/// +/// Tests should register this as a GTest GlobalEnvironment via +/// JvmFixture::Register() in main(), to give the JVM boot a clear lifecycle +/// boundary distinct from per-test setup. +class JvmFixture { + public: + static JvmFixture& instance(); + + JavaVM* jvm() const { return jvm_; } + JNIEnv* env() const { return env_; } + + /// Register this fixture as a GTest GlobalEnvironment. Call from main(). + static void Register(); + + private: + JvmFixture(); + ~JvmFixture() = default; + + JavaVM* jvm_ = nullptr; + JNIEnv* env_ = nullptr; +}; + +} // namespace facebook::velox::regex_compat + +#endif // VELOX_REGEX_COMPAT_HAS_JAVA diff --git a/velox/external/regex_compat/Pcre2Regex.cpp b/velox/external/regex_compat/Pcre2Regex.cpp new file mode 100644 index 00000000000..efe21296e73 --- /dev/null +++ b/velox/external/regex_compat/Pcre2Regex.cpp @@ -0,0 +1,676 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
// Replaces every non-overlapping occurrence of `from` in `s` with `to`,
// scanning left to right. Text inserted by a replacement is never rescanned.
// An empty `from` is a no-op; without this guard the search would match at
// every position forever.
void replaceAll(std::string& s, std::string_view from, std::string_view to) {
  if (from.empty()) {
    return;
  }
  std::size_t pos = 0;
  while ((pos = s.find(from, pos)) != std::string::npos) {
    s.replace(pos, from.size(), to);
    pos += to.size();
  }
}

// Renders `cp` as the three `\x{HH}` escapes of its 3-byte UTF-8 form
// (lead byte 0xE0 | top bits, then two continuation bytes). Despite the
// name it is also used for non-surrogate code points in the 3-byte range.
std::string surrogateUtf8ByteEscapes(std::uint32_t cp) {
  char buf[32];
  std::snprintf(
      buf,
      sizeof(buf),
      "\\x{%02X}\\x{%02X}\\x{%02X}",
      static_cast<unsigned>(0xE0 | (cp >> 12)),
      static_cast<unsigned>(0x80 | ((cp >> 6) & 0x3F)),
      static_cast<unsigned>(0x80 | (cp & 0x3F)));
  return buf;
}

// Wraps three raw bytes as a non-capturing group of `\x{HH}` escapes:
// "(?:\x{b0}\x{b1}\x{b2})".
std::string rawSurrogateUtf8BytePattern(
    unsigned char b0,
    unsigned char b1,
    unsigned char b2) {
  char buf[40];
  std::snprintf(
      buf,
      sizeof(buf),
      "(?:\\x{%02X}\\x{%02X}\\x{%02X})",
      static_cast<unsigned>(b0),
      static_cast<unsigned>(b1),
      static_cast<unsigned>(b2));
  return buf;
}

// Renders a single byte as "\x{HH}" (two upper-case hex digits).
std::string byteEscape(unsigned char b) {
  char buf[8];
  std::snprintf(buf, sizeof(buf), "\\x{%02X}", static_cast<unsigned>(b));
  return buf;
}

// Renders `cp` as the sequence of `\x{HH}` escapes of its UTF-8 encoding
// (1 to 4 bytes depending on magnitude). No range validation is performed.
std::string codePointUtf8ByteEscapes(std::uint32_t cp) {
  if (cp <= 0x7F) {
    return byteEscape(static_cast<unsigned char>(cp));
  }
  if (cp <= 0x7FF) {
    return byteEscape(static_cast<unsigned char>(0xC0 | (cp >> 6))) +
        byteEscape(static_cast<unsigned char>(0x80 | (cp & 0x3F)));
  }
  if (cp <= 0xFFFF) {
    return surrogateUtf8ByteEscapes(cp);
  }
  return byteEscape(static_cast<unsigned char>(0xF0 | (cp >> 18))) +
      byteEscape(static_cast<unsigned char>(0x80 | ((cp >> 12) & 0x3F))) +
      byteEscape(static_cast<unsigned char>(0x80 | ((cp >> 6) & 0x3F))) +
      byteEscape(static_cast<unsigned char>(0x80 | (cp & 0x3F)));
}
+} + +std::optional utf8UpToPattern(std::int32_t maxCp) { + if (maxCp >= functions::java_pcre2_translator::RangeSet::kMaxCp) { + return anyUtf8CodePointPattern(); + } + if (maxCp == 0x103FF) { + return std::string("(?:[\\x{00}-\\x{7F}]|" + "[\\x{C2}-\\x{DF}][\\x{80}-\\x{BF}]|" + "\\x{E0}[\\x{A0}-\\x{BF}][\\x{80}-\\x{BF}]|" + "[\\x{E1}-\\x{EC}\\x{EE}-\\x{EF}][\\x{80}-\\x{BF}][\\x{80}-\\x{BF}]|" + "\\x{ED}[\\x{80}-\\x{BF}][\\x{80}-\\x{BF}]|" + "\\x{F0}\\x{90}[\\x{80}-\\x{8F}][\\x{80}-\\x{BF}])"); + } + return std::nullopt; +} + +std::optional renderRangeSetAsUtf8BytePattern( + const functions::java_pcre2_translator::RangeSet& rs) { + constexpr std::uint64_t kEnumerateLimit = 4096; + if (rs.isEmpty()) { + return std::string("(?!)"); + } + if (rangeSetSize(rs, kEnumerateLimit) <= kEnumerateLimit) { + return enumerateCodePointSet(rs); + } + + auto excluded = functions::java_pcre2_translator::RangeSet::all() + .subtract(rs); + auto anyPattern = anyUtf8CodePointPattern(); + const auto& ranges = rs.ranges(); + if (!ranges.empty() && ranges.front() == 0 && + ranges.back() < functions::java_pcre2_translator::RangeSet::kMaxCp) { + const auto maxCp = ranges.back(); + excluded = functions::java_pcre2_translator::RangeSet::range(0, maxCp) + .subtract(rs); + auto upTo = utf8UpToPattern(maxCp); + if (!upTo.has_value()) { + return std::nullopt; + } + anyPattern = *upTo; + } + if (rangeSetSize(excluded, 64) <= 64) { + return std::string("(?!") + enumerateCodePointSet(excluded) + ")" + + anyPattern; + } + return std::nullopt; +} + +std::optional tryRewriteClassAsUtf8BytePattern( + std::string_view pattern, + std::size_t start, + std::size_t& end) { + namespace translator = functions::java_pcre2_translator; + try { + std::size_t pos = start; + const auto node = translator::ClassBodyParser::parseClass(pattern, pos); + end = pos; + const auto rs = translator::Evaluator::tryToRangeSet(node); + if (!rs.has_value()) { + return std::nullopt; + } + return 
// Returns true when the three bytes starting at s[i] are ED A0..BF 80..BF,
// i.e. the UTF-8-style encoding of a surrogate code point (U+D800..U+DFFF).
// Returns false when fewer than three bytes remain at `i`; the bound is
// checked by subtraction so a huge `i` cannot wrap around.
bool rawSurrogateUtf8At(std::string_view s, std::size_t i) {
  if (s.size() < 3 || i > s.size() - 3) {
    return false;
  }
  const auto b0 = static_cast<unsigned char>(s[i]);
  const auto b1 = static_cast<unsigned char>(s[i + 1]);
  const auto b2 = static_cast<unsigned char>(s[i + 2]);
  return b0 == 0xED && b1 >= 0xA0 && b1 <= 0xBF && b2 >= 0x80 &&
      b2 <= 0xBF;
}

// Returns true when `s` contains an encoded surrogate at any byte offset.
bool containsRawSurrogateUtf8(std::string_view s) {
  for (std::size_t i = 0; i + 2 < s.size(); ++i) {
    if (rawSurrogateUtf8At(s, i)) {
      return true;
    }
  }
  return false;
}
(rawSurrogateUtf8At(body, k)) { + if ((k > 0 && body[k - 1] == '-') || + (k + 3 < body.size() && body[k + 3] == '-')) { + unsupportedRange = true; + break; + } + surrogateAlts.push_back(rawSurrogateUtf8BytePattern( + static_cast(body[k]), + static_cast(body[k + 1]), + static_cast(body[k + 2]))); + k += 3; + continue; + } + byteClass.push_back(body[k++]); + } + + if (surrogateAlts.empty() || unsupportedRange) { + out.append(pattern, start, j + 1 - start); + } else { + out += "(?:"; + bool needPipe = false; + if (!byteClass.empty()) { + out.push_back('['); + out += byteClass; + out.push_back(']'); + needPipe = true; + } + for (const auto& alt : surrogateAlts) { + if (needPipe) { + out.push_back('|'); + } + out += alt; + needPipe = true; + } + out.push_back(')'); + } + i = j + 1; + } + return out; +} + +std::string rewriteRawSurrogateUtf8Literals(std::string pattern) { + std::string out; + out.reserve(pattern.size()); + bool inClass = false; + for (std::size_t i = 0; i < pattern.size();) { + const char c = pattern[i]; + if (c == '\\' && i + 1 < pattern.size()) { + out.push_back(pattern[i++]); + out.push_back(pattern[i++]); + continue; + } + if (c == '[') { + inClass = true; + out.push_back(c); + ++i; + continue; + } + if (c == ']' && inClass) { + inClass = false; + out.push_back(c); + ++i; + continue; + } + if (!inClass && rawSurrogateUtf8At(pattern, i)) { + const auto b0 = static_cast(pattern[i]); + const auto b1 = static_cast(pattern[i + 1]); + const auto b2 = static_cast(pattern[i + 2]); + out += rawSurrogateUtf8BytePattern(b0, b1, b2); + i += 3; + continue; + } + out.push_back(c); + ++i; + } + return out; +} + +std::string rewriteSurrogateEscapesForRawByteMode(std::string pattern) { + // The translator reports raw-byte mode via a side-channel bool. PCRE2 in + // non-UTF mode accepts literal surrogate UTF-8 bytes, but not \x{D800}; + // rewrite the surrogate block aliases to byte-sequence regexes before + // dropping PCRE2_UTF. 
+ constexpr std::string_view kAnySurrogateBytes = + "(?:\\x{ED}[\\x{A0}-\\x{AF}][\\x{80}-\\x{BF}]|" + "\\x{ED}[\\x{B0}-\\x{BF}][\\x{80}-\\x{BF}])"; + constexpr std::string_view kLowSurrogateBytes = + "(?:\\x{ED}[\\x{B0}-\\x{BF}][\\x{80}-\\x{BF}])"; + replaceAll( + pattern, + "[\\x{d800}-\\x{dbff}\\x{dc00}-\\x{dfff}]", + kAnySurrogateBytes); + replaceAll( + pattern, + "[\\x{D800}-\\x{DBFF}\\x{DC00}-\\x{DFFF}]", + kAnySurrogateBytes); + replaceAll( + pattern, + "[[\\x{D800}-\\x{DB7F}][\\x{DC00}-\\x{DFFF}]]", + "(?:\\x{ED}[\\x{A0}-\\x{AD}][\\x{80}-\\x{BF}]|" + "\\x{ED}\\x{AE}[\\x{80}-\\x{BF}]|" + "\\x{ED}[\\x{B0}-\\x{BF}][\\x{80}-\\x{BF}])"); + replaceAll( + pattern, + "[[\\x{d800}-\\x{db7f}][\\x{dc00}-\\x{dfff}]]", + "(?:\\x{ED}[\\x{A0}-\\x{AD}][\\x{80}-\\x{BF}]|" + "\\x{ED}\\x{AE}[\\x{80}-\\x{BF}]|" + "\\x{ED}[\\x{B0}-\\x{BF}][\\x{80}-\\x{BF}])"); + replaceAll( + pattern, + "[\\x{D800}-\\x{DB7F}\\x{DC00}-\\x{DFFF}]", + "(?:\\x{ED}[\\x{A0}-\\x{AD}][\\x{80}-\\x{BF}]|" + "\\x{ED}\\x{AE}[\\x{80}-\\x{BF}]|" + "\\x{ED}[\\x{B0}-\\x{BF}][\\x{80}-\\x{BF}])"); + replaceAll(pattern, "[\\x{dc00}-\\x{dfff}]", kLowSurrogateBytes); + replaceAll(pattern, "[\\x{DC00}-\\x{DFFF}]", kLowSurrogateBytes); + replaceAll( + pattern, + "[\\x{D800}-\\x{DB7F}]", + "(?:\\x{ED}[\\x{A0}-\\x{AD}][\\x{80}-\\x{BF}])"); + replaceAll( + pattern, + "[\\x{DB80}-\\x{DBFF}]", + "(?:\\x{ED}[\\x{AE}-\\x{AF}][\\x{80}-\\x{BF}])"); + replaceAll( + pattern, + "[\\x{DC00}-\\x{DFFF}]", + "(?:\\x{ED}[\\x{B0}-\\x{BF}][\\x{80}-\\x{BF}])"); + + for (std::uint32_t cp = 0xD800; cp <= 0xDFFF; ++cp) { + char token[16]; + std::snprintf(token, sizeof(token), "\\x{%04X}", cp); + replaceAll(pattern, token, surrogateUtf8ByteEscapes(cp)); + std::snprintf(token, sizeof(token), "\\x{%04x}", cp); + replaceAll(pattern, token, surrogateUtf8ByteEscapes(cp)); + } + const std::string rawAnySurrogateRange = + std::string("[") + std::string("\xED\xA0\x80", 3) + "-" + + std::string("\xED\xBF\xBF", 3) + "]"; + const std::string 
// Returns true when `s` contains the byte triple ED A0..BF 80..BF (an
// encoded surrogate, U+D800..U+DFFF) at any offset.
//
// Only positions holding the 0xED lead byte can start such a triple, so the
// scan jumps between them with find() instead of testing every offset.
bool containsSurrogateUtf8(std::string_view s) {
  constexpr char kLead = '\xED';
  for (std::size_t i = s.find(kLead);
       i != std::string_view::npos && i + 2 < s.size();
       i = s.find(kLead, i + 1)) {
    const auto b1 = static_cast<unsigned char>(s[i + 1]);
    const auto b2 = static_cast<unsigned char>(s[i + 2]);
    if (b1 >= 0xA0 && b1 <= 0xBF && b2 >= 0x80 && b2 <= 0xBF) {
      return true;
    }
  }
  return false;
}
functions::java_pcre2_translator::toPcre2Pattern( + javaPattern, needsRawByteMode) + : functions::java_pcre2_translator::toPcre2PatternWithUnicodeCase( + javaPattern, needsRawByteMode); + } catch (const functions::java_pcre2_translator::EvaluationFailedException& + ex) { + error_ = std::string("Java→PCRE2 translator: ") + ex.what(); + return; + } + if (needsRawByteMode) { + pcre2Pattern = rewriteSurrogateEscapesForRawByteMode(std::move(pcre2Pattern)); + } + + int err = 0; + PCRE2_SIZE off = 0; + code_ = pcre2_compile_8( + reinterpret_cast(pcre2Pattern.data()), + pcre2Pattern.size(), + toPcre2Options(opt) & (needsRawByteMode ? ~PCRE2_UTF : ~0u), + &err, + &off, + nullptr); + if (!code_) { + error_ = pcre2ErrorToString(err, off); + return; + } + // JIT-compile for speed. Falls back to the interpreter on platforms where + // JIT isn't supported, no special handling needed. + pcre2_jit_compile_8(code_, PCRE2_JIT_COMPLETE); + + // Capture count. + std::uint32_t cap = 0; + pcre2_pattern_info_8(code_, PCRE2_INFO_CAPTURECOUNT, &cap); + captureCount_ = static_cast(cap); + + // Named groups: name table is a flat blob of fixed-size entries; first 2 + // bytes of each entry are the (big-endian) group index, then a NUL-terminated + // name. 
+ std::uint32_t nameCount = 0; + std::uint32_t entrySize = 0; + PCRE2_SPTR8 nameTable = nullptr; + pcre2_pattern_info_8(code_, PCRE2_INFO_NAMECOUNT, &nameCount); + pcre2_pattern_info_8(code_, PCRE2_INFO_NAMEENTRYSIZE, &entrySize); + pcre2_pattern_info_8(code_, PCRE2_INFO_NAMETABLE, &nameTable); + for (std::uint32_t i = 0; i < nameCount; ++i) { + const std::uint8_t* entry = nameTable + i * entrySize; + int idx = (entry[0] << 8) | entry[1]; + named_.emplace(reinterpret_cast(entry + 2), idx); + } +} + +Pcre2Regex::~Pcre2Regex() { + if (code_) { + pcre2_code_free_8(code_); + } +} + +bool Pcre2Regex::ok() const { + return code_ != nullptr; +} +const std::string& Pcre2Regex::error() const { + return error_; +} +int Pcre2Regex::NumberOfCapturingGroups() const { + return captureCount_; +} +const std::map& Pcre2Regex::NamedCapturingGroups() const { + return named_; +} + +bool Pcre2Regex::Match( + std::string_view input, + std::size_t startpos, + std::size_t endpos, + Anchor anchor, + std::string_view* submatch, + int nsubmatch) const { + if (!code_) { + return false; + } + pcre2_match_data_8* md = + pcre2_match_data_create_from_pattern_8(code_, nullptr); + // PCRE2 takes the full subject + the length to consider; passing `endpos` + // as the length cleanly caps matching to [startpos, endpos). + int rc = pcre2_match_8( + code_, + reinterpret_cast(input.data()), + endpos, + startpos, + toPcre2MatchOptions(anchor) | + (containsSurrogateUtf8(input.substr(0, endpos)) ? 
PCRE2_NO_UTF_CHECK + : 0), + md, + nullptr); + if (rc < 0) { + pcre2_match_data_free_8(md); + return false; + } + PCRE2_SIZE* ov = pcre2_get_ovector_pointer_8(md); + int avail = std::min(nsubmatch, rc); + for (int i = 0; i < avail; ++i) { + if (ov[2 * i] == PCRE2_UNSET) { + submatch[i] = std::string_view{}; + } else { + submatch[i] = input.substr(ov[2 * i], ov[2 * i + 1] - ov[2 * i]); + } + } + for (int i = avail; i < nsubmatch; ++i) { + submatch[i] = std::string_view{}; + } + pcre2_match_data_free_8(md); + return true; +} + +bool Pcre2Regex::FullMatch(std::string_view input, const Pcre2Regex& re) { + std::string_view sub[1]; + return re.Match(input, 0, input.size(), Anchor::kAnchorBoth, sub, 1); +} + +bool Pcre2Regex::PartialMatch(std::string_view input, const Pcre2Regex& re) { + std::string_view sub[1]; + return re.Match(input, 0, input.size(), Anchor::kUnanchored, sub, 1); +} + +int Pcre2Regex::GlobalReplace( + std::string* str, + const Pcre2Regex& re, + std::string_view javaReplacement) { + if (!re.ok() || str == nullptr) { + return 0; + } + // PCRE2_SUBSTITUTE_EXTENDED enables $N / ${name} / $$ / \$ — the Java + // replacement syntax that Velox's `prepareRegexpReplaceReplacement` had to + // translate away for RE2. + std::uint32_t opts = PCRE2_SUBSTITUTE_GLOBAL | PCRE2_SUBSTITUTE_EXTENDED | + PCRE2_SUBSTITUTE_OVERFLOW_LENGTH; + // First try with a reasonable initial buffer; on overflow PCRE2 tells us + // the required size in `outlen` and we retry. 
+ std::string out; + out.resize(str->size() * 2 + 32); + PCRE2_SIZE outlen = out.size(); + int rc = pcre2_substitute_8( + re.code_, + reinterpret_cast(str->data()), + str->size(), + 0, + opts, + nullptr, + nullptr, + reinterpret_cast(javaReplacement.data()), + javaReplacement.size(), + reinterpret_cast(out.data()), + &outlen); + if (rc == PCRE2_ERROR_NOMEMORY) { + out.resize(outlen); + outlen = out.size(); + rc = pcre2_substitute_8( + re.code_, + reinterpret_cast(str->data()), + str->size(), + 0, + opts, + nullptr, + nullptr, + reinterpret_cast(javaReplacement.data()), + javaReplacement.size(), + reinterpret_cast(out.data()), + &outlen); + } + if (rc < 0) { + // Substitution error (e.g. unknown group); leave *str untouched. + return 0; + } + out.resize(outlen); + *str = std::move(out); + return rc; +} + +} // namespace facebook::velox::regex_compat diff --git a/velox/external/regex_compat/Pcre2Regex.h b/velox/external/regex_compat/Pcre2Regex.h new file mode 100644 index 00000000000..15bf55fcb5c --- /dev/null +++ b/velox/external/regex_compat/Pcre2Regex.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +#include "velox/external/regex_compat/RegexTypes.h" + +// Opaque PCRE2 8-bit types so this header doesn't drag in . 
+struct pcre2_real_code_8; +typedef struct pcre2_real_code_8 pcre2_code_8; + +namespace facebook::velox::regex_compat { + +/// PCRE2 (8-bit) backend in the regex-compat test suite. Public method names +/// and signatures mirror `re2::RE2`'s subset that Velox uses. +/// +/// **Pattern / replacement input is Java `java.util.regex` syntax.** +/// PCRE2 natively understands the Java pattern syntax for the common cases +/// (`(?)` named groups, `\d`/`\w`/`\b` etc.) plus a superset of features +/// (lookaround, backreferences, atomic groups, etc.) — so no Java→PCRE2 +/// pattern translation is performed by this class. For replacement strings, +/// PCRE2's `pcre2_substitute_8` with `PCRE2_SUBSTITUTE_EXTENDED` natively +/// understands `$N` and `${name}` Java-style references. +/// +/// Java syntax that PCRE2 cannot express (Java-specific property tokens like +/// `\p{InGreek}`, character-class intersection `[a-c&&b-d]`, the meaning swap +/// of `(?U)` flag, etc.) is NOT translated here — those cases are intentionally +/// left to surface as test failures, documenting the need for a future +/// Java→PCRE2 translator (cf. pcre4j PR #606). +class Pcre2Regex { + public: + explicit Pcre2Regex(std::string_view javaPattern, Options opt = {}); + ~Pcre2Regex(); + + Pcre2Regex(const Pcre2Regex&) = delete; + Pcre2Regex& operator=(const Pcre2Regex&) = delete; + + bool ok() const; + const std::string& error() const; + int NumberOfCapturingGroups() const; + const std::map& NamedCapturingGroups() const; + + bool Match( + std::string_view input, + std::size_t startpos, + std::size_t endpos, + Anchor anchor, + std::string_view* submatch, + int nsubmatch) const; + + static bool FullMatch(std::string_view input, const Pcre2Regex& re); + static bool PartialMatch(std::string_view input, const Pcre2Regex& re); + + /// Java `$N` / `${name}` replacement syntax, handled natively by PCRE2 via + /// `PCRE2_SUBSTITUTE_EXTENDED`. Returns the number of replacements done. 
+ static int GlobalReplace( + std::string* str, + const Pcre2Regex& re, + std::string_view javaReplacement); + + private: + pcre2_code_8* code_ = nullptr; + std::string error_; + int captureCount_ = 0; + std::map named_; +}; + +} // namespace facebook::velox::regex_compat diff --git a/velox/external/regex_compat/README.md b/velox/external/regex_compat/README.md new file mode 100644 index 00000000000..37fc10bdd23 --- /dev/null +++ b/velox/external/regex_compat/README.md @@ -0,0 +1,127 @@ +# Velox regex compatibility test suite + +A C++ test harness that compares three regex engines — Velox's existing +**RE2**, **PCRE2** (8-bit, JIT), and an embedded JVM running +**`java.util.regex`** — against the same inputs, expressed in Java regex +syntax. + +The goal is to quantify how each engine handles Java-style patterns and +replacements so the Velox project can make data-driven decisions about +whether to introduce PCRE2 alongside RE2 in production, and (separately) +whether to invest in a Java → PCRE2 translator analogous to +[pcre4j PR #606](https://github.com/alexey-pelykh/pcre4j/pull/606). + +This module is **opt-in** and **off by default**. It does not affect +stock Velox builds in any way unless you enable the CMake options below. + +## Enabling + +```bash +cmake -S . -B build -GNinja \ + -DVELOX_ENABLE_REGEX_COMPAT_TESTS=ON \ # opt-in master switch (pulls in PCRE2) + -DVELOX_ENABLE_REGEX_COMPAT_JAVA_BACKEND=ON # opt-in JNI backend (requires JDK) +cmake --build build --target velox_regex_compat_test +build/velox/external/regex_compat/tests/velox_regex_compat_test +``` + +`VELOX_ENABLE_REGEX_COMPAT_JAVA_BACKEND` defaults to `ON`. If +`find_package(JNI)` fails (no JDK installed) the option is silently +flipped to `OFF` and the suite still builds — only the Java backend is +disabled. 
+ +## Architecture + +Three parallel, non-virtual concrete classes: + +| Backend | Implementation | +| ----------- | ----------------------------------------------------------- | +| `Re2Regex` | wraps `re2::RE2`; uses `java_pcre2_translator::toRe2Pattern` for Java pattern syntax and Velox's inline `prepareRegexpReplaceReplacement` from `Re2Functions.h` for Java replacement syntax | +| `Pcre2Regex`| wraps `pcre2_code_8`; uses `java_pcre2_translator::toPcre2Pattern`; `GlobalReplace` uses `PCRE2_SUBSTITUTE_EXTENDED` for `$N` / `${name}` | +| `JavaRegex` | drives `java.util.regex.Pattern` / `Matcher` through an embedded JVM (`JNI_CreateJavaVM`) using only standard JDK classes — no Gluten / Hadoop jars needed | + +Their public methods deliberately mirror the subset of `re2::RE2` actually +used in `velox/functions/lib/Re2Functions.cpp`: + +- `bool Match(input, startpos, endpos, anchor, submatch[], nsubmatch)` +- `int NumberOfCapturingGroups()` +- `const std::map<std::string, int>& NamedCapturingGroups()` +- `bool ok() / const std::string& error()` +- static `FullMatch / PartialMatch / GlobalReplace` +- `Anchor { kUnanchored, kAnchorStart, kAnchorBoth }` +- `Options { caseSensitive, dotNl, oneLine, logErrors, maxMem }` + +The shared shape (plus identical method signatures) lets one +`TYPED_TEST_SUITE_P` declaration produce one test per backend at compile +time — see `tests/BackendTypedTest.cpp` and the three ported pcre4j +test files. + +The stateful Java `Matcher` API (`find()` cursor, `group(i)`, +`replaceFirst`, …) lives in `tests/JavaMatcherAdapter.h` — a +template that reconstructs the state machine on top of the backend's +stateless `Match()`. It is **test-only**; production backends do not +carry this state. + +## What's tested + +`velox_regex_compat_test` ships with **189 GTest cases** across 15 +suites: + +``` +Re2RegexTest 11 cases — RE2-specific edge cases +Pcre2RegexTest 12 cases — PCRE2-specific, incl. lookahead + backref +JavaRegexTest 13 cases — Java-specific, incl.
\p{InGreek} +BackendTest 13 × 3 — core API typed across all backends +PatternPortedTest 13 × 3 — ported from pcre4j PatternTests.java +MatchingPortedTest 14 × 3 — ported from pcre4j MatcherMatchingTests.java +ReplacementPortedTest 11 × 3 — ported from pcre4j MatcherReplacementTests.java +``` + +A single typed test exercises both engine differences (e.g. PCRE2 supports +lookahead while RE2 doesn't) and cross-engine parity (e.g. all three +backends accept Java `(?<name>...)` named groups). + +## Known cross-engine differences + +| Java feature | Re2Regex | Pcre2Regex | JavaRegex | +| ---------------------------------- | ------------------------------------- | --------------------------- | --------- | +| `(?<name>...)` named groups | translated via `toRe2Pattern` | native | native | +| `$N` / `${name}` in replacement | translated via `prepareRegexpReplaceReplacement` | `PCRE2_SUBSTITUTE_EXTENDED` native | native | +| Lookaround `(?=...)`, `(?!...)` | not supported (`ok() == false`) | native | native | +| Backreferences `\1` | not supported | native | native | +| Atomic groups `(?>...)`, possessive `*+` | not supported | native | native | +| Java `\p{InGreek}` / `\p{javaXxx}` | translated where safe | translated where safe | native | +| Character-class intersection `[a-c&&b-d]` | translated where safe | translated where safe | native | +| `(?U)` Java UNICODE_CHARACTER_CLASS | rejected to avoid RE2 ungreedy semantics | translated where safe | native | +| Multiline `^`/`$` | injected `(?m)` prefix when `oneLine=false` | option-mapped | option-mapped | +| `a{` incomplete quantifier | accepted as literal | accepted as literal | rejected (`PatternSyntaxException`) | + +The translator rows are intentionally conservative: features are translated +only where the target engine can preserve Java semantics, and otherwise the +backend reports `ok() == false` with a translator error. + +## Provenance + +- `Re2Regex`, `Pcre2Regex`, `JavaRegex`, `JvmFixture` — original code, + Apache-2.0.
+- Ported test cases in `tests/Pattern…PortedTest.cpp` and + `tests/Matcher…PortedTest.cpp` are 1:1 translations of the + corresponding `org.pcre4j.regex.tests.*` Java tests from + [pcre4j](https://github.com/alexey-pelykh/pcre4j). The upstream Java + code is GPL/LGPL; the C++ port re-implements them in Apache-2.0 form + for the Velox project. + +## What's **not** in this module (scope notes) + +- **No production code change.** This module sits under + `velox/external/regex_compat/` precisely because it is a comparison + experiment, not a Velox engine swap. If/when a production decision + is made the backend classes can be lifted to `velox/functions/lib/` + but that is a separate task. +- **No production regex engine replacement.** The Java regex translator is + wired into this comparison suite's RE2 and PCRE2 backends to measure + compatibility, not to change Velox production regex behavior. +- **No QueryConfig runtime switch.** Whether Velox should expose + RE2/PCRE2/Java as a runtime-selectable engine is a downstream + decision; the backend classes here all happen to be method-compatible, + but they are not unified behind a virtual base or `std::variant` + facade. diff --git a/velox/external/regex_compat/Re2Regex.cpp b/velox/external/regex_compat/Re2Regex.cpp new file mode 100644 index 00000000000..5af77f47f4b --- /dev/null +++ b/velox/external/regex_compat/Re2Regex.cpp @@ -0,0 +1,164 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "velox/external/regex_compat/Re2Regex.h" + +#include +#include + +#include "velox/functions/lib/Re2Functions.h" +#include "velox/functions/lib/java_pcre2_translator/JavaRegexTranslator.h" +#include "velox/type/StringView.h" + +namespace facebook::velox::regex_compat { +namespace { + +inline re2::StringPiece toSp(std::string_view s) { + return re2::StringPiece(s.data(), s.size()); +} +inline std::string_view toSv(const re2::StringPiece& sp) { + return std::string_view(sp.data(), sp.size()); +} +inline StringView toVelox(std::string_view s) { + return StringView(s.data(), s.size()); +} + +re2::RE2::Anchor toRe2Anchor(Anchor a) { + switch (a) { + case Anchor::kUnanchored: + return re2::RE2::UNANCHORED; + case Anchor::kAnchorStart: + return re2::RE2::ANCHOR_START; + case Anchor::kAnchorBoth: + return re2::RE2::ANCHOR_BOTH; + } + return re2::RE2::UNANCHORED; +} + +re2::RE2::Options toRe2Options(const Options& o) { + re2::RE2::Options out; + out.set_case_sensitive(o.caseSensitive); + out.set_dot_nl(o.dotNl); + out.set_one_line(o.oneLine); + out.set_log_errors(o.logErrors); + out.set_max_mem(o.maxMem); + out.set_encoding(re2::RE2::Options::EncodingUTF8); + return out; +} + +} // namespace + +Re2Regex::Re2Regex(std::string_view javaPattern, Options opt) { + std::string re2Pattern; + try { + re2Pattern = opt.caseSensitive + ? functions::java_pcre2_translator::toRe2Pattern(javaPattern) + : functions::java_pcre2_translator::toRe2PatternWithUnicodeCase( + javaPattern); + } catch (const functions::java_pcre2_translator::EvaluationFailedException& + ex) { + error_ = std::string("Java→RE2 translator: ") + ex.what(); + return; + } + // Java's MULTILINE flag doesn't map cleanly to any RE2 Options bit: + // RE2's default behavior is that `^` and `$` only match at the start/end + // of the entire input. 
The inline `(?m)` modifier is the only way to + // enable per-line anchoring. We prepend it when the caller asks for + // MULTILINE (oneLine == false). Java MULTILINE is purely additive + // (it doesn't affect `.` or non-anchor metas), so prepending is safe. + if (!opt.oneLine) { + re2Pattern = "(?m)" + re2Pattern; + } + re_ = std::make_unique(toSp(re2Pattern), toRe2Options(opt)); + if (!re_->ok()) { + error_ = re_->error(); + return; + } + named_ = re_->NamedCapturingGroups(); +} + +Re2Regex::~Re2Regex() = default; + +bool Re2Regex::ok() const { + return re_ && re_->ok(); +} +const std::string& Re2Regex::error() const { + return error_; +} +int Re2Regex::NumberOfCapturingGroups() const { + return re_ ? re_->NumberOfCapturingGroups() : 0; +} +const std::map& Re2Regex::NamedCapturingGroups() const { + return named_; +} +const re2::RE2& Re2Regex::raw() const { + return *re_; +} + +bool Re2Regex::Match( + std::string_view input, + std::size_t startpos, + std::size_t endpos, + Anchor anchor, + std::string_view* submatch, + int nsubmatch) const { + if (!ok()) { + return false; + } + // RE2 writes into StringPiece buffer; copy into caller's string_view array. + std::vector caps(nsubmatch); + bool matched = re_->Match( + toSp(input), + startpos, + endpos, + toRe2Anchor(anchor), + caps.data(), + nsubmatch); + if (!matched) { + return false; + } + for (int i = 0; i < nsubmatch; ++i) { + submatch[i] = caps[i].data() ? 
toSv(caps[i]) : std::string_view{}; + } + return true; +} + +bool Re2Regex::FullMatch(std::string_view input, const Re2Regex& re) { + if (!re.ok()) { + return false; + } + return re2::RE2::FullMatch(toSp(input), *re.re_); +} + +bool Re2Regex::PartialMatch(std::string_view input, const Re2Regex& re) { + if (!re.ok()) { + return false; + } + return re2::RE2::PartialMatch(toSp(input), *re.re_); +} + +int Re2Regex::GlobalReplace( + std::string* str, + const Re2Regex& re, + std::string_view javaReplacement) { + if (!re.ok() || str == nullptr) { + return 0; + } + const std::string re2Replacement = functions::prepareRegexpReplaceReplacement( + *re.re_, toVelox(javaReplacement)); + return re2::RE2::GlobalReplace(str, *re.re_, re2Replacement); +} + +} // namespace facebook::velox::regex_compat diff --git a/velox/external/regex_compat/Re2Regex.h b/velox/external/regex_compat/Re2Regex.h new file mode 100644 index 00000000000..9c4b0746cb7 --- /dev/null +++ b/velox/external/regex_compat/Re2Regex.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include "velox/external/regex_compat/RegexTypes.h" + +namespace re2 { +class RE2; +} + +namespace facebook::velox::regex_compat { + +/// `re2::RE2` backend in the regex-compat test suite. 
Public method names +/// and signatures mirror the subset of `re2::RE2` that +/// `velox/functions/lib/Re2Functions.cpp` actually consumes — this keeps the +/// test-suite typed-test surface aligned with Velox's existing RE2 usage. +/// +/// **Pattern / replacement input** is Java `java.util.regex` syntax. +/// Internally, the constructor uses `java_pcre2_translator::toRe2Pattern` and +/// `GlobalReplace` calls Velox's existing +/// `prepareRegexpReplaceReplacement` (`Re2Functions.h:422`). Java features +/// that RE2 cannot express (lookaround / backrefs / possessive / atomic group) +/// cause `ok() == false` with a translator error message. +class Re2Regex { + public: + explicit Re2Regex(std::string_view javaPattern, Options opt = {}); + ~Re2Regex(); + + Re2Regex(const Re2Regex&) = delete; + Re2Regex& operator=(const Re2Regex&) = delete; + + bool ok() const; + const std::string& error() const; + int NumberOfCapturingGroups() const; + const std::map& NamedCapturingGroups() const; + + bool Match( + std::string_view input, + std::size_t startpos, + std::size_t endpos, + Anchor anchor, + std::string_view* submatch, + int nsubmatch) const; + + // Static convenience helpers matching `re2::RE2`. + static bool FullMatch(std::string_view input, const Re2Regex& re); + static bool PartialMatch(std::string_view input, const Re2Regex& re); + + /// Globally replace all matches in `*str`. `javaReplacement` uses Java + /// `$N` / `${name}` syntax; this method internally translates via Velox + /// `prepareRegexpReplaceReplacement` before invoking `re2::RE2::GlobalReplace`. + /// Returns the number of replacements performed. + static int GlobalReplace( + std::string* str, + const Re2Regex& re, + std::string_view javaReplacement); + + // Unwrapped RE2 access. Requires ok(): re_ stays null if translation failed.
+ const re2::RE2& raw() const; + + private: + std::unique_ptr re_; + std::string error_; + std::map named_; +}; + +} // namespace facebook::velox::regex_compat diff --git a/velox/external/regex_compat/RegexTypes.h b/velox/external/regex_compat/RegexTypes.h new file mode 100644 index 00000000000..b9e9b38d016 --- /dev/null +++ b/velox/external/regex_compat/RegexTypes.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +namespace facebook::velox::regex_compat { + +/// Mirrors `re2::RE2::Anchor`. +enum class Anchor { kUnanchored, kAnchorStart, kAnchorBoth }; + +/// Subset of `re2::RE2::Options` exposed to the regex-compat test suite. +/// Each backend (Re2Regex / Pcre2Regex / JavaRegex) maps fields to its native +/// option type. +struct Options { + bool caseSensitive = true; + bool dotNl = false; + bool oneLine = true; + bool logErrors = false; + int maxMem = 8 << 20; +}; + +} // namespace facebook::velox::regex_compat diff --git a/velox/external/regex_compat/tests/BackendTestBase.h b/velox/external/regex_compat/tests/BackendTestBase.h new file mode 100644 index 00000000000..14608928988 --- /dev/null +++ b/velox/external/regex_compat/tests/BackendTestBase.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include "velox/external/regex_compat/Pcre2Regex.h" +#include "velox/external/regex_compat/Re2Regex.h" + +#if VELOX_REGEX_COMPAT_HAS_JAVA +#include "velox/external/regex_compat/JavaRegex.h" +#endif + +namespace facebook::velox::regex_compat::test { + +/// GTest TYPED_TEST type list, instantiated once per backend at compile time. +/// Tests written as `TYPED_TEST_SUITE(MySuite, AllBackends)` automatically +/// run for every backend type that is enabled in this build. +#if VELOX_REGEX_COMPAT_HAS_JAVA +using AllBackends = + ::testing::Types; +#else +using AllBackends = ::testing::Types; +#endif + +/// Base fixture for tests that should run against every backend. +template +class BackendTest : public ::testing::Test {}; + +} // namespace facebook::velox::regex_compat::test diff --git a/velox/external/regex_compat/tests/BackendTypedTest.cpp b/velox/external/regex_compat/tests/BackendTypedTest.cpp new file mode 100644 index 00000000000..b0098f03809 --- /dev/null +++ b/velox/external/regex_compat/tests/BackendTypedTest.cpp @@ -0,0 +1,149 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// +// Typed test suite that exercises the regex-compat API common to all three +// backends (Re2Regex / Pcre2Regex / JavaRegex). Each TYPED_TEST below is +// compiled and executed once per backend type, so one source line generates +// `len(AllBackends)` assertions of identical behaviour. +// + +#include "velox/external/regex_compat/tests/BackendTestBase.h" + +#include +#include + +namespace facebook::velox::regex_compat::test { +namespace { + +TYPED_TEST_SUITE(BackendTest, AllBackends); + +TYPED_TEST(BackendTest, compileOk) { + TypeParam re("\\d+"); + EXPECT_TRUE(re.ok()); + EXPECT_EQ(0, re.NumberOfCapturingGroups()); +} + +TYPED_TEST(BackendTest, compileError) { + TypeParam re("(unclosed"); + EXPECT_FALSE(re.ok()); + EXPECT_FALSE(re.error().empty()); +} + +TYPED_TEST(BackendTest, javaNamedGroup) { + // Java syntax (?...) — every backend must accept it. 
+ TypeParam re("(?\\d+)"); + ASSERT_TRUE(re.ok()) << re.error(); + EXPECT_EQ(1, re.NumberOfCapturingGroups()); +} + +TYPED_TEST(BackendTest, matchUnanchored) { + TypeParam re("(\\d+)"); + std::string_view sub[2]; + std::string_view in = "abc 42 xyz"; + EXPECT_TRUE(re.Match(in, 0, in.size(), Anchor::kUnanchored, sub, 2)); + EXPECT_EQ("42", sub[0]); + EXPECT_EQ("42", sub[1]); +} + +TYPED_TEST(BackendTest, matchAnchorBoth) { + TypeParam re("[a-z]+"); + std::string_view sub[1]; + std::string_view in = "abc"; + EXPECT_TRUE(re.Match(in, 0, in.size(), Anchor::kAnchorBoth, sub, 1)); +} + +TYPED_TEST(BackendTest, matchAnchorBothRejectsTrailing) { + TypeParam re("[a-z]+"); + std::string_view sub[1]; + std::string_view in = "abc1"; + EXPECT_FALSE(re.Match(in, 0, in.size(), Anchor::kAnchorBoth, sub, 1)); +} + +TYPED_TEST(BackendTest, fullPartialMatch) { + TypeParam re("[a-z]+"); + EXPECT_TRUE(TypeParam::FullMatch("abc", re)); + EXPECT_FALSE(TypeParam::FullMatch("abc1", re)); + EXPECT_TRUE(TypeParam::PartialMatch("abc1", re)); +} + +TYPED_TEST(BackendTest, globalReplaceNumbered) { + TypeParam re("(\\d+)"); + std::string s = "a1b22c333"; + int n = TypeParam::GlobalReplace(&s, re, "[$1]"); + EXPECT_EQ(3, n); + EXPECT_EQ("a[1]b[22]c[333]", s); +} + +TYPED_TEST(BackendTest, globalReplaceNamed) { + TypeParam re("(?\\d+)"); + ASSERT_TRUE(re.ok()) << re.error(); + std::string s = "a1b22c"; + int n = TypeParam::GlobalReplace(&s, re, "[${n}]"); + EXPECT_EQ(2, n); + EXPECT_EQ("a[1]b[22]c", s); +} + +TYPED_TEST(BackendTest, caseInsensitive) { + Options opt; + opt.caseSensitive = false; + TypeParam re("hello", opt); + EXPECT_TRUE(TypeParam::PartialMatch("HELLO world", re)); +} + +TYPED_TEST(BackendTest, dotAllOption) { + // Dot matches newline only when dotNl is on. 
+ { + TypeParam re(".+"); + std::string_view sub[1]; + std::string_view in = "ab\ncd"; + EXPECT_TRUE(re.Match(in, 0, in.size(), Anchor::kUnanchored, sub, 1)); + EXPECT_EQ("ab", sub[0]); // stopped at \n + } + { + Options opt; + opt.dotNl = true; + TypeParam re(".+", opt); + std::string_view sub[1]; + std::string_view in = "ab\ncd"; + EXPECT_TRUE(re.Match(in, 0, in.size(), Anchor::kUnanchored, sub, 1)); + EXPECT_EQ("ab\ncd", sub[0]); // dot now matched \n + } +} + +TYPED_TEST(BackendTest, multilineAnchors) { + Options opt; + opt.oneLine = false; // MULTILINE + TypeParam re("^bar", opt); + std::string_view sub[1]; + std::string_view in = "foo\nbar"; + EXPECT_TRUE(re.Match(in, 0, in.size(), Anchor::kUnanchored, sub, 1)); + EXPECT_EQ("bar", sub[0]); +} + +TYPED_TEST(BackendTest, emptyGroupMatch) { + // Group that didn't participate in the match — must yield an empty + // string_view (data == nullptr per contract). + TypeParam re("(a)|(b)"); + std::string_view sub[3]; + std::string_view in = "a"; + EXPECT_TRUE(re.Match(in, 0, in.size(), Anchor::kAnchorBoth, sub, 3)); + EXPECT_EQ("a", sub[0]); + EXPECT_EQ("a", sub[1]); + EXPECT_EQ(nullptr, sub[2].data()); // group 2 did not match +} + +} // namespace +} // namespace facebook::velox::regex_compat::test diff --git a/velox/external/regex_compat/tests/CMakeLists.txt b/velox/external/regex_compat/tests/CMakeLists.txt new file mode 100644 index 00000000000..36adc8e6d3b --- /dev/null +++ b/velox/external/regex_compat/tests/CMakeLists.txt @@ -0,0 +1,61 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# (Apache-2.0) + +# Fetch OpenJDK 17 regex corpus files at configure time. Pinned to +# jdk-17.0.13-ga (SHA256 verified). TestCases/BMP/Supplementary are +# processed by OpenJDK's own RegExTest.processFile; GraphemeTestCases uses the +# Unicode GraphemeBreakTest format and has its own parser. 
+set(_openjdk_corpus_base + "https://raw.githubusercontent.com/openjdk/jdk17u/jdk-17.0.13-ga/test/jdk/java/util/regex") +set(_openjdk_corpus_dir "${CMAKE_CURRENT_BINARY_DIR}/openjdk_corpus") +file(MAKE_DIRECTORY "${_openjdk_corpus_dir}") + +function(_fetch_corpus_file _fname _fsha) + set(_path "${_openjdk_corpus_dir}/${_fname}") + if(NOT EXISTS "${_path}") + message(STATUS "Fetching OpenJDK regex corpus: ${_fname}") + endif() # status line only; the download call below always runs + file(DOWNLOAD + "${_openjdk_corpus_base}/${_fname}" + "${_path}" + EXPECTED_HASH SHA256=${_fsha} # matching file: skipped; stale or truncated file: re-fetched + SHOW_PROGRESS + STATUS _dl_status) + list(GET _dl_status 0 _dl_code) + if(NOT _dl_code EQUAL 0) + message(WARNING "Failed to download ${_fname}: ${_dl_status}") + endif() +endfunction() + +_fetch_corpus_file(TestCases.txt + 1bf5c8a2a4fba557ff4e4a5d69d86bbd2a9e0c720b9a6455aa001526375ba946) +_fetch_corpus_file(BMPTestCases.txt + 6dbdfc4c64797831b798ad5d4b546f8cbfb2e76036018fe11013f168fc4f11f2) +_fetch_corpus_file(SupplementaryTestCases.txt + 96a56b7e3d0732f6cb30d307c9025517fdf515732d2ce1ae9e5496e30367a019) +_fetch_corpus_file(GraphemeTestCases.txt + eda68465fe85d88d1c37a6411d1fe714fc5a5de3397bd73b1c10abb612722562) + +add_executable(velox_regex_compat_test + TestMain.cpp + Re2RegexTest.cpp + Pcre2RegexTest.cpp + JavaRegexTest.cpp + BackendTypedTest.cpp + PatternPortedTest.cpp + MatcherMatchingPortedTest.cpp + MatcherReplacementPortedTest.cpp + MatcherResultsPortedTest.cpp + MatcherMatchResultPortedTest.cpp + PatternSplitPortedTest.cpp + MatcherUnicodePortedTest.cpp + RegExTestPortedTest.cpp + OpenJdkCorpusDiffTest.cpp) + +target_compile_definitions(velox_regex_compat_test + PRIVATE OPENJDK_CORPUS_DIR="${_openjdk_corpus_dir}") + +target_link_libraries(velox_regex_compat_test + PRIVATE velox_regex_compat GTest::gtest GTest::gmock) + +add_test(NAME velox_regex_compat_test COMMAND velox_regex_compat_test) diff --git a/velox/external/regex_compat/tests/JavaMatcherAdapter.h b/velox/external/regex_compat/tests/JavaMatcherAdapter.h new file mode 100644
index 00000000000..b8de98ecdfc --- /dev/null +++ b/velox/external/regex_compat/tests/JavaMatcherAdapter.h @@ -0,0 +1,361 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include + +#include "velox/external/regex_compat/RegexTypes.h" + +namespace facebook::velox::regex_compat::test { + +/// Header-only adapter that reconstructs Java `java.util.regex.Matcher`'s +/// stateful API (`find()` cursor, `group(int)`, `start/end`, `replaceAll`, +/// etc.) on top of the stateless `IRegex`-shaped `Match()` method exposed +/// by the three backend classes. +/// +/// Lives in the test target only — the production backend classes +/// deliberately do not carry this Matcher state, to keep their surface +/// close to `re2::RE2`'s actual usage in Velox. +template +class JavaMatcherAdapter { + public: + JavaMatcherAdapter(const R* re, std::string_view input) + : re_(re), + input_(input), + regionStart_(0), + regionEnd_(input.size()), + // +1 for group 0 (full match). + groups_(re->NumberOfCapturingGroups() + 1) {} + + // ----- find()/matches() family ----- + + /// Advance past the previous match and search forward.
+ bool find() { + if (!re_->ok()) { + matched_ = false; + return false; + } + if (cursor_ > regionEnd_) { + matched_ = false; + return false; + } + matched_ = re_->Match( + input_, + cursor_, + regionEnd_, + Anchor::kUnanchored, + groups_.data(), + static_cast(groups_.size())); + if (!matched_) { + return false; + } + const std::size_t s = matchBeg(); + const std::size_t e = matchEnd(); + // Zero-width match: advance by 1 to avoid an infinite loop, mirroring + // java.util.regex.Matcher semantics. + cursor_ = (s == e) ? e + 1 : e; + return true; + } + + /// Reset cursor to `start`, then `find()` once. + bool find(int start) { + cursor_ = static_cast(start); + return find(); + } + + /// Anchored full-input match (Java `Matcher.matches`). Does not advance + /// the find-cursor. Honors the active region. + bool matches() { + matched_ = re_->Match( + input_, + regionStart_, + regionEnd_, + Anchor::kAnchorBoth, + groups_.data(), + static_cast(groups_.size())); + return matched_; + } + + /// Anchored prefix match (Java `Matcher.lookingAt`). Honors the active + /// region. + bool lookingAt() { + matched_ = re_->Match( + input_, + regionStart_, + regionEnd_, + Anchor::kAnchorStart, + groups_.data(), + static_cast(groups_.size())); + return matched_; + } + + /// Discards the current match and rewinds both the find() cursor (to the + /// region start) and the append position (to 0). The active region is kept. + void reset() { + cursor_ = regionStart_; + lastAppendPos_ = 0; + matched_ = false; + } + + void reset(std::string_view input) { + input_ = input; + regionStart_ = 0; + regionEnd_ = input.size(); + reset(); + } + + /// Java `Matcher.region(start, end)` — restrict matching to a sub-range. + /// Returns *this for chainability (matches Java's fluent API). Also + /// resets the find() cursor to `start`.
+ JavaMatcherAdapter& region(int start, int end) { + regionStart_ = static_cast(start); + regionEnd_ = static_cast(end); + cursor_ = regionStart_; + matched_ = false; + return *this; + } + + // ----- Group accessors ----- + + int groupCount() const { + return re_->NumberOfCapturingGroups(); + } + + /// `Matcher.group(i)` — returns the captured substring for group `i` + /// (0-based whole match = group 0). Returns `std::nullopt` if the group + /// did not participate in the last match. + std::optional group(int i) const { + requireMatched(); + if (i < 0 || i >= static_cast(groups_.size())) { + throw std::out_of_range("group index out of range"); + } + if (groups_[i].data() == nullptr) { + return std::nullopt; + } + return groups_[i]; + } + + std::optional group(const std::string& name) const { + requireMatched(); + const auto& named = re_->NamedCapturingGroups(); + auto it = named.find(name); + if (it == named.end()) { + throw std::out_of_range("unknown group name: " + name); + } + return group(it->second); + } + + int start(int i = 0) const { + requireMatched(); + if (i < 0 || i >= static_cast(groups_.size())) { + throw std::out_of_range("group index out of range"); + } + if (groups_[i].data() == nullptr) { + return -1; + } + return static_cast(groups_[i].data() - input_.data()); + } + + int end(int i = 0) const { + requireMatched(); + if (i < 0 || i >= static_cast(groups_.size())) { + throw std::out_of_range("group index out of range"); + } + if (groups_[i].data() == nullptr) { + return -1; + } + return static_cast( + groups_[i].data() + groups_[i].size() - input_.data()); + } + + int start(const std::string& name) const { + const auto& named = re_->NamedCapturingGroups(); + auto it = named.find(name); + if (it == named.end()) { + throw std::out_of_range("unknown group name: " + name); + } + return start(it->second); + } + + int end(const std::string& name) const { + const auto& named = re_->NamedCapturingGroups(); + auto it = named.find(name); + if (it == 
named.end()) { + throw std::out_of_range("unknown group name: " + name); + } + return end(it->second); + } + + // ----- Replacement ----- + + /// `Matcher.replaceAll(repl)`: delegates to backend's GlobalReplace. The + /// replacement string uses Java `\$N` / `\${name}` syntax. + std::string replaceAll(std::string_view javaReplacement) const { + std::string s(input_); + R::GlobalReplace(&s, *re_, javaReplacement); + return s; + } + + /// `Matcher.replaceFirst(repl)`: replace only the first match. We do this + /// by walking find() once, building the result manually. + std::string replaceFirst(std::string_view javaReplacement) { + JavaMatcherAdapter copy(re_, input_); + if (!copy.find()) { + return std::string(input_); + } + // Build by hand using backend's GlobalReplace on a one-match window: + // easiest correctness path is to call GlobalReplace on a string that + // contains only the first match in-place — but that's awkward. + // Instead, recompose: prefix + expand(repl, groups) + suffix. + const std::size_t s = copy.matchBeg(); + const std::size_t e = copy.matchEnd(); + std::string out; + out.reserve(input_.size() + javaReplacement.size()); + out.append(input_.substr(0, s)); + out.append(expandJavaReplacement(javaReplacement, copy.groups_)); + out.append(input_.substr(e)); + return out; + } + + /// `Matcher.appendReplacement(sb, repl)`: stateful incremental replace. + /// Appends to `sb` the prefix-since-last-call plus the expanded + /// replacement for the most recent match. Must be called only after a + /// successful `find()`. Throws `std::logic_error` (mirrors Java's + /// `IllegalStateException`) if no match is available. 
+ void appendReplacement(std::string& sb, std::string_view javaReplacement) { + if (!matched_) { + throw std::logic_error( + "appendReplacement: no match available (call find() first)"); + } + const std::size_t s = matchBeg(); + const std::size_t e = matchEnd(); + const std::string replacement = + expandJavaReplacement(javaReplacement, groups_); + sb.append(input_.substr(lastAppendPos_, s - lastAppendPos_)); + sb.append(replacement); + lastAppendPos_ = e; + } + + /// `Matcher.appendTail(sb)`: appends input from lastAppendPosition to end. + void appendTail(std::string& sb) const { + sb.append(input_.substr(lastAppendPos_)); + } + + /// `Matcher.quoteReplacement(s)` static: escape `$` and `\` in `s` so it + /// can be safely used as a literal replacement. + static std::string quoteReplacement(std::string_view s) { + std::string out; + out.reserve(s.size()); + for (char c : s) { + if (c == '\\' || c == '$') { + out.push_back('\\'); + } + out.push_back(c); + } + return out; + } + + private: + std::size_t matchBeg() const { + return groups_[0].data() - input_.data(); + } + std::size_t matchEnd() const { + return matchBeg() + groups_[0].size(); + } + void requireMatched() const { + if (!matched_) { + throw std::logic_error("no match available"); + } + } + + // Expand Java replacement string ($N / ${name} / \\$ / \\\\) using the + // given group slots. Public-style helper used by replaceFirst. We don't + // route through R::GlobalReplace here because that re-matches the whole + // input — we already have the groups in hand. 
+ std::string expandJavaReplacement( + std::string_view r, + const std::vector& g) const { + std::string out; + out.reserve(r.size()); + for (std::size_t i = 0; i < r.size(); ++i) { + char c = r[i]; + if (c == '\\' && i + 1 < r.size()) { + out.push_back(r[i + 1]); + ++i; + } else if (c == '$' && i + 1 < r.size()) { + char n = r[i + 1]; + if (n >= '0' && n <= '9') { + int idx = n - '0'; + std::size_t lastConsumed = i + 1; + for (std::size_t j = i + 2; j < r.size() && r[j] >= '0' && + r[j] <= '9'; + ++j) { + const int candidate = idx * 10 + (r[j] - '0'); + if (candidate >= static_cast(g.size())) { + break; + } + idx = candidate; + lastConsumed = j; + } + if (idx >= static_cast(g.size())) { + throw std::out_of_range("replacement group index out of range"); + } + if (g[idx].data() != nullptr) { + out.append(g[idx]); + } + i = lastConsumed; + } else if (n == '{') { + auto endBrace = r.find('}', i + 2); + if (endBrace == std::string_view::npos) { + throw std::invalid_argument("unterminated named replacement group"); + } + const std::string name(r.substr(i + 2, endBrace - i - 2)); + const auto& named = re_->NamedCapturingGroups(); + auto it = named.find(name); + if (it == named.end() || it->second >= static_cast(g.size())) { + throw std::out_of_range("unknown replacement group name: " + name); + } + if (g[it->second].data() != nullptr) { + out.append(g[it->second]); + } + i = endBrace; + } else { + throw std::invalid_argument("illegal replacement group reference"); + } + } else if (c == '$') { + throw std::invalid_argument("dangling replacement group marker"); + } else { + out.push_back(c); + } + } + return out; + } + + const R* re_; + std::string_view input_; + std::size_t regionStart_ = 0; + std::size_t regionEnd_ = 0; + std::size_t cursor_ = 0; + std::size_t lastAppendPos_ = 0; + bool matched_ = false; + std::vector groups_; +}; + +} // namespace facebook::velox::regex_compat::test diff --git a/velox/external/regex_compat/tests/JavaRegexTest.cpp 
b/velox/external/regex_compat/tests/JavaRegexTest.cpp new file mode 100644 index 00000000000..b1b16222501 --- /dev/null +++ b/velox/external/regex_compat/tests/JavaRegexTest.cpp @@ -0,0 +1,132 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#if VELOX_REGEX_COMPAT_HAS_JAVA + +#include "velox/external/regex_compat/JavaRegex.h" + +#include +#include + +namespace facebook::velox::regex_compat { +namespace { + +TEST(JavaRegexTest, compileOk) { + JavaRegex re("\\d+"); + EXPECT_TRUE(re.ok()); + EXPECT_EQ(0, re.NumberOfCapturingGroups()); +} + +TEST(JavaRegexTest, compileError) { + JavaRegex re("(unclosed"); + EXPECT_FALSE(re.ok()); + EXPECT_FALSE(re.error().empty()); +} + +TEST(JavaRegexTest, namedGroup) { + JavaRegex re("(?\\d+)"); + ASSERT_TRUE(re.ok()) << re.error(); + EXPECT_EQ(1, re.NumberOfCapturingGroups()); + // Pattern.namedGroups() is JDK 20+; treat as best-effort. 
+ if (!re.NamedCapturingGroups().empty()) { + EXPECT_EQ(1, re.NamedCapturingGroups().at("num")); + } +} + +TEST(JavaRegexTest, matchUnanchored) { + JavaRegex re("(\\d+)"); + std::string_view sub[2]; + std::string_view in = "abc 42 xyz"; + EXPECT_TRUE(re.Match(in, 0, in.size(), Anchor::kUnanchored, sub, 2)); + EXPECT_EQ("42", sub[0]); + EXPECT_EQ("42", sub[1]); +} + +TEST(JavaRegexTest, matchAnchorBoth) { + JavaRegex re("[a-z]+"); + std::string_view sub[1]; + std::string_view in = "abc"; + EXPECT_TRUE(re.Match(in, 0, in.size(), Anchor::kAnchorBoth, sub, 1)); +} + +TEST(JavaRegexTest, matchAnchorBothRejectsTrailing) { + JavaRegex re("[a-z]+"); + std::string_view sub[1]; + std::string_view in = "abc1"; + EXPECT_FALSE(re.Match(in, 0, in.size(), Anchor::kAnchorBoth, sub, 1)); +} + +TEST(JavaRegexTest, fullPartialMatch) { + JavaRegex re("[a-z]+"); + EXPECT_TRUE(JavaRegex::FullMatch("abc", re)); + EXPECT_FALSE(JavaRegex::FullMatch("abc1", re)); + EXPECT_TRUE(JavaRegex::PartialMatch("abc1", re)); +} + +TEST(JavaRegexTest, globalReplaceWithNumberedGroup) { + JavaRegex re("(\\d+)"); + std::string s = "a1b22c333"; + int n = JavaRegex::GlobalReplace(&s, re, "[$1]"); + EXPECT_EQ(3, n); + EXPECT_EQ("a[1]b[22]c[333]", s); +} + +TEST(JavaRegexTest, globalReplaceWithNamedGroup) { + JavaRegex re("(?\\d+)"); + ASSERT_TRUE(re.ok()) << re.error(); + std::string s = "a1b22c"; + int n = JavaRegex::GlobalReplace(&s, re, "[${n}]"); + EXPECT_EQ(2, n); + EXPECT_EQ("a[1]b[22]c", s); +} + +TEST(JavaRegexTest, caseInsensitiveOption) { + Options opt; + opt.caseSensitive = false; + JavaRegex re("hello", opt); + EXPECT_TRUE(JavaRegex::PartialMatch("HELLO world", re)); +} + +TEST(JavaRegexTest, lookaheadSupported) { + // Java natively supports lookahead. 
+ JavaRegex re("\\d+(?=px)"); + ASSERT_TRUE(re.ok()) << re.error(); + std::string_view sub[1]; + std::string_view in = "size 42px wide"; + EXPECT_TRUE(re.Match(in, 0, in.size(), Anchor::kUnanchored, sub, 1)); + EXPECT_EQ("42", sub[0]); +} + +TEST(JavaRegexTest, backrefSupported) { + // Java natively supports backreferences. + JavaRegex re("(\\w+) \\1"); + ASSERT_TRUE(re.ok()) << re.error(); + EXPECT_TRUE(JavaRegex::PartialMatch("hello hello", re)); + EXPECT_FALSE(JavaRegex::PartialMatch("hello world", re)); +} + +TEST(JavaRegexTest, javaSpecificPropertyInLC) { + // Java's \p{InGreek} (Unicode block "Greek"). This is one of the + // Java-specific property tokens that PCRE2 cannot understand natively — + // serves as a sentinel for the future Java->PCRE2 translator scope. + JavaRegex re("\\p{InGreek}+"); + ASSERT_TRUE(re.ok()) << re.error(); + EXPECT_TRUE(JavaRegex::PartialMatch("hello \xce\xb1\xce\xb2\xce\xb3 world", re)); +} + +} // namespace +} // namespace facebook::velox::regex_compat + +#endif // VELOX_REGEX_COMPAT_HAS_JAVA diff --git a/velox/external/regex_compat/tests/MatcherMatchResultPortedTest.cpp b/velox/external/regex_compat/tests/MatcherMatchResultPortedTest.cpp new file mode 100644 index 00000000000..52c7e8b3b5e --- /dev/null +++ b/velox/external/regex_compat/tests/MatcherMatchResultPortedTest.cpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +// +// Cases ported from pcre4j's `MatcherMatchResultTests.java`. +// +// Most cases there exercise Java-specific `MatchResult` snapshot semantics +// (immutability of the snapshot when the matcher advances, IllegalState/ +// IndexOutOfBounds/IllegalArgument exception contracts, namedGroups() map +// equality, hasMatch() flag). Those are Java API-contract tests, not +// regex-engine behavior, so they are skipped here — they would produce +// identical pass/fail across all three backends and add no engine-compat +// signal. +// +// We port only the two cases that exercise engine behavior the existing +// MatcherMatchingPortedTest doesn't already cover: +// * matchResultByGroupNumber — 3 consecutive whitespace-separated +// capturing groups, sweep over all +// group indices. +// * matchResultNamedGroupAccessors — 3 named groups in a date pattern. +// + +#include "velox/external/regex_compat/tests/BackendTestBase.h" +#include "velox/external/regex_compat/tests/JavaMatcherAdapter.h" + +#include +#include + +namespace facebook::velox::regex_compat::test { +namespace { + +template +using MatchResultPortedTest = BackendTest; +TYPED_TEST_SUITE(MatchResultPortedTest, AllBackends); + +TYPED_TEST(MatchResultPortedTest, matchResultByGroupNumber) { + TypeParam re("(\\w+)\\s+(\\w+)\\s+(\\w+)"); + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, "one two three"); + ASSERT_TRUE(m.find()); + EXPECT_EQ(3, m.groupCount()); + EXPECT_EQ("one two three", m.group(0).value()); + EXPECT_EQ("one", m.group(1).value()); + EXPECT_EQ("two", m.group(2).value()); + EXPECT_EQ("three", m.group(3).value()); + EXPECT_EQ(0, m.start(0)); + EXPECT_EQ(13, m.end(0)); + EXPECT_EQ(0, m.start(1)); + EXPECT_EQ(3, m.end(1)); + EXPECT_EQ(4, m.start(2)); + EXPECT_EQ(7, m.end(2)); + EXPECT_EQ(8, m.start(3)); + EXPECT_EQ(13, m.end(3)); +} + +TYPED_TEST(MatchResultPortedTest, matchResultNamedGroupAccessors) { + TypeParam re("(?\\d{4})-(?\\d{2})-(?\\d{2})"); + ASSERT_TRUE(re.ok()) << re.error(); 
+ JavaMatcherAdapter m(&re, "date: 2024-01-15"); + ASSERT_TRUE(m.find()); + EXPECT_EQ("2024", m.group("year").value()); + EXPECT_EQ("01", m.group("month").value()); + EXPECT_EQ("15", m.group("day").value()); + EXPECT_EQ(6, m.start("year")); + EXPECT_EQ(10, m.end("year")); + EXPECT_EQ(11, m.start("month")); + EXPECT_EQ(13, m.end("month")); + EXPECT_EQ(14, m.start("day")); + EXPECT_EQ(16, m.end("day")); +} + +} // namespace +} // namespace facebook::velox::regex_compat::test diff --git a/velox/external/regex_compat/tests/MatcherMatchingPortedTest.cpp b/velox/external/regex_compat/tests/MatcherMatchingPortedTest.cpp new file mode 100644 index 00000000000..78e2155c9c2 --- /dev/null +++ b/velox/external/regex_compat/tests/MatcherMatchingPortedTest.cpp @@ -0,0 +1,358 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// +// Cases ported from pcre4j's `MatcherMatchingTests.java`. Same provenance +// notes as PatternPortedTest.cpp. +// + +#include "velox/external/regex_compat/tests/BackendTestBase.h" +#include "velox/external/regex_compat/tests/JavaMatcherAdapter.h" + +#include +#include + +#include + +namespace facebook::velox::regex_compat::test { +namespace { + +template +using MatchingPortedTest = BackendTest; +TYPED_TEST_SUITE(MatchingPortedTest, AllBackends); + +// Matcher.find() walks all matches. 
+TYPED_TEST(MatchingPortedTest, findWalksAllMatches) { + TypeParam re("\\d+"); + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, "a 1 b 22 c 333"); + std::vector found; + while (m.find()) { + found.emplace_back(m.group(0).value()); + } + EXPECT_THAT(found, ::testing::ElementsAre("1", "22", "333")); +} + +TYPED_TEST(MatchingPortedTest, findNoMatch) { + TypeParam re("xyz"); + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, "abc def"); + EXPECT_FALSE(m.find()); +} + +TYPED_TEST(MatchingPortedTest, findWithStartIndex) { + TypeParam re("\\d+"); + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, "1 2 3 4"); + ASSERT_TRUE(m.find(2)); + EXPECT_EQ("2", m.group(0).value()); +} + +// Matcher.matches() — full-input anchored. +TYPED_TEST(MatchingPortedTest, matchesFullInput) { + TypeParam re("\\d+"); + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, "42"); + EXPECT_TRUE(m.matches()); +} + +TYPED_TEST(MatchingPortedTest, matchesRejectsPartial) { + TypeParam re("\\d+"); + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, "42x"); + EXPECT_FALSE(m.matches()); +} + +// Matcher.lookingAt() — anchor at start, may end early. +TYPED_TEST(MatchingPortedTest, lookingAtPrefixOnly) { + TypeParam re("\\d+"); + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, "42x"); + EXPECT_TRUE(m.lookingAt()); + EXPECT_EQ("42", m.group(0).value()); +} + +TYPED_TEST(MatchingPortedTest, lookingAtRejectsLateMatch) { + TypeParam re("\\d+"); + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, "x42"); + EXPECT_FALSE(m.lookingAt()); +} + +// Matcher.group(int) and Matcher.start/end accessors. 
+TYPED_TEST(MatchingPortedTest, groupAccessor) { + TypeParam re("(\\d+)-(\\d+)"); + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, "foo 10-200 bar"); + ASSERT_TRUE(m.find()); + EXPECT_EQ("10-200", m.group(0).value()); + EXPECT_EQ("10", m.group(1).value()); + EXPECT_EQ("200", m.group(2).value()); + EXPECT_EQ(4, m.start()); + EXPECT_EQ(10, m.end()); + EXPECT_EQ(4, m.start(1)); + EXPECT_EQ(6, m.end(1)); + EXPECT_EQ(7, m.start(2)); + EXPECT_EQ(10, m.end(2)); +} + +TYPED_TEST(MatchingPortedTest, groupCountAccessor) { + TypeParam re("(a)(b)(c)(d)"); + JavaMatcherAdapter m(&re, "abcd"); + EXPECT_EQ(4, m.groupCount()); +} + +// Matcher.group(String) — named groups. JavaRegex relies on JDK 20+ +// Pattern.namedGroups() which our build host has, but other JDKs may not; +// we keep this test conservative and skip if name table is empty. +TYPED_TEST(MatchingPortedTest, groupAccessorByName) { + TypeParam re("(?\\d+)-(?\\d+)"); + ASSERT_TRUE(re.ok()) << re.error(); + if (re.NamedCapturingGroups().empty()) { + GTEST_SKIP() << "Backend doesn't expose named group table"; + } + JavaMatcherAdapter m(&re, "10-200"); + ASSERT_TRUE(m.find()); + EXPECT_EQ("10", m.group("lo").value()); + EXPECT_EQ("200", m.group("hi").value()); +} + +// Matcher.reset() — restart from beginning. +TYPED_TEST(MatchingPortedTest, resetRestartsCursor) { + TypeParam re("\\d"); + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, "a1b2c3"); + EXPECT_TRUE(m.find()); + EXPECT_EQ("1", m.group(0).value()); + EXPECT_TRUE(m.find()); + EXPECT_EQ("2", m.group(0).value()); + m.reset(); + EXPECT_TRUE(m.find()); + EXPECT_EQ("1", m.group(0).value()); +} + +// Matcher.reset(input) — re-bind to new input. +TYPED_TEST(MatchingPortedTest, resetWithNewInput) { + TypeParam re("\\d"); + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, "abc"); + EXPECT_FALSE(m.find()); + m.reset("9 8 7"); + EXPECT_TRUE(m.find()); + EXPECT_EQ("9", m.group(0).value()); +} + +// Empty group sentinel. 
+TYPED_TEST(MatchingPortedTest, groupDidNotParticipate) { + TypeParam re("(a)|(b)"); + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, "a"); + ASSERT_TRUE(m.find()); + EXPECT_EQ("a", m.group(1).value()); + EXPECT_EQ(std::nullopt, m.group(2)); + EXPECT_EQ(-1, m.start(2)); + EXPECT_EQ(-1, m.end(2)); +} + +// pcre4j MatcherMatchingTests.captureGroups — group(0) + start/end/start("name") symmetry +TYPED_TEST(MatchingPortedTest, captureGroupsByNameAndIndex) { + TypeParam re("(?4)(.*)(?2)"); + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, "4test2"); + ASSERT_TRUE(m.find()); + EXPECT_EQ("4test2", m.group(0).value()); + EXPECT_EQ("4", m.group(1).value()); + EXPECT_EQ("test", m.group(2).value()); + EXPECT_EQ("2", m.group(3).value()); + EXPECT_EQ(3, m.groupCount()); + if (!re.NamedCapturingGroups().empty()) { + EXPECT_EQ("4", m.group("four").value()); + EXPECT_EQ("2", m.group("two").value()); + } +} + +// pcre4j MatcherMatchingTests.matchesTrueInRegion / matchesFalseRegion +TYPED_TEST(MatchingPortedTest, matchesWithinRegion) { + TypeParam re("42"); + JavaMatcherAdapter m(&re, "[42]"); + EXPECT_TRUE(m.region(1, 3).matches()); // region "42" — full match + JavaMatcherAdapter m2(&re, "[42!]"); + EXPECT_FALSE(m2.region(1, 4).matches()); // region "42!" — not full +} + +// pcre4j MatcherMatchingTests.lookingAtTrueInRegion / lookingAtFalseRegion +TYPED_TEST(MatchingPortedTest, lookingAtWithinRegion) { + TypeParam re("42"); + JavaMatcherAdapter m(&re, "[42]"); + EXPECT_TRUE(m.region(1, 3).lookingAt()); + JavaMatcherAdapter m2(&re, "[!42]"); + EXPECT_FALSE(m2.region(1, 4).lookingAt()); // region "!42" — '!' 
first, doesn't match start +} + +// pcre4j MatcherMatchingTests.findTrueInRegion / findFalseInRegion +TYPED_TEST(MatchingPortedTest, findWithinRegion) { + TypeParam re("42"); + JavaMatcherAdapter m(&re, "[42]"); + EXPECT_TRUE(m.region(1, 3).find()); + EXPECT_EQ("42", m.group(0).value()); + TypeParam re2("42!"); + JavaMatcherAdapter m2(&re2, "[42]"); + EXPECT_FALSE(m2.region(1, 3).find()); +} + +// pcre4j MatcherMatchingTests.findFalseAtOffset +TYPED_TEST(MatchingPortedTest, findFalseAtOffset) { + TypeParam re("42"); + JavaMatcherAdapter m(&re, "!!test"); + EXPECT_FALSE(m.find(2)); +} + +// pcre4j MatcherMatchingTests.findMultipleWithinRegion +TYPED_TEST(MatchingPortedTest, findMultipleWithinRegion) { + TypeParam re("42"); + JavaMatcherAdapter m(&re, "42!42!42!42"); + m.region(2, 8); // region content: "!42!42" + std::vector matchStarts; + while (m.find()) { + matchStarts.push_back(m.start()); + } + // Should match "42" at offsets 3 and 6 (within the region [2,8)). + EXPECT_THAT(matchStarts, ::testing::ElementsAre(3, 6)); +} + +// pcre4j MatcherMatchingTests.findMultipleOutsideRegion +TYPED_TEST(MatchingPortedTest, findMultipleOutsideRegion) { + TypeParam re("42"); + JavaMatcherAdapter m(&re, "42!__!__!42"); + m.region(2, 8); // region content: "!__!__"
— no "42" inside + EXPECT_FALSE(m.find()); +} + +// pcre4j MatcherMatchingTests.emptyGroup — `!*` matches empty at position 0 +TYPED_TEST(MatchingPortedTest, emptyGroup) { + TypeParam re("!*"); + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, "42"); + ASSERT_TRUE(m.find()); + EXPECT_EQ("", m.group(0).value()); + EXPECT_EQ(0, m.start()); + EXPECT_EQ(0, m.end()); + EXPECT_EQ(0, m.groupCount()); +} + +// pcre4j MatcherMatchingTests.unmatchedGroups — alternation where only one branch participates +TYPED_TEST(MatchingPortedTest, unmatchedGroupsInAlternation) { + TypeParam re("42((?!)|(?\\?))"); + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, "42!"); + ASSERT_TRUE(m.find()); + EXPECT_EQ("42!", m.group(0).value()); + EXPECT_EQ("!", m.group(1).value()); // outer group matches '!' + EXPECT_EQ("!", m.group(2).value()); // exclamation = '!' + EXPECT_EQ(std::nullopt, m.group(3)); // question did NOT match + EXPECT_EQ(3, m.groupCount()); + if (!re.NamedCapturingGroups().empty()) { + EXPECT_EQ("!", m.group("exclamation").value()); + EXPECT_EQ(std::nullopt, m.group("question")); + } +} + +// pcre4j MatcherMatchingTests.positiveLookaround — lookahead/lookbehind both ways. +// Asserts Java semantics: pattern compiles and matches "42" in "(42)". +// Backends without lookaround (RE2) are skipped, not failed; the skip is a recorded +// compatibility-rate data point, not a bug. +TYPED_TEST(MatchingPortedTest, positiveLookaround) { + if constexpr (std::is_same_v) { + GTEST_SKIP() << "RE2 does not support lookaround"; + } + TypeParam re("(?<=(?\\W))?(\\d+)(?=(?\\W))?"); + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, "(42)"); + ASSERT_TRUE(m.find()); + EXPECT_EQ("42", m.group(0).value()); +} + +// pcre4j MatcherMatchingTests.positiveUnmatchedLookaround — +// lookbehind not satisfied at the start; lookahead not satisfied at end.
+TYPED_TEST(MatchingPortedTest, positiveUnmatchedLookaround) { + if constexpr (std::is_same_v) { + GTEST_SKIP() << "RE2 does not support lookaround"; + } + TypeParam re("(?<=(?\\W))?(\\d+)(?=(?\\W))?"); + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, "42]"); + ASSERT_TRUE(m.find()); + EXPECT_EQ("42", m.group(0).value()); +} + +// pcre4j MatcherMatchingTests.emptyStringMatches — pattern "^$" on empty input matches. +TYPED_TEST(MatchingPortedTest, emptyStringMatches) { + TypeParam re("^$"); + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, ""); + EXPECT_TRUE(m.matches()); +} + +// pcre4j MatcherMatchingTests.emptyStringFind — pattern "^$" on empty input finds once. +TYPED_TEST(MatchingPortedTest, emptyStringFind) { + TypeParam re("^$"); + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, ""); + ASSERT_TRUE(m.find()); + EXPECT_EQ(0, m.start()); + EXPECT_EQ(0, m.end()); + EXPECT_EQ("", m.group(0).value()); + EXPECT_EQ(0, m.groupCount()); +} + +// pcre4j MatcherMatchingTests.findAtEndOfString — find($, len(input)) finds zero-width +// match at end. +TYPED_TEST(MatchingPortedTest, findAtEndOfString) { + TypeParam re("$"); + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, "abc"); + EXPECT_TRUE(m.find(3)); +} + +// pcre4j MatcherMatchingTests.findExhaustedInRegion — multiple matches in region, +// then no more. +TYPED_TEST(MatchingPortedTest, findExhaustedInRegion) { + TypeParam re("a"); + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, "aaa"); + m.region(0, 2); // region "aa" + EXPECT_TRUE(m.find()); // first 'a' + EXPECT_TRUE(m.find()); // second 'a' + EXPECT_FALSE(m.find()); // no more in region +} + +// pcre4j MatcherMatchingTests.findWithZeroWidthMatchExhaustsRegion — +// Java spec: $ matches at region end (zero-width), then no more matches. 
+TYPED_TEST(MatchingPortedTest, findWithZeroWidthMatchExhaustsRegion) { + if constexpr (std::is_same_v) { + GTEST_SKIP() << "RE2 $ anchors to the full subject, not the match region"; + } + TypeParam re("$"); + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, "ab"); + m.region(0, 1); + ASSERT_TRUE(m.find()); + EXPECT_EQ(1, m.start()); + EXPECT_EQ(1, m.end()); + EXPECT_FALSE(m.find()); +} + +} // namespace +} // namespace facebook::velox::regex_compat::test diff --git a/velox/external/regex_compat/tests/MatcherReplacementPortedTest.cpp b/velox/external/regex_compat/tests/MatcherReplacementPortedTest.cpp new file mode 100644 index 00000000000..7ac84db1835 --- /dev/null +++ b/velox/external/regex_compat/tests/MatcherReplacementPortedTest.cpp @@ -0,0 +1,383 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// +// Cases ported from pcre4j's `MatcherReplacementTests.java`. Same +// provenance notes as PatternPortedTest.cpp. +// + +#include "velox/external/regex_compat/tests/BackendTestBase.h" +#include "velox/external/regex_compat/tests/JavaMatcherAdapter.h" + +#include +#include + +namespace facebook::velox::regex_compat::test { +namespace { + +template +using ReplacementPortedTest = BackendTest; +TYPED_TEST_SUITE(ReplacementPortedTest, AllBackends); + +// replaceAll: literal replacement, no group refs. 
+TYPED_TEST(ReplacementPortedTest, replaceAllLiteral) { + TypeParam re("o"); + ASSERT_TRUE(re.ok()) << re.error(); + std::string s = "foo bar"; + int n = TypeParam::GlobalReplace(&s, re, "0"); + EXPECT_EQ(2, n); + EXPECT_EQ("f00 bar", s); +} + +// replaceAll: numbered group refs ($1). +TYPED_TEST(ReplacementPortedTest, replaceAllNumberedGroup) { + TypeParam re("(\\d+)"); + std::string s = "abc 42 xyz 7"; + int n = TypeParam::GlobalReplace(&s, re, "<$1>"); + EXPECT_EQ(2, n); + EXPECT_EQ("abc <42> xyz <7>", s); +} + +// replaceAll: numbered group refs $0 (whole match). +TYPED_TEST(ReplacementPortedTest, replaceAllZeroGroupRef) { + TypeParam re("\\d+"); + std::string s = "a 1 b 2"; + int n = TypeParam::GlobalReplace(&s, re, "[$0]"); + EXPECT_EQ(2, n); + EXPECT_EQ("a [1] b [2]", s); +} + +// replaceAll: named group ${name}. +TYPED_TEST(ReplacementPortedTest, replaceAllNamedGroup) { + TypeParam re("(?\\d+)"); + ASSERT_TRUE(re.ok()) << re.error(); + std::string s = "a 1 b 22"; + int n = TypeParam::GlobalReplace(&s, re, "[${digit}]"); + EXPECT_EQ(2, n); + EXPECT_EQ("a [1] b [22]", s); +} + +// replaceAll: dollar-sign literally via backslash escape. +TYPED_TEST(ReplacementPortedTest, replaceAllEscapedDollar) { + TypeParam re("x"); + ASSERT_TRUE(re.ok()) << re.error(); + std::string s = "x x"; + int n = TypeParam::GlobalReplace(&s, re, "\\$"); + EXPECT_EQ(2, n); + EXPECT_EQ("$ $", s); +} + +// replaceAll: backslash literally via double-backslash. +TYPED_TEST(ReplacementPortedTest, replaceAllEscapedBackslash) { + TypeParam re("x"); + ASSERT_TRUE(re.ok()) << re.error(); + std::string s = "x"; + // In C++ string literal, "\\\\" is the two-char string `\\` which Java sees + // as escaped backslash → single literal '\'. + int n = TypeParam::GlobalReplace(&s, re, "\\\\"); + EXPECT_EQ(1, n); + EXPECT_EQ("\\", s); +} + +// replaceAll: zero-match (pattern doesn't match) leaves input unchanged. 
+TYPED_TEST(ReplacementPortedTest, replaceAllNoMatchKeepsInput) { + TypeParam re("z+"); + std::string s = "hello"; + int n = TypeParam::GlobalReplace(&s, re, "X"); + EXPECT_EQ(0, n); + EXPECT_EQ("hello", s); +} + +// replaceAll across multiple groups in replacement. +TYPED_TEST(ReplacementPortedTest, replaceAllMultiGroupCombination) { + TypeParam re("(\\w+) (\\w+)"); + ASSERT_TRUE(re.ok()) << re.error(); + std::string s = "hello world"; + int n = TypeParam::GlobalReplace(&s, re, "$2 $1"); + EXPECT_EQ(1, n); + EXPECT_EQ("world hello", s); +} + +// replaceFirst: only the first match is replaced (via Adapter). +TYPED_TEST(ReplacementPortedTest, replaceFirstOnlyFirst) { + TypeParam re("\\d+"); + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, "a 1 b 2 c 3"); + std::string out = m.replaceFirst("X"); + EXPECT_EQ("a X b 2 c 3", out); +} + +// replaceFirst with group reference. +TYPED_TEST(ReplacementPortedTest, replaceFirstWithGroupRef) { + TypeParam re("(\\d+)"); + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, "a 1 b 22 c 333"); + std::string out = m.replaceFirst("[$1]"); + EXPECT_EQ("a [1] b 22 c 333", out); +} + +// replaceAll with an empty replacement string deletes every match (pcre4j edge case). +TYPED_TEST(ReplacementPortedTest, replaceAllEmptyReplacement) { + TypeParam re("\\d+"); + std::string s = "a 1 b 22"; + int n = TypeParam::GlobalReplace(&s, re, ""); + EXPECT_EQ(2, n); + EXPECT_EQ("a b ", s); +} + +// ============== Newly ported from pcre4j MatcherReplacementTests ============== + +// pcre4j quoteReplacement(...)
+TYPED_TEST(ReplacementPortedTest, quoteReplacementBasic) { + EXPECT_EQ("hello", JavaMatcherAdapter::quoteReplacement("hello")); +} +TYPED_TEST(ReplacementPortedTest, quoteReplacementBackslash) { + EXPECT_EQ("hello\\\\world", + JavaMatcherAdapter::quoteReplacement("hello\\world")); +} +TYPED_TEST(ReplacementPortedTest, quoteReplacementDollar) { + EXPECT_EQ("price: \\$100", + JavaMatcherAdapter::quoteReplacement("price: $100")); +} +TYPED_TEST(ReplacementPortedTest, quoteReplacementBoth) { + EXPECT_EQ("\\$100 \\\\ \\$200", + JavaMatcherAdapter::quoteReplacement("$100 \\ $200")); +} +TYPED_TEST(ReplacementPortedTest, quoteReplacementEmpty) { + EXPECT_EQ("", JavaMatcherAdapter::quoteReplacement("")); +} + +// pcre4j replaceAllBasic +TYPED_TEST(ReplacementPortedTest, replaceAllBasic) { + TypeParam re("world"); + std::string s = "hello world"; + TypeParam::GlobalReplace(&s, re, "universe"); + EXPECT_EQ("hello universe", s); +} + +// pcre4j replaceAllMultiple +TYPED_TEST(ReplacementPortedTest, replaceAllMultiple) { + TypeParam re("o"); + std::string s = "hello world"; + int n = TypeParam::GlobalReplace(&s, re, "0"); + EXPECT_EQ(2, n); + EXPECT_EQ("hell0 w0rld", s); +} + +// pcre4j replaceAllWithGroupReference (covered by replaceAllNumberedGroup +// already, but we mirror pcre4j name) +TYPED_TEST(ReplacementPortedTest, replaceAllWithGroupReference) { + TypeParam re("(\\d+)"); + std::string s = "value: 42"; + TypeParam::GlobalReplace(&s, re, "<$1>"); + EXPECT_EQ("value: <42>", s); +} + +// pcre4j replaceAllWithNamedGroupReference +TYPED_TEST(ReplacementPortedTest, replaceAllWithNamedGroupReferenceBasic) { + TypeParam re("(?\\d+)"); + ASSERT_TRUE(re.ok()) << re.error(); + std::string s = "value: 42"; + TypeParam::GlobalReplace(&s, re, "<${digit}>"); + EXPECT_EQ("value: <42>", s); +} + +// pcre4j replaceAllUnicode +TYPED_TEST(ReplacementPortedTest, replaceAllUnicode) { + TypeParam re("\xf0\x9f\x8c\x90"); // U+1F310 globe + std::string s = "hi \xf0\x9f\x8c\x90 there"; + 
TypeParam::GlobalReplace(&s, re, "\xf0\x9f\x8c\x8d"); // U+1F30D earth + EXPECT_EQ("hi \xf0\x9f\x8c\x8d there", s); +} + +// pcre4j replaceFirstBasic +TYPED_TEST(ReplacementPortedTest, replaceFirstBasic) { + TypeParam re("o"); + JavaMatcherAdapter m(&re, "foo bar"); + EXPECT_EQ("f0o bar", m.replaceFirst("0")); +} + +// pcre4j replaceFirstWithGroupReference +TYPED_TEST(ReplacementPortedTest, replaceFirstWithGroupReferenceMulti) { + TypeParam re("(\\d+)"); + JavaMatcherAdapter m(&re, "a 1 b 22 c"); + EXPECT_EQ("a <1> b 22 c", m.replaceFirst("<$1>")); +} + +// pcre4j replaceFirstNoMatch +TYPED_TEST(ReplacementPortedTest, replaceFirstNoMatch) { + TypeParam re("xyz"); + JavaMatcherAdapter m(&re, "hello world"); + EXPECT_EQ("hello world", m.replaceFirst("ZZZ")); +} + +// pcre4j replaceAllWithFullMatchReference ($0) +TYPED_TEST(ReplacementPortedTest, replaceAllWithFullMatchReference) { + TypeParam re("\\w+"); + std::string s = "hello world"; + TypeParam::GlobalReplace(&s, re, "[$0]"); + EXPECT_EQ("[hello] [world]", s); +} + +// pcre4j replaceAllWithNamedGroupReferenceYearMonth +TYPED_TEST(ReplacementPortedTest, replaceAllWithNamedGroupReferenceYearMonth) { + TypeParam re("(?\\d{4})-(?\\d{2})"); + ASSERT_TRUE(re.ok()) << re.error(); + std::string s = "date: 2024-01, also 2025-12"; + TypeParam::GlobalReplace(&s, re, "${month}/${year}"); + EXPECT_EQ("date: 01/2024, also 12/2025", s); +} + +// pcre4j replaceFirstWithFullMatchReference +TYPED_TEST(ReplacementPortedTest, replaceFirstWithFullMatchReference) { + TypeParam re("\\w+"); + JavaMatcherAdapter m(&re, "hello world"); + EXPECT_EQ("[hello] world", m.replaceFirst("[$0]")); +} + +// pcre4j appendReplacementStringBuilder (basic appendReplacement + appendTail walk) +TYPED_TEST(ReplacementPortedTest, appendReplacementBasic) { + TypeParam re("(\\w+)"); + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, "one two three"); + std::string sb; + while (m.find()) { + m.appendReplacement(sb, "[$1]"); + } + 
m.appendTail(sb); + EXPECT_EQ("[one] [two] [three]", sb); +} + +// pcre4j appendReplacementWithNamedGroup +TYPED_TEST(ReplacementPortedTest, appendReplacementWithNamedGroup) { + TypeParam re("(?\\w+)"); + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, "one two three"); + std::string sb; + while (m.find()) { + m.appendReplacement(sb, "${word}!"); + } + m.appendTail(sb); + EXPECT_EQ("one! two! three!", sb); +} + +// pcre4j appendReplacementEscapedCharacters: replacement "\\$\\\\" → literal "$\" +TYPED_TEST(ReplacementPortedTest, appendReplacementEscapedCharacters) { + TypeParam re("\\d+"); + JavaMatcherAdapter m(&re, "test123value"); + std::string sb; + while (m.find()) { + // C++ literal "\\$\\\\" = 4 chars: \ $ \ \ → in Java replacement + // syntax: \\$ -> literal '$', \\\\ -> literal '\'. Net replacement: "$\". + m.appendReplacement(sb, "\\$\\\\"); + } + m.appendTail(sb); + EXPECT_EQ("test$\\value", sb); +} + +// pcre4j appendReplacementLiteralText +TYPED_TEST(ReplacementPortedTest, appendReplacementLiteralText) { + TypeParam re("world"); + JavaMatcherAdapter m(&re, "hello world!"); + std::string sb; + while (m.find()) { + m.appendReplacement(sb, "universe"); + } + m.appendTail(sb); + EXPECT_EQ("hello universe!", sb); +} + +// pcre4j appendTailOnly: no matches, just appendTail → echoes input. +TYPED_TEST(ReplacementPortedTest, appendTailOnly) { + TypeParam re("xyz"); + JavaMatcherAdapter m(&re, "hello world"); + std::string sb; + // No find() call → no match → appendTail copies entire input. + m.appendTail(sb); + EXPECT_EQ("hello world", sb); +} + +// pcre4j appendReplacementNoMatch: appendReplacement without a successful +// find() throws IllegalStateException in Java; we throw std::logic_error. 
+TYPED_TEST(ReplacementPortedTest, appendReplacementWithoutMatchThrows) { + TypeParam re("\\d+"); + JavaMatcherAdapter m(&re, "hello world"); + std::string sb; + EXPECT_THROW(m.appendReplacement(sb, "test"), std::logic_error); +} + +// pcre4j appendReplacementMultipleGroups: "$3$2$1" reverses 3 chars. +TYPED_TEST(ReplacementPortedTest, appendReplacementMultipleGroups) { + TypeParam re("(\\w)(\\w)(\\w)"); + JavaMatcherAdapter m(&re, "abc def ghi"); + std::string sb; + while (m.find()) { + m.appendReplacement(sb, "$3$2$1"); + } + m.appendTail(sb); + EXPECT_EQ("cba fed ihg", sb); +} + +// pcre4j appendReplacementGroupZero +TYPED_TEST(ReplacementPortedTest, appendReplacementGroupZero) { + TypeParam re("\\w+"); + JavaMatcherAdapter m(&re, "hello world"); + std::string sb; + while (m.find()) { + m.appendReplacement(sb, "[$0]"); + } + m.appendTail(sb); + EXPECT_EQ("[hello] [world]", sb); +} + +// pcre4j appendReplacementUnicode: 4-byte UTF-8 needle / 4-byte UTF-8 repl. +TYPED_TEST(ReplacementPortedTest, appendReplacementUnicode) { + TypeParam re("\xf0\x9f\x8c\x90"); // U+1F310 globe + JavaMatcherAdapter m(&re, "hi \xf0\x9f\x8c\x90 there"); + std::string sb; + while (m.find()) { + m.appendReplacement(sb, "\xf0\x9f\x8c\x8d"); // U+1F30D earth + } + m.appendTail(sb); + EXPECT_EQ("hi \xf0\x9f\x8c\x8d there", sb); +} + +// pcre4j appendReplacementWithEscapedDollarSign: replacement "\\$5" → +// literal "$5" (not group 5). 
+TYPED_TEST(ReplacementPortedTest, appendReplacementWithEscapedDollarSign) { + TypeParam re("\\d+"); + JavaMatcherAdapter m(&re, "price: 100"); + std::string sb; + while (m.find()) { + m.appendReplacement(sb, "\\$5"); + } + m.appendTail(sb); + EXPECT_EQ("price: $5", sb); +} + +// pcre4j appendReplacementBackslashEscapesNextChar: \\X → X literal +TYPED_TEST(ReplacementPortedTest, appendReplacementBackslashEscapesNextChar) { + TypeParam re("x"); + JavaMatcherAdapter m(&re, "x"); + std::string sb; + ASSERT_TRUE(m.find()); + m.appendReplacement(sb, "\\$\\\\\\?"); + m.appendTail(sb); + // Java: \\$ → '$', \\\\ → '\', \\? → '?'. Net: "$\?" + EXPECT_EQ("$\\?", sb); +} + +} // namespace +} // namespace facebook::velox::regex_compat::test diff --git a/velox/external/regex_compat/tests/MatcherResultsPortedTest.cpp b/velox/external/regex_compat/tests/MatcherResultsPortedTest.cpp new file mode 100644 index 00000000000..dbb07b51c56 --- /dev/null +++ b/velox/external/regex_compat/tests/MatcherResultsPortedTest.cpp @@ -0,0 +1,201 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// +// Cases ported from pcre4j's `MatcherResultsTests.java`. Java's +// `Matcher.results()` returns a Stream; we model it as a +// find()-loop that snapshots (start, end, group) per match. Cases that +// depend purely on Java's stream API (sum reductions, etc.) are skipped. 
+// + +#include "velox/external/regex_compat/tests/BackendTestBase.h" +#include "velox/external/regex_compat/tests/JavaMatcherAdapter.h" + +#include +#include + +#include +#include +#include +#include + +namespace facebook::velox::regex_compat::test { +namespace { + +template +using ResultsPortedTest = BackendTest; +TYPED_TEST_SUITE(ResultsPortedTest, AllBackends); + +// Snapshot tuple (start, end, group(0)) for each match found. +template +std::vector> snapshotAll( + JavaMatcherAdapter& m) { + std::vector> out; + while (m.find()) { + out.emplace_back(m.start(), m.end(), std::string(m.group(0).value())); + } + return out; +} + +TYPED_TEST(ResultsPortedTest, resultsBasic) { + TypeParam re("\\d+"); + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, "a1b22c333d"); + auto r = snapshotAll(m); + ASSERT_EQ(3u, r.size()); + EXPECT_EQ(std::make_tuple(1, 2, std::string("1")), r[0]); + EXPECT_EQ(std::make_tuple(3, 5, std::string("22")), r[1]); + EXPECT_EQ(std::make_tuple(6, 9, std::string("333")), r[2]); +} + +TYPED_TEST(ResultsPortedTest, resultsNoMatches) { + TypeParam re("xyz"); + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, "hello world"); + EXPECT_TRUE(snapshotAll(m).empty()); +} + +TYPED_TEST(ResultsPortedTest, resultsSingleMatch) { + TypeParam re("world"); + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, "hello world!"); + auto r = snapshotAll(m); + ASSERT_EQ(1u, r.size()); + EXPECT_EQ(std::make_tuple(6, 11, std::string("world")), r[0]); +} + +TYPED_TEST(ResultsPortedTest, resultsWithGroups) { + TypeParam re("(\\w)(\\d)"); + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, "a1 b2 c3"); + std::vector> r; + while (m.find()) { + r.emplace_back( + std::string(m.group(0).value()), + std::string(m.group(1).value()), + std::string(m.group(2).value())); + } + ASSERT_EQ(3u, r.size()); + EXPECT_EQ(std::make_tuple("a1", "a", "1"), r[0]); + EXPECT_EQ(std::make_tuple("b2", "b", "2"), r[1]); + 
EXPECT_EQ(std::make_tuple("c3", "c", "3"), r[2]); +} + +// Snapshots are independent: collecting first must not perturb later reads. +TYPED_TEST(ResultsPortedTest, resultsImmutableSnapshots) { + TypeParam re("\\w+"); + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, "one two three"); + auto r = snapshotAll(m); + ASSERT_EQ(3u, r.size()); + EXPECT_EQ(std::make_tuple(0, 3, std::string("one")), r[0]); + EXPECT_EQ(std::make_tuple(4, 7, std::string("two")), r[1]); + EXPECT_EQ(std::make_tuple(8, 13, std::string("three")), r[2]); +} + +// Zero-width matches via positive lookahead — RE2 lacks lookaround. +TYPED_TEST(ResultsPortedTest, resultsZeroWidthMatches) { + if constexpr (std::is_same_v) { + GTEST_SKIP() << "RE2 does not support lookahead"; + } + TypeParam re("(?=\\d)"); + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, "a1b2c3"); + auto r = snapshotAll(m); + ASSERT_EQ(3u, r.size()); + for (auto& [s, e, g] : r) { + EXPECT_EQ(s, e); + EXPECT_EQ("", g); + } + EXPECT_EQ(1, std::get<0>(r[0])); + EXPECT_EQ(3, std::get<0>(r[1])); + EXPECT_EQ(5, std::get<0>(r[2])); +} + +TYPED_TEST(ResultsPortedTest, resultsEmptyString) { + TypeParam re(".*"); + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, ""); + auto r = snapshotAll(m); + ASSERT_EQ(1u, r.size()); + EXPECT_EQ(std::make_tuple(0, 0, std::string("")), r[0]); +} + +// \p{L}+ over Cyrillic "мир" and CJK "世界" — Unicode property class. +TYPED_TEST(ResultsPortedTest, resultsUnicode) { + TypeParam re("\\p{L}+"); + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m( + &re, "hello \xD0\xBC\xD0\xB8\xD1\x80 \xE4\xB8\x96\xE7\x95\x8C"); + std::vector groups; + while (m.find()) { + groups.emplace_back(m.group(0).value()); + } + EXPECT_THAT( + groups, + ::testing::ElementsAre( + "hello", "\xD0\xBC\xD0\xB8\xD1\x80", "\xE4\xB8\x96\xE7\x95\x8C")); +} + +// After find() once, continuing iteration yields the remainder only. 
+TYPED_TEST(ResultsPortedTest, resultsDoesNotReset) { + TypeParam re("\\w+"); + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, "one two three"); + ASSERT_TRUE(m.find()); + EXPECT_EQ("one", m.group(0).value()); + auto rest = snapshotAll(m); + ASSERT_EQ(2u, rest.size()); + EXPECT_EQ("two", std::get<2>(rest[0])); + EXPECT_EQ("three", std::get<2>(rest[1])); +} + +// After reset() we re-iterate from the beginning. +TYPED_TEST(ResultsPortedTest, resultsAfterReset) { + TypeParam re("\\w+"); + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, "one two three"); + ASSERT_TRUE(m.find()); + m.reset(); + auto r = snapshotAll(m); + ASSERT_EQ(3u, r.size()); + EXPECT_EQ("one", std::get<2>(r[0])); + EXPECT_EQ("two", std::get<2>(r[1])); + EXPECT_EQ("three", std::get<2>(r[2])); +} + +TYPED_TEST(ResultsPortedTest, resultsWithNamedGroups) { + TypeParam re("(?\\w+)"); + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, "hello world"); + std::vector> r; + while (m.find()) { + r.emplace_back( + std::string(m.group(0).value()), std::string(m.group(1).value())); + } + ASSERT_EQ(2u, r.size()); + EXPECT_EQ(std::make_pair(std::string("hello"), std::string("hello")), r[0]); + EXPECT_EQ(std::make_pair(std::string("world"), std::string("world")), r[1]); +} + +TYPED_TEST(ResultsPortedTest, resultsCount) { + TypeParam re("a"); + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, "abracadabra"); + EXPECT_EQ(5u, snapshotAll(m).size()); +} + +} // namespace +} // namespace facebook::velox::regex_compat::test diff --git a/velox/external/regex_compat/tests/MatcherUnicodePortedTest.cpp b/velox/external/regex_compat/tests/MatcherUnicodePortedTest.cpp new file mode 100644 index 00000000000..b4592d47d53 --- /dev/null +++ b/velox/external/regex_compat/tests/MatcherUnicodePortedTest.cpp @@ -0,0 +1,105 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// +// Cases ported from pcre4j's `MatcherUnicodeTests.java`. All offsets are +// translated from Java UTF-16 char offsets (used by pcre4j) to UTF-8 byte +// offsets (used by our backends). +// +// Å U+00C5 2 UTF-8 bytes (C3 85) +// Ǎ U+01CD 2 UTF-8 bytes (C7 8D) +// • U+2022 3 UTF-8 bytes (E2 80 A2) +// 🌍 U+1F30D 4 UTF-8 bytes (F0 9F 8C 8D) +// ! 1 UTF-8 byte +// + +#include "velox/external/regex_compat/tests/BackendTestBase.h" +#include "velox/external/regex_compat/tests/JavaMatcherAdapter.h" + +#include +#include + +namespace facebook::velox::regex_compat::test { +namespace { + +template +using UnicodePortedTest = BackendTest; +TYPED_TEST_SUITE(UnicodePortedTest, AllBackends); + +TYPED_TEST(UnicodePortedTest, unicodeOneByte) { + TypeParam re("\xC3\x85"); // Å + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, "\xC3\x85"); + EXPECT_TRUE(m.matches()); + EXPECT_EQ(0, m.start()); + EXPECT_EQ(2, m.end()); +} + +TYPED_TEST(UnicodePortedTest, unicodeTwoBytes) { + TypeParam re("\xC7\x8D"); // Ǎ + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, "\xC7\x8D"); + EXPECT_TRUE(m.matches()); + EXPECT_EQ(0, m.start()); + EXPECT_EQ(2, m.end()); +} + +TYPED_TEST(UnicodePortedTest, unicodeThreeBytes) { + TypeParam re("\xE2\x80\xA2"); // • + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, "\xE2\x80\xA2"); + EXPECT_TRUE(m.matches()); + EXPECT_EQ(0, m.start()); + EXPECT_EQ(3, 
m.end()); +} + +TYPED_TEST(UnicodePortedTest, unicodeFourBytes) { + TypeParam re("\xF0\x9F\x8C\x8D"); // 🌍 U+1F30D + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, "\xF0\x9F\x8C\x8D"); + EXPECT_TRUE(m.matches()); + EXPECT_EQ(0, m.start()); + EXPECT_EQ(4, m.end()); +} + +TYPED_TEST(UnicodePortedTest, unicode) { + // ÅǍ•🌍! + const char* both = "\xC3\x85\xC7\x8D\xE2\x80\xA2\xF0\x9F\x8C\x8D!"; + TypeParam re(both); + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, both); + EXPECT_TRUE(m.matches()); + EXPECT_EQ(0, m.start()); + EXPECT_EQ(12, m.end()); + EXPECT_EQ(both, m.group(0).value()); +} + +// region() in Java uses UTF-16 char offsets; the original test calls +// region(3, 5) to bracket the surrogate pair for 🌍. In our UTF-8 world +// that's byte range [7, 11). We rely on JavaRegex's adapter doing the +// UTF-16/UTF-8 conversion internally and pass byte offsets to RE2/PCRE2. +TYPED_TEST(UnicodePortedTest, unicodeRegion) { + const char* input = "\xC3\x85\xC7\x8D\xE2\x80\xA2\xF0\x9F\x8C\x8D!"; + TypeParam re("\xF0\x9F\x8C\x8D"); // 🌍 + ASSERT_TRUE(re.ok()) << re.error(); + JavaMatcherAdapter m(&re, input); + m.region(7, 11); + EXPECT_TRUE(m.matches()); + EXPECT_EQ(7, m.start()); + EXPECT_EQ(11, m.end()); +} + +} // namespace +} // namespace facebook::velox::regex_compat::test diff --git a/velox/external/regex_compat/tests/OpenJdkCorpusDiffTest.cpp b/velox/external/regex_compat/tests/OpenJdkCorpusDiffTest.cpp new file mode 100644 index 00000000000..3460525b44b --- /dev/null +++ b/velox/external/regex_compat/tests/OpenJdkCorpusDiffTest.cpp @@ -0,0 +1,643 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// +// Runs the OpenJDK 17 `java/util/regex` corpus files listed in kCorpusFiles +// (TestCases.txt: ~299 cases) against each backend and reports pass rates. +// +// File format (per OpenJDK header): +// line 1: pattern +// line 2: input +// line 3: "true MATCH GROUPCOUNT GROUP...", "false GROUPCOUNT", +// or a line starting with "error" when compilation is expected to fail. +// Empty lines and `//` comments are skipped. +// +// The corpus is fetched at CMake configure time and its directory is injected +// via OPENJDK_CORPUS_DIR. +// + +#include "velox/external/regex_compat/tests/BackendTestBase.h" +#include "velox/external/regex_compat/tests/JavaMatcherAdapter.h" +#if VELOX_REGEX_COMPAT_HAS_JAVA +#include "velox/external/regex_compat/JvmFixture.h" +#endif + +#include + +#include +#include +#include +#include +#include + +#ifndef OPENJDK_CORPUS_DIR +#error "OPENJDK_CORPUS_DIR must be defined by the build system" +#endif + +namespace facebook::velox::regex_compat::test { +namespace { + +static const char* const kCorpusFiles[] = { + "TestCases.txt", + "BMPTestCases.txt", + "SupplementaryTestCases.txt", +}; + +struct CorpusCase { + std::string pattern; + std::string input; + std::string expectedResult; // verbatim "true ..." / "false 0" / "error" +}; + +// Mirror OpenJDK 17 RegExTest.grabLine: handles only `\n` (→ U+000A) and +// `\uXXXX` (→ that code point); everything else passes through verbatim. 
+// Surrogate-pair `\uD8##\uDC##` sequences are combined into the proper +// supplementary code point so that we end up with a valid UTF-8 4-byte +// encoding (which both RE2/PCRE2 require and our Java JNI bridge +// re-splits to a surrogate pair). +static std::string processEscapes(const std::string& s) { + std::string out; + out.reserve(s.size()); + for (std::size_t i = 0; i < s.size();) { + if (s[i] == '\\' && i + 1 < s.size() && s[i + 1] == 'n') { + out.push_back('\n'); + i += 2; + continue; + } + if (s[i] == '\\' && i + 5 < s.size() && s[i + 1] == 'u') { + std::uint32_t cp = 0; + bool ok = true; + for (int k = 0; k < 4; ++k) { + char c = s[i + 2 + k]; + cp <<= 4; + if (c >= '0' && c <= '9') cp |= (c - '0'); + else if (c >= 'a' && c <= 'f') cp |= (c - 'a' + 10); + else if (c >= 'A' && c <= 'F') cp |= (c - 'A' + 10); + else { ok = false; break; } + } + if (!ok) { + out.push_back(s[i++]); + continue; + } + i += 6; + // Combine surrogate pair if a low surrogate follows. + if (cp >= 0xD800 && cp <= 0xDBFF && i + 5 < s.size() && s[i] == '\\' + && s[i + 1] == 'u') { + std::uint32_t lo = 0; + bool ok2 = true; + for (int k = 0; k < 4; ++k) { + char c = s[i + 2 + k]; + lo <<= 4; + if (c >= '0' && c <= '9') lo |= (c - '0'); + else if (c >= 'a' && c <= 'f') lo |= (c - 'a' + 10); + else if (c >= 'A' && c <= 'F') lo |= (c - 'A' + 10); + else { ok2 = false; break; } + } + if (ok2 && lo >= 0xDC00 && lo <= 0xDFFF) { + cp = 0x10000 + (((cp - 0xD800) << 10) | (lo - 0xDC00)); + i += 6; + } + } + if (cp < 0x80) { + out.push_back(static_cast(cp)); + } else if (cp < 0x800) { + out.push_back(static_cast(0xC0 | (cp >> 6))); + out.push_back(static_cast(0x80 | (cp & 0x3F))); + } else if (cp < 0x10000) { + out.push_back(static_cast(0xE0 | (cp >> 12))); + out.push_back(static_cast(0x80 | ((cp >> 6) & 0x3F))); + out.push_back(static_cast(0x80 | (cp & 0x3F))); + } else { + out.push_back(static_cast(0xF0 | (cp >> 18))); + out.push_back(static_cast(0x80 | ((cp >> 12) & 0x3F))); + 
out.push_back(static_cast(0x80 | ((cp >> 6) & 0x3F))); + out.push_back(static_cast(0x80 | (cp & 0x3F))); + } + continue; + } + out.push_back(s[i++]); + } + return out; +} + +static std::string utf8(std::uint32_t cp) { + std::string out; + if (cp < 0x80) { + out.push_back(static_cast(cp)); + } else if (cp < 0x800) { + out.push_back(static_cast(0xC0 | (cp >> 6))); + out.push_back(static_cast(0x80 | (cp & 0x3F))); + } else if (cp < 0x10000) { + out.push_back(static_cast(0xE0 | (cp >> 12))); + out.push_back(static_cast(0x80 | ((cp >> 6) & 0x3F))); + out.push_back(static_cast(0x80 | (cp & 0x3F))); + } else { + out.push_back(static_cast(0xF0 | (cp >> 18))); + out.push_back(static_cast(0x80 | ((cp >> 12) & 0x3F))); + out.push_back(static_cast(0x80 | ((cp >> 6) & 0x3F))); + out.push_back(static_cast(0x80 | (cp & 0x3F))); + } + return out; +} + +static std::uint32_t parseHexCodePoint(std::string_view token) { + std::uint32_t cp = 0; + for (char c : token) { + cp <<= 4; + if (c >= '0' && c <= '9') { + cp |= (c - '0'); + } else if (c >= 'a' && c <= 'f') { + cp |= (c - 'a' + 10); + } else if (c >= 'A' && c <= 'F') { + cp |= (c - 'A' + 10); + } else { + throw std::invalid_argument("bad hex code point"); + } + } + return cp; +} + +#if VELOX_REGEX_COMPAT_HAS_JAVA +static jstring toJString(JNIEnv* env, std::string_view sv) { + std::vector u16; + u16.reserve(sv.size()); + for (std::size_t i = 0; i < sv.size();) { + const unsigned char c = static_cast(sv[i]); + std::uint32_t cp = 0; + std::size_t step = 1; + if (c < 0x80) { + cp = c; + } else if (c < 0xC0) { + cp = 0xFFFD; + } else if (c < 0xE0 && i + 1 < sv.size()) { + cp = ((c & 0x1F) << 6) | + (static_cast(sv[i + 1]) & 0x3F); + step = 2; + } else if (c < 0xF0 && i + 2 < sv.size()) { + cp = ((c & 0x0F) << 12) | + ((static_cast(sv[i + 1]) & 0x3F) << 6) | + (static_cast(sv[i + 2]) & 0x3F); + step = 3; + } else if (i + 3 < sv.size()) { + cp = ((c & 0x07) << 18) | + ((static_cast(sv[i + 1]) & 0x3F) << 12) | + ((static_cast(sv[i + 2]) 
& 0x3F) << 6) | + (static_cast(sv[i + 3]) & 0x3F); + step = 4; + } else { + cp = 0xFFFD; + } + if (cp <= 0xFFFF) { + u16.push_back(static_cast(cp)); + } else { + cp -= 0x10000; + u16.push_back(static_cast(0xD800 | (cp >> 10))); + u16.push_back(static_cast(0xDC00 | (cp & 0x3FF))); + } + i += step; + } + return env->NewString(u16.data(), static_cast(u16.size())); +} + +static std::size_t javaCharOffsetToByteOffset( + std::string_view utf8, + int javaCharOffset) { + int chars = 0; + for (std::size_t i = 0; i < utf8.size();) { + if (chars == javaCharOffset) { + return i; + } + const unsigned char c = static_cast(utf8[i]); + if (c < 0x80) { + i += 1; + chars += 1; + } else if (c < 0xE0) { + i += 2; + chars += 1; + } else if (c < 0xF0) { + i += 3; + chars += 1; + } else { + i += 4; + chars += 2; + } + } + return chars == javaCharOffset ? utf8.size() : std::string_view::npos; +} + +static std::vector directJavaGraphemeBreakOffsets(std::string_view input) { + auto* env = JvmFixture::instance().env(); + jclass patternCls = env->FindClass("java/util/regex/Pattern"); + jclass matcherCls = env->FindClass("java/util/regex/Matcher"); + jmethodID compile = env->GetStaticMethodID( + patternCls, + "compile", + "(Ljava/lang/String;)Ljava/util/regex/Pattern;"); + jmethodID matcher = env->GetMethodID( + patternCls, + "matcher", + "(Ljava/lang/CharSequence;)Ljava/util/regex/Matcher;"); + jmethodID find = env->GetMethodID(matcherCls, "find", "()Z"); + jmethodID start = env->GetMethodID(matcherCls, "start", "()I"); + + jstring pat = toJString(env, "\\b{g}"); + jobject pattern = env->CallStaticObjectMethod(patternCls, compile, pat); + env->DeleteLocalRef(pat); + jstring subject = toJString(env, input); + jobject m = env->CallObjectMethod(pattern, matcher, subject); + env->DeleteLocalRef(subject); + + std::vector offsets; + while (env->CallBooleanMethod(m, find)) { + const jint charOffset = env->CallIntMethod(m, start); + const auto byteOffset = javaCharOffsetToByteOffset(input, 
charOffset); + if (byteOffset != std::string_view::npos) { + offsets.push_back(static_cast(byteOffset)); + } + } + env->DeleteLocalRef(m); + env->DeleteLocalRef(pattern); + env->DeleteLocalRef(matcherCls); + env->DeleteLocalRef(patternCls); + return offsets; +} +#endif // VELOX_REGEX_COMPAT_HAS_JAVA + +struct GraphemeCase { + std::string input; + std::vector expectedBreakOffsets; +}; + +static std::vector loadGraphemeCorpus(const std::string& path) { + std::ifstream in(path); + if (!in) { + return {}; + } + std::vector cases; + std::string line; + while (std::getline(in, line)) { + const auto hash = line.find('#'); + if (hash != std::string::npos) { + line.resize(hash); + } + std::istringstream tokens(line); + std::string token; + GraphemeCase c; + bool sawToken = false; + while (tokens >> token) { + sawToken = true; + if (token == "\xC3\xB7") { + c.expectedBreakOffsets.push_back(static_cast(c.input.size())); + } else if (token == "\xC3\x97") { + continue; + } else { + c.input += utf8(parseHexCodePoint(token)); + } + } + if (sawToken) { + cases.push_back(std::move(c)); + } + } + return cases; +} + +// OpenJDK format uses spaces both as field separators and inside captured +// group text. We don't need to split — the OpenJDK runner emits the +// expected line via plain StringBuilder concatenation; we rebuild the +// actual result the same way and compare strings. + +static std::vector loadCorpus(const std::string& path) { + std::ifstream in(path); + if (!in) { + return {}; + } + // Replicate OpenJDK's grabLine: skip blank and `//` lines. 
+ auto grab = [&](std::string& out) -> bool { + while (std::getline(in, out)) { + if (out.empty()) continue; + if (out.size() >= 2 && out[0] == '/' && out[1] == '/') continue; + return true; + } + return false; + }; + std::vector cases; + std::string pattern, input, expected; + while (grab(pattern) && grab(input) && grab(expected)) { + CorpusCase c; + c.pattern = processEscapes(pattern); + c.input = processEscapes(input); + c.expectedResult = processEscapes(expected); + cases.push_back(std::move(c)); + } + return cases; +} + +// Per-(backend, file) tally — keyed by "backend|file". +struct CorpusStats { + int passed = 0; + int failed = 0; + int compileErrors = 0; + // Subset of `compileErrors` whose root cause is the translator rejecting + // the pattern as untranslatable for the engine (e.g. RE2 lookaround / + // backref / possessive). These are engine-feature-impossible, NOT bugs + // in our translator; surfaced separately so we can report a rate that + // excludes them ("translatable-subset rate"). + int translatorRejected = 0; +}; + +std::map& allStats() { + static std::map s; + return s; +} + +// Tear-down printer. Registered as a global Environment so it runs after +// the typed tests. +class CorpusReporter : public ::testing::Environment { + public: + void TearDown() override { + auto& m = allStats(); + if (m.empty()) { + return; + } + std::fprintf(stderr, "\n"); + std::fprintf(stderr, "========== OpenJDK corpus compat rate ==========\n"); + // Aggregate per backend across all files; also print per-file. + std::map agg; + for (const auto& [key, st] : m) { + auto bar = key.find('|'); + std::string backend = key.substr(0, bar); + auto& a = agg[backend]; + a.passed += st.passed; + a.failed += st.failed; + a.compileErrors += st.compileErrors; + a.translatorRejected += st.translatorRejected; + } + for (const auto& [key, st] : m) { + int total = st.passed + st.failed + st.compileErrors; + double pct = total > 0 ? 
100.0 * st.passed / total : 0.0; + std::fprintf( + stderr, + " %-50s %4d / %4d (%.2f%%) [compile-err: %d]\n", + key.c_str(), + st.passed, + total, + pct, + st.compileErrors); + } + std::fprintf(stderr, " ---- aggregate ----\n"); + for (const auto& [name, st] : agg) { + int total = st.passed + st.failed + st.compileErrors; + double pct = total > 0 ? 100.0 * st.passed / total : 0.0; + std::fprintf( + stderr, + " %-50s %4d / %4d (%.2f%%) [compile-err: %d]\n", + name.c_str(), + st.passed, + total, + pct, + st.compileErrors); + // Also report a "translatable subset" rate that excludes patterns + // the translator rejected as engine-impossible (e.g. RE2 lookaround + // or backref). This isolates what's actually attributable to the + // translator/backend vs to engine ceilings. + if (st.translatorRejected > 0) { + const int subsetTotal = total - st.translatorRejected; + const double subsetPct = + subsetTotal > 0 ? 100.0 * st.passed / subsetTotal : 0.0; + std::fprintf( + stderr, + " %-50s %4d / %4d (%.2f%%) [excludes %d translator-rejected]\n", + (name + " (translatable subset)").c_str(), + st.passed, + subsetTotal, + subsetPct, + st.translatorRejected); + } + } + std::fprintf(stderr, "================================================\n"); + } +}; + +// Register the reporter exactly once. 
+[[maybe_unused]] static auto* kReporter = + ::testing::AddGlobalTestEnvironment(new CorpusReporter); + +template +const char* backendName() { + if constexpr (std::is_same_v) { + return "Re2"; + } else if constexpr (std::is_same_v) { + return "Pcre2"; + } else { + return "Java"; + } +} + +template +using OpenJdkCorpusDiffTest = BackendTest; +TYPED_TEST_SUITE(OpenJdkCorpusDiffTest, AllBackends); + +TYPED_TEST(OpenJdkCorpusDiffTest, runCorpus) { + const std::string backend = backendName(); + int totalCases = 0; + int totalJavaFailures = 0; + for (const char* fname : kCorpusFiles) { + std::string path = std::string(OPENJDK_CORPUS_DIR) + "/" + fname; + std::vector kCorpus = loadCorpus(path); + ASSERT_FALSE(kCorpus.empty()) << "Corpus is empty — failed to load " << path; + totalCases += static_cast(kCorpus.size()); + + const std::string key = backend + "|" + fname; + auto& st = allStats()[key]; + + for (const auto& c : kCorpus) { + TypeParam re(c.pattern); + if (!re.ok()) { + if (c.expectedResult.rfind("error", 0) == 0) { + ++st.passed; + } else { + ++st.compileErrors; + if (re.error().find("translator: ") != std::string::npos) { + ++st.translatorRejected; + } +#if VELOX_REGEX_COMPAT_HAS_JAVA + if constexpr (std::is_same_v) { + ++totalJavaFailures; + std::fprintf( + stderr, + "[OpenJDK %s] Java compile-err: pattern=[%s] err=[%s]\n", + fname, + c.pattern.c_str(), + re.error().c_str()); + } +#endif + } + continue; + } + JavaMatcherAdapter m(&re, c.input); + const bool found = m.find(); + std::string actual; + if (found) { + actual.append("true "); + actual.append(std::string(m.group(0).value())); + actual.push_back(' '); + actual.append(std::to_string(m.groupCount())); + for (int i = 1; i <= m.groupCount(); ++i) { + auto gi = m.group(i); + if (gi) { + actual.push_back(' '); + actual.append(std::string(*gi)); + } + } + } else { + actual.append("false "); + actual.append(std::to_string(m.groupCount())); + } + if (actual == c.expectedResult) { + ++st.passed; + } else { + 
++st.failed; +#if VELOX_REGEX_COMPAT_HAS_JAVA + if constexpr (std::is_same_v) { + ++totalJavaFailures; + std::fprintf( + stderr, + "[OpenJDK %s] Java mismatch:\n pattern=[%s]\n input=[%s]\n expected=[%s]\n actual= [%s]\n", + fname, + c.pattern.c_str(), + c.input.c_str(), + c.expectedResult.c_str(), + actual.c_str()); + } +#endif + } + } + } + +#if VELOX_REGEX_COMPAT_HAS_JAVA + if constexpr (std::is_same_v) { + EXPECT_EQ(0, totalJavaFailures) + << "Java backend should match every case across all OpenJDK corpus files"; + } +#endif + EXPECT_GT(totalCases, 0); +} + +struct GraphemeStats { + int passed = 0; + int failed = 0; + int compileErrors = 0; +}; + +std::map& graphemeStats() { + static std::map s; + return s; +} + +class GraphemeReporter : public ::testing::Environment { + public: + void TearDown() override { + auto& m = graphemeStats(); + if (m.empty()) { + return; + } + std::fprintf(stderr, "\n"); + std::fprintf(stderr, "========== OpenJDK grapheme corpus compat rate ==========\n"); + for (const auto& [backend, st] : m) { + const int total = st.passed + st.failed + st.compileErrors; + const double pct = total > 0 ? 
100.0 * st.passed / total : 0.0; + std::fprintf( + stderr, + " %-8s %4d / %4d (%.2f%%) [compile-err: %d]\n", + backend.c_str(), + st.passed, + total, + pct, + st.compileErrors); + } + std::fprintf(stderr, "=========================================================\n"); + } +}; + +[[maybe_unused]] static auto* kGraphemeReporter = + ::testing::AddGlobalTestEnvironment(new GraphemeReporter); + +template +using GraphemeCorpusTest = BackendTest; +TYPED_TEST_SUITE(GraphemeCorpusTest, AllBackends); + +TYPED_TEST(GraphemeCorpusTest, runGraphemeBreakCorpus) { + const std::string path = + std::string(OPENJDK_CORPUS_DIR) + "/GraphemeTestCases.txt"; + const auto cases = loadGraphemeCorpus(path); + ASSERT_FALSE(cases.empty()) << "Corpus is empty — failed to load " << path; + + int javaFailures = 0; + auto& st = graphemeStats()[backendName()]; + for (const auto& c : cases) { + TypeParam re("\\b{g}"); + if (!re.ok()) { + ++st.compileErrors; +#if VELOX_REGEX_COMPAT_HAS_JAVA + if constexpr (std::is_same_v) { + ++javaFailures; + std::fprintf( + stderr, + "[OpenJDK Grapheme] Java compile-err: %s\n", + re.error().c_str()); + } +#endif + continue; + } + + std::vector actual; +#if VELOX_REGEX_COMPAT_HAS_JAVA + if constexpr (std::is_same_v) { + actual = directJavaGraphemeBreakOffsets(c.input); + } else +#endif + { + JavaMatcherAdapter m(&re, c.input); + while (m.find()) { + actual.push_back(m.start()); + } + } + if (actual == c.expectedBreakOffsets) { + ++st.passed; + } else { + ++st.failed; +#if VELOX_REGEX_COMPAT_HAS_JAVA + if constexpr (std::is_same_v) { + ++javaFailures; + std::fprintf( + stderr, + "[OpenJDK Grapheme] Java mismatch: expected %zu breaks, actual %zu breaks\n", + c.expectedBreakOffsets.size(), + actual.size()); + } +#endif + } + } + +#if VELOX_REGEX_COMPAT_HAS_JAVA + if constexpr (std::is_same_v) { + EXPECT_EQ(0, javaFailures) + << "Java backend should match every GraphemeTestCases.txt case"; + } +#endif +} + +} // namespace +} // namespace 
facebook::velox::regex_compat::test diff --git a/velox/external/regex_compat/tests/PatternPortedTest.cpp b/velox/external/regex_compat/tests/PatternPortedTest.cpp new file mode 100644 index 00000000000..e1c27b0542d --- /dev/null +++ b/velox/external/regex_compat/tests/PatternPortedTest.cpp @@ -0,0 +1,240 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// +// Cases ported from pcre4j's `PatternTests.java` +// (https://github.com/alexey-pelykh/pcre4j, GPL-LGPL upstream; this C++ port +// is the work of the Velox project, Apache-2.0). +// +// Each TYPED_TEST below runs against every regex backend (Re2Regex, +// Pcre2Regex, JavaRegex) enabled at compile time. Tests asserting Java +// semantics that some backend cannot satisfy are marked with the backend's +// known limitation and skipped via `if constexpr` rather than disabled, so +// any future improvement in the backend is detected by the test newly passing. +// + +#include "velox/external/regex_compat/tests/BackendTestBase.h" +#include "velox/external/regex_compat/tests/JavaMatcherAdapter.h" + +#include +#include + +namespace facebook::velox::regex_compat::test { +namespace { + +template +using PatternPortedTest = BackendTest; +TYPED_TEST_SUITE(PatternPortedTest, AllBackends); + +// pcre4j PatternTests.toStringReturnsPattern: Pattern.toString() returns the +// original source string. 
Our IRegex doesn't expose `pattern()` directly, +// but `NamedCapturingGroups()` + `NumberOfCapturingGroups()` cover the +// compile-side state-mirror part. Skip the pure-toString assertion. + +// pcre4j PatternTests.namedGroups +TYPED_TEST(PatternPortedTest, namedGroupsSingle) { + TypeParam re("(?42)"); + ASSERT_TRUE(re.ok()) << re.error(); + EXPECT_EQ(1, re.NumberOfCapturingGroups()); +} + +TYPED_TEST(PatternPortedTest, namedGroupsTwoNames) { + TypeParam re("(?x)(?y)"); + ASSERT_TRUE(re.ok()) << re.error(); + EXPECT_EQ(2, re.NumberOfCapturingGroups()); +} + +TYPED_TEST(PatternPortedTest, numberedGroupsOnly) { + TypeParam re("(\\d)(\\w)(\\s)"); + ASSERT_TRUE(re.ok()) << re.error(); + EXPECT_EQ(3, re.NumberOfCapturingGroups()); +} + +TYPED_TEST(PatternPortedTest, nonCapturingGroupDoesNotIncrement) { + TypeParam re("(?:foo)(bar)"); + ASSERT_TRUE(re.ok()) << re.error(); + EXPECT_EQ(1, re.NumberOfCapturingGroups()); +} + +// pcre4j PatternTests.split (essence: split on \\D+ produces digit groups) +TYPED_TEST(PatternPortedTest, splitOnDigitGroups) { + // We don't expose Pattern.split() at backend level; emulate via find-loop. + TypeParam re("\\D+"); + ASSERT_TRUE(re.ok()) << re.error(); + std::string_view in = "0, 1, 1, 2, 3, 5, 8"; + JavaMatcherAdapter m(&re, in); + std::vector tokens; + std::size_t prev = 0; + while (m.find()) { + tokens.emplace_back(in.substr(prev, m.start() - prev)); + prev = m.end(); + } + tokens.emplace_back(in.substr(prev)); + EXPECT_THAT( + tokens, ::testing::ElementsAre("0", "1", "1", "2", "3", "5", "8")); +} + +// pcre4j PatternTests.unicodeSplit +TYPED_TEST(PatternPortedTest, splitUnicodeDelimiters) { + TypeParam re("\\D+"); + ASSERT_TRUE(re.ok()) << re.error(); + // U+21E2 RIGHTWARDS DASHED ARROW (3-byte UTF-8 sequence). 
+ std::string_view in = "0 \xe2\x87\xa2 1 \xe2\x87\xa2 2"; + JavaMatcherAdapter m(&re, in); + std::vector tokens; + std::size_t prev = 0; + while (m.find()) { + tokens.emplace_back(in.substr(prev, m.start() - prev)); + prev = m.end(); + } + tokens.emplace_back(in.substr(prev)); + EXPECT_THAT(tokens, ::testing::ElementsAre("0", "1", "2")); +} + +// pcre4j PatternTests CASE_INSENSITIVE flag +TYPED_TEST(PatternPortedTest, caseInsensitiveCompileTimeFlag) { + Options opt; + opt.caseSensitive = false; + TypeParam re("HeLLo", opt); + ASSERT_TRUE(re.ok()) << re.error(); + EXPECT_TRUE(TypeParam::PartialMatch("hello", re)); + EXPECT_TRUE(TypeParam::PartialMatch("HELLO", re)); +} + +// pcre4j PatternTests DOTALL flag +TYPED_TEST(PatternPortedTest, dotallMatchesNewline) { + Options opt; + opt.dotNl = true; + TypeParam re("a.b", opt); + ASSERT_TRUE(re.ok()) << re.error(); + EXPECT_TRUE(TypeParam::PartialMatch("a\nb", re)); +} + +// pcre4j PatternTests MULTILINE flag +TYPED_TEST(PatternPortedTest, multilineCaret) { + Options opt; + opt.oneLine = false; + TypeParam re("^X", opt); + ASSERT_TRUE(re.ok()) << re.error(); + EXPECT_TRUE(TypeParam::PartialMatch("foo\nX bar", re)); +} + +TYPED_TEST(PatternPortedTest, multilineDollar) { + Options opt; + opt.oneLine = false; + TypeParam re("X$", opt); + ASSERT_TRUE(re.ok()) << re.error(); + EXPECT_TRUE(TypeParam::PartialMatch("foo X\nbar", re)); +} + +// pcre4j PatternTests invalid pattern syntax +TYPED_TEST(PatternPortedTest, invalidPatternRejected) { + TypeParam re("("); + EXPECT_FALSE(re.ok()); + EXPECT_FALSE(re.error().empty()); +} + +TYPED_TEST(PatternPortedTest, invalidPatternRejectedSquareBracket) { + TypeParam re("["); + EXPECT_FALSE(re.ok()); + EXPECT_FALSE(re.error().empty()); +} + +// pcre4j PatternTests: `a{` — Java rejects as incomplete quantifier. +// PCRE2 and RE2 accept it literally. This test asserts Java behaviour; +// other backends will fail, which is the documented compatibility gap. 
+TYPED_TEST(PatternPortedTest, braceQuantifierIncomplete) { + TypeParam re("a{"); + EXPECT_FALSE(re.ok()); +} + +// Empty pattern matches empty string anywhere. +TYPED_TEST(PatternPortedTest, emptyPatternMatchesEverywhere) { + TypeParam re(""); + ASSERT_TRUE(re.ok()) << re.error(); + EXPECT_TRUE(TypeParam::PartialMatch("anything", re)); + EXPECT_TRUE(TypeParam::FullMatch("", re)); +} + +// Java-style `Pattern.quote(s)` wraps `s` in `\Q...\E` so the input is +// treated as a literal string. Embedded `\E` in `s` must be escaped to +// avoid prematurely ending the literal section. +static std::string javaQuote(std::string_view s) { + std::string out = "\\Q"; + std::size_t i = 0; + while (true) { + auto j = s.find("\\E", i); + if (j == std::string_view::npos) { + out.append(s.substr(i)); + break; + } + out.append(s.substr(i, j - i)); + out.append("\\E\\\\E\\Q"); + i = j + 2; + } + out.append("\\E"); + return out; +} + +// `Pattern.quote` round-trips any literal string through the regex engine. +TYPED_TEST(PatternPortedTest, quote) { + for (const std::string_view sample : { + std::string_view(""), + std::string_view(".*+?^$|()[]\\{}"), + std::string_view("abc\\Edef"), + }) { + TypeParam re(javaQuote(sample)); + ASSERT_TRUE(re.ok()) << re.error() << " for [" << sample << "]"; + EXPECT_TRUE(TypeParam::FullMatch(sample, re)) << "input=[" << sample << "]"; + } +} + +// (?x) free-spacing: unescaped whitespace in pattern is ignored. +TYPED_TEST(PatternPortedTest, commentsWhitespaceIgnored) { + TypeParam re("(?x)a b c"); + ASSERT_TRUE(re.ok()) << re.error(); + EXPECT_TRUE(TypeParam::FullMatch("abc", re)); +} + +// (?x) `#` to end of line is a comment. +TYPED_TEST(PatternPortedTest, commentsHashComments) { + TypeParam re("(?x)abc # this is a comment\ndef"); + ASSERT_TRUE(re.ok()) << re.error(); + EXPECT_TRUE(TypeParam::FullMatch("abcdef", re)); +} + +// (?x) escaped whitespace is matched literally. 
+TYPED_TEST(PatternPortedTest, commentsEscapedWhitespace) { + TypeParam re("(?x)a\\ b"); + ASSERT_TRUE(re.ok()) << re.error(); + EXPECT_TRUE(TypeParam::FullMatch("a b", re)); +} + +// (?x) escaped whitespace inside a character class is matched literally. +TYPED_TEST(PatternPortedTest, commentsWhitespaceInCharacterClass) { + TypeParam re("(?x)[\\ ]"); + ASSERT_TRUE(re.ok()) << re.error(); + EXPECT_TRUE(TypeParam::FullMatch(" ", re)); +} + +// Embedded (?x) flag at start enables COMMENTS for the rest of the pattern. +TYPED_TEST(PatternPortedTest, commentsEmbeddedFlag) { + TypeParam re("(?x)a b c"); + ASSERT_TRUE(re.ok()) << re.error(); + EXPECT_TRUE(TypeParam::FullMatch("abc", re)); +} + +} // namespace +} // namespace facebook::velox::regex_compat::test diff --git a/velox/external/regex_compat/tests/PatternSplitPortedTest.cpp b/velox/external/regex_compat/tests/PatternSplitPortedTest.cpp new file mode 100644 index 00000000000..e27e90876eb --- /dev/null +++ b/velox/external/regex_compat/tests/PatternSplitPortedTest.cpp @@ -0,0 +1,179 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// +// Cases ported from pcre4j's `PatternSplitTests.java`. +// +// Java's `Pattern.split` is implemented here as a free helper that drives +// the backend's find() loop through `JavaMatcherAdapter`, so engine +// differences in find()/match propagate naturally to split() output. 
+// +// Skipped: +// * splitWithDelimiters* — `String[] splitWithDelimiters(...)` is Java 21+ +// and not in our embedded JDK 17 surface. +// + +#include "velox/external/regex_compat/tests/BackendTestBase.h" +#include "velox/external/regex_compat/tests/JavaMatcherAdapter.h" + +#include +#include + +#include +#include + +namespace facebook::velox::regex_compat::test { +namespace { + +// Java-canonical split: find()-loop walk, trailing-empty trim when limit==0, +// at-most-`limit` parts when limit>0, no trim when limit<0. +template +std::vector +javaSplit(R& re, std::string_view input, int limit = 0) { + JavaMatcherAdapter m(&re, input); + std::vector parts; + int matches = 0; + std::size_t index = 0; + const bool matchLimited = limit > 0; + while (m.find()) { + if (matchLimited && matches == limit - 1) { + break; + } + const std::size_t s = static_cast(m.start()); + const std::size_t e = static_cast(m.end()); + // Java skips zero-width matches that don't advance past the current + // segment start. 
+ if (s == index && s == e) { + continue; + } + parts.emplace_back(input.substr(index, s - index)); + index = e; + ++matches; + } + if (matches == 0) { + return {std::string(input)}; + } + parts.emplace_back(input.substr(index)); + if (limit == 0) { + while (!parts.empty() && parts.back().empty()) { + parts.pop_back(); + } + } + return parts; +} + +template +using SplitPortedTest = BackendTest; +TYPED_TEST_SUITE(SplitPortedTest, AllBackends); + +// --- limit=0 trailing empty strings removal --- + +TYPED_TEST(SplitPortedTest, splitTrailingEmptyStringsRemovedWithDefaultLimit) { + TypeParam re(","); + ASSERT_TRUE(re.ok()) << re.error(); + EXPECT_THAT( + javaSplit(re, "a,b,c,,,"), ::testing::ElementsAre("a", "b", "c")); +} + +TYPED_TEST(SplitPortedTest, splitTrailingEmptyStringsRemovedWithZeroLimit) { + TypeParam re(","); + ASSERT_TRUE(re.ok()) << re.error(); + EXPECT_THAT( + javaSplit(re, "a,b,c,,,", 0), ::testing::ElementsAre("a", "b", "c")); +} + +TYPED_TEST(SplitPortedTest, splitAllEmptyWithZeroLimit) { + TypeParam re(","); + ASSERT_TRUE(re.ok()) << re.error(); + EXPECT_TRUE(javaSplit(re, ",,,", 0).empty()); +} + +// --- Positive limit --- + +TYPED_TEST(SplitPortedTest, splitPositiveLimitOne) { + TypeParam re(","); + ASSERT_TRUE(re.ok()) << re.error(); + EXPECT_THAT(javaSplit(re, "a,b,c", 1), ::testing::ElementsAre("a,b,c")); +} + +TYPED_TEST(SplitPortedTest, splitPositiveLimitExceedsMatches) { + TypeParam re(","); + ASSERT_TRUE(re.ok()) << re.error(); + EXPECT_THAT( + javaSplit(re, "a,b,c", 10), ::testing::ElementsAre("a", "b", "c")); +} + +// --- Empty input and no-match --- + +TYPED_TEST(SplitPortedTest, splitEmptyInput) { + TypeParam re(","); + ASSERT_TRUE(re.ok()) << re.error(); + EXPECT_THAT(javaSplit(re, ""), ::testing::ElementsAre("")); +} + +TYPED_TEST(SplitPortedTest, splitNoMatch) { + TypeParam re(","); + ASSERT_TRUE(re.ok()) << re.error(); + EXPECT_THAT(javaSplit(re, "abc"), ::testing::ElementsAre("abc")); +} + +// --- Regex-based delimiter edge cases 
--- + +TYPED_TEST(SplitPortedTest, splitMultiCharDelimiter) { + TypeParam re("\\s*,\\s*"); + ASSERT_TRUE(re.ok()) << re.error(); + EXPECT_THAT( + javaSplit(re, "a , b , c"), ::testing::ElementsAre("a", "b", "c")); +} + +TYPED_TEST(SplitPortedTest, splitDelimiterAtStartAndEnd) { + TypeParam re(","); + ASSERT_TRUE(re.ok()) << re.error(); + EXPECT_THAT( + javaSplit(re, ",a,b,c,"), ::testing::ElementsAre("", "a", "b", "c")); +} + +TYPED_TEST(SplitPortedTest, splitConsecutiveDelimiters) { + TypeParam re(","); + ASSERT_TRUE(re.ok()) << re.error(); + EXPECT_THAT( + javaSplit(re, "a,,b,,c"), + ::testing::ElementsAre("a", "", "b", "", "c")); +} + +TYPED_TEST(SplitPortedTest, splitSingleCharInput) { + TypeParam re(","); + ASSERT_TRUE(re.ok()) << re.error(); + EXPECT_TRUE(javaSplit(re, ",").empty()); +} + +// --- splitAsStream edge cases (Java's splitAsStream is just a stream view +// over the same split logic; we reuse javaSplit here). --- + +TYPED_TEST(SplitPortedTest, splitAsStreamTrailingEmpties) { + TypeParam re(","); + ASSERT_TRUE(re.ok()) << re.error(); + EXPECT_THAT( + javaSplit(re, "a,b,c,,,"), ::testing::ElementsAre("a", "b", "c")); +} + +TYPED_TEST(SplitPortedTest, splitAsStreamEmptyInput) { + TypeParam re(","); + ASSERT_TRUE(re.ok()) << re.error(); + EXPECT_THAT(javaSplit(re, ""), ::testing::ElementsAre("")); +} + +} // namespace +} // namespace facebook::velox::regex_compat::test diff --git a/velox/external/regex_compat/tests/Pcre2RegexTest.cpp b/velox/external/regex_compat/tests/Pcre2RegexTest.cpp new file mode 100644 index 00000000000..ce55bab9c17 --- /dev/null +++ b/velox/external/regex_compat/tests/Pcre2RegexTest.cpp @@ -0,0 +1,299 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "velox/external/regex_compat/Pcre2Regex.h" + +#include +#include + +namespace facebook::velox::regex_compat { +namespace { + +TEST(Pcre2RegexTest, compileOk) { + Pcre2Regex re("\\d+"); + EXPECT_TRUE(re.ok()); + EXPECT_EQ(0, re.NumberOfCapturingGroups()); + EXPECT_EQ("", re.error()); +} + +TEST(Pcre2RegexTest, compileError) { + Pcre2Regex re("(unclosed"); + EXPECT_FALSE(re.ok()); + EXPECT_FALSE(re.error().empty()); +} + +TEST(Pcre2RegexTest, surrogateBlockCompilesInRawByteMode) { + Pcre2Regex re("\\p{InHIGH_SURROGATES}"); + EXPECT_TRUE(re.ok()) << re.error(); +} + +TEST(Pcre2RegexTest, javaNamedGroupAccepted) { + // PCRE2 natively understands (?...) — no translation needed. 
+ Pcre2Regex re("(?\\d+)"); + ASSERT_TRUE(re.ok()) << re.error(); + EXPECT_EQ(1, re.NumberOfCapturingGroups()); + const auto& names = re.NamedCapturingGroups(); + ASSERT_NE(names.end(), names.find("num")); + EXPECT_EQ(1, names.at("num")); +} + +TEST(Pcre2RegexTest, matchUnanchored) { + Pcre2Regex re("(\\d+)"); + std::string_view sub[2]; + std::string_view in = "abc 42 xyz"; + EXPECT_TRUE(re.Match(in, 0, in.size(), Anchor::kUnanchored, sub, 2)); + EXPECT_EQ("42", sub[0]); + EXPECT_EQ("42", sub[1]); +} + +TEST(Pcre2RegexTest, matchAnchorBoth) { + Pcre2Regex re("[a-z]+"); + std::string_view sub[1]; + std::string_view in = "abc"; + EXPECT_TRUE(re.Match(in, 0, in.size(), Anchor::kAnchorBoth, sub, 1)); +} + +TEST(Pcre2RegexTest, matchAnchorBothRejectsTrailing) { + Pcre2Regex re("[a-z]+"); + std::string_view sub[1]; + std::string_view in = "abc1"; + EXPECT_FALSE(re.Match(in, 0, in.size(), Anchor::kAnchorBoth, sub, 1)); +} + +TEST(Pcre2RegexTest, fullPartialMatch) { + Pcre2Regex re("[a-z]+"); + EXPECT_TRUE(Pcre2Regex::FullMatch("abc", re)); + EXPECT_FALSE(Pcre2Regex::FullMatch("abc1", re)); + EXPECT_TRUE(Pcre2Regex::PartialMatch("abc1", re)); +} + +TEST(Pcre2RegexTest, globalReplaceWithNumberedGroup) { + // PCRE2 with SUBSTITUTE_EXTENDED natively understands $1. + Pcre2Regex re("(\\d+)"); + std::string s = "a1b22c333"; + int n = Pcre2Regex::GlobalReplace(&s, re, "[$1]"); + EXPECT_EQ(3, n); + EXPECT_EQ("a[1]b[22]c[333]", s); +} + +TEST(Pcre2RegexTest, globalReplaceWithNamedGroup) { + // PCRE2 natively understands ${name}. 
+ Pcre2Regex re("(?\\d+)"); + ASSERT_TRUE(re.ok()) << re.error(); + std::string s = "a1b22c"; + int n = Pcre2Regex::GlobalReplace(&s, re, "[${n}]"); + EXPECT_EQ(2, n); + EXPECT_EQ("a[1]b[22]c", s); +} + +TEST(Pcre2RegexTest, caseInsensitiveOption) { + Options opt; + opt.caseSensitive = false; + Pcre2Regex re("hello", opt); + EXPECT_TRUE(Pcre2Regex::PartialMatch("HELLO world", re)); +} + +TEST(Pcre2RegexTest, unicodeCaseOptionPrefoldsKnownLiterals) { + Options opt; + opt.caseSensitive = false; + Pcre2Regex kelvin("\\u212A", opt); + ASSERT_TRUE(kelvin.ok()) << kelvin.error(); + EXPECT_TRUE(Pcre2Regex::FullMatch("k", kelvin)); + EXPECT_TRUE(Pcre2Regex::FullMatch("K", kelvin)); + + Pcre2Regex sigma("\xce\xa3", opt); + ASSERT_TRUE(sigma.ok()) << sigma.error(); + EXPECT_TRUE(Pcre2Regex::FullMatch("\xcf\x82", sigma)); + EXPECT_TRUE(Pcre2Regex::FullMatch("\xcf\x83", sigma)); +} + +TEST(Pcre2RegexTest, defaultWordClassIsAscii) { + Pcre2Regex re("(? +#include + +namespace facebook::velox::regex_compat { +namespace { + +TEST(Re2RegexTest, compileOk) { + Re2Regex re("\\d+"); + EXPECT_TRUE(re.ok()); + EXPECT_EQ(0, re.NumberOfCapturingGroups()); + EXPECT_EQ("", re.error()); +} + +TEST(Re2RegexTest, compileError) { + Re2Regex re("(unclosed"); + EXPECT_FALSE(re.ok()); + EXPECT_FALSE(re.error().empty()); +} + +TEST(Re2RegexTest, javaNamedGroupAccepted) { + // Java syntax (?...) should be translated to RE2 (?P...) by + // toRe2Pattern before reaching re2::RE2. 
+ Re2Regex re("(?\\d+)"); + ASSERT_TRUE(re.ok()) << re.error(); + EXPECT_EQ(1, re.NumberOfCapturingGroups()); + const auto& names = re.NamedCapturingGroups(); + ASSERT_NE(names.end(), names.find("num")); + EXPECT_EQ(1, names.at("num")); +} + +TEST(Re2RegexTest, matchUnanchored) { + Re2Regex re("(\\d+)"); + std::string_view sub[2]; + std::string_view in = "abc 42 xyz"; + EXPECT_TRUE(re.Match(in, 0, in.size(), Anchor::kUnanchored, sub, 2)); + EXPECT_EQ("42", sub[0]); + EXPECT_EQ("42", sub[1]); +} + +TEST(Re2RegexTest, matchAnchorBoth) { + Re2Regex re("[a-z]+"); + std::string_view sub[1]; + std::string_view in = "abc"; + EXPECT_TRUE(re.Match(in, 0, in.size(), Anchor::kAnchorBoth, sub, 1)); +} + +TEST(Re2RegexTest, matchAnchorBothRejectsTrailing) { + Re2Regex re("[a-z]+"); + std::string_view sub[1]; + std::string_view in = "abc1"; + EXPECT_FALSE(re.Match(in, 0, in.size(), Anchor::kAnchorBoth, sub, 1)); +} + +TEST(Re2RegexTest, fullPartialMatch) { + Re2Regex re("[a-z]+"); + EXPECT_TRUE(Re2Regex::FullMatch("abc", re)); + EXPECT_FALSE(Re2Regex::FullMatch("abc1", re)); + EXPECT_TRUE(Re2Regex::PartialMatch("abc1", re)); +} + +TEST(Re2RegexTest, globalReplaceWithNumberedGroup) { + // Java $1 should be translated to RE2 \1 by prepareRegexpReplaceReplacement. + Re2Regex re("(\\d+)"); + std::string s = "a1b22c333"; + int n = Re2Regex::GlobalReplace(&s, re, "[$1]"); + EXPECT_EQ(3, n); + EXPECT_EQ("a[1]b[22]c[333]", s); +} + +TEST(Re2RegexTest, globalReplaceWithNamedGroup) { + // Java ${name} should be translated to RE2 \N by prepareRegexpReplaceReplacement. 
+ Re2Regex re("(?\\d+)"); + ASSERT_TRUE(re.ok()) << re.error(); + std::string s = "a1b22c"; + int n = Re2Regex::GlobalReplace(&s, re, "[${n}]"); + EXPECT_EQ(2, n); + EXPECT_EQ("a[1]b[22]c", s); +} + +TEST(Re2RegexTest, caseInsensitiveOption) { + Options opt; + opt.caseSensitive = false; + Re2Regex re("hello", opt); + EXPECT_TRUE(Re2Regex::PartialMatch("HELLO world", re)); +} + +TEST(Re2RegexTest, lookaroundUnsupportedByRe2) { + Re2Regex re("(?=foo)bar"); + EXPECT_FALSE(re.ok()); + EXPECT_THAT(re.error(), ::testing::HasSubstr("Java→RE2 translator")); + EXPECT_THAT(re.error(), ::testing::HasSubstr("lookaround")); +} + +} // namespace +} // namespace facebook::velox::regex_compat diff --git a/velox/external/regex_compat/tests/RegExTestPortedTest.cpp b/velox/external/regex_compat/tests/RegExTestPortedTest.cpp new file mode 100644 index 00000000000..a3e70b25d1a --- /dev/null +++ b/velox/external/regex_compat/tests/RegExTestPortedTest.cpp @@ -0,0 +1,1078 @@ +/* + * Copyright (c) 1999, 2023, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +// +// Ported to GTest for inclusion in Velox's regex-compat test suite. The +// original source is OpenJDK 17's test/jdk/java/util/regex/RegExTest.java, +// as imported by the pcre4j compatibility fork. These tests intentionally +// run the same Java-pattern inputs through JavaMatcherAdapter so +// Java, PCRE2 and RE2 backends report a per-backend compatibility rate. +// + +#include "velox/external/regex_compat/tests/BackendTestBase.h" +#include "velox/external/regex_compat/tests/JavaMatcherAdapter.h" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace facebook::velox::regex_compat::test { +namespace { + +template +using RegExTestPortedTest = BackendTest; +TYPED_TEST_SUITE(RegExTestPortedTest, AllBackends); + +struct RegExStats { + int passed = 0; + int failed = 0; + // Tests where any pattern compile in the body was rejected by the + // translator as engine-impossible. Tracked separately so the report + // can compute a "translatable subset" rate. + int translatorRejected = 0; +}; + +std::map& regExStats() { + static std::map s; + return s; +} + +// Thread-local flag set whenever a helper observes the translator +// rejecting the pattern as engine-impossible (e.g. RE2 lookaround / +// backref / possessive). The test macro consumes it after the body +// runs and bumps a per-backend tally so we can report a "translatable +// subset" rate that excludes engine-impossible tests. 
+inline thread_local bool tlsTranslatorRejected = false; + +class RegExReporter : public ::testing::Environment { + public: + void TearDown() override { + auto& m = regExStats(); + if (m.empty()) { + return; + } + std::fprintf(stderr, "\n"); + std::fprintf(stderr, "========== RegExTest ported compat rate =========\n"); + for (const auto& [backend, st] : m) { + const int total = st.passed + st.failed; + const double pct = total > 0 ? 100.0 * st.passed / total : 0.0; + std::fprintf( + stderr, + " %-8s %4d / %4d (%.2f%%)\n", + backend.c_str(), + st.passed, + total, + pct); + if (st.translatorRejected > 0) { + const int subsetTotal = total - st.translatorRejected; + const double subsetPct = + subsetTotal > 0 ? 100.0 * st.passed / subsetTotal : 0.0; + std::fprintf( + stderr, + " %-8s %4d / %4d (%.2f%%) [excludes %d translator-rejected]\n", + (backend + " (translatable subset)").c_str(), + st.passed, + subsetTotal, + subsetPct, + st.translatorRejected); + } + } + std::fprintf(stderr, "=================================================\n"); + } +}; + +[[maybe_unused]] static auto* kRegExReporter = + ::testing::AddGlobalTestEnvironment(new RegExReporter); + +template +const char* backendName() { + if constexpr (std::is_same_v) { + return "Re2"; + } else if constexpr (std::is_same_v) { + return "Pcre2"; + } else { + return "Java"; + } +} + +template +void recordCase(bool ok, const char* /*testName*/) { + auto& st = regExStats()[backendName()]; + if (tlsTranslatorRejected) { + ++st.translatorRejected; + } + if (ok) { + ++st.passed; + } else { + ++st.failed; + } + tlsTranslatorRejected = false; +} + +static Options caseInsensitive() { + Options opt; + opt.caseSensitive = false; + return opt; +} + +static Options dotAll() { + Options opt; + opt.dotNl = true; + return opt; +} + +static Options multiLine() { + Options opt; + opt.oneLine = false; + return opt; +} + +static Options ciDotAllMultiLine() { + Options opt; + opt.caseSensitive = false; + opt.dotNl = true; + opt.oneLine = 
false; + return opt; +} + +static std::string utf8(std::uint32_t cp) { + std::string out; + if (cp < 0x80) { + out.push_back(static_cast(cp)); + } else if (cp < 0x800) { + out.push_back(static_cast(0xC0 | (cp >> 6))); + out.push_back(static_cast(0x80 | (cp & 0x3F))); + } else if (cp < 0x10000) { + out.push_back(static_cast(0xE0 | (cp >> 12))); + out.push_back(static_cast(0x80 | ((cp >> 6) & 0x3F))); + out.push_back(static_cast(0x80 | (cp & 0x3F))); + } else { + out.push_back(static_cast(0xF0 | (cp >> 18))); + out.push_back(static_cast(0x80 | ((cp >> 12) & 0x3F))); + out.push_back(static_cast(0x80 | ((cp >> 6) & 0x3F))); + out.push_back(static_cast(0x80 | (cp & 0x3F))); + } + return out; +} + +static std::string toSupplementaries(std::string_view s) { + std::string out; + for (std::size_t i = 0; i < s.size();) { + unsigned char c = static_cast(s[i]); + if (c == '\\' && i + 1 < s.size()) { + out.push_back(s[i++]); + out.push_back(s[i++]); + if (out.back() == 'u' && i + 4 <= s.size()) { + out.append(s.substr(i, 4)); + i += 4; + } + } else if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) { + out.append(utf8(0x10000 + c)); + ++i; + } else { + out.push_back(s[i++]); + } + } + return out; +} + +static std::string javaQuote(std::string_view s) { + std::string out = "\\Q"; + std::size_t i = 0; + while (true) { + auto j = s.find("\\E", i); + if (j == std::string_view::npos) { + out.append(s.substr(i)); + break; + } + out.append(s.substr(i, j - i)); + out.append("\\E\\\\E\\Q"); + i = j + 2; + } + out.append("\\E"); + return out; +} + +// Thread-local flag set whenever a helper observes the translator +// rejecting the pattern as engine-impossible (e.g. RE2 lookaround / +// backref / possessive). The test macro consumes it after the body +// runs and bumps a per-backend tally so we can report a "translatable +// subset" rate that excludes engine-impossible tests. +// (Declared earlier in the file so recordCase can use it.) 
+ +template +inline bool notePatternStatus(const R& re) { + if (!re.ok() && + re.error().find("translator: ") != std::string::npos) { + tlsTranslatorRejected = true; + } + return re.ok(); +} + +template +bool find(std::string_view pattern, std::string_view input, Options opt = {}) { + R re(pattern, opt); + if (!notePatternStatus(re)) { + return false; + } + JavaMatcherAdapter m(&re, input); + return m.find(); +} + +template +bool noFind(std::string_view pattern, std::string_view input, Options opt = {}) { + R re(pattern, opt); + if (!notePatternStatus(re)) { + return false; + } + JavaMatcherAdapter m(&re, input); + return !m.find(); +} + +template +bool full(std::string_view pattern, std::string_view input, Options opt = {}) { + R re(pattern, opt); + return re.ok() && R::FullMatch(input, re); +} + +template +bool notFull(std::string_view pattern, std::string_view input, Options opt = {}) { + R re(pattern, opt); + return re.ok() && !R::FullMatch(input, re); +} + +template +bool findGroup( + std::string_view pattern, + std::string_view input, + std::string_view expected, + Options opt = {}, + int group = 0) { + R re(pattern, opt); + if (!notePatternStatus(re)) { + return false; + } + JavaMatcherAdapter m(&re, input); + if (!m.find()) { + return false; + } + auto g = m.group(group); + return g && *g == expected; +} + +template +bool findStart( + std::string_view pattern, + std::string_view input, + int expected, + Options opt = {}) { + R re(pattern, opt); + if (!notePatternStatus(re)) { + return false; + } + JavaMatcherAdapter m(&re, input); + return m.find() && m.start() == expected; +} + +template +bool lookingAt(std::string_view pattern, std::string_view input, Options opt = {}) { + R re(pattern, opt); + if (!notePatternStatus(re)) { + return false; + } + JavaMatcherAdapter m(&re, input); + return m.lookingAt(); +} + +template +bool notLookingAt(std::string_view pattern, std::string_view input, Options opt = {}) { + R re(pattern, opt); + if (!notePatternStatus(re)) 
{ + return false; + } + JavaMatcherAdapter m(&re, input); + return !m.lookingAt(); +} + +template +bool replaceAllEquals( + std::string_view pattern, + std::string input, + std::string_view replacement, + std::string_view expected, + Options opt = {}) { + R re(pattern, opt); + if (!notePatternStatus(re)) { + return false; + } + R::GlobalReplace(&input, re, replacement); + return input == expected; +} + +template +bool replaceFirstEquals( + std::string_view pattern, + std::string_view input, + std::string_view replacement, + std::string_view expected, + Options opt = {}) { + R re(pattern, opt); + if (!notePatternStatus(re)) { + return false; + } + JavaMatcherAdapter m(&re, input); + return m.replaceFirst(replacement) == expected; +} + +template +bool appendWalkEquals( + std::string_view pattern, + std::string_view input, + std::string_view replacement, + std::string_view expected, + int skipMiddleFinds = 0) { + R re(pattern); + if (!notePatternStatus(re)) { + return false; + } + JavaMatcherAdapter m(&re, input); + std::string sb; + if (skipMiddleFinds == 0) { + while (m.find()) { + m.appendReplacement(sb, replacement); + } + } else { + if (!m.find()) return false; + m.appendReplacement(sb, "$1"); + for (int i = 0; i < skipMiddleFinds; ++i) { + if (!m.find()) return false; + } + m.appendReplacement(sb, replacement); + } + m.appendTail(sb); + return sb == expected; +} + +template +bool appendReplacementThrowsAndLeavesBuffer( + std::string_view pattern, + std::string_view input, + std::string_view replacement) { + R re(pattern); + if (!notePatternStatus(re)) { + return false; + } + JavaMatcherAdapter m(&re, input); + std::string sb; + if (!m.find()) { + return false; + } + try { + m.appendReplacement(sb, replacement); + return false; + } catch (const std::exception&) { + return sb.empty(); + } +} + +template +bool splitEquals( + std::string_view pattern, + std::string_view input, + const std::vector& expected) { + R re(pattern); + if (!notePatternStatus(re)) { + return 
false; + } + JavaMatcherAdapter m(&re, input); + std::vector actual; + std::size_t prev = 0; + while (m.find()) { + actual.emplace_back(input.substr(prev, m.start() - prev)); + prev = static_cast(m.end()); + } + actual.emplace_back(input.substr(prev)); + while (actual.size() > 1 && actual.back().empty()) { + actual.pop_back(); + } + return actual == expected; +} + +template +bool compiles(std::string_view pattern, Options opt = {}) { + R re(pattern, opt); + return re.ok(); +} + +template +bool rejects(std::string_view pattern, Options opt = {}) { + R re(pattern, opt); + return !re.ok(); +} + +#if VELOX_REGEX_COMPAT_HAS_JAVA +#define PORTED_REGEX_TEST_JAVA_GUARD(TestName) \ + if constexpr (std::is_same_v) { \ + EXPECT_TRUE(ok) << "RegExTest::" #TestName " Java backend regression"; \ + } +#else +#define PORTED_REGEX_TEST_JAVA_GUARD(TestName) (void)0 +#endif + +#define PORTED_REGEX_TEST(TestName, Body) \ + TYPED_TEST(RegExTestPortedTest, TestName) { \ + bool ok = true; \ + tlsTranslatorRejected = false; \ + auto expect = [&](bool value) { ok = ok && value; }; \ + using R = TypeParam; \ + (void)expect; \ + (void)sizeof(R); \ + Body \ + recordCase(ok, #TestName); \ + PORTED_REGEX_TEST_JAVA_GUARD(TestName); \ + } + +#define TODO_REGEX_TEST(TestName, Reason) \ + TYPED_TEST(RegExTestPortedTest, TestName) { \ + GTEST_SKIP() << "TODO: port from RegExTest::" #TestName ": " Reason; \ + } + +TODO_REGEX_TEST(processTestCases, "covered by OpenJdkCorpusDiffTest to avoid double-counting") +TODO_REGEX_TEST(processBMPTestCases, "covered by OpenJdkCorpusDiffTest to avoid double-counting") +TODO_REGEX_TEST(processSupplementaryTestCases, "covered by OpenJdkCorpusDiffTest to avoid double-counting") +TODO_REGEX_TEST(nullArgumentTest, "Java null API behavior has no C++ adapter equivalent") + +PORTED_REGEX_TEST(surrogatesInClassTest, { + const std::string cp = utf8(0x1D122); + expect(find("[" + utf8(0x1D121) + "-" + utf8(0x1D124) + "]", cp)); +}) + +PORTED_REGEX_TEST(removeQEQuotingTest, { 
+ expect(find("\\011\\Q1sometext\\E\\011\\Q2sometext\\E", "\t1sometext\t2sometext")); +}) + +TODO_REGEX_TEST(toMatchResultTest, "MatchResult snapshot object is not exposed by JavaMatcherAdapter") +TODO_REGEX_TEST(toMatchResultTest2, "MatchResult error semantics are Java API-specific") +TODO_REGEX_TEST(hitEndTest, "Matcher.hitEnd is not exposed by JavaMatcherAdapter") + +TODO_REGEX_TEST(wordSearchTest, "JavaMatcherAdapter find(int) zero-width boundary cursor behavior differs from java.util.regex.Matcher") + +TODO_REGEX_TEST(caretAtEndTest, "zero-width multiline caret cursor behavior needs exact Matcher emulation") + +PORTED_REGEX_TEST(unicodeWordBoundsTest, { + expect(findStart("\\b", " aa ", 2)); + expect(findStart("\\b", " aa\xcc\x8a ", 2)); + expect(noFind("\\b", " \xcc\x8a\xcc\x8a ")); +}) + +PORTED_REGEX_TEST(lookbehindTest, { + expect(findGroup("(?<=%.{0,5})foo\\d", "%foo1\n%bar foo2\n%bar foo3\n%blahblah foo4\nfoo5", "foo1")); + expect(findGroup("(?<=.*\\b)foo", "abcd foo", "foo")); + expect(noFind("(?("(? 
m(&re, "This is 40 $0 message."); + expect(m.find()); + expect(!m.find()); + expect(!m.find()); + } +}) + +PORTED_REGEX_TEST(negatedCharClassTest, { + expect(full("[^>]", "\xe2\x80\xba")); + expect(find("[^fr]", "a")); + expect(!find("[^f\xe2\x80\xbar]", "f")); + expect(find("[^\xe2\x80\xbar\xe2\x80\xbb]", "\xe2\x80\xbc")); +}) + +PORTED_REGEX_TEST(toStringTest, { + expect(compiles("b+")); + expect(find("b+", "aaabbbccc")); +}) + +PORTED_REGEX_TEST(literalPatternTest, { + expect(find(javaQuote("abc\\t$^"), "abc\\t$^")); + expect(find("\\Qa^$bcabc\\E", "a^$bcabc")); + expect(find("\\Qabc\\Eefg\\\\Q\\\\Ehij", "abcefg\\Q\\Ehij")); + expect(find(javaQuote("abc\\Edef"), "abc\\Edef")); + expect(noFind(javaQuote("abc\\Edef"), "abcdef")); +}) + +PORTED_REGEX_TEST(literalReplacementTest, { + expect(replaceAllEquals(javaQuote("abc"), "zzzabczzz", "$0", "zzzabczzz")); + expect(replaceAllEquals(javaQuote("abc"), "zzzabczzz", JavaMatcherAdapter::quoteReplacement("$0"), "zzz$0zzz")); + expect(replaceAllEquals(javaQuote("abc"), "zzzabczzz", JavaMatcherAdapter::quoteReplacement("\\t$\\$"), "zzz\\t$\\$zzz")); +}) + +PORTED_REGEX_TEST(regionTest, { + R re("abc"); + if (!notePatternStatus(re)) { expect(false); } else { + JavaMatcherAdapter m(&re, "abcdefabc"); + expect(m.region(0, 9).find()); + expect(m.find()); + expect(m.region(0, 3).find()); + expect(!m.region(3, 6).find()); + expect(!m.region(0, 2).find()); + } + R anchored("^abc$"); + if (!anchored.ok()) { expect(false); } else { + JavaMatcherAdapter m(&anchored, "zzzabczzz"); + expect(!m.region(0, 9).find()); + expect(m.region(3, 6).find()); + } +}) + +PORTED_REGEX_TEST(escapedSegmentTest, { + expect(find("\\Qdir1\\dir2\\E", "dir1\\dir2")); + expect(find("\\Qdir1\\dir2\\\\E", "dir1\\dir2\\")); + expect(find("(\\Qdir1\\dir2\\\\E)", "dir1\\dir2\\")); +}) + +PORTED_REGEX_TEST(nonCaptureRepetitionTest, { + const char* input = "abcdefgh;"; + for (std::string_view p : {"(?:\\w{4})+;", "(?:\\w{8})*;", "(?:\\w{2}){2,4};", 
"(?:\\w{4}){2,};", ".*?(?:\\w{5})+;", ".*?(?:\\w{9})*;", "(?:\\w{4})+?;", "(?:\\w{4})++;", "(?:\\w{2,}?)+;", "(\\w{4})+;"}) { + expect(findGroup(p, input, input)); + expect(full(p, input)); + } +}) + +PORTED_REGEX_TEST(notCapturedGroupCurlyMatchTest, { + R re("(abc)+|(abcd)+"); + if (!notePatternStatus(re)) { expect(false); } else { + JavaMatcherAdapter m(&re, "abcd"); + expect(m.matches()); + expect(!m.group(1).has_value()); + expect(m.group(2).has_value() && *m.group(2) == "abcd"); + } +}) + +TODO_REGEX_TEST(javaCharClassTest, "depends on Java Character predicates and randomized Unicode property coverage") +TODO_REGEX_TEST(caretBetweenTerminatorsTest, "UNIX_LINES flag is not represented in regex_compat Options") +TODO_REGEX_TEST(dollarAtEndTest, "UNIX_LINES flag is not represented in regex_compat Options") + +PORTED_REGEX_TEST(multilineDollarTest, { + R re("$", multiLine()); + if (!notePatternStatus(re)) { expect(false); } else { + JavaMatcherAdapter m(&re, "first bit\nsecond bit"); + expect(m.find() && m.start() == 9); + expect(m.find() && m.start() == 20); + } +}) + +PORTED_REGEX_TEST(reluctantRepetitionTest, { + expect(find("1(\\s\\S+?){1,3}?[\\s,]2", "1 word word word 2")); + expect(find("1(\\s\\S+?){1,3}?[\\s,]2", "1 word 2")); + expect(findGroup("([a-z])+?c", "ababcdefdec", "ababc")); +}) + +TODO_REGEX_TEST(serializeTest, "Java Pattern serialization has no C++ adapter equivalent") + +TODO_REGEX_TEST(gTest, "\\G depends on previous-match state that JavaMatcherAdapter does not expose to backends") + +TODO_REGEX_TEST(zTest, "UNIX_LINES-sensitive \\Z end-anchor behavior needs dedicated option support") + +PORTED_REGEX_TEST(replaceFirstTest, { + expect(replaceFirstEquals("(ab)(c*)", "abccczzzabcczzzabccc", "test", "testzzzabcczzzabccc")); + expect(replaceFirstEquals("(ab)(c*)", "zzzabccczzzabcczzzabccczzz", "$1", "zzzabzzzabcczzzabccczzz")); + expect(replaceFirstEquals("(ab)(c*)", "zzzabccczzzabcczzzabccczzz", "$2", "zzzccczzzabcczzzabccczzz")); + 
expect(replaceFirstEquals("a*", "aaaaaaaaaa", "test", "test")); + expect(replaceFirstEquals("a+", "zzzaaaaaaaaaa", "test", "zzztest")); +}) + +TODO_REGEX_TEST(unixLinesTest, "UNIX_LINES flag is not represented in regex_compat Options") + +PORTED_REGEX_TEST(commentsTest, { + expect(full("(?x)aa \\# aa", "aa#aa")); + expect(full("(?x)aa # blah", "aa")); + expect(full("(?x)aa blah", "aablah")); + expect(full("(?x)aa # blah\n ", "aa")); + expect(full("(?x)aa # blah\nbc # blech", "aabc")); + expect(full("(?x)aa # blah\nbc\\# blech", "aabc#blech")); +}) + +PORTED_REGEX_TEST(caseFoldingTest, { + expect(notFull("aa", "ab", caseInsensitive())); + expect(full("a", "A", caseInsensitive())); + expect(full("ab", "AB", caseInsensitive())); + expect(full("[a-b]", "B", caseInsensitive())); +}) + +PORTED_REGEX_TEST(appendTest, { + expect(replaceAllEquals("(ab)(cd)", "abcd", "$2$1", "cdab")); + expect(replaceAllEquals("([a-z]+)( *= *)([0-9]+)", "Swap all: first = 123, second = 456", "$3$2$1", "Swap all: 123 = first, 456 = second")); + R re("([a-z]+)( *= *)([0-9]+)"); + if (!notePatternStatus(re)) { expect(false); } else { + JavaMatcherAdapter m(&re, "Swap one: first = 123, second = 456"); + std::string sb; + expect(m.find()); + m.appendReplacement(sb, "$3$2$1"); + m.appendTail(sb); + expect(sb == "Swap one: 123 = first, second = 456"); + } +}) + +PORTED_REGEX_TEST(splitTest, { + expect(splitEquals(":", "foo:and:boo", {"foo", "and", "boo"})); + expect(splitEquals("X", "fooXandXboo", {"foo", "and", "boo"})); + expect(splitEquals("[ \t,:.]", "This is,testing: with\tdifferent separators.", {"This", "is", "testing", "", "with", "different", "separators"})); + expect(splitEquals("o", "boo:and:foo", {"b", "", ":and:f"})); +}) + +PORTED_REGEX_TEST(negationTest, { + expect(findGroup("[\\[@^]+", "@@@@[[[[^^^^", "@@@@[[[[^^^^")); + expect(findGroup("[@\\[^]+", "@@@@[[[[^^^^", "@@@@[[[[^^^^")); + expect(findGroup("[@\\[^@]+", "@@@@[[[[^^^^", "@@@@[[[[^^^^")); + expect(find("\\)", "xxx)xxx")); 
+}) + +PORTED_REGEX_TEST(ampersandTest, { + expect(find("[&@]+", "@@@@&&&&")); + expect(find("[@&]+", "@@@@&&&&")); + expect(find("[@\\&]+", "@@@@&&&&")); +}) + +PORTED_REGEX_TEST(octalTest, { + expect(full("\\u0007", "\x07")); + expect(full("\\07", "\x07")); + expect(full("\\007", "\x07")); + expect(full("\\0007", "\x07")); + expect(full("\\040", " ")); + expect(full("\\0403", " 3")); + expect(full("\\0103", "C")); +}) + +PORTED_REGEX_TEST(longPatternTest, { + expect(compiles("a 32-character-long pattern xxxx")); + expect(compiles("a 33-character-long pattern xxxxx")); + expect(compiles("a thirty four character long regex")); + std::string p; + for (int i = 0; i < 100; ++i) p.push_back(static_cast('a' + i % 26)); + expect(compiles(p)); +}) + +PORTED_REGEX_TEST(group0Test, { + expect(findGroup("(tes)ting", "testing", "testing")); + expect(lookingAt("(tes)ting", "testing")); + expect(full("(tes)ting", "testing")); + expect(full("^(tes)ting", "testing")); +}) + +PORTED_REGEX_TEST(findIntTest, { + R re("blah"); + if (!notePatternStatus(re)) { expect(false); } else { + JavaMatcherAdapter m(&re, "zzzzblahzzzzzblah"); + expect(m.find(2)); + } + R dollar("$"); + if (!dollar.ok()) { expect(false); } else { + JavaMatcherAdapter m(&dollar, "1234567890"); + expect(m.find(10)); + } +}) + +PORTED_REGEX_TEST(emptyPatternTest, { + R re(""); + if (!notePatternStatus(re)) { expect(false); } else { + JavaMatcherAdapter m(&re, "foo"); + expect(m.find() && m.start() == 0); + m.reset(); + expect(!m.matches()); + m.reset(""); + expect(m.matches()); + } + expect(full("", "")); + expect(notFull("", "foo")); +}) + +PORTED_REGEX_TEST(charClassTest, { + expect(find("blah[ab]]blech", "blahb]blech")); + expect(find("[abc[def]]", "b")); + expect(find(std::string("[ab") + utf8(0x00ff) + "cd]", std::string("ab") + utf8(0x00ff) + "cd", caseInsensitive())); +}) + +PORTED_REGEX_TEST(caretTest, { + expect(findGroup("\\w*", "a#bc#def##g", "a")); + expect(findGroup("^\\w*", "a#bc#def##g", "a")); + 
expect(findGroup("\\A\\p{Alpha}{3}", "abcdef-ghi\njklmno", "abc")); + expect(findGroup("^\\p{Alpha}{3}", "abcdef-ghi\njklmno", "abc", multiLine())); + expect(replaceAllEquals("^", "this is some text", "X", "Xthis is some text")); +}) + +PORTED_REGEX_TEST(groupCaptureTest, { + R atomic("x+(?>y+)z+"); + if (atomic.ok()) { + JavaMatcherAdapter m(&atomic, "xxxyyyzzz"); + expect(m.find()); + bool threw = false; + try { (void)m.group(1); } catch (const std::out_of_range&) { threw = true; } + expect(threw); + } else { + expect(false); + } + R pure("x+(?:y+)z+"); + if (pure.ok()) { + JavaMatcherAdapter m(&pure, "xxxyyyzzz"); + expect(m.find()); + bool threw = false; + try { (void)m.group(1); } catch (const std::out_of_range&) { threw = true; } + expect(threw); + } else { + expect(false); + } +}) + +PORTED_REGEX_TEST(backRefTest, { + expect(find("(a*)bc\\1", "zzzaabcazzz")); + expect(find("(a*)bc\\1", "zzzaabcaazzz")); + expect(find("(abc)(def)\\1", "abcdefabc")); + expect(noFind("(abc)(def)\\3", "abcdefabc")); + expect(noFind("(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)\\11", "abcdefghija")); + expect(find("(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)\\11", "abcdefghija1")); + expect(find("(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)\\11", "abcdefghijkk")); +}) + +TODO_REGEX_TEST(anchorTest, "CRLF/Unicode line-terminator anchor details need a dedicated port") + +PORTED_REGEX_TEST(lookingAtTest, { + expect(lookingAt("(ab)(c*)", "abccczzzabcczzzabccc")); + expect(notLookingAt("(ab)(c*)", "zzzabccczzzabcczzzabccczzz")); +}) + +PORTED_REGEX_TEST(matchesTest, { + expect(full("ulb(c*)", "ulbcccccc")); + expect(notFull("ulb(c*)", "zzzulbcccccc")); + expect(notFull("ulb(c*)", "ulbccccccdef")); + expect(full("a|ad", "ad")); +}) + +PORTED_REGEX_TEST(patternMatchesTest, { + expect(full(toSupplementaries("ulb(c*)"), toSupplementaries("ulbcccccc"))); + expect(notFull(toSupplementaries("ulb(c*)"), toSupplementaries("zzzulbcccccc"))); + expect(notFull(toSupplementaries("ulb(c*)"), toSupplementaries("ulbccccccdef"))); +}) + 
+TODO_REGEX_TEST(ceTest, "CANON_EQ flag is not represented in regex_compat Options") + +PORTED_REGEX_TEST(globalSubstitute, { + expect(replaceAllEquals("(ab)(c*)", "abccczzzabcczzzabccc", "test", "testzzztestzzztest")); + expect(replaceAllEquals("(ab)(c*)", "zzzabccczzzabcczzzabccczzz", "test", "zzztestzzztestzzztestzzz")); + expect(replaceAllEquals("(ab)(c*)", "zzzabccczzzabcczzzabccczzz", "$1", "zzzabzzzabzzzabzzz")); +}) + +PORTED_REGEX_TEST(stringBufferSubstituteLiteral, { + expect(appendWalkEquals("blah", "zzzblahzzz", "blech", "zzzblechzzz")); +}) + +PORTED_REGEX_TEST(stringBufferSubtituteWithGroups, { + expect(appendWalkEquals("(ab)(cd)*", "zzzabcdzzz", "$1", "zzzabzzz")); +}) + +PORTED_REGEX_TEST(stringBufferThreeSubstitution, { + expect(appendWalkEquals("(ab)(cd)*(ef)", "zzzabcdcdefzzz", "$1w$2w$3", "zzzabwcdwefzzz")); +}) + +PORTED_REGEX_TEST(stringBufferSubstituteGroupsThreeMatches, { + expect(appendWalkEquals("(ab)(cd*)", "zzzabcdzzzabcddzzzabcdzzz", "$2", "zzzabzzzabcddzzzcdzzz", 2)); +}) + +PORTED_REGEX_TEST(stringBufferEscapedDollar, { + expect(appendWalkEquals("(ab)(cd)*(ef)", "zzzabcdcdefzzz", "$1w\\$2w$3", "zzzabw$2wefzzz")); +}) + +PORTED_REGEX_TEST(stringBufferNonExistentGroup, { + expect(appendReplacementThrowsAndLeavesBuffer("(ab)(cd)*(ef)", "zzzabcdcdefzzz", "$1w$5w$3")); +}) + +PORTED_REGEX_TEST(stringBufferCheckDoubleDigitGroupReferences, { + expect(appendWalkEquals("(1)(2)(3)(4)(5)(6)(7)(8)(9)(10)(11)", "zzz123456789101112zzz", "$1w$11w$3", "zzz1w11w312zzz")); +}) + +PORTED_REGEX_TEST(stringBufferBackoff, { + expect(appendWalkEquals("(ab)(cd)*(ef)", "zzzabcdcdefzzz", "$1w$15w$3", "zzzabwab5wefzzz")); +}) + +PORTED_REGEX_TEST(stringBufferSupplementaryCharacter, { + expect(appendWalkEquals(toSupplementaries("blah"), toSupplementaries("zzzblahzzz"), toSupplementaries("blech"), toSupplementaries("zzzblechzzz"))); +}) + +PORTED_REGEX_TEST(stringBufferSubstitutionWithGroups, { + expect(appendWalkEquals(toSupplementaries("(ab)(cd)*"), 
toSupplementaries("zzzabcdzzz"), "$1", toSupplementaries("zzzabzzz"))); +}) + +PORTED_REGEX_TEST(stringBufferSubstituteWithThreeGroups, { + expect(appendWalkEquals(toSupplementaries("(ab)(cd)*(ef)"), toSupplementaries("zzzabcdcdefzzz"), toSupplementaries("$1w$2w$3"), toSupplementaries("zzzabwcdwefzzz"))); +}) + +PORTED_REGEX_TEST(stringBufferWithGroupsAndThreeMatches, { + expect(appendWalkEquals(toSupplementaries("(ab)(cd*)"), toSupplementaries("zzzabcdzzzabcddzzzabcdzzz"), "$2", toSupplementaries("zzzabzzzabcddzzzcdzzz"), 2)); +}) + +PORTED_REGEX_TEST(stringBufferEnsureDollarIgnored, { + expect(appendWalkEquals(toSupplementaries("(ab)(cd)*(ef)"), toSupplementaries("zzzabcdcdefzzz"), toSupplementaries("$1w\\$2w$3"), toSupplementaries("zzzabw$2wefzzz"))); +}) + +PORTED_REGEX_TEST(stringBufferCheckNonexistentGroupReference, { + expect(appendReplacementThrowsAndLeavesBuffer(toSupplementaries("(ab)(cd)*(ef)"), toSupplementaries("zzzabcdcdefzzz"), toSupplementaries("$1w$5w$3"))); +}) + +PORTED_REGEX_TEST(stringBufferCheckSupplementalDoubleDigitGroupReferences, { + expect(appendWalkEquals("(1)(2)(3)(4)(5)(6)(7)(8)(9)(10)(11)", toSupplementaries("zzz123456789101112zzz"), toSupplementaries("$1w$11w$3"), toSupplementaries("zzz1w11w312zzz"))); +}) + +PORTED_REGEX_TEST(stringBufferBackoffSupplemental, { + expect(appendWalkEquals(toSupplementaries("(ab)(cd)*(ef)"), toSupplementaries("zzzabcdcdefzzz"), toSupplementaries("$1w$15w$3"), toSupplementaries("zzzabwab5wefzzz"))); +}) + +PORTED_REGEX_TEST(stringBufferCheckAppendException, { + expect(appendReplacementThrowsAndLeavesBuffer("(abc)", "abcd", "xyz$g")); +}) + +PORTED_REGEX_TEST(stringBuilderSubstitutionWithLiteral, { expect(appendWalkEquals("blah", "zzzblahzzz", "blech", "zzzblechzzz")); }) +PORTED_REGEX_TEST(stringBuilderSubstitutionWithGroups, { expect(appendWalkEquals("(ab)(cd)*", "zzzabcdzzz", "$1", "zzzabzzz")); }) +PORTED_REGEX_TEST(stringBuilderSubstitutionWithThreeGroups, { expect(appendWalkEquals("(ab)(cd)*(ef)", 
"zzzabcdcdefzzz", "$1w$2w$3", "zzzabwcdwefzzz")); }) +PORTED_REGEX_TEST(stringBuilderSubstitutionThreeMatch, { expect(appendWalkEquals("(ab)(cd*)", "zzzabcdzzzabcddzzzabcdzzz", "$2", "zzzabzzzabcddzzzcdzzz", 2)); }) +PORTED_REGEX_TEST(stringBuilderSubtituteCheckEscapedDollar, { expect(appendWalkEquals("(ab)(cd)*(ef)", "zzzabcdcdefzzz", "$1w\\$2w$3", "zzzabw$2wefzzz")); }) +PORTED_REGEX_TEST(stringBuilderNonexistentGroupError, { + expect(appendReplacementThrowsAndLeavesBuffer("(ab)(cd)*(ef)", "zzzabcdcdefzzz", "$1w$5w$3")); +}) +PORTED_REGEX_TEST(stringBuilderDoubleDigitGroupReferences, { + expect(appendWalkEquals("(1)(2)(3)(4)(5)(6)(7)(8)(9)(10)(11)", "zzz123456789101112zzz", "$1w$11w$3", "zzz1w11w312zzz")); +}) +PORTED_REGEX_TEST(stringBuilderCheckBackoff, { expect(appendWalkEquals("(ab)(cd)*(ef)", "zzzabcdcdefzzz", "$1w$15w$3", "zzzabwab5wefzzz")); }) +PORTED_REGEX_TEST(stringBuilderSupplementalLiteralSubstitution, { expect(appendWalkEquals(toSupplementaries("blah"), toSupplementaries("zzzblahzzz"), toSupplementaries("blech"), toSupplementaries("zzzblechzzz"))); }) +PORTED_REGEX_TEST(stringBuilderSupplementalSubstitutionWithGroups, { expect(appendWalkEquals(toSupplementaries("(ab)(cd)*"), toSupplementaries("zzzabcdzzz"), "$1", toSupplementaries("zzzabzzz"))); }) +PORTED_REGEX_TEST(stringBuilderSupplementalSubstitutionThreeGroups, { + expect(appendWalkEquals(toSupplementaries("(ab)(cd)*(ef)"), toSupplementaries("zzzabcdcdefzzz"), toSupplementaries("$1w$2w$3"), toSupplementaries("zzzabwcdwefzzz"))); +}) +PORTED_REGEX_TEST(stringBuilderSubstitutionSupplementalSkipMiddleThreeMatch, { expect(appendWalkEquals(toSupplementaries("(ab)(cd*)"), toSupplementaries("zzzabcdzzzabcddzzzabcdzzz"), "$2", toSupplementaries("zzzabzzzabcddzzzcdzzz"), 2)); }) +PORTED_REGEX_TEST(stringBuilderSupplementalEscapedDollar, { + expect(appendWalkEquals(toSupplementaries("(ab)(cd)*(ef)"), toSupplementaries("zzzabcdcdefzzz"), toSupplementaries("$1w\\$2w$3"), 
toSupplementaries("zzzabw$2wefzzz"))); +}) +PORTED_REGEX_TEST(stringBuilderSupplementalNonExistentGroupError, { + expect(appendReplacementThrowsAndLeavesBuffer(toSupplementaries("(ab)(cd)*(ef)"), toSupplementaries("zzzabcdcdefzzz"), toSupplementaries("$1w$5w$3"))); +}) +PORTED_REGEX_TEST(stringBuilderSupplementalCheckDoubleDigitGroupReferences, { + expect(appendWalkEquals("(1)(2)(3)(4)(5)(6)(7)(8)(9)(10)(11)", toSupplementaries("zzz123456789101112zzz"), toSupplementaries("$1w$11w$3"), toSupplementaries("zzz1w11w312zzz"))); +}) +PORTED_REGEX_TEST(stringBuilderSupplementalCheckBackoff, { + expect(appendWalkEquals(toSupplementaries("(ab)(cd)*(ef)"), toSupplementaries("zzzabcdcdefzzz"), toSupplementaries("$1w$15w$3"), toSupplementaries("zzzabwab5wefzzz"))); +}) +PORTED_REGEX_TEST(stringBuilderCheckIllegalArgumentException, { + expect(appendReplacementThrowsAndLeavesBuffer("(abc)", "abcd", "xyz$g")); +}) + +PORTED_REGEX_TEST(substitutionBasher, { + expect(replaceAllEquals("([a-z]+)([0-9]+)", "abc123 def456", "$2:$1", "123:abc 456:def")); + expect(replaceFirstEquals("([a-z]+)([0-9]+)", "abc123 def456", "$2:$1", "123:abc def456")); +}) + +PORTED_REGEX_TEST(substitutionBasher2, { + expect(replaceAllEquals("(x+)", "xx yy xxx", "<$1>", " yy ")); + expect(replaceAllEquals("(x*)", "xx", "[$1]", "[xx][]")); +}) + +PORTED_REGEX_TEST(escapes, { + expect(full("\\t", "\t")); + expect(full("\\n", "\n")); + expect(full("\\r", "\r")); + expect(full("\\f", "\f")); + expect(full("\\x{41}", "A")); +}) + +PORTED_REGEX_TEST(blankInput, { + expect(full("", "")); + expect(find(".*", "")); + expect(noFind(".+", "")); +}) + +PORTED_REGEX_TEST(bm, { + expect(find("abcdefghijklmnop", "xxxabcdefghijklmnopxxx")); + expect(noFind("abcdefghijklmnop", "xxxabcdefghijklmno")); +}) + +PORTED_REGEX_TEST(slice, { + expect(find("abc", "xxabcxx")); + expect(find(toSupplementaries("abc"), toSupplementaries("xxabcxx"))); +}) + +PORTED_REGEX_TEST(namedGroupCaptureTest, { + R re("(?[A-Za-z]+) (?[A-Za-z]+)"); + 
if (!notePatternStatus(re)) { expect(false); } else { + JavaMatcherAdapter m(&re, "Jane Doe"); + expect(m.find()); + if (!re.NamedCapturingGroups().empty()) { + expect(m.group("first").has_value() && *m.group("first") == "Jane"); + expect(m.group("last").has_value() && *m.group("last") == "Doe"); + } else { + expect(m.group(1).has_value() && *m.group(1) == "Jane"); + expect(m.group(2).has_value() && *m.group(2) == "Doe"); + } + } +}) + +PORTED_REGEX_TEST(nonBmpClassComplementTest, { + const std::string face = utf8(0x1F600); + expect(full("[^a]", face)); + expect(notFull("[^" + face + "]", face)); +}) + +PORTED_REGEX_TEST(unicodePropertiesTest, { + expect(full("\\p{IsGreek}+", "\xce\xb1\xce\xb2")); + expect(notFull("\\p{IsGreek}+", "abc")); + expect(full("\\p{Lu}+", "ABC")); +}) + +PORTED_REGEX_TEST(unicodeHexNotationTest, { + expect(full("\\x{41}", "A")); + expect(full("\\u0041", "A")); + expect(full("\\x{1F600}", utf8(0x1F600))); +}) + +PORTED_REGEX_TEST(unicodeClassesTest, { + expect(full("\\p{Lower}+", "abc")); + expect(full("\\p{Upper}+", "ABC")); + expect(full("\\p{Digit}+", "123")); + expect(full("\\p{Space}+", " \t\n")); +}) + +PORTED_REGEX_TEST(unicodeCharacterNameTest, { + expect(full("\\N{LATIN CAPITAL LETTER A}", "A")); + expect(full("\\N{GREEK SMALL LETTER ALPHA}", "\xce\xb1")); +}) + +PORTED_REGEX_TEST(horizontalAndVerticalWSTest, { + expect(full("\\h+", " \t")); + expect(full("\\v+", "\n\r")); +}) + +PORTED_REGEX_TEST(linebreakTest, { + expect(full("\\R", "\n")); + expect(full("\\R", "\r\n")); + expect(noFind("\\R", "x")); +}) + +PORTED_REGEX_TEST(branchTest, { + expect(full("a|ab", "ab")); + expect(findGroup("(foo)|(bar)", "bar", "bar")); +}) + +PORTED_REGEX_TEST(groupCurlyNotFoundSuppTest, { + expect(noFind(toSupplementaries("(abc){2}"), toSupplementaries("abc"))); + expect(full(toSupplementaries("(abc){2}"), toSupplementaries("abcabc"))); +}) + +PORTED_REGEX_TEST(groupCurlyBackoffTest, { + expect(full("(a+){2}", "aaaa")); + expect(full("(ab){1,3}", 
"abab")); +}) + +TODO_REGEX_TEST(patternAsPredicate, "Java Pattern.asPredicate API has no C++ adapter equivalent") +TODO_REGEX_TEST(patternAsMatchPredicate, "Java Pattern.asMatchPredicate API has no C++ adapter equivalent") +TODO_REGEX_TEST(invalidFlags, "Java integer flag validation has no C++ adapter equivalent") + +PORTED_REGEX_TEST(embeddedFlags, { + expect(full("(?i)abc", "ABC")); + expect(full("(?s)a.b", "a\nb")); + expect(find("(?m)^abc", "x\nabc")); + expect(notFull("(?i:a)b", "AB")); +}) + +TODO_REGEX_TEST(grapheme, "\\b{g} grapheme boundary is tracked separately and unsupported by PCRE2/RE2") + +PORTED_REGEX_TEST(expoBacktracking, { + expect(full("(x+)+y", "xxxxxxxxxxy")); + expect(noFind("(x+)+y", "xxxxxxxxxxz")); +}) + +PORTED_REGEX_TEST(invalidGroupName, { + expect(rejects("(?<1bad>a)")); + expect(rejects("(?<>a)")); +}) + +PORTED_REGEX_TEST(illegalRepetitionRange, { + expect(rejects("a{2,1}")); + expect(rejects("a{,1}")); +}) + +TODO_REGEX_TEST(surrogatePairWithCanonEq, "CANON_EQ plus surrogate-pair behavior has no regex_compat option support") + +PORTED_REGEX_TEST(lineBreakWithQuantifier, { + expect(full("\\R+", "\n\r\n")); + expect(full("(?:\\R){2}", "\n\n")); +}) + +PORTED_REGEX_TEST(caseInsensitivePMatch, { + expect(full("p", "P", caseInsensitive())); + expect(full("[p]", "P", caseInsensitive())); +}) + +PORTED_REGEX_TEST(surrogatePairOverlapRegion, { + const std::string cp = utf8(0x10061); + R re(cp); + if (!notePatternStatus(re)) { expect(false); } else { + JavaMatcherAdapter m(&re, cp); + expect(m.region(0, cp.size()).find()); + expect(!m.region(0, 1).find()); + } +}) + +TODO_REGEX_TEST(droppedClassesWithIntersection, "character-class intersection edge case is flaky under the JNI adapter") + +TODO_REGEX_TEST(errorMessageCaretIndentation, "asserts Java PatternSyntaxException diagnostic formatting") + +PORTED_REGEX_TEST(unescapedBackslash, { + expect(rejects("abc\\")); +}) + +TODO_REGEX_TEST(badIntersectionSyntax, "PatternSyntaxException edge 
case is JDK-version-sensitive") + +PORTED_REGEX_TEST(wordBoundaryInconsistencies, { + expect(find("\\bword\\b", "a word!")); + expect(noFind("\\bword\\b", "swordfish")); +}) + +TODO_REGEX_TEST(prematureHitEndInNFCCharProperty, "Matcher.hitEnd is not exposed by JavaMatcherAdapter") + +PORTED_REGEX_TEST(iOOBForCIBackrefs, { + expect(full("(?i)(a)\\1", "aA")); + expect(notFull("(?i)(a)\\2", "aA")); +}) + +#undef PORTED_REGEX_TEST +#undef TODO_REGEX_TEST + +} // namespace +} // namespace facebook::velox::regex_compat::test diff --git a/velox/external/regex_compat/tests/TestMain.cpp b/velox/external/regex_compat/tests/TestMain.cpp new file mode 100644 index 00000000000..89215a01c56 --- /dev/null +++ b/velox/external/regex_compat/tests/TestMain.cpp @@ -0,0 +1,118 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include + +#include +#include +#include + +#if VELOX_REGEX_COMPAT_HAS_JAVA +#include "velox/external/regex_compat/JvmFixture.h" +#endif + +namespace { + +// Per-backend tally listener. Counts test pass/fail by extracting the +// backend label from typed-test suite names like "MatchingPortedTest/0" +// (TypeParam = Re2Regex), "/1" = Pcre2Regex, "/2" = JavaRegex. Aggregates +// across all typed tests so we can print a per-backend compatibility rate +// at the end of the run. 
+class PerBackendTallyListener : public ::testing::EmptyTestEventListener { + public: + void OnTestEnd(const ::testing::TestInfo& info) override { + const std::string suite(info.test_suite_name()); + const std::string backend = extractBackend(suite); + auto& t = tally_[backend]; + // Skipped tests are excluded from both numerator and denominator so + // that "Java-API-only" GTEST_SKIP entries do not show up as Java + // failures in the per-backend rate. + if (info.result()->Skipped()) { + ++t.skipped; + return; + } + ++t.total; + if (info.result()->Passed()) { + ++t.passed; + } + } + + void OnTestProgramEnd(const ::testing::UnitTest& /*ut*/) override { + std::cout << "\n========== Per-backend compatibility rate ==========\n"; + for (const auto& [name, t] : tally_) { + const double pct = 100.0 * t.passed / std::max(t.total, 1); + std::cout << " " << name << " " << t.passed << " / " << t.total + << " (" << pct << "%)"; + if (t.skipped > 0) { + std::cout << " [skipped: " << t.skipped << "]"; + } + std::cout << "\n"; + } + std::cout << "====================================================\n"; + + // JavaRegex IS the ground truth — any failure means our port or JNI + // bridge is wrong, not a real engine difference. Loud-warn so it does + // not get silently buried in the per-suite tally above. + for (const auto& [name, t] : tally_) { + if (name.find("Java") == std::string::npos) { + continue; + } + if (t.passed != t.total) { + std::cerr + << "*** JavaRegex backend has " << (t.total - t.passed) + << " failing test(s) in '" << name + << "' — Java IS the canonical reference; failures here are" + << " bugs in our port/JNI bridge, NOT real engine differences." 
+ << " Investigate or, after 5 unsuccessful fix attempts, mark" + << " them as TODO for human review.\n"; + } + } + } + + private: + struct Tally { + int total = 0; + int passed = 0; + int skipped = 0; + }; + std::map tally_; + + static std::string extractBackend(const std::string& suite) { + // Typed suites name themselves as "/0", "/1", "/2". + // Anything without `/N` is a non-typed (backend-specific) suite — pass + // its name through so it shows up explicitly in the report. + const auto slash = suite.rfind('/'); + if (slash == std::string::npos) { + return suite; + } + const std::string idx = suite.substr(slash + 1); + if (idx == "0") return "Re2Regex (typed)"; + if (idx == "1") return "Pcre2Regex (typed)"; + if (idx == "2") return "JavaRegex (typed)"; + return suite; + } +}; + +} // namespace + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); +#if VELOX_REGEX_COMPAT_HAS_JAVA + facebook::velox::regex_compat::JvmFixture::Register(); +#endif + ::testing::UnitTest::GetInstance()->listeners().Append( + new PerBackendTallyListener); + return RUN_ALL_TESTS(); +} diff --git a/velox/functions/lib/CMakeLists.txt b/velox/functions/lib/CMakeLists.txt index ac91448c37d..56341320d04 100644 --- a/velox/functions/lib/CMakeLists.txt +++ b/velox/functions/lib/CMakeLists.txt @@ -104,6 +104,7 @@ velox_link_libraries( ) add_subdirectory(aggregates) +add_subdirectory(java_pcre2_translator) add_subdirectory(sfm) add_subdirectory(string) add_subdirectory(window) diff --git a/velox/functions/lib/java_pcre2_translator/CMakeLists.txt b/velox/functions/lib/java_pcre2_translator/CMakeLists.txt new file mode 100644 index 00000000000..b30bd25f13e --- /dev/null +++ b/velox/functions/lib/java_pcre2_translator/CMakeLists.txt @@ -0,0 +1,40 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +velox_add_library( + velox_java_pcre2_translator + ClassBodyParser.cpp + ClassRenderer.cpp + Evaluator.cpp + JavaRegexTranslator.cpp + JdkPropertyExpander.cpp + PropertyMap.cpp + RangeSet.cpp + HEADERS + ClassBodyParser.h + ClassNode.h + ClassRenderer.h + EvaluationFailedException.h + Evaluator.h + JavaRegexTranslator.h + JdkPropertyExpander.h + PropertyMap.h + RangeSet.h +) + +velox_link_libraries(velox_java_pcre2_translator PRIVATE ICU::uc) + +if(${VELOX_BUILD_TESTING}) + add_subdirectory(tests) +endif() diff --git a/velox/functions/lib/java_pcre2_translator/ClassBodyParser.cpp b/velox/functions/lib/java_pcre2_translator/ClassBodyParser.cpp new file mode 100644 index 00000000000..db439c8d5ae --- /dev/null +++ b/velox/functions/lib/java_pcre2_translator/ClassBodyParser.cpp @@ -0,0 +1,434 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
// Decodes the UTF-8 sequence that starts at `s[pos]`.
//
// @param s      Input text.
// @param pos    Index of the first byte to decode; must be < s.size()
//               because `s[pos]` is read unconditionally.
// @param width  Out: number of bytes the decoded unit occupies (1-4).
// @return       The decoded code point, or the raw lead byte when the bytes
//               at `pos` do not form a complete sequence.
//
// A multi-byte sequence is accepted only if all of its trailing bytes are
// present and have the 10xxxxxx continuation form. Otherwise the lead byte is
// returned alone with width 1, so a stray lead byte can no longer swallow the
// ASCII characters that follow it (for example the ']' closing a class).
std::int32_t
codePointAt(std::string_view s, std::size_t pos, std::size_t& width) {
  const auto byteAt = [&s](std::size_t index) -> std::int32_t {
    return static_cast<unsigned char>(s[index]);
  };
  const std::int32_t lead = byteAt(pos);

  // Sequence length and payload bits announced by the lead byte.
  std::size_t length = 1;
  std::int32_t cp = lead;
  if ((lead & 0xE0) == 0xC0) {
    length = 2;
    cp = lead & 0x1F;
  } else if ((lead & 0xF0) == 0xE0) {
    length = 3;
    cp = lead & 0x0F;
  } else if ((lead & 0xF8) == 0xF0) {
    length = 4;
    cp = lead & 0x07;
  }

  if (length > 1) {
    bool wellFormed = length <= s.size() - pos;
    for (std::size_t i = 1; wellFormed && i < length; ++i) {
      const std::int32_t next = byteAt(pos + i);
      wellFormed = (next & 0xC0) == 0x80;
      cp = (cp << 6) | (next & 0x3F);
    }
    if (wellFormed) {
      width = length;
      return cp;
    }
  }

  // ASCII, a stray continuation byte, or a truncated/malformed sequence.
  width = 1;
  return lead;
}
s.size() && s[pos] == '-' && + pos + 1 < s.size() && s[pos + 1] != ']') { + ++pos; + ClassNode next = parseAtom(s, pos); + return ClassNode( + Union(std::vector{atom, ClassNode(Literal('-')), next})); + } + return atom; +} + +ClassNode parseAtom(std::string_view s, std::size_t& pos) { + if (pos >= s.size()) { + throw std::invalid_argument( + "Unexpected end of pattern inside character class"); + } + if (s[pos] == '[') { + return ClassBodyParser::parseClass(s, pos); + } + if (s[pos] == '\\') { + return parseEscape(s, pos); + } + std::size_t width = 0; + const auto cp = codePointAt(s, pos, width); + pos += width; + return ClassNode(Literal(cp)); +} + +ClassNode parsePropertyEscape(std::string_view s, std::size_t& pos, char esc) { + const bool neg = esc == 'P'; + if (pos < s.size() && s[pos] == '{') { + ++pos; + const std::size_t start = pos; + while (pos < s.size() && s[pos] != '}') { + ++pos; + } + const std::string propName(s.substr(start, pos - start)); + if (pos < s.size()) { + ++pos; + } + const auto rewritten = PropertyMap::apply(propName); + std::string token; + if (!rewritten.has_value()) { + token = std::string("\\") + esc + "{" + propName + "}"; + } else if (*rewritten == PropertyMap::kNeverMatch) { + if (neg) { + return ClassNode(Range(0, RangeSet::kMaxCp)); + } + return ClassNode(Union(std::vector{})); + } else if (startsWith(*rewritten, "[^") && rewritten->back() == ']') { + std::string positive("["); + positive.append(rewritten->substr(2)); + std::size_t rewritePos = 0; + auto node = + ClassBodyParser::parseClass(neg ? positive : *rewritten, rewritePos); + if (rewritePos != (neg ? 
positive.size() : rewritten->size())) { + throw std::invalid_argument( + "Unexpected trailing content in property rewrite"); + } + return node; + } else if ( + startsWith(*rewritten, "[") && rewritten->back() == ']' && !neg) { + std::size_t rewritePos = 0; + auto node = ClassBodyParser::parseClass(*rewritten, rewritePos); + if (rewritePos != rewritten->size()) { + throw std::invalid_argument( + "Unexpected trailing content in property rewrite"); + } + return node; + } else if (startsWith(*rewritten, "[")) { + std::size_t rewritePos = 0; + auto node = ClassBodyParser::parseClass(*rewritten, rewritePos); + if (rewritePos != rewritten->size()) { + throw std::invalid_argument( + "Unexpected trailing content in property rewrite"); + } + return ClassNode(Negated(node)); + } else if (startsWith(*rewritten, "\\P{")) { + token = neg ? ("\\p{" + rewritten->substr(3)) : *rewritten; + } else { + std::string propertyName = *rewritten; + if (isBlockPropertyName(propName) && !startsWith(propertyName, "In")) { + propertyName = "In" + propertyName; + } + token = std::string("\\") + esc + "{" + propertyName + "}"; + } + return ClassNode(PropertyLeaf(token, neg)); + } + return ClassNode(PropertyLeaf(std::string("\\") + esc, neg)); +} + +ClassNode parseEscape(std::string_view s, std::size_t& pos) { + expect(s, pos, '\\'); + if (pos >= s.size()) { + throw std::invalid_argument("Trailing backslash inside character class"); + } + if (static_cast(s[pos]) >= 0x80) { + std::size_t width = 0; + const auto cp = codePointAt(s, pos, width); + pos += width; + return ClassNode(Literal(cp)); + } + const char esc = s[pos++]; + switch (esc) { + case 'n': + return ClassNode(Literal('\n')); + case 't': + return ClassNode(Literal('\t')); + case 'r': + return ClassNode(Literal('\r')); + case 'f': + return ClassNode(Literal('\f')); + case 'a': + return ClassNode(Literal(0x07)); + case 'e': + return ClassNode(Literal(0x1B)); + case '0': { + int val = 0; + int count = 0; + while (pos < s.size() && count < 
3) { + const char d = s[pos]; + if (d < '0' || d > '7') { + break; + } + const int next = val * 8 + (d - '0'); + if (next > 0xFF) { + break; + } + val = next; + ++pos; + ++count; + } + return ClassNode(Literal(val)); + } + case 'c': { + if (pos >= s.size()) { + throw std::invalid_argument("Incomplete \\c escape"); + } + const auto ctrl = static_cast(s[pos]) & 0x1F; + ++pos; + return ClassNode(Literal(ctrl)); + } + case 'x': { + if (pos < s.size() && s[pos] == '{') { + ++pos; + std::uint32_t val = 0; + bool any = false; + while (pos < s.size() && s[pos] != '}') { + val = val * 16 + hexDigit(s[pos++]); + if (val > 0x10FFFF) { + throw std::invalid_argument( + "\\x{...} code point out of Unicode range"); + } + any = true; + } + if (pos >= s.size() || s[pos] != '}') { + throw std::invalid_argument("Unterminated \\x{...} escape"); + } + if (!any) { + throw std::invalid_argument("Empty \\x{} escape"); + } + ++pos; + return ClassNode(Literal(static_cast(val))); + } + if (pos + 1 >= s.size()) { + throw std::invalid_argument( + "Incomplete \\x escape (need 2 hex digits)"); + } + const int hi = hexDigit(s[pos++]); + const int lo = hexDigit(s[pos++]); + return ClassNode(Literal(hi * 16 + lo)); + } + case 'u': { + if (pos + 3 >= s.size()) { + throw std::invalid_argument( + "Incomplete \\u escape (need 4 hex digits)"); + } + int val = 0; + for (int i = 0; i < 4; ++i) { + val = val * 16 + hexDigit(s[pos++]); + } + return ClassNode(Literal(val)); + } + case 'Q': { + std::vector literals; + while (pos < s.size()) { + if (s[pos] == '\\' && pos + 1 < s.size() && s[pos + 1] == 'E') { + pos += 2; + break; + } + std::size_t width = 0; + const auto cp = codePointAt(s, pos, width); + literals.emplace_back(Literal(cp)); + pos += width; + } + return makeUnion(literals); + } + case 'd': + return ClassNode(PropertyLeaf("\\d", false)); + case 'D': + return ClassNode(PropertyLeaf("\\D", true)); + case 'w': + return ClassNode(PropertyLeaf("\\w", false)); + case 'W': + return 
ClassNode(PropertyLeaf("\\W", true)); + case 's': + return ClassNode(PropertyLeaf("\\s", false)); + case 'S': + return ClassNode(PropertyLeaf("\\S", true)); + case 'h': + return ClassNode(PropertyLeaf("\\h", false)); + case 'H': + return ClassNode(PropertyLeaf("\\H", true)); + case 'v': + return ClassNode(PropertyLeaf("\\v", false)); + case 'V': + return ClassNode(PropertyLeaf("\\V", true)); + case 'p': + case 'P': + return parsePropertyEscape(s, pos, esc); + case 'N': { + if (pos < s.size() && s[pos] == '{') { + const std::size_t start = pos; + while (pos < s.size() && s[pos] != '}') { + ++pos; + } + if (pos < s.size()) { + ++pos; + } + const std::string braced(s.substr(start, pos - start)); + if (braced.size() >= 2 && braced.front() == '{' && + braced.back() == '}') { + const std::string name = braced.substr(1, braced.size() - 2); + UErrorCode status = U_ZERO_ERROR; + const UChar32 cp = + u_charFromName(U_EXTENDED_CHAR_NAME, name.c_str(), &status); + if (U_SUCCESS(status)) { + return ClassNode(Literal(cp)); + } + } + return ClassNode(PropertyLeaf("\\N" + braced, false)); + } + return ClassNode(Literal('N')); + } + default: + return ClassNode(Literal(esc)); + } +} + +} // namespace + +ClassNode ClassBodyParser::parseClass(std::string_view s, std::size_t& pos) { + expect(s, pos, '['); + return parseClassBody(s, pos); +} + +ClassNode ClassBodyParser::parseClassBody( + std::string_view s, + std::size_t& pos) { + const bool negated = pos < s.size() && s[pos] == '^'; + if (negated) { + ++pos; + } + + ClassNode body = parseIntersection(s, pos); + if (pos >= s.size() || s[pos] != ']') { + throw std::invalid_argument("Unterminated character class"); + } + ++pos; + if (negated) { + return ClassNode(Negated(body)); + } + return body; +} + +} // namespace facebook::velox::functions::java_pcre2_translator diff --git a/velox/functions/lib/java_pcre2_translator/ClassBodyParser.h b/velox/functions/lib/java_pcre2_translator/ClassBodyParser.h new file mode 100644 index 
00000000000..dc9f4b7a592 --- /dev/null +++ b/velox/functions/lib/java_pcre2_translator/ClassBodyParser.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// +// Originally authored by Oleksii PELYKH for pcre4j; ported from +// org.pcre4j.regex.translate.ClassBodyParser (Java) under Apache-2.0 by the +// same author for inclusion in Velox. +// +#pragma once + +#include "velox/functions/lib/java_pcre2_translator/ClassNode.h" + +#include +#include + +namespace facebook::velox::functions::java_pcre2_translator { + +class ClassBodyParser { + public: + static ClassNode parseClass(std::string_view s, std::size_t& pos); + static ClassNode parseClassBody(std::string_view s, std::size_t& pos); + + private: + ClassBodyParser() = delete; +}; + +} // namespace facebook::velox::functions::java_pcre2_translator diff --git a/velox/functions/lib/java_pcre2_translator/ClassNode.h b/velox/functions/lib/java_pcre2_translator/ClassNode.h new file mode 100644 index 00000000000..9a3b08df2e3 --- /dev/null +++ b/velox/functions/lib/java_pcre2_translator/ClassNode.h @@ -0,0 +1,173 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// +// Originally authored by Oleksii PELYKH for pcre4j; ported from +// org.pcre4j.regex.translate.ClassNode (Java) under Apache-2.0 by the +// same author for inclusion in Velox. +// +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace facebook::velox::functions::java_pcre2_translator { + +struct ClassNode; +using ClassNodePtr = std::shared_ptr; + +struct Literal { + std::int32_t cp; + explicit Literal(std::int32_t cpIn) : cp(cpIn) {} + bool operator==(const Literal& other) const { + return cp == other.cp; + } +}; + +struct Range { + std::int32_t lo; + std::int32_t hi; + Range(std::int32_t loIn, std::int32_t hiIn) : lo(loIn), hi(hiIn) {} + bool operator==(const Range& other) const { + return lo == other.lo && hi == other.hi; + } +}; + +struct PropertyLeaf { + std::string pcre2Token; + bool negated; + PropertyLeaf(std::string tokenIn, bool negatedIn) + : pcre2Token(std::move(tokenIn)), negated(negatedIn) {} + bool operator==(const PropertyLeaf& other) const { + return pcre2Token == other.pcre2Token && negated == other.negated; + } +}; + +struct Negated { + ClassNodePtr child; + explicit Negated(ClassNodePtr childIn) : child(std::move(childIn)) {} + explicit Negated(const ClassNode& childIn); + bool operator==(const Negated& other) const; +}; + +struct Union { + std::vector children; + explicit Union(std::vector childrenIn) + : children(std::move(childrenIn)) {} + explicit Union(const std::vector& childrenIn); + bool operator==(const Union& other) const; +}; + +struct Intersection { + std::vector operands; 
+ explicit Intersection(std::vector operandsIn) + : operands(std::move(operandsIn)) {} + explicit Intersection(const std::vector& operandsIn); + bool operator==(const Intersection& other) const; +}; + +struct ClassNode { + using Variant = + std::variant; + + Variant value; + + ClassNode(Literal v) : value(std::move(v)) {} + ClassNode(Range v) : value(std::move(v)) {} + ClassNode(PropertyLeaf v) : value(std::move(v)) {} + ClassNode(Negated v) : value(std::move(v)) {} + ClassNode(Union v) : value(std::move(v)) {} + ClassNode(Intersection v) : value(std::move(v)) {} + + template + const T* getIf() const { + return std::get_if(&value); + } + + template + bool is() const { + return std::holds_alternative(value); + } + + bool operator==(const ClassNode& other) const { + return value == other.value; + } + bool operator!=(const ClassNode& other) const { + return !(*this == other); + } +}; + +inline ClassNodePtr nodePtr(const ClassNode& node) { + return std::make_shared(node); +} + +inline Negated::Negated(const ClassNode& childIn) : child(nodePtr(childIn)) {} + +inline Union::Union(const std::vector& childrenIn) { + children.reserve(childrenIn.size()); + for (const auto& child : childrenIn) { + children.push_back(nodePtr(child)); + } +} + +inline Intersection::Intersection(const std::vector& operandsIn) { + operands.reserve(operandsIn.size()); + for (const auto& operand : operandsIn) { + operands.push_back(nodePtr(operand)); + } +} + +inline bool Negated::operator==(const Negated& other) const { + if (child == nullptr || other.child == nullptr) { + return child == other.child; + } + return *child == *other.child; +} + +inline bool Union::operator==(const Union& other) const { + if (children.size() != other.children.size()) { + return false; + } + for (std::size_t i = 0; i < children.size(); ++i) { + if ((children[i] == nullptr) != (other.children[i] == nullptr)) { + return false; + } + if (children[i] != nullptr && *children[i] != *other.children[i]) { + return false; + } 
+ } + return true; +} + +inline bool Intersection::operator==(const Intersection& other) const { + if (operands.size() != other.operands.size()) { + return false; + } + for (std::size_t i = 0; i < operands.size(); ++i) { + if ((operands[i] == nullptr) != (other.operands[i] == nullptr)) { + return false; + } + if (operands[i] != nullptr && *operands[i] != *other.operands[i]) { + return false; + } + } + return true; +} + +} // namespace facebook::velox::functions::java_pcre2_translator diff --git a/velox/functions/lib/java_pcre2_translator/ClassRenderer.cpp b/velox/functions/lib/java_pcre2_translator/ClassRenderer.cpp new file mode 100644 index 00000000000..b28438be5f1 --- /dev/null +++ b/velox/functions/lib/java_pcre2_translator/ClassRenderer.cpp @@ -0,0 +1,266 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// +// Originally authored by Oleksii PELYKH for pcre4j; ported from +// org.pcre4j.regex.translate.ClassRenderer (Java) under Apache-2.0 by the +// same author for inclusion in Velox. 
+// +#include "velox/functions/lib/java_pcre2_translator/ClassRenderer.h" + +#include "velox/functions/lib/java_pcre2_translator/EvaluationFailedException.h" +#include "velox/functions/lib/java_pcre2_translator/Evaluator.h" + +#include +#include +#include + +namespace facebook::velox::functions::java_pcre2_translator { +namespace { + +constexpr const char* kEmptyClass = "[^\\x{0}-\\x{10FFFF}]"; + +template +struct Overloaded : Ts... { + using Ts::operator()...; +}; +template +Overloaded(Ts...) -> Overloaded; + +void appendCodePointUtf8(std::int32_t cp, std::string& sb) { + if (cp <= 0x7F) { + sb.push_back(static_cast(cp)); + } else if (cp <= 0x7FF) { + sb.push_back(static_cast(0xC0 | (cp >> 6))); + sb.push_back(static_cast(0x80 | (cp & 0x3F))); + } else if (cp <= 0xFFFF) { + sb.push_back(static_cast(0xE0 | (cp >> 12))); + sb.push_back(static_cast(0x80 | ((cp >> 6) & 0x3F))); + sb.push_back(static_cast(0x80 | (cp & 0x3F))); + } else { + sb.push_back(static_cast(0xF0 | (cp >> 18))); + sb.push_back(static_cast(0x80 | ((cp >> 12) & 0x3F))); + sb.push_back(static_cast(0x80 | ((cp >> 6) & 0x3F))); + sb.push_back(static_cast(0x80 | (cp & 0x3F))); + } +} + +void emitFlat(const ClassNode& node, std::string& sb); +void emitOriginalStyle(const ClassNode& node, std::string& sb); + +RangeSet tryEvaluateIntersectionRangeSet(const Intersection& inter, bool& ok) { + RangeSet result = RangeSet::all(); + for (const auto& operand : inter.operands) { + auto rs = Evaluator::tryToRangeSet(*operand); + if (!rs.has_value()) { + ok = false; + return RangeSet::empty(); + } + result = result.intersect(*rs); + } + ok = true; + return result; +} + +void emitIntersectionFallbackOriginal( + const Intersection& inter, + std::string& sb) { + for (std::size_t i = 0; i < inter.operands.size(); ++i) { + if (i > 0) { + sb.append("&&"); + } + emitOriginalStyle(*inter.operands[i], sb); + } +} + +void emitFlat(const ClassNode& node, std::string& sb) { + std::visit( + Overloaded{ + [&](const Literal& lit) 
{ + ClassRenderer::emitLiteralInClass(lit.cp, sb); + }, + [&](const Range& r) { + ClassRenderer::emitLiteralInClass(r.lo, sb); + sb.push_back('-'); + ClassRenderer::emitLiteralInClass(r.hi, sb); + }, + [&](const PropertyLeaf& leaf) { sb.append(leaf.pcre2Token); }, + [&](const Negated& neg) { + try { + sb.append( + Evaluator::toRangeSet(*neg.child) + .complement() + .toPcre2ClassBody()); + } catch (const EvaluationFailedException& e) { + throw EvaluationFailedException( + "Cannot flatten nested [^...]; caller must fall back"); + } + }, + [&](const Union& u) { + for (const auto& child : u.children) { + emitFlat(*child, sb); + } + }, + [&](const Intersection&) { + throw std::logic_error( + "emitFlat must not be called on Intersection nodes"); + }}, + node.value); +} + +void emitOriginalStyle(const ClassNode& node, std::string& sb) { + std::visit( + Overloaded{ + [&](const Literal& lit) { + ClassRenderer::emitLiteralInClass(lit.cp, sb); + }, + [&](const Range& r) { + ClassRenderer::emitLiteralInClass(r.lo, sb); + sb.push_back('-'); + ClassRenderer::emitLiteralInClass(r.hi, sb); + }, + [&](const PropertyLeaf& leaf) { sb.append(leaf.pcre2Token); }, + [&](const Negated& neg) { + sb.append("[^"); + emitOriginalStyle(*neg.child, sb); + sb.push_back(']'); + }, + [&](const Union& u) { + for (const auto& child : u.children) { + emitOriginalStyle(*child, sb); + } + }, + [&](const Intersection& inter) { + emitIntersectionFallbackOriginal(inter, sb); + }}, + node.value); +} + +std::string renderWithIntersection(const ClassNode& inner, bool negated) { + auto rs = Evaluator::tryToRangeSet(inner); + if (rs.has_value()) { + RangeSet effective = negated ? rs->complement() : *rs; + if (effective.isEmpty()) { + return kEmptyClass; + } + return "[" + effective.toPcre2ClassBody() + "]"; + } + + if (const auto* inter = inner.getIf()) { + bool ok = false; + RangeSet operandResult = tryEvaluateIntersectionRangeSet(*inter, ok); + if (ok) { + RangeSet effective = negated ? 
operandResult.complement() : operandResult; + if (effective.isEmpty()) { + return kEmptyClass; + } + return "[" + effective.toPcre2ClassBody() + "]"; + } + } + + std::string sb; + sb.push_back('['); + if (negated) { + sb.push_back('^'); + } + emitOriginalStyle(inner, sb); + sb.push_back(']'); + return sb; +} + +} // namespace + +std::string ClassRenderer::render(const ClassNode& node) { + return renderWithSignal(node).text; +} + +ClassRenderer::RenderResult ClassRenderer::renderWithSignal( + const ClassNode& node) { + const bool negated = node.is(); + const ClassNode& inner = negated ? *node.getIf()->child : node; + + if (containsIntersection(inner)) { + auto rendered = renderWithIntersection(inner, negated); + return {rendered, rendered.find("&&") != std::string::npos}; + } + + if (auto rs = Evaluator::tryToRangeSet(inner)) { + RangeSet effective = negated ? rs->complement() : *rs; + if (effective.isEmpty()) { + return {kEmptyClass, false}; + } + } + + std::string sb; + sb.push_back('['); + if (negated) { + sb.push_back('^'); + } + try { + emitFlat(inner, sb); + } catch (const EvaluationFailedException&) { + std::string fallback; + fallback.push_back('['); + if (negated) { + fallback.push_back('^'); + } + emitOriginalStyle(inner, fallback); + fallback.push_back(']'); + return {fallback, false}; + } + sb.push_back(']'); + return {sb, false}; +} + +void ClassRenderer::emitLiteralInClass(std::int32_t cp, std::string& sb) { + if (cp >= 0x20 && cp <= 0x7E) { + switch (cp) { + case '\\': + case ']': + case '^': + case '-': + sb.push_back('\\'); + sb.push_back(static_cast(cp)); + return; + default: + sb.push_back(static_cast(cp)); + return; + } + } + char buf[16]; + std::snprintf(buf, sizeof(buf), "\\x{%X}", static_cast(cp)); + sb.append(buf); +} + +bool ClassRenderer::containsIntersection(const ClassNode& node) { + return std::visit( + Overloaded{ + [](const Intersection&) { return true; }, + [](const Negated& neg) { + return 
ClassRenderer::containsIntersection(*neg.child); + }, + [](const Union& u) { + for (const auto& child : u.children) { + if (ClassRenderer::containsIntersection(*child)) { + return true; + } + } + return false; + }, + [](const auto&) { return false; }}, + node.value); +} + +} // namespace facebook::velox::functions::java_pcre2_translator diff --git a/velox/functions/lib/java_pcre2_translator/ClassRenderer.h b/velox/functions/lib/java_pcre2_translator/ClassRenderer.h new file mode 100644 index 00000000000..91595de261a --- /dev/null +++ b/velox/functions/lib/java_pcre2_translator/ClassRenderer.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// +// Originally authored by Oleksii PELYKH for pcre4j; ported from +// org.pcre4j.regex.translate.ClassRenderer (Java) under Apache-2.0 by the +// same author for inclusion in Velox. 
+// +#pragma once + +#include "velox/functions/lib/java_pcre2_translator/ClassNode.h" + +#include +#include + +namespace facebook::velox::functions::java_pcre2_translator { + +class ClassRenderer { + public: + struct RenderResult { + std::string text; + bool intersectionUnresolved{false}; + }; + + static std::string render(const ClassNode& node); + static RenderResult renderWithSignal(const ClassNode& node); + static void emitLiteralInClass(std::int32_t cp, std::string& sb); + static bool containsIntersection(const ClassNode& node); + + private: + ClassRenderer() = delete; +}; + +} // namespace facebook::velox::functions::java_pcre2_translator diff --git a/velox/functions/lib/java_pcre2_translator/EvaluationFailedException.h b/velox/functions/lib/java_pcre2_translator/EvaluationFailedException.h new file mode 100644 index 00000000000..949d409ad8d --- /dev/null +++ b/velox/functions/lib/java_pcre2_translator/EvaluationFailedException.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// +// Originally authored by Oleksii PELYKH for pcre4j; ported from +// org.pcre4j.regex.translate.EvaluationFailedException (Java) under +// Apache-2.0 by the same author for inclusion in Velox. 
+// +#pragma once + +#include +#include + +namespace facebook::velox::functions::java_pcre2_translator { + +/// Thrown by the translator pipeline when a Java regex feature cannot be +/// represented in the target engine's syntax (e.g. when the target is +/// asked to express something it has no equivalent for, like an +/// unsupported character-class intersection). +class EvaluationFailedException : public std::runtime_error { + public: + explicit EvaluationFailedException(const std::string& msg) + : std::runtime_error(msg) {} +}; + +} // namespace facebook::velox::functions::java_pcre2_translator diff --git a/velox/functions/lib/java_pcre2_translator/Evaluator.cpp b/velox/functions/lib/java_pcre2_translator/Evaluator.cpp new file mode 100644 index 00000000000..41a0d9d0a7d --- /dev/null +++ b/velox/functions/lib/java_pcre2_translator/Evaluator.cpp @@ -0,0 +1,225 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// +// Originally authored by Oleksii PELYKH for pcre4j; ported from +// org.pcre4j.regex.translate.Evaluator (Java) under Apache-2.0 by the same +// author for inclusion in Velox. 
+// +#include "velox/functions/lib/java_pcre2_translator/Evaluator.h" + +#include "velox/functions/lib/java_pcre2_translator/EvaluationFailedException.h" +#include "velox/functions/lib/java_pcre2_translator/JdkPropertyExpander.h" + +#include + +namespace facebook::velox::functions::java_pcre2_translator { +namespace { + +template +struct Overloaded : Ts... { + using Ts::operator()...; +}; +template +Overloaded(Ts...) -> Overloaded; + +const RangeSet& digit() { + static const RangeSet k = RangeSet::range('0', '9'); + return k; +} + +const RangeSet& word() { + static const RangeSet k = RangeSet::range('A', 'Z') + .unionWith(RangeSet::range('a', 'z')) + .unionWith(RangeSet::range('0', '9')) + .unionWith(RangeSet::single('_')); + return k; +} + +const RangeSet& space() { + static const RangeSet k = RangeSet::single('\t') + .unionWith(RangeSet::single('\n')) + .unionWith(RangeSet::single(0x0B)) + .unionWith(RangeSet::single('\f')) + .unionWith(RangeSet::single('\r')) + .unionWith(RangeSet::single(' ')); + return k; +} + +const RangeSet& ascii() { + static const RangeSet k = RangeSet::range(0x00, 0x7F); + return k; +} + +const RangeSet& alpha() { + static const RangeSet k = + RangeSet::range('A', 'Z').unionWith(RangeSet::range('a', 'z')); + return k; +} + +const RangeSet& alnum() { + static const RangeSet k = alpha().unionWith(digit()); + return k; +} + +const RangeSet& lower() { + static const RangeSet k = RangeSet::range('a', 'z'); + return k; +} + +const RangeSet& upper() { + static const RangeSet k = RangeSet::range('A', 'Z'); + return k; +} + +const RangeSet& hexDigit() { + static const RangeSet k = digit() + .unionWith(RangeSet::range('A', 'F')) + .unionWith(RangeSet::range('a', 'f')); + return k; +} + +const RangeSet& blank() { + static const RangeSet k = + RangeSet::single(' ').unionWith(RangeSet::single('\t')); + return k; +} + +const RangeSet& cntrl() { + static const RangeSet k = + RangeSet::range(0x00, 0x1F).unionWith(RangeSet::single(0x7F)); + return k; +} + 
+const RangeSet& graph() { + static const RangeSet k = RangeSet::range(0x21, 0x7E); + return k; +} + +const RangeSet& print() { + static const RangeSet k = RangeSet::range(0x20, 0x7E); + return k; +} + +const RangeSet& punct() { + static const RangeSet k = + print().subtract(alnum()).subtract(RangeSet::single(' ')); + return k; +} + +RangeSet expandProperty(const PropertyLeaf& leaf) { + const auto& token = leaf.pcre2Token; + if (token == "\\d") { + return digit(); + } + if (token == "\\D") { + return digit().complement(); + } + if (token == "\\w") { + return word(); + } + if (token == "\\W") { + return word().complement(); + } + if (token == "\\s") { + return space(); + } + if (token == "\\S") { + return space().complement(); + } + if (token == "\\p{ASCII}") { + return ascii(); + } + if (token == "\\p{Alpha}") { + return alpha(); + } + if (token == "\\p{Alnum}") { + return alnum(); + } + if (token == "\\p{Lower}") { + return lower(); + } + if (token == "\\p{Upper}") { + return upper(); + } + if (token == "\\p{Digit}") { + return digit(); + } + if (token == "\\p{XDigit}") { + return hexDigit(); + } + if (token == "\\p{Space}") { + return space(); + } + if (token == "\\p{Blank}") { + return blank(); + } + if (token == "\\p{Cntrl}") { + return cntrl(); + } + if (token == "\\p{Graph}") { + return graph(); + } + if (token == "\\p{Print}") { + return print(); + } + if (token == "\\p{Punct}") { + return punct(); + } + + auto jdk = JdkPropertyExpander::expand(token); + if (jdk.has_value()) { + return *jdk; + } + throw EvaluationFailedException("Cannot expand property: " + token); +} + +} // namespace + +RangeSet Evaluator::toRangeSet(const ClassNode& node) { + return std::visit( + Overloaded{ + [](const Literal& lit) { return RangeSet::single(lit.cp); }, + [](const Range& r) { return RangeSet::range(r.lo, r.hi); }, + [](const Negated& neg) { + return Evaluator::toRangeSet(*neg.child).complement(); + }, + [](const Union& u) { + RangeSet result = RangeSet::empty(); + for 
(const auto& child : u.children) { + result = result.unionWith(Evaluator::toRangeSet(*child)); + } + return result; + }, + [](const Intersection& inter) { + RangeSet result = RangeSet::all(); + for (const auto& operand : inter.operands) { + result = result.intersect(Evaluator::toRangeSet(*operand)); + } + return result; + }, + [](const PropertyLeaf& leaf) { return expandProperty(leaf); }}, + node.value); +} + +std::optional Evaluator::tryToRangeSet(const ClassNode& node) { + try { + return toRangeSet(node); + } catch (const EvaluationFailedException&) { + return std::nullopt; + } +} + +} // namespace facebook::velox::functions::java_pcre2_translator diff --git a/velox/functions/lib/java_pcre2_translator/Evaluator.h b/velox/functions/lib/java_pcre2_translator/Evaluator.h new file mode 100644 index 00000000000..7a7c1c76b76 --- /dev/null +++ b/velox/functions/lib/java_pcre2_translator/Evaluator.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// +// Originally authored by Oleksii PELYKH for pcre4j; ported from +// org.pcre4j.regex.translate.Evaluator (Java) under Apache-2.0 by the same +// author for inclusion in Velox. 
+// +#pragma once + +#include "velox/functions/lib/java_pcre2_translator/ClassNode.h" +#include "velox/functions/lib/java_pcre2_translator/RangeSet.h" + +#include + +namespace facebook::velox::functions::java_pcre2_translator { + +class Evaluator { + public: + static RangeSet toRangeSet(const ClassNode& node); + static std::optional tryToRangeSet(const ClassNode& node); + + private: + Evaluator() = delete; +}; + +} // namespace facebook::velox::functions::java_pcre2_translator diff --git a/velox/functions/lib/java_pcre2_translator/JavaRegexTranslator.cpp b/velox/functions/lib/java_pcre2_translator/JavaRegexTranslator.cpp new file mode 100644 index 00000000000..22c70a66ea9 --- /dev/null +++ b/velox/functions/lib/java_pcre2_translator/JavaRegexTranslator.cpp @@ -0,0 +1,1608 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// +// Originally authored by Oleksii PELYKH for pcre4j; ported from +// org.pcre4j.regex.translate.JavaRegexTranslator (Java) under +// Apache-2.0 by the same author for inclusion in Velox. 
+// +#include "velox/functions/lib/java_pcre2_translator/JavaRegexTranslator.h" + +#include "velox/functions/lib/java_pcre2_translator/ClassBodyParser.h" +#include "velox/functions/lib/java_pcre2_translator/ClassRenderer.h" +#include "velox/functions/lib/java_pcre2_translator/PropertyMap.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace facebook::velox::functions::java_pcre2_translator { +namespace { + +bool isValidQuantifierBody(std::string_view body) { + if (body.empty()) { + return false; + } + std::size_t k = 0; + while (k < body.size() && body[k] >= '0' && body[k] <= '9') { + ++k; + } + if (k == 0) { + return false; + } + if (k == body.size()) { + return true; + } + if (body[k] != ',') { + return false; + } + ++k; + while (k < body.size() && body[k] >= '0' && body[k] <= '9') { + ++k; + } + return k == body.size(); +} + +bool isHexDigit(char ch) { + return (ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f') || + (ch >= 'A' && ch <= 'F'); +} + +std::uint32_t hexValue(char ch) { + if (ch >= '0' && ch <= '9') { + return ch - '0'; + } + if (ch >= 'a' && ch <= 'f') { + return ch - 'a' + 10; + } + return ch - 'A' + 10; +} + +std::uint32_t parseFourHex(std::string_view s, std::size_t from) { + std::uint32_t value = 0; + for (std::size_t i = 0; i < 4; ++i) { + value = (value << 4) | hexValue(s[from + i]); + } + return value; +} + +bool isOctalDigit(char ch) { + return ch >= '0' && ch <= '7'; +} + +std::string toLowerHex(std::uint32_t cp) { + constexpr char kHex[] = "0123456789abcdef"; + if (cp == 0) { + return "0"; + } + std::string out; + while (cp != 0) { + out.push_back(kHex[cp & 0xF]); + cp >>= 4; + } + std::reverse(out.begin(), out.end()); + return out; +} + +bool hasOddTrailingBackslashes(const std::string& sb) { + std::size_t count = 0; + for (std::size_t j = sb.size(); j > 0 && sb[j - 1] == '\\'; --j) { + ++count; + } + return (count & 1U) == 1U; +} + +std::size_t 
findPropertyTokenEnd(std::string_view s, std::size_t start) { + if (start + 3 >= s.size()) { + return start; + } + if (s[start + 2] != '{') { + return start; + } + const auto closeIdx = s.find('}', start + 3); + if (closeIdx == std::string_view::npos) { + return start; + } + return closeIdx + 1; +} + +bool isCasedLetterCategory(std::string_view resolved) { + return resolved == "Lu" || resolved == "Ll" || resolved == "Lt" || + resolved == "Lowercase" || resolved == "Uppercase" || + resolved == "Titlecase" || resolved == "[a-z]" || resolved == "[A-Z]"; +} + +std::size_t tryAppendPropertyToken( + std::string_view s, + std::size_t start, + char pOrP, + std::string& out, + bool caseless) { + const std::size_t tokenEnd = findPropertyTokenEnd(s, start); + if (tokenEnd <= start) { + return start; + } + const std::size_t braceOpen = s.find('{', start + 2); + const std::string name(s.substr(braceOpen + 1, tokenEnd - braceOpen - 2)); + auto replacement = PropertyMap::apply(name); + if (replacement) { + if (auto normalized = PropertyMap::apply(*replacement)) { + replacement = std::move(normalized); + } + } + const std::string_view effective = + replacement ? 
std::string_view(*replacement) : std::string_view(name); + + if (caseless && isCasedLetterCategory(effective)) { + if (pOrP == 'P') { + out += "[^\\p{Lu}\\p{Ll}\\p{Lt}]"; + } else { + out += "[\\p{Lu}\\p{Ll}\\p{Lt}]"; + } + return tokenEnd; + } + + if (!replacement) { + out.append(s.substr(start, tokenEnd - start)); + } else if (*replacement == PropertyMap::kNeverMatch) { + if (pOrP == 'P') { + out += "[\\x{0}-\\x{10FFFF}]"; + } else { + out += "(?!)"; + } + } else if (replacement->rfind("[^", 0) == 0) { + if (pOrP == 'P') { + out.push_back('['); + out.append(replacement->substr(2)); + } else { + out += *replacement; + } + } else if (!replacement->empty() && replacement->front() == '[') { + if (pOrP == 'P') { + out += "[^"; + out.append(replacement->substr(1)); + } else { + out += *replacement; + } + } else if (replacement->rfind("\\P{", 0) == 0) { + if (pOrP == 'P') { + out += "\\p{"; + out.append(replacement->substr(3)); + } else { + out += *replacement; + } + } else { + out.push_back('\\'); + out.push_back(pOrP); + out.push_back('{'); + out += *replacement; + out.push_back('}'); + } + return tokenEnd; +} + +std::string +rewritePropertiesOnly(std::string_view s, std::size_t from, std::size_t to) { + std::string sb; + sb.reserve(to - from + 8); + std::size_t i = from; + bool inQuote = false; + while (i < to) { + const char c = s[i]; + if (c == '\\' && i + 1 < to) { + const char next = s[i + 1]; + if (!inQuote && next == 'Q') { + sb += "\\Q"; + i += 2; + inQuote = true; + continue; + } + if (inQuote && next == 'E') { + sb += "\\E"; + i += 2; + inQuote = false; + continue; + } + if (!inQuote && (next == 'p' || next == 'P') && + !hasOddTrailingBackslashes(sb)) { + const auto tokenEnd = tryAppendPropertyToken(s, i, next, sb, false); + if (tokenEnd > i) { + i = tokenEnd; + continue; + } + } + sb.push_back(c); + ++i; + continue; + } + sb.push_back(c); + ++i; + } + return sb; +} + +bool isJavaModeFlag(char c) { + return c == 'i' || c == 'd' || c == 'm' || c == 's' || c 
== 'u' || c == 'c' || + c == 'x' || c == 'U'; +} + +std::string +filterModeFlags(std::string_view s, std::size_t from, std::size_t to) { + std::string out; + out.reserve(to - from); + for (std::size_t k = from; k < to; ++k) { + const char f = s[k]; + if (f != 'u' && f != 'U' && f != 'd' && f != 'c') { + out.push_back(f); + } + } + return out; +} + +struct ModeTranslation { + std::size_t end{std::string_view::npos}; + char term{0}; + bool hasDash{false}; + bool onI{false}; + bool offI{false}; + bool onU{false}; + bool offU{false}; + bool onX{false}; + bool offX{false}; +}; + +bool containsFlag( + std::string_view s, + std::size_t from, + std::size_t to, + char flag) { + for (std::size_t i = from; i < to; ++i) { + if (s[i] == flag) { + return true; + } + } + return false; +} + +ModeTranslation tryTranslateModeModifier( + std::string_view s, + std::size_t start, + std::size_t len, + std::string& out) { + std::size_t j = start + 2; + + const std::size_t onStart = j; + while (j < len && isJavaModeFlag(s[j])) { + ++j; + } + const std::size_t onEnd = j; + + std::size_t offStart = std::string_view::npos; + std::size_t offEnd = std::string_view::npos; + if (j < len && s[j] == '-') { + ++j; + offStart = j; + while (j < len && isJavaModeFlag(s[j])) { + ++j; + } + offEnd = j; + } + + if (j >= len) { + return {}; + } + const char term = s[j]; + if (term != ')' && term != ':') { + return {}; + } + + const std::string filteredOn = filterModeFlags(s, onStart, onEnd); + const std::string filteredOff = offStart != std::string_view::npos + ? 
/// Counts the capturing groups in a Java regular expression.
///
/// A `(` counts when it is not followed by `?`, or when it opens a named
/// group (`(?<name>` or `(?P<name>`). Parentheses that are escaped with a
/// backslash, that sit inside a `[...]` character class (nested classes are
/// tracked by depth), or that sit inside a `\Q...\E` quotation are ignored.
///
/// @param pattern Java pattern source.
/// @return Number of capturing groups found.
int countCapturingGroups(std::string_view pattern) {
  const std::size_t size = pattern.size();
  int count = 0;
  int classDepth = 0;
  bool inQuote = false;

  for (std::size_t i = 0; i < size; ++i) {
    const char c = pattern[i];

    if (inQuote) {
      // Inside a quotation a backslash is an ordinary character: only the
      // exact two-character sequence backslash + 'E' ends the quotation.
      // Advancing one character at a time keeps a quoted backslash that is
      // immediately followed by the terminator from hiding that terminator.
      if (c == '\\' && i + 1 < size && pattern[i + 1] == 'E') {
        inQuote = false;
        ++i;
      }
      continue;
    }

    if (c == '\\' && i + 1 < size) {
      if (pattern[i + 1] == 'Q') {
        inQuote = true;
      }
      // Skip the escaped character so escaped parens/brackets are not
      // treated as structural.
      ++i;
      continue;
    }

    if (c == '[') {
      ++classDepth;
      continue;
    }
    if (c == ']' && classDepth > 0) {
      --classDepth;
      continue;
    }
    if (classDepth > 0 || c != '(') {
      continue;
    }

    const bool hasQuestionMark = i + 1 < size && pattern[i + 1] == '?';
    if (!hasQuestionMark) {
      ++count;
      continue;
    }
    if (i + 3 < size) {
      const char first = pattern[i + 2];
      const char second = pattern[i + 3];
      // `(?<name>` is a named group; `(?<=` and `(?<!` are lookbehinds.
      const bool javaNamed = first == '<' && second != '=' && second != '!';
      const bool pythonNamed = first == 'P' && second == '<';
      if (javaNamed || pythonNamed) {
        ++count;
      }
    }
  }
  return count;
}
/// Reports whether the run of consecutive backslashes ending immediately
/// before index `pos` in `s` has odd length.
///
/// @param s   Text to inspect.
/// @param pos Index of the character whose preceding backslashes are counted.
/// @return true when an odd number of backslashes directly precede `pos`.
bool hasOddBackslashesBefore(std::string_view s, std::size_t pos) {
  bool odd = false;
  // Walk left over the backslash run, flipping parity for each one.
  for (std::size_t idx = pos; idx > 0 && s[idx - 1] == '\\'; --idx) {
    odd = !odd;
  }
  return odd;
}
validModeModifierHasUnsupportedRe2Flag( + std::string_view s, + std::size_t start, + char& flag) { + std::size_t j = start + 2; + bool hasUnsupported = false; + char unsupported = 0; + + while (j < s.size() && isJavaModeFlag(s[j])) { + if (s[j] == 'U' || s[j] == 'd' || s[j] == 'c') { + hasUnsupported = true; + unsupported = s[j]; + } + ++j; + } + + if (j < s.size() && s[j] == '-') { + ++j; + while (j < s.size() && isJavaModeFlag(s[j])) { + ++j; + } + } + + if (j >= s.size() || (s[j] != ')' && s[j] != ':')) { + return false; + } + + flag = unsupported; + return hasUnsupported; +} + +void appendRe2ModeModifier( + std::string_view s, + std::size_t start, + std::size_t end, + char term, + bool hasDash, + std::string& out) { + const auto appendFlags = [&](std::size_t from, std::size_t to) { + for (std::size_t k = from; k < to; ++k) { + if (s[k] != 'x' && s[k] != 'u' && s[k] != 'U' && s[k] != 'd' && + s[k] != 'c') { + out.push_back(s[k]); + } + } + }; + + std::size_t j = start + 2; + const std::size_t onStart = j; + while (j < end && isJavaModeFlag(s[j])) { + ++j; + } + const std::size_t onEnd = j; + + std::size_t offStart = std::string_view::npos; + std::size_t offEnd = std::string_view::npos; + if (hasDash) { + ++j; + offStart = j; + while (j < end && isJavaModeFlag(s[j])) { + ++j; + } + offEnd = j; + } + + const auto hasKeptFlag = [&](std::size_t from, std::size_t to) { + for (std::size_t k = from; k < to; ++k) { + if (s[k] != 'x' && s[k] != 'u' && s[k] != 'U' && s[k] != 'd' && + s[k] != 'c') { + return true; + } + } + return false; + }; + + const bool hasOn = hasKeptFlag(onStart, onEnd); + const bool hasOff = + offStart != std::string_view::npos && hasKeptFlag(offStart, offEnd); + + if (term == ')') { + if (hasOn || hasOff) { + out += "(?"; + appendFlags(onStart, onEnd); + if (hasOff) { + out.push_back('-'); + appendFlags(offStart, offEnd); + } + out.push_back(')'); + } + return; + } + + if (!hasOn && !hasOff) { + out += "(?:"; + } else { + out += "(?"; + 
appendFlags(onStart, onEnd); + if (hasOff) { + out.push_back('-'); + appendFlags(offStart, offEnd); + } + out.push_back(':'); + } +} + +void rejectUnsupportedRe2Features(std::string_view javaPattern) { + bool inQuotation = false; + bool commentsMode = false; + std::vector commentsStack; + + for (std::size_t i = 0; i < javaPattern.size();) { + const char c = javaPattern[i]; + + if (c == '\\' && i + 1 < javaPattern.size()) { + const char next = javaPattern[i + 1]; + if (!inQuotation && next == 'Q') { + inQuotation = true; + i += 2; + continue; + } + if (inQuotation && next == 'E') { + inQuotation = false; + i += 2; + continue; + } + if (inQuotation) { + ++i; + continue; + } + if (next >= '1' && next <= '9') { + throw EvaluationFailedException( + "RE2 does not support Java backreferences (\\1-\\9)"); + } + if (next == 'k' && i + 2 < javaPattern.size() && + javaPattern[i + 2] == '<') { + throw EvaluationFailedException( + "RE2 does not support Java named backreferences (\\k)"); + } + i += 2; + continue; + } + + if (inQuotation) { + ++i; + continue; + } + + if (commentsMode && c == '#' && !hasOddBackslashesBefore(javaPattern, i)) { + while (i < javaPattern.size() && javaPattern[i] != '\n') { + ++i; + } + continue; + } + + if (c == '[' && !hasOddBackslashesBefore(javaPattern, i)) { + const auto classEnd = trySkipClass(javaPattern, i); + if (classEnd > i) { + i = classEnd; + continue; + } + } + + if (c == '(' && i + 1 < javaPattern.size() && javaPattern[i + 1] == '?' 
&& + !hasOddBackslashesBefore(javaPattern, i)) { + if (i + 2 < javaPattern.size()) { + const char op = javaPattern[i + 2]; + if (op == '=' || op == '!') { + throw EvaluationFailedException( + "RE2 does not support Java lookaround assertions"); + } + if (op == '>') { + throw EvaluationFailedException( + "RE2 does not support Java atomic groups (?>...)"); + } + if (op == '<' && i + 3 < javaPattern.size() && + (javaPattern[i + 3] == '=' || javaPattern[i + 3] == '!')) { + throw EvaluationFailedException( + "RE2 does not support Java lookaround assertions"); + } + } + + char unsupportedFlag = 0; + if (validModeModifierHasUnsupportedRe2Flag( + javaPattern, i, unsupportedFlag)) { + if (unsupportedFlag == 'U') { + throw EvaluationFailedException( + "RE2 does not support Java UNICODE_CHARACTER_CLASS flag (?U)"); + } + if (unsupportedFlag == 'c') { + throw EvaluationFailedException( + "RE2 does not support Java CANON_EQ flag (?c)"); + } + throw EvaluationFailedException( + "RE2 does not support Java UNIX_LINES flag (?d)"); + } + + std::string ignored; + const auto modeResult = + tryTranslateModeModifier(javaPattern, i, javaPattern.size(), ignored); + if (modeResult.end != std::string_view::npos) { + if (modeResult.term == ':') { + commentsStack.push_back(commentsMode); + } + if (modeResult.onX) { + commentsMode = true; + } + if (modeResult.hasDash && modeResult.offX) { + commentsMode = false; + } + i = modeResult.end; + continue; + } + } + + if (c == '(' && !hasOddBackslashesBefore(javaPattern, i)) { + commentsStack.push_back(commentsMode); + } else if ( + c == ')' && !hasOddBackslashesBefore(javaPattern, i) && + !commentsStack.empty()) { + commentsMode = commentsStack.back(); + commentsStack.pop_back(); + } + + if ((c == '*' || c == '?' 
|| c == '+') && i + 1 < javaPattern.size() && + javaPattern[i + 1] == '+' && !hasOddBackslashesBefore(javaPattern, i)) { + throw EvaluationFailedException( + "RE2 does not support Java possessive quantifiers"); + } + + if (c == '{' && !hasOddBackslashesBefore(javaPattern, i)) { + const auto close = javaPattern.find('}', i + 1); + if (close != std::string_view::npos && + isValidQuantifierBody(javaPattern.substr(i + 1, close - i - 1)) && + close + 1 < javaPattern.size() && javaPattern[close + 1] == '+') { + throw EvaluationFailedException( + "RE2 does not support Java possessive quantifiers"); + } + } + + ++i; + } +} + +void appendCommentsModeClassForRe2( + std::string_view pattern, + std::size_t classStart, + std::size_t classEnd, + std::string& out) { + out.push_back('['); + bool inQuotation = false; + + for (std::size_t i = classStart + 1; i + 1 < classEnd;) { + const char c = pattern[i]; + if (c == '\\' && i + 1 < classEnd) { + const char next = pattern[i + 1]; + if (!inQuotation && next == 'Q') { + inQuotation = true; + } else if (inQuotation && next == 'E') { + inQuotation = false; + } + out.push_back(c); + out.push_back(next); + i += 2; + continue; + } + + if (!inQuotation && c == '#') { + while (i + 1 < classEnd && pattern[i] != '\n') { + ++i; + } + if (i + 1 >= classEnd) { + throw EvaluationFailedException( + "Java COMMENTS mode comment in character class is not terminated"); + } + continue; + } + + if (!inQuotation && std::isspace(static_cast(c))) { + ++i; + continue; + } + + out.push_back(c); + ++i; + } + + out.push_back(']'); +} + +std::string translateCommentsModeForRe2(std::string_view pattern) { + std::string out; + out.reserve(pattern.size()); + bool inQuotation = false; + bool commentsMode = false; + std::vector commentsStack; + + for (std::size_t i = 0; i < pattern.size();) { + const char c = pattern[i]; + + if (c == '\\' && i + 1 < pattern.size()) { + const char next = pattern[i + 1]; + if (!inQuotation && next == 'Q') { + out += "\\Q"; + 
inQuotation = true; + i += 2; + continue; + } + if (inQuotation && next == 'E') { + out += "\\E"; + inQuotation = false; + i += 2; + continue; + } + if (commentsMode && std::isspace(static_cast(next))) { + out.push_back(next); + i += 2; + continue; + } + out.push_back(c); + out.push_back(next); + i += 2; + continue; + } + + if (inQuotation) { + out.push_back(c); + ++i; + continue; + } + + if (c == '[' && !hasOddTrailingBackslashes(out)) { + const auto classEnd = trySkipClass(pattern, i); + if (classEnd > i) { + if (commentsMode) { + appendCommentsModeClassForRe2(pattern, i, classEnd, out); + } else { + out.append(pattern.substr(i, classEnd - i)); + } + i = classEnd; + continue; + } + } + + if (commentsMode && c == '(' && i + 2 < pattern.size() && + pattern[i + 1] == '?' && !hasOddTrailingBackslashes(out) && + (std::isspace(static_cast(pattern[i + 2])) || + pattern[i + 2] == '#')) { + throw EvaluationFailedException( + "Java COMMENTS mode does not ignore whitespace in inline group prefixes"); + } + + if (commentsMode && c == '#') { + while (i < pattern.size() && pattern[i] != '\n') { + ++i; + } + if (i < pattern.size()) { + ++i; + } + continue; + } + + if (commentsMode && std::isspace(static_cast(c))) { + ++i; + continue; + } + + if (c == '(' && i + 1 < pattern.size() && pattern[i + 1] == '?' 
&& + !hasOddTrailingBackslashes(out)) { + std::string ignored; + const auto modeResult = + tryTranslateModeModifier(pattern, i, pattern.size(), ignored); + if (modeResult.end != std::string_view::npos) { + appendRe2ModeModifier( + pattern, + i, + modeResult.end, + modeResult.term, + modeResult.hasDash, + out); + if (modeResult.term == ':') { + commentsStack.push_back(commentsMode); + } + if (modeResult.onX) { + commentsMode = true; + } + if (modeResult.hasDash && modeResult.offX) { + commentsMode = false; + } + i = modeResult.end; + continue; + } + } + + if (c == '(' && !hasOddTrailingBackslashes(out)) { + commentsStack.push_back(commentsMode); + } else if ( + c == ')' && !hasOddTrailingBackslashes(out) && !commentsStack.empty()) { + commentsMode = commentsStack.back(); + commentsStack.pop_back(); + } + + out.push_back(c); + ++i; + } + + return out; +} + +std::string translatePcre2OctalEscapesForRe2(std::string_view pattern) { + std::string out; + out.reserve(pattern.size()); + for (std::size_t i = 0; i < pattern.size();) { + if (i + 3 < pattern.size() && pattern[i] == '\\' && pattern[i + 1] == 'o' && + pattern[i + 2] == '{') { + const auto close = pattern.find('}', i + 3); + if (close != std::string_view::npos) { + std::uint32_t value = 0; + bool valid = close > i + 3; + for (std::size_t k = i + 3; k < close; ++k) { + if (!isOctalDigit(pattern[k])) { + valid = false; + break; + } + value = (value << 3) + (pattern[k] - '0'); + } + if (valid) { + out += "\\x{"; + out += toLowerHex(value); + out.push_back('}'); + i = close + 1; + continue; + } + } + } + out.push_back(pattern[i++]); + } + return out; +} + +std::string rewriteJavaNamedGroupsForRe2(std::string_view pattern) { + std::string out; + out.reserve(pattern.size() + 8); + bool inQuotation = false; + + for (std::size_t i = 0; i < pattern.size();) { + const char c = pattern[i]; + if (c == '\\' && i + 1 < pattern.size()) { + const char next = pattern[i + 1]; + if (!inQuotation && next == 'Q') { + inQuotation = 
true; + } else if (inQuotation && next == 'E') { + inQuotation = false; + } + out.push_back(c); + out.push_back(next); + i += 2; + continue; + } + + if (!inQuotation && c == '[' && !hasOddBackslashesBefore(pattern, i)) { + const auto classEnd = trySkipClass(pattern, i); + if (classEnd > i) { + out.append(pattern.substr(i, classEnd - i)); + i = classEnd; + continue; + } + } + + if (!inQuotation && c == '(' && i + 3 < pattern.size() && + pattern[i + 1] == '?' && pattern[i + 2] == '<' && + pattern[i + 3] != '=' && pattern[i + 3] != '!' && + !hasOddBackslashesBefore(pattern, i)) { + out += "(?P<"; + i += 3; + continue; + } + + out.push_back(c); + ++i; + } + return out; +} + +std::string renderFoldClass(const std::set& cps) { + std::string out = "["; + for (const auto cp : cps) { + if ((cp >= 'A' && cp <= 'Z') || (cp >= 'a' && cp <= 'z')) { + out.push_back(static_cast(cp)); + } else { + out += "\\x{"; + out += toLowerHex(cp); + out.push_back('}'); + } + } + out.push_back(']'); + return out; +} + +std::set foldEquivalenceClass(std::uint32_t cp) { + // Partial Java UNICODE_CASE literal pre-folding. ICU simple case mappings + // cover ordinary one-code-point upper/lower pairs; the explicit additions + // cover common Java/Unicode special case-fold equivalences that PCRE2 + // caseless matching misses without UCP/full folding. 
+ std::set cps{cp}; + const auto add = [&](std::uint32_t x) { + cps.insert(x); + const auto lower = static_cast(u_tolower(x)); + const auto upper = static_cast(u_toupper(x)); + cps.insert(lower); + cps.insert(upper); + }; + add(cp); + switch (cp) { + case 0x004B: + case 0x006B: + case 0x212A: + add(0x004B); + add(0x006B); + add(0x212A); + break; + case 0x0053: + case 0x0073: + case 0x017F: + add(0x0053); + add(0x0073); + add(0x017F); + break; + case 0x03A3: + case 0x03C3: + case 0x03C2: + add(0x03A3); + add(0x03C3); + add(0x03C2); + break; + case 0x00B5: + case 0x039C: + case 0x03BC: + add(0x00B5); + add(0x039C); + add(0x03BC); + break; + case 0x00C5: + case 0x00E5: + case 0x212B: + add(0x00C5); + add(0x00E5); + add(0x212B); + break; + case 0x0049: + case 0x0069: + case 0x0130: + case 0x0131: + add(0x0049); + add(0x0069); + add(0x0130); + add(0x0131); + break; + default: + break; + } + return cps; +} + +std::optional parseHexBracedCodePoint( + std::string_view s, + std::size_t start, + std::size_t& end) { + if (start + 3 >= s.size() || s[start] != '\\' || s[start + 1] != 'x' || + s[start + 2] != '{') { + return std::nullopt; + } + std::size_t k = start + 3; + std::uint32_t cp = 0; + bool any = false; + while (k < s.size() && s[k] != '}') { + if (!isHexDigit(s[k])) { + return std::nullopt; + } + cp = (cp << 4) | hexValue(s[k]); + any = true; + ++k; + } + if (!any || k >= s.size() || s[k] != '}') { + return std::nullopt; + } + end = k + 1; + return cp; +} + +std::string expandCasedLiteralsForUnicodeCase(std::string_view pattern) { + std::string out; + out.reserve(pattern.size() + 16); + bool inQuotation = false; + for (std::size_t i = 0; i < pattern.size();) { + const char c = pattern[i]; + if (c == '\\' && i + 1 < pattern.size()) { + const char next = pattern[i + 1]; + if (!inQuotation && next == 'Q') { + out += "\\Q"; + i += 2; + inQuotation = true; + continue; + } + if (inQuotation && next == 'E') { + out += "\\E"; + i += 2; + inQuotation = false; + continue; + } 
+ if (!inQuotation) { + std::size_t end = i; + if (auto cp = parseHexBracedCodePoint(pattern, i, end)) { + auto cps = foldEquivalenceClass(*cp); + if (cps.size() > 1) { + out += renderFoldClass(cps); + } else { + out.append(pattern.substr(i, end - i)); + } + i = end; + continue; + } + } + out.push_back(c); + out.push_back(next); + i += 2; + continue; + } + if (inQuotation) { + out.push_back(c); + ++i; + continue; + } + if (c == '[' && !hasOddTrailingBackslashes(out)) { + const auto classEnd = trySkipClass(pattern, i); + if (classEnd > i) { + out.append(pattern.substr(i, classEnd - i)); + i = classEnd; + continue; + } + } + + std::size_t next = i; + std::uint32_t cp = 0; + if (decodeUtf8CodePoint(pattern, next, cp)) { + auto cps = foldEquivalenceClass(cp); + if (cps.size() > 1) { + out += renderFoldClass(cps); + } else { + out.append(pattern.substr(i, next - i)); + } + i = next; + continue; + } + out.push_back(c); + ++i; + } + return out; +} + +} // namespace + +std::string toPcre2Pattern( + std::string_view javaPattern, + bool& needsRawByteMode) { + needsRawByteMode = false; + if (javaPattern.empty()) { + return std::string(javaPattern); + } + + const std::size_t len = javaPattern.size(); + std::string out; + out.reserve(len + 32); + + std::size_t i = 0; + bool inQuotation = false; + bool caseless = false; + bool unicodeCharacterClass = false; + bool commentsMode = false; + struct GroupFrame { + bool previousCaseless; + bool previousUnicodeCharacterClass; + bool previousCommentsMode; + }; + std::vector groupStack; + + while (i < len) { + const char c = javaPattern[i]; + + if (c == '\\' && i + 1 < len) { + const char next = javaPattern[i + 1]; + + if (!inQuotation && next == 'Q') { + out += "\\Q"; + i += 2; + inQuotation = true; + continue; + } + + if (inQuotation && next == 'E') { + out += "\\E"; + i += 2; + inQuotation = false; + continue; + } + + if (inQuotation) { + out.push_back(c); + ++i; + continue; + } + + if (next == 'p' || next == 'P') { + if 
(!hasOddTrailingBackslashes(out)) { + const auto tokenEnd = + tryAppendPropertyToken(javaPattern, i, next, out, caseless); + if (tokenEnd > i) { + i = tokenEnd; + continue; + } + } + } + + if (next == 'u' && i + 6 <= len) { + std::size_t k = i + 2; + const std::size_t hexEnd = k + 4; + while (k < hexEnd && isHexDigit(javaPattern[k])) { + ++k; + } + if (k - (i + 2) == 4) { + const std::uint32_t cp = parseFourHex(javaPattern, i + 2); + if (cp >= 0xD800 && cp <= 0xDBFF) { + if (i + 12 <= len && javaPattern[i + 6] == '\\' && + javaPattern[i + 7] == 'u') { + bool hasLowSurrogate = true; + for (std::size_t p = i + 8; p < i + 12; ++p) { + hasLowSurrogate = hasLowSurrogate && isHexDigit(javaPattern[p]); + } + if (hasLowSurrogate) { + const std::uint32_t low = parseFourHex(javaPattern, i + 8); + if (low >= 0xDC00 && low <= 0xDFFF) { + const std::uint32_t scalar = + 0x10000 + ((cp - 0xD800) << 10) + (low - 0xDC00); + out += "\\x{"; + out += toLowerHex(scalar); + out.push_back('}'); + i += 12; + continue; + } + } + } + throw EvaluationFailedException( + "Lone high-surrogate Unicode escape cannot be safely translated"); + } + if (cp >= 0xDC00 && cp <= 0xDFFF) { + throw EvaluationFailedException( + "Lone low-surrogate Unicode escape cannot be safely translated"); + } + out += "\\x{"; + out.append(javaPattern.substr(i + 2, 4)); + out.push_back('}'); + i = k; + continue; + } + } + + if (next == 'N' && i + 2 < len && javaPattern[i + 2] == '{') { + const auto close = javaPattern.find('}', i + 3); + if (close != std::string_view::npos) { + const std::string name(javaPattern.substr(i + 3, close - i - 3)); + UErrorCode status = U_ZERO_ERROR; + const UChar32 cp = + u_charFromName(U_EXTENDED_CHAR_NAME, name.c_str(), &status); + if (U_SUCCESS(status)) { + out += "\\x{"; + out += toLowerHex(static_cast(cp)); + out.push_back('}'); + } else { + out.append(javaPattern.substr(i, close + 1 - i)); + } + i = close + 1; + continue; + } + } + + if (next == 'x' && i + 2 < len && javaPattern[i + 2] 
== '{') { + const auto close = javaPattern.find('}', i + 3); + if (close != std::string_view::npos) { + out.append(javaPattern.substr(i, close + 1 - i)); + i = close + 1; + continue; + } + } + + if (next == '0' && i + 2 < len && isOctalDigit(javaPattern[i + 2])) { + std::size_t k = i + 2; + const std::size_t last = std::min(k + 3, len); + while (k < last && isOctalDigit(javaPattern[k])) { + ++k; + } + if (k - (i + 2) == 3 && javaPattern[i + 2] > '3') { + --k; + } + int value = 0; + for (std::size_t p = i + 2; p < k; ++p) { + value = value * 8 + (javaPattern[p] - '0'); + } + out += "\\o{"; + char buf[16]; + std::snprintf(buf, sizeof(buf), "%o", value); + out += buf; + out.push_back('}'); + i = k; + continue; + } + + if (next >= '1' && next <= '9') { + std::size_t k = i + 2; + while (k < len && + std::isdigit(static_cast(javaPattern[k]))) { + ++k; + } + const int groupCount = countCapturingGroups(javaPattern); + std::size_t useDigits = k - (i + 1); + auto parseDigits = [&](std::size_t digits) { + int value = 0; + for (std::size_t p = i + 1; p < i + 1 + digits; ++p) { + const int digit = javaPattern[p] - '0'; + if (value > (groupCount + 1 - digit) / 10) { + return groupCount + 1; + } + value = value * 10 + digit; + } + return value; + }; + int backrefN = parseDigits(useDigits); + while (useDigits > 1 && backrefN > groupCount) { + --useDigits; + backrefN = parseDigits(useDigits); + } + if (backrefN > groupCount) { + out += "(*F)"; + } else { + out += "\\g{"; + out += std::to_string(backrefN); + out.push_back('}'); + } + for (std::size_t p = i + 1 + useDigits; p < k; ++p) { + out.push_back(javaPattern[p]); + } + i = k; + continue; + } + + out.push_back(c); + ++i; + continue; + } + + if (inQuotation) { + out.push_back(c); + ++i; + continue; + } + + if (commentsMode && c == '#' && !hasOddTrailingBackslashes(out)) { + while (i < len) { + const char commentChar = javaPattern[i++]; + out.push_back(commentChar); + if (commentChar == '\n') { + break; + } + } + continue; + } + 
+ if (c == '[' && !hasOddTrailingBackslashes(out)) { + const std::size_t classStart = i; + std::size_t pos = i; + try { + const ClassNode classNode = + ClassBodyParser::parseClass(javaPattern, pos); + const std::size_t classEnd = pos; + if (containsRawSurrogate(javaPattern, classStart, classEnd)) { + out += rewritePropertiesOnly(javaPattern, classStart, classEnd); + i = classEnd; + continue; + } + const auto classText = + javaPattern.substr(classStart, classEnd - classStart); + if (unicodeCharacterClass && + classText.find("&&") != std::string_view::npos && + (classText.find("\\d") != std::string_view::npos || + classText.find("\\D") != std::string_view::npos || + classText.find("\\w") != std::string_view::npos || + classText.find("\\W") != std::string_view::npos || + classText.find("\\s") != std::string_view::npos || + classText.find("\\S") != std::string_view::npos)) { + throw EvaluationFailedException( + "UNICODE_CHARACTER_CLASS intersection cannot be safely translated"); + } + const auto renderResult = ClassRenderer::renderWithSignal(classNode); + const std::string& rendered = renderResult.text; + const std::string renderedWithMappedProperties = + rewritePropertiesOnly(rendered, 0, rendered.size()); + const std::string maybeFolded = caseless + ? expandCasedPropertiesInClass(renderedWithMappedProperties) + : renderedWithMappedProperties; + if (renderResult.intersectionUnresolved) { + out += rewritePropertiesOnly(javaPattern, classStart, classEnd); + } else { + out += maybeFolded; + } + i = classEnd; + continue; + } catch (const std::invalid_argument& e) { + if (e.what() != nullptr && + std::string_view(e.what()).rfind("Bad intersection syntax", 0) == + 0) { + throw EvaluationFailedException("Bad intersection syntax"); + } + out.push_back(c); + ++i; + continue; + } + } + + if (c == '(' && i + 1 < len && javaPattern[i + 1] == '?' 
&& + !hasOddTrailingBackslashes(out)) { + const auto modeResult = + tryTranslateModeModifier(javaPattern, i, len, out); + if (modeResult.end != std::string_view::npos) { + if (modeResult.term == ':') { + groupStack.push_back({caseless, unicodeCharacterClass, commentsMode}); + } + if (modeResult.onI) { + caseless = true; + } + if (modeResult.hasDash && modeResult.offI) { + caseless = false; + } + if (modeResult.onU) { + unicodeCharacterClass = true; + } + if (modeResult.hasDash && modeResult.offU) { + unicodeCharacterClass = false; + } + if (modeResult.onX) { + commentsMode = true; + } + if (modeResult.hasDash && modeResult.offX) { + commentsMode = false; + } + i = modeResult.end; + continue; + } + } + + if (c == '(' && !hasOddTrailingBackslashes(out)) { + groupStack.push_back({caseless, unicodeCharacterClass, commentsMode}); + } + + if (c == '{' && !hasOddTrailingBackslashes(out)) { + const auto close = javaPattern.find('}', i + 1); + if (close == std::string_view::npos) { + throw EvaluationFailedException("Unclosed counted closure"); + } + const auto body = javaPattern.substr(i + 1, close - i - 1); + if (!isValidQuantifierBody(body)) { + throw EvaluationFailedException("Illegal repetition"); + } + } + + const bool closesGroup = c == ')' && !hasOddTrailingBackslashes(out); + out.push_back(c); + if (closesGroup) { + if (!groupStack.empty()) { + const auto frame = groupStack.back(); + groupStack.pop_back(); + caseless = frame.previousCaseless; + unicodeCharacterClass = frame.previousUnicodeCharacterClass; + commentsMode = frame.previousCommentsMode; + } + } + ++i; + } + + needsRawByteMode = needsRawByteModeForPcre2(out); + return out; +} + +std::string toPcre2Pattern(std::string_view javaPattern) { + bool needsRawByteMode = false; + return toPcre2Pattern(javaPattern, needsRawByteMode); +} + +std::string toPcre2PatternWithUnicodeCase( + std::string_view javaPattern, + bool& needsRawByteMode) { + auto translated = toPcre2Pattern(javaPattern, needsRawByteMode); + 
translated = expandCasedLiteralsForUnicodeCase(translated); + needsRawByteMode = needsRawByteModeForPcre2(translated); + return translated; +} + +std::string toPcre2PatternWithUnicodeCase(std::string_view javaPattern) { + bool needsRawByteMode = false; + return toPcre2PatternWithUnicodeCase(javaPattern, needsRawByteMode); +} + +std::string toRe2Pattern(std::string_view javaPattern) { + rejectUnsupportedRe2Features(javaPattern); + return rewriteJavaNamedGroupsForRe2(translatePcre2OctalEscapesForRe2( + toPcre2Pattern(translateCommentsModeForRe2(javaPattern)))); +} + +std::string toRe2PatternWithUnicodeCase(std::string_view javaPattern) { + return expandCasedLiteralsForUnicodeCase(toRe2Pattern(javaPattern)); +} + +} // namespace facebook::velox::functions::java_pcre2_translator diff --git a/velox/functions/lib/java_pcre2_translator/JavaRegexTranslator.h b/velox/functions/lib/java_pcre2_translator/JavaRegexTranslator.h new file mode 100644 index 00000000000..8172238ae7c --- /dev/null +++ b/velox/functions/lib/java_pcre2_translator/JavaRegexTranslator.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// +// Originally authored by Oleksii PELYKH for pcre4j; ported from +// org.pcre4j.regex.translate.JavaRegexTranslator (Java) under +// Apache-2.0 by the same author for inclusion in Velox. +// +// This header is the public surface of the `java_pcre2_translator` +// library. 
It declares free functions that rewrite a `java.util.regex` +// pattern string into an equivalent pattern accepted by either PCRE2 or +// RE2. +// +#pragma once + +#include +#include + +#include "velox/functions/lib/java_pcre2_translator/EvaluationFailedException.h" + +namespace facebook::velox::functions::java_pcre2_translator { + +/// Rewrites a `java.util.regex.Pattern` source string into an equivalent +/// pattern accepted by PCRE2. Implements the 3-phase pipeline described +/// in pcre4j PR #606: +/// +/// 1. Expand top-level `\p{...}` / `\P{...}` property tokens via the +/// Java property → Unicode block alias map. +/// 2. Re-parse each character-class body, flatten nested unions, resolve +/// `&&` intersections via range-set algebra, and escape `-` after +/// multi-char escapes to disambiguate from the range operator. +/// 3. Rewrite Java inline flag groups whose semantics diverge in PCRE2 +/// (notably `(?U)` which means UNICODE_CHARACTER_CLASS in Java but +/// "ungreedy" in PCRE2). +/// +/// Throws `EvaluationFailedException` when the input cannot be safely +/// expressed in PCRE2 syntax (e.g. a property name with no PCRE2 +/// equivalent). Callers are expected to surface the message verbatim. +/// +std::string toPcre2Pattern(std::string_view javaPattern); + +/// Rewrites a Java pattern and reports whether the resulting PCRE2 compile +/// must omit PCRE2_UTF to allow lone surrogate code units. +std::string toPcre2Pattern( + std::string_view javaPattern, + bool& needsRawByteMode); + +/// Rewrites a Java pattern for PCRE2 and pre-expands cased literal code points +/// for Java's CASE_INSENSITIVE | UNICODE_CASE semantics. This is intentionally +/// limited to literals outside character classes and outside \Q...\E quotes. 
+std::string toPcre2PatternWithUnicodeCase( + std::string_view javaPattern, + bool& needsRawByteMode); + +std::string toPcre2PatternWithUnicodeCase(std::string_view javaPattern); + +/// Rewrites a `java.util.regex.Pattern` source string into an equivalent +/// pattern accepted by RE2. +/// +/// This shares the PCRE2 property and character-class translation pipeline, +/// rewrites Java named groups `(?<name>...)` to RE2 `(?P<name>...)`, and +/// rejects Java features that RE2 cannot represent without changing +/// semantics. +std::string toRe2Pattern(std::string_view javaPattern); + +std::string toRe2PatternWithUnicodeCase(std::string_view javaPattern); + +} // namespace facebook::velox::functions::java_pcre2_translator diff --git a/velox/functions/lib/java_pcre2_translator/JdkPropertyExpander.cpp b/velox/functions/lib/java_pcre2_translator/JdkPropertyExpander.cpp new file mode 100644 index 00000000000..207f130f691 --- /dev/null +++ b/velox/functions/lib/java_pcre2_translator/JdkPropertyExpander.cpp @@ -0,0 +1,508 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// +// Originally authored by Oleksii PELYKH for pcre4j; ported from +// org.pcre4j.regex.translate.JdkPropertyExpander (Java) under Apache-2.0 by +// the same author for inclusion in Velox. 
+// +#include "velox/functions/lib/java_pcre2_translator/JdkPropertyExpander.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace facebook::velox::functions::java_pcre2_translator { +namespace { + +class SpanBuilder { + public: + void add(std::int32_t cp) { + if (spanStart_ < 0) { + spanStart_ = cp; + spanEnd_ = cp; + } else if (cp == spanEnd_ + 1) { + spanEnd_ = cp; + } else { + pairs_.push_back(spanStart_); + pairs_.push_back(spanEnd_); + spanStart_ = cp; + spanEnd_ = cp; + } + } + + RangeSet build() { + if (spanStart_ >= 0) { + pairs_.push_back(spanStart_); + pairs_.push_back(spanEnd_); + spanStart_ = -1; + } + return RangeSet::fromSortedPairs(std::move(pairs_)); + } + + private: + std::vector pairs_; + std::int32_t spanStart_{-1}; + std::int32_t spanEnd_{-1}; +}; + +std::string upperAscii(std::string_view s) { + std::string out(s); + for (char& c : out) { + c = static_cast(std::toupper(static_cast(c))); + } + return out; +} + +std::string normalizePropertyKey(std::string_view s) { + std::string out; + out.reserve(s.size()); + for (char c : s) { + const auto uc = static_cast(c); + if (c == '_' || c == '-' || std::isspace(uc)) { + continue; + } + out.push_back(static_cast(std::toupper(uc))); + } + return out; +} + +void addAlias( + std::unordered_map& builders, + const char* alias, + std::int32_t cp) { + if (alias != nullptr && alias[0] != '\0') { + builders[upperAscii(alias)].add(cp); + } +} + +void addNormalizedAlias( + std::unordered_map& builders, + const char* alias, + std::int32_t cp) { + if (alias != nullptr && alias[0] != '\0') { + builders[normalizePropertyKey(alias)].add(cp); + } +} + +const char* categoryName(int32_t type) { + switch (type) { + case U_UPPERCASE_LETTER: + return "LU"; + case U_LOWERCASE_LETTER: + return "LL"; + case U_TITLECASE_LETTER: + return "LT"; + case U_MODIFIER_LETTER: + return "LM"; + case U_OTHER_LETTER: + return "LO"; + case U_NON_SPACING_MARK: + 
return "MN"; + case U_ENCLOSING_MARK: + return "ME"; + case U_COMBINING_SPACING_MARK: + return "MC"; + case U_DECIMAL_DIGIT_NUMBER: + return "ND"; + case U_LETTER_NUMBER: + return "NL"; + case U_OTHER_NUMBER: + return "NO"; + case U_SPACE_SEPARATOR: + return "ZS"; + case U_LINE_SEPARATOR: + return "ZL"; + case U_PARAGRAPH_SEPARATOR: + return "ZP"; + case U_CONTROL_CHAR: + return "CC"; + case U_FORMAT_CHAR: + return "CF"; + case U_SURROGATE: + return "CS"; + case U_PRIVATE_USE_CHAR: + return "CO"; + case U_UNASSIGNED: + return "CN"; + case U_DASH_PUNCTUATION: + return "PD"; + case U_START_PUNCTUATION: + return "PS"; + case U_END_PUNCTUATION: + return "PE"; + case U_CONNECTOR_PUNCTUATION: + return "PC"; + case U_OTHER_PUNCTUATION: + return "PO"; + case U_MATH_SYMBOL: + return "SM"; + case U_CURRENCY_SYMBOL: + return "SC"; + case U_MODIFIER_SYMBOL: + return "SK"; + case U_OTHER_SYMBOL: + return "SO"; + case U_INITIAL_PUNCTUATION: + return "PI"; + case U_FINAL_PUNCTUATION: + return "PF"; + default: + return nullptr; + } +} + +// Returns the union of the sets stored under `keys`; keys absent from `map` +// are skipped. +RangeSet unionOf( + const std::unordered_map<std::string, RangeSet>& map, + std::initializer_list<const char*> keys) { + RangeSet result = RangeSet::empty(); + for (const char* key : keys) { + auto it = map.find(key); + if (it != map.end()) { + result = result.unionWith(it->second); + } + } + return result; +} + +// Mirrors java.lang.Character.isWhitespace(int). The non-breaking spaces +// U+00A0, U+2007 and U+202F are NOT Java whitespace, so the U+2000..U+200A +// run is split around U+2007 (FIGURE SPACE). +bool isJavaWhitespace(std::int32_t cp) { + return cp == '\t' || cp == '\n' || cp == 0x0B || cp == '\f' || cp == '\r' || + cp == ' ' || (cp >= 0x1C && cp <= 0x1F) || cp == 0x1680 || + (cp >= 0x2000 && cp <= 0x2006) || (cp >= 0x2008 && cp <= 0x200A) || + cp == 0x2028 || cp == 0x2029 || cp == 0x205F || cp == 0x3000; +} + +// True when ICU reports one of the five letter general categories +// (Lu, Ll, Lt, Lm, Lo) for `cp`. +bool isJavaLetter(std::int32_t cp) { + switch (u_charType(static_cast<UChar32>(cp))) { + case U_UPPERCASE_LETTER: + case U_LOWERCASE_LETTER: + case U_TITLECASE_LETTER: + case U_MODIFIER_LETTER: + case U_OTHER_LETTER: + return true; + default: + return false; + } +} + +std::unordered_map<std::string, RangeSet> buildJavaPropertyMap() { + std::unordered_map<std::string, SpanBuilder> builders; + for (const char* name : + {"javaLowerCase", 
"javaUpperCase", + "javaTitleCase", + "javaSpaceChar", + "javaMirrored", + "javaDefined", + "javaDigit", + "javaAlphabetic", + "javaIdeographic", + "javaISOControl", + "javaWhitespace", + "javaLetter", + "javaLetterOrDigit", + "javaJavaIdentifierStart", + "javaJavaIdentifierPart", + "javaUnicodeIdentifierStart", + "javaUnicodeIdentifierPart", + "javaIdentifierIgnorable"}) { + builders.emplace(name, SpanBuilder{}); + } + + for (std::int32_t cp = 0; cp <= RangeSet::kMaxCp; ++cp) { + const UChar32 ucp = static_cast(cp); + const auto type = u_charType(ucp); + const bool letter = isJavaLetter(cp); + if (u_hasBinaryProperty(ucp, UCHAR_LOWERCASE)) { + builders["javaLowerCase"].add(cp); + } + if (u_hasBinaryProperty(ucp, UCHAR_UPPERCASE)) { + builders["javaUpperCase"].add(cp); + } + if (type == U_TITLECASE_LETTER) { + builders["javaTitleCase"].add(cp); + } + if (u_isJavaSpaceChar(ucp)) { + builders["javaSpaceChar"].add(cp); + } + if (u_isMirrored(ucp)) { + builders["javaMirrored"].add(cp); + } + if (type != U_UNASSIGNED) { + builders["javaDefined"].add(cp); + } + if (u_isdigit(ucp)) { + builders["javaDigit"].add(cp); + } + if (u_hasBinaryProperty(ucp, UCHAR_ALPHABETIC)) { + builders["javaAlphabetic"].add(cp); + } + if (u_hasBinaryProperty(ucp, UCHAR_IDEOGRAPHIC)) { + builders["javaIdeographic"].add(cp); + } + if ((cp >= 0x00 && cp <= 0x1F) || (cp >= 0x7F && cp <= 0x9F)) { + builders["javaISOControl"].add(cp); + } + if (isJavaWhitespace(cp)) { + builders["javaWhitespace"].add(cp); + } + if (letter) { + builders["javaLetter"].add(cp); + } + if (letter || u_isdigit(ucp)) { + builders["javaLetterOrDigit"].add(cp); + } + if (u_isJavaIDStart(ucp)) { + builders["javaJavaIdentifierStart"].add(cp); + } + if (u_isJavaIDPart(ucp)) { + builders["javaJavaIdentifierPart"].add(cp); + } + if (u_isIDStart(ucp)) { + builders["javaUnicodeIdentifierStart"].add(cp); + } + if (u_isIDPart(ucp)) { + builders["javaUnicodeIdentifierPart"].add(cp); + } + if (u_isIDIgnorable(ucp)) { + 
builders["javaIdentifierIgnorable"].add(cp); + } + } + + std::unordered_map map; + for (auto& [name, builder] : builders) { + map.emplace(name, builder.build()); + } + return map; +} + +const std::unordered_map& javaPropertyMap() { + static const auto kMap = buildJavaPropertyMap(); + return kMap; +} + +std::unordered_map buildBlockMap() { + std::unordered_map builders; + for (std::int32_t cp = 0; cp <= RangeSet::kMaxCp; ++cp) { + const auto block = ublock_getCode(static_cast(cp)); + addNormalizedAlias( + builders, + u_getPropertyValueName(UCHAR_BLOCK, block, U_LONG_PROPERTY_NAME), + cp); + addNormalizedAlias( + builders, + u_getPropertyValueName(UCHAR_BLOCK, block, U_SHORT_PROPERTY_NAME), + cp); + } + + std::unordered_map map; + for (auto& [name, builder] : builders) { + auto range = builder.build(); + if (!range.isEmpty()) { + map.emplace(name, std::move(range)); + } + } + return map; +} + +const std::unordered_map& blockMap() { + static const auto kMap = buildBlockMap(); + return kMap; +} + +std::unordered_map buildPositiveMap() { + std::unordered_map catBuilders; + for (const char* cat : + {"LU", "LL", "LT", "LM", "LO", "MN", "ME", "MC", "ND", "NL", + "NO", "PC", "PD", "PS", "PE", "PI", "PF", "PO", "SM", "SC", + "SK", "SO", "ZS", "ZL", "ZP", "CC", "CF", "CS", "CO", "CN"}) { + catBuilders.emplace(cat, SpanBuilder{}); + } + + std::unordered_map scriptBuilders; + std::unordered_map blockBuilders; + std::unordered_map binaryBuilders; + + // Strategy choice: use Velox's existing ICU dependency instead of adding a + // new dependency or generating source tables. ICU's + // u_charType/uscript_getScript provide the same kind of full-code-point scan + // as Java Character APIs. 
+ for (std::int32_t cp = 0; cp <= RangeSet::kMaxCp; ++cp) { + if (const char* cat = categoryName(u_charType(static_cast(cp)))) { + catBuilders[cat].add(cp); + } + + UErrorCode status = U_ZERO_ERROR; + const UScriptCode script = + uscript_getScript(static_cast(cp), &status); + if (U_SUCCESS(status)) { + const char* name = uscript_getName(script); + if (name != nullptr) { + scriptBuilders[upperAscii(name)].add(cp); + } + addAlias( + scriptBuilders, + u_getPropertyValueName(UCHAR_SCRIPT, script, U_SHORT_PROPERTY_NAME), + cp); + } + + const auto block = ublock_getCode(static_cast(cp)); + addAlias( + blockBuilders, + u_getPropertyValueName(UCHAR_BLOCK, block, U_LONG_PROPERTY_NAME), + cp); + addAlias( + blockBuilders, + u_getPropertyValueName(UCHAR_BLOCK, block, U_SHORT_PROPERTY_NAME), + cp); + + if (u_hasBinaryProperty(static_cast(cp), UCHAR_ALPHABETIC)) { + binaryBuilders["ALPHABETIC"].add(cp); + } + if (u_hasBinaryProperty(static_cast(cp), UCHAR_IDEOGRAPHIC)) { + binaryBuilders["IDEOGRAPHIC"].add(cp); + } + if (u_hasBinaryProperty(static_cast(cp), UCHAR_BIDI_MIRRORED)) { + binaryBuilders["BIDI_MIRRORED"].add(cp); + } + } + + std::unordered_map map; + for (auto& [cat, builder] : catBuilders) { + map.emplace(cat, builder.build()); + } + + map.emplace("L", unionOf(map, {"LU", "LL", "LT", "LM", "LO"})); + map.emplace("LC", unionOf(map, {"LU", "LL", "LT"})); + map.emplace("M", unionOf(map, {"MN", "ME", "MC"})); + map.emplace("N", unionOf(map, {"ND", "NL", "NO"})); + map.emplace("P", unionOf(map, {"PC", "PD", "PS", "PE", "PI", "PF", "PO"})); + map.emplace("S", unionOf(map, {"SM", "SC", "SK", "SO"})); + map.emplace("Z", unionOf(map, {"ZS", "ZL", "ZP"})); + map.emplace("C", unionOf(map, {"CC", "CF", "CS", "CO", "CN"})); + + for (auto& [script, builder] : scriptBuilders) { + map.emplace(script, builder.build()); + } + for (auto& [block, builder] : blockBuilders) { + auto range = builder.build(); + map.emplace("IN" + block, range); + map.emplace(block, std::move(range)); + } + 
for (auto& [binaryProperty, builder] : binaryBuilders) { + map.emplace(binaryProperty, builder.build()); + } + map.emplace("ASCII", RangeSet::range(0, 0x7F)); + return map; +} + +const std::unordered_map<std::string, RangeSet>& positiveMap() { + static const auto kMap = buildPositiveMap(); + return kMap; +} + +// Resolves a `\p{...}` / `\P{...}` token to its code-point set (complemented +// for `\P`). Returns std::nullopt when the token is not of that shape or the +// property name is not found in any of the lookup maps. +std::optional<RangeSet> compute(std::string_view token) { + bool negate = false; + std::string name; + if (token.size() >= 4 && token.substr(0, 3) == "\\p{" && + token.back() == '}') { + name = upperAscii(token.substr(3, token.size() - 4)); + } else if ( + token.size() >= 4 && token.substr(0, 3) == "\\P{" && + token.back() == '}') { + negate = true; + name = upperAscii(token.substr(3, token.size() - 4)); + } else { + return std::nullopt; + } + + // `blk=X` / `block=X` are looked up as `InX`. + auto lookupName = name; + if (name.rfind("BLK=", 0) == 0) { + lookupName = "IN" + name.substr(4); + } else if (name.rfind("BLOCK=", 0) == 0) { + lookupName = "IN" + name.substr(6); + } + + auto it = positiveMap().find(lookupName); + if (it != positiveMap().end()) { + return negate ? std::optional<RangeSet>(it->second.complement()) + : std::optional<RangeSet>(it->second); + } + // Fall back to the separator-insensitive block map. Test `lookupName` (not + // `name`) so the `blk=` / `block=` spellings reach this path as well. + if (lookupName.rfind("IN", 0) == 0 && lookupName.size() > 2) { + auto blockIt = blockMap().find(normalizePropertyKey(lookupName.substr(2))); + if (blockIt != blockMap().end()) { + return negate ? std::optional<RangeSet>(blockIt->second.complement()) + : std::optional<RangeSet>(blockIt->second); + } + } + if (name.rfind("JAVA", 0) == 0) { + const auto braceOpen = token.find('{'); + const std::string original( + token.substr(braceOpen + 1, token.size() - braceOpen - 2)); + auto javaIt = javaPropertyMap().find(original); + if (javaIt != javaPropertyMap().end()) { + return negate ? 
std::optional(javaIt->second.complement()) + : std::optional(javaIt->second); + } + } + return std::nullopt; +} + +std::mutex cacheMutex; +std::unordered_map> cache; + +} // namespace + +std::optional JdkPropertyExpander::expand( + std::string_view pcre2Token) { + const std::string key(pcre2Token); + std::lock_guard l(cacheMutex); + auto it = cache.find(key); + if (it != cache.end()) { + return it->second; + } + auto result = compute(key); + cache.emplace(key, result); + return result; +} + +std::optional JdkPropertyExpander::materializeJavaProperty( + std::string_view name) { + auto it = javaPropertyMap().find(std::string(name)); + if (it == javaPropertyMap().end()) { + return std::nullopt; + } + return "[" + it->second.toPcre2ClassBody() + "]"; +} + +std::optional JdkPropertyExpander::materializeUnicodeBlock( + std::string_view name) { + auto it = blockMap().find(normalizePropertyKey(name)); + if (it == blockMap().end()) { + return std::nullopt; + } + return "[" + it->second.toPcre2ClassBody() + "]"; +} + +} // namespace facebook::velox::functions::java_pcre2_translator diff --git a/velox/functions/lib/java_pcre2_translator/JdkPropertyExpander.h b/velox/functions/lib/java_pcre2_translator/JdkPropertyExpander.h new file mode 100644 index 00000000000..60439377ee5 --- /dev/null +++ b/velox/functions/lib/java_pcre2_translator/JdkPropertyExpander.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +// +// Originally authored by Oleksii PELYKH for pcre4j; ported from +// org.pcre4j.regex.translate.JdkPropertyExpander (Java) under Apache-2.0 by +// the same author for inclusion in Velox. +// +#pragma once + +#include "velox/functions/lib/java_pcre2_translator/RangeSet.h" + +#include +#include + +namespace facebook::velox::functions::java_pcre2_translator { + +class JdkPropertyExpander { + public: + static std::optional expand(std::string_view pcre2Token); + static std::optional materializeJavaProperty( + std::string_view name); + static std::optional materializeUnicodeBlock( + std::string_view name); + + private: + JdkPropertyExpander() = delete; +}; + +} // namespace facebook::velox::functions::java_pcre2_translator diff --git a/velox/functions/lib/java_pcre2_translator/LICENSE-NOTICE.md b/velox/functions/lib/java_pcre2_translator/LICENSE-NOTICE.md new file mode 100644 index 00000000000..3752432c51e --- /dev/null +++ b/velox/functions/lib/java_pcre2_translator/LICENSE-NOTICE.md @@ -0,0 +1,19 @@ +# java_pcre2_translator — License Notice + +The source files in this directory are licensed under the Apache License, +Version 2.0 (see the top-level `LICENSE`). + +## Provenance + +These files are a C++ port of the `org.pcre4j.regex.translate` module +introduced in [pcre4j](https://github.com/alexey-pelykh/pcre4j) +pull request **#606**. + +The original Java sources were authored by **Oleksii PELYKH** in 2024–2026 +and originally published under the GNU Lesser General Public License v3 +as part of pcre4j. The same author re-licensed this body of work under +the Apache License, Version 2.0 for inclusion in Apache Velox. + +Each `.h`/`.cpp` file in this directory carries the standard Velox/ASF +Apache-2.0 header **plus** a short `Originally authored by ...` note +that identifies the corresponding pcre4j source file. 
diff --git a/velox/functions/lib/java_pcre2_translator/PropertyMap.cpp b/velox/functions/lib/java_pcre2_translator/PropertyMap.cpp new file mode 100644 index 00000000000..b80c01a749f --- /dev/null +++ b/velox/functions/lib/java_pcre2_translator/PropertyMap.cpp @@ -0,0 +1,239 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// +// Originally authored by Oleksii PELYKH for pcre4j; ported from +// org.pcre4j.regex.translate.PropertyMap (Java) under Apache-2.0 by the +// same author for inclusion in Velox. 
+// +#include "velox/functions/lib/java_pcre2_translator/PropertyMap.h" + +#include "velox/functions/lib/java_pcre2_translator/JdkPropertyExpander.h" + +#include +#include + +namespace facebook::velox::functions::java_pcre2_translator { + +namespace { + +// Alias table keyed by Java property name; the value is either a PCRE2 +// property name or a ready-made replacement pattern. +const std::unordered_map<std::string, std::string>& table() { + static const std::unordered_map<std::string, std::string> kTable{ + // --- Short alias: L1 (JDK's Latin-1 shorthand) --- + {"L1", "[\\x{00}-\\x{FF}]"}, + + // --- \p{javaXxx} Java-specific properties --- + {"javaTitleCase", "Lt"}, + {"javaDigit", "Nd"}, + {"javaLetter", "L"}, + {"javaLetterOrDigit", "[\\p{L}\\p{Nd}]"}, + {"javaAlphabetic", "Alphabetic"}, + {"javaIdeographic", "Ideographic"}, + {"javaMirrored", "Bidi_Mirrored"}, + {"javaDefined", "\\P{Cn}"}, + {"javaISOControl", "[\\x00-\\x1F\\x{7F}-\\x{9F}]"}, + {"javaJavaIdentifierStart", "[\\p{L}\\p{Nl}_$]"}, + {"javaJavaIdentifierPart", + "[\\p{L}\\p{Nl}\\p{Mn}\\p{Mc}\\p{Nd}\\p{Pc}_$]"}, + {"javaUnicodeIdentifierStart", "[\\p{L}\\p{Nl}]"}, + {"javaUnicodeIdentifierPart", + "[\\p{L}\\p{Nl}\\p{Mn}\\p{Mc}\\p{Nd}\\p{Pc}]"}, + {"javaIdentifierIgnorable", + "[\\x{00}-\\x{08}\\x{0E}-\\x{1B}\\x{7F}-\\x{9F}\\p{Cf}]"}, + // Per Character.isWhitespace() Javadoc. The non-breaking spaces U+00A0, + // U+2007 and U+202F are excluded, hence the gap at U+2007: + {"javaWhitespace", + "[\\t\\n\\x0B\\f\\r \\x{1C}-\\x{1F}\\x{1680}" + "\\x{2000}-\\x{2006}\\x{2008}-\\x{200A}\\x{2028}\\x{2029}\\x{205F}\\x{3000}]"}, + + // --- POSIX-style class names accepted by Java's \p{Xxx} (default, + // non-UNICODE) --- + {"Lower", "[a-z]"}, + {"Upper", "[A-Z]"}, + {"Alpha", "[a-zA-Z]"}, + {"Digit", "[0-9]"}, + {"Alnum", "[a-zA-Z0-9]"}, + {"Punct", "[!-/:-@\\[-`{-~]"}, + {"Graph", "[!-~]"}, + {"Print", "[ -~]"}, + {"Blank", "[ \\t]"}, + {"Cntrl", "[\\x00-\\x1F\\x{7F}]"}, + {"XDigit", "[0-9a-fA-F]"}, + {"Space", "[ \\t\\n\\x0B\\f\\r]"}, + + // --- Java property names not recognised as PCRE2 long names --- + {"Control", "Cc"}, + {"Format", "Cf"}, + {"TitleCase", "Lt"}, + {"UpperCase", "Lu"}, + {"LowerCase", "Ll"}, + {"Letter", "L"}, + {"Mark", "M"}, + {"Number", "N"}, + {"Punctuation", "P"}, + 
{"Symbol", "S"}, + {"Separator", "Z"}, + {"Other", "C"}, + {"Assigned", "\\P{Cn}"}, + {"Unassigned", "Cn"}, + }; + return kTable; +} + +std::string toLower(std::string_view s) { + std::string out(s); + for (char& c : out) { + c = static_cast(std::tolower(static_cast(c))); + } + return out; +} + +std::optional resolveOrPass(std::string_view value) { + auto it = table().find(std::string(value)); + if (it != table().end()) { + return it->second; + } + return std::string(value); +} + +std::string camelCaseToUnderscores(std::string_view s); + +std::string upperBlockKey(std::string_view value) { + std::string out(value); + for (char& c : out) { + if (c == ' ') { + c = '_'; + } else { + c = static_cast(std::toupper(static_cast(c))); + } + } + return out; +} + +std::string normalizedBlockKey(std::string_view value) { + std::string out; + out.reserve(value.size()); + for (char c : value) { + const auto uc = static_cast(c); + if (c == '_' || c == '-' || std::isspace(uc)) { + continue; + } + out.push_back(static_cast(std::toupper(uc))); + } + return out; +} + +std::string resolveBlock(std::string_view blockName) { + const std::string upper = upperBlockKey(blockName); + if (upper == "HIGH_SURROGATES") { + return "[\\x{D800}-\\x{DB7F}]"; + } + if (upper == "HIGH_PRIVATE_USE_SURROGATES") { + return "[\\x{DB80}-\\x{DBFF}]"; + } + if (upper == "LOW_SURROGATES") { + return "[\\x{DC00}-\\x{DFFF}]"; + } + const std::string normalized = normalizedBlockKey(blockName); + if (normalized == "HIGHSURROGATES") { + return "[\\x{D800}-\\x{DB7F}]"; + } + if (normalized == "HIGHPRIVATEUSESURROGATES") { + return "[\\x{DB80}-\\x{DBFF}]"; + } + if (normalized == "LOWSURROGATES") { + return "[\\x{DC00}-\\x{DFFF}]"; + } + if (auto materialized = + JdkPropertyExpander::materializeUnicodeBlock(blockName)) { + return *materialized; + } + return camelCaseToUnderscores(blockName); +} + +// Inserts an `_` between every lowercase→uppercase boundary in a CamelCase +// string. E.g. 
`BasicLatin` → `Basic_Latin`. Returns `s` unchanged when +// the input already contains an underscore. +std::string camelCaseToUnderscores(std::string_view s) { + if (s.find('_') != std::string_view::npos) { + return std::string(s); + } + std::string out; + out.reserve(s.size() + 8); + for (std::size_t i = 0; i < s.size(); ++i) { + const char c = s[i]; + if (i > 0 && std::isupper(static_cast(c)) && + std::islower(static_cast(s[i - 1]))) { + out.push_back('_'); + } + out.push_back(c); + } + return out; +} + +} // namespace + +std::optional PropertyMap::apply(std::string_view name) { + // 0. Strip Java/Unicode qualifier prefixes: gc=Lu, sc=Greek, blk=Latin, … + const auto eq = name.find('='); + if (eq != std::string_view::npos && eq > 0) { + const std::string key = toLower(name.substr(0, eq)); + const std::string_view value = name.substr(eq + 1); + if (key == "gc" || key == "general_category") { + return resolveOrPass(value); + } + if (key == "sc" || key == "script") { + return resolveOrPass(value); + } + if (key == "blk" || key == "block") { + return resolveBlock(value); + } + return std::nullopt; + } + + if (name == "javaLowerCase" || name == "javaUpperCase" || + name == "javaSpaceChar") { + return JdkPropertyExpander::materializeJavaProperty(name); + } + + // 1. Exact table match. + const auto& t = table(); + auto it = t.find(std::string(name)); + if (it != t.end()) { + return it->second; + } + + // 2. \p{IsXxx} → strip "Is" prefix; prefer known JDK alias mapping over + // passthrough. + if (name.size() > 2 && name[0] == 'I' && name[1] == 's') { + const std::string stripped(name.substr(2)); + auto mit = t.find(stripped); + if (mit != t.end()) { + return mit->second; + } + return stripped; + } + + // 3. \p{InXxx} → strip "In" prefix and resolve the block name: surrogate + // blocks map to fixed ranges, blocks known to ICU are materialized as an + // explicit class, and any other name gets `_` at CamelCase boundaries. 
+ if (name.size() > 2 && name[0] == 'I' && name[1] == 'n') { + return resolveBlock(name.substr(2)); + } + + // 4. No rewrite. + return std::nullopt; +} + +} // namespace facebook::velox::functions::java_pcre2_translator diff --git a/velox/functions/lib/java_pcre2_translator/PropertyMap.h b/velox/functions/lib/java_pcre2_translator/PropertyMap.h new file mode 100644 index 00000000000..b611b2a73c0 --- /dev/null +++ b/velox/functions/lib/java_pcre2_translator/PropertyMap.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// +// Originally authored by Oleksii PELYKH for pcre4j; ported from +// org.pcre4j.regex.translate.PropertyMap (Java) under Apache-2.0 by the +// same author for inclusion in Velox. +// +#pragma once + +#include +#include +#include + +namespace facebook::velox::functions::java_pcre2_translator { + +/// Maps Java regex property names (as used in `\p{...}`) to PCRE2 +/// equivalents. +/// +/// Return convention for `apply(name)`: +/// * A bare name like `"Greek"` → caller emits `\p{Greek}` / `\P{Greek}`. +/// * A string starting with `'['` → caller substitutes the entire +/// `\p{name}` token with this string (used for expanded ranges and +/// multi-class expressions). +/// * `std::nullopt` → no rewrite; leave the token as-is. 
+class PropertyMap { + public: + static constexpr std::string_view kNeverMatch{"\x01NEVER_MATCH\x01"}; + + /// Resolves a Java regex property name to a PCRE2 equivalent. Returns + /// `std::nullopt` when no rewrite is needed (the caller should pass the + /// token through unchanged). + static std::optional apply(std::string_view name); + + private: + PropertyMap() = delete; +}; + +} // namespace facebook::velox::functions::java_pcre2_translator diff --git a/velox/functions/lib/java_pcre2_translator/RangeSet.cpp b/velox/functions/lib/java_pcre2_translator/RangeSet.cpp new file mode 100644 index 00000000000..9a2bfe804fd --- /dev/null +++ b/velox/functions/lib/java_pcre2_translator/RangeSet.cpp @@ -0,0 +1,244 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// +// Originally authored by Oleksii PELYKH for pcre4j; ported from +// org.pcre4j.regex.translate.RangeSet (Java) under Apache-2.0 by the +// same author for inclusion in Velox. +// +#include "velox/functions/lib/java_pcre2_translator/RangeSet.h" + +#include +#include +#include + +namespace facebook::velox::functions::java_pcre2_translator { + +namespace { + +// Emit a single code point inside a PCRE2 character class body — mirrors +// `ClassRenderer.emitLiteralInClass` from the Java sources. We inline +// it here to avoid a circular dep on the (yet to be ported) ClassRenderer +// module. 
When Phase 4 lands `ClassRenderer`, we can either keep this +// helper local to RangeSet or expose it; the function bodies are +// trivial enough that duplication is fine. +void emitLiteralInClass(std::int32_t cp, std::string& sb) { + if (cp >= 0x20 && cp <= 0x7E) { + switch (cp) { + case '\\': + case ']': + case '^': + case '-': + sb.push_back('\\'); + sb.push_back(static_cast(cp)); + return; + default: + sb.push_back(static_cast(cp)); + return; + } + } + char buf[16]; + std::snprintf(buf, sizeof(buf), "\\x{%X}", static_cast(cp)); + sb.append(buf); +} + +} // namespace + +const RangeSet& RangeSet::empty() { + static const RangeSet kEmpty{{}}; + return kEmpty; +} + +const RangeSet& RangeSet::all() { + static const RangeSet kAll{{0, kMaxCp}}; + return kAll; +} + +RangeSet RangeSet::single(std::int32_t cp) { + if (cp < 0 || cp > kMaxCp) { + throw std::invalid_argument( + "Code point out of range: " + std::to_string(cp)); + } + return RangeSet({cp, cp}); +} + +RangeSet RangeSet::range(std::int32_t lo, std::int32_t hi) { + if (lo < 0 || hi > kMaxCp || lo > hi) { + throw std::invalid_argument( + "Invalid range: [" + std::to_string(lo) + ", " + std::to_string(hi) + + "]"); + } + return RangeSet({lo, hi}); +} + +RangeSet RangeSet::fromSortedPairs(std::vector pairs) { + if (pairs.size() % 2 != 0) { + throw std::invalid_argument("Range pair vector must have even length"); + } + for (std::size_t i = 0; i < pairs.size(); i += 2) { + if (pairs[i] < 0 || pairs[i + 1] > kMaxCp || pairs[i] > pairs[i + 1]) { + throw std::invalid_argument("Invalid sorted range pair"); + } + if (i > 0 && pairs[i] < pairs[i - 2]) { + throw std::invalid_argument("Range pairs must be sorted by lower bound"); + } + } + return normalise(std::move(pairs)); +} + +RangeSet RangeSet::unionWith(const RangeSet& other) const { + if (isEmpty()) { + return other; + } + if (other.isEmpty()) { + return *this; + } + const auto& a = ranges_; + const auto& b = other.ranges_; + std::vector merged; + 
merged.reserve(a.size() + b.size()); + std::size_t i = 0, j = 0; + while (i < a.size() && j < b.size()) { + if (a[i] <= b[j]) { + merged.push_back(a[i]); + merged.push_back(a[i + 1]); + i += 2; + } else { + merged.push_back(b[j]); + merged.push_back(b[j + 1]); + j += 2; + } + } + while (i < a.size()) { + merged.push_back(a[i]); + merged.push_back(a[i + 1]); + i += 2; + } + while (j < b.size()) { + merged.push_back(b[j]); + merged.push_back(b[j + 1]); + j += 2; + } + return normalise(std::move(merged)); +} + +RangeSet RangeSet::intersect(const RangeSet& other) const { + if (isEmpty() || other.isEmpty()) { + return empty(); + } + const auto& a = ranges_; + const auto& b = other.ranges_; + std::vector out; + out.reserve(std::min(a.size(), b.size())); + std::size_t i = 0, j = 0; + while (i < a.size() && j < b.size()) { + const std::int32_t lo = std::max(a[i], b[j]); + const std::int32_t hi = std::min(a[i + 1], b[j + 1]); + if (lo <= hi) { + out.push_back(lo); + out.push_back(hi); + } + if (a[i + 1] < b[j + 1]) { + i += 2; + } else { + j += 2; + } + } + if (out.empty()) { + return empty(); + } + return RangeSet(std::move(out)); +} + +RangeSet RangeSet::complement() const { + if (isEmpty()) { + return all(); + } + std::vector out; + out.reserve(ranges_.size() + 2); + std::int32_t prev = 0; + for (std::size_t i = 0; i < ranges_.size(); i += 2) { + if (prev < ranges_[i]) { + out.push_back(prev); + out.push_back(ranges_[i] - 1); + } + prev = ranges_[i + 1] + 1; + } + if (prev <= kMaxCp) { + out.push_back(prev); + out.push_back(kMaxCp); + } + if (out.empty()) { + return empty(); + } + return RangeSet(std::move(out)); +} + +RangeSet RangeSet::subtract(const RangeSet& other) const { + return intersect(other.complement()); +} + +bool RangeSet::contains(std::int32_t cp) const { + for (std::size_t i = 0; i < ranges_.size(); i += 2) { + if (cp >= ranges_[i] && cp <= ranges_[i + 1]) { + return true; + } + if (cp < ranges_[i]) { + return false; + } + } + return false; +} + 
+std::string RangeSet::toPcre2ClassBody() const { + std::string sb; + for (std::size_t i = 0; i < ranges_.size(); i += 2) { + const std::int32_t lo = ranges_[i]; + const std::int32_t hi = ranges_[i + 1]; + emitLiteralInClass(lo, sb); + if (lo != hi) { + sb.push_back('-'); + emitLiteralInClass(hi, sb); + } + } + return sb; +} + +RangeSet RangeSet::normalise(std::vector&& raw) { + if (raw.empty()) { + return empty(); + } + std::vector out; + out.reserve(raw.size()); + std::int32_t curLo = raw[0]; + std::int32_t curHi = raw[1]; + for (std::size_t i = 2; i < raw.size(); i += 2) { + const std::int32_t lo = raw[i]; + const std::int32_t hi = raw[i + 1]; + if (lo <= curHi + 1) { + curHi = std::max(curHi, hi); + } else { + out.push_back(curLo); + out.push_back(curHi); + curLo = lo; + curHi = hi; + } + } + out.push_back(curLo); + out.push_back(curHi); + return RangeSet(std::move(out)); +} + +} // namespace facebook::velox::functions::java_pcre2_translator diff --git a/velox/functions/lib/java_pcre2_translator/RangeSet.h b/velox/functions/lib/java_pcre2_translator/RangeSet.h new file mode 100644 index 00000000000..ce78f7186c7 --- /dev/null +++ b/velox/functions/lib/java_pcre2_translator/RangeSet.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+//
+// Originally authored by Oleksii PELYKH for pcre4j; ported from
+// org.pcre4j.regex.translate.RangeSet (Java) under Apache-2.0 by the
+// same author for inclusion in Velox.
+//
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+namespace facebook::velox::functions::java_pcre2_translator {
+
+/// Immutable, sorted, disjoint set of Unicode code-point ranges over
+/// [0, 0x10FFFF]. Internally stored as a flat `std::vector<std::int32_t>`
+/// `[lo0, hi0, lo1, hi1, ...]` with `lo0 <= hi0` and `hi0 + 1 < lo1`, i.e.
+/// ranges never overlap or touch. All endpoints are inclusive.
+class RangeSet {
+ public:
+  /// Largest valid code point; the universe of every set is [0, kMaxCp].
+  static constexpr std::int32_t kMaxCp = 0x10FFFF;
+
+  /// Returns the empty set.
+  static const RangeSet& empty();
+
+  /// Returns the set containing every code point [0, kMaxCp].
+  static const RangeSet& all();
+
+  /// Creates a set containing the single code point `cp`.
+  /// Throws `std::invalid_argument` when `cp` is out of range.
+  static RangeSet single(std::int32_t cp);
+
+  /// Creates a set containing the range [lo, hi] inclusive.
+  /// Throws `std::invalid_argument` when the range is invalid.
+  static RangeSet range(std::int32_t lo, std::int32_t hi);
+
+  /// Creates a set from already sorted [lo, hi] pairs, merging adjacent spans.
+  static RangeSet fromSortedPairs(std::vector<std::int32_t> pairs);
+
+  /// Returns the union of this set and `other`.
+  RangeSet unionWith(const RangeSet& other) const;
+
+  /// Returns the intersection of this set and `other`.
+  RangeSet intersect(const RangeSet& other) const;
+
+  /// Returns the complement of this set within [0, kMaxCp].
+  RangeSet complement() const;
+
+  /// Returns `this - other`.
+  RangeSet subtract(const RangeSet& other) const;
+
+  /// Returns true iff this set contains no code points.
+  bool isEmpty() const {
+    return ranges_.empty();
+  }
+
+  /// Returns true iff this set contains `cp`.
+  bool contains(std::int32_t cp) const;
+
+  /// Emits the content of this set as a PCRE2 character-class body — i.e.
+  /// what would appear between `[` and `]`. Printable ASCII in the
+  /// range 0x20–0x7E is emitted literally except for `\`, `]`, `^`, `-`
+  /// which are backslash-escaped; all other code points are emitted as
+  /// `\x{HH...}`. Contiguous ranges of two-or-more code points are
+  /// emitted as `lo-hi`.
+  std::string toPcre2ClassBody() const;
+
+  /// Number of contiguous ranges (for testing).
+  int rangeCount() const {
+    return static_cast<int>(ranges_.size() / 2);
+  }
+
+  const std::vector<std::int32_t>& ranges() const {
+    return ranges_; // Flat [lo0, hi0, lo1, hi1, ...] storage, read-only.
+  }
+
+  /// Equality based on the normalised range vector.
+  bool operator==(const RangeSet& other) const {
+    return ranges_ == other.ranges_;
+  }
+  bool operator!=(const RangeSet& other) const {
+    return !(*this == other);
+  }
+
+ private:
+  explicit RangeSet(std::vector<std::int32_t> ranges)
+      : ranges_(std::move(ranges)) {} // Stored as-is, without validation.
+
+  /// Merges overlapping/adjacent pairs in `raw` (a flat [lo, hi, ...] list
+  /// already sorted by `lo`) and returns the resulting `RangeSet`.
+  static RangeSet normalise(std::vector<std::int32_t>&& raw);
+
+  /// Flat [lo, hi] pairs: sorted, non-overlapping, non-adjacent.
+  std::vector<std::int32_t> ranges_;
+};
+
+} // namespace facebook::velox::functions::java_pcre2_translator
diff --git a/velox/functions/lib/java_pcre2_translator/tests/CMakeLists.txt b/velox/functions/lib/java_pcre2_translator/tests/CMakeLists.txt
new file mode 100644
index 00000000000..5231d51ad2f
--- /dev/null
+++ b/velox/functions/lib/java_pcre2_translator/tests/CMakeLists.txt
@@ -0,0 +1,33 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Builds and registers the unit tests for velox_java_pcre2_translator.
+
+add_executable(
+  velox_java_pcre2_translator_test
+  ClassBodyParserTest.cpp
+  ClassRendererTest.cpp
+  EvaluatorTest.cpp
+  JavaRegexTranslatorTest.cpp
+  JdkPropertyExpanderTest.cpp
+  PropertyMapTest.cpp
+  RangeSetTest.cpp
+)
+
+target_link_libraries(
+  velox_java_pcre2_translator_test
+  PRIVATE velox_java_pcre2_translator GTest::gtest GTest::gtest_main
+)
+
+add_test(NAME velox_java_pcre2_translator_test COMMAND velox_java_pcre2_translator_test)
diff --git a/velox/functions/lib/java_pcre2_translator/tests/ClassBodyParserTest.cpp b/velox/functions/lib/java_pcre2_translator/tests/ClassBodyParserTest.cpp
new file mode 100644
index 00000000000..5f74b19b7d7
--- /dev/null
+++ b/velox/functions/lib/java_pcre2_translator/tests/ClassBodyParserTest.cpp
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+//
+// Originally authored by Oleksii PELYKH for pcre4j; ported from
+// org.pcre4j.regex.translate.ClassBodyParserTest (Java) under Apache-2.0 by the
+// same author for inclusion in Velox.
+//
+#include "velox/functions/lib/java_pcre2_translator/ClassBodyParser.h"
+
+#include
+
+#include
+
+namespace facebook::velox::functions::java_pcre2_translator::test {
+namespace {
+
+ClassNode parse(std::string_view classStr) { // Parses from offset 0.
+  std::size_t pos = 0;
+  return ClassBodyParser::parseClass(classStr, pos);
+}
+
+} // namespace
+
+TEST(ClassBodyParser, simpleLiterals) {
+  auto node = parse("[abc]");
+  auto* u = node.getIf();
+  ASSERT_NE(nullptr, u);
+  ASSERT_EQ(3, u->children.size());
+  EXPECT_EQ(ClassNode(Literal('a')), *u->children[0]);
+  EXPECT_EQ(ClassNode(Literal('b')), *u->children[1]);
+  EXPECT_EQ(ClassNode(Literal('c')), *u->children[2]);
+}
+
+TEST(ClassBodyParser, singleCharClass) { // One member: no wrapper node.
+  EXPECT_EQ(ClassNode(Literal('a')), parse("[a]"));
+}
+
+TEST(ClassBodyParser, rangeClass) {
+  EXPECT_EQ(ClassNode(Range('a', 'z')), parse("[a-z]"));
+}
+
+TEST(ClassBodyParser, negatedRange) {
+  auto node = parse("[^a-z]");
+  auto* neg = node.getIf();
+  ASSERT_NE(nullptr, neg);
+  EXPECT_EQ(ClassNode(Range('a', 'z')), *neg->child);
+}
+
+TEST(ClassBodyParser, nestedClassUnion) { // a, b, c plus the nested class.
+  auto node = parse("[abc[def]]");
+  auto* u = node.getIf();
+  ASSERT_NE(nullptr, u);
+  ASSERT_EQ(4, u->children.size());
+  EXPECT_EQ(ClassNode(Literal('a')), *u->children[0]);
+  EXPECT_TRUE(u->children[3]->is());
+}
+
+TEST(ClassBodyParser, intersection) {
+  auto node = parse("[a-c&&d-f]");
+  auto* inter = node.getIf();
+  ASSERT_NE(nullptr, inter);
+  ASSERT_EQ(2, inter->operands.size());
+  EXPECT_EQ(ClassNode(Range('a', 'c')), *inter->operands[0]);
+  EXPECT_EQ(ClassNode(Range('d', 'f')), *inter->operands[1]);
+}
+
+TEST(ClassBodyParser, wDashHashPattern) { // '-' after \w is a literal.
+  auto node = parse("[\\w-#]");
+  auto* u = node.getIf();
+  ASSERT_NE(nullptr, u);
+  ASSERT_EQ(3, u->children.size());
+  EXPECT_TRUE(u->children[0]->is());
+  EXPECT_EQ(ClassNode(Literal('-')), *u->children[1]);
+  EXPECT_EQ(ClassNode(Literal('#')), *u->children[2]);
+}
+
+TEST(ClassBodyParser, shorthandEscapes) {
+  auto node = parse("[\\d\\p{L}]");
+  auto* u = node.getIf();
+  ASSERT_NE(nullptr, u);
+  ASSERT_EQ(2, u->children.size());
+  ASSERT_TRUE(u->children[0]->is());
+  EXPECT_EQ("\\d", u->children[0]->getIf()->pcre2Token);
+  EXPECT_TRUE(u->children[1]->is());
+}
+
+TEST(ClassBodyParser, bracketPropertyRewriteParsesAsAst) { // Alpha: a-z, A-Z.
+  auto node = parse("[\\p{Alpha}]");
+  auto* u = node.getIf();
+  ASSERT_NE(nullptr, u);
+  ASSERT_EQ(2, u->children.size());
+  EXPECT_EQ(ClassNode(Range('a', 'z')), *u->children[0]);
+  EXPECT_EQ(ClassNode(Range('A', 'Z')), *u->children[1]);
+}
+
+TEST(ClassBodyParser, negatedBracketPropertyRewriteParsesAsNegatedAst) {
+  auto node = parse("[\\P{Alpha}]");
+  auto* neg = node.getIf();
+  ASSERT_NE(nullptr, neg);
+  EXPECT_TRUE(neg->child->is());
+}
+
+TEST(ClassBodyParser, quotedBracket) { // ']' between \Q and \E is a literal.
+  EXPECT_EQ(ClassNode(Literal(']')), parse("[\\Q]\\E]"));
+}
+
+TEST(ClassBodyParser, hexEscape) {
+  EXPECT_EQ(ClassNode(Literal('A')), parse("[\\x41]"));
+}
+
+TEST(ClassBodyParser, unicodeEscape) {
+  EXPECT_EQ(ClassNode(Literal('A')), parse("[\\u0041]"));
+}
+
+TEST(ClassBodyParser, escapedNonAsciiLiteralConsumesWholeCodePoint) {
+  EXPECT_EQ(ClassNode(Range('a', 0x4444)), parse("[a-\\\xE4\x91\x84]")); // E4 91 84 is UTF-8 for U+4444.
+}
+
+TEST(ClassBodyParser, multipleIntersectionOperands) {
+  auto node = parse("[a-m&&m-z&&a-c]");
+  auto* inter = node.getIf();
+  ASSERT_NE(nullptr, inter);
+  EXPECT_EQ(3, inter->operands.size());
+}
+
+TEST(ClassBodyParser, nestedNegatedClass) {
+  auto node = parse("[a-d[^0-9]]");
+  auto* u = node.getIf();
+  ASSERT_NE(nullptr, u);
+  ASSERT_EQ(2, u->children.size());
+  EXPECT_TRUE(u->children[1]->is());
+}
+
+TEST(ClassBodyParser, intersectionWithNestedClass) {
+  EXPECT_TRUE(parse("[[a-m]&&[m-z]]").is());
+}
+
+TEST(ClassBodyParser, rangeAtEndOfClass) { // NOTE(review): input uses an escaped '-', not a bare trailing one.
+  EXPECT_TRUE(parse("[a\\-]").is());
+}
+
+TEST(ClassBodyParser, unterminatedClassThrows) {
+  EXPECT_THROW(parse("[abc"), std::invalid_argument);
+}
+
+TEST(ClassBodyParser, unterminatedNegatedClassThrows) {
+  EXPECT_THROW(parse("[^abc"), std::invalid_argument);
+}
+
+TEST(ClassBodyParser, unterminatedNestedClassThrows) { // Outer ']' missing.
+  EXPECT_THROW(parse("[a[b-c]"), std::invalid_argument);
+}
+
+TEST(ClassBodyParser, incompleteHexEscapeThrows) { // Zero or one hex digit.
+  EXPECT_THROW(parse("[\\x]"), std::invalid_argument);
+  EXPECT_THROW(parse("[\\xA]"), std::invalid_argument);
+}
+
+TEST(ClassBodyParser, unterminatedHexBraceEscapeThrows) {
+  EXPECT_THROW(parse("[\\x{ABC]"), std::invalid_argument);
+}
+
+TEST(ClassBodyParser, emptyHexBraceEscapeThrows) {
+  EXPECT_THROW(parse("[\\x{}]"), std::invalid_argument);
+}
+
+TEST(ClassBodyParser, outOfRangeHexBraceEscapeThrows) {
+  EXPECT_THROW(parse("[\\x{110000}]"), std::invalid_argument); // 0x10FFFF + 1.
+  EXPECT_THROW(parse("[\\x{FFFFFFFFF}]"), std::invalid_argument);
+}
+
+TEST(ClassBodyParser, incompleteUnicodeEscapeThrows) { // Fewer than 4 digits.
+  EXPECT_THROW(parse("[\\u]"), std::invalid_argument);
+  EXPECT_THROW(parse("[\\u00]"), std::invalid_argument);
+  EXPECT_THROW(parse("[\\u00A]"), std::invalid_argument);
+}
+
+TEST(ClassBodyParser, octalEscapeAcceptsThreeDigits) { // Octal 101 == 0x41.
+  EXPECT_EQ(ClassNode(Literal(0x41)), parse("[\\0101]"));
+}
+
+TEST(ClassBodyParser, octalEscapeStopsAtNonOctalChar) { // '8' ends the run.
+  auto node = parse("[\\08]");
+  auto* u = node.getIf();
+  ASSERT_NE(nullptr, u);
+  ASSERT_EQ(2, u->children.size());
+  EXPECT_EQ(ClassNode(Literal(0)), *u->children[0]);
+  EXPECT_EQ(ClassNode(Literal('8')), *u->children[1]);
+}
+
+TEST(ClassBodyParser, octalEscapeCappedAtFF) { // Read as \040 then '0'.
+  auto node = parse("[\\0400]");
+  auto* u = node.getIf();
+  ASSERT_NE(nullptr, u);
+  ASSERT_EQ(2, u->children.size());
+  EXPECT_EQ(ClassNode(Literal(0x20)), *u->children[0]);
+  EXPECT_EQ(ClassNode(Literal('0')), *u->children[1]);
+}
+
+TEST(ClassBodyParser, controlCharacterEscape) {
+  EXPECT_EQ(ClassNode(Literal(0x01)), parse("[\\cA]"));
+}
+
+TEST(ClassBodyParser, simpleEscapesProduceLiterals) {
+  EXPECT_EQ(ClassNode(Literal(0x07)), parse("[\\a]"));
+  EXPECT_EQ(ClassNode(Literal(0x1B)), parse("[\\e]"));
+  EXPECT_EQ(ClassNode(Literal('\n')), parse("[\\n]"));
+  EXPECT_EQ(ClassNode(Literal('\t')), parse("[\\t]"));
+}
+
+TEST(ClassBodyParser, trailingBackslashThrows) {
+  EXPECT_THROW(parse("[\\"), std::invalid_argument);
+}
+
+} // namespace facebook::velox::functions::java_pcre2_translator::test
diff --git a/velox/functions/lib/java_pcre2_translator/tests/ClassRendererTest.cpp b/velox/functions/lib/java_pcre2_translator/tests/ClassRendererTest.cpp
new file mode 100644
index 00000000000..0d9ddb5e674
--- /dev/null
+++ b/velox/functions/lib/java_pcre2_translator/tests/ClassRendererTest.cpp
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+//
+// Originally authored by Oleksii PELYKH for pcre4j; ported from
+// org.pcre4j.regex.translate.ClassRendererTest (Java) under Apache-2.0 by the
+// same author for inclusion in Velox.
+//
+#include "velox/functions/lib/java_pcre2_translator/ClassRenderer.h"
+
+#include "velox/functions/lib/java_pcre2_translator/ClassBodyParser.h"
+
+#include
+
+namespace facebook::velox::functions::java_pcre2_translator::test {
+
+namespace {
+
+std::string render(std::string_view classStr) { // Parse, then render.
+  std::size_t pos = 0;
+  auto node = ClassBodyParser::parseClass(classStr, pos);
+  return ClassRenderer::render(node);
+}
+
+} // namespace
+
+TEST(ClassRenderer, simpleLiterals) {
+  EXPECT_EQ(render("[abc]"), "[abc]");
+}
+
+TEST(ClassRenderer, simpleRange) {
+  EXPECT_EQ(render("[a-z]"), "[a-z]");
+}
+
+TEST(ClassRenderer, negatedRange) {
+  EXPECT_EQ(render("[^a-z]"), "[^a-z]");
+}
+
+TEST(ClassRenderer, nestedUnionFlattens) {
+  EXPECT_EQ(render("[abc[def]]"), "[abcdef]");
+}
+
+TEST(ClassRenderer, negatedNestedFlattens) { // No "[[", still starts "[^".
+  const auto result = render("[^a-d[0-9]]");
+  EXPECT_EQ(result.find("[["), std::string::npos) << result;
+  EXPECT_EQ(result.rfind("[^", 0), 0) << result;
+}
+
+TEST(ClassRenderer, intersectionLiteralRange) { // a-c & b-d keeps only b, c.
+  const auto result = render("[a-c&&b-d]");
+  EXPECT_NE(result.find("b"), std::string::npos) << result;
+  EXPECT_NE(result.find("c"), std::string::npos) << result;
+  EXPECT_EQ(result.find("a"), std::string::npos) << result;
+  EXPECT_EQ(result.find("d"), std::string::npos) << result;
+}
+
+TEST(ClassRenderer, intersectionDisjoint) { // Empty set: negated full range.
+  EXPECT_EQ(render("[a-c&&d-f]"), "[^\\x{0}-\\x{10FFFF}]");
+}
+
+TEST(ClassRenderer, wDashHashEscapesDash) {
+  const auto result = render("[\\w-#]");
+  EXPECT_NE(result.find("\\w"), std::string::npos) << result;
+  EXPECT_NE(result.find("\\-"), std::string::npos) << result;
+}
+
+TEST(ClassRenderer, intersectionWithKnownProperty) { // "&&" must be gone.
+  const auto result = render("[\\d&&[0-3]]");
+  EXPECT_NE(result.find("0"), std::string::npos) << result;
+  EXPECT_NE(result.find("3"), std::string::npos) << result;
+  EXPECT_EQ(result.find("&&"), std::string::npos) << result;
+}
+
+TEST(ClassRenderer, intersectionWithJdkExpandableProperty) {
+  EXPECT_EQ(render("[\\p{L}&&[a-z]]"), "[a-z]");
+}
+
+TEST(ClassRenderer, intersectionWithBracketMappedProperty) {
+  EXPECT_EQ(render("[\\p{Alpha}&&[a-z]]"), "[a-z]");
+}
+
+TEST(ClassRenderer, intersectionWithJavaAlphabeticProperty) {
+  EXPECT_EQ(render("[\\p{javaAlphabetic}&&[a-z]]"), "[a-z]");
+}
+
+TEST(ClassRenderer, intersectionWithScriptAlias) { // No '&' may remain.
+  const auto result = render("[\\p{sc=Grek}&&\\p{L}]");
+  EXPECT_EQ(result.find("&&"), std::string::npos) << result;
+  EXPECT_EQ(result.find("&"), std::string::npos) << result;
+}
+
+TEST(ClassRenderer, pureIntersectionFallbackWithUnknownProperty) { // Property, "&&" and a-z must all survive.
+  const auto result = render("[\\p{UnknownXyz}&&[a-z]]");
+  EXPECT_NE(result.find("\\p{UnknownXyz}"), std::string::npos) << result;
+  EXPECT_NE(result.find("&&"), std::string::npos) << result;
+  EXPECT_TRUE(
+      result.find("a-z") != std::string::npos ||
+      (result.find("a") != std::string::npos &&
+       result.find("z") != std::string::npos))
+      << result;
+}
+
+TEST(ClassRenderer, nestedNegatedIntersection) { // Complement of empty set.
+  EXPECT_EQ(render("[^[a-c]&&[d-f]]"), "[\\x{0}-\\x{10FFFF}]");
+}
+
+TEST(ClassRenderer, negatedIntersectionOfRanges) { // Complement of b-c.
+  EXPECT_EQ(render("[^a-c&&b-d]"), "[\\x{0}-ad-\\x{10FFFF}]");
+}
+
+TEST(ClassRenderer, propertyLeafPassesThrough) {
+  const auto result = render("[\\d\\w]");
+  EXPECT_NE(result.find("\\d"), std::string::npos) << result;
+  EXPECT_NE(result.find("\\w"), std::string::npos) << result;
+}
+
+TEST(ClassRenderer, multipleIntersectionOperands) { // {m} & a-c is empty.
+  EXPECT_EQ(render("[a-m&&m-z&&a-c]"), "[^\\x{0}-\\x{10FFFF}]");
+}
+
+TEST(ClassRenderer, nestedNegatedWithUnknownPropertyPreservesNegation) {
+  const auto result = render("[abc[^\\p{UnknownXyz}]]");
+  EXPECT_NE(result.find("[^"), std::string::npos) << result;
+  EXPECT_NE(result.find("\\p{UnknownXyz}"), std::string::npos) << result;
+}
+
+} // namespace facebook::velox::functions::java_pcre2_translator::test
diff --git a/velox/functions/lib/java_pcre2_translator/tests/EvaluatorTest.cpp
b/velox/functions/lib/java_pcre2_translator/tests/EvaluatorTest.cpp new file mode 100644 index 00000000000..424a827685e --- /dev/null +++ b/velox/functions/lib/java_pcre2_translator/tests/EvaluatorTest.cpp @@ -0,0 +1,125 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// +// Originally authored by Oleksii PELYKH for pcre4j; ported from +// org.pcre4j.regex.translate.EvaluatorTest (Java) under Apache-2.0 by the +// same author for inclusion in Velox. 
+//
+#include "velox/functions/lib/java_pcre2_translator/Evaluator.h"
+
+#include "velox/functions/lib/java_pcre2_translator/ClassBodyParser.h"
+#include "velox/functions/lib/java_pcre2_translator/EvaluationFailedException.h"
+
+#include
+
+namespace facebook::velox::functions::java_pcre2_translator::test {
+
+class EvaluatorPosixShorthand // Param: (token, code point it must contain).
+    : public testing::TestWithParam> {};
+
+TEST_P(
+    EvaluatorPosixShorthand,
+    positivePosixShorthandsContainExpectedCodePoint) {
+  auto [token, cp] = GetParam();
+  auto rs = Evaluator::toRangeSet(ClassNode(PropertyLeaf(token, false)));
+  EXPECT_TRUE(rs.contains(cp)) << token;
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    Tokens,
+    EvaluatorPosixShorthand,
+    testing::Values(
+        std::make_tuple("\\d", 48), // '0'
+        std::make_tuple("\\w", 95), // '_'
+        std::make_tuple("\\s", 32), // ' '
+        std::make_tuple("\\p{ASCII}", 65), // 'A'
+        std::make_tuple("\\p{Alpha}", 65), // 'A'
+        std::make_tuple("\\p{Alnum}", 48), // '0'
+        std::make_tuple("\\p{Lower}", 97), // 'a'
+        std::make_tuple("\\p{Upper}", 65), // 'A'
+        std::make_tuple("\\p{Digit}", 48), // '0'
+        std::make_tuple("\\p{XDigit}", 102), // 'f'
+        std::make_tuple("\\p{Space}", 32), // ' '
+        std::make_tuple("\\p{Blank}", 9), // '\t'
+        std::make_tuple("\\p{Cntrl}", 0), // NUL
+        std::make_tuple("\\p{Graph}", 33), // '!'
+        std::make_tuple("\\p{Print}", 32), // ' '
+        std::make_tuple("\\p{Punct}", 46))); // '.'
+
+TEST(Evaluator, negatedShorthandsComplementCorrectly) {
+  auto nd = Evaluator::toRangeSet(ClassNode(PropertyLeaf("\\D", true)));
+  EXPECT_TRUE(nd.contains('a'));
+  EXPECT_FALSE(nd.contains('0'));
+
+  auto ns = Evaluator::toRangeSet(ClassNode(PropertyLeaf("\\S", true)));
+  EXPECT_FALSE(ns.contains(' '));
+  EXPECT_TRUE(ns.contains('a'));
+}
+
+TEST(Evaluator, unknownPropertyThrowsEvaluationFailed) {
+  EXPECT_THROW(
+      Evaluator::toRangeSet(
+          ClassNode(PropertyLeaf("\\p{ThisPropertyDoesNotExistXyz}", false))),
+      EvaluationFailedException);
+}
+
+TEST(Evaluator, unknownPropertyInsideIntersectionThrows) { // One bad operand fails the whole node.
+  auto inter = ClassNode(Intersection(
+      std::vector{
+          ClassNode(PropertyLeaf("\\p{UnknownXyz}", false)),
+          ClassNode(Range('a', 'z'))}));
+  EXPECT_THROW(Evaluator::toRangeSet(inter), EvaluationFailedException);
+}
+
+TEST(Evaluator, tryToRangeSetReturnsNullOnFailure) { // "Null": no value.
+  EXPECT_FALSE(
+      Evaluator::tryToRangeSet(
+          ClassNode(PropertyLeaf("\\p{UnknownXyz}", false)))
+          .has_value());
+}
+
+TEST(Evaluator, tryToRangeSetReturnsRangeSetOnSuccess) {
+  auto rs = Evaluator::tryToRangeSet(ClassNode(PropertyLeaf("\\d", false)));
+  ASSERT_TRUE(rs.has_value());
+  EXPECT_TRUE(rs->contains('5'));
+}
+
+TEST(Evaluator, javaAlphabeticIntersectionEvaluates) {
+  std::size_t pos = 0;
+  auto node = ClassBodyParser::parseClass("[\\p{javaAlphabetic}&&[a-z]]", pos);
+  auto rs = Evaluator::toRangeSet(node);
+  EXPECT_TRUE(rs.contains('a'));
+  EXPECT_FALSE(rs.contains(0x03B1)); // Outside a-z.
+  EXPECT_FALSE(rs.contains('&')); // "&&" is an operator, not a member.
+}
+
+TEST(Evaluator, scriptAliasIntersectionEvaluates) {
+  std::size_t pos = 0;
+  auto node = ClassBodyParser::parseClass("[\\p{sc=Grek}&&\\p{L}]", pos);
+  auto rs = Evaluator::toRangeSet(node);
+  EXPECT_TRUE(rs.contains(0x03B1));
+  EXPECT_FALSE(rs.contains('a'));
+  EXPECT_FALSE(rs.contains('&'));
+}
+
+TEST(Evaluator, inPrefixIntersectionUsesBlockNotScript) {
+  std::size_t pos = 0;
+  auto node = ClassBodyParser::parseClass("[\\p{InGreek}&&\\x{1F00}]", pos);
+  auto rs = Evaluator::toRangeSet(node);
+  EXPECT_FALSE(rs.contains(0x1F00)); // Outside [0x370, 0x3FF], the InGreek block range.
+}
+
+} // namespace facebook::velox::functions::java_pcre2_translator::test
diff --git a/velox/functions/lib/java_pcre2_translator/tests/JavaRegexTranslatorTest.cpp b/velox/functions/lib/java_pcre2_translator/tests/JavaRegexTranslatorTest.cpp
new file mode 100644
index 00000000000..1e9ed5f52ca
--- /dev/null
+++ b/velox/functions/lib/java_pcre2_translator/tests/JavaRegexTranslatorTest.cpp
@@ -0,0 +1,363 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+//
+// Originally authored by Oleksii PELYKH for pcre4j; ported from
+// org.pcre4j.regex.translate.JavaRegexTranslatorTest (Java) under
+// Apache-2.0 by the same author for inclusion in Velox.
+//
+#include "velox/functions/lib/java_pcre2_translator/JavaRegexTranslator.h"
+
+#include
+
+namespace facebook::velox::functions::java_pcre2_translator::test {
+
+TEST(JavaRegexTranslator, passthroughForPatternsWithoutProperties) {
+  EXPECT_EQ("\\d+", toPcre2Pattern("\\d+"));
+  EXPECT_EQ("[a-z]", toPcre2Pattern("[a-z]"));
+  EXPECT_EQ("abc", toPcre2Pattern("abc"));
+}
+
+TEST(JavaRegexTranslator, rewritesInBlockProperty) { // In/blk=/block= forms.
+  EXPECT_EQ("[\\x{370}-\\x{3FF}]", toPcre2Pattern("\\p{InGreek}"));
+  EXPECT_EQ("[^\\x{370}-\\x{3FF}]", toPcre2Pattern("\\P{InGreek}"));
+  EXPECT_EQ("[\\x{370}-\\x{3FF}]", toPcre2Pattern("\\p{blk=Greek}"));
+  EXPECT_EQ("[^\\x{370}-\\x{3FF}]", toPcre2Pattern("\\P{block=Greek}"));
+  EXPECT_EQ("[\\x{0}-\\x{7F}]", toPcre2Pattern("\\p{blk=BasicLatin}"));
+  EXPECT_EQ("a[\\x{370}-\\x{3FF}]b", toPcre2Pattern("a\\p{InGreek}b"));
+}
+
+TEST(JavaRegexTranslator, rewritesIsScriptProperty) { // "Is" prefix dropped.
+  EXPECT_EQ("\\p{L}", toPcre2Pattern("\\p{IsL}"));
+  EXPECT_EQ("\\p{LC}", toPcre2Pattern("\\p{IsLC}"));
+  EXPECT_EQ("\\p{ASCII}", toPcre2Pattern("\\p{IsASCII}"));
+}
+
+TEST(JavaRegexTranslator, rewritesShortAliases) {
+  EXPECT_EQ("[\\x{00}-\\x{FF}]", toPcre2Pattern("\\p{L1}"));
+}
+
+TEST(JavaRegexTranslator, rewritesJavaProperty) {
+  const auto result = toPcre2Pattern("\\p{javaLowerCase}");
+  EXPECT_TRUE(result.starts_with("[")) << result;
+  EXPECT_NE(std::string::npos, result.find("\\x{AA}")) << result;
+  EXPECT_NE("\\p{Ll}", result); // Must not collapse to plain \p{Ll}.
+}
+
+TEST(JavaRegexTranslator, rewritesUnicodeEscapeSurrogatePairs) {
+  EXPECT_EQ("\\x{1f600}", toPcre2Pattern("\\uD83D\\uDE00"));
+  EXPECT_THROW(toPcre2Pattern("\\uD83D"), EvaluationFailedException); // Lone high surrogate.
+}
+
+TEST(JavaRegexTranslator, doesNotRewriteInsideQuotation) {
+  EXPECT_EQ("\\Q\\p{InGreek}\\E", toPcre2Pattern("\\Q\\p{InGreek}\\E"));
+}
+
+TEST(JavaRegexTranslator, doesNotRewriteEscapedBackslashFollowedByP) { // NOTE(review): presumably rejected as an illegal quantifier body.
+  EXPECT_THROW(toPcre2Pattern("\\\\p{InGreek}"), EvaluationFailedException);
+}
+
+TEST(JavaRegexTranslator, rejectsIllegalQuantifierBody) {
+  EXPECT_THROW(toPcre2Pattern("a{^InGreek}"), EvaluationFailedException);
+  EXPECT_THROW(toPcre2Pattern("a{}"), EvaluationFailedException);
+  EXPECT_THROW(toPcre2Pattern("a{,3}"), EvaluationFailedException);
+  EXPECT_THROW(toPcre2Pattern("a{"), EvaluationFailedException);
+  EXPECT_THROW(toPcre2Pattern("a{3"), EvaluationFailedException);
+}
+
+TEST(JavaRegexTranslator, acceptsValidQuantifiers) {
+  EXPECT_EQ("a{3}", toPcre2Pattern("a{3}"));
+  EXPECT_EQ("a{3,}", toPcre2Pattern("a{3,}"));
+  EXPECT_EQ("a{3,5}", toPcre2Pattern("a{3,5}"));
+}
+
+TEST(JavaRegexTranslator, escapeHatchDisablesTranslator) { // NOTE(review): nothing is toggled here; this only re-checks the default rewrite.
+  EXPECT_EQ("[\\x{370}-\\x{3FF}]", toPcre2Pattern("\\p{InGreek}"));
+}
+
+TEST(JavaRegexTranslator, rewritesSurrogateBlockToRange) {
+  EXPECT_EQ("[\\x{D800}-\\x{DB7F}]", toPcre2Pattern("\\p{InHIGH_SURROGATES}"));
+  EXPECT_EQ("[\\x{DC00}-\\x{DFFF}]", toPcre2Pattern("\\p{InLOW_SURROGATES}"));
+}
+
+TEST(JavaRegexTranslator, surrogateBlockNeedsRawByteMode) {
+  bool needsRawByteMode = false;
+  EXPECT_EQ(
+      "[\\x{D800}-\\x{DB7F}]",
+      toPcre2Pattern("\\p{InHIGH_SURROGATES}", needsRawByteMode));
+  EXPECT_TRUE(needsRawByteMode);
+}
+
+TEST(JavaRegexTranslator, reportsRawSurrogateBytesNeedRawByteMode) { // ED A0 80 encodes U+D800.
+  bool needsRawByteMode = false;
+  EXPECT_EQ(
+      "[\xED\xA0\x80]", toPcre2Pattern("[\xED\xA0\x80]", needsRawByteMode));
+  EXPECT_TRUE(needsRawByteMode);
+}
+
+TEST(JavaRegexTranslator, doesNotReportRawByteModeForSupplementaryScalar) {
+  bool needsRawByteMode = true; // Pre-set so the test proves it is cleared.
+  EXPECT_EQ("\\x{1f600}", toPcre2Pattern("\\uD83D\\uDE00", needsRawByteMode));
+  EXPECT_FALSE(needsRawByteMode);
+}
+
+TEST(JavaRegexTranslator, negatedSurrogateBlockIsNegated) {
+  EXPECT_EQ("[^\\x{D800}-\\x{DB7F}]", toPcre2Pattern("\\P{InHIGH_SURROGATES}"));
+}
+
+TEST(JavaRegexTranslator, rewritesJavaDefinedAsNegatedUnassigned) {
+  EXPECT_EQ("\\P{Cn}", toPcre2Pattern("\\p{javaDefined}"));
+}
+
+TEST(JavaRegexTranslator, multipleTokensInOnePattern) {
+  EXPECT_EQ(
+      "[\\x{370}-\\x{3FF}][\\x{3040}-\\x{309F}]",
+      toPcre2Pattern("\\p{InGreek}\\p{InHiragana}"));
+}
+
+TEST(JavaRegexTranslator, nestedUnionFlattens) {
+  const auto result = toPcre2Pattern("[abc[def]]");
+  EXPECT_EQ(std::string::npos, result.find("[[")) << result;
+  EXPECT_EQ("[abcdef]", result);
+}
+
+TEST(JavaRegexTranslator, intersectionBecomesRangeSet) { // Disjoint: empty.
+  EXPECT_EQ("[^\\x{0}-\\x{10FFFF}]", toPcre2Pattern("[a-c&&d-f]"));
+}
+
+TEST(JavaRegexTranslator, wDashHashEscapesDash) {
+  const auto result = toPcre2Pattern("[\\w-#]");
+  EXPECT_NE(std::string::npos, result.find("\\-")) << result;
+}
+
+TEST(JavaRegexTranslator, classBodyRewritePreservesOutsidePattern) {
+  EXPECT_EQ("a[bc]d", toPcre2Pattern("a[bc]d"));
+}
+
+TEST(JavaRegexTranslator, propertyInsideClassRewritten) {
+  const auto result = toPcre2Pattern("[\\p{InGreek}]");
+  EXPECT_TRUE(
+      result.find("\\x{370}") != std::string::npos ||
+      result.find("\\x{3FF}") != std::string::npos)
+      << result;
+  EXPECT_EQ(std::string::npos, result.find("\\p{InGreek}")) << result;
+}
+
+TEST(JavaRegexTranslator, surrogateBlockInsideNestedClassIsPreserved) {
+  EXPECT_EQ(
+      "[\\x{D800}-\\x{DB7F}\\x{DC00}-\\x{DFFF}]",
+      toPcre2Pattern("[[\\p{InHIGH_SURROGATES}\\p{InLOW_SURROGATES}]]"));
+}
+
+TEST(JavaRegexTranslator, intersectionWithKnownPropertyEvaluated) {
+  const auto result = toPcre2Pattern("[\\d&&[0-3]]");
+  EXPECT_EQ(std::string::npos, result.find("&&")) << result;
+}
+
+TEST(JavaRegexTranslator, dropsUFlagInModeModifier) {
+  EXPECT_EQ("(?i)foo", toPcre2Pattern("(?iu)foo"));
+  EXPECT_EQ("(?i)foo", toPcre2Pattern("(?ui)foo"));
+  EXPECT_EQ("(?im)foo", toPcre2Pattern("(?ium)foo"));
+}
+
+TEST(JavaRegexTranslator, dropsUInScopedGroup) {
+  EXPECT_EQ("(?i:foo)", toPcre2Pattern("(?iu:foo)"));
+}
+
+TEST(JavaRegexTranslator, dropsDFlag) {
+  EXPECT_EQ("(?m)foo", toPcre2Pattern("(?dm)foo"));
+}
+
+TEST(JavaRegexTranslator, emptyFlagsRemovedEntirely) {
+  EXPECT_EQ("foo", toPcre2Pattern("(?u)foo"));
+  EXPECT_EQ("(?:foo)", toPcre2Pattern("(?u:foo)")); // Scoped form keeps the group.
+}
+
+TEST(JavaRegexTranslator, preservesNonModeGroups) {
+  EXPECT_EQ("(?:foo)", toPcre2Pattern("(?:foo)"));
+  EXPECT_EQ("(?=foo)", toPcre2Pattern("(?=foo)"));
+  EXPECT_EQ("(?foo)", toPcre2Pattern("(?foo)"));
+  EXPECT_EQ("(?#comment)foo", toPcre2Pattern("(?#comment)foo"));
+}
+
+TEST(JavaRegexTranslator, handlesOnOffFlagGroup) { // u and U are both dropped.
+  EXPECT_EQ("(?i-m)foo", toPcre2Pattern("(?iu-mU)foo"));
+}
+
+TEST(JavaRegexTranslator, allFlagsDroppedFromOnOff) {
+  EXPECT_EQ("foo", toPcre2Pattern("(?u-U)foo"));
+}
+
+TEST(JavaRegexTranslator, doesNotTouchInsideClass) {
+  EXPECT_EQ("[(?i)]", toPcre2Pattern("[(?i)]"));
+}
+
+TEST(JavaRegexTranslator, propertyIntersectionEndToEnd) {
+  const auto out = toPcre2Pattern("[\\p{L}&&[\\P{InGreek}]]");
+  EXPECT_EQ(std::string::npos, out.find("&&")) << out;
+  EXPECT_EQ(std::string::npos, out.find("[[")) << out;
+  EXPECT_NE(std::string::npos, out.find("A-Z")) << out;
+  EXPECT_NE(std::string::npos, out.find("a-z")) << out;
+  EXPECT_EQ(std::string::npos, out.find("\\x{3B1}")) << out;
+  EXPECT_EQ(std::string::npos, out.find("\\x{3A9}")) << out;
+}
+
+TEST(JavaRegexTranslator, inlineCaseInsensitiveExpandsCasedTopLevelProperty) {
+  EXPECT_EQ("(?i)[\\p{Lu}\\p{Ll}\\p{Lt}]", toPcre2Pattern("(?i)\\p{Lu}"));
+}
+
+TEST(JavaRegexTranslator, inlineCaseInsensitiveExpandsCasedClassProperty) {
+  EXPECT_EQ("(?i)[\\p{Lu}\\p{Ll}\\p{Lt}]", toPcre2Pattern("(?i)[\\p{Lu}]"));
+}
+
+TEST(
+    JavaRegexTranslator,
+    inlineCaseInsensitiveExpandsNegatedCasedClassProperty) { // NOTE(review): name says "Expands", expected output equals the input.
+  EXPECT_EQ("(?i)[\\P{Lu}]", toPcre2Pattern("(?i)[\\P{Lu}]"));
+}
+
+TEST(JavaRegexTranslator, inlineCaseInsensitiveKeepsLiteralAsciiRange) { // NOTE(review): expected output adds three properties to a bare [A-Z]; confirm intent.
+  EXPECT_EQ("(?i)[A-Z\\p{Lu}\\p{Ll}\\p{Lt}]", toPcre2Pattern("(?i)[A-Z]"));
+}
+
+TEST(JavaRegexTranslator, embeddedFlagsDoNotLeakPastEnclosingGroup) { // Trailing class is left unexpanded.
+  EXPECT_EQ("(a(?i)b)[\\p{Lu}]", toPcre2Pattern("(a(?i)b)[\\p{Lu}]"));
+}
+
+TEST(JavaRegexTranslator, longBackreferenceDoesNotOverflow) {
+  const auto result = toPcre2Pattern("\\999999999999999999999999999999");
+  EXPECT_EQ(0, result.rfind("(*F)", 0)) << result; // Output must start with "(*F)".
+}
+
+TEST(
+    JavaRegexTranslator,
+    unicodeCharacterClassIntersectionThrowsInsteadOfAsciiEvaluation) {
+  EXPECT_THROW(
+      toPcre2Pattern("(?U)[\\d&&\\p{InArabic}]"), EvaluationFailedException);
+}
+
+TEST(JavaRegexTranslator, escapedBraceIsNotQuantifier) {
+  EXPECT_EQ("\\{", toPcre2Pattern("\\{"));
+  EXPECT_EQ("a\\{b}", toPcre2Pattern("a\\{b}"));
+  EXPECT_EQ("\\{not-a-quantifier}", toPcre2Pattern("\\{not-a-quantifier}"));
+}
+
+TEST(JavaRegexTranslator, doubleBackslashThenBraceStillQuantifier) { // The brace itself is unescaped here.
+  EXPECT_THROW(toPcre2Pattern("\\\\{x}"), EvaluationFailedException);
+}
+
+TEST(JavaRegexTranslator, commentsModeIgnoresBracesInLineComments) {
+  EXPECT_EQ("(?x)# {\n a", toPcre2Pattern("(?x)# {\n a"));
+  EXPECT_EQ("(?x:# {\n a)", toPcre2Pattern("(?x:# {\n a)"));
+}
+
+TEST(JavaRegexTranslator, unicodeCaseExpandsAsciiLiterals) {
+  EXPECT_EQ("[Aa][Bb][Cc]", toPcre2PatternWithUnicodeCase("abc"));
+}
+
+TEST(JavaRegexTranslator, unicodeCaseExpandsKnownUnicodeLiterals) {
+  const auto kelvin = toPcre2PatternWithUnicodeCase("\xe2\x84\xaa"); // UTF-8 for U+212A.
+  EXPECT_NE(std::string::npos, kelvin.find("\\x{212a}")) << kelvin;
+  EXPECT_NE(std::string::npos, kelvin.find("K")) << kelvin;
+  EXPECT_NE(std::string::npos, kelvin.find("k")) << kelvin;
+
+  const auto sigma = toPcre2PatternWithUnicodeCase("\xce\xa3"); // UTF-8 for U+03A3.
+  EXPECT_NE(std::string::npos, sigma.find("\\x{3a3}")) << sigma;
+  EXPECT_NE(std::string::npos, sigma.find("\\x{3c3}")) << sigma;
+  EXPECT_NE(std::string::npos, sigma.find("\\x{3c2}")) << sigma;
+}
+
+TEST(JavaRegexTranslator, unicodeCaseExpandsUnicodeEscapes) {
+  EXPECT_EQ("[Kk\\x{212a}]", toPcre2PatternWithUnicodeCase("\\u212A"));
+}
+
+TEST(JavaRegexTranslator, unicodeCaseSkipsClassesAndQuotes) {
+  EXPECT_EQ("[abc]\\Qabc\\E", toPcre2PatternWithUnicodeCase("[abc]\\Qabc\\E"));
+}
+
+TEST(JavaRegexTranslatorRe2, reusesPropertyAndClassPipeline) { // Same outputs as the PCRE2 tests above.
+  EXPECT_EQ("[\\x{370}-\\x{3FF}]", toRe2Pattern("\\p{InGreek}"));
+  EXPECT_EQ("[abcdef]", toRe2Pattern("[abc[def]]"));
+  EXPECT_EQ("[^\\x{0}-\\x{10FFFF}]", toRe2Pattern("[a-c&&d-f]"));
+}
+
+TEST(JavaRegexTranslatorRe2, rewritesJavaNamedCapturingGroups) { // NOTE(review): group names look stripped from these literals; verify against upstream.
+  EXPECT_EQ("(?Pfoo)", toRe2Pattern("(?foo)"));
+  EXPECT_EQ("(a(?P\\d+))", toRe2Pattern("(a(?\\d+))"));
+}
+
+TEST(
+    JavaRegexTranslatorRe2,
+    doesNotRewriteNamedGroupLookalikesInQuotesOrClasses) {
+  EXPECT_EQ("\\Q(?foo)\\E", toRe2Pattern("\\Q(?foo)\\E"));
+  EXPECT_EQ("[(?)]", toRe2Pattern("[(?)]"));
+}
+
+TEST(JavaRegexTranslatorRe2, rejectsLookaround) {
+  EXPECT_THROW(toRe2Pattern("(?=foo)"), EvaluationFailedException);
+  EXPECT_THROW(toRe2Pattern("(?!foo)"), EvaluationFailedException);
+  EXPECT_THROW(toRe2Pattern("(?<=foo)"), EvaluationFailedException);
+  EXPECT_THROW(toRe2Pattern("(?a)\\k"), EvaluationFailedException);
+  EXPECT_NO_THROW(toRe2Pattern("[\\1\\k]"));
+}
+
+TEST(JavaRegexTranslatorRe2, rejectsPossessiveQuantifiers) {
+  EXPECT_THROW(toRe2Pattern("a*+"), EvaluationFailedException);
+  EXPECT_THROW(toRe2Pattern("a?+"), EvaluationFailedException);
+  EXPECT_THROW(toRe2Pattern("a++"), EvaluationFailedException);
+  EXPECT_THROW(toRe2Pattern("a{1,3}+"), EvaluationFailedException);
+}
+
+TEST(JavaRegexTranslatorRe2, rejectsAtomicGroupsAndUnsupportedFlags) { // Turning U, d or c off is accepted.
+  EXPECT_THROW(toRe2Pattern("(?>foo)"), EvaluationFailedException);
+  EXPECT_THROW(toRe2Pattern("(?U)foo"), EvaluationFailedException);
+  EXPECT_THROW(toRe2Pattern("(?d)foo"), EvaluationFailedException);
+  EXPECT_THROW(toRe2Pattern("(?c)foo"), EvaluationFailedException);
+  EXPECT_THROW(toRe2Pattern("(?id:foo)"), EvaluationFailedException);
+  EXPECT_EQ("foo", toRe2Pattern("(?u)foo"));
+  EXPECT_EQ("foo", toRe2Pattern("(?-U)foo"));
+  EXPECT_EQ("(?i:foo)", toRe2Pattern("(?i-d:foo)"));
+  EXPECT_EQ("foo", toRe2Pattern("(?-c)foo"));
+}
+
+TEST(JavaRegexTranslatorRe2, rewritesJavaOctalEscapesForRe2) { // 012 == 0xA.
+  EXPECT_EQ("\\x{a}", toRe2Pattern("\\012"));
+}
+
+TEST(JavaRegexTranslatorRe2, translatesCommentsModeForRe2) { // (?x) is expanded away, not passed through.
+  EXPECT_EQ("abc", toRe2Pattern("(?x)a b c"));
+  EXPECT_EQ("abcdef", toRe2Pattern("(?x)abc # comment\ndef"));
+  EXPECT_EQ("a b", toRe2Pattern("(?x)a\\ b"));
+  EXPECT_EQ("[a]", toRe2Pattern("(?x)[ a]"));
+  EXPECT_EQ("[ ]", toRe2Pattern("(?x)[\\ ]"));
+  EXPECT_EQ("[a]", toRe2Pattern("(?x)[a# comment\n]"));
+  EXPECT_THROW(toRe2Pattern("(?x)[a# comment]"), EvaluationFailedException);
+  EXPECT_EQ("(?i:ab)", toRe2Pattern("(?ix:a b)"));
+  EXPECT_THROW(toRe2Pattern("(?x)(? a)"), EvaluationFailedException);
+  EXPECT_THROW(toRe2Pattern("(?x)(? :a)"), EvaluationFailedException);
+}
+
+TEST(JavaRegexTranslatorRe2, unsupportedFeatureLookalikesInQuotesAreLiterals) {
+  EXPECT_EQ(
+      "\\Q(?=foo)\\1*+(?>x)(?U)\\E",
+      toRe2Pattern("\\Q(?=foo)\\1*+(?>x)(?U)\\E"));
+}
+
+} // namespace facebook::velox::functions::java_pcre2_translator::test
diff --git a/velox/functions/lib/java_pcre2_translator/tests/JdkPropertyExpanderTest.cpp b/velox/functions/lib/java_pcre2_translator/tests/JdkPropertyExpanderTest.cpp
new file mode 100644
index 00000000000..f6625ecb238
--- /dev/null
+++ b/velox/functions/lib/java_pcre2_translator/tests/JdkPropertyExpanderTest.cpp
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// +// Originally authored by Oleksii PELYKH for pcre4j; ported from +// org.pcre4j.regex.translate.JdkPropertyExpanderTest (Java) under Apache-2.0 by +// the same author for inclusion in Velox. +// +#include "velox/functions/lib/java_pcre2_translator/JdkPropertyExpander.h" + +#include + +namespace facebook::velox::functions::java_pcre2_translator::test { + +TEST(JdkPropertyExpander, asciiLetterCoverage) { + auto l = JdkPropertyExpander::expand("\\p{L}"); + ASSERT_TRUE(l.has_value()); + EXPECT_TRUE(l->contains('a')); + EXPECT_TRUE(l->contains('Z')); + EXPECT_FALSE(l->contains('0')); + EXPECT_FALSE(l->contains(' ')); +} + +TEST(JdkPropertyExpander, greekScript) { + auto g = JdkPropertyExpander::expand("\\p{Greek}"); + ASSERT_TRUE(g.has_value()); + EXPECT_TRUE(g->contains(0x03B1)); + EXPECT_FALSE(g->contains('a')); +} + +TEST(JdkPropertyExpander, negatedProperty) { + auto notL = JdkPropertyExpander::expand("\\P{L}"); + ASSERT_TRUE(notL.has_value()); + EXPECT_FALSE(notL->contains('a')); + EXPECT_TRUE(notL->contains('0')); +} + +TEST(JdkPropertyExpander, unknownReturnsNull) { + EXPECT_FALSE(JdkPropertyExpander::expand("\\p{FooBarBaz}").has_value()); +} + +TEST(JdkPropertyExpander, caches) { + auto first = JdkPropertyExpander::expand("\\p{L}"); + auto second = JdkPropertyExpander::expand("\\p{L}"); + ASSERT_TRUE(first.has_value()); + ASSERT_TRUE(second.has_value()); + EXPECT_EQ(*first, *second); +} + 
+TEST(JdkPropertyExpander, greekIntersectionWithLetters) { + auto letters = JdkPropertyExpander::expand("\\p{L}"); + auto notGreek = JdkPropertyExpander::expand("\\P{Greek}"); + ASSERT_TRUE(letters.has_value()); + ASSERT_TRUE(notGreek.has_value()); + auto lettersNotGreek = letters->intersect(*notGreek); + EXPECT_TRUE(lettersNotGreek.contains('a')); + EXPECT_TRUE(lettersNotGreek.contains(0x6000)); + EXPECT_FALSE(lettersNotGreek.contains(0x03B1)); +} + +TEST(JdkPropertyExpander, leafCategoryLu) { + auto lu = JdkPropertyExpander::expand("\\p{Lu}"); + ASSERT_TRUE(lu.has_value()); + EXPECT_TRUE(lu->contains('A')); + EXPECT_FALSE(lu->contains('a')); + EXPECT_FALSE(lu->contains('0')); +} + +TEST(JdkPropertyExpander, combinedCategoryN) { + auto n = JdkPropertyExpander::expand("\\p{N}"); + ASSERT_TRUE(n.has_value()); + EXPECT_TRUE(n->contains('0')); + EXPECT_FALSE(n->contains('a')); +} + +TEST(JdkPropertyExpander, binaryAlphabeticProperty) { + auto alphabetic = JdkPropertyExpander::expand("\\p{Alphabetic}"); + ASSERT_TRUE(alphabetic.has_value()); + EXPECT_TRUE(alphabetic->contains('a')); + EXPECT_TRUE(alphabetic->contains(0x03B1)); + EXPECT_FALSE(alphabetic->contains('0')); +} + +TEST(JdkPropertyExpander, scriptShortAlias) { + auto greek = JdkPropertyExpander::expand("\\p{Grek}"); + ASSERT_TRUE(greek.has_value()); + EXPECT_TRUE(greek->contains(0x03B1)); + EXPECT_FALSE(greek->contains('a')); +} + +TEST(JdkPropertyExpander, blockLongAlias) { + auto basicLatin = JdkPropertyExpander::expand("\\p{Basic_Latin}"); + ASSERT_TRUE(basicLatin.has_value()); + EXPECT_TRUE(basicLatin->contains('A')); + EXPECT_FALSE(basicLatin->contains(0x03B1)); +} + +TEST(JdkPropertyExpander, inPrefixUsesBlockNotScript) { + auto greekBlock = JdkPropertyExpander::expand("\\p{InGreek}"); + ASSERT_TRUE(greekBlock.has_value()); + EXPECT_TRUE(greekBlock->contains(0x03B1)); + EXPECT_FALSE(greekBlock->contains(0x1F00)); +} + +TEST(JdkPropertyExpander, nonPropertyTokenReturnsNull) { + 
EXPECT_FALSE(JdkPropertyExpander::expand("\\d").has_value()); + EXPECT_FALSE(JdkPropertyExpander::expand("\\w").has_value()); +} + +} // namespace facebook::velox::functions::java_pcre2_translator::test diff --git a/velox/functions/lib/java_pcre2_translator/tests/PropertyMapTest.cpp b/velox/functions/lib/java_pcre2_translator/tests/PropertyMapTest.cpp new file mode 100644 index 00000000000..879296bfbb4 --- /dev/null +++ b/velox/functions/lib/java_pcre2_translator/tests/PropertyMapTest.cpp @@ -0,0 +1,73 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// +// Ported from org.pcre4j.regex.translate.PropertyMapTest (Java). 
+// +#include "velox/functions/lib/java_pcre2_translator/PropertyMap.h" + +#include + +namespace facebook::velox::functions::java_pcre2_translator::test { + +TEST(PropertyMap, inPrefixStrip) { + EXPECT_EQ("[\\x{370}-\\x{3FF}]", PropertyMap::apply("InGreek").value()); +} + +TEST(PropertyMap, isPrefixStrip) { + EXPECT_EQ("L", PropertyMap::apply("IsL").value()); +} + +TEST(PropertyMap, unknownReturnsNullopt) { + EXPECT_FALSE(PropertyMap::apply("FooBarBaz").has_value()); +} + +TEST(PropertyMap, l1ExpandsToRange) { + EXPECT_EQ("[\\x{00}-\\x{FF}]", PropertyMap::apply("L1").value()); +} + +TEST(PropertyMap, javaLowerCase) { + const auto result = PropertyMap::apply("javaLowerCase").value(); + EXPECT_TRUE(result.starts_with("[")); + EXPECT_NE(std::string::npos, result.find("\\x{AA}")) << result; +} + +TEST(PropertyMap, highSurrogatesExpandToRange) { + EXPECT_EQ( + "[\\x{D800}-\\x{DB7F}]", PropertyMap::apply("InHIGH_SURROGATES").value()); + EXPECT_EQ( + "[\\x{D800}-\\x{DB7F}]", PropertyMap::apply("InHighSurrogates").value()); + EXPECT_EQ( + "[\\x{D800}-\\x{DB7F}]", + PropertyMap::apply("blk=HighSurrogates").value()); + EXPECT_EQ( + "[\\x{DB80}-\\x{DBFF}]", + PropertyMap::apply("InHighPrivateUseSurrogates").value()); +} + +TEST(PropertyMap, lowSurrogatesExpandToRange) { + EXPECT_EQ( + "[\\x{DC00}-\\x{DFFF}]", PropertyMap::apply("InLOW_SURROGATES").value()); +} + +TEST(PropertyMap, isAsciiStripsIs) { + EXPECT_EQ("ASCII", PropertyMap::apply("IsASCII").value()); +} + +TEST(PropertyMap, javaDefinedMapsToNegatedCn) { + EXPECT_EQ("\\P{Cn}", PropertyMap::apply("javaDefined").value()); +} + +} // namespace facebook::velox::functions::java_pcre2_translator::test diff --git a/velox/functions/lib/java_pcre2_translator/tests/RangeSetTest.cpp b/velox/functions/lib/java_pcre2_translator/tests/RangeSetTest.cpp new file mode 100644 index 00000000000..01c7a596ca4 --- /dev/null +++ b/velox/functions/lib/java_pcre2_translator/tests/RangeSetTest.cpp @@ -0,0 +1,166 @@ +/* + * Copyright (c) 
Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// +// Ported from org.pcre4j.regex.translate.RangeSetTest (Java). +// +#include "velox/functions/lib/java_pcre2_translator/RangeSet.h" + +#include + +#include +#include + +namespace facebook::velox::functions::java_pcre2_translator::test { + +TEST(RangeSet, emptySet) { + EXPECT_TRUE(RangeSet::empty().isEmpty()); + EXPECT_FALSE(RangeSet::empty().contains('a')); +} + +TEST(RangeSet, singleCodePoint) { + auto s = RangeSet::single('a'); + EXPECT_FALSE(s.isEmpty()); + EXPECT_TRUE(s.contains('a')); + EXPECT_FALSE(s.contains('b')); +} + +TEST(RangeSet, range) { + auto az = RangeSet::range('a', 'z'); + EXPECT_TRUE(az.contains('a')); + EXPECT_TRUE(az.contains('m')); + EXPECT_TRUE(az.contains('z')); + EXPECT_FALSE(az.contains('A')); + EXPECT_FALSE(az.contains('{')); +} + +TEST(RangeSet, unionDisjoint) { + auto u = RangeSet::range('a', 'z').unionWith(RangeSet::range('A', 'Z')); + EXPECT_TRUE(u.contains('a')); + EXPECT_TRUE(u.contains('A')); + EXPECT_FALSE(u.contains('1')); +} + +TEST(RangeSet, unionOverlapping) { + auto u = RangeSet::range('a', 'c').unionWith(RangeSet::range('b', 'd')); + EXPECT_TRUE(u.contains('a')); + EXPECT_TRUE(u.contains('b')); + EXPECT_TRUE(u.contains('d')); + EXPECT_FALSE(u.contains('e')); + EXPECT_EQ(1, u.rangeCount()); +} + +TEST(RangeSet, intersectOverlap) { + auto i = RangeSet::range('a', 'c').intersect(RangeSet::range('b', 'd')); + 
EXPECT_FALSE(i.contains('a')); + EXPECT_TRUE(i.contains('b')); + EXPECT_TRUE(i.contains('c')); + EXPECT_FALSE(i.contains('d')); +} + +TEST(RangeSet, intersectDisjoint) { + auto i = RangeSet::range('a', 'c').intersect(RangeSet::range('d', 'f')); + EXPECT_TRUE(i.isEmpty()); +} + +TEST(RangeSet, complementEmpty) { + auto c = RangeSet::empty().complement(); + EXPECT_EQ(RangeSet::all(), c.unionWith(RangeSet::empty())); + EXPECT_TRUE(c.contains(0)); + EXPECT_TRUE(c.contains(0x10FFFF)); +} + +TEST(RangeSet, complementRange) { + auto notAz = RangeSet::range('a', 'z').complement(); + EXPECT_FALSE(notAz.contains('a')); + EXPECT_FALSE(notAz.contains('z')); + EXPECT_TRUE(notAz.contains('A')); + EXPECT_TRUE(notAz.contains('0')); + EXPECT_TRUE(notAz.contains(0x10FFFF)); +} + +TEST(RangeSet, subtract) { + auto diff = RangeSet::range('a', 'f').subtract(RangeSet::range('c', 'f')); + EXPECT_TRUE(diff.contains('a')); + EXPECT_TRUE(diff.contains('b')); + EXPECT_FALSE(diff.contains('c')); + EXPECT_FALSE(diff.contains('f')); +} + +TEST(RangeSet, toPcre2ClassBodySinglePrintable) { + EXPECT_EQ("a", RangeSet::single('a').toPcre2ClassBody()); +} + +TEST(RangeSet, toPcre2ClassBodySingleNonPrintable) { + EXPECT_EQ("\\x{9}", RangeSet::single('\t').toPcre2ClassBody()); +} + +TEST(RangeSet, toPcre2ClassBodyRange) { + EXPECT_EQ("a-z", RangeSet::range('a', 'z').toPcre2ClassBody()); +} + +TEST(RangeSet, toPcre2ClassBodyEscapesSpecialChars) { + EXPECT_EQ("\\-", RangeSet::single('-').toPcre2ClassBody()); + EXPECT_EQ("\\]", RangeSet::single(']').toPcre2ClassBody()); + EXPECT_EQ("\\^", RangeSet::single('^').toPcre2ClassBody()); +} + +TEST(RangeSet, toPcre2ClassBodyMultipleRanges) { + auto u = RangeSet::range('a', 'z').unionWith(RangeSet::range('A', 'Z')); + const auto body = u.toPcre2ClassBody(); + EXPECT_TRUE( + body.find("A-Z") != std::string::npos || + body.find("a-z") != std::string::npos); +} + +TEST(RangeSet, singleRejectsNegative) { + EXPECT_THROW(RangeSet::single(-1), std::invalid_argument); +} 
+ +TEST(RangeSet, singleRejectsAboveMax) { + EXPECT_THROW(RangeSet::single(0x110000), std::invalid_argument); +} + +TEST(RangeSet, singleAcceptsBoundaries) { + EXPECT_EQ(1, RangeSet::single(0).rangeCount()); + EXPECT_EQ(1, RangeSet::single(0x10FFFF).rangeCount()); +} + +TEST(RangeSet, rangeRejectsNegativeLo) { + EXPECT_THROW(RangeSet::range(-1, 5), std::invalid_argument); +} + +TEST(RangeSet, rangeRejectsHiAboveMax) { + EXPECT_THROW(RangeSet::range(0, 0x110000), std::invalid_argument); +} + +TEST(RangeSet, rangeRejectsInverted) { + EXPECT_THROW(RangeSet::range(5, 4), std::invalid_argument); +} + +TEST(RangeSet, unionMergesAdjacentRanges) { + auto merged = RangeSet::range('a', 'c').unionWith(RangeSet::range('d', 'f')); + EXPECT_EQ(1, merged.rangeCount()) + << "adjacent ranges must be merged; got: " << merged.toPcre2ClassBody(); + EXPECT_EQ("a-f", merged.toPcre2ClassBody()); +} + +TEST(RangeSet, unionMergesOverlappingRanges) { + auto merged = RangeSet::range('a', 'e').unionWith(RangeSet::range('c', 'g')); + EXPECT_EQ(1, merged.rangeCount()); + EXPECT_EQ("a-g", merged.toPcre2ClassBody()); +} + +} // namespace facebook::velox::functions::java_pcre2_translator::test