Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
8d90925
build(regex-compat): add opt-in PCRE2 dependency
baibaichen May 29, 2026
082fa55
build(regex-compat): bump PCRE2 10.45 -> 10.47
baibaichen May 29, 2026
6e9c18a
build(regex-compat): add opt-in Java backend probe (VELOX_ENABLE_REGE…
baibaichen May 29, 2026
9aef885
regex-compat: add Re2Regex backend (Phase 2 of test suite)
baibaichen May 29, 2026
d7b2c1e
regex-compat: add Pcre2Regex backend (Phase 4)
baibaichen May 29, 2026
d855881
regex-compat: add JavaRegex backend via embedded JVM (Phase 6)
baibaichen May 30, 2026
75a5a1c
regex-compat: add typed-test fixture for 3-backend cross-checking (Ph…
baibaichen May 30, 2026
98b7b72
regex-compat: port P0 pcre4j test trio over typed-test (Phases 8-10)
baibaichen May 30, 2026
8d41026
regex-compat: add README (Phase 11)
baibaichen May 30, 2026
87c334f
regex-compat: port remaining MatcherMatchingTests + add per-backend t…
baibaichen Jun 1, 2026
27e4bb0
regex-compat: port MatcherReplacementTests + fix JNI UTF-16 transcoding
Jun 1, 2026
748e571
regex-compat: port MatcherResultsTests (12 cases)
Jun 1, 2026
e418939
regex-compat: port MatcherMatchResultTests engine-relevant cases (2 o…
Jun 1, 2026
63eba37
regex-compat: port PatternSplitTests (13 cases)
Jun 1, 2026
0ae644e
regex-compat: port MatcherUnicodeTests (6 cases)
Jun 1, 2026
fbb5dbc
regex-compat: port PatternTests quote + (?x) COMMENTS cases (6 of rem…
Jun 1, 2026
87ddce5
regex-compat: add OpenJDK 17 TestCases.txt corpus diff test (299 cases)
Jun 1, 2026
b12b1a3
regex-compat: extend OpenJDK corpus test to all 3 corpus files
Jun 1, 2026
640bebe
java_pcre2_translator: Phase 1 scaffolding
Jun 1, 2026
6bc2ff5
java_pcre2_translator: Phase 2 — port RangeSet
Jun 1, 2026
e558cb7
java_pcre2_translator: Phase 3 — port PropertyMap
Jun 1, 2026
35b2c4e
java_pcre2_translator: Phase 4 — AST + parser + evaluator + renderer …
Jun 1, 2026
cb3483d
java_pcre2_translator: Phase 5 — top-level JavaRegexTranslator pipeline
Jun 1, 2026
f094b7e
regex-compat: Phase 6 — wire JavaRegexTranslator into Pcre2Regex
Jun 1, 2026
5a0084e
regex-compat: Phase 7 — extend translator to RE2 backend (P1 complete)
Jun 1, 2026
16f3ae2
Fix escaped non-ASCII class literals
Jun 1, 2026
30ec781
Allow PCRE2 surrogate corpus patterns
Jun 1, 2026
fcde27c
Port pcre4j regex translator fixes
Jun 1, 2026
ec77186
regex-compat: Phase P1.7 — close the last 6 PCRE2 supplementary-surro…
Jun 1, 2026
73fca77
Add ported OpenJDK RegExTest coverage
baibaichen Jun 1, 2026
705310c
regex-compat: add OpenJDK GraphemeTestCases corpus (Java-only)
baibaichen Jun 1, 2026
b4a1fcb
regex-compat: extend RegExTest port (16 more tests)
baibaichen Jun 1, 2026
6c87822
regex-compat: translator pre-folds cased literals for UNICODE_CASE
baibaichen Jun 1, 2026
5b7d5e3
regex-compat: exclude GTEST_SKIP from per-backend tally
Jun 1, 2026
72e34da
regex-compat: report 'translatable subset' rate excluding engine-impo…
Jun 1, 2026
d4fdfe6
regex-compat: fix dangling-pointer warning in ClassBodyParserTest + a…
Jun 1, 2026
b822270
regex-compat: gate JNI-dependent code behind VELOX_REGEX_COMPAT_HAS_JAVA
Jun 2, 2026
4f13785
regex-compat: also gate JavaRegex.cpp / JvmFixture.cpp on VELOX_REGEX…
Jun 3, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 55 additions & 0 deletions CMake/Findpcre2.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Locate a system PCRE2 installation (8-bit code unit width) and expose it
# under the canonical target name `pcre2-8::pcre2-8` used by the
# velox/external/regex_compat module.

# First choice: an upstream PCRE2 CMake package configuration file.
find_package(PCRE2 QUIET CONFIG COMPONENTS 8BIT)
if(PCRE2_FOUND)
  # Expose the imported 8-bit target under the canonical name.
  if(NOT TARGET pcre2-8::pcre2-8 AND TARGET PCRE2::8BIT)
    add_library(pcre2-8::pcre2-8 ALIAS PCRE2::8BIT)
  endif()
  # Only report success when the canonical target really exists. If the
  # config package did not provide PCRE2::8BIT, fall through to the remaining
  # lookups instead of claiming a package that cannot be linked.
  if(TARGET pcre2-8::pcre2-8)
    # A find module is responsible for setting <PackageName>_FOUND itself.
    set(pcre2_FOUND TRUE)
    message(STATUS "Found PCRE2 via CMake.")
    return()
  endif()
endif()

# Nothing to locate when the canonical target exists already. Mark the
# package as found so the caller of find_package(pcre2) does not treat this
# early return as a failed lookup.
if(TARGET pcre2-8::pcre2-8)
  set(pcre2_FOUND TRUE)
  message(STATUS "PCRE2 target already defined.")
  return()
endif()

# Fall back to pkg-config. The probe for pkg-config itself is optional: when
# it is unavailable this lookup is skipped rather than aborting the configure
# run, so the not-found handling at the end of this module can honour the
# caller's REQUIRED / QUIET arguments.
find_package(PkgConfig QUIET)
if(PKG_CONFIG_FOUND)
  pkg_check_modules(PCRE2_8 QUIET libpcre2-8)
endif()
if(PCRE2_8_FOUND)
  # Wrap the pkg-config results in an imported interface target carrying the
  # include directories, linker flags and the 8-bit code unit width define.
  add_library(pcre2-8::pcre2-8 INTERFACE IMPORTED)
  set_property(
    TARGET pcre2-8::pcre2-8
    PROPERTY INTERFACE_INCLUDE_DIRECTORIES "${PCRE2_8_INCLUDE_DIRS}"
  )
  set_property(
    TARGET pcre2-8::pcre2-8
    PROPERTY INTERFACE_LINK_LIBRARIES "${PCRE2_8_LDFLAGS}"
  )
  set_property(
    TARGET pcre2-8::pcre2-8
    PROPERTY INTERFACE_COMPILE_DEFINITIONS "PCRE2_CODE_UNIT_WIDTH=8"
  )
  set(pcre2_FOUND TRUE)
  message(STATUS "Found PCRE2 via pkg-config.")
  return()
endif()

# Every lookup failed: abort for REQUIRED requests, otherwise warn unless the
# caller asked for a QUIET search.
if(NOT pcre2_FIND_REQUIRED)
  if(NOT pcre2_FIND_QUIETLY)
    message(WARNING "Failed to find PCRE2.")
  endif()
else()
  message(FATAL_ERROR "Failed to find PCRE2.")
endif()
55 changes: 55 additions & 0 deletions CMake/resolve_dependency_modules/pcre2.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
include_guard(GLOBAL)

# Decide where the PCRE2 sources come from. A non-empty VELOX_PCRE2_URL
# environment variable overrides the pinned upstream release; in that case
# this module knows neither the version nor the checksum of the archive.
if(DEFINED ENV{VELOX_PCRE2_URL} AND NOT "$ENV{VELOX_PCRE2_URL}" STREQUAL "")
  set(VELOX_PCRE2_SOURCE_URL "$ENV{VELOX_PCRE2_URL}")
  message(STATUS "Building PCRE2 from source (${VELOX_PCRE2_SOURCE_URL})")
else()
  set(VELOX_PCRE2_VERSION 10.47)
  set(
    VELOX_PCRE2_SOURCE_URL
    "https://github.com/PCRE2Project/pcre2/releases/download/pcre2-${VELOX_PCRE2_VERSION}/pcre2-${VELOX_PCRE2_VERSION}.tar.gz"
  )
  set(
    VELOX_PCRE2_BUILD_SHA256_CHECKSUM
    c08ae2388ef333e8403e670ad70c0a11f1eed021fd88308d7e02f596fcd9dc16
  )
  message(STATUS "Building PCRE2 ${VELOX_PCRE2_VERSION} from source")
endif()

# Only pass URL_HASH when a checksum is actually known: an empty
# "SHA256=" value is rejected as malformed, which would break every build
# that overrides the download URL.
set(_velox_pcre2_url_hash "")
if(NOT "${VELOX_PCRE2_BUILD_SHA256_CHECKSUM}" STREQUAL "")
  set(_velox_pcre2_url_hash URL_HASH "SHA256=${VELOX_PCRE2_BUILD_SHA256_CHECKSUM}")
endif()

FetchContent_Declare(
  pcre2
  URL "${VELOX_PCRE2_SOURCE_URL}"
  ${_velox_pcre2_url_hash}
)

# Cache options handed to the bundled PCRE2 build. Switched on: the 8-bit
# library, JIT support, Unicode support and PIC for the static library.
foreach(
  _pcre2_enabled_option
  IN ITEMS
  PCRE2_BUILD_PCRE2_8
  PCRE2_SUPPORT_JIT
  PCRE2_SUPPORT_UNICODE
  PCRE2_STATIC_PIC
)
  set(${_pcre2_enabled_option} ON CACHE BOOL "" FORCE)
endforeach()

# Switched off: the 16/32-bit libraries, PCRE2's own tests and pcre2grep.
foreach(
  _pcre2_disabled_option
  IN ITEMS
  PCRE2_BUILD_PCRE2_16
  PCRE2_BUILD_PCRE2_32
  PCRE2_BUILD_TESTS
  PCRE2_BUILD_PCRE2GREP
)
  set(${_pcre2_disabled_option} OFF CACHE BOOL "" FORCE)
endforeach()

# Download (if necessary) and add the PCRE2 project to this build.
FetchContent_MakeAvailable(pcre2)

# Normalise the target name so consumers always link `pcre2-8::pcre2-8`.
# The static library target is preferred; the plain `pcre2-8` target is used
# when no static target was created.
if(NOT TARGET pcre2-8::pcre2-8)
  if(TARGET pcre2-8-static)
    add_library(pcre2-8::pcre2-8 ALIAS pcre2-8-static)
  elseif(TARGET pcre2-8)
    add_library(pcre2-8::pcre2-8 ALIAS pcre2-8)
  else()
    # Fail here with a precise message instead of letting consumers hit an
    # unresolved `pcre2-8::pcre2-8` link dependency later on.
    message(
      FATAL_ERROR
      "Bundled PCRE2 build defined neither `pcre2-8-static` nor `pcre2-8`; "
      "cannot create the `pcre2-8::pcre2-8` target."
    )
  endif()
endif()

# Remove the BUILD_TESTING entry from the CMake cache.
# NOTE(review): this module never sets BUILD_TESTING itself; presumably this
# clears an entry created while configuring PCRE2 — confirm it does not
# discard a value supplied by the user or the enclosing project.
unset(BUILD_TESTING CACHE)
34 changes: 34 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,16 @@ option(VELOX_ENABLE_TPCDS_CONNECTOR "Build TPC-DS connector." ON)
# SQL function libraries, all built by default.
option(
  VELOX_ENABLE_PRESTO_FUNCTIONS
  "Build Presto SQL functions."
  ON
)
option(
  VELOX_ENABLE_SPARK_FUNCTIONS
  "Build Spark SQL functions."
  ON
)
option(
  VELOX_ENABLE_ICEBERG_FUNCTIONS
  "Build Iceberg functions."
  ON
)

# Opt-in regex compatibility test suite (off by default).
option(VELOX_ENABLE_REGEX_COMPAT_TESTS "Build the PCRE2 vs RE2 Java-regex compatibility test suite (pulls in PCRE2 dep)." OFF)

# Java backend of the regex compatibility suite (on by default).
option(VELOX_ENABLE_REGEX_COMPAT_JAVA_BACKEND "Within the regex-compat test suite, also exercise an embedded-JVM Java backend as a third backend / oracle. Requires JDK on the build host. If JNI cannot be found, this option is auto-disabled with a warning. Only consulted when VELOX_ENABLE_REGEX_COMPAT_TESTS=ON." ON)

option(
  VELOX_ENABLE_EXPRESSION
  "Build expression."
  ON
)
option(
VELOX_ENABLE_EXAMPLES
Expand Down Expand Up @@ -626,6 +636,30 @@ endif()
# Resolve the re2 dependency through the project's dependency helpers.
# NOTE(review): both helpers are defined outside this excerpt; presumably they
# pick the source of the dependency and then make its targets available —
# confirm against their definitions.
velox_set_source(re2)
velox_resolve_dependency(re2)

# Dependencies of the opt-in regex compatibility test suite.
if(VELOX_ENABLE_REGEX_COMPAT_TESTS)
  velox_set_source(pcre2)
  velox_resolve_dependency(pcre2)

  if(VELOX_ENABLE_REGEX_COMPAT_JAVA_BACKEND)
    # Probe for a JDK so the test suite can embed a JVM as a third (oracle)
    # backend. This is the only place in upstream Velox that touches JNI, and
    # it is fully opt-in (gated by the regex-compat option above). If JNI is
    # not found we degrade with a warning — the test suite still builds with
    # the PCRE2 + RE2 backends only.
    find_package(JNI QUIET)
    if(JNI_FOUND)
      message(STATUS "Regex-compat: enabling embedded-JVM Java backend (JNI: ${JNI_INCLUDE_DIRS})")
    else()
      message(
        WARNING
        "Regex-compat: JNI not found, disabling Java backend. "
        "Install a JDK or pass -DVELOX_ENABLE_REGEX_COMPAT_JAVA_BACKEND=OFF to silence."
      )
      # Disable the backend for this configure run only. A normal variable
      # shadows the cache entry here and in subdirectories without
      # overwriting the user's cached choice or its help text, so the backend
      # is picked up again automatically once a JDK is installed.
      set(VELOX_ENABLE_REGEX_COMPAT_JAVA_BACKEND OFF)
    endif()
  endif()
endif()

if(${VELOX_BUILD_PYTHON_PACKAGE})
find_package(Python 3.9 COMPONENTS Interpreter Development.Module REQUIRED)
velox_set_source(pybind11)
Expand Down
3 changes: 3 additions & 0 deletions velox/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ add_subdirectory(external/date)
# Vendored components under external/ that are always part of the build.
foreach(_velox_external_dir IN ITEMS tzdb md5 hdfs)
  add_subdirectory(external/${_velox_external_dir})
endforeach()

# The regex compatibility suite is only added when explicitly enabled.
if(VELOX_ENABLE_REGEX_COMPAT_TESTS)
  add_subdirectory(external/regex_compat)
endif()
#

# examples depend on expression
Expand Down
42 changes: 42 additions & 0 deletions velox/external/regex_compat/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This directory is only added when VELOX_ENABLE_REGEX_COMPAT_TESTS=ON.

# Link dependencies and sources of the RE2 and PCRE2 backends, which are
# always built.
set(_REGEX_COMPAT_LIBS re2::re2 pcre2-8::pcre2-8)
set(_REGEX_COMPAT_SRC Re2Regex.cpp Pcre2Regex.cpp)

# The Java backend adds the JVM fixture, its regex wrapper and the JNI
# libraries.
if(VELOX_ENABLE_REGEX_COMPAT_JAVA_BACKEND)
  list(APPEND _REGEX_COMPAT_LIBS ${JNI_LIBRARIES})
  list(APPEND _REGEX_COMPAT_SRC JvmFixture.cpp JavaRegex.cpp)
endif()

velox_add_library(velox_regex_compat ${_REGEX_COMPAT_SRC})
velox_link_libraries(
  velox_regex_compat
  PUBLIC ${_REGEX_COMPAT_LIBS}
  PRIVATE velox_functions_lib velox_java_pcre2_translator
)

# Publish VELOX_REGEX_COMPAT_HAS_JAVA as 0 or 1 so the presence of the Java
# backend is visible to this library and to everything that links it.
if(NOT VELOX_ENABLE_REGEX_COMPAT_JAVA_BACKEND)
  velox_compile_definitions(velox_regex_compat PUBLIC VELOX_REGEX_COMPAT_HAS_JAVA=0)
else()
  velox_compile_definitions(velox_regex_compat PUBLIC VELOX_REGEX_COMPAT_HAS_JAVA=1)
  velox_include_directories(velox_regex_compat PUBLIC ${JNI_INCLUDE_DIRS})
endif()

# Tests are only configured when Velox testing is enabled.
if(${VELOX_BUILD_TESTING})
  add_subdirectory(tests)
endif()
Loading
Loading