-
Notifications
You must be signed in to change notification settings - Fork 4
Copy Hive and Iceberg code to connectors/lakehouse #440
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
46ffa77
2bf1f29
a1cd69b
262b33f
0a0234d
d74efab
b9786b5
529ea12
c8810ca
afa5e51
4310a6d
ba614ad
1aa22a8
aed4bff
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,165 @@ | ||
| # Copyright (c) Facebook, Inc. and its affiliates. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| name: Gluten Daily Build | ||
|
|
||
| on: | ||
| push: | ||
| branches: | ||
| - 'main' | ||
|
|
||
| concurrency: | ||
| group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} | ||
| cancel-in-progress: true | ||
|
|
||
| jobs: | ||
|
|
||
| gluten-cpp-build: | ||
| name: gluten cpp build | ||
| # prevent errors when forks ff their main branch | ||
| if: ${{ github.repository == 'IBM/velox' }} | ||
| runs-on: ubuntu-22.04 | ||
| env: | ||
| CCACHE_DIR: "${{ github.workspace }}/.ccache" | ||
| steps: | ||
| - uses: actions/checkout@v4 | ||
| - name: Get Ccache | ||
| uses: actions/cache/restore@v4 | ||
| with: | ||
| path: '${{ env.CCACHE_DIR }}' | ||
| key: ccache-centos7-release-default-${{github.sha}} | ||
| restore-keys: | | ||
| ccache-centos7-release-default | ||
| - name: Setup Gluten | ||
| run: | | ||
| git clone --depth 1 https://github.com/apache/incubator-gluten gluten && cd gluten | ||
| BRANCH=$(echo ${GITHUB_REF#refs/heads/}) | ||
| sed -i 's/oap-project/IBM/g' ep/build-velox/src/get_velox.sh | ||
| #sed -i 's/VELOX_BRANCH=2025.*/VELOX_BRANCH=main/g' ep/build-velox/src/get_velox.sh | ||
| - name: Build Gluten native libraries | ||
| run: | | ||
| docker pull apache/gluten:vcpkg-centos-7 | ||
| docker run -v $GITHUB_WORKSPACE:/work -w /work apache/gluten:vcpkg-centos-7 bash -c " | ||
| git config --global --add safe.directory /work | ||
| set -e | ||
| df -a | ||
| cd /work/gluten | ||
| export CCACHE_DIR=/work/.ccache | ||
| mkdir -p /work/.ccache | ||
| source /opt/rh/devtoolset-11/enable | ||
| ./dev/builddeps-veloxbe.sh --enable_vcpkg=ON --build_arrow=OFF --build_tests=OFF --build_benchmarks=OFF \ | ||
| --build_examples=OFF --enable_s3=ON --enable_gcs=ON --enable_hdfs=ON --enable_abfs=ON --velox_home=/work | ||
| ccache -s | ||
| mkdir -p /work/.m2/repository/org/apache/arrow/ | ||
| cp -r /root/.m2/repository/org/apache/arrow/* /work/.m2/repository/org/apache/arrow/ | ||
| " | ||
| - name: Save ccache | ||
| uses: actions/cache/save@v4 | ||
| id: ccache | ||
| with: | ||
| path: '${{ env.CCACHE_DIR }}' | ||
| key: ccache-centos7-release-default-${{github.sha}} | ||
|
|
||
| # linux-gcc: | ||
| # name: Build with GCC | ||
| # if: ${{ github.repository == 'IBM/velox' }} | ||
| # runs-on: ubuntu-22.04 | ||
| # container: ghcr.io/facebookincubator/velox-dev:adapters | ||
| # defaults: | ||
| # run: | ||
| # shell: bash | ||
| # env: | ||
| # CCACHE_DIR: ${{ github.workspace }}/ccache | ||
| # VELOX_DEPENDENCY_SOURCE: SYSTEM | ||
| # GTest_SOURCE: BUNDLED | ||
| # cudf_SOURCE: BUNDLED | ||
| # CUDA_VERSION: '12.8' | ||
| # faiss_SOURCE: BUNDLED | ||
| # steps: | ||
| # - uses: actions/checkout@v4 | ||
| # with: | ||
| # fetch-depth: 2 | ||
| # persist-credentials: false | ||
|
|
||
| # - name: Fix git permissions | ||
| # # Usually actions/checkout does this but as we run in a container | ||
| # # it doesn't work | ||
| # run: git config --global --add safe.directory ${GITHUB_WORKSPACE} | ||
|
|
||
| # - name: Install Dependencies | ||
| # run: | | ||
| # if git diff --name-only HEAD^1 HEAD | grep -q "scripts/setup-"; then | ||
| # # Overwrite old setup scripts with changed versions | ||
| # cp scripts/setup-* / | ||
|
|
||
| # mkdir /tmp/build | ||
| # cd /tmp/build | ||
| # source /opt/rh/gcc-toolset-12/enable | ||
| # # install basic deps | ||
| # bash /setup-centos9.sh | ||
|
|
||
| # source /setup-centos9.sh | ||
| # install_adapters | ||
| # install_cuda $CUDA_VERSION | ||
|
|
||
| # cd / | ||
| # rm -rf /tmp/build # cleanup to avoid issues with disk space | ||
| # fi | ||
|
|
||
| # - uses: actions/cache/restore@v4 | ||
| # with: | ||
| # path: '${{ env.CCACHE_DIR }}' | ||
| # key: ccache-linux-adapters-${{ inputs.use-clang && 'clang' || 'gcc' }}-${{github.sha}} | ||
| # restore-keys: | | ||
| # ccache-linux-adapters-${{ inputs.use-clang && 'clang' || 'gcc' }} | ||
|
|
||
| # - name: Zero Ccache Statistics | ||
| # run: | | ||
| # ccache -sz | ||
|
|
||
| # - name: Make Release Build | ||
| # env: | ||
| # #MAKEFLAGS: 'NUM_THREADS=8 MAX_HIGH_MEM_JOBS=4 MAX_LINK_JOBS=4' | ||
| # CUDA_ARCHITECTURES: 70 | ||
| # CUDA_COMPILER: /usr/local/cuda-${CUDA_VERSION}/bin/nvcc | ||
| # # Set compiler to GCC 12 | ||
| # CUDA_FLAGS: -ccbin /opt/rh/gcc-toolset-12/root/usr/bin | ||
| # run: | | ||
| # EXTRA_CMAKE_FLAGS=( | ||
| # "-DVELOX_ENABLE_BENCHMARKS=ON" | ||
| # "-DVELOX_ENABLE_EXAMPLES=ON" | ||
| # "-DVELOX_ENABLE_ARROW=ON" | ||
| # "-DVELOX_ENABLE_GEO=ON" | ||
| # "-DVELOX_ENABLE_FAISS=ON" | ||
| # "-DVELOX_ENABLE_PARQUET=ON" | ||
| # "-DVELOX_ENABLE_HDFS=ON" | ||
| # "-DVELOX_ENABLE_S3=ON" | ||
| # "-DVELOX_ENABLE_GCS=ON" | ||
| # "-DVELOX_ENABLE_ABFS=ON" | ||
| # "-DVELOX_ENABLE_REMOTE_FUNCTIONS=ON" | ||
| # "-DVELOX_ENABLE_CUDF=ON" | ||
| # "-DVELOX_ENABLE_WAVE=ON" | ||
| # "-DVELOX_MONO_LIBRARY=ON" | ||
| # "-DVELOX_BUILD_SHARED=ON" | ||
| # ) | ||
| # if [[ "${USE_CLANG}" = "true" ]]; then scripts/setup-centos9.sh install_clang15; export CC=/usr/bin/clang-15; export CXX=/usr/bin/clang++-15; CUDA_FLAGS="-ccbin /usr/lib64/llvm15/bin/clang++-15"; fi | ||
| # make release EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS[*]}" | ||
|
|
||
| # - name: Ccache after | ||
| # run: ccache -s | ||
|
|
||
| # - uses: actions/cache/save@v4 | ||
| # with: | ||
| # path: '${{ env.CCACHE_DIR }}' | ||
| # key: ccache-linux-adapters-gcc-${{github.sha}} | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,21 +15,6 @@ | |
| name: Linux Build using GCC | ||
|
|
||
| on: | ||
| push: | ||
|
||
| branches: | ||
| - main | ||
| paths: | ||
| - velox/** | ||
| - '!velox/docs/**' | ||
| - CMakeLists.txt | ||
| - CMake/** | ||
| - scripts/setup-ubuntu.sh | ||
| - scripts/setup-common.sh | ||
| - scripts/setup-versions.sh | ||
| - scripts/setup-helper-functions.sh | ||
| - .github/workflows/linux-build.yml | ||
| - .github/workflows/linux-build-base.yml | ||
|
|
||
| pull_request: | ||
| paths: | ||
| - velox/** | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -194,6 +194,11 @@ class HiveConfig { | |
| static constexpr const char* kPreserveFlatMapsInMemorySession = | ||
| "preserve_flat_maps_in_memory"; | ||
|
|
||
| static constexpr const char* kEnableRequestedTypeCheck = | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. requested-type-check-enabled |
||
| "enable-requested-type-check"; | ||
| static constexpr const char* kEnableRequestedTypeCheckSession = | ||
| "enable_requested_type_check"; | ||
|
|
||
| InsertExistingPartitionsBehavior insertExistingPartitionsBehavior( | ||
| const config::ConfigBase* session) const; | ||
|
|
||
|
|
@@ -265,6 +270,10 @@ class HiveConfig { | |
| bool readStatsBasedFilterReorderDisabled( | ||
| const config::ConfigBase* session) const; | ||
|
|
||
| /// Whether to enable requested type check in the ReaderBase::convertType. | ||
| /// Returns true by default. | ||
| bool isRequestedTypeCheckEnabled(const config::ConfigBase* session) const; | ||
|
|
||
| /// Returns the file system path containing local data. If non-empty, | ||
| /// initializes LocalHiveConnectorMetadata to provide metadata for the tables | ||
| /// in the directory. | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,6 +20,7 @@ | |
| #include "velox/connectors/hive/HiveDataSink.h" | ||
| #include "velox/connectors/hive/HiveDataSource.h" | ||
| #include "velox/connectors/hive/HivePartitionFunction.h" | ||
| #include "velox/connectors/hive/iceberg/IcebergDataSink.h" | ||
|
|
||
| #include <boost/lexical_cast.hpp> | ||
| #include <memory> | ||
|
|
@@ -87,17 +88,29 @@ std::unique_ptr<DataSink> HiveConnector::createDataSink( | |
| ConnectorInsertTableHandlePtr connectorInsertTableHandle, | ||
| ConnectorQueryCtx* connectorQueryCtx, | ||
| CommitStrategy commitStrategy) { | ||
| auto hiveInsertHandle = | ||
| std::dynamic_pointer_cast<const HiveInsertTableHandle>( | ||
| connectorInsertTableHandle); | ||
| VELOX_CHECK_NOT_NULL( | ||
| hiveInsertHandle, "Hive connector expecting hive write handle!"); | ||
| return std::make_unique<HiveDataSink>( | ||
| inputType, | ||
| hiveInsertHandle, | ||
| connectorQueryCtx, | ||
| commitStrategy, | ||
| hiveConfig_); | ||
| if (auto icebergInsertHandle = | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why does the base not contain IcebergInsertTableHandle? |
||
| std::dynamic_pointer_cast<const iceberg::IcebergInsertTableHandle>( | ||
| connectorInsertTableHandle)) { | ||
| return std::make_unique<iceberg::IcebergDataSink>( | ||
| inputType, | ||
| icebergInsertHandle, | ||
| connectorQueryCtx, | ||
| commitStrategy, | ||
| hiveConfig_); | ||
| } else { | ||
| auto hiveInsertHandle = | ||
| std::dynamic_pointer_cast<const HiveInsertTableHandle>( | ||
| connectorInsertTableHandle); | ||
|
|
||
| VELOX_CHECK_NOT_NULL( | ||
| hiveInsertHandle, "Hive connector expecting hive write handle!"); | ||
| return std::make_unique<HiveDataSink>( | ||
| inputType, | ||
| hiveInsertHandle, | ||
| connectorQueryCtx, | ||
| commitStrategy, | ||
| hiveConfig_); | ||
| } | ||
| } | ||
|
|
||
| std::unique_ptr<core::PartitionFunction> HivePartitionFunctionSpec::create( | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why comment out this code instead of dropping it?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's just for testing now; I will remove this.