diff --git a/.github/workflows/analysis.yml b/.github/workflows/analysis.yml new file mode 100644 index 0000000..7e78ff9 --- /dev/null +++ b/.github/workflows/analysis.yml @@ -0,0 +1,69 @@ +name: cpp-linter-analysis + +on: [push, pull_request] + +defaults: + run: + shell: bash -e -l {0} + +jobs: + build: + runs-on: ubuntu-24.04 + + steps: + # Set up Clang and LLVM + - name: Install LLVM and Clang + run: | + wget https://apt.llvm.org/llvm.sh + chmod +x llvm.sh + sudo ./llvm.sh 20 + sudo apt-get install -y clang-tools-20 + sudo update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-20 200 + sudo update-alternatives --install /usr/bin/clang clang /usr/bin/clang-20 200 + sudo update-alternatives --install /usr/bin/clang-scan-deps clang-scan-deps /usr/bin/clang-scan-deps-20 200 + sudo update-alternatives --set clang /usr/bin/clang-20 + sudo update-alternatives --set clang++ /usr/bin/clang++-20 + sudo update-alternatives --set clang-scan-deps /usr/bin/clang-scan-deps-20 + + - name: Checkout repository + uses: actions/checkout@v4 + + # Set conda environment using setup-micromamba + - name: Set conda environment + uses: mamba-org/setup-micromamba@main + with: + environment-name: myenv + environment-file: environment-dev.yml + init-shell: bash + cache-downloads: true + + # Run CMake configuration + - name: Configure using CMake + run: | + export CC=clang; export CXX=clang++ + cmake -G Ninja \ + -Bbuild \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX \ + -DBUILD_TESTS=ON \ + -DCMAKE_EXPORT_COMPILE_COMMANDS=ON + #-DFETCH_DEPENDENCIES_WITH_CMAKE=MISSING + + # Run Clang-Tidy and Clang-Format Analysis + - name: Run C++ analysis + uses: cpp-linter/cpp-linter-action@v2 + id: linter + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + version: 20 + files-changed-only: false # check all files + database: 'build' + style: 'file' # Use .clang-format config file + tidy-checks: '' # Use .clang-tidy config file + step-summary: true + ignore: 'build' + extra-args: '-std=c++20' + - name: Fail fast + if: steps.linter.outputs.checks-failed > 0 + run: exit 1 diff --git a/src/serialize.cpp b/src/serialize.cpp index be5e84a..1da7ac3 100644 --- a/src/serialize.cpp +++ b/src/serialize.cpp @@ -12,10 +12,12 @@ namespace { + constexpr int64_t arrow_alignment = 8; + // Aligns a value to the next multiple of 8, as required by the Arrow IPC format for message bodies. int64_t align_to_8(int64_t n) { - return (n + 7) & -8; + return (n + arrow_alignment - 1) & -arrow_alignment; } // TODO Complete this with all possible formats? @@ -137,7 +139,7 @@ std::vector serialize_primitive_array(const sparrow::primitive_array // Copy the metadata into the buffer, after the 4-byte length prefix memcpy(final_buffer.data() + sizeof(uint32_t), schema_builder.GetBufferPointer(), schema_len); // Write the 4-byte metadata length at the beginning of the message - *(reinterpret_cast(final_buffer.data())) = schema_len; + memcpy(final_buffer.data(), &schema_len, sizeof(schema_len)); } // II - Serialize the RecordBatch message @@ -148,11 +150,11 @@ std::vector serialize_primitive_array(const sparrow::primitive_array // arrow_arr.buffers[0] is the validity bitmap // arrow_arr.buffers[1] is the data buffer - const uint8_t* validity_bitmap = reinterpret_cast(arrow_arr.buffers[0]); - const uint8_t* data_buffer = reinterpret_cast(arrow_arr.buffers[1]); + const auto validity_bitmap = static_cast(arrow_arr.buffers[0]); + const auto data_buffer = static_cast(arrow_arr.buffers[1]); // Calculate the size of the validity and data buffers - int64_t validity_size = (arrow_arr.length + 7) / 8; + int64_t validity_size = (arrow_arr.length + arrow_alignment - 1) / arrow_alignment; int64_t data_size = arrow_arr.length * sizeof(T); int64_t body_len = validity_size + data_size; // The total size of the message body @@ -181,21 +183,29 @@ std::vector serialize_primitive_array(const sparrow::primitive_array batch_builder.Finish(batch_message_offset); // III - Append the RecordBatch message to the final buffer - uint32_t batch_meta_len = batch_builder.GetSize(); // Get the size of the batch metadata - int64_t aligned_batch_meta_len = align_to_8(batch_meta_len); // Calculate the padded length + const uint32_t batch_meta_len = batch_builder.GetSize(); // Get the size of the batch metadata + const int64_t aligned_batch_meta_len = align_to_8(batch_meta_len); // Calculate the padded length - size_t current_size = final_buffer.size(); // Get the current size (which is the end of the Schema message) + const size_t current_size = final_buffer.size(); // Get the current size (which is the end of the Schema message) // Resize the buffer to append the new message final_buffer.resize(current_size + sizeof(uint32_t) + aligned_batch_meta_len + body_len); uint8_t* dst = final_buffer.data() + current_size; // Get a pointer to where the new message will start // Write the 4-byte metadata length for the RecordBatch message - *(reinterpret_cast(dst)) = batch_meta_len; + memcpy(dst, &batch_meta_len, sizeof(batch_meta_len)); dst += sizeof(uint32_t); // Copy the RecordBatch metadata into the buffer memcpy(dst, batch_builder.GetBufferPointer(), batch_meta_len); // Add padding to align the body to an 8-byte boundary - memset(dst + batch_meta_len, 0, aligned_batch_meta_len - batch_meta_len); + if (static_cast(aligned_batch_meta_len) >= batch_meta_len) + { + memset(dst + batch_meta_len, 0, aligned_batch_meta_len - batch_meta_len); + } + else + { + throw std::runtime_error("aligned_batch_meta_len should be greater than batch_meta_len"); + } + dst += aligned_batch_meta_len; // Copy the actual data buffers (the message body) into the buffer if (validity_bitmap) @@ -205,7 +215,8 @@ std::vector serialize_primitive_array(const sparrow::primitive_array else { // If validity_bitmap is null, it means there are no nulls - memset(dst, 0xFF, validity_size); + constexpr uint8_t no_nulls_bitmap = 0xFF; + memset(dst, no_nulls_bitmap, validity_size); } dst += validity_size; if (data_buffer) @@ -228,7 +239,8 @@ sparrow::primitive_array deserialize_primitive_array(const std::vector(buf_ptr + current_offset)); + uint32_t schema_meta_len = 0; + memcpy(&schema_meta_len, buf_ptr + current_offset, sizeof(schema_meta_len)); current_offset += sizeof(uint32_t); auto schema_message = org::apache::arrow::flatbuf::GetMessage(buf_ptr + current_offset); if (schema_message->header_type() != org::apache::arrow::flatbuf::MessageHeader::Schema) @@ -245,7 +257,8 @@ sparrow::primitive_array deserialize_primitive_array(const std::vector(buf_ptr + current_offset)); + uint32_t batch_meta_len = 0; + memcpy(&batch_meta_len, buf_ptr + current_offset, sizeof(batch_meta_len)); current_offset += sizeof(uint32_t); auto batch_message = org::apache::arrow::flatbuf::GetMessage(buf_ptr + current_offset); if (batch_message->header_type() != org::apache::arrow::flatbuf::MessageHeader::RecordBatch) @@ -266,10 +279,10 @@ sparrow::primitive_array deserialize_primitive_array(const std::vectorGet(0)->length(); int64_t data_len = buffers_meta->Get(1)->length(); - uint8_t* validity_buffer_copy = new uint8_t[validity_len]; + auto validity_buffer_copy = new uint8_t[validity_len]; memcpy(validity_buffer_copy, body_ptr + buffers_meta->Get(0)->offset(), validity_len); - uint8_t* data_buffer_copy = new uint8_t[data_len]; + auto data_buffer_copy = new uint8_t[data_len]; memcpy(data_buffer_copy, body_ptr + buffers_meta->Get(1)->offset(), data_len); // Get name