Skip to content

GC string views on hash join build side #16463

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 23 additions & 1 deletion datafusion/physical-plan/src/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ use super::SendableRecordBatchStream;
use crate::stream::RecordBatchReceiverStream;
use crate::{ColumnStatistics, Statistics};

use arrow::array::Array;
use arrow::array::{Array, BinaryViewArray, RecordBatchOptions, StringViewArray};
use arrow::datatypes::Schema;
use arrow::record_batch::RecordBatch;
use datafusion_common::stats::Precision;
Expand Down Expand Up @@ -204,6 +204,28 @@ pub fn can_project(
}
}

/// Performs a garbage collection on all `StringView` and `BinaryView` columns
/// of a batch.
///
/// Every column that downcasts to [`StringViewArray`] or [`BinaryViewArray`]
/// is replaced by the result of calling `gc()` on it. All other columns are
/// carried over unchanged by cloning their `Arc`. The returned batch has the
/// same schema and the same number of rows as `batch`.
///
/// # Errors
///
/// Returns an error if `RecordBatch::try_new_with_options` rejects the rebuilt
/// columns.
pub(crate) fn gc_record_batch(batch: &RecordBatch) -> Result<RecordBatch> {
    /// Compacts a single column if it is a view array; otherwise returns a
    /// new handle to the same array.
    fn gc_array(array: &Arc<dyn Array>) -> Arc<dyn Array> {
        let any = array.as_any();
        if let Some(views) = any.downcast_ref::<StringViewArray>() {
            Arc::new(views.gc())
        } else if let Some(views) = any.downcast_ref::<BinaryViewArray>() {
            Arc::new(views.gc())
        } else {
            // `array` is already a `&Arc<_>`, so no additional borrow is needed.
            Arc::clone(array)
        }
    }

    let columns = batch.columns().iter().map(gc_array).collect();

    // Pass the row count explicitly so the output always reports the same
    // number of rows as the input, independent of its columns.
    let options = RecordBatchOptions::new().with_row_count(Some(batch.num_rows()));

    // `?` converts the error returned by the constructor into this function's
    // error type.
    Ok(RecordBatch::try_new_with_options(
        batch.schema(),
        columns,
        &options,
    )?)
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down
3 changes: 3 additions & 0 deletions datafusion/physical-plan/src/joins/hash_join.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1011,6 +1011,9 @@ async fn collect_left_input(
}
// Merge all batches into a single batch, so we can directly index into the arrays
let single_batch = concat_batches(&schema, batches_iter)?;
// Compact the backing buffers of the single batch, for faster take operations later on large build sides
// See: https://github.com/apache/datafusion/issues/16206
let single_batch = crate::common::gc_record_batch(&single_batch)?;

// Reserve additional memory for visited indices bitmap and create shared builder
let visited_indices_bitmap = if with_visited_indices_bitmap {
Expand Down
Loading