46 changes: 46 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions Cargo.toml
@@ -45,6 +45,7 @@ pyo3-arrow = { path = "./pyo3-arrow" }
pyo3-async-runtimes = { version = "0.24", features = ["tokio-runtime"] }
pyo3-file = "0.12"
pyo3-object_store = "0.4"
rayon = "1.10.0"
thiserror = "1.0.63"
tokio = { version = "1.40", features = [
"macros",
1 change: 1 addition & 0 deletions arro3-io/Cargo.toml
@@ -46,5 +46,6 @@ pyo3-async-runtimes = { workspace = true, features = [
], optional = true }
pyo3-file = { workspace = true }
pyo3-object_store = { workspace = true, optional = true }
rayon = { workspace = true }
thiserror = { workspace = true }
tokio = { workspace = true }
78 changes: 78 additions & 0 deletions arro3-io/src/parquet/reader/concurrency.rs
@@ -0,0 +1,78 @@
use arrow_array::RecordBatch;
use futures::future::join_all;
use futures::TryStreamExt;
use parquet::arrow::arrow_reader::ArrowReaderMetadata;
use parquet::arrow::async_reader::AsyncFileReader;
use parquet::arrow::ParquetRecordBatchStreamBuilder;

use crate::error::Arro3IoResult;
use crate::parquet::reader::options::PyParquetOptions;

pub(crate) async fn read_concurrent<T: AsyncFileReader + Unpin + Send + 'static + Clone>(
source: T,
meta: &ArrowReaderMetadata,
options: PyParquetOptions,
) -> Arro3IoResult<Vec<RecordBatch>> {
// Fan out: each split of the options drives its own stream over a clone of
// the source, covering a disjoint subset of row groups.
let split_options = split_options(options);
let mut readers = split_options
.into_iter()
.map(|options| {
let async_reader_builder =
ParquetRecordBatchStreamBuilder::new_with_metadata(source.clone(), meta.clone());
options
.apply_to_reader_builder(async_reader_builder, meta)
.build()
})
.collect::<Result<Vec<_>, _>>()?;

// Drain every stream concurrently; `join_all` yields results in submission
// order, so batches stay grouped by reader.
let futures = readers
.iter_mut()
.map(|stream| stream.try_collect::<Vec<_>>())
.collect::<Vec<_>>();
let batches = join_all(futures)
.await
.into_iter()
.collect::<Result<Vec<_>, _>>()?
.into_iter()
.flatten()
.collect::<Vec<_>>();
Ok(batches)
}
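
A minimal sketch of how `read_concurrent` could be driven (not part of this
diff): it assumes a `ParquetObjectReader` source, which implements both
`AsyncFileReader` and `Clone`, and the function name `example` is illustrative.

use parquet::arrow::async_reader::ParquetObjectReader;

async fn example(mut source: ParquetObjectReader) -> Arro3IoResult<Vec<RecordBatch>> {
    // Load the footer metadata once; every split reader reuses it.
    let meta = ArrowReaderMetadata::load_async(&mut source, Default::default())
        .await
        .map_err(crate::error::Arro3IoError::ParquetError)?;
    read_concurrent(source, &meta, PyParquetOptions::default()).await
}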

fn split_options(options: PyParquetOptions) -> Vec<PyParquetOptions> {
if can_split_readers(&options) {
if let Some(row_groups) = &options.row_groups {
let mut split_options = vec![];
let row_groups_per_reader = row_groups.len() / 2;
for i in 0..2 {
let start = i * row_groups_per_reader;
// The second reader absorbs the remainder when the count is odd.
let end = if i == 1 { row_groups.len() } else { (i + 1) * row_groups_per_reader };
let mut new_options = options.clone();
new_options.row_groups = Some(row_groups[start..end].to_vec());
split_options.push(new_options);
}
return split_options;
}
}
// Splitting without an explicit row-group selection is not implemented yet.
todo!()
}

fn can_split_readers(options: &PyParquetOptions) -> bool {
// Nothing to split unless more than one row group was requested.
let Some(row_groups) = &options.row_groups else {
return false;
};
if row_groups.len() <= 1 {
return false;
}
// A row limit applies to the read as a whole and cannot be partitioned
// naively across independent readers.
if options.limit.is_some() {
return false;
}
true
}
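
A quick sanity check of the splitting arithmetic (hypothetical test; it
assumes `PyParquetOptions` is `Default + Clone` and exposes
`row_groups: Option<Vec<usize>>`, consistent with its use above):

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn split_keeps_every_row_group() {
        let mut options = PyParquetOptions::default();
        options.row_groups = Some(vec![0, 1, 2, 3, 4]);
        let split = split_options(options);
        // Two readers: [0, 1] and [2, 3, 4]; the odd row group is not dropped.
        assert_eq!(split[0].row_groups.as_deref(), Some(&[0, 1][..]));
        assert_eq!(split[1].row_groups.as_deref(), Some(&[2, 3, 4][..]));
    }
}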
33 changes: 32 additions & 1 deletion arro3-io/src/parquet/reader/file.rs
@@ -177,7 +177,8 @@ impl ParquetFile {
let record_batch_reader = options
.apply_to_reader_builder(sync_reader_builder, &self.meta)
.build()?;
Ok(PyRecordBatchReader::new(Box::new(record_batch_reader)).into())
}
ParquetSource::Async(async_source) => {
let async_reader_builder = ParquetRecordBatchStreamBuilder::new_with_metadata(
@@ -220,6 +221,36 @@
}
}

#[pyo3(signature = (**kwargs))]
fn read_table_async(
&self,
kwargs: Option<PyParquetOptions>,
) -> Arro3IoResult<PyRecordBatchStream> {
let options = kwargs.unwrap_or_default();
match &self.source {
ParquetSource::Sync(sync_source) => {
// The sync source is boxed and coerced to the same `dyn AsyncFileReader`
// used in the async arm, so both arms share one streaming code path.
let async_reader_builder = ParquetRecordBatchStreamBuilder::new_with_metadata(
Box::new(sync_source.try_clone()?) as _,
self.meta.clone(),
);
let record_batch_stream = options
.apply_to_reader_builder(async_reader_builder, &self.meta)
.build()?;
Ok(PyRecordBatchStream::new(record_batch_stream))
}
ParquetSource::Async(async_source) => {
let async_reader_builder = ParquetRecordBatchStreamBuilder::new_with_metadata(
Box::new(async_source.clone()) as _,
self.meta.clone(),
);
let record_batch_stream = options
.apply_to_reader_builder(async_reader_builder, &self.meta)
.build()?;
Ok(PyRecordBatchStream::new(record_batch_stream))
}
}
}

#[getter]
fn schema_arrow(&self) -> Arro3Schema {
self.meta.schema().clone().into()
2 changes: 2 additions & 0 deletions arro3-io/src/parquet/reader/mod.rs
@@ -1,7 +1,9 @@
// mod concurrency;
mod file;
mod functional;
mod options;
mod stream;
mod thread_pool;

pub(crate) use file::ParquetFile;
pub(crate) use functional::{read_parquet, read_parquet_async};
41 changes: 31 additions & 10 deletions arro3-io/src/parquet/reader/stream.rs
@@ -7,9 +7,12 @@ use pyo3::exceptions::{PyStopAsyncIteration, PyStopIteration};
use pyo3::prelude::*;
use pyo3_arrow::export::{Arro3RecordBatch, Arro3Table};
use pyo3_arrow::PyTable;
use rayon::iter::{IntoParallelIterator, ParallelIterator};
use rayon::ThreadPool;
use tokio::sync::Mutex;

use crate::error::Arro3IoError;
use crate::parquet::reader::thread_pool::get_default_pool;

#[pyclass(name = "RecordBatchStream", frozen)]
pub(crate) struct PyRecordBatchStream {
@@ -41,8 +44,12 @@ impl PyRecordBatchStream {
}

fn collect_async<'py>(&'py self, py: Python<'py>) -> PyResult<Bound<'py, PyAny>> {
// Grab the process-wide rayon pool while we still hold the GIL; the
// collection future itself runs on the tokio runtime off the Python thread.
let pool = get_default_pool(py)?.clone();
let stream = self.stream.clone();
pyo3_async_runtimes::tokio::future_into_py(py, collect_stream(stream, self.schema.clone()))
pyo3_async_runtimes::tokio::future_into_py(
py,
collect_stream(stream, self.schema.clone(), pool),
)
}
}

@@ -69,16 +76,30 @@ async fn next_stream(
async fn collect_stream(
stream: Arc<Mutex<ParquetRecordBatchStream<Box<dyn AsyncFileReader + 'static>>>>,
schema: SchemaRef,
pool: Arc<ThreadPool>,
) -> PyResult<Arro3Table> {
let mut stream = stream.lock().await;

// Fetch one decoder per row group up front: the row-group I/O stays
// sequential on the async stream; only the decoding below is parallelized.
let mut readers = vec![];
while let Some(reader) = stream
.next_row_group()
.await
.map_err(Arro3IoError::ParquetError)?
{
readers.push(reader);
}

// Run the per-row-group decoding on the rayon pool; results come back in
// input order and are flattened into a single batch list.
let batches = pool.install(|| {
let batches = readers
.into_par_iter()
.map(|r| r.collect::<Result<Vec<_>, _>>())
.collect::<Result<Vec<_>, _>>()
.map_err(Arro3IoError::ArrowError)?
.into_iter()
.flatten()
.collect::<Vec<_>>();
Ok::<_, PyErr>(batches)
})?;

Ok(PyTable::try_new(batches, schema)?.into())
}
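
The decode fan-out above is the standard rayon idiom: `ThreadPool::install`
runs the closure on that specific pool, so the `into_par_iter` inside it uses
the pool's workers rather than rayon's global pool, and `collect` preserves
input order. A standalone sketch of the same shape (illustrative names only):

use rayon::prelude::*;

fn sum_chunks(pool: &rayon::ThreadPool, chunks: Vec<Vec<i64>>) -> Vec<i64> {
    // Each chunk is reduced on one of the pool's worker threads; the output
    // order matches the input order.
    pool.install(|| chunks.into_par_iter().map(|c| c.into_iter().sum()).collect())
}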
19 changes: 19 additions & 0 deletions arro3-io/src/parquet/reader/thread_pool.rs
@@ -0,0 +1,19 @@
use std::sync::Arc;

use pyo3::exceptions::PyValueError;
use pyo3::prelude::*;
use pyo3::sync::GILOnceCell;
use rayon::{ThreadPool, ThreadPoolBuilder};

static DEFAULT_POOL: GILOnceCell<Arc<ThreadPool>> = GILOnceCell::new();

pub fn get_default_pool(py: Python<'_>) -> PyResult<Arc<ThreadPool>> {
let pool = DEFAULT_POOL.get_or_try_init(py, || {
let pool = ThreadPoolBuilder::new().build().map_err(|err| {
PyValueError::new_err(format!("Could not create rayon thread pool: {err}"))
})?;
Ok::<_, PyErr>(Arc::new(pool))
})?;
Ok(pool.clone())
}
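
Because the pool lives in a `GILOnceCell`, it is built on first use and every
later call returns a clone of the same `Arc`. A hypothetical check (function
name illustrative):

fn shares_one_pool(py: Python<'_>) -> PyResult<()> {
    let first = get_default_pool(py)?;
    let second = get_default_pool(py)?;
    // Both handles point at the single ThreadPool stored in DEFAULT_POOL.
    assert!(Arc::ptr_eq(&first, &second));
    Ok(())
}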