diff --git a/arro3-core/python/arro3/core/_core.pyi b/arro3-core/python/arro3/core/_core.pyi index df0998ba..e0be05b5 100644 --- a/arro3-core/python/arro3/core/_core.pyi +++ b/arro3-core/python/arro3/core/_core.pyi @@ -1,6 +1,7 @@ from typing import Any, Iterable, Literal, Sequence, overload import numpy as np +import pandas as pd from numpy.typing import NDArray from .types import ( @@ -1514,6 +1515,22 @@ class Table: batches: Sequence of RecordBatch to be converted, all schemas must be equal. schema: If not passed, will be inferred from the first RecordBatch. Defaults to None. + Returns: + New Table. + """ + @classmethod + def from_pandas( + cls, + df: pd.DataFrame, + *, + schema: ArrowSchemaExportable | None = None, + ) -> Table: + """Construct a Table from a pandas DataFrame + + Args: + df: pandas DataFrame to convert to Arrow + schema: If not passed, will be inferred. + Returns: New Table. """ diff --git a/pyo3-arrow/src/interop/mod.rs b/pyo3-arrow/src/interop/mod.rs index 4268092f..bcb9b87f 100644 --- a/pyo3-arrow/src/interop/mod.rs +++ b/pyo3-arrow/src/interop/mod.rs @@ -1 +1,2 @@ pub mod numpy; +pub(crate) mod pandas; diff --git a/pyo3-arrow/src/interop/pandas/from_pandas.rs b/pyo3-arrow/src/interop/pandas/from_pandas.rs new file mode 100644 index 00000000..301355cf --- /dev/null +++ b/pyo3-arrow/src/interop/pandas/from_pandas.rs @@ -0,0 +1,63 @@ +use arrow_array::RecordBatch; +use arrow_schema::{Schema, SchemaRef}; +use indexmap::IndexMap; +use numpy::PyArrayDescr; +use pyo3::intern; +use pyo3::prelude::*; + +use crate::error::PyArrowResult; +use crate::PyTable; + +enum PandasDtype<'a> { + Numpy(&'a PyArrayDescr), +} + +impl<'py> FromPyObject<'py> for PandasDtype<'py> { + fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult { + Ok(Self::Numpy(ob.extract()?)) + } +} + +pub fn from_pandas_dataframe( + py: Python, + df: &Bound, + schema: Option, +) -> PyArrowResult<(Vec, SchemaRef)> { + // If pandas 2.2+ and the Arrow C Stream export works, prefer that. + if df.hasattr(intern!(py, "__arrow_c_stream__"))? { + if let Ok(table) = df.extract::() { + return Ok(table.into_inner()); + } + } + + let schema = if let Some(schema) = schema { + schema + } else { + infer_arrow_schema(py, df)? + }; + let batch = import_batch(py, df, &schema)?; + Ok((vec![batch], schema)) +} + +fn infer_arrow_schema<'py>(py: Python<'py>, df: &'py Bound) -> PyResult { + let dtypes = access_dtypes(py, df)?; + todo!() +} + +fn import_batch<'py>( + py: Python<'py>, + df: &'py Bound, + schema: &Schema, +) -> PyResult { + todo!() +} + +fn access_dtypes<'py>( + py: Python<'py>, + df: &'py Bound, +) -> PyResult>> { + let dtypes_dict = df + .getattr(intern!(py, "dtypes"))? + .call_method0(intern!(py, "to_dict"))?; + dtypes_dict.extract() +} diff --git a/pyo3-arrow/src/interop/pandas/mod.rs b/pyo3-arrow/src/interop/pandas/mod.rs new file mode 100644 index 00000000..0f617bd8 --- /dev/null +++ b/pyo3-arrow/src/interop/pandas/mod.rs @@ -0,0 +1 @@ +pub(crate) mod from_pandas; diff --git a/pyo3-arrow/src/table.rs b/pyo3-arrow/src/table.rs index 86e892bc..9440095d 100644 --- a/pyo3-arrow/src/table.rs +++ b/pyo3-arrow/src/table.rs @@ -21,6 +21,7 @@ use crate::ffi::to_schema_pycapsule; use crate::input::{ AnyArray, AnyRecordBatch, FieldIndexInput, MetadataInput, NameOrField, SelectIndices, }; +use crate::interop::pandas::from_pandas::from_pandas_dataframe; use crate::schema::display_schema; use crate::utils::schema_equals; use crate::{PyChunkedArray, PyField, PyRecordBatch, PyRecordBatchReader, PySchema}; @@ -280,6 +281,18 @@ impl PyTable { Ok(Self::try_new(batches, schema)?) } + #[classmethod] + #[pyo3(signature = (df, *, schema=None))] + fn from_pandas( + _cls: &Bound, + py: Python, + df: &Bound, + schema: Option, + ) -> PyArrowResult { + let (batches, schema) = from_pandas_dataframe(py, df, schema.map(|s| s.into_inner()))?; + Ok(Self::try_new(batches, schema)?) + } + #[classmethod] #[pyo3(signature = (mapping, *, schema=None, metadata=None))] fn from_pydict( diff --git a/tests/core/test_table.py b/tests/core/test_table.py index 83e80f83..05f9e20d 100644 --- a/tests/core/test_table.py +++ b/tests/core/test_table.py @@ -95,3 +95,18 @@ def test_slice(): sliced2 = table.slice(1, 2) assert sliced2.num_rows == 2 assert sliced2.chunk_lengths == [1, 1] + + +def test_from_pandas(): + df = pd.DataFrame( + { + "int64": np.array([1, 2, 3, 4], dtype=np.int64), + "string": ["a", "b", "c", "d"], + } + ) + for x in df.dtypes: + pass + df["a"].array.__array__().dtype + np.array(df["a"].array) + dtypes = df.dtypes.to_dict() + pass