Skip to content

Commit 98f9499

Browse files
adriangbclaude
andcommitted
refactor: store TableSchema partition columns as Fields
Store `TableSchema::table_partition_cols` as `arrow::datatypes::Fields` (an immutable `Arc<[FieldRef]>`) instead of `Arc<Vec<FieldRef>>`. Now that `with_table_partition_cols` replaces rather than appends, the inner `Vec` only existed to support in-place `extend`. `Fields` is the idiomatic Arrow type for an immutable field list: it is a single `Arc<[FieldRef]>` (one fewer indirection than `Arc<Vec<_>>`), can be shared zero-copy with an existing schema, and makes the shared-`Arc` mutation panic that motivated the replace change structurally impossible -- there is no `Vec` to `Arc::get_mut`/`make_mut`. `TableSchema::table_partition_cols()` and the delegating `FileScanConfig::table_partition_cols()` now return `&Fields`. `Fields` derefs to `&[FieldRef]`, so iteration/indexing/`len`/`is_empty` callers are unchanged; only the arrow `FileFormat` path needed `.to_vec()` to rebuild an owned `Vec`. The constructors still take `Vec<FieldRef>`, so no call site that builds partition columns changes. Documented the `&Fields` return-type change in the 54.0.0 upgrade guide. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent ee71d0c commit 98f9499

4 files changed

Lines changed: 53 additions & 15 deletions

File tree

datafusion/datasource-arrow/src/file_format.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@ impl FileFormat for ArrowFormat {
199199

200200
let table_schema = TableSchema::new(
201201
Arc::clone(conf.file_schema()),
202-
conf.table_partition_cols().clone(),
202+
conf.table_partition_cols().to_vec(),
203203
);
204204

205205
let mut source: Arc<dyn FileSource> =

datafusion/datasource/src/file_scan_config/mod.rs

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ use crate::{
2727
file_stream::work_source::SharedWorkSource, source::DataSource,
2828
statistics::MinMaxStatistics,
2929
};
30-
use arrow::datatypes::FieldRef;
30+
use arrow::datatypes::Fields;
3131
use arrow::datatypes::{DataType, Schema, SchemaRef};
3232
use datafusion_common::config::ConfigOptions;
3333
use datafusion_common::tree_node::TreeNodeRecursion;
@@ -1068,7 +1068,7 @@ impl FileScanConfig {
10681068
}
10691069

10701070
/// Get the table partition columns
1071-
pub fn table_partition_cols(&self) -> &Vec<FieldRef> {
1071+
pub fn table_partition_cols(&self) -> &Fields {
10721072
self.file_source.table_schema().table_partition_cols()
10731073
}
10741074

@@ -2074,7 +2074,10 @@ mod tests {
20742074
Some(vec![0, 2])
20752075
);
20762076
assert_eq!(new_config.limit, Some(10));
2077-
assert_eq!(*new_config.table_partition_cols(), partition_cols);
2077+
assert_eq!(
2078+
*new_config.table_partition_cols(),
2079+
Fields::from(partition_cols)
2080+
);
20782081
assert_eq!(new_config.file_groups.len(), 1);
20792082
assert_eq!(new_config.file_groups[0].len(), 1);
20802083
assert_eq!(

datafusion/datasource/src/table_schema.rs

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
//! Helper struct to manage table schemas with partition columns
1919
20-
use arrow::datatypes::{FieldRef, SchemaBuilder, SchemaRef};
20+
use arrow::datatypes::{FieldRef, Fields, SchemaBuilder, SchemaRef};
2121
use std::sync::Arc;
2222

2323
/// The overall schema for potentially partitioned data sources.
@@ -70,7 +70,11 @@ pub struct TableSchema {
7070
///
7171
/// These columns are NOT present in the data files but are appended to each
7272
/// row during query execution based on the file's location.
73-
table_partition_cols: Arc<Vec<FieldRef>>,
73+
///
74+
/// Stored as [`Fields`] (an immutable `Arc<[FieldRef]>`) so that cloning a
75+
/// `TableSchema` is cheap and the partition columns can be shared zero-copy
76+
/// with an existing schema.
77+
table_partition_cols: Fields,
7478

7579
/// The complete table schema: file_schema columns followed by partition columns.
7680
///
@@ -117,11 +121,14 @@ impl TableSchema {
117121
/// assert_eq!(table_schema.table_schema().fields().len(), 4);
118122
/// ```
119123
pub fn new(file_schema: SchemaRef, table_partition_cols: Vec<FieldRef>) -> Self {
124+
// Stored immutably as `Fields`; cloning a `TableSchema` is then a cheap
125+
// refcount bump and the partition columns can never be mutated in place.
126+
let table_partition_cols = Fields::from(table_partition_cols);
120127
let mut builder = SchemaBuilder::from(file_schema.as_ref());
121128
builder.extend(table_partition_cols.iter().cloned());
122129
Self {
123130
file_schema,
124-
table_partition_cols: Arc::new(table_partition_cols),
131+
table_partition_cols,
125132
table_schema: Arc::new(builder.finish()),
126133
}
127134
}
@@ -144,13 +151,13 @@ impl TableSchema {
144151
/// into [`TableSchema::with_table_partition_cols`] if you have partition columns at construction time
145152
/// since it avoids re-computing the table schema.
146153
pub fn with_table_partition_cols(mut self, partition_cols: Vec<FieldRef>) -> Self {
147-
// Replace the partition columns outright. Assigning a fresh `Arc` is
148-
// cheap and never mutates the inner `Vec`, so this is safe even when the
149-
// `Arc` is shared with a clone of this `TableSchema` (copy-on-write
150-
// isolation is automatic). The previous `Arc::get_mut().expect()`
151-
// appended in place and panicked whenever the `Arc` was shared, since
152-
// owning `self` does not imply sole ownership of the inner `Arc`.
153-
self.table_partition_cols = Arc::new(partition_cols);
154+
// Replace the partition columns outright. `Fields` is an immutable
155+
// `Arc<[FieldRef]>`, so storing it never mutates a shared allocation and
156+
// is safe even when this `TableSchema` has been cloned. (A previous
157+
// `Arc<Vec<_>>` design appended in place via `Arc::get_mut().expect()`
158+
// and panicked whenever the `Arc` was shared, since owning `self` does
159+
// not imply sole ownership of the inner `Arc`.)
160+
self.table_partition_cols = Fields::from(partition_cols);
154161
let mut builder = SchemaBuilder::from(self.file_schema.as_ref());
155162
builder.extend(self.table_partition_cols.iter().cloned());
156163
self.table_schema = Arc::new(builder.finish());
@@ -168,7 +175,7 @@ impl TableSchema {
168175
///
169176
/// These are the columns derived from the directory structure that
170177
/// will be appended to each row during query execution.
171-
pub fn table_partition_cols(&self) -> &Vec<FieldRef> {
178+
pub fn table_partition_cols(&self) -> &Fields {
172179
&self.table_partition_cols
173180
}
174181

docs/source/library-user-guide/upgrading/54.0.0.md

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -787,3 +787,31 @@ a combined `OR` (e.g., index-backed sources).
787787
```sql
788788
SET datafusion.optimizer.enable_unions_to_filter = true;
789789
```
790+
791+
### `TableSchema::table_partition_cols()` now returns `&Fields`
792+
793+
`datafusion_datasource::TableSchema` now stores its partition columns as
794+
`arrow::datatypes::Fields` (an immutable `Arc<[FieldRef]>`) instead of
795+
`Arc<Vec<FieldRef>>`. As a result:
796+
797+
- `TableSchema::table_partition_cols()` now returns `&Fields` instead of
798+
`&Vec<FieldRef>`.
799+
- `FileScanConfig::table_partition_cols()` (which delegates to it) likewise now
800+
returns `&Fields`.
801+
802+
`Fields` dereferences to `&[FieldRef]`, so the common operations — iteration,
803+
indexing, `len()`, `is_empty()` — are unchanged. Only code that named the
804+
concrete `&Vec<FieldRef>` type (e.g. binding the result to a `&Vec<FieldRef>` or
805+
calling `Vec`-specific methods) needs to adapt:
806+
807+
```rust
808+
// Before: returned &Vec<FieldRef>
809+
let cols: Vec<FieldRef> = config.table_partition_cols().clone();
810+
811+
// After: returns &Fields; use `to_vec()` when you need an owned `Vec`
812+
let cols: Vec<FieldRef> = config.table_partition_cols().to_vec();
813+
```
814+
815+
The constructors `TableSchema::new` and `TableSchema::with_table_partition_cols`
816+
still accept `Vec<FieldRef>`, so code that builds partition columns is
817+
unaffected.

0 commit comments

Comments
 (0)