Commit 2c5ace5
**Refactor Parquet Writer Initialization and File Handling**
- Updated `ParquetWriter` in `writer.rs` to handle `current_indexer` as an `Option`, allowing more flexible initialization and management.
- Introduced a `finish_current_file` method to encapsulate the logic for completing and transitioning between SST files, improving code clarity and maintainability.
- Enhanced error handling and logging with `debug` statements for better traceability during file operations.
1 parent c344c6a commit 2c5ace5
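
The heart of the change is that the indexer is no longer built eagerly in the constructor; it is created together with the file writer when the first batch arrives. Below is a minimal, self-contained sketch of that `Option`-based lazy-initialization pattern, using hypothetical `Writer` and `IndexerBuilder` types rather than the actual mito2 API:

```rust
// Hypothetical stand-ins for the real Indexer/IndexerBuilder types.
struct Indexer;

struct IndexerBuilder;

impl IndexerBuilder {
    async fn build(&self) -> Indexer {
        Indexer
    }
}

struct Writer {
    builder: IndexerBuilder,
    // None until the first batch arrives; built alongside the file writer.
    current_indexer: Option<Indexer>,
}

impl Writer {
    fn new(builder: IndexerBuilder) -> Self {
        // No indexer is created here, so a writer that never receives a
        // batch never produces an (empty) index file.
        Self {
            builder,
            current_indexer: None,
        }
    }

    async fn maybe_init_indexer(&mut self) -> &mut Indexer {
        if self.current_indexer.is_none() {
            self.current_indexer = Some(self.builder.build().await);
        }
        // safety: assigned above if it was None.
        self.current_indexer.as_mut().unwrap()
    }
}
```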

1 file changed (+65 −49 lines)

src/mito2/src/sst/parquet/writer.rs

@@ -15,11 +15,13 @@
 //! Parquet writer.
 
 use std::future::Future;
+use std::mem;
 use std::pin::Pin;
 use std::sync::atomic::{AtomicUsize, Ordering};
 use std::sync::Arc;
 use std::task::{Context, Poll};
 
+use common_telemetry::debug;
 use common_time::Timestamp;
 use datatypes::arrow::datatypes::SchemaRef;
 use object_store::{FuturesAsyncWriter, ObjectStore};
@@ -59,7 +61,7 @@ pub struct ParquetWriter<F: WriterFactory, I: IndexerBuilder, P: FilePathProvide
     /// Indexer build that can create indexer for multiple files.
     indexer_builder: I,
     /// Current active indexer.
-    current_indexer: Indexer,
+    current_indexer: Option<Indexer>,
     bytes_written: Arc<AtomicUsize>,
 }
 
@@ -121,8 +123,6 @@ where
         path_provider: P,
     ) -> ParquetWriter<F, I, P> {
         let init_file = FileId::random();
-        let index_file_path = path_provider.build_index_file_path(init_file);
-        let indexer = indexer_builder.build(init_file, index_file_path).await;
 
         ParquetWriter {
             path_provider,
@@ -131,22 +131,50 @@ where
             writer_factory: factory,
             metadata,
             indexer_builder,
-            current_indexer: indexer,
+            current_indexer: None,
             bytes_written: Arc::new(AtomicUsize::new(0)),
         }
     }
 
-    async fn roll_to_next_file(&mut self) {
-        self.current_file = FileId::random();
-        let index_file_path = self.path_provider.build_index_file_path(self.current_file);
-        let indexer = self
-            .indexer_builder
-            .build(self.current_file, index_file_path)
-            .await;
-        self.current_indexer = indexer;
+    /// Finishes current SST file and index file.
+    async fn finish_current_file(
+        &mut self,
+        ssts: &mut SstInfoArray,
+        stats: &mut SourceStats,
+    ) -> Result<()> {
         // maybe_init_writer will re-create a new file.
-        self.writer = None;
+        if let Some(mut current_writer) = mem::take(&mut self.writer) {
+            let stats = mem::take(stats);
+            // At least one row has been written.
+            assert!(stats.num_rows > 0);
+
+            // Finish indexer and writer.
+            // safety: writer and index can only be both present or not.
+            let index_output = self.current_indexer.as_mut().unwrap().finish().await;
+            current_writer.flush().await.context(WriteParquetSnafu)?;
+
+            let file_meta = current_writer.close().await.context(WriteParquetSnafu)?;
+            let file_size = self.bytes_written.load(Ordering::Relaxed) as u64;
+
+            // Safety: num rows > 0 so we must have min/max.
+            let time_range = stats.time_range.unwrap();
+
+            // convert FileMetaData to ParquetMetaData
+            let parquet_metadata = parse_parquet_metadata(file_meta)?;
+            ssts.push(SstInfo {
+                file_id: self.current_file,
+                time_range,
+                file_size,
+                num_rows: stats.num_rows,
+                num_row_groups: parquet_metadata.num_row_groups() as u64,
+                file_metadata: Some(Arc::new(parquet_metadata)),
+                index_metadata: index_output,
+            });
+            self.current_file = FileId::random();
+            self.bytes_written.store(0, Ordering::Relaxed)
+        };
+
+        Ok(())
     }
 
     /// Iterates source and writes all rows to Parquet file.
@@ -158,6 +186,7 @@ where
         override_sequence: Option<SequenceNumber>, // override the `sequence` field from `Source`
        opts: &WriteOptions,
     ) -> Result<SstInfoArray> {
+        let mut results = smallvec![];
         let write_format =
             WriteFormat::new(self.metadata.clone()).with_override_sequence(override_sequence);
         let mut stats = SourceStats::default();
@@ -170,52 +199,31 @@ where
             match res {
                 Ok(batch) => {
                     stats.update(&batch);
-                    self.current_indexer.update(&batch).await;
+                    // safety: self.current_indexer must be set when first batch has been written.
+                    self.current_indexer.as_mut().unwrap().update(&batch).await;
                     if self.bytes_written.load(Ordering::Relaxed) > opts.max_file_size {
-                        self.roll_to_next_file().await;
+                        debug!(
+                            "Finishing current file {}, file size: {}, max file size: {}",
+                            self.current_file,
+                            self.bytes_written.load(Ordering::Relaxed),
+                            opts.max_file_size
+                        );
+                        self.finish_current_file(&mut results, &mut stats).await?;
                     }
                 }
                 Err(e) => {
-                    self.current_indexer.abort().await;
+                    if let Some(indexer) = &mut self.current_indexer {
+                        indexer.abort().await;
+                    }
                     return Err(e);
                 }
             }
         }
 
-        let index_output = self.current_indexer.finish().await;
-
-        if stats.num_rows == 0 {
-            return Ok(smallvec![]);
-        }
-
-        let Some(mut arrow_writer) = self.writer.take() else {
-            // No batch actually written.
-            return Ok(smallvec![]);
-        };
-
-        arrow_writer.flush().await.context(WriteParquetSnafu)?;
-
-        let file_meta = arrow_writer.close().await.context(WriteParquetSnafu)?;
-        let file_size = self.bytes_written.load(Ordering::Relaxed) as u64;
-
-        // Safety: num rows > 0 so we must have min/max.
-        let time_range = stats.time_range.unwrap();
-
-        // convert FileMetaData to ParquetMetaData
-        let parquet_metadata = parse_parquet_metadata(file_meta)?;
-
-        let file_id = self.current_file;
+        self.finish_current_file(&mut results, &mut stats).await?;
 
         // object_store.write will make sure all bytes are written or an error is raised.
-        Ok(smallvec![SstInfo {
-            file_id,
-            time_range,
-            file_size,
-            num_rows: stats.num_rows,
-            num_row_groups: parquet_metadata.num_row_groups() as u64,
-            file_metadata: Some(Arc::new(parquet_metadata)),
-            index_metadata: index_output,
-        }])
+        Ok(results)
     }
 
     /// Customizes per-column config according to schema and maybe column cardinality.
@@ -286,6 +294,14 @@ where
             AsyncArrowWriter::try_new(writer, schema.clone(), Some(writer_props))
                 .context(WriteParquetSnafu)?;
         self.writer = Some(arrow_writer);
+
+        let index_file_path = self.path_provider.build_index_file_path(self.current_file);
+        let indexer = self
+            .indexer_builder
+            .build(self.current_file, index_file_path)
+            .await;
+        self.current_indexer = Some(indexer);
+
         // safety: self.writer is assigned above
         Ok(self.writer.as_mut().unwrap())
     }
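
A note on the design: routing both the mid-stream roll (when `max_file_size` is exceeded) and the final flush through `finish_current_file` works because `mem::take` on `self.writer` leaves `None` behind and the whole body is skipped when no writer exists, so the method is safe to call unconditionally and `write_all` simply accumulates one `SstInfo` per finished file. A minimal sketch of that take-and-finish pattern, with hypothetical `FileWriter`/`FileInfo` types standing in for the real ones:

```rust
use std::mem;

// Hypothetical stand-ins for the real writer and SST metadata types.
struct FileWriter {
    bytes: usize,
}

struct FileInfo {
    size: usize,
}

struct Roller {
    writer: Option<FileWriter>,
    finished: Vec<FileInfo>,
}

impl Roller {
    fn finish_current(&mut self) {
        // mem::take moves the writer out and leaves None behind, so the next
        // write re-initializes a fresh file. If no writer was ever created,
        // this is a no-op, which makes the call safe both mid-stream and at
        // the end of the write loop.
        if let Some(writer) = mem::take(&mut self.writer) {
            self.finished.push(FileInfo { size: writer.bytes });
        }
    }
}
```

Calling `finish_current` twice in a row is harmless: the second call finds `None` and returns immediately, which is exactly the property the unconditional `finish_current_file` call at the end of `write_all` relies on.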
