@@ -56,6 +56,7 @@ def _schema_for_table(schema: pa.Schema, table: pa.Table) -> pa.Schema:
5656
5757@dataclass
5858class LanceWriter (ProcessingStage [DocumentBatch , FileGroupTask ]):
59+ """Write ``DocumentBatch`` tables to Lance fragments and checkpoint the commit."""
5960 path : str
6061 commit_path : str
6162 schema : pa .Schema | None = None
@@ -125,6 +126,7 @@ def process(self, task: DocumentBatch) -> FileGroupTask:
125126
126127@dataclass
127128class LanceAnnotationWriter (ProcessingStage [DocumentBatch , FileGroupTask ]):
129+ """Update existing Lance rows using metadata columns emitted by ``LanceReader``."""
128130 path : str
129131 commit_path : str
130132 schema : pa .Schema
@@ -151,6 +153,7 @@ def outputs(self) -> tuple[list[str], list[str]]:
151153 return ["data" ], []
152154
153155 def prepare (self ) -> int :
156+ """Create or validate annotation columns and pin the Lance version for the run."""
154157 import lance
155158
156159 dataset = lance .dataset (self .path , storage_options = self .storage_options )
@@ -208,14 +211,8 @@ def process(self, task: DocumentBatch) -> FileGroupTask:
208211 msg = f"Lance annotation update table is missing required columns: { missing } "
209212 raise ValueError (msg )
210213 version = self ._update_version ()
211- dataset = lance .dataset (
212- self .path ,
213- ** {
214- key : value
215- for key , value in {"storage_options" : self .storage_options , "version" : version }.items ()
216- if value is not None
217- },
218- )
214+ options = {"storage_options" : self .storage_options , "version" : version }
215+ dataset = lance .dataset (self .path , ** {k : v for k , v in options .items () if v is not None })
219216
220217 record_paths = []
221218 fragment_ids = sorted (int (value ) for value in pc .unique (table [LANCE_FRAGID_COLUMN ].combine_chunks ()).to_pylist ())
0 commit comments