fix a couple more test cases

huggingface · alex-hh · Oct 14, 2024 · Oct 15, 2024 · Oct 15, 2024 · Oct 15, 2024
commit 97f0f19e5a3aac9d80b7d90701d86b8379651cc2
diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
@@ -2510,7 +2510,7 @@ def set_format(
 
         # Check that the format_type and format_kwargs are valid and make it possible to have a Formatter
         type = get_format_type_from_alias(type)
-        get_formatter(type, features=self._info.features, **format_kwargs)
+        get_formatter(type, features=self._info.features, **format_kwargs) if type is not None else None
 
         # Check filter column
         if isinstance(columns, str):

diff --git a/src/datasets/formatting/formatting.py b/src/datasets/formatting/formatting.py
@@ -161,10 +161,24 @@ def extract_struct_array(pa_array: pa.StructArray) -> list:
         if pa.types.is_struct(pa_array.field(field.name).type):
             batch[field.name] = extract_struct_array(pa_array.field(field.name))
         else:
-            batch[field.name] = pa_array.field(field.name).to_pylist()
+            # use logic from _arrow_array_to_numpy to preserve dtype
+            if isinstance(pa_array.type, _ArrayXDExtensionType):
+                zero_copy_only = _is_zero_copy_only(pa_array.type.storage_dtype, unnest=True)
+                batch[field.name] = list(pa_array.to_numpy(zero_copy_only=zero_copy_only))
+            else:
+                batch[field.name] = pa_array.field(field.name).to_pylist()
     return dict_of_lists_to_list_of_dicts(batch)
 
 
+def extract_array_xdextension_array(pa_array: pa.Array) -> list:
+    """Return the entries of an ArrayXD extension array as a list (via `to_numpy`), flattening chunked input."""
+    if isinstance(pa_array, pa.ChunkedArray):  # chunked input: extract each chunk and concatenate the results
+        return [arr for chunk in pa_array.chunks for arr in extract_array_xdextension_array(chunk)]
+    else:
+        zero_copy_only = _is_zero_copy_only(pa_array.type.storage_dtype, unnest=True)
+        return list(pa_array.to_numpy(zero_copy_only=zero_copy_only))  # list() splits along the first axis
+
+
 class PythonArrowExtractor(BaseArrowExtractor[dict, list, dict]):
     def extract_row(self, pa_table: pa.Table) -> dict:
         return _unnest(self.extract_batch(pa_table))
@@ -183,7 +197,12 @@ def extract_batch(self, pa_table: pa.Table) -> dict:
             if pa.types.is_struct(pa_table[col].type):
                 batch[col] = extract_struct_array(pa_table[col])
             else:
-                batch[col] = pa_table[col].to_pylist()
+                pa_array = pa_table[col]
+                if isinstance(pa_array.type, _ArrayXDExtensionType):
+                    # don't call to_pylist() to preserve dtype of the fixed-size array
+                    batch[col] = extract_array_xdextension_array(pa_array)
+                else:
+                    batch[col] = pa_table[col].to_pylist()
         return batch
 
 

diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py
@@ -195,8 +195,8 @@ def test_dummy_dataset(self, in_memory):
                         }
                     ),
                 )
-                self.assertEqual(dset[0]["col_2"], [[["a", "b"], ["c", "d"]], [["e", "f"], ["g", "h"]]])
-                self.assertEqual(dset["col_2"][0], [[["a", "b"], ["c", "d"]], [["e", "f"], ["g", "h"]]])
+                assert (dset[0]["col_2"] == np.array([[[["a", "b"], ["c", "d"]], [["e", "f"], ["g", "h"]]]])).all()
+                assert (dset["col_2"][0] == np.array([[[["a", "b"], ["c", "d"]], [["e", "f"], ["g", "h"]]]])).all()
 
     def test_dataset_getitem(self, in_memory):
         with tempfile.TemporaryDirectory() as tmp_dir: