Skip to content

Commit 5cb32a9

Browse files
committed
Switch to pyarrow for Parquet file generation for the types integration table
1 parent 451c5cc commit 5cb32a9

File tree

5 files changed

+56
-47
lines changed

5 files changed

+56
-47
lines changed

Diff for: Cargo.lock

+2
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Diff for: crates/integration_tests/Cargo.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ rust-version = { workspace = true }
2727
[dependencies]
2828
arrow-array = { workspace = true }
2929
arrow-schema = { workspace = true }
30-
datafusion = "43"
30+
datafusion = "44"
3131
futures = { workspace = true }
3232
iceberg = { workspace = true }
3333
iceberg-catalog-rest = { workspace = true }

Diff for: crates/integration_tests/testdata/pyiceberg/Dockerfile

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
FROM python:3.9-bullseye
1717

18-
RUN pip install pyiceberg[pyarrow]==0.8 datafusion==43.1.0
18+
RUN pip install pyiceberg[pyarrow]==0.8
1919

2020
COPY provision.py .
2121

Diff for: crates/integration_tests/testdata/pyiceberg/provision.py

+43-36
Original file line numberDiff line numberDiff line change
@@ -16,46 +16,53 @@
1616
# under the License.
1717

1818
import os
19-
from datafusion import SessionContext
2019
from pyiceberg.catalog import load_catalog
2120
import pyarrow.parquet as pq
21+
import pyarrow as pa
22+
from datetime import datetime, timedelta
2223

2324
# Generate a table with various types in memory and dump to a Parquet file
24-
ctx = SessionContext()
25-
ctx.sql("""
26-
CREATE TABLE types_test (
27-
cboolean BOOLEAN,
28-
cint8 TINYINT,
29-
cint16 SMALLINT,
30-
cint32 INT,
31-
cint64 BIGINT,
32-
cfloat32 REAL,
33-
cfloat64 DOUBLE PRECISION,
34-
cdecimal DECIMAL(8, 2),
35-
cdate32 DATE,
36-
ctimestamp TIMESTAMP,
37-
ctimestamptz TIMESTAMPTZ,
38-
cutf8 TEXT,
39-
cbinary BYTEA
40-
) AS SELECT
41-
s % 2 = 1 as cboolean,
42-
(s % 256 - 128) as cint8,
43-
s as cint16,
44-
s as cint32,
45-
s as cint64,
46-
s as cfloat32,
47-
s as cfloat64,
48-
s::NUMERIC / 100 as cnumeric,
49-
s as cdate,
50-
s * 1000 as ctimestamp,
51-
s * 1000 as ctimestampz,
52-
s::TEXT as cutf8,
53-
s::TEXT cbinary
54-
FROM unnest(generate_series(0, 1000)) AS q(s);
55-
""")
56-
a = ctx.sql("COPY types_test TO 'types_test.parquet'")
57-
# File loading fails in the container without this line???
58-
print(f"Created a Parquet file with {a} rows")
25+
rows = 1001
26+
columns = [
27+
pa.array([(i % 2 == 1) for i in range(rows)]),
28+
pa.array([(i % 256 - 128) for i in range(rows)]),
29+
pa.array([i for i in range(rows)]),
30+
pa.array([i for i in range(rows)]),
31+
pa.array([i for i in range(rows)]),
32+
pa.array([float(i) for i in range(rows)]),
33+
pa.array([float(i) for i in range(rows)]),
34+
pa.array([round(i / 100, 2) for i in range(rows)]),
35+
pa.array([(datetime(1970, 1, 1) + timedelta(days=i)).date() for i in range(rows)]),
36+
pa.array([(datetime(1970, 1, 1) + timedelta(seconds=i)) for i in range(rows)]),
37+
pa.array([(datetime(1970, 1, 1) + timedelta(seconds=i)) for i in range(rows)]),
38+
pa.array([str(i) for i in range(rows)]),
39+
pa.array([str(i).encode("utf-8") for i in range(rows)]),
40+
]
41+
schema = pa.schema([
42+
('cboolean', pa.bool_()),
43+
('cint8', pa.int8()),
44+
('cint16', pa.int16()),
45+
('cint32', pa.int32()),
46+
('cint64', pa.int64()),
47+
('cfloat32', pa.float32()),
48+
('cfloat64', pa.float64()),
49+
('cdecimal128', pa.decimal128(8, 2)),
50+
('cdate32', pa.date32()),
51+
('ctimestamp', pa.timestamp('us')),
52+
('ctimestamptz', pa.timestamp('us', tz='UTC')),
53+
('cutf8', pa.utf8()),
54+
('cbinary', pa.binary()),
55+
])
56+
57+
# Convert to a PyArrow table
58+
table = pa.Table.from_arrays(columns, schema=schema)
59+
60+
# Write to a Parquet file
61+
pq.write_table(table, "types_test.parquet")
62+
63+
# Output the result
64+
print(f"Created a Parquet file with {rows} rows and schema {table.schema}.")
65+
5966

6067
# Load the Parquet file
6168
parquet_file = pq.read_table("./types_test.parquet")

Diff for: crates/integration_tests/tests/datafusion.rs

+9-9
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ async fn test_basic_queries() -> Result<(), DataFusionError> {
8181
PARQUET_FIELD_ID_META_KEY.to_string(),
8282
"7".to_string(),
8383
)])),
84-
Field::new("cdecimal", DataType::Decimal128(8, 2), true).with_metadata(HashMap::from(
84+
Field::new("cdecimal128", DataType::Decimal128(8, 2), true).with_metadata(HashMap::from(
8585
[(PARQUET_FIELD_ID_META_KEY.to_string(), "8".to_string(),)]
8686
)),
8787
Field::new("cdate32", DataType::Date32, true).with_metadata(HashMap::from([(
@@ -125,19 +125,19 @@ async fn test_basic_queries() -> Result<(), DataFusionError> {
125125
.collect()
126126
.await?;
127127
let expected = [
128-
"+----------+-------+--------+--------+--------+----------+----------+----------+------------+----------------------------+-----------------------------+-------+---------+",
129-
"| cboolean | cint8 | cint16 | cint32 | cint64 | cfloat32 | cfloat64 | cdecimal | cdate32 | ctimestamp | ctimestamptz | cutf8 | cbinary |",
130-
"+----------+-------+--------+--------+--------+----------+----------+----------+------------+----------------------------+-----------------------------+-------+---------+",
131-
"| false | -128 | 0 | 0 | 0 | 0.0 | 0.0 | 0.00 | 1970-01-01 | 1970-01-01T00:00:00 | 1970-01-01T00:00:00Z | 0 | 30 |",
132-
"| true | -127 | 1 | 1 | 1 | 1.0 | 1.0 | 0.01 | 1970-01-02 | 1970-01-01T00:00:00.000001 | 1970-01-01T00:00:00.000001Z | 1 | 31 |",
133-
"| false | -126 | 2 | 2 | 2 | 2.0 | 2.0 | 0.02 | 1970-01-03 | 1970-01-01T00:00:00.000002 | 1970-01-01T00:00:00.000002Z | 2 | 32 |",
134-
"+----------+-------+--------+--------+--------+----------+----------+----------+------------+----------------------------+-----------------------------+-------+---------+",
128+
"+----------+-------+--------+--------+--------+----------+----------+-------------+------------+---------------------+----------------------+-------+---------+",
129+
"| cboolean | cint8 | cint16 | cint32 | cint64 | cfloat32 | cfloat64 | cdecimal128 | cdate32 | ctimestamp | ctimestamptz | cutf8 | cbinary |",
130+
"+----------+-------+--------+--------+--------+----------+----------+-------------+------------+---------------------+----------------------+-------+---------+",
131+
"| false | -128 | 0 | 0 | 0 | 0.0 | 0.0 | 0.00 | 1970-01-01 | 1970-01-01T00:00:00 | 1970-01-01T00:00:00Z | 0 | 30 |",
132+
"| true | -127 | 1 | 1 | 1 | 1.0 | 1.0 | 0.01 | 1970-01-02 | 1970-01-01T00:00:01 | 1970-01-01T00:00:01Z | 1 | 31 |",
133+
"| false | -126 | 2 | 2 | 2 | 2.0 | 2.0 | 0.02 | 1970-01-03 | 1970-01-01T00:00:02 | 1970-01-01T00:00:02Z | 2 | 32 |",
134+
"+----------+-------+--------+--------+--------+----------+----------+-------------+------------+---------------------+----------------------+-------+---------+",
135135
];
136136
assert_batches_eq!(expected, &batches);
137137

138138
// TODO: this isn't OK, and should be fixed with https://github.com/apache/iceberg-rust/issues/813
139139
let err = ctx
140-
.sql("SELECT cdecimal FROM types_table WHERE cint16 <= 2")
140+
.sql("SELECT cdecimal128 FROM types_table WHERE cint16 <= 2")
141141
.await?
142142
.collect()
143143
.await

0 commit comments

Comments
 (0)