
Commit 4d1ad16

Add Spark to the integration test
1 parent 3b71074 commit 4d1ad16

File tree

6 files changed: +271 -0


crates/integration_tests/testdata/docker-compose.yaml

Lines changed: 15 additions & 0 deletions
@@ -70,3 +70,18 @@ services:
       /bin/sh -c " until (/usr/bin/mc config host add minio http://minio:9000 admin password) do echo '...waiting...' && sleep 1; done; /usr/bin/mc rm -r --force minio/icebergdata; /usr/bin/mc mb minio/icebergdata; /usr/bin/mc policy set public minio/icebergdata; tail -f /dev/null "
     networks:
       rest_bridge:
+
+  spark-iceberg:
+    build: spark/
+    networks:
+      rest_bridge:
+    depends_on:
+      - rest
+      - minio
+    environment:
+      - AWS_ACCESS_KEY_ID=admin
+      - AWS_SECRET_ACCESS_KEY=password
+      - AWS_REGION=us-east-1
+    links:
+      - rest:rest
+      - minio:minio
Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

FROM python:3.9-bullseye

RUN apt-get -qq update && \
    apt-get -qq install -y --no-install-recommends sudo curl openjdk-11-jdk && \
    apt-get -qq clean && \
    rm -rf /var/lib/apt/lists/*

ENV SPARK_HOME=${SPARK_HOME:-"/opt/spark"}
ENV HADOOP_HOME=${HADOOP_HOME:-"/opt/hadoop"}
ENV PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH

RUN mkdir -p ${HADOOP_HOME} && mkdir -p ${SPARK_HOME} && mkdir -p /home/iceberg/spark-events
WORKDIR ${SPARK_HOME}

ENV SPARK_VERSION=3.5.3
ENV ICEBERG_SPARK_RUNTIME_VERSION=3.5_2.12
ENV ICEBERG_VERSION=1.6.0

RUN curl --retry 5 -s -C - https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz -o spark-${SPARK_VERSION}-bin-hadoop3.tgz \
    && tar xzf spark-${SPARK_VERSION}-bin-hadoop3.tgz --directory /opt/spark --strip-components 1 \
    && rm -rf spark-${SPARK_VERSION}-bin-hadoop3.tgz

# Download iceberg spark runtime
RUN curl --retry 5 -s https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar -Lo iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar \
    && mv iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar /opt/spark/jars

# Download AWS bundle
RUN curl --retry 5 -s https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar -Lo /opt/spark/jars/iceberg-aws-bundle-${ICEBERG_VERSION}.jar

COPY spark-defaults.conf /opt/spark/conf
ENV PATH="/opt/spark/sbin:/opt/spark/bin:${PATH}"

RUN chmod u+x /opt/spark/sbin/* && \
    chmod u+x /opt/spark/bin/*

WORKDIR '/home/'

COPY entrypoint.sh .
COPY provision.py .

HEALTHCHECK --retries=120 --interval=1s \
    CMD ls /tmp/ready || exit 1

ENTRYPOINT ["./entrypoint.sh"]
Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#

start-master.sh -p 7077
start-worker.sh spark://spark-iceberg:7077
start-history-server.sh

python3 ./provision.py

touch /tmp/ready

tail -f /dev/null
Lines changed: 99 additions & 0 deletions
@@ -0,0 +1,99 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from pyspark.sql import SparkSession
from pyspark.sql.functions import current_date, date_add, expr

spark = SparkSession.builder.getOrCreate()

spark.sql(
    f"""
  CREATE OR REPLACE TABLE rest.default.test_positional_merge_on_read_deletes (
    dt date,
    number integer,
    letter string
  )
  USING iceberg
  TBLPROPERTIES (
    'write.delete.mode'='merge-on-read',
    'write.update.mode'='merge-on-read',
    'write.merge.mode'='merge-on-read',
    'format-version'='2'
  );
"""
)

spark.sql(
    f"""
  INSERT INTO rest.default.test_positional_merge_on_read_deletes
  VALUES
    (CAST('2023-03-01' AS date), 1, 'a'),
    (CAST('2023-03-02' AS date), 2, 'b'),
    (CAST('2023-03-03' AS date), 3, 'c'),
    (CAST('2023-03-04' AS date), 4, 'd'),
    (CAST('2023-03-05' AS date), 5, 'e'),
    (CAST('2023-03-06' AS date), 6, 'f'),
    (CAST('2023-03-07' AS date), 7, 'g'),
    (CAST('2023-03-08' AS date), 8, 'h'),
    (CAST('2023-03-09' AS date), 9, 'i'),
    (CAST('2023-03-10' AS date), 10, 'j'),
    (CAST('2023-03-11' AS date), 11, 'k'),
    (CAST('2023-03-12' AS date), 12, 'l');
"""
)

spark.sql(f"DELETE FROM rest.default.test_positional_merge_on_read_deletes WHERE number = 9")

spark.sql(
    f"""
  CREATE OR REPLACE TABLE rest.default.test_positional_merge_on_read_double_deletes (
    dt date,
    number integer,
    letter string
  )
  USING iceberg
  TBLPROPERTIES (
    'write.delete.mode'='merge-on-read',
    'write.update.mode'='merge-on-read',
    'write.merge.mode'='merge-on-read',
    'format-version'='2'
  );
"""
)

spark.sql(
    f"""
  INSERT INTO rest.default.test_positional_merge_on_read_double_deletes
  VALUES
    (CAST('2023-03-01' AS date), 1, 'a'),
    (CAST('2023-03-02' AS date), 2, 'b'),
    (CAST('2023-03-03' AS date), 3, 'c'),
    (CAST('2023-03-04' AS date), 4, 'd'),
    (CAST('2023-03-05' AS date), 5, 'e'),
    (CAST('2023-03-06' AS date), 6, 'f'),
    (CAST('2023-03-07' AS date), 7, 'g'),
    (CAST('2023-03-08' AS date), 8, 'h'),
    (CAST('2023-03-09' AS date), 9, 'i'),
    (CAST('2023-03-10' AS date), 10, 'j'),
    (CAST('2023-03-11' AS date), 11, 'k'),
    (CAST('2023-03-12' AS date), 12, 'l');
"""
)

# Creates two positional deletes that should be merged
spark.sql(f"DELETE FROM rest.default.test_positional_merge_on_read_double_deletes WHERE number = 9")
spark.sql(f"DELETE FROM rest.default.test_positional_merge_on_read_double_deletes WHERE letter == 'f'")
Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

spark.sql.extensions                  org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
spark.sql.catalog.rest                org.apache.iceberg.spark.SparkCatalog
spark.sql.catalog.rest.type           rest
spark.sql.catalog.rest.uri            http://rest:8181
spark.sql.catalog.rest.io-impl        org.apache.iceberg.aws.s3.S3FileIO
spark.sql.catalog.rest.warehouse      s3://warehouse/rest/
spark.sql.catalog.rest.s3.endpoint    http://minio:9000
spark.sql.defaultCatalog              rest
spark.eventLog.enabled                true
spark.eventLog.dir                    /home/iceberg/spark-events
spark.history.fs.logDirectory         /home/iceberg/spark-events
spark.sql.catalogImplementation       in-memory
Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! Integration tests for rest catalog.

use iceberg::writer::IcebergWriterBuilder;
use iceberg_integration_tests::set_test_fixture;
use iceberg::{Catalog, TableIdent, NamespaceIdent};

#[tokio::test]
async fn test_read_table_with_positional_deletes() {
    let fixture = set_test_fixture("read_table_with_positional_deletes").await;

    let catalog = fixture.rest_catalog;

    let table = catalog
        .load_table(
            &TableIdent::from_strs(["default", "test_positional_merge_on_read_double_deletes"])
                .unwrap(),
        )
        .await
        .unwrap();

    // 😱 If we don't support positional deletes, we should not be able to plan them
    println!("{:?}", table.scan().build().unwrap());
}
