Skip to content

Commit 783a180

Browse files
authored
Add imageSpec to structured_dataset.py (#1708)
* Add imageSpec to structured_dataset.py Signed-off-by: Kevin Su <pingsutw@apache.org> * lint Signed-off-by: Kevin Su <pingsutw@apache.org> * udpated other examples Signed-off-by: Kevin Su <pingsutw@apache.org> * nit Signed-off-by: Kevin Su <pingsutw@apache.org> * nit Signed-off-by: Kevin Su <pingsutw@apache.org> --------- Signed-off-by: Kevin Su <pingsutw@apache.org>
1 parent 9597270 commit 783a180

4 files changed

Lines changed: 23 additions & 12 deletions

File tree

.github/workflows/checks.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -307,6 +307,7 @@ jobs:
307307
- name: Register specific tests
308308
run: |
309309
source .venv/bin/activate
310+
export FLYTE_PUSH_IMAGE_SPEC=${{ github.event_name != 'pull_request' }}
310311
while read -r line;
311312
do
312313
pyflyte --config ./boilerplate/flyte/end2end/functional-test-config.yaml \

examples/data_types_and_io/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ ENV VENV /opt/venv
1717
RUN python3 -m venv ${VENV}
1818
ENV PATH="${VENV}/bin:$PATH"
1919

20-
RUN pip install flytekit==1.12.0 pandas
20+
RUN pip install flytekit pandas
2121
RUN pip install torch --index-url https://download.pytorch.org/whl/cpu
2222

2323
# Copy the actual code

examples/data_types_and_io/data_types_and_io/dataclass.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from dataclasses import dataclass
44

55
import pandas as pd
6-
from flytekit import task, workflow
6+
from flytekit import ImageSpec, task, workflow
77
from flytekit.types.directory import FlyteDirectory
88
from flytekit.types.file import FlyteFile
99
from flytekit.types.structured import StructuredDataset
@@ -14,6 +14,11 @@
1414
# If you're using Flytekit version >= v1.11.1, you don't need to decorate with `@dataclass_json` or
1515
# inherit from Mashumaro's `DataClassJSONMixin`.
1616

17+
image_spec = ImageSpec(
18+
registry="ghcr.io/flyteorg",
19+
packages=["pandas", "pyarrow"],
20+
)
21+
1722

1823
# Python types
1924
# Define a `dataclass` with `int`, `str` and `dict` as the data types
@@ -25,15 +30,15 @@ class Datum(DataClassJSONMixin):
2530

2631

2732
# Once declared, a dataclass can be returned as an output or accepted as an input
28-
@task
33+
@task(container_image=image_spec)
2934
def stringify(s: int) -> Datum:
3035
"""
3136
A dataclass return will be treated as a single complex JSON return.
3237
"""
3338
return Datum(x=s, y=str(s), z={s: str(s)})
3439

3540

36-
@task
41+
@task(container_image=image_spec)
3742
def add(x: Datum, y: Datum) -> Datum:
3843
"""
3944
Flytekit automatically converts the provided JSON into a data class.
@@ -51,7 +56,7 @@ class FlyteTypes(DataClassJSONMixin):
5156
directory: FlyteDirectory
5257

5358

54-
@task
59+
@task(container_image=image_spec)
5560
def upload_data() -> FlyteTypes:
5661
"""
5762
Flytekit will upload FlyteFile, FlyteDirectory and StructuredDataset to the blob store,
@@ -76,7 +81,7 @@ def upload_data() -> FlyteTypes:
7681
return fs
7782

7883

79-
@task
84+
@task(container_image=image_spec)
8085
def download_data(res: FlyteTypes):
8186
assert pd.DataFrame({"Name": ["Tom", "Joseph"], "Age": [20, 22]}).equals(res.dataframe.open(pd.DataFrame).all())
8287
f = open(res.file, "r")

examples/data_types_and_io/data_types_and_io/structured_dataset.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,16 @@
1818
)
1919
from typing_extensions import Annotated
2020

21+
image_spec = ImageSpec(
22+
registry="ghcr.io/flyteorg",
23+
packages=["pandas", "pyarrow", "numpy"],
24+
)
25+
2126

2227
# Define a task that returns a Pandas DataFrame.
2328
# Flytekit will detect the Pandas dataframe return signature and
2429
# convert the interface for the task to the StructuredDatased type
25-
@task
30+
@task(container_image=image_spec)
2631
def generate_pandas_df(a: int) -> pd.DataFrame:
2732
return pd.DataFrame({"Name": ["Tom", "Joseph"], "Age": [a, 22], "Height": [160, 178]})
2833

@@ -39,7 +44,7 @@ def generate_pandas_df(a: int) -> pd.DataFrame:
3944
# that's supported or added to structured dataset.
4045
# For instance, you can use ``pa.Table`` to convert
4146
# the Pandas DataFrame to a PyArrow table.
42-
@task
47+
@task(container_image=image_spec)
4348
def get_subset_pandas_df(df: Annotated[StructuredDataset, all_cols]) -> Annotated[StructuredDataset, col]:
4449
df = df.open(pd.DataFrame).all()
4550
df = pd.concat([df, pd.DataFrame([[30]], columns=["Age"])])
@@ -61,7 +66,7 @@ def simple_sd_wf(a: int = 19) -> Annotated[StructuredDataset, col]:
6166
register_csv_handlers()
6267

6368

64-
@task
69+
@task(container_image=image_spec)
6570
def pandas_to_csv(df: pd.DataFrame) -> Annotated[StructuredDataset, CSV]:
6671
return StructuredDataset(dataframe=df)
6772

@@ -132,12 +137,12 @@ def to_html(self, df: np.ndarray) -> str:
132137

133138
# You can now use `numpy.ndarray` to deserialize the parquet file to NumPy
134139
# and serialize a task's output (NumPy array) to a parquet file.
135-
@task
140+
@task(container_image=image_spec)
136141
def generate_pd_df_with_str() -> pd.DataFrame:
137142
return pd.DataFrame({"Name": ["Tom", "Joseph"]})
138143

139144

140-
@task
145+
@task(container_image=image_spec)
141146
def to_numpy(sd: StructuredDataset) -> Annotated[StructuredDataset, None, PARQUET]:
142147
numpy_array = sd.open(np.ndarray).all()
143148
return StructuredDataset(dataframe=numpy_array)
@@ -197,7 +202,7 @@ class CompanyField:
197202
MySecondDataClassDataset = Annotated[StructuredDataset, kwtypes(info=InfoField)]
198203
MyNestedDataClassDataset = Annotated[StructuredDataset, kwtypes(info=kwtypes(contacts=ContactsField))]
199204

200-
image = ImageSpec(packages=["pandas", "tabulate"], registry="ghcr.io/flyteorg")
205+
image = ImageSpec(packages=["pandas", "pyarrow", "pandas", "tabulate"], registry="ghcr.io/flyteorg")
201206

202207

203208
@task(container_image=image)

0 commit comments

Comments
 (0)