Skip to content

Commit

Permalink
feat(java): jit support for chunk based map serialization (#2027)
Browse files Browse the repository at this point in the history
## What does this PR do?
This PR adds JIT support for chunk-based map serialization. The generated
code supports all kinds of map serialization:
-  final map key and value field type
- polymorphic map key and value field type
- nested map key and value type

This PR also removed the old map serialization protocol code.


The new chunk-based protocol reduces the serialized size by up to
**2.3X**.

data:
```
stringMap: {"k1": "v1", "k2": "v2", ..., "k10": "v10"}
intMap: {1:2, 2:4, 3: 6, ..., 10: 20}
```

new protocol:
```
stringMapBytes 68
stringKVStructBytes 69
intMapBytes 28
intKVStructBytes 29
```

old protocol:
```
stringMapBytes 104
stringKVStructBytes 87
intMapBytes 64
intKVStructBytes 47
```

It also improves performance by 20%.


## Related issues

Closes #925 

#2025 

## Does this PR introduce any user-facing change?

<!--
If any user-facing interface changes, please [open an
issue](https://github.com/apache/fury/issues/new/choose) describing the
need to do so and update the document if necessary.
-->

- [ ] Does this PR introduce any public API change?
- [ ] Does this PR introduce any binary protocol compatibility change?

## Benchmark


[chunk-jmh-result.csv](https://github.com/user-attachments/files/18575900/chunk-jmh-result.csv)

[nochunk-jmh-result.csv](https://github.com/user-attachments/files/18575901/nochunk-jmh-result.csv)


![image](https://github.com/user-attachments/assets/754f8e48-b45e-489b-adf5-cca1c5d03f1e)
  • Loading branch information
chaokunyang authored Jan 28, 2025
1 parent 1e63705 commit e952b63
Show file tree
Hide file tree
Showing 19 changed files with 1,508 additions and 760 deletions.
158 changes: 158 additions & 0 deletions java/benchmark/plot_map_benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Benchmark result CSVs for the "no chunk" and "chunk" runs. The paths are
# relative, so both files are looked up in the current working directory.
no_chunk_file = "nochunk-jmh-result.csv"
chunk_file = "chunk-jmh-result.csv"
# Parse both result files into DataFrames, in the same order as above.
no_chunk_df, chunk_df = (
    pd.read_csv(csv_path) for csv_path in (no_chunk_file, chunk_file)
)


# Function to plot the figures
def plot_benchmark(ax, data1, data2, operation, struct, datatype, title):
    """Draw a grouped bar chart comparing two benchmark runs on ``ax``.

    Rows are selected from each frame where the ``Benchmark``, ``struct`` and
    ``datatype`` columns equal the given values, then ordered by ``mapSize``.
    ``data1`` is drawn as the "No Chunk" series and ``data2`` as the "Chunk"
    series, side by side for every map size, with ``ScoreError`` as the
    error bar.

    Args:
        ax: Axes-like object to draw on (``bar``, ``set_*`` and ``legend``
            are called on it).
        data1: DataFrame holding the "No Chunk" results.
        data2: DataFrame holding the "Chunk" results.
        operation: Value to match in the ``Benchmark`` column.
        struct: Value to match in the ``struct`` column.
        datatype: Value to match in the ``datatype`` column.
        title: Title of the subplot.

    Raises:
        ValueError: If the two selections do not cover the same ``mapSize``
            values, since the bars could then not be paired per tick.
    """

    def _select(frame):
        # Keep only the rows of the requested benchmark configuration,
        # ordered by map size so both series line up tick by tick.
        mask = (
            (frame["Benchmark"] == operation)
            & (frame["struct"] == struct)
            & (frame["datatype"] == datatype)
        )
        return frame[mask].sort_values("mapSize")

    no_chunk_rows = _select(data1)
    chunk_rows = _select(data2)

    # The tick labels come from the first series; refuse to plot if the
    # second one would be drawn under labels that do not belong to it.
    map_sizes = no_chunk_rows["mapSize"].tolist()
    if chunk_rows["mapSize"].tolist() != map_sizes:
        raise ValueError(
            "mapSize values differ between the two data sets for "
            f"operation={operation!r}, struct={struct!r}, datatype={datatype!r}"
        )

    x_labels = [str(size) for size in map_sizes]
    x = np.arange(len(x_labels))
    width = 0.35

    # Shift each series by half a bar width so the pair is centred on the tick.
    ax.bar(
        x - width / 2,
        no_chunk_rows["Score"],
        width,
        yerr=no_chunk_rows["ScoreError"],
        label="No Chunk",
    )
    ax.bar(
        x + width / 2,
        chunk_rows["Score"],
        width,
        yerr=chunk_rows["ScoreError"],
        label="Chunk",
    )
    ax.set_xlabel("Map Size")
    ax.set_ylabel("Score (ops/s)")
    ax.set_title(title)
    ax.set_xticks(x)
    ax.set_xticklabels(x_labels)
    ax.legend()


def _plot_struct_figure(struct, figure_title):
    """Build one 2x2 figure comparing both runs for a single ``struct`` value.

    The top row shows "serialize" and the bottom row "deserialize"; the left
    column shows the "int" datatype and the right column "string".

    Args:
        struct: Value of the ``struct`` column to select for all four panels.
        figure_title: Title placed above the four subplots.

    Returns:
        The ``(figure, axes_array)`` pair created by ``plt.subplots``.
    """
    fig, axs = plt.subplots(2, 2, figsize=(10, 8))
    # Listed in row-major order to match ``axs.flat``.
    panels = (
        ("serialize", "int", "Serialize | Datatype: Int"),
        ("serialize", "string", "Serialize | Datatype: String"),
        ("deserialize", "int", "Deserialize | Datatype: Int"),
        ("deserialize", "string", "Deserialize | Datatype: String"),
    )
    for ax, (operation, datatype, title) in zip(axs.flat, panels):
        plot_benchmark(
            ax, no_chunk_df, chunk_df, operation, struct, datatype, title
        )
    # Keep the title inside the canvas and reserve a strip for it at the top;
    # a title positioned above y=1.0 is cut off in the plt.show() window.
    fig.suptitle(figure_title)
    fig.tight_layout(rect=(0, 0, 1, 0.96))
    return fig, axs


# Figure for the struct=True rows.
fig1, axs1 = _plot_struct_figure(True, "Benchmarks for codegen")

# Figure for the struct=False rows.
fig2, axs2 = _plot_struct_figure(False, "Benchmarks for no codegen")

plt.show()
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@
import java.util.HashMap;
import java.util.Map;
import org.apache.fury.Fury;
import org.apache.fury.serializer.Serializer;
import org.apache.fury.serializer.collection.AbstractMapSerializer;
import org.openjdk.jmh.Main;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
Expand All @@ -49,53 +47,70 @@ public static void main(String[] args) throws IOException {
Main.main(args);
}

public static class StringKVMapStruct {
Map<String, String> map;
}

public static class IntKVMapStruct {
Map<Integer, Integer> map;
}

@State(Scope.Thread)
public static class MapState {
@Param({"5", "20", "50", "100", "200"})
@Param({"50"})
public int mapSize;

@Param({"false", "true"})
public boolean enableChunkEncoding;
public boolean struct;

@Param({"int", "string"})
public String datatype;

private Map<String, String> stringMap;
private Map<Integer, Integer> integerMap;
private byte[] stringMapBytes;
private byte[] integerMapBytes;
private Object object;
private byte[] bytes;
private Fury fury;

@Setup(Level.Trial)
public void setup() {
fury = Fury.builder().build();
Serializer<HashMap> serializer = fury.getSerializer(HashMap.class);
((AbstractMapSerializer) serializer).setUseChunkSerialize(enableChunkEncoding);
stringMap = new HashMap<>(mapSize);
integerMap = new HashMap<>(mapSize);
fury.register(StringKVMapStruct.class);
fury.register(IntKVMapStruct.class);
Map<String, String> stringMap = new HashMap<>(mapSize);
Map<Integer, Integer> intMap = new HashMap<>(mapSize);
for (int i = 0; i < mapSize; i++) {
stringMap.put("k" + i, "v" + i);
integerMap.put(i, i * 2);
intMap.put(i, i * 2);
}
StringKVMapStruct stringKVMapStruct = new StringKVMapStruct();
stringKVMapStruct.map = stringMap;
IntKVMapStruct intKVMapStruct = new IntKVMapStruct();
intKVMapStruct.map = intMap;
byte[] stringMapBytes = fury.serialize(stringMap);
byte[] intMapBytes = fury.serialize(intMap);
byte[] stringKVStructBytes = fury.serialize(stringKVMapStruct);
byte[] intKVStructBytes = fury.serialize(intKVMapStruct);
switch (datatype) {
case "int":
object = struct ? intKVMapStruct : intMap;
bytes = struct ? intKVStructBytes : intMapBytes;
break;
case "string":
object = struct ? stringKVMapStruct : stringMap;
bytes = struct ? stringKVStructBytes : stringMapBytes;
break;
default:
throw new UnsupportedOperationException();
}
stringMapBytes = fury.serialize(stringMap);
integerMapBytes = fury.serialize(integerMap);
}
}

@Benchmark
public Object serializeStringMap(MapState state) {
return state.fury.serialize(state.stringMap);
}

@Benchmark
public Object serializeIntMap(MapState state) {
return state.fury.serialize(state.integerMap);
}

@Benchmark
public Object deserializeStringMap(MapState state) {
return state.fury.deserialize(state.stringMapBytes);
public Object serialize(MapState state) {
return state.fury.serialize(state.object);
}

@Benchmark
public Object deserializeIntMap(MapState state) {
return state.fury.deserialize(state.integerMapBytes);
public Object deserialize(MapState state) {
return state.fury.deserialize(state.bytes);
}
}
Loading

0 comments on commit e952b63

Please sign in to comment.