feat(java): jit support for chunk based map serialization (#2027)

## What does this PR do? This PR adds JIT support for chunk-based map serialization. It supports all kinds of map serialization by generated code: - final map key and value field type - polymorphic map key and value field type - nested map key and value type This PR also removes the old map serialization protocol code. The new chunk-based protocol reduces serialized size by up to **2.3X**. data: ``` stringMap: {"k1": "v1", "k2": "v2", ..., "k10": "v10" } intMap: {1:2, 2:4, 3: 6, ..., 10: 20} ``` new protocol: ``` stringMapBytes 68 stringKVStructBytes 69 intMapBytes 28 intKVStructBytes 29 ``` old protocol: ``` stringMapBytes 104 stringKVStructBytes 87 intMapBytes 64 intKVStructBytes 47 ``` It also improves performance by 20%. ## Related issues Closes #925 #2025 ## Does this PR introduce any user-facing change?  - [ ] Does this PR introduce any public API change? - [ ] Does this PR introduce any binary protocol compatibility change? ## Benchmark [chunk-jmh-result.csv](https://github.com/user-attachments/files/18575900/chunk-jmh-result.csv) [nochunk-jmh-result.csv](https://github.com/user-attachments/files/18575901/nochunk-jmh-result.csv) ![image](https://github.com/user-attachments/assets/754f8e48-b45e-489b-adf5-cca1c5d03f1e)
apache · Jan 28, 2025 · e952b63 · e952b63
1 parent 1e63705
commit e952b63
Show file tree

Hide file tree

Showing 19 changed files with 1,508 additions and 760 deletions.
diff --git a/java/benchmark/plot_map_benchmark.py b/java/benchmark/plot_map_benchmark.py
@@ -0,0 +1,158 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import pandas as pd
+import matplotlib.pyplot as plt
+import numpy as np
+
+# Load the CSV data
+no_chunk_file = "nochunk-jmh-result.csv"
+chunk_file = "chunk-jmh-result.csv"
+# Read the CSV files
+no_chunk_df = pd.read_csv(no_chunk_file)
+chunk_df = pd.read_csv(chunk_file)
+
+
+# Function to plot the figures
+def plot_benchmark(ax, data1, data2, operation, struct, datatype, title):
+    # Filter data
+    filtered_data1 = data1[
+        (data1["Benchmark"] == (operation))
+        & (data1["struct"] == struct)
+        & (data1["datatype"] == datatype)
+    ]
+    filtered_data2 = data2[
+        (data2["Benchmark"] == (operation))
+        & (data2["struct"] == struct)
+        & (data2["datatype"] == datatype)
+    ]
+
+    # Sort data according to 'mapSize'
+    filtered_data1 = filtered_data1.sort_values("mapSize")
+    filtered_data2 = filtered_data2.sort_values("mapSize")
+
+    # Plotting
+    x_labels = filtered_data1["mapSize"].astype(str).tolist()
+    x = np.arange(len(x_labels))
+    width = 0.35
+
+    ax.bar(
+        x - width / 2,
+        filtered_data1["Score"],
+        width,
+        yerr=filtered_data1["ScoreError"],
+        label="No Chunk",
+    )
+    ax.bar(
+        x + width / 2,
+        filtered_data2["Score"],
+        width,
+        yerr=filtered_data2["ScoreError"],
+        label="Chunk",
+    )
+    ax.set_xlabel("Map Size")
+    ax.set_ylabel("Score (ops/s)")
+    ax.set_title(title)
+    ax.set_xticks(x)
+    ax.set_xticklabels(x_labels)
+    ax.legend()
+
+
+# Create the subplots for struct=True rows, int and string datatypes (figure titled "codegen")
+fig1, axs1 = plt.subplots(2, 2, figsize=(10, 8))
+plot_benchmark(
+    axs1[0, 0],
+    no_chunk_df,
+    chunk_df,
+    "serialize",
+    True,
+    "int",
+    "Serialize | Datatype: Int",
+)
+plot_benchmark(
+    axs1[0, 1],
+    no_chunk_df,
+    chunk_df,
+    "serialize",
+    True,
+    "string",
+    "Serialize | Datatype: String",
+)
+plot_benchmark(
+    axs1[1, 0],
+    no_chunk_df,
+    chunk_df,
+    "deserialize",
+    True,
+    "int",
+    "Deserialize | Datatype: Int",
+)
+plot_benchmark(
+    axs1[1, 1],
+    no_chunk_df,
+    chunk_df,
+    "deserialize",
+    True,
+    "string",
+    "Deserialize | Datatype: String",
+)
+plt.tight_layout()
+plt.suptitle("Benchmarks for codegen", y=1.05)
+
+
+# Create the subplots for struct=False rows, int and string datatypes (figure titled "no codegen")
+fig2, axs2 = plt.subplots(2, 2, figsize=(10, 8))
+plot_benchmark(
+    axs2[0, 0],
+    no_chunk_df,
+    chunk_df,
+    "serialize",
+    False,
+    "int",
+    "Serialize | Datatype: Int",
+)
+plot_benchmark(
+    axs2[0, 1],
+    no_chunk_df,
+    chunk_df,
+    "serialize",
+    False,
+    "string",
+    "Serialize | Datatype: String",
+)
+plot_benchmark(
+    axs2[1, 0],
+    no_chunk_df,
+    chunk_df,
+    "deserialize",
+    False,
+    "int",
+    "Deserialize | Datatype: Int",
+)
+plot_benchmark(
+    axs2[1, 1],
+    no_chunk_df,
+    chunk_df,
+    "deserialize",
+    False,
+    "string",
+    "Deserialize | Datatype: String",
+)
+plt.tight_layout()
+plt.suptitle("Benchmarks for no codegen", y=1.05)
+
+plt.show()
diff --git a/java/benchmark/src/main/java/org/apache/fury/benchmark/MapSerializationSuite.java b/java/benchmark/src/main/java/org/apache/fury/benchmark/MapSerializationSuite.java
@@ -23,8 +23,6 @@
 import java.util.HashMap;
 import java.util.Map;
 import org.apache.fury.Fury;
-import org.apache.fury.serializer.Serializer;
-import org.apache.fury.serializer.collection.AbstractMapSerializer;
 import org.openjdk.jmh.Main;
 import org.openjdk.jmh.annotations.Benchmark;
 import org.openjdk.jmh.annotations.BenchmarkMode;
@@ -49,53 +47,70 @@ public static void main(String[] args) throws IOException {
     Main.main(args);
   }
 
+  public static class StringKVMapStruct {
+    Map<String, String> map;
+  }
+
+  public static class IntKVMapStruct {
+    Map<Integer, Integer> map;
+  }
+
   @State(Scope.Thread)
   public static class MapState {
-    @Param({"5", "20", "50", "100", "200"})
+    @Param({"50"})
     public int mapSize;
 
     @Param({"false", "true"})
-    public boolean enableChunkEncoding;
+    public boolean struct;
+
+    @Param({"int", "string"})
+    public String datatype;
 
-    private Map<String, String> stringMap;
-    private Map<Integer, Integer> integerMap;
-    private byte[] stringMapBytes;
-    private byte[] integerMapBytes;
+    private Object object;
+    private byte[] bytes;
     private Fury fury;
 
     @Setup(Level.Trial)
     public void setup() {
       fury = Fury.builder().build();
-      Serializer<HashMap> serializer = fury.getSerializer(HashMap.class);
-      ((AbstractMapSerializer) serializer).setUseChunkSerialize(enableChunkEncoding);
-      stringMap = new HashMap<>(mapSize);
-      integerMap = new HashMap<>(mapSize);
+      fury.register(StringKVMapStruct.class);
+      fury.register(IntKVMapStruct.class);
+      Map<String, String> stringMap = new HashMap<>(mapSize);
+      Map<Integer, Integer> intMap = new HashMap<>(mapSize);
       for (int i = 0; i < mapSize; i++) {
         stringMap.put("k" + i, "v" + i);
-        integerMap.put(i, i * 2);
+        intMap.put(i, i * 2);
+      }
+      StringKVMapStruct stringKVMapStruct = new StringKVMapStruct();
+      stringKVMapStruct.map = stringMap;
+      IntKVMapStruct intKVMapStruct = new IntKVMapStruct();
+      intKVMapStruct.map = intMap;
+      byte[] stringMapBytes = fury.serialize(stringMap);
+      byte[] intMapBytes = fury.serialize(intMap);
+      byte[] stringKVStructBytes = fury.serialize(stringKVMapStruct);
+      byte[] intKVStructBytes = fury.serialize(intKVMapStruct);
+      switch (datatype) {
+        case "int":
+          object = struct ? intKVMapStruct : intMap;
+          bytes = struct ? intKVStructBytes : intMapBytes;
+          break;
+        case "string":
+          object = struct ? stringKVMapStruct : stringMap;
+          bytes = struct ? stringKVStructBytes : stringMapBytes;
+          break;
+        default:
+          throw new UnsupportedOperationException();
       }
-      stringMapBytes = fury.serialize(stringMap);
-      integerMapBytes = fury.serialize(integerMap);
     }
   }
 
   @Benchmark
-  public Object serializeStringMap(MapState state) {
-    return state.fury.serialize(state.stringMap);
-  }
-
-  @Benchmark
-  public Object serializeIntMap(MapState state) {
-    return state.fury.serialize(state.integerMap);
-  }
-
-  @Benchmark
-  public Object deserializeStringMap(MapState state) {
-    return state.fury.deserialize(state.stringMapBytes);
+  public Object serialize(MapState state) {
+    return state.fury.serialize(state.object);
   }
 
   @Benchmark
-  public Object deserializeIntMap(MapState state) {
-    return state.fury.deserialize(state.integerMapBytes);
+  public Object deserialize(MapState state) {
+    return state.fury.deserialize(state.bytes);
   }
 }