Skip to content

Commit 47decfb

Browse files
committed
Refactor serialization: Enhance type safety by introducing Optional in various data structures, implement LanguageVersion interning for memory optimization, and update Protocol Buffer handling for improved deserialization logic.
1 parent 7eab52f commit 47decfb

11 files changed

+96
-42
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,5 +65,5 @@ coverage html
6565
## Update Protocol Buffers classes
6666

6767
```
68-
protoc --proto_path=./src --python_out=./src ./src/lionweb/serialization/proto/Chunk.proto
68+
protoc --proto_path=./src --python_out=./src --mypy_out=./src -I . ./src/lionweb/serialization/proto/Chunk.proto
6969
```

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ mdurl==0.1.2
4545
more-itertools==10.6.0
4646
mypy==1.15.0
4747
mypy-extensions==1.0.0
48+
mypy-protobuf==3.6.0
4849
nh3==0.2.20
4950
nodeenv==1.9.1
5051
numpy==2.0.2

src/lionweb/client/repository_archives.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,5 +42,3 @@ def upload(bulk_import: BulkImport) -> int:
4242
end = time.perf_counter()
4343
elapsed_seconds = end - start
4444
print(f"Uploaded {total_nodes} nodes in {elapsed_seconds:.3f} seconds")
45-
46-

src/lionweb/serialization/archive.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
def load_archive(filename) -> List[SerializationChunk]:
1010
ps = ProtoBufSerialization(LionWebVersion.V2023_1)
1111
start = time.time()
12-
chunks = []
12+
chunks : List[SerializationChunk] = []
1313
import zipfile
1414
with zipfile.ZipFile(filename, 'r') as zf:
1515
for name in zf.namelist():

src/lionweb/serialization/data/language_version.py

Lines changed: 62 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,45 @@
1-
from typing import Optional
1+
from typing import Optional, Dict, Tuple
2+
import threading
23

34

45
class LanguageVersion:
56
"""
67
The pair Language Key and Language Version identify a specific version of a language.
7-
Corresponds to the Java class 'UsedLanguage'.
8+
It is also used in the role of 'UsedLanguage', as defined in the specification.
89
"""
910

11+
# Class-level cache for interning instances
12+
_instances: Dict[Tuple[Optional[str], Optional[str]], 'LanguageVersion'] = {}
13+
_lock = threading.Lock() # Thread-safe access to cache
14+
15+
def __new__(cls, key: Optional[str] = None, version: Optional[str] = None):
16+
# Create cache key
17+
cache_key = (key, version)
18+
19+
# Thread-safe cache lookup
20+
with cls._lock:
21+
if cache_key in cls._instances:
22+
return cls._instances[cache_key]
23+
24+
# Create new instance and cache it
25+
instance = super().__new__(cls)
26+
cls._instances[cache_key] = instance
27+
return instance
28+
1029
def __init__(self, key: Optional[str] = None, version: Optional[str] = None):
11-
self.key = key
12-
self.version = version
30+
# Only initialize if not already initialized (due to interning)
31+
if not hasattr(self, '_initialized'):
32+
self._key = key
33+
self._version = version
34+
self._initialized = True
35+
36+
@classmethod
37+
def of(cls, key: Optional[str] = None, version: Optional[str] = None) -> 'LanguageVersion':
38+
"""
39+
Factory method to get an interned LanguageVersion instance.
40+
This is the preferred way to create LanguageVersion instances.
41+
"""
42+
return cls(key, version)
1343

1444
@staticmethod
1545
def from_language(language):
@@ -51,23 +81,46 @@ def from_meta_pointer(meta_pointer):
5181
raise ValueError("meta_pointer language should not be null")
5282
if meta_pointer.version is None:
5383
raise ValueError("meta_pointer version should not be null")
54-
return LanguageVersion(meta_pointer.language, meta_pointer.version)
84+
return LanguageVersion.of(meta_pointer.language, meta_pointer.version)
5585

5686
def get_key(self) -> Optional[str]:
57-
return self.key
87+
return self._key
5888

5989
def set_key(self, key: str):
60-
self.key = key
90+
raise RuntimeError("LanguageVersion instances are immutable after creation")
6191

6292
def get_version(self) -> Optional[str]:
63-
return self.version
93+
return self._version
6494

6595
def set_version(self, version: str):
66-
self.version = version
96+
raise RuntimeError("LanguageVersion instances are immutable after creation")
97+
98+
@property
99+
def key(self) -> Optional[str]:
100+
return self._key
101+
102+
@property
103+
def version(self) -> Optional[str]:
104+
return self._version
105+
106+
@classmethod
107+
def clear_cache(cls):
108+
"""Clear the interning cache. Useful for testing or memory management."""
109+
with cls._lock:
110+
cls._instances.clear()
111+
112+
@classmethod
113+
def cache_size(cls) -> int:
114+
"""Get the current size of the interning cache."""
115+
with cls._lock:
116+
return len(cls._instances)
67117

68118
def __eq__(self, other):
69119
if not isinstance(other, LanguageVersion):
70120
return False
121+
# Fast path: interning normally makes equal instances the same object; otherwise fall back to comparing fields
122+
if self is other:
123+
return True
71124
return self.key == other.key and self.version == other.version
72125

73126
def __hash__(self):

src/lionweb/serialization/data/serialized_chunk.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ def get_instance_by_id(self, instance_id: str) -> SerializedClassifierInstance:
2727
raise ValueError(f"Cannot find instance with ID {instance_id}")
2828
return instance
2929

30-
def add_language(self, language):
30+
def add_language(self, language: LanguageVersion) -> None:
3131
self.languages.append(language)
3232

3333
def __str__(self):

src/lionweb/serialization/data/serialized_classifier_instance.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ class SerializedClassifierInstance:
1717
properties: List[SerializedPropertyValue] = field(default_factory=list)
1818
containments: List[SerializedContainmentValue] = field(default_factory=list)
1919
references: List[SerializedReferenceValue] = field(default_factory=list)
20-
annotations: List[str] = field(default_factory=list)
20+
annotations: List[Optional[str]] = field(default_factory=list)
2121
parent_node_id: Optional[str] = None
2222

2323
def get_parent_node_id(self):
@@ -57,7 +57,7 @@ def set_property_value(self, property_meta_pointer, serialized_value):
5757
SerializedPropertyValue(property_meta_pointer, serialized_value)
5858
)
5959

60-
def add_children(self, containment_meta_pointer, children_ids: List[str]):
60+
def add_children(self, containment_meta_pointer, children_ids: List[Optional[str]]):
6161
from .serialized_containment_value import SerializedContainmentValue
6262

6363
self.containments.append(
@@ -101,22 +101,22 @@ def get_reference_values(self, reference_meta_pointer) -> List:
101101
return rv.get_value()
102102
return []
103103

104-
def get_containment_values_by_key(self, containment_key: str) -> List[str]:
104+
def get_containment_values_by_key(self, containment_key: str) -> List[Optional[str]]:
105105
for rv in self.containments:
106106
if rv.get_meta_pointer().key == containment_key:
107107
return rv.get_value()
108108
return []
109109

110-
def get_containment_values(self, containment_meta_pointer) -> List[str]:
110+
def get_containment_values(self, containment_meta_pointer) -> List[Optional[str]]:
111111
for cv in self.containments:
112112
if containment_meta_pointer == cv.get_meta_pointer():
113113
return cv.get_value()
114114
return []
115115

116-
def set_annotations(self, annotation_ids: List[str]):
116+
def set_annotations(self, annotation_ids: List[Optional[str]]):
117117
self.annotations = annotation_ids[:]
118118

119-
def add_annotation(self, annotation_id: str):
119+
def add_annotation(self, annotation_id: Optional[str]):
120120
self.annotations.append(annotation_id)
121121

122122
def __eq__(self, other):

src/lionweb/serialization/data/serialized_containment_value.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
1-
from typing import List
1+
from typing import List, Optional
22

33
from lionweb.serialization.data.metapointer import MetaPointer
44

55

66
class SerializedContainmentValue:
7-
def __init__(self, meta_pointer: MetaPointer, value: List[str]):
7+
def __init__(self, meta_pointer: MetaPointer, value: List[Optional[str]]):
88
self.meta_pointer = meta_pointer
99
self.value = value if value is not None else []
1010

@@ -14,10 +14,10 @@ def get_meta_pointer(self) -> MetaPointer:
1414
def set_meta_pointer(self, meta_pointer):
1515
self.meta_pointer = meta_pointer
1616

17-
def get_value(self) -> List[str]:
17+
def get_value(self) -> List[Optional[str]]:
1818
return self.value.copy()
1919

20-
def set_value(self, value: List[str]):
20+
def set_value(self, value: List[Optional[str]]):
2121
self.value = value.copy()
2222

2323
def __eq__(self, other):

src/lionweb/serialization/low_level_json_serialization.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import json
2-
from typing import Iterable, List, cast
2+
from typing import Iterable, List, cast, Optional
33

44
from lionweb import LionWebVersion
55
from lionweb.serialization.data.metapointer import MetaPointer
@@ -190,7 +190,6 @@ def _read_languages(
190190
if isinstance(languages, list):
191191
for element in languages:
192192
try:
193-
language_key_version = LanguageVersion()
194193
if isinstance(element, dict):
195194
extra_keys = set(element.keys()) - {"key", "version"}
196195
if extra_keys:
@@ -207,8 +206,7 @@ def _read_languages(
207206
raise ValueError(
208207
"Both 'key' and 'version' should be strings"
209208
)
210-
language_key_version.key = element.get("key")
211-
language_key_version.version = element.get("version")
209+
language_key_version = LanguageVersion(element.get("key"), element.get("version"))
212210
else:
213211
raise ValueError(
214212
f"Language should be an object. Found: {element}"
@@ -292,11 +290,9 @@ def _deserialize_classifier_instance(
292290

293291
for containment_entry in containments:
294292
containment_obj = cast(JsonObject, containment_entry)
295-
ids = SerializationUtils.try_to_get_array_of_ids(
293+
ids : List[Optional[str]] = SerializationUtils.try_to_get_array_of_ids(
296294
containment_obj, "children"
297-
)
298-
if ids is None:
299-
ids = []
295+
) or []
300296
mp = SerializationUtils.try_to_get_meta_pointer_property(
301297
containment_obj, "containment"
302298
)

src/lionweb/serialization/protobuf_serialization.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,8 @@
33
import lionweb.serialization.proto.Chunk_pb2 as pb
44
from lionweb.lionweb_version import LionWebVersion
55
from lionweb.serialization import MetaPointer, SerializedClassifierInstance, SerializedPropertyValue, \
6-
SerializedContainmentValue, SerializedReferenceValue
6+
SerializedContainmentValue, SerializedReferenceValue, SerializationChunk
77
from lionweb.serialization.data import LanguageVersion
8-
from lionweb.serialization.data import SerializationChunk
98
from lionweb.serialization.data.serialized_reference_value import SerializedReferenceValueEntry
109
from lionweb.serialization.deserialization_exception import DeserializationException
1110

@@ -14,6 +13,7 @@
1413
from lionweb.serialization.data import SerializationChunk
1514

1615

16+
1717
# from io.lionweb.serialization.data import (
1818
# SerializationChunk,
1919
# SerializedClassifierInstance,
@@ -68,12 +68,15 @@ def __init__(self, lionweb_version: Optional["LionWebVersion"] = LionWebVersion.
6868
# return self._deserialize_serialization_chunk_to_instances(serialization_chunk)
6969
#
7070

71-
def read_chunk_from_bytes(self, data: bytes) -> pb.PBChunk:
71+
def read_pbchunk_from_bytes(self, data: bytes) -> pb.PBChunk:
7272
"""Read a protobuf Chunk from binary content"""
7373
self._chunk_instance.Clear() # Reset the instance
7474
self._chunk_instance.ParseFromString(data)
7575
return self._chunk_instance
7676

77+
def read_chunk_from_bytes(self, data: bytes) -> SerializationChunk:
78+
return self._deserialize_pbchunk_to_serialization_chunk(self.read_pbchunk_from_bytes(data))
79+
7780

7881
def _deserialize_pbchunk_to_serialization_chunk(self, chunk: pb.PBChunk) -> SerializationChunk:
7982
# Pre-size arrays as in Java
@@ -101,17 +104,19 @@ def _deserialize_pbchunk_to_serialization_chunk(self, chunk: pb.PBChunk) -> Seri
101104
f"Unable to deserialize meta pointer with language {mp.li_language}"
102105
)
103106
language_version = languages_array[mp.li_language]
107+
language_key : Optional[str] = language_version.key if language_version is not None else None
108+
language_v : Optional[str]= language_version.version if language_version is not None else None
104109
meta_pointer = MetaPointer(
105-
language_version.key, language_version.version, strings_array[mp.si_key]
110+
language_key, language_v, strings_array[mp.si_key]
106111
)
107112
metapointers_array[i] = meta_pointer
108113

109114
serialization_chunk = SerializationChunk()
110115
serialization_chunk.serialization_format_version = chunk.serialization_format_version
111116

112-
for lv in languages_array:
113-
if lv is not None:
114-
serialization_chunk.add_language(lv)
117+
valid_languages = [lv for lv in languages_array if lv is not None]
118+
for lv in valid_languages:
119+
serialization_chunk.add_language(lv)
115120

116121
# Nodes
117122
for n in chunk.nodes:
@@ -131,7 +136,7 @@ def _deserialize_pbchunk_to_serialization_chunk(self, chunk: pb.PBChunk) -> Seri
131136

132137
# containments
133138
for c in n.containments:
134-
children: List[str] = []
139+
children: List[Optional[str]] = []
135140
for child_index in c.si_children:
136141
if child_index == 0:
137142
raise DeserializationException(
@@ -344,3 +349,4 @@ def _deserialize_pbchunk_to_serialization_chunk(self, chunk: pb.PBChunk) -> Seri
344349
# chunk.interned_meta_pointers.append(pmp)
345350
#
346351
# return chunk
352+

0 commit comments

Comments
 (0)