Skip to content

Added TensorFlow-free npy and h5 weight conversions #36

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 14 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,5 @@ Models/Burmese_temp_genvec/
Models/Burmese_model4_version2/
Models/Other/
*~
venv/
convert_weights.py
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
48 changes: 48 additions & 0 deletions convert_weights.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import os
import numpy as np
import tensorflow as tf # Needed to handle TF tensors
import h5py

def convert_to_numpy(value):
    """
    Convert a single weight entry to a plain float32 NumPy array.

    Any object exposing a callable ``.numpy()`` method (e.g. ``tf.Tensor``
    or ``tf.Variable``) is converted via that method — duck typing here
    avoids referencing TensorFlow classes directly, in line with the goal
    of producing TensorFlow-free artifacts. Numeric NumPy arrays are cast
    to float32. Anything else is treated as non-numeric metadata.

    Parameters
    ----------
    value : object
        One entry loaded from ``weights.npy``.

    Returns
    -------
    np.ndarray or None
        A float32 array, or ``None`` when the entry is not numeric.
    """
    # Covers tf.Tensor / tf.Variable (and similar tensor wrappers) without
    # a hard isinstance() dependency on TensorFlow types.
    to_numpy = getattr(value, "numpy", None)
    if callable(to_numpy):
        return to_numpy().astype(np.float32)
    if isinstance(value, np.ndarray) and np.issubdtype(value.dtype, np.number):
        return value.astype(np.float32)
    return None  # Ignore non-numeric data

def convert_weights(npy_path):
    """
    Convert a ``weights.npy`` file to a TensorFlow-free HDF5 file.

    The output is written next to the input, with the ``.npy`` suffix
    replaced by ``_tf_free.h5``. Each numeric weight becomes a dataset
    named ``weight_<i>`` (in original order); non-numeric entries are
    skipped. Missing inputs are reported and silently skipped.

    Parameters
    ----------
    npy_path : str
        Path to the ``weights.npy`` file to convert.
    """
    if not os.path.exists(npy_path):
        print(f"❌ Error: {npy_path} not found!")
        return

    h5_path = npy_path.replace(".npy", "_tf_free.h5")

    # Load the weights. allow_pickle is required because the saved array
    # holds heterogeneous Python objects, not a plain numeric ndarray.
    print(f"🔍 Loading {npy_path}...")
    weights = np.load(npy_path, allow_pickle=True)

    # Convert each entry exactly once. (The original converted every entry
    # twice: once inside the filter condition and once for the kept value.)
    converted_weights = []
    for w in weights:
        arr = convert_to_numpy(w)
        if arr is not None:
            converted_weights.append(arr)

    # Save to HDF5 format, one dataset per weight tensor.
    with h5py.File(h5_path, "w") as hf:
        for i, w in enumerate(converted_weights):
            hf.create_dataset(f"weight_{i}", data=w)

    print(f"✅ Converted: {npy_path} -> {h5_path}")

if __name__ == "__main__":
# Search for all `weights.npy` files and convert them
for root, _, files in os.walk("."):
for file in files:
if file == "weights.npy":
convert_weights(os.path.join(root, file))

print("πŸš€ All weight files converted successfully!")

4,244 changes: 4,244 additions & 0 deletions h origin fix-h5-weights

Large diffs are not rendered by default.

66 changes: 33 additions & 33 deletions lstm_word_segmentation/word_segmenter.py
Original file line number Diff line number Diff line change
Expand Up @@ -601,41 +601,41 @@ def save_model(self):
This function saves the current trained model of this word_segmenter instance.
"""
# Save the model using Keras
model_path = (Path.joinpath(Path(__file__).parent.parent.absolute(), "Models/" + self.name))
tf.saved_model.save(self.model, model_path)

# Save one np array that holds all weights
file = Path.joinpath(Path(__file__).parent.parent.absolute(), "Models/" + self.name + "/weights")
np.save(str(file), self.model.weights)

# Save the model in json format, that has both weights and grapheme clusters dictionary
json_file = Path.joinpath(Path(__file__).parent.parent.absolute(), "Models/" + self.name + "/weights.json")
with open(str(json_file), 'w') as wfile:
output = dict()
output["model"] = self.name
model_dir = Path(__file__).parent.parent / "Models" / self.name
model_dir.mkdir(parents=True, exist_ok=True)

# Save model as .h5 file
self.model.save(model_dir / "weights.h5")

# Save weights as a NumPy file
np.save(model_dir / "weights.npy", np.array([w.numpy() for w in self.model.weights], dtype=object))

# Save the model in JSON format
json_file = model_dir / "weights.json"
with open(json_file, 'w') as wfile:
output = {"model": self.name}

# Store grapheme clusters or codepoints
if "grapheme_clusters" in self.embedding_type:
output["dic"] = self.graph_clust_dic
output["dic"] = self.graph_clust_dic
elif "codepoints" in self.embedding_type:
if self.language == "Thai":
output["dic"] = constants.THAI_CODE_POINT_DICTIONARY
if self.language == "Burmese":
output["dic"] = constants.BURMESE_CODE_POINT_DICTIONARY
for i in range(len(self.model.weights)):
dic_model = dict()
dic_model["v"] = 1
mat = self.model.weights[i].numpy()
dim0 = mat.shape[0]
dim1 = 1
if len(mat.shape) == 1:
dic_model["dim"] = [dim0]
else:
dim1 = mat.shape[1]
dic_model["dim"] = [dim0, dim1]
serial_mat = np.reshape(mat, newshape=[dim0 * dim1])
serial_mat = serial_mat.tolist()
dic_model["data"] = serial_mat
output["mat{}".format(i+1)] = dic_model
json.dump(output, wfile)
output["dic"] = (
constants.THAI_CODE_POINT_DICTIONARY if self.language == "Thai"
else constants.BURMESE_CODE_POINT_DICTIONARY if self.language == "Burmese"
else {}
)

# Serialized weights into JSON format
for i, weight in enumerate(self.model.weights, start=1):
mat = weight.numpy()
output[f"mat{i}"] = {
"v": 1,
"dim": list(mat.shape),
"data": mat.flatten().tolist()
}

# Write JSON output to file
json.dump(output, wfile, indent=4)

def set_model(self, input_model):
"""
Expand Down