-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathverify_encoding.py
29 lines (22 loc) · 964 Bytes
/
verify_encoding.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# verify_encoding.py
import json
def verify_encoding(vocab_path, merges_path):
    """Report tokens used by the merges file that the vocabulary lacks.

    The vocabulary file is parsed as JSON and its top-level keys are taken
    as the known tokens. Every whitespace-separated piece on each
    non-blank line of the merges file is collected and compared against
    those keys. One line is printed per missing token, in sorted order so
    the output is reproducible between runs.

    Args:
        vocab_path: Path to the UTF-8 JSON vocabulary file.
        merges_path: Path to the UTF-8 merges text file.

    Returns:
        A sorted list of tokens found in the merges file but not in the
        vocabulary (empty when everything matches), or ``None`` when the
        files could not be read or parsed; in that case an error line is
        printed instead.
    """
    # Keep the try body limited to the I/O and parsing that can fail:
    # OSError covers missing/unreadable files, ValueError covers both
    # invalid JSON (JSONDecodeError) and invalid UTF-8 (UnicodeDecodeError).
    try:
        with open(vocab_path, 'r', encoding='utf-8') as vocab_file:
            vocab = json.load(vocab_file)
        with open(merges_path, 'r', encoding='utf-8') as merges_file:
            merge_lines = merges_file.readlines()
    except (OSError, ValueError) as e:
        print(f"Error verifying encoding: {e}")
        return None

    if not isinstance(vocab, dict):
        print(
            "Error verifying encoding: vocabulary must be a JSON object, "
            f"got {type(vocab).__name__}"
        )
        return None

    merge_tokens = set()
    for line in merge_lines:
        stripped = line.strip()
        # Skip blank lines and the "#version" header only. Merge rules may
        # themselves start with '#', so other '#'-prefixed lines must still
        # be checked rather than treated as comments.
        if not stripped or stripped.startswith("#version"):
            continue
        merge_tokens.update(stripped.split())

    # Sort so the report does not depend on set iteration order.
    missing_tokens = sorted(merge_tokens.difference(vocab))
    for token in missing_tokens:
        print(f"Token in merges.txt but not in vocab.json: {token}")
    return missing_tokens
if __name__ == "__main__":
    # Script entry point: check the converted model's tokenizer files
    # at their fixed locations relative to the working directory.
    verify_encoding(
        vocab_path="./converted_model/vocab.json",
        merges_path="./converted_model/cleaned_merges.txt",
    )