forked from taishan1994/OneRel_chinese
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathinspect_input.py
More file actions
38 lines (33 loc) · 1.37 KB
/
inspect_input.py
File metadata and controls
38 lines (33 loc) · 1.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
## Check whether the input files are valid.
"""
1. all relationships have been included in the rel2id.json file.
2. every item has a non-empty "text" and a non-empty "triple_list".
"""
import json
import os
DATA_DIR = "data/corpus3"
FILES = ("train_triples.json", "dev_triples.json", "test_triples.json")
# Items whose text is at most this many characters long are printed so they
# can be inspected by hand.
SHORT_TEXT_LEN = 1


def load_json(path):
    """Return the parsed content of the JSON file at *path*.

    The file is decoded explicitly as UTF-8 (instead of the platform locale)
    and the handle is closed as soon as parsing finishes.
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def find_problems(data, known_relations, file_name):
    """Collect the validation error messages for one data file.

    Args:
        data: iterable of items, each providing "text" and "triple_list".
        known_relations: container of valid relation names; ``triple[1]`` of
            every triple must be a member of it.
        file_name: name of the file being checked, used in the messages.

    Returns:
        A list of error strings, in the order the problems are encountered.
    """
    problems = []
    for item in data:
        for triple in item["triple_list"]:
            relation = triple[1]
            if relation not in known_relations:
                problems.append(f"Error: {relation} not in rel2id.json")
        if not item["text"]:
            problems.append(f"Error: empty text in {file_name}")
        if not item["triple_list"]:
            problems.append(f"Error: empty triple_list in {file_name}")
    return problems


def text_length_stats(data, short_len=SHORT_TEXT_LEN):
    """Summarise the lengths of the "text" field across *data*.

    Args:
        data: sequence of items, each providing "text".
        short_len: items whose text length is <= this value are reported.

    Returns:
        A tuple ``(max_len, min_len, short_items)``. ``min_len`` is the true
        minimum over the data rather than a value capped at 1. Both lengths
        are 0 when *data* is empty.
    """
    lengths = [len(item["text"]) for item in data]
    short_items = [
        item for item, length in zip(data, lengths) if length <= short_len
    ]
    return max(lengths, default=0), min(lengths, default=0), short_items


def main(data_dir=DATA_DIR, files=FILES):
    """Validate every file in *files* under *data_dir* and print a report.

    For each file this prints the validation errors, then the suspiciously
    short items, then the maximum and minimum text length.
    """
    rel2id = load_json(os.path.join(data_dir, "rel2id.json"))
    # Element 1 of rel2id.json is what relation names are checked against;
    # presumably the file is a two-element list whose second entry maps
    # relation name -> id (TODO confirm against the data loader).
    known_relations = rel2id[1]

    for file_name in files:
        data = load_json(os.path.join(data_dir, file_name))

        # Check that all relationships are included in rel2id.json and that
        # no item has an empty text or an empty triple_list.
        for message in find_problems(data, known_relations, file_name):
            print(message)

        # Check that the text lengths are reasonable.
        max_text_len, min_text_len, short_items = text_length_stats(data)
        for item in short_items:
            print(item)
        print(f"Max text length: {max_text_len}")
        print(f"Min text length: {min_text_len}")


if __name__ == "__main__":
    main()