Skip to content

Commit 34b37cb

Browse files
committed
update scripts
1 parent 67cd9fd commit 34b37cb

2 files changed

Lines changed: 26 additions & 19 deletions

File tree

scripts/extract_key.py

Lines changed: 24 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,63 +1,68 @@
11
import json
22

3+
34
def _key_duration(key):
    """Return ``end - start`` parsed from the last two '_'-separated fields of *key*.

    Raises:
        ValueError: If *key* has fewer than two '_'-separated fields or the
            fields are not parseable as floats.
        AttributeError: If *key* is not a string.
    """
    start, end = key.split("_")[-2:]
    return float(end) - float(start)


def extract_keys_from_jsonl(
    input_file,
    output_file,
    min_words=16,
    min_duration=10,
    max_duration=30,
):
    """
    Extract only the 'key' values from a JSONL file and save them to a text file.

    A record is kept when its 'ref_text' has at least ``min_words``
    whitespace-separated tokens and the duration encoded in the last two
    '_'-separated fields of its 'key' (``..._<start>_<end>``, presumably
    seconds) lies within ``[min_duration, max_duration]``.

    Records that cannot be evaluated (invalid JSON, missing 'ref_text',
    missing or malformed 'key') are reported and skipped individually, so a
    single bad line no longer discards the whole extraction.

    Args:
        input_file (str): Path to the input JSONL file
        output_file (str): Path to the output text file
        min_words (int): Minimum number of words required in 'ref_text'
        min_duration (float): Minimum allowed ``end - start`` (inclusive)
        max_duration (float): Maximum allowed ``end - start`` (inclusive)

    Returns:
        list: The keys written to ``output_file`` (one per line), or an empty
        list if the input could not be read or the output could not be written.
    """
    keys = []

    try:
        with open(input_file, "r", encoding="utf-8") as f:
            for line_num, line in enumerate(f, 1):
                line = line.strip()
                if not line:  # Skip empty lines
                    continue

                try:
                    data = json.loads(line)
                except json.JSONDecodeError as e:
                    print(f"Error parsing JSON on line {line_num}: {e}")
                    continue

                try:
                    if len(data["ref_text"].split()) < min_words:
                        continue
                    if "key" not in data:
                        print(f"Warning: No 'key' field found in line {line_num}")
                        continue
                    key = data["key"]
                    # Parse the duration once instead of once per comparison.
                    duration = _key_duration(key)
                except (KeyError, TypeError, ValueError, AttributeError) as e:
                    # Malformed record: skip it rather than abort the whole file.
                    print(
                        f"Warning: Skipping malformed record on line {line_num}: {e!r}"
                    )
                    continue

                if duration > max_duration or duration < min_duration:
                    continue
                keys.append(key)

    except FileNotFoundError:
        print(f"Error: Input file '{input_file}' not found")
        return []
    except Exception as e:
        # Boundary handler: this function reports failures instead of raising.
        print(f"Error processing file: {e}")
        return []

    # Write keys to output file. Handled separately from reading so that a
    # missing output directory is not misreported as a missing input file.
    try:
        with open(output_file, "w", encoding="utf-8") as f:
            f.writelines(f"{key}\n" for key in keys)
    except Exception as e:
        print(f"Error processing file: {e}")
        return []

    print(f"Successfully extracted {len(keys)} keys to '{output_file}'")
    return keys
5155

56+
5257
# Example usage:
5358
if __name__ == "__main__":
5459
# Replace 'input.jsonl' with your actual input file path
5560
# Replace 'keys_only.txt' with your desired output file path
5661
input_filename = "filtered_results/deletion_error_lt_0.05.jsonl"
5762
output_filename = "deletion_error_lt0.05_300h.txt"
58-
63+
5964
extracted_keys = extract_keys_from_jsonl(input_filename, output_filename)
60-
65+
6166
# Optional: Print the first few keys as a preview
6267
if extracted_keys:
6368
print(f"\nFirst few keys extracted:")
@@ -66,15 +71,16 @@ def extract_keys_from_jsonl(input_file, output_file):
6671
if len(extracted_keys) > 5:
6772
print(f"... and {len(extracted_keys) - 5} more keys")
6873

74+
6975
# Alternative one-liner approach using list comprehension:
7076
def extract_keys_one_liner(input_file, output_file):
    """
    Compact version to extract keys from a JSONL file (no filtering).

    Every non-blank line of ``input_file`` is parsed as JSON and its 'key'
    value is collected; the keys are written to ``output_file`` joined by
    newlines (no trailing newline). Any failure is printed, not raised.

    Args:
        input_file (str): Path to the input JSONL file
        output_file (str): Path to the output text file
    """
    try:
        # Parse everything before touching the output file, so a bad input
        # line cannot leave an existing output file truncated.
        with open(input_file, "r", encoding="utf-8") as f_in:
            keys = [json.loads(line)["key"] for line in f_in if line.strip()]
        with open(output_file, "w", encoding="utf-8") as f_out:
            f_out.write("\n".join(keys))
        print(f"Extracted {len(keys)} keys using one-liner approach")
    except Exception as e:
        print(f"Error: {e}")

scripts/postprocess/filter_wer.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,8 @@ def process_jsonl_file(input_file, output_dir="filtered_results"):
8484
"wer_lt_0.05": lambda x: x["wer"] < 0.05,
8585
"insertion_error_lt_0.05": lambda x: x["insertion_error"] < 0.05,
8686
"deletion_error_lt_0.05": lambda x: x["deletion_error"] < 0.05,
87-
"deletion_inseration_error_lt_0.05": lambda x: x["insertion_error"] < 0.05 and x["deletion_error"] < 0.05,
87+
"deletion_inseration_error_lt_0.05": lambda x: x["insertion_error"] < 0.05
88+
and x["deletion_error"] < 0.05,
8889
}
8990

9091
# Apply filters and save results

0 commit comments

Comments
 (0)