Fix bugs, improve arg parse, update baseline input

AmeWenJ · AmeWenJ · commit d024a19f0714 · 2025-07-22T01:17:19.000Z
diff --git a/silnlp/nmt/exp_summary.py b/silnlp/nmt/exp_summary.py
@@ -11,19 +11,56 @@
 from .config import get_mt_exp_dir
 
 chap_num = 0
+trained_books = []
+target_book = ""
+all_books = []
+metrics = []
+key_word = ""
 
 
-def extract_data(filename, metrics, target_book, header_row=5) -> dict:
+def read_data(file_path, data, chapters):
     global chap_num
+    global all_books
+    global key_word
+
+    for lang_pair in os.listdir(file_path):
+        lang_pattern = re.compile(r"([\w-]+)\-([\w-]+)")
+        if not lang_pattern.match(lang_pair):
+            continue
+
+        data[lang_pair] = {}
+        prefix = "+".join(all_books)
+        pattern = re.compile(rf"^{re.escape(prefix)}_{key_word}_order_(\d+)_ch$")
+
+        for groups in os.listdir(os.path.join(file_path, lang_pair)):
+            m = pattern.match(os.path.basename(groups))
+            if m:
+                folder_path = os.path.join(file_path, lang_pair, os.path.basename(groups))
+                diff_pred_file = glob.glob(os.path.join(folder_path, "diff_predictions*"))
+                if diff_pred_file:
+                    r = extract_data(diff_pred_file[0])
+                    data[lang_pair][int(m.group(1))] = r
+                    chapters.append(int(m.group(1)))
+                    if int(m.group(1)) > chap_num:
+                        chap_num = int(m.group(1))
+                else:
+                    print(folder_path + " has no diff_predictions file.")
+
+
+def extract_data(filename, header_row=5) -> dict:
+    global chap_num
+    global metrics
+    global target_book
 
     metrics = [m.lower() for m in metrics]
     df = pd.read_excel(filename, header=header_row)
     df.columns = [col.strip().lower() for col in df.columns]
 
     result = {}
+    metric_warning = False
     for _, row in df.iterrows():
         vref = row["vref"]
-        m = re.match(r"([A-Za-z]+)\s+(\d+)", str(vref))
+        m = re.match(r"(\d?[A-Z]{2,3}) (\d+)", str(vref))
 
         book_name, chap = m.groups()
         if book_name != target_book:
@@ -37,17 +74,22 @@ def extract_data(filename, metrics, target_book, header_row=5) -> dict:
             if metric in row:
                 values.append(row[metric])
             else:
-                print("Warning: {metric} is not calculated in {filename}")
+                metric = True
                 values.append(None)
 
         result[int(chap)] = values
+
+    if metric_warning:
+        print("Warning: {metric} is not calculated in {filename}")
+
     return result
 
 
-def flatten_dict(data, metrics, chapters) -> list:
+def flatten_dict(data, chapters, baseline={}) -> list:
     global chap_num
+    global metrics
 
-    res = []
+    rows = []
     for lang_pair in data:
         for chap in range(1, chap_num + 1):
             row = [lang_pair, chap]
@@ -60,12 +102,16 @@ def flatten_dict(data, metrics, chapters) -> list:
                     for m in range(len(metrics)):
                         index_m = 3 + 1 + len(metrics) + chapters.index(res_chap) * (len(metrics) * 3 + 1) + m * 3
                         row[index_m] = data[lang_pair][res_chap][chap][m]
-            res.append(row)
-    return res
+            if len(baseline) > 0:
+                for m in range(len(metrics)):
+                    row[3 + m] = baseline[lang_pair][chap][m]
+            rows.append(row)
+    return rows
 
 
-def create_xlsx(rows, metrics, chapters, output_path):
+def create_xlsx(rows, chapters, output_path):
     global chap_num
+    global metrics
 
     wb = Workbook()
     ws = wb.active
@@ -104,8 +150,9 @@ def create_xlsx(rows, metrics, chapters, output_path):
             ws.cell(row=2, column=col + i, value=sub_header)
 
         col += len(sub_headers)
-        for row in rows:
-            ws.append(row)
+
+    for row in rows:
+        ws.append(row)
 
     for row_idx in [1, 2]:
         for col in range(1, ws.max_column + 1):
@@ -118,6 +165,12 @@ def create_xlsx(rows, metrics, chapters, output_path):
 
     cur_lang_pair = 3
     for row_idx in range(3, ws.max_row + 1):
+        if ws.cell(row=row_idx, column=4).value is not None:
+            ws.cell(row=row_idx, column=3).value = (
+                f"=RANK.EQ(D{row_idx}, INDEX(D:D, INT((ROW(D{row_idx})-3)/{chap_num})*{chap_num}+3):INDEX(D:D, \
+                        INT((ROW(D{row_idx})-3)/{chap_num})*{chap_num}+{chap_num}+2), 0)"
+            )
+
         start_col = 3 + len(metrics) + 1
         end_col = ws.max_column
 
@@ -164,60 +217,73 @@ def create_xlsx(rows, metrics, chapters, output_path):
     wb.save(output_path)
 
 
+# Sample command (a single command, wrapped here with a shell line continuation):
+# python -m silnlp.nmt.exp_summary Catapult_Reloaded_Confidences \
+#     --trained-books MRK --target-book MAT --metrics chrf3 confidence --key-word conf --baseline Catapult_Reloaded/2nd_book/MRK
 def main() -> None:
     global chap_num
+    global trained_books
+    global target_book
+    global all_books
+    global metrics
+    global key_word
 
-    # TODO: Add args for books, metrics, key word, baseline
     parser = argparse.ArgumentParser(description="Pull results")
     parser.add_argument("exp1", type=str, help="Experiment folder")
+    parser.add_argument(
+        "--trained-books", nargs="*", required=True, type=str.upper, help="Books that are trained in the exp"
+    )
+    parser.add_argument("--target-book", required=True, type=str.upper, help="Book that is going to be analyzed")
+    parser.add_argument(
+        "--metrics",
+        nargs="*",
+        metavar="metrics",
+        default=["chrf3", "confidence"],
+        type=str.lower,
+        help="Metrics that will be analyzed with",
+    )
+    parser.add_argument("--key-word", type=str, default="conf", help="Key word in the filename for the exp group")
+    parser.add_argument("--baseline", type=str, help="Baseline for the exp group")
     args = parser.parse_args()
 
-    trained_books = ["MRK"]
-    target_book = ["MAT"]
-    all_books = trained_books + target_book
-
-    metrics = ["chrf3", "confidence"]
-
-    key_word = "conf"
+    trained_books = args.trained_books
+    target_book = args.target_book
+    all_books = trained_books + [target_book]
+    metrics = args.metrics
+    key_word = args.key_word
 
     exp1_name = args.exp1
     exp1_dir = get_mt_exp_dir(exp1_name)
 
+    exp2_name = args.baseline
+    exp2_dir = get_mt_exp_dir(exp2_name) if exp2_name else None
+
     folder_name = "+".join(all_books)
     os.makedirs(os.path.join(exp1_dir, "a_result_folder"), exist_ok=True)
     output_path = os.path.join(exp1_dir, "a_result_folder", f"{folder_name}.xlsx")
 
     data = {}
     chapters = []
+    read_data(exp1_dir, data, chapters)
+    chapters = sorted(set(chapters))
 
-    for lang_pair in os.listdir(exp1_dir):
-        lang_pattern = re.compile(r"([\w-]+)\-([\w-]+)")
-        if not lang_pattern.match(lang_pair):
-            continue
-
-        data[lang_pair] = {}
-        prefix = "+".join(all_books)
-        pattern = re.compile(rf"^{re.escape(prefix)}_{key_word}_order_(\d+)_ch$")
-
-        for groups in os.listdir(os.path.join(exp1_dir, lang_pair)):
-            m = pattern.match(os.path.basename(groups))
-            if m:
-                base_name = "diff_predictions"
-                folder_path = os.path.join(exp1_dir, lang_pair, os.path.basename(groups))
-                diff_pred_file = glob.glob(os.path.join(folder_path, f"{base_name}*"))
-                if diff_pred_file:
-                    r = extract_data(diff_pred_file[0], metrics, target_book[0])
-                    data[lang_pair][int(m.group(1))] = r
-                    chapters.append(int(m.group(1)))
-                    if int(m.group(1)) > chap_num:
-                        chap_num = int(m.group(1))
-                else:
-                    print(os.path.basename(groups) + " has no diff_predictions file.")
+    baseline_data = {}
+    if exp2_dir:
+        for lang_pair in os.listdir(exp2_dir):
+            lang_pattern = re.compile(r"([\w-]+)\-([\w-]+)")
+            if not lang_pattern.match(lang_pair):
+                continue
+
+            baseline_path = os.path.join(exp2_dir, lang_pair)
+            baseline_diff_pred = glob.glob(os.path.join(baseline_path, "diff_predictions*"))
+            if baseline_diff_pred:
+                baseline_data[lang_pair] = extract_data(baseline_diff_pred[0])
+            else:
+                print(f"Baseline experiment has no diff_predictions file in {baseline_path}")
 
-    chapters = sorted(set(chapters))
     print("Writing data...")
-    rows = flatten_dict(data, metrics, chapters)
-    create_xlsx(rows, metrics, chapters, output_path)
+    rows = flatten_dict(data, chapters, baseline=baseline_data)
+    create_xlsx(rows, chapters, output_path)
     print(f"Result is in {output_path}")