2 changes: 2 additions & 0 deletions debug.log
@@ -0,0 +1,2 @@
[0529/225757.623:ERROR:registration_protocol_win.cc(108)] CreateFile: The system cannot find the file specified. (0x2)
[0529/225806.839:ERROR:registration_protocol_win.cc(108)] CreateFile: The system cannot find the file specified. (0x2)
143 changes: 143 additions & 0 deletions silnlp/common/bulk_extract_local.py
@@ -0,0 +1,143 @@
import argparse
import logging
import sys
from pathlib import Path
from typing import List

from machine.corpora import FileParatextProjectSettingsParser
from machine.scripture import ORIGINAL_VERSIFICATION, VerseRef, get_books

from ..common.environment import SIL_NLP_ENV
from .corpus import count_lines
from .paratext import check_versification, extract_project, extract_term_renderings

LOGGER = logging.getLogger(__package__ + ".bulk_extract_local")
SETTINGS_FILENAME = "Settings.xml"
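
# A rough usage sketch (hypothetical paths; assumes the silnlp package is importable):
#   python -m silnlp.common.bulk_extract_local <projects_folder> <corpus_folder> --terms <terms_folder> --include NT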


def parse_settings(project: Path):
    """Parse the project's Settings.xml and return the settings, or None on failure."""
    settings_file_path = project / SETTINGS_FILENAME
    if not settings_file_path.is_file():
        LOGGER.warning(f"{SETTINGS_FILENAME} not found in {project}.")
        return None

try:
parser = FileParatextProjectSettingsParser(str(project))
project_settings = parser.parse()

        # Other settings available on project_settings include: name, full_name,
        # encoding, versification, file_name_prefix/form/suffix,
        # biblical_terms_list_type, biblical_terms_project_name,
        # biblical_terms_file_name, and language_code.

    except Exception as e:
        LOGGER.error(f"Error parsing {SETTINGS_FILENAME}: {e}")
return None

return project_settings


def get_expected_verse_count(project: Path, include: List[str], exclude: List[str]) -> int:
include_books_set = get_books(include) if len(include) > 0 else None
exclude_books_set = get_books(exclude) if len(exclude) > 0 else None
project_settings = parse_settings(project)

    # Fall back to the original versification when the settings are missing or specify none.
    versification = ORIGINAL_VERSIFICATION
    if project_settings is not None and project_settings.versification:
        versification = project_settings.versification
        versification_name = getattr(versification, "name", str(versification))
        LOGGER.info(f"Found versification {versification_name} in {SETTINGS_FILENAME} for {project}")

def filter_lines(verse_ref_str: str) -> bool:
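        # Keep a verse unless its book is in the exclude list; when an include list
        # is given, keep only verses whose book is in it.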
if include_books_set is None and exclude_books_set is None:
return True

        vref = VerseRef.from_string(verse_ref_str.strip(), versification)
if exclude_books_set is not None and vref.book_num in exclude_books_set:
return False

if include_books_set is not None and vref.book_num in include_books_set:
return True

return include_books_set is None

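    # Count the verses in vref.txt (the canonical verse reference list) that pass the filter.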
return count_lines(SIL_NLP_ENV.assets_dir / "vref.txt", filter_lines)


def has_settings_file(project_folder: Path) -> bool:
return (project_folder / SETTINGS_FILENAME).is_file() or (project_folder / SETTINGS_FILENAME.lower()).is_file()


def main() -> None:
parser = argparse.ArgumentParser(description="Extracts text corpora from Paratext projects")
parser.add_argument("input", type=str, help="The input folder.")
parser.add_argument("output", type=str, help="The output corpus folder.")
parser.add_argument("--terms", type=str, required=True, help="The output terms folder.")
parser.add_argument(
"--include", metavar="books", nargs="+", default=[], help="The books to include; e.g., 'NT', 'OT', 'GEN'"
)
parser.add_argument(
"--exclude", metavar="books", nargs="+", default=[], help="The books to exclude; e.g., 'NT', 'OT', 'GEN'"
)
parser.add_argument("--markers", default=False, action="store_true", help="Include USFM markers")
parser.add_argument("--lemmas", default=False, action="store_true", help="Extract lemmas if available")
parser.add_argument("--project-vrefs", default=False, action="store_true", help="Extract project verse refs")

args = parser.parse_args()

input_path = Path(args.input)
output_path = Path(args.output)
terms_path = Path(args.terms)

if not input_path.is_dir():
print(f"Error: Projects folder not found: {args.input}")
sys.exit(1)

if not output_path.is_dir():
print(f"Error: Output folder not found: {args.output}")
sys.exit(1)

if not terms_path.is_dir():
print(f"Error: Output terms folder not found: {args.terms}")
sys.exit(1)

    # Find the project folders that contain a Settings.xml file.
projects = [folder for folder in input_path.glob("*") if folder.is_dir() and has_settings_file(folder)]

    # Process the projects that were found and report progress to the user.
    if projects:
for project in projects:
LOGGER.info(f"Extracting {project} to {output_path}")
expected_verse_count = get_expected_verse_count(project, args.include, args.exclude)

check_versification(project)
corpus_filename, verse_count = extract_project(
project,
output_path,
args.include,
args.exclude,
args.markers,
args.lemmas,
args.project_vrefs,
)

# check if the number of lines in the file is correct (the same as vref.txt)
LOGGER.info(f"# of Verses: {verse_count}")
if verse_count != expected_verse_count:
LOGGER.error(f"The number of verses is {verse_count}, but should be {expected_verse_count}.")
terms_count = extract_term_renderings(project, corpus_filename, terms_path)
LOGGER.info(f"# of Terms: {terms_count}")
LOGGER.info("Done.")
else:
LOGGER.warning(f"Couldn't find any data to process for any project in {input_path}.")


if __name__ == "__main__":
main()
3 changes: 2 additions & 1 deletion silnlp/common/check_books.py
@@ -50,7 +50,8 @@ def parse_book(project_dir: str, book: str):

settings = FileParatextProjectSettingsParser(project_dir).parse()
book_path = Path(project_dir) / settings.get_book_file_name(book)

LOGGER.info(f"Attempting to parse {book} from {book_path}.")

if not book_path.is_file():
raise RuntimeError(f"Can't find file {book_path} for book {book}")

116 changes: 116 additions & 0 deletions silnlp/common/combine_scores_save.py
@@ -0,0 +1,116 @@
import argparse
import csv
import sys
from collections import defaultdict
from pathlib import Path

import pandas as pd

from ..common.environment import SIL_NLP_ENV
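
# A rough usage sketch (hypothetical folder; assumes the silnlp package is importable):
#   python -m silnlp.common.combine_scores_save <experiments_folder> --output_filename scores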


def check_for_lock_file(folder: Path, filename: str, file_type: str):
"""Check for lock files and ask the user to close them then exit."""

if file_type[0] == ".":
file_type = file_type[1:]

    if file_type.lower() == "csv":
        lockfile = folder / f".~lock.{filename}.{file_type}#"
    elif file_type.lower() == "xlsx":
        lockfile = folder / f"~${filename}.{file_type}"
    else:
        # Unknown file type: nothing to check.
        return

    if lockfile.is_file():
        print(f"Found lock file: {lockfile}")
        print(f"Please close {filename}.{file_type} in folder {folder} OR delete the lock file and try again.")
        sys.exit(1)


def aggregate_csv(folder_path):
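    """Collect rows from every scores-*.csv under folder_path, grouped by their header row."""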
# Dictionary to store rows by header type
data_by_header = defaultdict(list)

# Iterate over all CSV files in the folder and its subfolders
for csv_file in folder_path.rglob("*/scores-*.csv"):
series = csv_file.parts[-3] # Extract series folder name
experiment = csv_file.parts[-2] # Extract experiment folder name
steps = csv_file.stem.split("-")[-1] # Extract steps from file name

# Read the CSV file and add new columns
with open(csv_file, "r") as f:
reader = csv.reader(f)
rows = list(reader)
header = tuple(rows[0]) # Use tuple to make it hashable

# Add columns to the beginning of each row
if header not in data_by_header:
data_by_header[header].append(["Series", "Experiment", "Steps"] + list(header))
for row in rows[1:]:
data_by_header[header].append([series, experiment, steps] + row)

return data_by_header


def write_to_csv(data_by_header, folder, output_filename):
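    """Write the grouped rows to a single CSV file, separating header groups with a blank row."""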

output_file = folder / f"{output_filename}.csv"
with open(output_file, "w", newline="") as f:
writer = csv.writer(f)
        for rows in data_by_header.values():
writer.writerows(rows)
writer.writerow([]) # Add a blank row to separate different types
# Write the folder path to the last line of the CSV file
writer.writerow([folder])
print(f"Wrote scores to {output_file}")


def write_to_excel(data_by_header, folder, output_filename):
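    """Write each header group to its own sheet of an Excel workbook."""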
output_file = folder / f"{output_filename}.xlsx"
with pd.ExcelWriter(output_file) as writer:
for i, (header, rows) in enumerate(data_by_header.items()):
# Create a DataFrame for the current header
df = pd.DataFrame(rows[1:], columns=rows[0])
            # Convert columns to numeric where possible, leaving non-numeric columns unchanged
            # (note: errors="ignore" is deprecated in newer pandas releases)
            df = df.apply(pd.to_numeric, errors="ignore")
# Generate a unique sheet name
sheet_name = f"Table_{i + 1}"
# Write the DataFrame to the Excel file
df.to_excel(writer, sheet_name=sheet_name, index=False)
print(f"Wrote scores to {output_file}")


def main():
parser = argparse.ArgumentParser(description="Aggregate CSV files in a folder.")
parser.add_argument("folder", type=Path, help="Path to the folder containing CSV files.")
parser.add_argument(
"--output_filename",
type=str,
default="scores",
help="Filename suffix without the '.csv' or '.xlsx'. \
The folder name is added as a prefix to make it easier to distinguish scores files in search results.",
)
args = parser.parse_args()

    folder = Path(args.folder)
    if not folder.is_dir():
        folder = Path(SIL_NLP_ENV.mt_experiments_dir) / args.folder

    # Prefix the output filenames with the folder name (not the full path) to make
    # scores files easier to distinguish in search results.
    csv_filename = f"{folder.name}_{args.output_filename}"
    excel_filename = f"{folder.name}_{args.output_filename}"

    # Check for lock files and ask the user to close them.
    check_for_lock_file(folder, csv_filename, "csv")
    check_for_lock_file(folder, excel_filename, "xlsx")

data = aggregate_csv(folder)

# Write the aggregated data to a new CSV file
write_to_csv(data, folder, csv_filename)

# Write the aggregated data to an Excel file
write_to_excel(data, folder, excel_filename)


if __name__ == "__main__":
main()
29 changes: 26 additions & 3 deletions silnlp/common/find_by_iso.py
@@ -3,8 +3,10 @@
import logging
from pathlib import Path
from typing import Dict, List, Set, Tuple, Union
import regex as re
import sys


from .environment import SIL_NLP_ENV
from .iso_info import NLLB_ISO_SET, ALT_ISO

@@ -85,6 +87,21 @@ def split_files_by_projects(files: List[Path], projects_dir: Path) -> Tuple[Dict
def get_equivalent_isocodes(iso_codes: List[str]) -> Set[str]:
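    """Return the given ISO codes plus any known alternative forms, dropping empty values."""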
return {code for iso_code in iso_codes for code in (iso_code, ALT_ISO.get_alternative(iso_code)) if code}

def filter_files(files: List[Path], excluded_patterns: List[str]) -> List[Path]:
    """Filter out files whose names look like dated backups, partial extracts, or match excluded patterns."""
    filtered = []
    # Matches date suffixes such as _2023_1_15 or _15_1_2023.
    date_pattern = re.compile(r"_\d{4}_\d{1,2}_\d{1,2}|_\d{1,2}_\d{1,2}_\d{4}")

    for file in files:
        # Expect file names of the form "<iso>-<project name>".
        parts = file.stem.split("-", 1)
        if len(parts) != 2:
            continue
        iso, name = parts
        if date_pattern.search(name):
            continue
        if len(iso) not in (2, 3):
            continue
        if any(pattern.lower() in name.lower() for pattern in excluded_patterns):
            continue
        # Skip small files (< 100 KB); they are unlikely to hold a substantial corpus.
        if file.is_file() and file.stat().st_size < 100_000:
            continue
        filtered.append(file)
    return filtered

def main():
parser = argparse.ArgumentParser(description="Find related ISO language codes.")
parser.add_argument("iso_codes", nargs="+", help="ISO codes to find related languages for")
@@ -151,7 +168,13 @@ def main():

# Find files matching the codes
files = get_files_by_iso(all_possible_codes, scripture_dir)
existing_projects, missing_projects = split_files_by_projects(files, projects_dir)

    # Filter out AI-drafted and XRI files, plus partial or dated extracts.
    excluded_patterns = [
        "XRI", "600M", "3.3B", "1.3B", "words", "name", "clean",
        "transcription", "matthew", "mark", "mrk", "luk",
    ]
    filtered_files = filter_files(files, excluded_patterns)
    print(f"Found {len(files)} files; {len(files) - len(filtered_files)} were filtered out.")

existing_projects, missing_projects = split_files_by_projects(filtered_files, projects_dir)

# Display results
if existing_projects:
@@ -163,8 +186,8 @@ def main():
logger.info(f"\nThese {len(missing_projects)} files don't have a corresponding project folder:")
for file, _ in missing_projects.items():
logger.info(f"{file.stem}")
logger.info(f"\nAll the files:")
for file in files:
logger.info(f"\nFiltered files:")
for file in filtered_files:
logger.info(f" - {file.stem}")

if not files: