
Commit 3fb4d51

Author: Ben King
Use FileParatextProjectQuoteConventionDetector from Machine.py

1 parent bc3e5c1

File tree: 6 files changed (+56, -108 lines)


poetry.lock

Lines changed: 4 additions & 4 deletions
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -70,7 +70,7 @@ tqdm = "^4.62.2"
 sacrebleu = "^2.3.1"
 ctranslate2 = "^3.5.1"
 libclang = "14.0.6"
-sil-machine = {extras = ["thot"], version = "1.7.4"}
+sil-machine = {extras = ["thot"], version = "1.8.2"}
 datasets = "^2.7.1"
 torch = {version = "^2.4", source = "torch"}
 sacremoses = "^0.0.53"

silnlp/common/paratext.py

Lines changed: 2 additions & 54 deletions
@@ -2,7 +2,7 @@
 import os
 from contextlib import ExitStack
 from pathlib import Path
-from typing import Dict, Iterable, List, Optional, Set, TextIO, Tuple
+from typing import Dict, List, Optional, Set, TextIO, Tuple
 from xml.sax.saxutils import escape
 
 import regex as re
@@ -15,22 +15,11 @@
     Text,
     TextCorpus,
     TextRow,
-    UsfmFileText,
     UsfmFileTextCorpus,
-    UsfmParserHandler,
     create_versification_ref_corpus,
     extract_scripture_corpus,
-    parse_usfm,
-)
-from machine.scripture import (
-    BOOK_NUMBERS,
-    ORIGINAL_VERSIFICATION,
-    VerseRef,
-    VersificationType,
-    book_id_to_number,
-    book_number_to_id,
-    get_books,
 )
+from machine.scripture import ORIGINAL_VERSIFICATION, VerseRef, VersificationType, book_id_to_number, get_books
 from machine.tokenization import WhitespaceTokenizer
 
 from .corpus import get_terms_glosses_path, get_terms_metadata_path, get_terms_vrefs_path, load_corpus
@@ -427,15 +416,6 @@ def get_book_path(project: str, book: str) -> Path:
     return SIL_NLP_ENV.pt_projects_dir / project / book_file_name
 
 
-def get_book_path_by_book_number(project: str, book_number: int) -> Path:
-    project_dir = get_project_dir(project)
-    settings = FileParatextProjectSettingsParser(project_dir).parse()
-    book_id = book_number_to_id(book_number)
-    book_file_name = settings.get_book_file_name(book_id)
-
-    return SIL_NLP_ENV.pt_projects_dir / project / book_file_name
-
-
 def get_last_verse(project_dir: str, book: str, chapter: int) -> int:
     last_verse = "0"
     book_path = get_book_path(project_dir, book)
@@ -591,35 +571,3 @@ def check_versification(project_dir: str) -> Tuple[bool, List[VersificationType]
 
     matching = True
     return (matching, detected_versification)
-
-
-def read_usfm(project_dir: str, book_number: int) -> str:
-    project_settings = FileParatextProjectSettingsParser(get_project_dir(project_dir)).parse()
-    book_path: Path = get_book_path_by_book_number(project_dir, book_number)
-
-    if not book_path.exists():
-        raise FileNotFoundError(f"USFM file for book number {book_number} not found in project {project_dir}")
-
-    usfm_text_file = UsfmFileText(
-        project_settings.stylesheet,
-        project_settings.encoding,
-        book_number_to_id(book_number),
-        book_path,
-        project_settings.versification,
-        include_all_text=True,
-        project=project_settings.name,
-    )
-    # This is not a public method, but I don't think any method exists in machine.py
-    # to read raw USFM using the project settings
-    return usfm_text_file._read_usfm()
-
-
-# This is a placeholder until the ParatextProjectQuoteConventionDetector is released in machine.py
-def parse_project(project_dir: str, selected_books: Iterable[int], usfm_parser_handler: UsfmParserHandler) -> None:
-    project_settings = FileParatextProjectSettingsParser(get_project_dir(project_dir)).parse()
-    for book_number in selected_books:
-        try:
-            usfm = read_usfm(project_dir, book_number)
-        except FileNotFoundError:
-            continue
-        parse_usfm(usfm, usfm_parser_handler, project_settings.stylesheet, project_settings.versification)
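
The read_usfm and parse_project helpers removed above were a stopgap for quote convention detection; with the bump to sil-machine 1.8.2, that work is done by FileParatextProjectQuoteConventionDetector from machine.corpora, as the postprocesser.py changes below show. A minimal sketch of the replacement flow, assuming a local Paratext project directory (the path below is hypothetical):

    from machine.corpora import FileParatextProjectQuoteConventionDetector

    # Point the detector at a Paratext project folder; it reads the project's
    # settings and USFM books itself, so no per-book parsing helpers are needed.
    detector = FileParatextProjectQuoteConventionDetector("/data/paratext/MyProject")
    analysis = detector.get_quote_convention_analysis()

    if analysis is not None:
        print(analysis.best_quote_convention.name)
        print(f"{analysis.best_quote_convention_score:.2f}")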

silnlp/common/postprocess_draft.py

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@
 
 from ..nmt.clearml_connection import SILClearML
 from ..nmt.config_utils import load_config
-from ..nmt.postprocess import get_draft_paths_from_exp, postprocess_draft, postprocess_experiment
+from ..nmt.postprocess import postprocess_experiment
 from .postprocesser import PostprocessConfig, PostprocessHandler
 from .utils import get_mt_exp_dir
 

silnlp/common/postprocesser.py

Lines changed: 44 additions & 47 deletions
@@ -1,9 +1,10 @@
 import logging
 from pathlib import Path
 from tempfile import TemporaryDirectory
-from typing import Dict, List, Optional, Sequence, Tuple
+from typing import Dict, List, Optional, Sequence
 
 from machine.corpora import (
+    FileParatextProjectQuoteConventionDetector,
     PlaceMarkersAlignmentInfo,
     PlaceMarkersUsfmUpdateBlockHandler,
     QuotationMarkDenormalizationFirstPass,
@@ -18,16 +19,11 @@
     UsfmUpdateBlockHandler,
     parse_usfm,
 )
-from machine.punctuation_analysis import (
-    STANDARD_QUOTE_CONVENTIONS,
-    QuoteConvention,
-    QuoteConventionAnalysis,
-    QuoteConventionDetector,
-)
+from machine.punctuation_analysis import STANDARD_QUOTE_CONVENTIONS, QuoteConvention, QuoteConventionDetector
 from machine.tokenization import LatinWordTokenizer
 from machine.translation import WordAlignmentMatrix
 
-from silnlp.common.paratext import parse_project
+from silnlp.common.paratext import get_project_dir
 from silnlp.nmt.corpora import CorpusPair
 
 from ..alignment.eflomal import to_word_alignment_matrix
@@ -102,8 +98,11 @@ def postprocess_usfm(
         self,
         usfm: str,
         rows: List[UpdateUsfmRow],
-        remarks: List[str] = [],
+        remarks: Optional[List[str]] = None,
     ) -> str:
+        if remarks is None:
+            remarks = []
+
         handler = UpdateUsfmParserHandler(
             rows=rows,
             text_behavior=UpdateUsfmTextBehavior.STRIP_EXISTING,
@@ -130,53 +129,42 @@ def __init__(self, project_name: str):
 
 
 class DenormalizeQuotationMarksPostprocessor:
+    _NO_CHAPTERS_REMARK_SENTENCE = "Quotation marks were not denormalized in any chapters due to errors."
     _REMARK_SENTENCE = (
         "Quotation marks in the following chapters have been automatically denormalized after translation: "
     )
+    _project_convention_cache: Dict[str, QuoteConvention] = {}
 
     def __init__(
         self,
         source_quote_convention_name: str | None,
         target_quote_convention_name: str | None,
         source_project_name: str | None = None,
         target_project_name: str | None = None,
-        selected_training_books: Dict[int, List[int]] = {},
     ):
         self._source_quote_convention = self._get_source_quote_convention(
-            source_quote_convention_name, source_project_name, selected_training_books
+            source_quote_convention_name, source_project_name
         )
         self._target_quote_convention = self._get_target_quote_convention(
-            target_quote_convention_name, target_project_name, selected_training_books
+            target_quote_convention_name, target_project_name
         )
 
-    def _get_source_quote_convention(
-        self, convention_name: str | None, project_name: str | None, selected_training_books: Dict[int, List[int]] = {}
-    ) -> QuoteConvention:
+    def _get_source_quote_convention(self, convention_name: str | None, project_name: str | None) -> QuoteConvention:
         if convention_name is None or convention_name == "detect":
             if project_name is None:
                 raise ValueError(
                     "The source project name must be explicitly provided or be present in translate_config.yml, since an explicit source quote convention name was not provided."
                 )
-            if selected_training_books is None:
-                raise ValueError(
-                    "The experiment's config.yml must exist and specify selected training books, since an explicit source quote convention name was not provided."
-                )
-            return self._detect_quote_convention(project_name, selected_training_books)
+            return self._detect_quote_convention(project_name)
         return self._get_named_quote_convention(convention_name)
 
-    def _get_target_quote_convention(
-        self, convention_name: str | None, project_name: str | None, selected_training_books: Dict[int, List[int]] = {}
-    ) -> QuoteConvention:
+    def _get_target_quote_convention(self, convention_name: str | None, project_name: str | None) -> QuoteConvention:
         if convention_name is None or convention_name == "detect":
             if project_name is None:
                 raise ValueError(
                     "The experiment's config.yml must exist and specify a target project name, since an explicit target quote convention name was not provided."
                 )
-            if selected_training_books is None:
-                raise ValueError(
-                    "The experiment's config.yml must exist and specify selected training books, since an explicit target quote convention name was not provided."
                )
-            return self._detect_quote_convention(project_name, selected_training_books)
+            return self._detect_quote_convention(project_name)
         return self._get_named_quote_convention(convention_name)
 
     def _get_named_quote_convention(self, convention_name: str) -> QuoteConvention:
@@ -186,19 +174,24 @@ def _get_named_quote_convention(self, convention_name: str) -> QuoteConvention:
             raise UnknownQuoteConventionException(convention_name)
         return convention
 
-    def _detect_quote_convention(
-        self, project_name: str, selected_training_books: Dict[int, List[int]] = {}
-    ) -> QuoteConvention:
+    def _detect_quote_convention(self, project_name: str) -> QuoteConvention:
+        if project_name in self._project_convention_cache:
+            return self._project_convention_cache[project_name]
+
         quote_convention_detector = QuoteConventionDetector()
 
-        parse_project(project_name, selected_training_books.keys(), quote_convention_detector)
+        quote_convention_detector = FileParatextProjectQuoteConventionDetector(get_project_dir(project_name))
+        quote_convention_analysis = quote_convention_detector.get_quote_convention_analysis()
 
-        quote_convention_analysis: QuoteConventionAnalysis | None = quote_convention_detector.detect_quote_convention()
         if quote_convention_analysis is None:
             raise NoDetectedQuoteConventionException(project_name)
         LOGGER.info(
-            f'Detected quote convention for project "{project_name}" is "{quote_convention_analysis.best_quote_convention.name}" with score {quote_convention_analysis.best_quote_convention_score:.2f}.'
+            f'Detected quote convention for project "{project_name}" is '
+            + f'"{quote_convention_analysis.best_quote_convention.name}" with score '
+            + f"{quote_convention_analysis.best_quote_convention_score:.2f}."
         )
+        self._project_convention_cache[project_name] = quote_convention_analysis.best_quote_convention
+
         return quote_convention_analysis.best_quote_convention
 
     def _create_update_block_handlers(
@@ -221,6 +214,14 @@ def _get_best_chapter_strategies(self, usfm: str) -> List[QuotationMarkUpdateStr
         return quotation_mark_update_first_pass.find_best_chapter_strategies()
 
     def _create_remark(self, best_chapter_strategies: List[QuotationMarkUpdateStrategy]) -> str:
+        processed_chapters: List[str] = [
+            str(chapter_num)
+            for chapter_num, strategy in enumerate(best_chapter_strategies, 1)
+            if strategy != QuotationMarkUpdateStrategy.SKIP
+        ]
+
+        if len(processed_chapters) == 0:
+            return self._NO_CHAPTERS_REMARK_SENTENCE
         return (
             self._REMARK_SENTENCE
             + ", ".join(
@@ -327,7 +328,7 @@ def create_place_markers_postprocessor(self) -> PlaceMarkersPostprocessor:
     def create_denormalize_quotation_marks_postprocessor(
         self, training_corpus_pairs: List[CorpusPair], translation_source_project_name: Optional[str]
     ) -> DenormalizeQuotationMarksPostprocessor:
-        _, training_target_project_name, selected_training_books = self._get_experiment_training_info(
+        training_target_project_name = self._get_training_target_project_name(
             training_corpus_pairs,
         )
 
@@ -336,13 +337,12 @@
             self._config["target_quote_convention"],
             translation_source_project_name,
             training_target_project_name,
-            selected_training_books,
         )
 
-    def _get_experiment_training_info(
+    def _get_training_target_project_name(
        self,
        training_corpus_pairs: List[CorpusPair],
-    ) -> Tuple[Optional[str], Optional[str], Dict[int, List[int]]]:
+    ) -> Optional[str]:
         # Target project info is only needed for quote convention detection
         if self.is_quote_convention_detection_required():
             if len(training_corpus_pairs) > 1:
@@ -358,28 +358,25 @@
                     "The experiment has multiple target projects. Quotation mark denormalization is unlikely to work correctly in this scenario."
                 )
 
-            source_project_name = (
-                training_corpus_pairs[0].src_files[0].project
-                if len(training_corpus_pairs) > 0 and len(training_corpus_pairs[0].src_files) > 0
-                else None
-            )
             target_project_name = (
                 training_corpus_pairs[0].trg_files[0].project
                 if len(training_corpus_pairs) > 0 and len(training_corpus_pairs[0].trg_files) > 0
                 else None
             )
-            selected_training_books = training_corpus_pairs[0].corpus_books if len(training_corpus_pairs) > 0 else {}
 
-            return source_project_name, target_project_name, selected_training_books
+            return target_project_name
 
-        return None, None, {}
+        return None
 
     def __getitem__(self, key):
         return self._config[key]
 
 
 class PostprocessHandler:
-    def __init__(self, configs: List[PostprocessConfig] = [], include_base: bool = True) -> None:
+    def __init__(self, configs: Optional[List[PostprocessConfig]] = None, include_base: bool = True) -> None:
+        if configs is None:
+            configs = []
+
         self.configs = ([PostprocessConfig()] if include_base else []) + configs
 
         # NOTE: Row metadata may need to be created/recreated at different times
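
One detail worth noting in _detect_quote_convention above: _project_convention_cache is a class-level dict, so a convention detected once for a project is reused by later detections of the same project, even across postprocessor instances. A standalone sketch of that memoization pattern (the class name and the _detect_expensively stand-in are illustrative, not part of the silnlp API):

    from typing import Dict

    class ConventionDetectorWithCache:
        # Class attribute: shared by all instances, keyed by project name.
        _cache: Dict[str, str] = {}

        def detect(self, project_name: str) -> str:
            if project_name in self._cache:
                return self._cache[project_name]
            convention = self._detect_expensively(project_name)
            self._cache[project_name] = convention
            return convention

        def _detect_expensively(self, project_name: str) -> str:
            # Stand-in for the real detection, which parses the project's USFM.
            return f"convention-for-{project_name}"

    # Two separate instances share the cache, so the second call is a lookup.
    assert ConventionDetectorWithCache().detect("ProjA") == ConventionDetectorWithCache().detect("ProjA")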

silnlp/nmt/postprocess.py

Lines changed: 4 additions & 1 deletion
@@ -121,8 +121,11 @@ def postprocess_draft(
     postprocess_handler: PostprocessHandler,
     book: Optional[str] = None,
     out_dir: Optional[Path] = None,
-    training_corpus_pairs: List[CorpusPair] = [],
+    training_corpus_pairs: Optional[List[CorpusPair]] = None,
 ) -> None:
+    if training_corpus_pairs is None:
+        training_corpus_pairs = []
+
     if str(draft_metadata.source_path).startswith(str(get_project_dir(""))):
         settings = FileParatextProjectSettingsParser(draft_metadata.source_path.parent).parse()
         stylesheet = settings.stylesheet
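
This hunk, like the postprocess_usfm and PostprocessHandler changes above, replaces a mutable default argument (= []) with Optional[...] = None plus an explicit fallback. The usual reason for this pattern is that a list used as a default is created once at function definition time and shared across calls, so any mutation leaks between calls. A small illustrative sketch (function names are hypothetical):

    from typing import List, Optional

    def risky(items: List[int] = []) -> List[int]:
        items.append(1)  # mutates the single shared default list
        return items

    def safe(items: Optional[List[int]] = None) -> List[int]:
        if items is None:
            items = []  # a fresh list on every call
        items.append(1)
        return items

    print(risky())  # [1]
    print(risky())  # [1, 1]  -- state leaked from the first call
    print(safe())   # [1]
    print(safe())   # [1]     -- independent across calls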
