11import logging
22from pathlib import Path
33from tempfile import TemporaryDirectory
4- from typing import Dict , List , Optional , Sequence , Tuple
4+ from typing import Dict , List , Optional , Sequence
55
66from machine .corpora import (
7+ FileParatextProjectQuoteConventionDetector ,
78 PlaceMarkersAlignmentInfo ,
89 PlaceMarkersUsfmUpdateBlockHandler ,
910 QuotationMarkDenormalizationFirstPass ,
1819 UsfmUpdateBlockHandler ,
1920 parse_usfm ,
2021)
21- from machine .punctuation_analysis import (
22- STANDARD_QUOTE_CONVENTIONS ,
23- QuoteConvention ,
24- QuoteConventionAnalysis ,
25- QuoteConventionDetector ,
26- )
22+ from machine .punctuation_analysis import STANDARD_QUOTE_CONVENTIONS , QuoteConvention , QuoteConventionDetector
2723from machine .tokenization import LatinWordTokenizer
2824from machine .translation import WordAlignmentMatrix
2925
30- from silnlp .common .paratext import parse_project
26+ from silnlp .common .paratext import get_project_dir
3127from silnlp .nmt .corpora import CorpusPair
3228
3329from ..alignment .eflomal import to_word_alignment_matrix
@@ -102,8 +98,11 @@ def postprocess_usfm(
10298 self ,
10399 usfm : str ,
104100 rows : List [UpdateUsfmRow ],
105- remarks : List [str ] = [] ,
101+ remarks : Optional [ List [str ]] = None ,
106102 ) -> str :
103+ if remarks is None :
104+ remarks = []
105+
107106 handler = UpdateUsfmParserHandler (
108107 rows = rows ,
109108 text_behavior = UpdateUsfmTextBehavior .STRIP_EXISTING ,
@@ -130,53 +129,42 @@ def __init__(self, project_name: str):
130129
131130
132131class DenormalizeQuotationMarksPostprocessor :
132+ _NO_CHAPTERS_REMARK_SENTENCE = "Quotation marks were not denormalized in any chapters due to errors."
133133 _REMARK_SENTENCE = (
134134 "Quotation marks in the following chapters have been automatically denormalized after translation: "
135135 )
136+ _project_convention_cache : Dict [str , QuoteConvention ] = {}
136137
137138 def __init__ (
138139 self ,
139140 source_quote_convention_name : str | None ,
140141 target_quote_convention_name : str | None ,
141142 source_project_name : str | None = None ,
142143 target_project_name : str | None = None ,
143- selected_training_books : Dict [int , List [int ]] = {},
144144 ):
145145 self ._source_quote_convention = self ._get_source_quote_convention (
146- source_quote_convention_name , source_project_name , selected_training_books
146+ source_quote_convention_name , source_project_name
147147 )
148148 self ._target_quote_convention = self ._get_target_quote_convention (
149- target_quote_convention_name , target_project_name , selected_training_books
149+ target_quote_convention_name , target_project_name
150150 )
151151
152- def _get_source_quote_convention (
153- self , convention_name : str | None , project_name : str | None , selected_training_books : Dict [int , List [int ]] = {}
154- ) -> QuoteConvention :
152+ def _get_source_quote_convention (self , convention_name : str | None , project_name : str | None ) -> QuoteConvention :
155153 if convention_name is None or convention_name == "detect" :
156154 if project_name is None :
157155 raise ValueError (
158156 "The source project name must be explicitly provided or be present in translate_config.yml, since an explicit source quote convention name was not provided."
159157 )
160- if selected_training_books is None :
161- raise ValueError (
162- "The experiment's config.yml must exist and specify selected training books, since an explicit source quote convention name was not provided."
163- )
164- return self ._detect_quote_convention (project_name , selected_training_books )
158+ return self ._detect_quote_convention (project_name )
165159 return self ._get_named_quote_convention (convention_name )
166160
167- def _get_target_quote_convention (
168- self , convention_name : str | None , project_name : str | None , selected_training_books : Dict [int , List [int ]] = {}
169- ) -> QuoteConvention :
161+ def _get_target_quote_convention (self , convention_name : str | None , project_name : str | None ) -> QuoteConvention :
170162 if convention_name is None or convention_name == "detect" :
171163 if project_name is None :
172164 raise ValueError (
173165 "The experiment's config.yml must exist and specify a target project name, since an explicit target quote convention name was not provided."
174166 )
175- if selected_training_books is None :
176- raise ValueError (
177- "The experiment's config.yml must exist and specify selected training books, since an explicit target quote convention name was not provided."
178- )
179- return self ._detect_quote_convention (project_name , selected_training_books )
167+ return self ._detect_quote_convention (project_name )
180168 return self ._get_named_quote_convention (convention_name )
181169
182170 def _get_named_quote_convention (self , convention_name : str ) -> QuoteConvention :
@@ -186,19 +174,24 @@ def _get_named_quote_convention(self, convention_name: str) -> QuoteConvention:
186174 raise UnknownQuoteConventionException (convention_name )
187175 return convention
188176
189- def _detect_quote_convention (
190- self , project_name : str , selected_training_books : Dict [int , List [int ]] = {}
191- ) -> QuoteConvention :
177+ def _detect_quote_convention (self , project_name : str ) -> QuoteConvention :
178+ if project_name in self ._project_convention_cache :
179+ return self ._project_convention_cache [project_name ]
180+
192181 quote_convention_detector = QuoteConventionDetector ()
193182
194- parse_project (project_name , selected_training_books .keys (), quote_convention_detector )
183+ quote_convention_detector = FileParatextProjectQuoteConventionDetector (get_project_dir (project_name ))
184+ quote_convention_analysis = quote_convention_detector .get_quote_convention_analysis ()
195185
196- quote_convention_analysis : QuoteConventionAnalysis | None = quote_convention_detector .detect_quote_convention ()
197186 if quote_convention_analysis is None :
198187 raise NoDetectedQuoteConventionException (project_name )
199188 LOGGER .info (
200- f'Detected quote convention for project "{ project_name } " is "{ quote_convention_analysis .best_quote_convention .name } " with score { quote_convention_analysis .best_quote_convention_score :.2f} .'
189+ f'Detected quote convention for project "{ project_name } " is '
190+ + '"{quote_convention_analysis.best_quote_convention.name}" with score '
191+ + "{quote_convention_analysis.best_quote_convention_score:.2f}."
201192 )
193+ self ._project_convention_cache [project_name ] = quote_convention_analysis .best_quote_convention
194+
202195 return quote_convention_analysis .best_quote_convention
203196
204197 def _create_update_block_handlers (
@@ -221,6 +214,14 @@ def _get_best_chapter_strategies(self, usfm: str) -> List[QuotationMarkUpdateStr
221214 return quotation_mark_update_first_pass .find_best_chapter_strategies ()
222215
223216 def _create_remark (self , best_chapter_strategies : List [QuotationMarkUpdateStrategy ]) -> str :
217+ processed_chapters : List [str ] = [
218+ str (chapter_num )
219+ for chapter_num , strategy in enumerate (best_chapter_strategies , 1 )
220+ if strategy != QuotationMarkUpdateStrategy .SKIP
221+ ]
222+
223+ if len (processed_chapters ) == 0 :
224+ return self ._NO_CHAPTERS_REMARK_SENTENCE
224225 return (
225226 self ._REMARK_SENTENCE
226227 + ", " .join (
@@ -327,7 +328,7 @@ def create_place_markers_postprocessor(self) -> PlaceMarkersPostprocessor:
327328 def create_denormalize_quotation_marks_postprocessor (
328329 self , training_corpus_pairs : List [CorpusPair ], translation_source_project_name : Optional [str ]
329330 ) -> DenormalizeQuotationMarksPostprocessor :
330- _ , training_target_project_name , selected_training_books = self ._get_experiment_training_info (
331+ training_target_project_name = self ._get_training_target_project_name (
331332 training_corpus_pairs ,
332333 )
333334
@@ -336,13 +337,12 @@ def create_denormalize_quotation_marks_postprocessor(
336337 self ._config ["target_quote_convention" ],
337338 translation_source_project_name ,
338339 training_target_project_name ,
339- selected_training_books ,
340340 )
341341
342- def _get_experiment_training_info (
342+ def _get_training_target_project_name (
343343 self ,
344344 training_corpus_pairs : List [CorpusPair ],
345- ) -> Tuple [ Optional [str ], Optional [ str ], Dict [ int , List [ int ]] ]:
345+ ) -> Optional [str ]:
346346 # Target project info is only needed for quote convention detection
347347 if self .is_quote_convention_detection_required ():
348348 if len (training_corpus_pairs ) > 1 :
@@ -358,28 +358,25 @@ def _get_experiment_training_info(
358358 "The experiment has multiple target projects. Quotation mark denormalization is unlikely to work correctly in this scenario."
359359 )
360360
361- source_project_name = (
362- training_corpus_pairs [0 ].src_files [0 ].project
363- if len (training_corpus_pairs ) > 0 and len (training_corpus_pairs [0 ].src_files ) > 0
364- else None
365- )
366361 target_project_name = (
367362 training_corpus_pairs [0 ].trg_files [0 ].project
368363 if len (training_corpus_pairs ) > 0 and len (training_corpus_pairs [0 ].trg_files ) > 0
369364 else None
370365 )
371- selected_training_books = training_corpus_pairs [0 ].corpus_books if len (training_corpus_pairs ) > 0 else {}
372366
373- return source_project_name , target_project_name , selected_training_books
367+ return target_project_name
374368
375- return None , None , {}
369+ return None
376370
377371 def __getitem__ (self , key ):
378372 return self ._config [key ]
379373
380374
381375class PostprocessHandler :
382- def __init__ (self , configs : List [PostprocessConfig ] = [], include_base : bool = True ) -> None :
376+ def __init__ (self , configs : Optional [List [PostprocessConfig ]] = None , include_base : bool = True ) -> None :
377+ if configs is None :
378+ configs = []
379+
383380 self .configs = ([PostprocessConfig ()] if include_base else []) + configs
384381
385382 # NOTE: Row metadata may need to be created/recreated at different times
0 commit comments