
Commit 3fb4d51

Author: Ben King
Use FileParatextProjectQuoteConventionDetector from Machine.py

1 parent bc3e5c1

File tree: 6 files changed (+56, -108 lines)


poetry.lock

Lines changed: 4 additions & 4 deletions
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -70,7 +70,7 @@ tqdm = "^4.62.2"
 sacrebleu = "^2.3.1"
 ctranslate2 = "^3.5.1"
 libclang = "14.0.6"
-sil-machine = {extras = ["thot"], version = "1.7.4"}
+sil-machine = {extras = ["thot"], version = "1.8.2"}
 datasets = "^2.7.1"
 torch = {version = "^2.4", source = "torch"}
 sacremoses = "^0.0.53"

silnlp/common/paratext.py

Lines changed: 2 additions & 54 deletions
@@ -2,7 +2,7 @@
 import os
 from contextlib import ExitStack
 from pathlib import Path
-from typing import Dict, Iterable, List, Optional, Set, TextIO, Tuple
+from typing import Dict, List, Optional, Set, TextIO, Tuple
 from xml.sax.saxutils import escape
 
 import regex as re
@@ -15,22 +15,11 @@
     Text,
     TextCorpus,
     TextRow,
-    UsfmFileText,
     UsfmFileTextCorpus,
-    UsfmParserHandler,
     create_versification_ref_corpus,
     extract_scripture_corpus,
-    parse_usfm,
-)
-from machine.scripture import (
-    BOOK_NUMBERS,
-    ORIGINAL_VERSIFICATION,
-    VerseRef,
-    VersificationType,
-    book_id_to_number,
-    book_number_to_id,
-    get_books,
 )
+from machine.scripture import ORIGINAL_VERSIFICATION, VerseRef, VersificationType, book_id_to_number, get_books
 from machine.tokenization import WhitespaceTokenizer
 
 from .corpus import get_terms_glosses_path, get_terms_metadata_path, get_terms_vrefs_path, load_corpus
@@ -427,15 +416,6 @@ def get_book_path(project: str, book: str) -> Path:
     return SIL_NLP_ENV.pt_projects_dir / project / book_file_name
 
 
-def get_book_path_by_book_number(project: str, book_number: int) -> Path:
-    project_dir = get_project_dir(project)
-    settings = FileParatextProjectSettingsParser(project_dir).parse()
-    book_id = book_number_to_id(book_number)
-    book_file_name = settings.get_book_file_name(book_id)
-
-    return SIL_NLP_ENV.pt_projects_dir / project / book_file_name
-
-
 def get_last_verse(project_dir: str, book: str, chapter: int) -> int:
     last_verse = "0"
     book_path = get_book_path(project_dir, book)
@@ -591,35 +571,3 @@ def check_versification(project_dir: str) -> Tuple[bool, List[VersificationType]
 
     matching = True
     return (matching, detected_versification)
-
-
-def read_usfm(project_dir: str, book_number: int) -> str:
-    project_settings = FileParatextProjectSettingsParser(get_project_dir(project_dir)).parse()
-    book_path: Path = get_book_path_by_book_number(project_dir, book_number)
-
-    if not book_path.exists():
-        raise FileNotFoundError(f"USFM file for book number {book_number} not found in project {project_dir}")
-
-    usfm_text_file = UsfmFileText(
-        project_settings.stylesheet,
-        project_settings.encoding,
-        book_number_to_id(book_number),
-        book_path,
-        project_settings.versification,
-        include_all_text=True,
-        project=project_settings.name,
-    )
-    # This is not a public method, but I don't think any method exists in machine.py
-    # to read raw USFM using the project settings
-    return usfm_text_file._read_usfm()
-
-
-# This is a placeholder until the ParatextProjectQuoteConventionDetector is released in machine.py
-def parse_project(project_dir: str, selected_books: Iterable[int], usfm_parser_handler: UsfmParserHandler) -> None:
-    project_settings = FileParatextProjectSettingsParser(get_project_dir(project_dir)).parse()
-    for book_number in selected_books:
-        try:
-            usfm = read_usfm(project_dir, book_number)
-        except FileNotFoundError:
-            continue
-        parse_usfm(usfm, usfm_parser_handler, project_settings.stylesheet, project_settings.versification)
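
The read_usfm and parse_project helpers removed above were a stopgap for quote convention detection; with the bump to sil-machine 1.8.2, that work is done by FileParatextProjectQuoteConventionDetector from machine.corpora, as the postprocesser.py changes below show. A minimal sketch of the replacement flow, assuming a local Paratext project directory (the path below is hypothetical):

    from machine.corpora import FileParatextProjectQuoteConventionDetector

    # Point the detector at a Paratext project folder; it reads the project's
    # settings and USFM books itself, so no per-book parsing helpers are needed.
    detector = FileParatextProjectQuoteConventionDetector("/data/paratext/MyProject")
    analysis = detector.get_quote_convention_analysis()

    if analysis is not None:
        print(analysis.best_quote_convention.name)
        print(f"{analysis.best_quote_convention_score:.2f}")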

silnlp/common/postprocess_draft.py

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@
 
 from ..nmt.clearml_connection import SILClearML
 from ..nmt.config_utils import load_config
-from ..nmt.postprocess import get_draft_paths_from_exp, postprocess_draft, postprocess_experiment
+from ..nmt.postprocess import postprocess_experiment
 from .postprocesser import PostprocessConfig, PostprocessHandler
 from .utils import get_mt_exp_dir
 

silnlp/common/postprocesser.py

Lines changed: 44 additions & 47 deletions
@@ -1,9 +1,10 @@
 import logging
 from pathlib import Path
 from tempfile import TemporaryDirectory
-from typing import Dict, List, Optional, Sequence, Tuple
+from typing import Dict, List, Optional, Sequence
 
 from machine.corpora import (
+    FileParatextProjectQuoteConventionDetector,
     PlaceMarkersAlignmentInfo,
     PlaceMarkersUsfmUpdateBlockHandler,
     QuotationMarkDenormalizationFirstPass,
@@ -18,16 +19,11 @@
     UsfmUpdateBlockHandler,
     parse_usfm,
 )
-from machine.punctuation_analysis import (
-    STANDARD_QUOTE_CONVENTIONS,
-    QuoteConvention,
-    QuoteConventionAnalysis,
-    QuoteConventionDetector,
-)
+from machine.punctuation_analysis import STANDARD_QUOTE_CONVENTIONS, QuoteConvention, QuoteConventionDetector
 from machine.tokenization import LatinWordTokenizer
 from machine.translation import WordAlignmentMatrix
 
-from silnlp.common.paratext import parse_project
+from silnlp.common.paratext import get_project_dir
 from silnlp.nmt.corpora import CorpusPair
 
 from ..alignment.eflomal import to_word_alignment_matrix
@@ -102,8 +98,11 @@ def postprocess_usfm(
         self,
         usfm: str,
         rows: List[UpdateUsfmRow],
-        remarks: List[str] = [],
+        remarks: Optional[List[str]] = None,
     ) -> str:
+        if remarks is None:
+            remarks = []
+
         handler = UpdateUsfmParserHandler(
             rows=rows,
             text_behavior=UpdateUsfmTextBehavior.STRIP_EXISTING,
@@ -130,53 +129,42 @@ def __init__(self, project_name: str):
 
 
 class DenormalizeQuotationMarksPostprocessor:
+    _NO_CHAPTERS_REMARK_SENTENCE = "Quotation marks were not denormalized in any chapters due to errors."
     _REMARK_SENTENCE = (
         "Quotation marks in the following chapters have been automatically denormalized after translation: "
     )
+    _project_convention_cache: Dict[str, QuoteConvention] = {}
 
     def __init__(
         self,
         source_quote_convention_name: str | None,
         target_quote_convention_name: str | None,
         source_project_name: str | None = None,
         target_project_name: str | None = None,
-        selected_training_books: Dict[int, List[int]] = {},
     ):
         self._source_quote_convention = self._get_source_quote_convention(
-            source_quote_convention_name, source_project_name, selected_training_books
+            source_quote_convention_name, source_project_name
         )
         self._target_quote_convention = self._get_target_quote_convention(
-            target_quote_convention_name, target_project_name, selected_training_books
+            target_quote_convention_name, target_project_name
         )
 
-    def _get_source_quote_convention(
-        self, convention_name: str | None, project_name: str | None, selected_training_books: Dict[int, List[int]] = {}
-    ) -> QuoteConvention:
+    def _get_source_quote_convention(self, convention_name: str | None, project_name: str | None) -> QuoteConvention:
         if convention_name is None or convention_name == "detect":
             if project_name is None:
                 raise ValueError(
                     "The source project name must be explicitly provided or be present in translate_config.yml, since an explicit source quote convention name was not provided."
                 )
-            if selected_training_books is None:
-                raise ValueError(
-                    "The experiment's config.yml must exist and specify selected training books, since an explicit source quote convention name was not provided."
-                )
-            return self._detect_quote_convention(project_name, selected_training_books)
+            return self._detect_quote_convention(project_name)
         return self._get_named_quote_convention(convention_name)
 
-    def _get_target_quote_convention(
-        self, convention_name: str | None, project_name: str | None, selected_training_books: Dict[int, List[int]] = {}
-    ) -> QuoteConvention:
+    def _get_target_quote_convention(self, convention_name: str | None, project_name: str | None) -> QuoteConvention:
         if convention_name is None or convention_name == "detect":
             if project_name is None:
                 raise ValueError(
                     "The experiment's config.yml must exist and specify a target project name, since an explicit target quote convention name was not provided."
                 )
-            if selected_training_books is None:
-                raise ValueError(
-                    "The experiment's config.yml must exist and specify selected training books, since an explicit target quote convention name was not provided."
                )
-            return self._detect_quote_convention(project_name, selected_training_books)
+            return self._detect_quote_convention(project_name)
         return self._get_named_quote_convention(convention_name)
 
     def _get_named_quote_convention(self, convention_name: str) -> QuoteConvention:
@@ -186,19 +174,24 @@ def _get_named_quote_convention(self, convention_name: str) -> QuoteConvention:
             raise UnknownQuoteConventionException(convention_name)
         return convention
 
-    def _detect_quote_convention(
-        self, project_name: str, selected_training_books: Dict[int, List[int]] = {}
-    ) -> QuoteConvention:
+    def _detect_quote_convention(self, project_name: str) -> QuoteConvention:
+        if project_name in self._project_convention_cache:
+            return self._project_convention_cache[project_name]
+
         quote_convention_detector = QuoteConventionDetector()
 
-        parse_project(project_name, selected_training_books.keys(), quote_convention_detector)
+        quote_convention_detector = FileParatextProjectQuoteConventionDetector(get_project_dir(project_name))
+        quote_convention_analysis = quote_convention_detector.get_quote_convention_analysis()
 
-        quote_convention_analysis: QuoteConventionAnalysis | None = quote_convention_detector.detect_quote_convention()
         if quote_convention_analysis is None:
             raise NoDetectedQuoteConventionException(project_name)
         LOGGER.info(
-            f'Detected quote convention for project "{project_name}" is "{quote_convention_analysis.best_quote_convention.name}" with score {quote_convention_analysis.best_quote_convention_score:.2f}.'
+            f'Detected quote convention for project "{project_name}" is '
+            + f'"{quote_convention_analysis.best_quote_convention.name}" with score '
+            + f"{quote_convention_analysis.best_quote_convention_score:.2f}."
         )
+        self._project_convention_cache[project_name] = quote_convention_analysis.best_quote_convention
+
         return quote_convention_analysis.best_quote_convention
 
     def _create_update_block_handlers(
@@ -221,6 +214,14 @@ def _get_best_chapter_strategies(self, usfm: str) -> List[QuotationMarkUpdateStr
         return quotation_mark_update_first_pass.find_best_chapter_strategies()
 
     def _create_remark(self, best_chapter_strategies: List[QuotationMarkUpdateStrategy]) -> str:
+        processed_chapters: List[str] = [
+            str(chapter_num)
+            for chapter_num, strategy in enumerate(best_chapter_strategies, 1)
+            if strategy != QuotationMarkUpdateStrategy.SKIP
+        ]
+
+        if len(processed_chapters) == 0:
+            return self._NO_CHAPTERS_REMARK_SENTENCE
         return (
             self._REMARK_SENTENCE
             + ", ".join(
@@ -327,7 +328,7 @@ def create_place_markers_postprocessor(self) -> PlaceMarkersPostprocessor:
     def create_denormalize_quotation_marks_postprocessor(
         self, training_corpus_pairs: List[CorpusPair], translation_source_project_name: Optional[str]
     ) -> DenormalizeQuotationMarksPostprocessor:
-        _, training_target_project_name, selected_training_books = self._get_experiment_training_info(
+        training_target_project_name = self._get_training_target_project_name(
             training_corpus_pairs,
         )
 
@@ -336,13 +337,12 @@
             self._config["target_quote_convention"],
             translation_source_project_name,
             training_target_project_name,
-            selected_training_books,
         )
 
-    def _get_experiment_training_info(
+    def _get_training_target_project_name(
        self,
        training_corpus_pairs: List[CorpusPair],
-    ) -> Tuple[Optional[str], Optional[str], Dict[int, List[int]]]:
+    ) -> Optional[str]:
         # Target project info is only needed for quote convention detection
         if self.is_quote_convention_detection_required():
             if len(training_corpus_pairs) > 1:
@@ -358,28 +358,25 @@
                     "The experiment has multiple target projects. Quotation mark denormalization is unlikely to work correctly in this scenario."
                 )
 
-            source_project_name = (
-                training_corpus_pairs[0].src_files[0].project
-                if len(training_corpus_pairs) > 0 and len(training_corpus_pairs[0].src_files) > 0
-                else None
-            )
             target_project_name = (
                 training_corpus_pairs[0].trg_files[0].project
                 if len(training_corpus_pairs) > 0 and len(training_corpus_pairs[0].trg_files) > 0
                 else None
             )
-            selected_training_books = training_corpus_pairs[0].corpus_books if len(training_corpus_pairs) > 0 else {}
 
-            return source_project_name, target_project_name, selected_training_books
+            return target_project_name
 
-        return None, None, {}
+        return None
 
     def __getitem__(self, key):
         return self._config[key]
 
 
 class PostprocessHandler:
-    def __init__(self, configs: List[PostprocessConfig] = [], include_base: bool = True) -> None:
+    def __init__(self, configs: Optional[List[PostprocessConfig]] = None, include_base: bool = True) -> None:
+        if configs is None:
+            configs = []
+
         self.configs = ([PostprocessConfig()] if include_base else []) + configs
 
         # NOTE: Row metadata may need to be created/recreated at different times
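
One detail worth noting in _detect_quote_convention above: _project_convention_cache is a class-level dict, so a convention detected once for a project is reused by later detections of the same project, even across postprocessor instances. A standalone sketch of that memoization pattern (the class name and the _detect_expensively stand-in are illustrative, not part of the silnlp API):

    from typing import Dict

    class ConventionDetectorWithCache:
        # Class attribute: shared by all instances, keyed by project name.
        _cache: Dict[str, str] = {}

        def detect(self, project_name: str) -> str:
            if project_name in self._cache:
                return self._cache[project_name]
            convention = self._detect_expensively(project_name)
            self._cache[project_name] = convention
            return convention

        def _detect_expensively(self, project_name: str) -> str:
            # Stand-in for the real detection, which parses the project's USFM.
            return f"convention-for-{project_name}"

    # Two separate instances share the cache, so the second call is a lookup.
    assert ConventionDetectorWithCache().detect("ProjA") == ConventionDetectorWithCache().detect("ProjA")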

silnlp/nmt/postprocess.py

Lines changed: 4 additions & 1 deletion
@@ -121,8 +121,11 @@ def postprocess_draft(
     postprocess_handler: PostprocessHandler,
     book: Optional[str] = None,
     out_dir: Optional[Path] = None,
-    training_corpus_pairs: List[CorpusPair] = [],
+    training_corpus_pairs: Optional[List[CorpusPair]] = None,
 ) -> None:
+    if training_corpus_pairs is None:
+        training_corpus_pairs = []
+
     if str(draft_metadata.source_path).startswith(str(get_project_dir(""))):
         settings = FileParatextProjectSettingsParser(draft_metadata.source_path.parent).parse()
         stylesheet = settings.stylesheet
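
This hunk, like the postprocess_usfm and PostprocessHandler changes above, replaces a mutable default argument (= []) with Optional[...] = None plus an explicit fallback. The usual reason for this pattern is that a list used as a default is created once at function definition time and shared across calls, so any mutation leaks between calls. A small illustrative sketch (function names are hypothetical):

    from typing import List, Optional

    def risky(items: List[int] = []) -> List[int]:
        items.append(1)  # mutates the single shared default list
        return items

    def safe(items: Optional[List[int]] = None) -> List[int]:
        if items is None:
            items = []  # a fresh list on every call
        items.append(1)
        return items

    print(risky())  # [1]
    print(risky())  # [1, 1]  -- state leaked from the first call
    print(safe())   # [1]
    print(safe())   # [1]     -- independent across calls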
