Skip to content
56 changes: 54 additions & 2 deletions silnlp/common/paratext.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import os
from contextlib import ExitStack
from pathlib import Path
from typing import Dict, List, Optional, Set, TextIO, Tuple
from typing import Dict, Iterable, List, Optional, Set, TextIO, Tuple
from xml.sax.saxutils import escape

import regex as re
Expand All @@ -15,11 +15,22 @@
Text,
TextCorpus,
TextRow,
UsfmFileText,
UsfmFileTextCorpus,
UsfmParserHandler,
create_versification_ref_corpus,
extract_scripture_corpus,
parse_usfm,
)
from machine.scripture import (
BOOK_NUMBERS,
ORIGINAL_VERSIFICATION,
VerseRef,
VersificationType,
book_id_to_number,
book_number_to_id,
get_books,
)
from machine.scripture import ORIGINAL_VERSIFICATION, VerseRef, VersificationType, book_id_to_number, get_books
from machine.tokenization import WhitespaceTokenizer

from .corpus import get_terms_glosses_path, get_terms_metadata_path, get_terms_vrefs_path, load_corpus
Expand Down Expand Up @@ -416,6 +427,15 @@ def get_book_path(project: str, book: str) -> Path:
return SIL_NLP_ENV.pt_projects_dir / project / book_file_name


def get_book_path_by_book_number(project: str, book_number: int) -> Path:
    """Return the path of a book's USFM file in a Paratext project, given the book number.

    The project's settings are parsed to find the file name the project uses for
    the book, and that name is joined onto the project's folder under the
    Paratext projects directory.

    Args:
        project: Name of the Paratext project.
        book_number: Number of the book; converted to a book id via ``book_number_to_id``.

    Returns:
        ``SIL_NLP_ENV.pt_projects_dir / project / <book file name>``. This function
        does not check whether the file exists.
    """
    settings_parser = FileParatextProjectSettingsParser(get_project_dir(project))
    file_name = settings_parser.parse().get_book_file_name(book_number_to_id(book_number))
    return SIL_NLP_ENV.pt_projects_dir / project / file_name


def get_last_verse(project_dir: str, book: str, chapter: int) -> int:
last_verse = "0"
book_path = get_book_path(project_dir, book)
Expand Down Expand Up @@ -571,3 +591,35 @@ def check_versification(project_dir: str) -> Tuple[bool, List[VersificationType]

matching = True
return (matching, detected_versification)


def read_usfm(project_dir: str, book_number: int) -> str:
    """Read the USFM of one book of a Paratext project using the project's settings.

    Args:
        project_dir: Name of the Paratext project (resolved with ``get_project_dir``).
        book_number: Number of the book to read.

    Returns:
        The value returned by ``UsfmFileText._read_usfm()`` for the book's file.

    Raises:
        FileNotFoundError: If the book's USFM file does not exist in the project.
    """
    settings = FileParatextProjectSettingsParser(get_project_dir(project_dir)).parse()
    usfm_path: Path = get_book_path_by_book_number(project_dir, book_number)

    if usfm_path.exists():
        book_id = book_number_to_id(book_number)
        book_text = UsfmFileText(
            settings.stylesheet,
            settings.encoding,
            book_id,
            usfm_path,
            settings.versification,
            include_all_text=True,
            project=settings.name,
        )
        # NOTE: _read_usfm() is a private method of UsfmFileText. It is called here
        # because machine.py does not appear to expose a public way of reading raw
        # USFM with the project settings applied.
        return book_text._read_usfm()

    raise FileNotFoundError(f"USFM file for book number {book_number} not found in project {project_dir}")


# This is a placeholder until the ParatextProjectQuoteConventionDetector is released in machine.py
def parse_project(project_dir: str, selected_books: Iterable[int], usfm_parser_handler: UsfmParserHandler) -> None:
    """Run a USFM parser handler over the selected books of a Paratext project.

    Each book is read with ``read_usfm`` and passed to ``parse_usfm`` together with
    the project's stylesheet and versification. Books whose USFM file is missing
    are skipped.

    Args:
        project_dir: Name of the Paratext project (resolved with ``get_project_dir``).
        selected_books: Numbers of the books to parse.
        usfm_parser_handler: Handler passed to ``parse_usfm`` for every book.
    """
    settings = FileParatextProjectSettingsParser(get_project_dir(project_dir)).parse()
    stylesheet = settings.stylesheet
    versification = settings.versification

    for book_num in selected_books:
        try:
            book_usfm = read_usfm(project_dir, book_num)
        except FileNotFoundError:
            # The project has no file for this book; move on to the next one.
            pass
        else:
            parse_usfm(book_usfm, usfm_parser_handler, stylesheet, versification)
100 changes: 42 additions & 58 deletions silnlp/common/postprocess_draft.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from ..nmt.clearml_connection import SILClearML
from ..nmt.config_utils import load_config
from ..nmt.postprocess import get_draft_paths_from_exp, postprocess_draft, postprocess_experiment
from .paratext import get_project_dir
from .postprocesser import PostprocessConfig, PostprocessHandler
from .utils import get_mt_exp_dir

Expand All @@ -19,28 +18,11 @@ def main() -> None:
)
parser.add_argument(
"--experiment",
required=True,
default=None,
help="Name of an experiment directory in MT/experiments. \
If this option is used, the experiment's translate config will be used to find source and draft files.",
)
parser.add_argument(
"--source",
default=None,
help="Path of the source USFM file. \
If in a Paratext project, the project settings will be used when reading the files.",
)
parser.add_argument(
"--draft",
default=None,
help="Path of the draft USFM file that postprocessing will be applied to. \
Must have the exact same USFM structure as 'source', which it will if it is a draft from that source.",
)
parser.add_argument(
"--book",
default=None,
help="3-letter book id of book being evaluated, e.g. MAT. \
Only necessary if the source file is not in a Paratext project directory.",
)
parser.add_argument(
"--output-folder",
default=None,
Expand All @@ -63,6 +45,30 @@ def main() -> None:
action="store_true",
help="Carry over embeds from the source project to the output without translating them",
)
parser.add_argument(
"--denormalize-quotation-marks",
default=False,
action="store_true",
help="For files in USFM format, attempt to change the draft's quotation marks to match the target project's quote convention",
)
parser.add_argument(
"--source-quote-convention",
default="detect",
type=str,
help="The quote convention for the source project. If not specified, it will be detected automatically.",
)
parser.add_argument(
"--target-quote-convention",
default="detect",
type=str,
help="The quote convention for the target project. If not specified, it will be detected automatically.",
)
parser.add_argument(
"--source-project",
default="",
help="The name of the Paratext project used as the source. When the source quote convention is set to 'detect' or not specified,"
+ " this project will be used to detect the source quote convention.",
)
parser.add_argument(
"--clearml-queue",
default=None,
Expand All @@ -72,52 +78,30 @@ def main() -> None:
)
args = parser.parse_args()

experiment = args.experiment.replace("\\", "/") if args.experiment else None
experiment = args.experiment.replace("\\", "/")
args.output_folder = Path(args.output_folder.replace("\\", "/")) if args.output_folder else None
postprocess_config = PostprocessConfig(vars(args))

if args.experiment and (args.source or args.draft or args.book):
LOGGER.info("--experiment option used. --source, --draft, and --book will be ignored.")
if not (args.experiment or (args.source and args.draft)):
raise ValueError("Not enough options used. Please use --experiment OR --source and --draft.")
if not get_mt_exp_dir(experiment).exists():
raise ValueError(f"Experiment {experiment} not found.")

if experiment:
if not get_mt_exp_dir(experiment).exists():
raise ValueError(f"Experiment {experiment} not found.")

if args.clearml_queue is not None:
if "cpu" not in args.clearml_queue:
raise ValueError("Running this script on a GPU queue will not speed it up. Please only use CPU queues.")
clearml = SILClearML(experiment, args.clearml_queue)
config = clearml.config
else:
config = load_config(experiment)

if not (config.exp_dir / "translate_config.yml").exists():
raise ValueError("Experiment translate_config.yml not found.")

if not postprocess_config.is_base_config():
src_paths, draft_paths, _ = get_draft_paths_from_exp(config)
else:
LOGGER.info("No postprocessing options used. Applying postprocessing requests from translate config.")
postprocess_experiment(config, args.output_folder)
exit()
elif args.clearml_queue is not None:
raise ValueError("Must use --experiment option to use ClearML.")
if args.clearml_queue is not None:
if "cpu" not in args.clearml_queue:
raise ValueError("Running this script on a GPU queue will not speed it up. Please only use CPU queues.")
clearml = SILClearML(experiment, args.clearml_queue)
config = clearml.config
else:
src_paths = [Path(args.source.replace("\\", "/"))]
draft_paths = [Path(args.draft.replace("\\", "/"))]
if not str(src_paths[0]).startswith(str(get_project_dir(""))) and args.book is None:
raise ValueError(
"--book argument must be passed if the source file is not in a Paratext project directory."
)
config = load_config(experiment)

if postprocess_config.is_base_config():
raise ValueError("Please use at least one postprocessing option.")
postprocess_handler = PostprocessHandler([postprocess_config], include_base=False)
if not (config.exp_dir / "translate_config.yml").exists():
raise ValueError("Experiment translate_config.yml not found.")

for src_path, draft_path in zip(src_paths, draft_paths):
postprocess_draft(src_path, draft_path, postprocess_handler, args.book, args.output_folder)
if postprocess_config.is_base_config():
LOGGER.info("No postprocessing options used. Applying postprocessing requests from translate config.")
postprocess_experiment(config, out_dir=args.output_folder)
else:
postprocess_handler = PostprocessHandler([postprocess_config], include_base=False)
postprocess_experiment(config, postprocess_handler=postprocess_handler, out_dir=args.output_folder)


if __name__ == "__main__":
Expand Down
Loading