2 changes: 2 additions & 0 deletions debug.log
@@ -0,0 +1,2 @@
[0529/225757.623:ERROR:registration_protocol_win.cc(108)] CreateFile: The system cannot find the file specified. (0x2)
[0529/225806.839:ERROR:registration_protocol_win.cc(108)] CreateFile: The system cannot find the file specified. (0x2)
143 changes: 143 additions & 0 deletions silnlp/common/bulk_extract_local.py
@@ -0,0 +1,143 @@
import argparse
import logging
import sys
from pathlib import Path
from typing import List

from machine.corpora import FileParatextProjectSettingsParser
from machine.scripture import ORIGINAL_VERSIFICATION, VerseRef, get_books

from ..common.environment import SIL_NLP_ENV
from .corpus import count_lines
from .paratext import check_versification, extract_project, extract_term_renderings

LOGGER = logging.getLogger(__package__ + ".bulk_extract_local")
SETTINGS_FILENAME = "Settings.xml"
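
# A rough usage sketch (hypothetical paths; assumes the silnlp package is importable):
#   python -m silnlp.common.bulk_extract_local <projects_folder> <corpus_folder> --terms <terms_folder> --include NT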


def parse_settings(project: Path):
    """Parse the project's Settings.xml and return the settings, or None on failure."""
    settings_file_path = project / SETTINGS_FILENAME
    if not settings_file_path.is_file():
        LOGGER.warning(f"{SETTINGS_FILENAME} not found in {project}.")
        return None

try:
parser = FileParatextProjectSettingsParser(str(project))
project_settings = parser.parse()

        # Other settings available on project_settings include: name, full_name,
        # encoding, versification, file_name_prefix/form/suffix,
        # biblical_terms_list_type, biblical_terms_project_name,
        # biblical_terms_file_name, and language_code.

    except Exception as e:
        LOGGER.error(f"Error parsing {SETTINGS_FILENAME}: {e}")
return None

return project_settings


def get_expected_verse_count(project: Path, include: List[str], exclude: List[str]) -> int:
include_books_set = get_books(include) if len(include) > 0 else None
exclude_books_set = get_books(exclude) if len(exclude) > 0 else None
project_settings = parse_settings(project)

    # Fall back to the original versification when the settings are missing or specify none.
    versification = ORIGINAL_VERSIFICATION
    if project_settings is not None and project_settings.versification:
        versification = project_settings.versification
        versification_name = getattr(versification, "name", str(versification))
        LOGGER.info(f"Found versification {versification_name} in {SETTINGS_FILENAME} for {project}")

def filter_lines(verse_ref_str: str) -> bool:
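        # Keep a verse unless its book is in the exclude list; when an include list
        # is given, keep only verses whose book is in it.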
if include_books_set is None and exclude_books_set is None:
return True

        vref = VerseRef.from_string(verse_ref_str.strip(), versification)
if exclude_books_set is not None and vref.book_num in exclude_books_set:
return False

if include_books_set is not None and vref.book_num in include_books_set:
return True

return include_books_set is None

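    # Count the verses in vref.txt (the canonical verse reference list) that pass the filter.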
return count_lines(SIL_NLP_ENV.assets_dir / "vref.txt", filter_lines)


def has_settings_file(project_folder: Path) -> bool:
return (project_folder / SETTINGS_FILENAME).is_file() or (project_folder / SETTINGS_FILENAME.lower()).is_file()


def main() -> None:
parser = argparse.ArgumentParser(description="Extracts text corpora from Paratext projects")
parser.add_argument("input", type=str, help="The input folder.")
parser.add_argument("output", type=str, help="The output corpus folder.")
parser.add_argument("--terms", type=str, required=True, help="The output terms folder.")
parser.add_argument(
"--include", metavar="books", nargs="+", default=[], help="The books to include; e.g., 'NT', 'OT', 'GEN'"
)
parser.add_argument(
"--exclude", metavar="books", nargs="+", default=[], help="The books to exclude; e.g., 'NT', 'OT', 'GEN'"
)
parser.add_argument("--markers", default=False, action="store_true", help="Include USFM markers")
parser.add_argument("--lemmas", default=False, action="store_true", help="Extract lemmas if available")
parser.add_argument("--project-vrefs", default=False, action="store_true", help="Extract project verse refs")

args = parser.parse_args()

input_path = Path(args.input)
output_path = Path(args.output)
terms_path = Path(args.terms)

if not input_path.is_dir():
print(f"Error: Projects folder not found: {args.input}")
sys.exit(1)

if not output_path.is_dir():
print(f"Error: Output folder not found: {args.output}")
sys.exit(1)

if not terms_path.is_dir():
print(f"Error: Output terms folder not found: {args.terms}")
sys.exit(1)

    # Find the project folders that contain a Settings.xml file.
projects = [folder for folder in input_path.glob("*") if folder.is_dir() and has_settings_file(folder)]

    # Process the projects that were found and report progress to the user.
    if projects:
for project in projects:
LOGGER.info(f"Extracting {project} to {output_path}")
expected_verse_count = get_expected_verse_count(project, args.include, args.exclude)

check_versification(project)
corpus_filename, verse_count = extract_project(
project,
output_path,
args.include,
args.exclude,
args.markers,
args.lemmas,
args.project_vrefs,
)

# check if the number of lines in the file is correct (the same as vref.txt)
LOGGER.info(f"# of Verses: {verse_count}")
if verse_count != expected_verse_count:
LOGGER.error(f"The number of verses is {verse_count}, but should be {expected_verse_count}.")
terms_count = extract_term_renderings(project, corpus_filename, terms_path)
LOGGER.info(f"# of Terms: {terms_count}")
LOGGER.info("Done.")
else:
LOGGER.warning(f"Couldn't find any data to process for any project in {input_path}.")


if __name__ == "__main__":
main()
3 changes: 2 additions & 1 deletion silnlp/common/check_books.py
@@ -50,7 +50,8 @@ def parse_book(project_dir: str, book: str):

settings = FileParatextProjectSettingsParser(project_dir).parse()
book_path = Path(project_dir) / settings.get_book_file_name(book)

LOGGER.info(f"Attempting to parse {book} from {book_path}.")

if not book_path.is_file():
raise RuntimeError(f"Can't find file {book_path} for book {book}")

116 changes: 116 additions & 0 deletions silnlp/common/combine_scores_save.py
@@ -0,0 +1,116 @@
import argparse
import csv
import sys
from collections import defaultdict
from pathlib import Path

import pandas as pd

from ..common.environment import SIL_NLP_ENV
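
# A rough usage sketch (hypothetical folder; assumes the silnlp package is importable):
#   python -m silnlp.common.combine_scores_save <experiments_folder> --output_filename scores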


def check_for_lock_file(folder: Path, filename: str, file_type: str):
"""Check for lock files and ask the user to close them then exit."""

if file_type[0] == ".":
file_type = file_type[1:]

    if file_type.lower() == "csv":
        lockfile = folder / f".~lock.{filename}.{file_type}#"
    elif file_type.lower() == "xlsx":
        lockfile = folder / f"~${filename}.{file_type}"
    else:
        # Unknown file type: nothing to check.
        return

    if lockfile.is_file():
        print(f"Found lock file: {lockfile}")
        print(f"Please close {filename}.{file_type} in folder {folder} OR delete the lock file and try again.")
        sys.exit(1)


def aggregate_csv(folder_path):
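    """Collect rows from every scores-*.csv under folder_path, grouped by their header row."""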
# Dictionary to store rows by header type
data_by_header = defaultdict(list)

# Iterate over all CSV files in the folder and its subfolders
for csv_file in folder_path.rglob("*/scores-*.csv"):
series = csv_file.parts[-3] # Extract series folder name
experiment = csv_file.parts[-2] # Extract experiment folder name
steps = csv_file.stem.split("-")[-1] # Extract steps from file name

# Read the CSV file and add new columns
with open(csv_file, "r") as f:
reader = csv.reader(f)
rows = list(reader)
header = tuple(rows[0]) # Use tuple to make it hashable

# Add columns to the beginning of each row
if header not in data_by_header:
data_by_header[header].append(["Series", "Experiment", "Steps"] + list(header))
for row in rows[1:]:
data_by_header[header].append([series, experiment, steps] + row)

return data_by_header


def write_to_csv(data_by_header, folder, output_filename):
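    """Write the grouped rows to a single CSV file, separating header groups with a blank row."""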

output_file = folder / f"{output_filename}.csv"
with open(output_file, "w", newline="") as f:
writer = csv.writer(f)
        for rows in data_by_header.values():
writer.writerows(rows)
writer.writerow([]) # Add a blank row to separate different types
# Write the folder path to the last line of the CSV file
writer.writerow([folder])
print(f"Wrote scores to {output_file}")


def write_to_excel(data_by_header, folder, output_filename):
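    """Write each header group to its own sheet of an Excel workbook."""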
output_file = folder / f"{output_filename}.xlsx"
with pd.ExcelWriter(output_file) as writer:
for i, (header, rows) in enumerate(data_by_header.items()):
# Create a DataFrame for the current header
df = pd.DataFrame(rows[1:], columns=rows[0])
            # Convert columns to numeric where possible, leaving non-numeric columns unchanged
            # (note: errors="ignore" is deprecated in newer pandas releases)
            df = df.apply(pd.to_numeric, errors="ignore")
# Generate a unique sheet name
sheet_name = f"Table_{i + 1}"
# Write the DataFrame to the Excel file
df.to_excel(writer, sheet_name=sheet_name, index=False)
print(f"Wrote scores to {output_file}")


def main():
parser = argparse.ArgumentParser(description="Aggregate CSV files in a folder.")
parser.add_argument("folder", type=Path, help="Path to the folder containing CSV files.")
parser.add_argument(
"--output_filename",
type=str,
default="scores",
help="Filename suffix without the '.csv' or '.xlsx'. \
The folder name is added as a prefix to make it easier to distinguish scores files in search results.",
)
args = parser.parse_args()

    folder = Path(args.folder)
    if not folder.is_dir():
        folder = Path(SIL_NLP_ENV.mt_experiments_dir) / args.folder

    # Prefix the output filenames with the folder name (not the full path) to make
    # scores files easier to distinguish in search results.
    csv_filename = f"{folder.name}_{args.output_filename}"
    excel_filename = f"{folder.name}_{args.output_filename}"

    # Check for lock files and ask the user to close them.
    check_for_lock_file(folder, csv_filename, "csv")
    check_for_lock_file(folder, excel_filename, "xlsx")

data = aggregate_csv(folder)

# Write the aggregated data to a new CSV file
write_to_csv(data, folder, csv_filename)

# Write the aggregated data to an Excel file
write_to_excel(data, folder, excel_filename)


if __name__ == "__main__":
main()
29 changes: 26 additions & 3 deletions silnlp/common/find_by_iso.py
@@ -3,8 +3,10 @@
import logging
from pathlib import Path
from typing import Dict, List, Set, Tuple, Union
import regex as re
import sys


from .environment import SIL_NLP_ENV
from .iso_info import NLLB_ISO_SET, ALT_ISO

@@ -85,6 +87,21 @@ def split_files_by_projects(files: List[Path], projects_dir: Path) -> Tuple[Dict
def get_equivalent_isocodes(iso_codes: List[str]) -> Set[str]:
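    """Return the given ISO codes plus any known alternative forms, dropping empty values."""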
return {code for iso_code in iso_codes for code in (iso_code, ALT_ISO.get_alternative(iso_code)) if code}

def filter_files(files: List[Path], excluded_patterns: List[str]) -> List[Path]:
    """Filter out files whose names look like dated backups, partial extracts, or match excluded patterns."""
    filtered = []
    # Matches date suffixes such as _2023_1_15 or _15_1_2023.
    date_pattern = re.compile(r"_\d{4}_\d{1,2}_\d{1,2}|_\d{1,2}_\d{1,2}_\d{4}")

    for file in files:
        # Expect file names of the form "<iso>-<project name>".
        parts = file.stem.split("-", 1)
        if len(parts) != 2:
            continue
        iso, name = parts
        if date_pattern.search(name):
            continue
        if len(iso) not in (2, 3):
            continue
        if any(pattern.lower() in name.lower() for pattern in excluded_patterns):
            continue
        # Skip small files (< 100 KB); they are unlikely to hold a substantial corpus.
        if file.is_file() and file.stat().st_size < 100_000:
            continue
        filtered.append(file)
    return filtered

def main():
parser = argparse.ArgumentParser(description="Find related ISO language codes.")
parser.add_argument("iso_codes", nargs="+", help="ISO codes to find related languages for")
@@ -151,7 +168,13 @@ def main():

# Find files matching the codes
files = get_files_by_iso(all_possible_codes, scripture_dir)
existing_projects, missing_projects = split_files_by_projects(files, projects_dir)

    # Filter out AI-drafted and XRI files, plus partial or dated extracts.
    excluded_patterns = [
        "XRI", "600M", "3.3B", "1.3B", "words", "name", "clean",
        "transcription", "matthew", "mark", "mrk", "luk",
    ]
    filtered_files = filter_files(files, excluded_patterns)
    print(f"Found {len(files)} files; {len(files) - len(filtered_files)} were filtered out.")

existing_projects, missing_projects = split_files_by_projects(filtered_files, projects_dir)

# Display results
if existing_projects:
@@ -163,8 +186,8 @@ def main():
logger.info(f"\nThese {len(missing_projects)} files don't have a corresponding project folder:")
for file, _ in missing_projects.items():
logger.info(f"{file.stem}")
logger.info(f"\nAll the files:")
for file in files:
logger.info(f"\nFiltered files:")
for file in filtered_files:
logger.info(f" - {file.stem}")

if not files: