From fb074bff7b850434ac63c1ec16156aa57cde2d0d Mon Sep 17 00:00:00 2001 From: Bethany Moore Date: Wed, 19 Feb 2025 12:55:15 -0500 Subject: [PATCH 1/9] Add LatinWordTokenizer from machine --- silnlp/common/compare_lex.py | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/silnlp/common/compare_lex.py b/silnlp/common/compare_lex.py index ffa2f8a9..84629b7a 100644 --- a/silnlp/common/compare_lex.py +++ b/silnlp/common/compare_lex.py @@ -4,20 +4,34 @@ from typing import List from ..common.environment import SIL_NLP_ENV +from machine.tokenization import LatinWordTokenizer +# Latin Tokenizer from machine library def get_all_words(src_file: str) -> List: words = [] - pattern = re.compile(r",(?=\S)") # Look for commas with no following space - with open(src_file, "r", encoding = "utf8") as src_data_file: + tokenizer = LatinWordTokenizer() + with open(src_file, "r", encoding = "utf8") as src_data_file: for line in src_data_file: - for word in line.split(" "): - word = word.strip().strip("\'\"\\;,:.!?()-[]0123456789").lower() - finder = pattern.search(word) - if finder: # Add space after commas as needed - word = word[:finder.span()[1]]+" "+word[finder.span()[1]:] + line_words = tokenizer.tokenize(line) + for word in line_words: if word != "": - words.append(word) - return words + words.append(word) + return(words) + +# Naive whitespace-based script-agnostic word splitter +#def get_all_words(src_file: str) -> List: +# words = [] +# pattern = re.compile(r",(?=\S)") # Look for commas with no following space +# with open(src_file, "r", encoding = "utf8") as src_data_file: +# for line in src_data_file: +# for word in line.split(" "): +# word = word.strip().strip("\'\"\\;,:.!?()-[]0123456789").lower() +# finder = pattern.search(word) +# if finder: # Add space after commas as needed +# word = word[:finder.span()[1]]+" "+word[finder.span()[1]:] +# if word != "": +# words.append(word) +# return words def find_unique(words1: List, words2: List) -> List: unique_words = [] @@ -37,6 +51,7 @@ def main() -> None: action='store_true') parser.add_argument("--trg", help="If set, only the target side of the two experiment lexicons is compared", action='store_true') + parser.add_argument("--books", help="Books to include from src and trg.") args = parser.parse_args() # If not explicitly limited, compare both source and target lexicons @@ -109,7 +124,7 @@ def main() -> None: for word in trg1_only_words: output_file.writelines(word+'\n') with (lex_path2 / "unmatched_trg_words.txt").open("w", encoding="utf8") as output_file: - output_file.writelines(f'src.txt words not found in {trg_file1}\n') + output_file.writelines(f'trg.txt words not found in {trg_file1}\n') for word in trg2_only_words: output_file.writelines(word+'\n') From af3c79ae0ed7076af2230d129771aaa0d44dfd0f Mon Sep 17 00:00:00 2001 From: Bethany Moore Date: Tue, 25 Feb 2025 09:38:25 -0500 Subject: [PATCH 2/9] Update lex_tools for Catapult Reloaded work --- silnlp/common/compare_lexCR.py | 180 +++++++++++++++++++++++++++++++++ 1 file changed, 180 insertions(+) create mode 100644 silnlp/common/compare_lexCR.py diff --git a/silnlp/common/compare_lexCR.py b/silnlp/common/compare_lexCR.py new file mode 100644 index 00000000..1bbdaca5 --- /dev/null +++ b/silnlp/common/compare_lexCR.py @@ -0,0 +1,180 @@ +import argparse +from collections import Counter +import numpy +import re +from typing import List + +from ..common.environment import SIL_NLP_ENV +from machine.tokenization import LatinWordTokenizer + +# Latin 
Tokenizer from machine library +def get_all_words(src_file: str) -> List: + words = [] + tokenizer = LatinWordTokenizer() + with open(src_file, "r", encoding = "utf8") as src_data_file: + for line in src_data_file: + line_words = tokenizer.tokenize(line) + for word in line_words: + word = word.strip().strip("\'\"\\;,:.!?()-[]").lower() + if word != "" and not word.isnumeric(): + words.append(word) + return words + +# Naive whitespace-based script-agnostic word splitter +#def get_all_words(src_file: str) -> List: +# words = [] +# pattern = re.compile(r",(?=\S)") # Look for commas with no following space +# with open(src_file, "r", encoding = "utf8") as src_data_file: +# for line in src_data_file: +# for word in line.split(" "): +# word = word.strip().strip("\'\"\\;,:.!?()-[]0123456789").lower() +# finder = pattern.search(word) +# if finder: # Add space after commas as needed +# word = word[:finder.span()[1]]+" "+word[finder.span()[1]:] +# if word != "": +# words.append(word) +# return words + +def find_unique(words1: List, words2: List) -> List: + unique_words = [] + for word in words1: + if word not in words2: + unique_words.append(word) + return unique_words + + +def main() -> None: + parser = argparse.ArgumentParser(description="Compares unique words in two corpora") + parser.add_argument("exp1", help="First experiment folder from path S:\\Alignment\\experiments\\") + parser.add_argument("exp2", help="Second experiment folder from path S:\\Alignment\\experiments\\") + parser.add_argument("--stats", help="True or False: Output word count and number of renderings for common words", + action='store_true') + parser.add_argument("--src", help="If set, only the source side of the two experiment lexicons is compared", + action='store_true') + parser.add_argument("--trg", help="If set, only the target side of the two experiment lexicons is compared", + action='store_true') + args = parser.parse_args() + + # If not explicitly limited, compare both source and target lexicons + if args.src == False and args.trg == False: + args.src = True + args.trg = True + + lex_path1 = SIL_NLP_ENV.align_experiments_dir / args.exp1 + lex_path2 = SIL_NLP_ENV.align_experiments_dir / args.exp2 + + # Compare source words and write results to files + if args.src == True: + src_file1 = lex_path1 / "src.txt" + src_file2 = lex_path2 / "src.txt" + + # Find all words and unique words on source side + src_words1 = get_all_words(src_file1) + unique_src_words1 = numpy.unique(numpy.array(src_words1)) + src_words2 = get_all_words(src_file2) + unique_src_words2 = numpy.unique(numpy.array(src_words2)) + src1_only_words = find_unique(unique_src_words1,unique_src_words2) + src2_only_words = find_unique(unique_src_words2,unique_src_words1) + src1_word_counter = Counter(src_words1).most_common() + src2_word_counter = Counter(src_words2).most_common() + + # Write unique source words to files + src_words_file1 = lex_path1 / "src_words.txt" + src_words_file2 = lex_path2 / "src_words.txt" + with open(src_words_file1, "w", encoding="utf8") as output_file: + for word in unique_src_words1: + output_file.writelines(word+'\n') + with open(src_words_file2, "w", encoding="utf8") as output_file: + for word in unique_src_words2: + output_file.writelines(word+'\n') + + # Re-write src_words files with counts + with (lex_path1 / "src_words.txt").open("w", encoding = "utf8") as output_file: + for entry in src1_word_counter: + output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n') + with (lex_path2 / "src_words.txt").open("w", encoding = 
"utf8") as output_file: + for entry in src2_word_counter: + output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n') + + # Write source words missing from the alternate source file + with (lex_path1 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file: + output_file.writelines(f'src.txt words not found in {src_file2}\n') + for word in src1_only_words: + output_file.writelines(word+'\n') + with (lex_path2 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file: + output_file.writelines(f'src.txt words not found in {src_file1}\n') + for word in src2_only_words: + output_file.writelines(word+'\n') + + # Compare target words and write results to files + if args.trg == True: + trg_file1 = lex_path1 / "trg.txt" + trg_file2 = lex_path2 / "trg.txt" + + # Find all words and unique words on target side + trg_words1 = get_all_words(trg_file1) + unique_trg_words1 = numpy.unique(numpy.array(trg_words1)) + trg_words2 = get_all_words(trg_file2) + unique_trg_words2 = numpy.unique(numpy.array(trg_words2)) + trg1_only_words = find_unique(unique_trg_words1,unique_trg_words2) + trg2_only_words = find_unique(unique_trg_words2,unique_trg_words1) + + # Write unique target words to files + trg_words_file1 = lex_path1 / "trg_words.txt" + trg_words_file2 = lex_path2 / "trg_words.txt" + with open(trg_words_file1, "w", encoding="utf8") as output_file: + for word in unique_trg_words1: + output_file.writelines(word+'\n') + with open(trg_words_file2, "w", encoding="utf8") as output_file: + for word in unique_trg_words2: + output_file.writelines(word+'\n') + + # Write target words missing from the alternate target file + with (lex_path1 / "unmatched_trg_words.txt").open("w", encoding="utf8") as output_file: + output_file.writelines(f'trg.txt words not found in {trg_file2}\n') + for word in trg1_only_words: + output_file.writelines(word+'\n') + with (lex_path2 / "unmatched_trg_words.txt").open("w", encoding="utf8") as output_file: + output_file.writelines(f'trg.txt words not found in {trg_file1}\n') + for word in trg2_only_words: + output_file.writelines(word+'\n') + + # Write the lex coverage stats + with (lex_path1 / "lex_coverage.txt").open("a", encoding="utf8") as output_file: + if args.src == True: + output_file.writelines(f'Unique words in src.txt: {len(unique_src_words1)}\n') + output_file.writelines( + f'Words also found in {src_words_file2}: {len(unique_src_words1)-len(src1_only_words)}\n') + output_file.writelines(f'Words missing from {src_words_file2}: {len(src1_only_words)}\n') + if args.trg == True: + output_file.writelines(f'Unique words in trg.txt: {len(unique_trg_words1)}\n') + output_file.writelines( + f'Words also found in {trg_words_file2}: {len(unique_trg_words1)-len(trg1_only_words)}\n') + output_file.writelines(f'Words missing from {trg_words_file2}: {len(trg1_only_words)}\n') + + with (lex_path2 / "lex_coverage.txt").open("a", encoding="utf8") as output_file: + if args.src == True: + output_file.writelines(f'Unique words in src.txt: {len(unique_src_words2)}\n') + output_file.writelines( + f'Words also found in {src_words_file1}: {len(unique_src_words2)-len(src2_only_words)}\n') + output_file.writelines(f'Words missing from {src_words_file1}: {len(src2_only_words)}\n') + if args.trg == True: + output_file.writelines(f'Unique words in trg.txt: {len(unique_trg_words2)}\n') + output_file.writelines( + f'Words also found in {trg_words_file1}: {len(unique_trg_words2)-len(trg2_only_words)}\n') + output_file.writelines(f'Words missing from {trg_words_file1}: 
{len(trg2_only_words)}\n')
+
+    # Output stats if requested
+    if args.stats == True:
+        if args.src == True:
+            print(f'Unique words in src.txt: {len(unique_src_words1)}')
+            print(f'Words also found in {src_words_file2}: {len(unique_src_words1)-len(src1_only_words)}')
+            print(f'Words missing from {src_words_file2}: {len(src1_only_words)}')
+        if args.trg == True:
+            print(f'Unique words in {trg_words_file1}: {len(unique_trg_words1)}')
+            print(f'Words also found in {trg_words_file2}: {len(unique_trg_words1)-len(trg1_only_words)}')
+            print(f'Words missing from {trg_words_file2}: {len(trg1_only_words)}')
+
+
+if __name__ == "__main__":
+    main()

From 42e272411f93df3259df1142b092a756124b02f8 Mon Sep 17 00:00:00 2001
From: Bethany Moore
Date: Tue, 25 Feb 2025 09:39:06 -0500
Subject: [PATCH 3/9] Further update lex_tools for Catapult Reloaded work

---
 silnlp/common/compare_lex.py |  6 ++--
 silnlp/common/count_words.py | 17 ++++++---
 silnlp/common/find_words.py  | 70 ++++++++++++++++++++++++++++++++++++
 3 files changed, 85 insertions(+), 8 deletions(-)
 create mode 100644 silnlp/common/find_words.py

diff --git a/silnlp/common/compare_lex.py b/silnlp/common/compare_lex.py
index 84629b7a..da387543 100644
--- a/silnlp/common/compare_lex.py
+++ b/silnlp/common/compare_lex.py
@@ -14,9 +14,10 @@ def get_all_words(src_file: str) -> List:
         for line in src_data_file:
             line_words = tokenizer.tokenize(line)
             for word in line_words:
-            if word != "":
+                word = word.strip().strip("\'\"\\;,:.!?()-[]0123456789").lower()
+            if word != "" and not word.isnumeric():
                 words.append(word)
-    return(words)
+    return words
 
 # Naive whitespace-based script-agnostic word splitter
 #def get_all_words(src_file: str) -> List:
@@ -51,7 +52,6 @@ def main() -> None:
                         action='store_true')
     parser.add_argument("--trg", help="If set, only the target side of the two experiment lexicons is compared",
                         action='store_true')
-    parser.add_argument("--books", help="Books to include from src and trg.")
     args = parser.parse_args()
 
     # If not explicitly limited, compare both source and target lexicons
diff --git a/silnlp/common/count_words.py b/silnlp/common/count_words.py
index 6fd83204..f2fd2010 100644
--- a/silnlp/common/count_words.py
+++ b/silnlp/common/count_words.py
@@ -15,8 +15,9 @@ def main() -> None:
     parser.add_argument("experiment", help="Experiment folder from path S:\\Alignment\\experiments\\")
     parser.add_argument("--aligner", help="Aligner: eflomal, fast-align, hmm", default="eflomal")
     parser.add_argument("--num", help="Number of most common words to include", type=int, default=100)
-    parser.add_argument("--stats", help="True or False: Print word count and number of renderings for common words",
+    parser.add_argument("--stats", help="Print word count and number of renderings for common words",
                         action='store_true')
+    parser.add_argument("--count", help="Include count in src word files", action='store_true')
    args = parser.parse_args()
 
     # Set up path and lex files
@@ -49,6 +50,8 @@ def main() -> None:
         if word != "" and not word.isnumeric():
             src_words.append(word)
     src_data_word_counter = Counter(src_words).most_common(args.num)
+    if args.count:
+        src_word_counter = Counter(src_words).most_common()
     unique_src_words = numpy.unique(numpy.array(src_words))
 
     # Pull all the separate words from the target data. Take all unique.
@@ -65,7 +68,7 @@ def main() -> None: trg_words.append(word) unique_trg_words = numpy.unique(numpy.array(trg_words)) - # Clean lexicon file and prep for pandas csv reader + # Prep lexicon file for pandas csv reader (escape quotes) with (lex_path / lex_txt_file).open("r", encoding="utf8") as lexicon: with (lex_path / new_lex_txt_file).open("w", encoding="utf8") as new_lex: for line in lexicon.readlines(): @@ -111,9 +114,13 @@ def main() -> None: for src_wd in common_wd: writer.writerow([src_wd, *common_wd[src_wd]]) - with (lex_path / "src_words.txt").open("w", encoding = "utf8") as output_file: - for word in unique_src_words: - output_file.writelines(word + '\n') + with (lex_path / "src_words.txt").open("w", encoding = "utf8") as output_file: + if args.count: + for entry in src_word_counter: + output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n') + else: + for word in unique_src_words: + output_file.writelines(word + '\n') # Optionally, output a few stats if args.stats: diff --git a/silnlp/common/find_words.py b/silnlp/common/find_words.py new file mode 100644 index 00000000..a5cd1a6b --- /dev/null +++ b/silnlp/common/find_words.py @@ -0,0 +1,70 @@ +import argparse +from collections import Counter +import csv +import unicodedata + +from ..common.environment import SIL_NLP_ENV + +# Normalize combined characters for Devanagari (Ref: https://docs.python.org/3/howto/unicode.html#comparing-strings) +def NFD(s): + return unicodedata.normalize('NFD', s) + +def main(): + parser = argparse.ArgumentParser(description="Counts lexicon entries") + parser.add_argument("experiment", help="Experiment folder from path S:\\Alignment\\experiments\\") + parser.add_argument("--word_list", help="File containing words to find", default="unmatched_src_words.txt") + args = parser.parse_args() + + # Set up path and files + lex_path = SIL_NLP_ENV.align_experiments_dir / args.experiment + word_filename = SIL_NLP_ENV.align_experiments_dir / args.experiment / args.word_list + vref_filename = SIL_NLP_ENV.align_experiments_dir / args.experiment/ "refs.txt" + + # Get count of each word in the file + with (lex_path / "src_words.txt").open("r", encoding="utf8") as src_wd_file: + src_word_counts = [] + for entry in src_wd_file: + entry = list(entry.split('\t')) + if len(entry) > 1: + entry[1] = int(entry[1].strip()) + src_word_counts.append(entry) + else: + print("Error: word counts are missing. Please run count_words.py with the --count flag set.") + return 1 + + # Extract list of words + src_word_dict = dict(list(src_word_counts)) + with(word_filename).open("r", encoding = "utf8") as word_file: + words = [] + for word in word_file: + words.append(word.rstrip('\n')) + # Check for words and word count in each verse; write to output file. 
+ with (lex_path / "src.txt").open("r", encoding = "utf8") as src_data_file: + with(vref_filename).open("r", encoding = "utf8") as ref_file: + word_list = list(enumerate(words)) + result = [] + seen_words = [] + for verse in zip(ref_file, src_data_file): + word_text = [] + word_num = [] + word_count = 0 + for word in word_list: + #if NFD(NFD(word[1])) in NFD(NFD(verse[1])): + if word[1] in verse[1] and word[1] not in seen_words: + for entry in src_word_counts: + if entry[0] == word[1]: + word_count += entry[1] + seen_words.append(word[1]) + word_text.append(word[1]) + word_num.append(src_word_dict[word[1]]) + result.append([verse[0].rstrip('\n'), word_count, word_num, word_text]) + with (lex_path / "unmatched_word_verses.txt").open("w", encoding = "utf8") as output_file: + writer = csv.writer(output_file, lineterminator="\n") + writer.writerow(['Reference','Novelty Score','Word Counts','Words']) + for line in result: + writer.writerow([line[0], line[1], line[2], *line[3]]) + #print(result) + + +if __name__ == '__main__': + main() From f47eed9accb9c05410b8fbf523e5381d326d76f8 Mon Sep 17 00:00:00 2001 From: Bethany Moore Date: Wed, 23 Apr 2025 11:14:10 -0400 Subject: [PATCH 4/9] Add functionality to extract vocab coverage lists for Catapult --- silnlp/common/compare_lex_Catapult.py | 185 ++++++++++++++++++++++++++ silnlp/common/find_words_Catapult.py | 71 ++++++++++ 2 files changed, 256 insertions(+) create mode 100644 silnlp/common/compare_lex_Catapult.py create mode 100644 silnlp/common/find_words_Catapult.py diff --git a/silnlp/common/compare_lex_Catapult.py b/silnlp/common/compare_lex_Catapult.py new file mode 100644 index 00000000..9f63bf69 --- /dev/null +++ b/silnlp/common/compare_lex_Catapult.py @@ -0,0 +1,185 @@ +import argparse +from collections import Counter +import numpy +import re +from typing import List + +from ..common.environment import SIL_NLP_ENV +from machine.tokenization import LatinWordTokenizer + +# Latin Tokenizer from machine library +def get_all_words(src_file: str) -> List: + words = [] + tokenizer = LatinWordTokenizer() + with open(src_file, "r", encoding = "utf8") as src_data_file: + for line in src_data_file: + line_words = tokenizer.tokenize(line) + for word in line_words: + word = word.strip().strip("\'\"\\;,:.!?()-[]").lower() + if word != "" and not word.isnumeric(): + words.append(word) + return words + +# Naive whitespace-based script-agnostic word splitter +#def get_all_words(src_file: str) -> List: +# words = [] +# pattern = re.compile(r",(?=\S)") # Look for commas with no following space +# with open(src_file, "r", encoding = "utf8") as src_data_file: +# for line in src_data_file: +# for word in line.split(" "): +# word = word.strip().strip("\'\"\\;,:.!?()-[]0123456789").lower() +# finder = pattern.search(word) +# if finder: # Add space after commas as needed +# word = word[:finder.span()[1]]+" "+word[finder.span()[1]:] +# if word != "": +# words.append(word) +# return words + +def find_unique(words1: List, words2: List) -> List: + unique_words = [] + for word in words1: + if word not in words2: + unique_words.append(word) + return unique_words + + +def main() -> None: + parser = argparse.ArgumentParser(description="Compares unique words in two corpora") + parser.add_argument("exp1", help="First experiment folder from path S:\\Alignment\\experiments\\") + parser.add_argument("exp2", help="Second experiment folder from path S:\\Alignment\\experiments\\") + parser.add_argument("--stats", help="True or False: Output word count and number of renderings for 
common words", + action='store_true') + parser.add_argument("--src", help="If set, only the source side of the two experiment lexicons is compared", + action='store_true') + parser.add_argument("--trg", help="If set, only the target side of the two experiment lexicons is compared", + action='store_true') + args = parser.parse_args() + + # If not explicitly limited, compare both source and target lexicons + if args.src == False and args.trg == False: + args.src = True + args.trg = True + + lex_path1 = SIL_NLP_ENV.align_experiments_dir / args.exp1 + lex_path2 = SIL_NLP_ENV.align_experiments_dir / args.exp2 + + # Compare source words and write results to files + if args.src == True: + src_file1 = lex_path1 / "src.txt" + src_file2 = lex_path2 / "src.txt" + + # Find all words and unique words on source side + src_words1 = get_all_words(src_file1) + unique_src_words1 = numpy.unique(numpy.array(src_words1)) + src_words2 = get_all_words(src_file2) + unique_src_words2 = numpy.unique(numpy.array(src_words2)) + src1_only_words = find_unique(unique_src_words1,unique_src_words2) + src2_only_words = find_unique(unique_src_words2,unique_src_words1) + src1_word_counter = Counter(src_words1).most_common() + src2_word_counter = Counter(src_words2).most_common() + + # Write unique source words to files + src_words_file1 = lex_path1 / "src_words.txt" + src_words_file2 = lex_path2 / "src_words.txt" + with open(src_words_file1, "w", encoding="utf8") as output_file: + for word in unique_src_words1: + output_file.writelines(word+'\n') + with open(src_words_file2, "w", encoding="utf8") as output_file: + for word in unique_src_words2: + output_file.writelines(word+'\n') + + # Re-write src_words files with counts + with (lex_path1 / "src_words.txt").open("w", encoding = "utf8") as output_file: + for entry in src1_word_counter: + output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n') + with (lex_path2 / "src_words.txt").open("w", encoding = "utf8") as output_file: + for entry in src2_word_counter: + output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n') + # Output src1 words missing from src2, with their counts + with (lex_path1 / "missing_word_counts.txt").open("w", encoding = "utf8") as output_file: + for entry in src1_word_counter: + if entry[0] in src1_only_words: + output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n') + + # Write source words missing from the alternate source file + with (lex_path1 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file: + output_file.writelines(f'src.txt words not found in {src_file2}\n') + for word in src1_only_words: + output_file.writelines(word+'\n') + with (lex_path2 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file: + output_file.writelines(f'src.txt words not found in {src_file1}\n') + for word in src2_only_words: + output_file.writelines(word+'\n') + + # Compare target words and write results to files + if args.trg == True: + trg_file1 = lex_path1 / "trg.txt" + trg_file2 = lex_path2 / "trg.txt" + + # Find all words and unique words on target side + trg_words1 = get_all_words(trg_file1) + unique_trg_words1 = numpy.unique(numpy.array(trg_words1)) + trg_words2 = get_all_words(trg_file2) + unique_trg_words2 = numpy.unique(numpy.array(trg_words2)) + trg1_only_words = find_unique(unique_trg_words1,unique_trg_words2) + trg2_only_words = find_unique(unique_trg_words2,unique_trg_words1) + + # Write unique target words to files + trg_words_file1 = lex_path1 / "trg_words.txt" + trg_words_file2 = lex_path2 / 
"trg_words.txt" + with open(trg_words_file1, "w", encoding="utf8") as output_file: + for word in unique_trg_words1: + output_file.writelines(word+'\n') + with open(trg_words_file2, "w", encoding="utf8") as output_file: + for word in unique_trg_words2: + output_file.writelines(word+'\n') + + # Write target words missing from the alternate target file + with (lex_path1 / "unmatched_trg_words.txt").open("w", encoding="utf8") as output_file: + output_file.writelines(f'trg.txt words not found in {trg_file2}\n') + for word in trg1_only_words: + output_file.writelines(word+'\n') + with (lex_path2 / "unmatched_trg_words.txt").open("w", encoding="utf8") as output_file: + output_file.writelines(f'trg.txt words not found in {trg_file1}\n') + for word in trg2_only_words: + output_file.writelines(word+'\n') + + # Write the lex coverage stats + with (lex_path1 / "lex_coverage.txt").open("a", encoding="utf8") as output_file: + if args.src == True: + output_file.writelines(f'Unique words in src.txt: {len(unique_src_words1)}\n') + output_file.writelines( + f'Words also found in {src_words_file2}: {len(unique_src_words1)-len(src1_only_words)}\n') + output_file.writelines(f'Words missing from {src_words_file2}: {len(src1_only_words)}\n') + if args.trg == True: + output_file.writelines(f'Unique words in trg.txt: {len(unique_trg_words1)}\n') + output_file.writelines( + f'Words also found in {trg_words_file2}: {len(unique_trg_words1)-len(trg1_only_words)}\n') + output_file.writelines(f'Words missing from {trg_words_file2}: {len(trg1_only_words)}\n') + + with (lex_path2 / "lex_coverage.txt").open("a", encoding="utf8") as output_file: + if args.src == True: + output_file.writelines(f'Unique words in src.txt: {len(unique_src_words2)}\n') + output_file.writelines( + f'Words also found in {src_words_file1}: {len(unique_src_words2)-len(src2_only_words)}\n') + output_file.writelines(f'Words missing from {src_words_file1}: {len(src2_only_words)}\n') + if args.trg == True: + output_file.writelines(f'Unique words in trg.txt: {len(unique_trg_words2)}\n') + output_file.writelines( + f'Words also found in {trg_words_file1}: {len(unique_trg_words2)-len(trg2_only_words)}\n') + output_file.writelines(f'Words missing from {trg_words_file1}: {len(trg2_only_words)}\n') + + # Output stats if requested + if args.stats == True: + if args.src == True: + print(f'Unique words in src.txt: {len(unique_src_words1)}') + print(f'Words also found in {src_words_file2}: {len(unique_src_words1)-len(src1_only_words)}') + print(f'Words missing from {src_words_file2}: {len(src1_only_words)}') + if args.trg == True: + print(f'Unique words in {trg_words_file1}: {len(unique_trg_words1)}') + print(f'Words also found in {trg_words_file2}: {len(unique_trg_words1)-len(trg1_only_words)}') + print(f'Words missing from {trg_words_file2}: {len(trg1_only_words)}') + + +if __name__ == "__main__": + main() diff --git a/silnlp/common/find_words_Catapult.py b/silnlp/common/find_words_Catapult.py new file mode 100644 index 00000000..d6de1782 --- /dev/null +++ b/silnlp/common/find_words_Catapult.py @@ -0,0 +1,71 @@ +import argparse +from collections import Counter +import csv +import unicodedata + +from ..common.environment import SIL_NLP_ENV + +# Normalize combined characters for Devanagari (Ref: https://docs.python.org/3/howto/unicode.html#comparing-strings) +def NFD(s): + return unicodedata.normalize('NFD', s) + +def main(): + parser = argparse.ArgumentParser(description="Counts lexicon entries") + parser.add_argument("experiment", help="Experiment folder from 
path S:\\Alignment\\experiments\\") + parser.add_argument("--word_list", help="File containing words to find", default="unmatched_src_words.txt") + args = parser.parse_args() + + # Set up path and files + lex_path = SIL_NLP_ENV.align_experiments_dir / args.experiment + word_filename = SIL_NLP_ENV.align_experiments_dir / args.experiment / args.word_list + vref_filename = SIL_NLP_ENV.align_experiments_dir / args.experiment/ "refs.txt" + + # Get count of each word in the file + with (lex_path / "src_words.txt").open("r", encoding="utf8") as src_wd_file: + src_word_counts = [] + for entry in src_wd_file: + entry = list(entry.split('\t')) + if len(entry) > 1: + entry[1] = int(entry[1].strip()) + src_word_counts.append(entry) + else: + print("Error: word counts are missing. Please run count_words.py with the --count flag set.") + return 1 + + # Extract list of words + src_word_dict = dict(list(src_word_counts)) + with(word_filename).open("r", encoding = "utf8") as word_file: + words = [] + for word in word_file: + words.append(word.rstrip('\n')) + # Check for words and word count in each verse; write to output file. + with (lex_path / "src.txt").open("r", encoding = "utf8") as src_data_file: + with(vref_filename).open("r", encoding = "utf8") as ref_file: + word_list = list(enumerate(words)) + result = [] + seen_words = [] + for verse in zip(ref_file, src_data_file): + word_text = [] + word_num = [] + word_count = 0 + for word in word_list: + #if NFD(NFD(word[1])) in NFD(NFD(verse[1])): + #if word[1] in verse[1]: # (to find all instances; not just first) + if word[1] in verse[1] and word[1] not in seen_words: + for entry in src_word_counts: + if entry[0] == word[1]: + word_count += entry[1] + seen_words.append(word[1]) + word_text.append(word[1]) + word_num.append(src_word_dict[word[1]]) + result.append([verse[0].rstrip('\n'), word_count, word_num, word_text]) + with (lex_path / "unmatched_word_verses.txt").open("w", encoding = "utf8") as output_file: + writer = csv.writer(output_file, lineterminator="\n") + writer.writerow(['Reference','Novelty Score','Word Counts','Words']) + for line in result: + writer.writerow([line[0], line[1], line[2], *line[3]]) + #print(result) + + +if __name__ == '__main__': + main() From 523d1d869af4ec3d772aeead889732288634b83a Mon Sep 17 00:00:00 2001 From: Bethany Moore Date: Tue, 20 May 2025 09:55:54 -0400 Subject: [PATCH 5/9] Revert to naive word separator --- silnlp/common/compare_lexCR.py | 44 +++++++++++++++++----------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/silnlp/common/compare_lexCR.py b/silnlp/common/compare_lexCR.py index 1bbdaca5..b4ab1d6f 100644 --- a/silnlp/common/compare_lexCR.py +++ b/silnlp/common/compare_lexCR.py @@ -8,33 +8,33 @@ from machine.tokenization import LatinWordTokenizer # Latin Tokenizer from machine library -def get_all_words(src_file: str) -> List: - words = [] - tokenizer = LatinWordTokenizer() - with open(src_file, "r", encoding = "utf8") as src_data_file: - for line in src_data_file: - line_words = tokenizer.tokenize(line) - for word in line_words: - word = word.strip().strip("\'\"\\;,:.!?()-[]").lower() - if word != "" and not word.isnumeric(): - words.append(word) - return words - -# Naive whitespace-based script-agnostic word splitter #def get_all_words(src_file: str) -> List: # words = [] -# pattern = re.compile(r",(?=\S)") # Look for commas with no following space -# with open(src_file, "r", encoding = "utf8") as src_data_file: +# tokenizer = LatinWordTokenizer() +# with open(src_file, "r", 
encoding = "utf8") as src_data_file: # for line in src_data_file: -# for word in line.split(" "): -# word = word.strip().strip("\'\"\\;,:.!?()-[]0123456789").lower() -# finder = pattern.search(word) -# if finder: # Add space after commas as needed -# word = word[:finder.span()[1]]+" "+word[finder.span()[1]:] -# if word != "": -# words.append(word) +# line_words = tokenizer.tokenize(line) +# for word in line_words: +# word = word.strip().strip("\'\"\\;,:.!?()-[]").lower() +# if word != "" and not word.isnumeric(): +# words.append(word) # return words +# Naive whitespace-based script-agnostic word splitter +def get_all_words(src_file: str) -> List: + words = [] + pattern = re.compile(r",(?=\S)") # Look for commas with no following space + with open(src_file, "r", encoding = "utf8") as src_data_file: + for line in src_data_file: + for word in line.split(" "): + word = word.strip().strip("\'\"\\;,:.!?()-[]0123456789").lower() + finder = pattern.search(word) + if finder: # Add space after commas as needed + word = word[:finder.span()[1]]+" "+word[finder.span()[1]:] + if word != "": + words.append(word) + return words + def find_unique(words1: List, words2: List) -> List: unique_words = [] for word in words1: From 70cd9440ac264e26f170e928bf62a1f30ddd4b7b Mon Sep 17 00:00:00 2001 From: Bethany Moore Date: Tue, 20 May 2025 12:42:29 -0400 Subject: [PATCH 6/9] Revert compare_lex.py to naive word splitter. --- silnlp/common/compare_lex.py | 42 ++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/silnlp/common/compare_lex.py b/silnlp/common/compare_lex.py index da387543..f595e5e3 100644 --- a/silnlp/common/compare_lex.py +++ b/silnlp/common/compare_lex.py @@ -7,33 +7,33 @@ from machine.tokenization import LatinWordTokenizer # Latin Tokenizer from machine library -def get_all_words(src_file: str) -> List: - words = [] - tokenizer = LatinWordTokenizer() - with open(src_file, "r", encoding = "utf8") as src_data_file: - for line in src_data_file: - line_words = tokenizer.tokenize(line) - for word in line_words: - word = word.strip().strip("\'\"\\;,:.!?()-[]0123456789").lower() - if word != "" and not word.isnumeric(): - words.append(word) - return words - -# Naive whitespace-based script-agnostic word splitter #def get_all_words(src_file: str) -> List: # words = [] -# pattern = re.compile(r",(?=\S)") # Look for commas with no following space -# with open(src_file, "r", encoding = "utf8") as src_data_file: +# tokenizer = LatinWordTokenizer() +# with open(src_file, "r", encoding = "utf8") as src_data_file: # for line in src_data_file: -# for word in line.split(" "): +# line_words = tokenizer.tokenize(line) +# for word in line_words: # word = word.strip().strip("\'\"\\;,:.!?()-[]0123456789").lower() -# finder = pattern.search(word) -# if finder: # Add space after commas as needed -# word = word[:finder.span()[1]]+" "+word[finder.span()[1]:] -# if word != "": -# words.append(word) +# if word != "" and not word.isnumeric(): +# words.append(word) # return words +# Naive whitespace-based script-agnostic word splitter +def get_all_words(src_file: str) -> List: + words = [] + pattern = re.compile(r",(?=\S)") # Look for commas with no following space + with open(src_file, "r", encoding = "utf8") as src_data_file: + for line in src_data_file: + for word in line.split(" "): + word = word.strip().strip("\'\"\\;,:.!?()-[]0123456789").lower() + finder = pattern.search(word) + if finder: # Add space after commas as needed + word = word[:finder.span()[1]]+" 
"+word[finder.span()[1]:] + if word != "": + words.append(word) + return words + def find_unique(words1: List, words2: List) -> List: unique_words = [] for word in words1: From 0cce13edd5f7a9618b3b4af4b87edb1a0d9ca077 Mon Sep 17 00:00:00 2001 From: Bethany Moore Date: Fri, 23 May 2025 13:33:30 -0400 Subject: [PATCH 7/9] Add word counts to unmatched_src_words.txt output --- silnlp/common/compare_lexCR.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/silnlp/common/compare_lexCR.py b/silnlp/common/compare_lexCR.py index b4ab1d6f..fb14c90e 100644 --- a/silnlp/common/compare_lexCR.py +++ b/silnlp/common/compare_lexCR.py @@ -97,14 +97,27 @@ def main() -> None: output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n') # Write source words missing from the alternate source file + #with (lex_path1 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file: + # output_file.writelines(f'src.txt words not found in {src_file2}\n') + # for word in src1_only_words: + # output_file.writelines(word+'\n') + #with (lex_path2 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file: + # output_file.writelines(f'src.txt words not found in {src_file1}\n') + # for word in src2_only_words: + # output_file.writelines(word+'\n') + + + # Rewrite of above section to include counts in the output file: with (lex_path1 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file: output_file.writelines(f'src.txt words not found in {src_file2}\n') - for word in src1_only_words: - output_file.writelines(word+'\n') + for entry in src1_word_counter: + if entry[0] in src1_only_words: + output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n') with (lex_path2 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file: output_file.writelines(f'src.txt words not found in {src_file1}\n') - for word in src2_only_words: - output_file.writelines(word+'\n') + for entry in src2_word_counter: + if entry[0] in src2_only_words: + output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n') # Compare target words and write results to files if args.trg == True: From 5c408db3bf93919da0cbc9d0164db393114985bc Mon Sep 17 00:00:00 2001 From: Bethany Moore Date: Wed, 11 Jun 2025 14:06:46 -0400 Subject: [PATCH 8/9] Update lex_tools with word counts for Catapult Reloaded --- silnlp/common/compare_lex.py | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/silnlp/common/compare_lex.py b/silnlp/common/compare_lex.py index f595e5e3..fb14c90e 100644 --- a/silnlp/common/compare_lex.py +++ b/silnlp/common/compare_lex.py @@ -1,4 +1,5 @@ import argparse +from collections import Counter import numpy import re from typing import List @@ -14,7 +15,7 @@ # for line in src_data_file: # line_words = tokenizer.tokenize(line) # for word in line_words: -# word = word.strip().strip("\'\"\\;,:.!?()-[]0123456789").lower() +# word = word.strip().strip("\'\"\\;,:.!?()-[]").lower() # if word != "" and not word.isnumeric(): # words.append(word) # return words @@ -74,6 +75,8 @@ def main() -> None: unique_src_words2 = numpy.unique(numpy.array(src_words2)) src1_only_words = find_unique(unique_src_words1,unique_src_words2) src2_only_words = find_unique(unique_src_words2,unique_src_words1) + src1_word_counter = Counter(src_words1).most_common() + src2_word_counter = Counter(src_words2).most_common() # Write unique source words to files src_words_file1 = lex_path1 / "src_words.txt" @@ -85,15 +88,36 @@ def main() -> None: for 
word in unique_src_words2: output_file.writelines(word+'\n') + # Re-write src_words files with counts + with (lex_path1 / "src_words.txt").open("w", encoding = "utf8") as output_file: + for entry in src1_word_counter: + output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n') + with (lex_path2 / "src_words.txt").open("w", encoding = "utf8") as output_file: + for entry in src2_word_counter: + output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n') + # Write source words missing from the alternate source file + #with (lex_path1 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file: + # output_file.writelines(f'src.txt words not found in {src_file2}\n') + # for word in src1_only_words: + # output_file.writelines(word+'\n') + #with (lex_path2 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file: + # output_file.writelines(f'src.txt words not found in {src_file1}\n') + # for word in src2_only_words: + # output_file.writelines(word+'\n') + + + # Rewrite of above section to include counts in the output file: with (lex_path1 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file: output_file.writelines(f'src.txt words not found in {src_file2}\n') - for word in src1_only_words: - output_file.writelines(word+'\n') + for entry in src1_word_counter: + if entry[0] in src1_only_words: + output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n') with (lex_path2 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file: output_file.writelines(f'src.txt words not found in {src_file1}\n') - for word in src2_only_words: - output_file.writelines(word+'\n') + for entry in src2_word_counter: + if entry[0] in src2_only_words: + output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n') # Compare target words and write results to files if args.trg == True: From d68f27a9d51ddfe76eea8f4a470561930b0ea3e2 Mon Sep 17 00:00:00 2001 From: Bethany Moore Date: Wed, 11 Jun 2025 14:28:00 -0400 Subject: [PATCH 9/9] Clean up duplicate versions --- silnlp/common/compare_lexCR.py | 193 -------------------------- silnlp/common/compare_lex_Catapult.py | 185 ------------------------ silnlp/common/find_words.py | 1 + silnlp/common/find_words_Catapult.py | 71 ---------- 4 files changed, 1 insertion(+), 449 deletions(-) delete mode 100644 silnlp/common/compare_lexCR.py delete mode 100644 silnlp/common/compare_lex_Catapult.py delete mode 100644 silnlp/common/find_words_Catapult.py diff --git a/silnlp/common/compare_lexCR.py b/silnlp/common/compare_lexCR.py deleted file mode 100644 index fb14c90e..00000000 --- a/silnlp/common/compare_lexCR.py +++ /dev/null @@ -1,193 +0,0 @@ -import argparse -from collections import Counter -import numpy -import re -from typing import List - -from ..common.environment import SIL_NLP_ENV -from machine.tokenization import LatinWordTokenizer - -# Latin Tokenizer from machine library -#def get_all_words(src_file: str) -> List: -# words = [] -# tokenizer = LatinWordTokenizer() -# with open(src_file, "r", encoding = "utf8") as src_data_file: -# for line in src_data_file: -# line_words = tokenizer.tokenize(line) -# for word in line_words: -# word = word.strip().strip("\'\"\\;,:.!?()-[]").lower() -# if word != "" and not word.isnumeric(): -# words.append(word) -# return words - -# Naive whitespace-based script-agnostic word splitter -def get_all_words(src_file: str) -> List: - words = [] - pattern = re.compile(r",(?=\S)") # Look for commas with no following space - with open(src_file, "r", encoding = "utf8") as src_data_file: - 
for line in src_data_file: - for word in line.split(" "): - word = word.strip().strip("\'\"\\;,:.!?()-[]0123456789").lower() - finder = pattern.search(word) - if finder: # Add space after commas as needed - word = word[:finder.span()[1]]+" "+word[finder.span()[1]:] - if word != "": - words.append(word) - return words - -def find_unique(words1: List, words2: List) -> List: - unique_words = [] - for word in words1: - if word not in words2: - unique_words.append(word) - return unique_words - - -def main() -> None: - parser = argparse.ArgumentParser(description="Compares unique words in two corpora") - parser.add_argument("exp1", help="First experiment folder from path S:\\Alignment\\experiments\\") - parser.add_argument("exp2", help="Second experiment folder from path S:\\Alignment\\experiments\\") - parser.add_argument("--stats", help="True or False: Output word count and number of renderings for common words", - action='store_true') - parser.add_argument("--src", help="If set, only the source side of the two experiment lexicons is compared", - action='store_true') - parser.add_argument("--trg", help="If set, only the target side of the two experiment lexicons is compared", - action='store_true') - args = parser.parse_args() - - # If not explicitly limited, compare both source and target lexicons - if args.src == False and args.trg == False: - args.src = True - args.trg = True - - lex_path1 = SIL_NLP_ENV.align_experiments_dir / args.exp1 - lex_path2 = SIL_NLP_ENV.align_experiments_dir / args.exp2 - - # Compare source words and write results to files - if args.src == True: - src_file1 = lex_path1 / "src.txt" - src_file2 = lex_path2 / "src.txt" - - # Find all words and unique words on source side - src_words1 = get_all_words(src_file1) - unique_src_words1 = numpy.unique(numpy.array(src_words1)) - src_words2 = get_all_words(src_file2) - unique_src_words2 = numpy.unique(numpy.array(src_words2)) - src1_only_words = find_unique(unique_src_words1,unique_src_words2) - src2_only_words = find_unique(unique_src_words2,unique_src_words1) - src1_word_counter = Counter(src_words1).most_common() - src2_word_counter = Counter(src_words2).most_common() - - # Write unique source words to files - src_words_file1 = lex_path1 / "src_words.txt" - src_words_file2 = lex_path2 / "src_words.txt" - with open(src_words_file1, "w", encoding="utf8") as output_file: - for word in unique_src_words1: - output_file.writelines(word+'\n') - with open(src_words_file2, "w", encoding="utf8") as output_file: - for word in unique_src_words2: - output_file.writelines(word+'\n') - - # Re-write src_words files with counts - with (lex_path1 / "src_words.txt").open("w", encoding = "utf8") as output_file: - for entry in src1_word_counter: - output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n') - with (lex_path2 / "src_words.txt").open("w", encoding = "utf8") as output_file: - for entry in src2_word_counter: - output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n') - - # Write source words missing from the alternate source file - #with (lex_path1 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file: - # output_file.writelines(f'src.txt words not found in {src_file2}\n') - # for word in src1_only_words: - # output_file.writelines(word+'\n') - #with (lex_path2 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file: - # output_file.writelines(f'src.txt words not found in {src_file1}\n') - # for word in src2_only_words: - # output_file.writelines(word+'\n') - - - # Rewrite of above section to 
include counts in the output file: - with (lex_path1 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file: - output_file.writelines(f'src.txt words not found in {src_file2}\n') - for entry in src1_word_counter: - if entry[0] in src1_only_words: - output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n') - with (lex_path2 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file: - output_file.writelines(f'src.txt words not found in {src_file1}\n') - for entry in src2_word_counter: - if entry[0] in src2_only_words: - output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n') - - # Compare target words and write results to files - if args.trg == True: - trg_file1 = lex_path1 / "trg.txt" - trg_file2 = lex_path2 / "trg.txt" - - # Find all words and unique words on target side - trg_words1 = get_all_words(trg_file1) - unique_trg_words1 = numpy.unique(numpy.array(trg_words1)) - trg_words2 = get_all_words(trg_file2) - unique_trg_words2 = numpy.unique(numpy.array(trg_words2)) - trg1_only_words = find_unique(unique_trg_words1,unique_trg_words2) - trg2_only_words = find_unique(unique_trg_words2,unique_trg_words1) - - # Write unique target words to files - trg_words_file1 = lex_path1 / "trg_words.txt" - trg_words_file2 = lex_path2 / "trg_words.txt" - with open(trg_words_file1, "w", encoding="utf8") as output_file: - for word in unique_trg_words1: - output_file.writelines(word+'\n') - with open(trg_words_file2, "w", encoding="utf8") as output_file: - for word in unique_trg_words2: - output_file.writelines(word+'\n') - - # Write target words missing from the alternate target file - with (lex_path1 / "unmatched_trg_words.txt").open("w", encoding="utf8") as output_file: - output_file.writelines(f'trg.txt words not found in {trg_file2}\n') - for word in trg1_only_words: - output_file.writelines(word+'\n') - with (lex_path2 / "unmatched_trg_words.txt").open("w", encoding="utf8") as output_file: - output_file.writelines(f'trg.txt words not found in {trg_file1}\n') - for word in trg2_only_words: - output_file.writelines(word+'\n') - - # Write the lex coverage stats - with (lex_path1 / "lex_coverage.txt").open("a", encoding="utf8") as output_file: - if args.src == True: - output_file.writelines(f'Unique words in src.txt: {len(unique_src_words1)}\n') - output_file.writelines( - f'Words also found in {src_words_file2}: {len(unique_src_words1)-len(src1_only_words)}\n') - output_file.writelines(f'Words missing from {src_words_file2}: {len(src1_only_words)}\n') - if args.trg == True: - output_file.writelines(f'Unique words in trg.txt: {len(unique_trg_words1)}\n') - output_file.writelines( - f'Words also found in {trg_words_file2}: {len(unique_trg_words1)-len(trg1_only_words)}\n') - output_file.writelines(f'Words missing from {trg_words_file2}: {len(trg1_only_words)}\n') - - with (lex_path2 / "lex_coverage.txt").open("a", encoding="utf8") as output_file: - if args.src == True: - output_file.writelines(f'Unique words in src.txt: {len(unique_src_words2)}\n') - output_file.writelines( - f'Words also found in {src_words_file1}: {len(unique_src_words2)-len(src2_only_words)}\n') - output_file.writelines(f'Words missing from {src_words_file1}: {len(src2_only_words)}\n') - if args.trg == True: - output_file.writelines(f'Unique words in trg.txt: {len(unique_trg_words2)}\n') - output_file.writelines( - f'Words also found in {trg_words_file1}: {len(unique_trg_words2)-len(trg2_only_words)}\n') - output_file.writelines(f'Words missing from {trg_words_file1}: {len(trg2_only_words)}\n') - - 
# Output stats if requested - if args.stats == True: - if args.src == True: - print(f'Unique words in src.txt: {len(unique_src_words1)}') - print(f'Words also found in {src_words_file2}: {len(unique_src_words1)-len(src1_only_words)}') - print(f'Words missing from {src_words_file2}: {len(src1_only_words)}') - if args.trg == True: - print(f'Unique words in {trg_words_file1}: {len(unique_trg_words1)}') - print(f'Words also found in {trg_words_file2}: {len(unique_trg_words1)-len(trg1_only_words)}') - print(f'Words missing from {trg_words_file2}: {len(trg1_only_words)}') - - -if __name__ == "__main__": - main() diff --git a/silnlp/common/compare_lex_Catapult.py b/silnlp/common/compare_lex_Catapult.py deleted file mode 100644 index 9f63bf69..00000000 --- a/silnlp/common/compare_lex_Catapult.py +++ /dev/null @@ -1,185 +0,0 @@ -import argparse -from collections import Counter -import numpy -import re -from typing import List - -from ..common.environment import SIL_NLP_ENV -from machine.tokenization import LatinWordTokenizer - -# Latin Tokenizer from machine library -def get_all_words(src_file: str) -> List: - words = [] - tokenizer = LatinWordTokenizer() - with open(src_file, "r", encoding = "utf8") as src_data_file: - for line in src_data_file: - line_words = tokenizer.tokenize(line) - for word in line_words: - word = word.strip().strip("\'\"\\;,:.!?()-[]").lower() - if word != "" and not word.isnumeric(): - words.append(word) - return words - -# Naive whitespace-based script-agnostic word splitter -#def get_all_words(src_file: str) -> List: -# words = [] -# pattern = re.compile(r",(?=\S)") # Look for commas with no following space -# with open(src_file, "r", encoding = "utf8") as src_data_file: -# for line in src_data_file: -# for word in line.split(" "): -# word = word.strip().strip("\'\"\\;,:.!?()-[]0123456789").lower() -# finder = pattern.search(word) -# if finder: # Add space after commas as needed -# word = word[:finder.span()[1]]+" "+word[finder.span()[1]:] -# if word != "": -# words.append(word) -# return words - -def find_unique(words1: List, words2: List) -> List: - unique_words = [] - for word in words1: - if word not in words2: - unique_words.append(word) - return unique_words - - -def main() -> None: - parser = argparse.ArgumentParser(description="Compares unique words in two corpora") - parser.add_argument("exp1", help="First experiment folder from path S:\\Alignment\\experiments\\") - parser.add_argument("exp2", help="Second experiment folder from path S:\\Alignment\\experiments\\") - parser.add_argument("--stats", help="True or False: Output word count and number of renderings for common words", - action='store_true') - parser.add_argument("--src", help="If set, only the source side of the two experiment lexicons is compared", - action='store_true') - parser.add_argument("--trg", help="If set, only the target side of the two experiment lexicons is compared", - action='store_true') - args = parser.parse_args() - - # If not explicitly limited, compare both source and target lexicons - if args.src == False and args.trg == False: - args.src = True - args.trg = True - - lex_path1 = SIL_NLP_ENV.align_experiments_dir / args.exp1 - lex_path2 = SIL_NLP_ENV.align_experiments_dir / args.exp2 - - # Compare source words and write results to files - if args.src == True: - src_file1 = lex_path1 / "src.txt" - src_file2 = lex_path2 / "src.txt" - - # Find all words and unique words on source side - src_words1 = get_all_words(src_file1) - unique_src_words1 = numpy.unique(numpy.array(src_words1)) - 
src_words2 = get_all_words(src_file2) - unique_src_words2 = numpy.unique(numpy.array(src_words2)) - src1_only_words = find_unique(unique_src_words1,unique_src_words2) - src2_only_words = find_unique(unique_src_words2,unique_src_words1) - src1_word_counter = Counter(src_words1).most_common() - src2_word_counter = Counter(src_words2).most_common() - - # Write unique source words to files - src_words_file1 = lex_path1 / "src_words.txt" - src_words_file2 = lex_path2 / "src_words.txt" - with open(src_words_file1, "w", encoding="utf8") as output_file: - for word in unique_src_words1: - output_file.writelines(word+'\n') - with open(src_words_file2, "w", encoding="utf8") as output_file: - for word in unique_src_words2: - output_file.writelines(word+'\n') - - # Re-write src_words files with counts - with (lex_path1 / "src_words.txt").open("w", encoding = "utf8") as output_file: - for entry in src1_word_counter: - output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n') - with (lex_path2 / "src_words.txt").open("w", encoding = "utf8") as output_file: - for entry in src2_word_counter: - output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n') - # Output src1 words missing from src2, with their counts - with (lex_path1 / "missing_word_counts.txt").open("w", encoding = "utf8") as output_file: - for entry in src1_word_counter: - if entry[0] in src1_only_words: - output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n') - - # Write source words missing from the alternate source file - with (lex_path1 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file: - output_file.writelines(f'src.txt words not found in {src_file2}\n') - for word in src1_only_words: - output_file.writelines(word+'\n') - with (lex_path2 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file: - output_file.writelines(f'src.txt words not found in {src_file1}\n') - for word in src2_only_words: - output_file.writelines(word+'\n') - - # Compare target words and write results to files - if args.trg == True: - trg_file1 = lex_path1 / "trg.txt" - trg_file2 = lex_path2 / "trg.txt" - - # Find all words and unique words on target side - trg_words1 = get_all_words(trg_file1) - unique_trg_words1 = numpy.unique(numpy.array(trg_words1)) - trg_words2 = get_all_words(trg_file2) - unique_trg_words2 = numpy.unique(numpy.array(trg_words2)) - trg1_only_words = find_unique(unique_trg_words1,unique_trg_words2) - trg2_only_words = find_unique(unique_trg_words2,unique_trg_words1) - - # Write unique target words to files - trg_words_file1 = lex_path1 / "trg_words.txt" - trg_words_file2 = lex_path2 / "trg_words.txt" - with open(trg_words_file1, "w", encoding="utf8") as output_file: - for word in unique_trg_words1: - output_file.writelines(word+'\n') - with open(trg_words_file2, "w", encoding="utf8") as output_file: - for word in unique_trg_words2: - output_file.writelines(word+'\n') - - # Write target words missing from the alternate target file - with (lex_path1 / "unmatched_trg_words.txt").open("w", encoding="utf8") as output_file: - output_file.writelines(f'trg.txt words not found in {trg_file2}\n') - for word in trg1_only_words: - output_file.writelines(word+'\n') - with (lex_path2 / "unmatched_trg_words.txt").open("w", encoding="utf8") as output_file: - output_file.writelines(f'trg.txt words not found in {trg_file1}\n') - for word in trg2_only_words: - output_file.writelines(word+'\n') - - # Write the lex coverage stats - with (lex_path1 / "lex_coverage.txt").open("a", encoding="utf8") as output_file: - if 
args.src == True: - output_file.writelines(f'Unique words in src.txt: {len(unique_src_words1)}\n') - output_file.writelines( - f'Words also found in {src_words_file2}: {len(unique_src_words1)-len(src1_only_words)}\n') - output_file.writelines(f'Words missing from {src_words_file2}: {len(src1_only_words)}\n') - if args.trg == True: - output_file.writelines(f'Unique words in trg.txt: {len(unique_trg_words1)}\n') - output_file.writelines( - f'Words also found in {trg_words_file2}: {len(unique_trg_words1)-len(trg1_only_words)}\n') - output_file.writelines(f'Words missing from {trg_words_file2}: {len(trg1_only_words)}\n') - - with (lex_path2 / "lex_coverage.txt").open("a", encoding="utf8") as output_file: - if args.src == True: - output_file.writelines(f'Unique words in src.txt: {len(unique_src_words2)}\n') - output_file.writelines( - f'Words also found in {src_words_file1}: {len(unique_src_words2)-len(src2_only_words)}\n') - output_file.writelines(f'Words missing from {src_words_file1}: {len(src2_only_words)}\n') - if args.trg == True: - output_file.writelines(f'Unique words in trg.txt: {len(unique_trg_words2)}\n') - output_file.writelines( - f'Words also found in {trg_words_file1}: {len(unique_trg_words2)-len(trg2_only_words)}\n') - output_file.writelines(f'Words missing from {trg_words_file1}: {len(trg2_only_words)}\n') - - # Output stats if requested - if args.stats == True: - if args.src == True: - print(f'Unique words in src.txt: {len(unique_src_words1)}') - print(f'Words also found in {src_words_file2}: {len(unique_src_words1)-len(src1_only_words)}') - print(f'Words missing from {src_words_file2}: {len(src1_only_words)}') - if args.trg == True: - print(f'Unique words in {trg_words_file1}: {len(unique_trg_words1)}') - print(f'Words also found in {trg_words_file2}: {len(unique_trg_words1)-len(trg1_only_words)}') - print(f'Words missing from {trg_words_file2}: {len(trg1_only_words)}') - - -if __name__ == "__main__": - main() diff --git a/silnlp/common/find_words.py b/silnlp/common/find_words.py index a5cd1a6b..d6de1782 100644 --- a/silnlp/common/find_words.py +++ b/silnlp/common/find_words.py @@ -50,6 +50,7 @@ def main(): word_count = 0 for word in word_list: #if NFD(NFD(word[1])) in NFD(NFD(verse[1])): + #if word[1] in verse[1]: # (to find all instances; not just first) if word[1] in verse[1] and word[1] not in seen_words: for entry in src_word_counts: if entry[0] == word[1]: diff --git a/silnlp/common/find_words_Catapult.py b/silnlp/common/find_words_Catapult.py deleted file mode 100644 index d6de1782..00000000 --- a/silnlp/common/find_words_Catapult.py +++ /dev/null @@ -1,71 +0,0 @@ -import argparse -from collections import Counter -import csv -import unicodedata - -from ..common.environment import SIL_NLP_ENV - -# Normalize combined characters for Devanagari (Ref: https://docs.python.org/3/howto/unicode.html#comparing-strings) -def NFD(s): - return unicodedata.normalize('NFD', s) - -def main(): - parser = argparse.ArgumentParser(description="Counts lexicon entries") - parser.add_argument("experiment", help="Experiment folder from path S:\\Alignment\\experiments\\") - parser.add_argument("--word_list", help="File containing words to find", default="unmatched_src_words.txt") - args = parser.parse_args() - - # Set up path and files - lex_path = SIL_NLP_ENV.align_experiments_dir / args.experiment - word_filename = SIL_NLP_ENV.align_experiments_dir / args.experiment / args.word_list - vref_filename = SIL_NLP_ENV.align_experiments_dir / args.experiment/ "refs.txt" - - # Get count of each word 
in the file - with (lex_path / "src_words.txt").open("r", encoding="utf8") as src_wd_file: - src_word_counts = [] - for entry in src_wd_file: - entry = list(entry.split('\t')) - if len(entry) > 1: - entry[1] = int(entry[1].strip()) - src_word_counts.append(entry) - else: - print("Error: word counts are missing. Please run count_words.py with the --count flag set.") - return 1 - - # Extract list of words - src_word_dict = dict(list(src_word_counts)) - with(word_filename).open("r", encoding = "utf8") as word_file: - words = [] - for word in word_file: - words.append(word.rstrip('\n')) - # Check for words and word count in each verse; write to output file. - with (lex_path / "src.txt").open("r", encoding = "utf8") as src_data_file: - with(vref_filename).open("r", encoding = "utf8") as ref_file: - word_list = list(enumerate(words)) - result = [] - seen_words = [] - for verse in zip(ref_file, src_data_file): - word_text = [] - word_num = [] - word_count = 0 - for word in word_list: - #if NFD(NFD(word[1])) in NFD(NFD(verse[1])): - #if word[1] in verse[1]: # (to find all instances; not just first) - if word[1] in verse[1] and word[1] not in seen_words: - for entry in src_word_counts: - if entry[0] == word[1]: - word_count += entry[1] - seen_words.append(word[1]) - word_text.append(word[1]) - word_num.append(src_word_dict[word[1]]) - result.append([verse[0].rstrip('\n'), word_count, word_num, word_text]) - with (lex_path / "unmatched_word_verses.txt").open("w", encoding = "utf8") as output_file: - writer = csv.writer(output_file, lineterminator="\n") - writer.writerow(['Reference','Novelty Score','Word Counts','Words']) - for line in result: - writer.writerow([line[0], line[1], line[2], *line[3]]) - #print(result) - - -if __name__ == '__main__': - main()
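
A note on how these scripts fit together: count_words.py --count (and the later revisions of compare_lex.py) write src_words.txt as tab-separated word/count pairs, and find_words.py reads that file back to give each verse a "Novelty Score", the summed counts of the listed unmatched words it contains. The sketch below illustrates that round trip in isolation; the file names and sample data are illustrative stand-ins, since the real scripts resolve their paths under SIL_NLP_ENV.align_experiments_dir, and the real find_words.py only credits a word to the first verse in which it appears.

    from collections import Counter
    from pathlib import Path

    # Illustrative stand-ins for the experiment data.
    src_lines = ["in the beginning", "the word was with god"]
    unmatched_words = ["beginning", "god"]

    # Step 1 (count_words.py --count): write word<TAB>count pairs, most common first.
    counts = Counter(word for line in src_lines for word in line.split())
    Path("src_words.txt").write_text(
        "".join(f"{word}\t{n}\n" for word, n in counts.most_common()), encoding="utf8"
    )

    # Step 2 (find_words.py): read the counts back into a dict ...
    word_counts = {}
    for entry in Path("src_words.txt").read_text(encoding="utf8").splitlines():
        word, n = entry.split("\t")
        word_counts[word] = int(n)

    # ... and score each verse by the counts of the listed words it contains.
    for ref, verse in enumerate(src_lines, start=1):
        score = sum(word_counts[w] for w in unmatched_words if w in verse)
        print(ref, score)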