From fb074bff7b850434ac63c1ec16156aa57cde2d0d Mon Sep 17 00:00:00 2001 From: Bethany Moore Date: Wed, 19 Feb 2025 12:55:15 -0500 Subject: [PATCH 1/9] Add LatinWordTokenizer from machine --- silnlp/common/compare_lex.py | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/silnlp/common/compare_lex.py b/silnlp/common/compare_lex.py index ffa2f8a9..84629b7a 100644 --- a/silnlp/common/compare_lex.py +++ b/silnlp/common/compare_lex.py @@ -4,20 +4,34 @@ from typing import List from ..common.environment import SIL_NLP_ENV +from machine.tokenization import LatinWordTokenizer +# Latin Tokenizer from machine library def get_all_words(src_file: str) -> List: words = [] - pattern = re.compile(r",(?=\S)") # Look for commas with no following space - with open(src_file, "r", encoding = "utf8") as src_data_file: + tokenizer = LatinWordTokenizer() + with open(src_file, "r", encoding = "utf8") as src_data_file: for line in src_data_file: - for word in line.split(" "): - word = word.strip().strip("\'\"\\;,:.!?()-[]0123456789").lower() - finder = pattern.search(word) - if finder: # Add space after commas as needed - word = word[:finder.span()[1]]+" "+word[finder.span()[1]:] + line_words = tokenizer.tokenize(line) + for word in line_words: if word != "": - words.append(word) - return words + words.append(word) + return(words) + +# Naive whitespace-based script-agnostic word splitter +#def get_all_words(src_file: str) -> List: +# words = [] +# pattern = re.compile(r",(?=\S)") # Look for commas with no following space +# with open(src_file, "r", encoding = "utf8") as src_data_file: +# for line in src_data_file: +# for word in line.split(" "): +# word = word.strip().strip("\'\"\\;,:.!?()-[]0123456789").lower() +# finder = pattern.search(word) +# if finder: # Add space after commas as needed +# word = word[:finder.span()[1]]+" "+word[finder.span()[1]:] +# if word != "": +# words.append(word) +# return words def find_unique(words1: List, words2: List) -> List: unique_words = [] @@ -37,6 +51,7 @@ def main() -> None: action='store_true') parser.add_argument("--trg", help="If set, only the target side of the two experiment lexicons is compared", action='store_true') + parser.add_argument("--books", help="Books to include from src and trg.") args = parser.parse_args() # If not explicitly limited, compare both source and target lexicons @@ -109,7 +124,7 @@ def main() -> None: for word in trg1_only_words: output_file.writelines(word+'\n') with (lex_path2 / "unmatched_trg_words.txt").open("w", encoding="utf8") as output_file: - output_file.writelines(f'src.txt words not found in {trg_file1}\n') + output_file.writelines(f'trg.txt words not found in {trg_file1}\n') for word in trg2_only_words: output_file.writelines(word+'\n') From af3c79ae0ed7076af2230d129771aaa0d44dfd0f Mon Sep 17 00:00:00 2001 From: Bethany Moore Date: Tue, 25 Feb 2025 09:38:25 -0500 Subject: [PATCH 2/9] Update lex_tools for Catapult Reloaded work --- silnlp/common/compare_lexCR.py | 180 +++++++++++++++++++++++++++++++++ 1 file changed, 180 insertions(+) create mode 100644 silnlp/common/compare_lexCR.py diff --git a/silnlp/common/compare_lexCR.py b/silnlp/common/compare_lexCR.py new file mode 100644 index 00000000..1bbdaca5 --- /dev/null +++ b/silnlp/common/compare_lexCR.py @@ -0,0 +1,180 @@ +import argparse +from collections import Counter +import numpy +import re +from typing import List + +from ..common.environment import SIL_NLP_ENV +from machine.tokenization import LatinWordTokenizer + +# Latin 
Tokenizer from machine library +def get_all_words(src_file: str) -> List: + words = [] + tokenizer = LatinWordTokenizer() + with open(src_file, "r", encoding = "utf8") as src_data_file: + for line in src_data_file: + line_words = tokenizer.tokenize(line) + for word in line_words: + word = word.strip().strip("\'\"\\;,:.!?()-[]").lower() + if word != "" and not word.isnumeric(): + words.append(word) + return words + +# Naive whitespace-based script-agnostic word splitter +#def get_all_words(src_file: str) -> List: +# words = [] +# pattern = re.compile(r",(?=\S)") # Look for commas with no following space +# with open(src_file, "r", encoding = "utf8") as src_data_file: +# for line in src_data_file: +# for word in line.split(" "): +# word = word.strip().strip("\'\"\\;,:.!?()-[]0123456789").lower() +# finder = pattern.search(word) +# if finder: # Add space after commas as needed +# word = word[:finder.span()[1]]+" "+word[finder.span()[1]:] +# if word != "": +# words.append(word) +# return words + +def find_unique(words1: List, words2: List) -> List: + unique_words = [] + for word in words1: + if word not in words2: + unique_words.append(word) + return unique_words + + +def main() -> None: + parser = argparse.ArgumentParser(description="Compares unique words in two corpora") + parser.add_argument("exp1", help="First experiment folder from path S:\\Alignment\\experiments\\") + parser.add_argument("exp2", help="Second experiment folder from path S:\\Alignment\\experiments\\") + parser.add_argument("--stats", help="True or False: Output word count and number of renderings for common words", + action='store_true') + parser.add_argument("--src", help="If set, only the source side of the two experiment lexicons is compared", + action='store_true') + parser.add_argument("--trg", help="If set, only the target side of the two experiment lexicons is compared", + action='store_true') + args = parser.parse_args() + + # If not explicitly limited, compare both source and target lexicons + if args.src == False and args.trg == False: + args.src = True + args.trg = True + + lex_path1 = SIL_NLP_ENV.align_experiments_dir / args.exp1 + lex_path2 = SIL_NLP_ENV.align_experiments_dir / args.exp2 + + # Compare source words and write results to files + if args.src == True: + src_file1 = lex_path1 / "src.txt" + src_file2 = lex_path2 / "src.txt" + + # Find all words and unique words on source side + src_words1 = get_all_words(src_file1) + unique_src_words1 = numpy.unique(numpy.array(src_words1)) + src_words2 = get_all_words(src_file2) + unique_src_words2 = numpy.unique(numpy.array(src_words2)) + src1_only_words = find_unique(unique_src_words1,unique_src_words2) + src2_only_words = find_unique(unique_src_words2,unique_src_words1) + src1_word_counter = Counter(src_words1).most_common() + src2_word_counter = Counter(src_words2).most_common() + + # Write unique source words to files + src_words_file1 = lex_path1 / "src_words.txt" + src_words_file2 = lex_path2 / "src_words.txt" + with open(src_words_file1, "w", encoding="utf8") as output_file: + for word in unique_src_words1: + output_file.writelines(word+'\n') + with open(src_words_file2, "w", encoding="utf8") as output_file: + for word in unique_src_words2: + output_file.writelines(word+'\n') + + # Re-write src_words files with counts + with (lex_path1 / "src_words.txt").open("w", encoding = "utf8") as output_file: + for entry in src1_word_counter: + output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n') + with (lex_path2 / "src_words.txt").open("w", encoding = 
"utf8") as output_file: + for entry in src2_word_counter: + output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n') + + # Write source words missing from the alternate source file + with (lex_path1 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file: + output_file.writelines(f'src.txt words not found in {src_file2}\n') + for word in src1_only_words: + output_file.writelines(word+'\n') + with (lex_path2 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file: + output_file.writelines(f'src.txt words not found in {src_file1}\n') + for word in src2_only_words: + output_file.writelines(word+'\n') + + # Compare target words and write results to files + if args.trg == True: + trg_file1 = lex_path1 / "trg.txt" + trg_file2 = lex_path2 / "trg.txt" + + # Find all words and unique words on target side + trg_words1 = get_all_words(trg_file1) + unique_trg_words1 = numpy.unique(numpy.array(trg_words1)) + trg_words2 = get_all_words(trg_file2) + unique_trg_words2 = numpy.unique(numpy.array(trg_words2)) + trg1_only_words = find_unique(unique_trg_words1,unique_trg_words2) + trg2_only_words = find_unique(unique_trg_words2,unique_trg_words1) + + # Write unique target words to files + trg_words_file1 = lex_path1 / "trg_words.txt" + trg_words_file2 = lex_path2 / "trg_words.txt" + with open(trg_words_file1, "w", encoding="utf8") as output_file: + for word in unique_trg_words1: + output_file.writelines(word+'\n') + with open(trg_words_file2, "w", encoding="utf8") as output_file: + for word in unique_trg_words2: + output_file.writelines(word+'\n') + + # Write target words missing from the alternate target file + with (lex_path1 / "unmatched_trg_words.txt").open("w", encoding="utf8") as output_file: + output_file.writelines(f'trg.txt words not found in {trg_file2}\n') + for word in trg1_only_words: + output_file.writelines(word+'\n') + with (lex_path2 / "unmatched_trg_words.txt").open("w", encoding="utf8") as output_file: + output_file.writelines(f'trg.txt words not found in {trg_file1}\n') + for word in trg2_only_words: + output_file.writelines(word+'\n') + + # Write the lex coverage stats + with (lex_path1 / "lex_coverage.txt").open("a", encoding="utf8") as output_file: + if args.src == True: + output_file.writelines(f'Unique words in src.txt: {len(unique_src_words1)}\n') + output_file.writelines( + f'Words also found in {src_words_file2}: {len(unique_src_words1)-len(src1_only_words)}\n') + output_file.writelines(f'Words missing from {src_words_file2}: {len(src1_only_words)}\n') + if args.trg == True: + output_file.writelines(f'Unique words in trg.txt: {len(unique_trg_words1)}\n') + output_file.writelines( + f'Words also found in {trg_words_file2}: {len(unique_trg_words1)-len(trg1_only_words)}\n') + output_file.writelines(f'Words missing from {trg_words_file2}: {len(trg1_only_words)}\n') + + with (lex_path2 / "lex_coverage.txt").open("a", encoding="utf8") as output_file: + if args.src == True: + output_file.writelines(f'Unique words in src.txt: {len(unique_src_words2)}\n') + output_file.writelines( + f'Words also found in {src_words_file1}: {len(unique_src_words2)-len(src2_only_words)}\n') + output_file.writelines(f'Words missing from {src_words_file1}: {len(src2_only_words)}\n') + if args.trg == True: + output_file.writelines(f'Unique words in trg.txt: {len(unique_trg_words2)}\n') + output_file.writelines( + f'Words also found in {trg_words_file1}: {len(unique_trg_words2)-len(trg2_only_words)}\n') + output_file.writelines(f'Words missing from {trg_words_file1}: 
{len(trg2_only_words)}\n')
+
+    # Output stats if requested
+    if args.stats == True:
+        if args.src == True:
+            print(f'Unique words in src.txt: {len(unique_src_words1)}')
+            print(f'Words also found in {src_words_file2}: {len(unique_src_words1)-len(src1_only_words)}')
+            print(f'Words missing from {src_words_file2}: {len(src1_only_words)}')
+        if args.trg == True:
+            print(f'Unique words in {trg_words_file1}: {len(unique_trg_words1)}')
+            print(f'Words also found in {trg_words_file2}: {len(unique_trg_words1)-len(trg1_only_words)}')
+            print(f'Words missing from {trg_words_file2}: {len(trg1_only_words)}')
+
+
+if __name__ == "__main__":
+    main()

From 42e272411f93df3259df1142b092a756124b02f8 Mon Sep 17 00:00:00 2001
From: Bethany Moore
Date: Tue, 25 Feb 2025 09:39:06 -0500
Subject: [PATCH 3/9] Further update lex_tools for Catapult Reloaded work

---
 silnlp/common/compare_lex.py |  6 ++--
 silnlp/common/count_words.py | 17 ++++++---
 silnlp/common/find_words.py  | 70 ++++++++++++++++++++++++++++++++++++
 3 files changed, 85 insertions(+), 8 deletions(-)
 create mode 100644 silnlp/common/find_words.py

diff --git a/silnlp/common/compare_lex.py b/silnlp/common/compare_lex.py
index 84629b7a..da387543 100644
--- a/silnlp/common/compare_lex.py
+++ b/silnlp/common/compare_lex.py
@@ -14,9 +14,10 @@ def get_all_words(src_file: str) -> List:
         for line in src_data_file:
             line_words = tokenizer.tokenize(line)
             for word in line_words:
-            if word != "":
+                word = word.strip().strip("\'\"\\;,:.!?()-[]0123456789").lower()
+            if word != "" and not word.isnumeric():
                 words.append(word)
-    return(words)
+    return words
 
 # Naive whitespace-based script-agnostic word splitter
 #def get_all_words(src_file: str) -> List:
@@ -51,7 +52,6 @@ def main() -> None:
                         action='store_true')
     parser.add_argument("--trg", help="If set, only the target side of the two experiment lexicons is compared",
                         action='store_true')
-    parser.add_argument("--books", help="Books to include from src and trg.")
     args = parser.parse_args()
 
     # If not explicitly limited, compare both source and target lexicons
diff --git a/silnlp/common/count_words.py b/silnlp/common/count_words.py
index 6fd83204..f2fd2010 100644
--- a/silnlp/common/count_words.py
+++ b/silnlp/common/count_words.py
@@ -15,8 +15,9 @@ def main() -> None:
     parser.add_argument("experiment", help="Experiment folder from path S:\\Alignment\\experiments\\")
     parser.add_argument("--aligner", help="Aligner: eflomal, fast-align, hmm", default="eflomal")
     parser.add_argument("--num", help="Number of most common words to include", type=int, default=100)
-    parser.add_argument("--stats", help="True or False: Print word count and number of renderings for common words",
+    parser.add_argument("--stats", help="Print word count and number of renderings for common words",
                         action='store_true')
+    parser.add_argument("--count", help="Include count in src word files", action='store_true')
    args = parser.parse_args()
 
     # Set up path and lex files
@@ -49,6 +50,8 @@ def main() -> None:
         if word != "" and not word.isnumeric():
             src_words.append(word)
     src_data_word_counter = Counter(src_words).most_common(args.num)
+    if args.count:
+        src_word_counter = Counter(src_words).most_common()
     unique_src_words = numpy.unique(numpy.array(src_words))
 
     # Pull all the separate words from the target data. Take all unique.
@@ -65,7 +68,7 @@ def main() -> None: trg_words.append(word) unique_trg_words = numpy.unique(numpy.array(trg_words)) - # Clean lexicon file and prep for pandas csv reader + # Prep lexicon file for pandas csv reader (escape quotes) with (lex_path / lex_txt_file).open("r", encoding="utf8") as lexicon: with (lex_path / new_lex_txt_file).open("w", encoding="utf8") as new_lex: for line in lexicon.readlines(): @@ -111,9 +114,13 @@ def main() -> None: for src_wd in common_wd: writer.writerow([src_wd, *common_wd[src_wd]]) - with (lex_path / "src_words.txt").open("w", encoding = "utf8") as output_file: - for word in unique_src_words: - output_file.writelines(word + '\n') + with (lex_path / "src_words.txt").open("w", encoding = "utf8") as output_file: + if args.count: + for entry in src_word_counter: + output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n') + else: + for word in unique_src_words: + output_file.writelines(word + '\n') # Optionally, output a few stats if args.stats: diff --git a/silnlp/common/find_words.py b/silnlp/common/find_words.py new file mode 100644 index 00000000..a5cd1a6b --- /dev/null +++ b/silnlp/common/find_words.py @@ -0,0 +1,70 @@ +import argparse +from collections import Counter +import csv +import unicodedata + +from ..common.environment import SIL_NLP_ENV + +# Normalize combined characters for Devanagari (Ref: https://docs.python.org/3/howto/unicode.html#comparing-strings) +def NFD(s): + return unicodedata.normalize('NFD', s) + +def main(): + parser = argparse.ArgumentParser(description="Counts lexicon entries") + parser.add_argument("experiment", help="Experiment folder from path S:\\Alignment\\experiments\\") + parser.add_argument("--word_list", help="File containing words to find", default="unmatched_src_words.txt") + args = parser.parse_args() + + # Set up path and files + lex_path = SIL_NLP_ENV.align_experiments_dir / args.experiment + word_filename = SIL_NLP_ENV.align_experiments_dir / args.experiment / args.word_list + vref_filename = SIL_NLP_ENV.align_experiments_dir / args.experiment/ "refs.txt" + + # Get count of each word in the file + with (lex_path / "src_words.txt").open("r", encoding="utf8") as src_wd_file: + src_word_counts = [] + for entry in src_wd_file: + entry = list(entry.split('\t')) + if len(entry) > 1: + entry[1] = int(entry[1].strip()) + src_word_counts.append(entry) + else: + print("Error: word counts are missing. Please run count_words.py with the --count flag set.") + return 1 + + # Extract list of words + src_word_dict = dict(list(src_word_counts)) + with(word_filename).open("r", encoding = "utf8") as word_file: + words = [] + for word in word_file: + words.append(word.rstrip('\n')) + # Check for words and word count in each verse; write to output file. 
+ with (lex_path / "src.txt").open("r", encoding = "utf8") as src_data_file: + with(vref_filename).open("r", encoding = "utf8") as ref_file: + word_list = list(enumerate(words)) + result = [] + seen_words = [] + for verse in zip(ref_file, src_data_file): + word_text = [] + word_num = [] + word_count = 0 + for word in word_list: + #if NFD(NFD(word[1])) in NFD(NFD(verse[1])): + if word[1] in verse[1] and word[1] not in seen_words: + for entry in src_word_counts: + if entry[0] == word[1]: + word_count += entry[1] + seen_words.append(word[1]) + word_text.append(word[1]) + word_num.append(src_word_dict[word[1]]) + result.append([verse[0].rstrip('\n'), word_count, word_num, word_text]) + with (lex_path / "unmatched_word_verses.txt").open("w", encoding = "utf8") as output_file: + writer = csv.writer(output_file, lineterminator="\n") + writer.writerow(['Reference','Novelty Score','Word Counts','Words']) + for line in result: + writer.writerow([line[0], line[1], line[2], *line[3]]) + #print(result) + + +if __name__ == '__main__': + main() From f47eed9accb9c05410b8fbf523e5381d326d76f8 Mon Sep 17 00:00:00 2001 From: Bethany Moore Date: Wed, 23 Apr 2025 11:14:10 -0400 Subject: [PATCH 4/9] Add functionality to extract vocab coverage lists for Catapult --- silnlp/common/compare_lex_Catapult.py | 185 ++++++++++++++++++++++++++ silnlp/common/find_words_Catapult.py | 71 ++++++++++ 2 files changed, 256 insertions(+) create mode 100644 silnlp/common/compare_lex_Catapult.py create mode 100644 silnlp/common/find_words_Catapult.py diff --git a/silnlp/common/compare_lex_Catapult.py b/silnlp/common/compare_lex_Catapult.py new file mode 100644 index 00000000..9f63bf69 --- /dev/null +++ b/silnlp/common/compare_lex_Catapult.py @@ -0,0 +1,185 @@ +import argparse +from collections import Counter +import numpy +import re +from typing import List + +from ..common.environment import SIL_NLP_ENV +from machine.tokenization import LatinWordTokenizer + +# Latin Tokenizer from machine library +def get_all_words(src_file: str) -> List: + words = [] + tokenizer = LatinWordTokenizer() + with open(src_file, "r", encoding = "utf8") as src_data_file: + for line in src_data_file: + line_words = tokenizer.tokenize(line) + for word in line_words: + word = word.strip().strip("\'\"\\;,:.!?()-[]").lower() + if word != "" and not word.isnumeric(): + words.append(word) + return words + +# Naive whitespace-based script-agnostic word splitter +#def get_all_words(src_file: str) -> List: +# words = [] +# pattern = re.compile(r",(?=\S)") # Look for commas with no following space +# with open(src_file, "r", encoding = "utf8") as src_data_file: +# for line in src_data_file: +# for word in line.split(" "): +# word = word.strip().strip("\'\"\\;,:.!?()-[]0123456789").lower() +# finder = pattern.search(word) +# if finder: # Add space after commas as needed +# word = word[:finder.span()[1]]+" "+word[finder.span()[1]:] +# if word != "": +# words.append(word) +# return words + +def find_unique(words1: List, words2: List) -> List: + unique_words = [] + for word in words1: + if word not in words2: + unique_words.append(word) + return unique_words + + +def main() -> None: + parser = argparse.ArgumentParser(description="Compares unique words in two corpora") + parser.add_argument("exp1", help="First experiment folder from path S:\\Alignment\\experiments\\") + parser.add_argument("exp2", help="Second experiment folder from path S:\\Alignment\\experiments\\") + parser.add_argument("--stats", help="True or False: Output word count and number of renderings for 
common words", + action='store_true') + parser.add_argument("--src", help="If set, only the source side of the two experiment lexicons is compared", + action='store_true') + parser.add_argument("--trg", help="If set, only the target side of the two experiment lexicons is compared", + action='store_true') + args = parser.parse_args() + + # If not explicitly limited, compare both source and target lexicons + if args.src == False and args.trg == False: + args.src = True + args.trg = True + + lex_path1 = SIL_NLP_ENV.align_experiments_dir / args.exp1 + lex_path2 = SIL_NLP_ENV.align_experiments_dir / args.exp2 + + # Compare source words and write results to files + if args.src == True: + src_file1 = lex_path1 / "src.txt" + src_file2 = lex_path2 / "src.txt" + + # Find all words and unique words on source side + src_words1 = get_all_words(src_file1) + unique_src_words1 = numpy.unique(numpy.array(src_words1)) + src_words2 = get_all_words(src_file2) + unique_src_words2 = numpy.unique(numpy.array(src_words2)) + src1_only_words = find_unique(unique_src_words1,unique_src_words2) + src2_only_words = find_unique(unique_src_words2,unique_src_words1) + src1_word_counter = Counter(src_words1).most_common() + src2_word_counter = Counter(src_words2).most_common() + + # Write unique source words to files + src_words_file1 = lex_path1 / "src_words.txt" + src_words_file2 = lex_path2 / "src_words.txt" + with open(src_words_file1, "w", encoding="utf8") as output_file: + for word in unique_src_words1: + output_file.writelines(word+'\n') + with open(src_words_file2, "w", encoding="utf8") as output_file: + for word in unique_src_words2: + output_file.writelines(word+'\n') + + # Re-write src_words files with counts + with (lex_path1 / "src_words.txt").open("w", encoding = "utf8") as output_file: + for entry in src1_word_counter: + output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n') + with (lex_path2 / "src_words.txt").open("w", encoding = "utf8") as output_file: + for entry in src2_word_counter: + output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n') + # Output src1 words missing from src2, with their counts + with (lex_path1 / "missing_word_counts.txt").open("w", encoding = "utf8") as output_file: + for entry in src1_word_counter: + if entry[0] in src1_only_words: + output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n') + + # Write source words missing from the alternate source file + with (lex_path1 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file: + output_file.writelines(f'src.txt words not found in {src_file2}\n') + for word in src1_only_words: + output_file.writelines(word+'\n') + with (lex_path2 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file: + output_file.writelines(f'src.txt words not found in {src_file1}\n') + for word in src2_only_words: + output_file.writelines(word+'\n') + + # Compare target words and write results to files + if args.trg == True: + trg_file1 = lex_path1 / "trg.txt" + trg_file2 = lex_path2 / "trg.txt" + + # Find all words and unique words on target side + trg_words1 = get_all_words(trg_file1) + unique_trg_words1 = numpy.unique(numpy.array(trg_words1)) + trg_words2 = get_all_words(trg_file2) + unique_trg_words2 = numpy.unique(numpy.array(trg_words2)) + trg1_only_words = find_unique(unique_trg_words1,unique_trg_words2) + trg2_only_words = find_unique(unique_trg_words2,unique_trg_words1) + + # Write unique target words to files + trg_words_file1 = lex_path1 / "trg_words.txt" + trg_words_file2 = lex_path2 / 
"trg_words.txt" + with open(trg_words_file1, "w", encoding="utf8") as output_file: + for word in unique_trg_words1: + output_file.writelines(word+'\n') + with open(trg_words_file2, "w", encoding="utf8") as output_file: + for word in unique_trg_words2: + output_file.writelines(word+'\n') + + # Write target words missing from the alternate target file + with (lex_path1 / "unmatched_trg_words.txt").open("w", encoding="utf8") as output_file: + output_file.writelines(f'trg.txt words not found in {trg_file2}\n') + for word in trg1_only_words: + output_file.writelines(word+'\n') + with (lex_path2 / "unmatched_trg_words.txt").open("w", encoding="utf8") as output_file: + output_file.writelines(f'trg.txt words not found in {trg_file1}\n') + for word in trg2_only_words: + output_file.writelines(word+'\n') + + # Write the lex coverage stats + with (lex_path1 / "lex_coverage.txt").open("a", encoding="utf8") as output_file: + if args.src == True: + output_file.writelines(f'Unique words in src.txt: {len(unique_src_words1)}\n') + output_file.writelines( + f'Words also found in {src_words_file2}: {len(unique_src_words1)-len(src1_only_words)}\n') + output_file.writelines(f'Words missing from {src_words_file2}: {len(src1_only_words)}\n') + if args.trg == True: + output_file.writelines(f'Unique words in trg.txt: {len(unique_trg_words1)}\n') + output_file.writelines( + f'Words also found in {trg_words_file2}: {len(unique_trg_words1)-len(trg1_only_words)}\n') + output_file.writelines(f'Words missing from {trg_words_file2}: {len(trg1_only_words)}\n') + + with (lex_path2 / "lex_coverage.txt").open("a", encoding="utf8") as output_file: + if args.src == True: + output_file.writelines(f'Unique words in src.txt: {len(unique_src_words2)}\n') + output_file.writelines( + f'Words also found in {src_words_file1}: {len(unique_src_words2)-len(src2_only_words)}\n') + output_file.writelines(f'Words missing from {src_words_file1}: {len(src2_only_words)}\n') + if args.trg == True: + output_file.writelines(f'Unique words in trg.txt: {len(unique_trg_words2)}\n') + output_file.writelines( + f'Words also found in {trg_words_file1}: {len(unique_trg_words2)-len(trg2_only_words)}\n') + output_file.writelines(f'Words missing from {trg_words_file1}: {len(trg2_only_words)}\n') + + # Output stats if requested + if args.stats == True: + if args.src == True: + print(f'Unique words in src.txt: {len(unique_src_words1)}') + print(f'Words also found in {src_words_file2}: {len(unique_src_words1)-len(src1_only_words)}') + print(f'Words missing from {src_words_file2}: {len(src1_only_words)}') + if args.trg == True: + print(f'Unique words in {trg_words_file1}: {len(unique_trg_words1)}') + print(f'Words also found in {trg_words_file2}: {len(unique_trg_words1)-len(trg1_only_words)}') + print(f'Words missing from {trg_words_file2}: {len(trg1_only_words)}') + + +if __name__ == "__main__": + main() diff --git a/silnlp/common/find_words_Catapult.py b/silnlp/common/find_words_Catapult.py new file mode 100644 index 00000000..d6de1782 --- /dev/null +++ b/silnlp/common/find_words_Catapult.py @@ -0,0 +1,71 @@ +import argparse +from collections import Counter +import csv +import unicodedata + +from ..common.environment import SIL_NLP_ENV + +# Normalize combined characters for Devanagari (Ref: https://docs.python.org/3/howto/unicode.html#comparing-strings) +def NFD(s): + return unicodedata.normalize('NFD', s) + +def main(): + parser = argparse.ArgumentParser(description="Counts lexicon entries") + parser.add_argument("experiment", help="Experiment folder from 
path S:\\Alignment\\experiments\\") + parser.add_argument("--word_list", help="File containing words to find", default="unmatched_src_words.txt") + args = parser.parse_args() + + # Set up path and files + lex_path = SIL_NLP_ENV.align_experiments_dir / args.experiment + word_filename = SIL_NLP_ENV.align_experiments_dir / args.experiment / args.word_list + vref_filename = SIL_NLP_ENV.align_experiments_dir / args.experiment/ "refs.txt" + + # Get count of each word in the file + with (lex_path / "src_words.txt").open("r", encoding="utf8") as src_wd_file: + src_word_counts = [] + for entry in src_wd_file: + entry = list(entry.split('\t')) + if len(entry) > 1: + entry[1] = int(entry[1].strip()) + src_word_counts.append(entry) + else: + print("Error: word counts are missing. Please run count_words.py with the --count flag set.") + return 1 + + # Extract list of words + src_word_dict = dict(list(src_word_counts)) + with(word_filename).open("r", encoding = "utf8") as word_file: + words = [] + for word in word_file: + words.append(word.rstrip('\n')) + # Check for words and word count in each verse; write to output file. + with (lex_path / "src.txt").open("r", encoding = "utf8") as src_data_file: + with(vref_filename).open("r", encoding = "utf8") as ref_file: + word_list = list(enumerate(words)) + result = [] + seen_words = [] + for verse in zip(ref_file, src_data_file): + word_text = [] + word_num = [] + word_count = 0 + for word in word_list: + #if NFD(NFD(word[1])) in NFD(NFD(verse[1])): + #if word[1] in verse[1]: # (to find all instances; not just first) + if word[1] in verse[1] and word[1] not in seen_words: + for entry in src_word_counts: + if entry[0] == word[1]: + word_count += entry[1] + seen_words.append(word[1]) + word_text.append(word[1]) + word_num.append(src_word_dict[word[1]]) + result.append([verse[0].rstrip('\n'), word_count, word_num, word_text]) + with (lex_path / "unmatched_word_verses.txt").open("w", encoding = "utf8") as output_file: + writer = csv.writer(output_file, lineterminator="\n") + writer.writerow(['Reference','Novelty Score','Word Counts','Words']) + for line in result: + writer.writerow([line[0], line[1], line[2], *line[3]]) + #print(result) + + +if __name__ == '__main__': + main() From 523d1d869af4ec3d772aeead889732288634b83a Mon Sep 17 00:00:00 2001 From: Bethany Moore Date: Tue, 20 May 2025 09:55:54 -0400 Subject: [PATCH 5/9] Revert to naive word separator --- silnlp/common/compare_lexCR.py | 44 +++++++++++++++++----------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/silnlp/common/compare_lexCR.py b/silnlp/common/compare_lexCR.py index 1bbdaca5..b4ab1d6f 100644 --- a/silnlp/common/compare_lexCR.py +++ b/silnlp/common/compare_lexCR.py @@ -8,33 +8,33 @@ from machine.tokenization import LatinWordTokenizer # Latin Tokenizer from machine library -def get_all_words(src_file: str) -> List: - words = [] - tokenizer = LatinWordTokenizer() - with open(src_file, "r", encoding = "utf8") as src_data_file: - for line in src_data_file: - line_words = tokenizer.tokenize(line) - for word in line_words: - word = word.strip().strip("\'\"\\;,:.!?()-[]").lower() - if word != "" and not word.isnumeric(): - words.append(word) - return words - -# Naive whitespace-based script-agnostic word splitter #def get_all_words(src_file: str) -> List: # words = [] -# pattern = re.compile(r",(?=\S)") # Look for commas with no following space -# with open(src_file, "r", encoding = "utf8") as src_data_file: +# tokenizer = LatinWordTokenizer() +# with open(src_file, "r", 
encoding = "utf8") as src_data_file: # for line in src_data_file: -# for word in line.split(" "): -# word = word.strip().strip("\'\"\\;,:.!?()-[]0123456789").lower() -# finder = pattern.search(word) -# if finder: # Add space after commas as needed -# word = word[:finder.span()[1]]+" "+word[finder.span()[1]:] -# if word != "": -# words.append(word) +# line_words = tokenizer.tokenize(line) +# for word in line_words: +# word = word.strip().strip("\'\"\\;,:.!?()-[]").lower() +# if word != "" and not word.isnumeric(): +# words.append(word) # return words +# Naive whitespace-based script-agnostic word splitter +def get_all_words(src_file: str) -> List: + words = [] + pattern = re.compile(r",(?=\S)") # Look for commas with no following space + with open(src_file, "r", encoding = "utf8") as src_data_file: + for line in src_data_file: + for word in line.split(" "): + word = word.strip().strip("\'\"\\;,:.!?()-[]0123456789").lower() + finder = pattern.search(word) + if finder: # Add space after commas as needed + word = word[:finder.span()[1]]+" "+word[finder.span()[1]:] + if word != "": + words.append(word) + return words + def find_unique(words1: List, words2: List) -> List: unique_words = [] for word in words1: From 70cd9440ac264e26f170e928bf62a1f30ddd4b7b Mon Sep 17 00:00:00 2001 From: Bethany Moore Date: Tue, 20 May 2025 12:42:29 -0400 Subject: [PATCH 6/9] Revert compare_lex.py to naive word splitter. --- silnlp/common/compare_lex.py | 42 ++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/silnlp/common/compare_lex.py b/silnlp/common/compare_lex.py index da387543..f595e5e3 100644 --- a/silnlp/common/compare_lex.py +++ b/silnlp/common/compare_lex.py @@ -7,33 +7,33 @@ from machine.tokenization import LatinWordTokenizer # Latin Tokenizer from machine library -def get_all_words(src_file: str) -> List: - words = [] - tokenizer = LatinWordTokenizer() - with open(src_file, "r", encoding = "utf8") as src_data_file: - for line in src_data_file: - line_words = tokenizer.tokenize(line) - for word in line_words: - word = word.strip().strip("\'\"\\;,:.!?()-[]0123456789").lower() - if word != "" and not word.isnumeric(): - words.append(word) - return words - -# Naive whitespace-based script-agnostic word splitter #def get_all_words(src_file: str) -> List: # words = [] -# pattern = re.compile(r",(?=\S)") # Look for commas with no following space -# with open(src_file, "r", encoding = "utf8") as src_data_file: +# tokenizer = LatinWordTokenizer() +# with open(src_file, "r", encoding = "utf8") as src_data_file: # for line in src_data_file: -# for word in line.split(" "): +# line_words = tokenizer.tokenize(line) +# for word in line_words: # word = word.strip().strip("\'\"\\;,:.!?()-[]0123456789").lower() -# finder = pattern.search(word) -# if finder: # Add space after commas as needed -# word = word[:finder.span()[1]]+" "+word[finder.span()[1]:] -# if word != "": -# words.append(word) +# if word != "" and not word.isnumeric(): +# words.append(word) # return words +# Naive whitespace-based script-agnostic word splitter +def get_all_words(src_file: str) -> List: + words = [] + pattern = re.compile(r",(?=\S)") # Look for commas with no following space + with open(src_file, "r", encoding = "utf8") as src_data_file: + for line in src_data_file: + for word in line.split(" "): + word = word.strip().strip("\'\"\\;,:.!?()-[]0123456789").lower() + finder = pattern.search(word) + if finder: # Add space after commas as needed + word = word[:finder.span()[1]]+" 
"+word[finder.span()[1]:] + if word != "": + words.append(word) + return words + def find_unique(words1: List, words2: List) -> List: unique_words = [] for word in words1: From 0cce13edd5f7a9618b3b4af4b87edb1a0d9ca077 Mon Sep 17 00:00:00 2001 From: Bethany Moore Date: Fri, 23 May 2025 13:33:30 -0400 Subject: [PATCH 7/9] Add word counts to unmatched_src_words.txt output --- silnlp/common/compare_lexCR.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/silnlp/common/compare_lexCR.py b/silnlp/common/compare_lexCR.py index b4ab1d6f..fb14c90e 100644 --- a/silnlp/common/compare_lexCR.py +++ b/silnlp/common/compare_lexCR.py @@ -97,14 +97,27 @@ def main() -> None: output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n') # Write source words missing from the alternate source file + #with (lex_path1 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file: + # output_file.writelines(f'src.txt words not found in {src_file2}\n') + # for word in src1_only_words: + # output_file.writelines(word+'\n') + #with (lex_path2 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file: + # output_file.writelines(f'src.txt words not found in {src_file1}\n') + # for word in src2_only_words: + # output_file.writelines(word+'\n') + + + # Rewrite of above section to include counts in the output file: with (lex_path1 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file: output_file.writelines(f'src.txt words not found in {src_file2}\n') - for word in src1_only_words: - output_file.writelines(word+'\n') + for entry in src1_word_counter: + if entry[0] in src1_only_words: + output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n') with (lex_path2 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file: output_file.writelines(f'src.txt words not found in {src_file1}\n') - for word in src2_only_words: - output_file.writelines(word+'\n') + for entry in src2_word_counter: + if entry[0] in src2_only_words: + output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n') # Compare target words and write results to files if args.trg == True: From 5c408db3bf93919da0cbc9d0164db393114985bc Mon Sep 17 00:00:00 2001 From: Bethany Moore Date: Wed, 11 Jun 2025 14:06:46 -0400 Subject: [PATCH 8/9] Update lex_tools with word counts for Catapult Reloaded --- silnlp/common/compare_lex.py | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/silnlp/common/compare_lex.py b/silnlp/common/compare_lex.py index f595e5e3..fb14c90e 100644 --- a/silnlp/common/compare_lex.py +++ b/silnlp/common/compare_lex.py @@ -1,4 +1,5 @@ import argparse +from collections import Counter import numpy import re from typing import List @@ -14,7 +15,7 @@ # for line in src_data_file: # line_words = tokenizer.tokenize(line) # for word in line_words: -# word = word.strip().strip("\'\"\\;,:.!?()-[]0123456789").lower() +# word = word.strip().strip("\'\"\\;,:.!?()-[]").lower() # if word != "" and not word.isnumeric(): # words.append(word) # return words @@ -74,6 +75,8 @@ def main() -> None: unique_src_words2 = numpy.unique(numpy.array(src_words2)) src1_only_words = find_unique(unique_src_words1,unique_src_words2) src2_only_words = find_unique(unique_src_words2,unique_src_words1) + src1_word_counter = Counter(src_words1).most_common() + src2_word_counter = Counter(src_words2).most_common() # Write unique source words to files src_words_file1 = lex_path1 / "src_words.txt" @@ -85,15 +88,36 @@ def main() -> None: for 
word in unique_src_words2: output_file.writelines(word+'\n') + # Re-write src_words files with counts + with (lex_path1 / "src_words.txt").open("w", encoding = "utf8") as output_file: + for entry in src1_word_counter: + output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n') + with (lex_path2 / "src_words.txt").open("w", encoding = "utf8") as output_file: + for entry in src2_word_counter: + output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n') + # Write source words missing from the alternate source file + #with (lex_path1 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file: + # output_file.writelines(f'src.txt words not found in {src_file2}\n') + # for word in src1_only_words: + # output_file.writelines(word+'\n') + #with (lex_path2 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file: + # output_file.writelines(f'src.txt words not found in {src_file1}\n') + # for word in src2_only_words: + # output_file.writelines(word+'\n') + + + # Rewrite of above section to include counts in the output file: with (lex_path1 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file: output_file.writelines(f'src.txt words not found in {src_file2}\n') - for word in src1_only_words: - output_file.writelines(word+'\n') + for entry in src1_word_counter: + if entry[0] in src1_only_words: + output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n') with (lex_path2 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file: output_file.writelines(f'src.txt words not found in {src_file1}\n') - for word in src2_only_words: - output_file.writelines(word+'\n') + for entry in src2_word_counter: + if entry[0] in src2_only_words: + output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n') # Compare target words and write results to files if args.trg == True: From d68f27a9d51ddfe76eea8f4a470561930b0ea3e2 Mon Sep 17 00:00:00 2001 From: Bethany Moore Date: Wed, 11 Jun 2025 14:28:00 -0400 Subject: [PATCH 9/9] Clean up duplicate versions --- silnlp/common/compare_lexCR.py | 193 -------------------------- silnlp/common/compare_lex_Catapult.py | 185 ------------------------ silnlp/common/find_words.py | 1 + silnlp/common/find_words_Catapult.py | 71 ---------- 4 files changed, 1 insertion(+), 449 deletions(-) delete mode 100644 silnlp/common/compare_lexCR.py delete mode 100644 silnlp/common/compare_lex_Catapult.py delete mode 100644 silnlp/common/find_words_Catapult.py diff --git a/silnlp/common/compare_lexCR.py b/silnlp/common/compare_lexCR.py deleted file mode 100644 index fb14c90e..00000000 --- a/silnlp/common/compare_lexCR.py +++ /dev/null @@ -1,193 +0,0 @@ -import argparse -from collections import Counter -import numpy -import re -from typing import List - -from ..common.environment import SIL_NLP_ENV -from machine.tokenization import LatinWordTokenizer - -# Latin Tokenizer from machine library -#def get_all_words(src_file: str) -> List: -# words = [] -# tokenizer = LatinWordTokenizer() -# with open(src_file, "r", encoding = "utf8") as src_data_file: -# for line in src_data_file: -# line_words = tokenizer.tokenize(line) -# for word in line_words: -# word = word.strip().strip("\'\"\\;,:.!?()-[]").lower() -# if word != "" and not word.isnumeric(): -# words.append(word) -# return words - -# Naive whitespace-based script-agnostic word splitter -def get_all_words(src_file: str) -> List: - words = [] - pattern = re.compile(r",(?=\S)") # Look for commas with no following space - with open(src_file, "r", encoding = "utf8") as src_data_file: - 
for line in src_data_file: - for word in line.split(" "): - word = word.strip().strip("\'\"\\;,:.!?()-[]0123456789").lower() - finder = pattern.search(word) - if finder: # Add space after commas as needed - word = word[:finder.span()[1]]+" "+word[finder.span()[1]:] - if word != "": - words.append(word) - return words - -def find_unique(words1: List, words2: List) -> List: - unique_words = [] - for word in words1: - if word not in words2: - unique_words.append(word) - return unique_words - - -def main() -> None: - parser = argparse.ArgumentParser(description="Compares unique words in two corpora") - parser.add_argument("exp1", help="First experiment folder from path S:\\Alignment\\experiments\\") - parser.add_argument("exp2", help="Second experiment folder from path S:\\Alignment\\experiments\\") - parser.add_argument("--stats", help="True or False: Output word count and number of renderings for common words", - action='store_true') - parser.add_argument("--src", help="If set, only the source side of the two experiment lexicons is compared", - action='store_true') - parser.add_argument("--trg", help="If set, only the target side of the two experiment lexicons is compared", - action='store_true') - args = parser.parse_args() - - # If not explicitly limited, compare both source and target lexicons - if args.src == False and args.trg == False: - args.src = True - args.trg = True - - lex_path1 = SIL_NLP_ENV.align_experiments_dir / args.exp1 - lex_path2 = SIL_NLP_ENV.align_experiments_dir / args.exp2 - - # Compare source words and write results to files - if args.src == True: - src_file1 = lex_path1 / "src.txt" - src_file2 = lex_path2 / "src.txt" - - # Find all words and unique words on source side - src_words1 = get_all_words(src_file1) - unique_src_words1 = numpy.unique(numpy.array(src_words1)) - src_words2 = get_all_words(src_file2) - unique_src_words2 = numpy.unique(numpy.array(src_words2)) - src1_only_words = find_unique(unique_src_words1,unique_src_words2) - src2_only_words = find_unique(unique_src_words2,unique_src_words1) - src1_word_counter = Counter(src_words1).most_common() - src2_word_counter = Counter(src_words2).most_common() - - # Write unique source words to files - src_words_file1 = lex_path1 / "src_words.txt" - src_words_file2 = lex_path2 / "src_words.txt" - with open(src_words_file1, "w", encoding="utf8") as output_file: - for word in unique_src_words1: - output_file.writelines(word+'\n') - with open(src_words_file2, "w", encoding="utf8") as output_file: - for word in unique_src_words2: - output_file.writelines(word+'\n') - - # Re-write src_words files with counts - with (lex_path1 / "src_words.txt").open("w", encoding = "utf8") as output_file: - for entry in src1_word_counter: - output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n') - with (lex_path2 / "src_words.txt").open("w", encoding = "utf8") as output_file: - for entry in src2_word_counter: - output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n') - - # Write source words missing from the alternate source file - #with (lex_path1 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file: - # output_file.writelines(f'src.txt words not found in {src_file2}\n') - # for word in src1_only_words: - # output_file.writelines(word+'\n') - #with (lex_path2 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file: - # output_file.writelines(f'src.txt words not found in {src_file1}\n') - # for word in src2_only_words: - # output_file.writelines(word+'\n') - - - # Rewrite of above section to 
include counts in the output file: - with (lex_path1 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file: - output_file.writelines(f'src.txt words not found in {src_file2}\n') - for entry in src1_word_counter: - if entry[0] in src1_only_words: - output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n') - with (lex_path2 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file: - output_file.writelines(f'src.txt words not found in {src_file1}\n') - for entry in src2_word_counter: - if entry[0] in src2_only_words: - output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n') - - # Compare target words and write results to files - if args.trg == True: - trg_file1 = lex_path1 / "trg.txt" - trg_file2 = lex_path2 / "trg.txt" - - # Find all words and unique words on target side - trg_words1 = get_all_words(trg_file1) - unique_trg_words1 = numpy.unique(numpy.array(trg_words1)) - trg_words2 = get_all_words(trg_file2) - unique_trg_words2 = numpy.unique(numpy.array(trg_words2)) - trg1_only_words = find_unique(unique_trg_words1,unique_trg_words2) - trg2_only_words = find_unique(unique_trg_words2,unique_trg_words1) - - # Write unique target words to files - trg_words_file1 = lex_path1 / "trg_words.txt" - trg_words_file2 = lex_path2 / "trg_words.txt" - with open(trg_words_file1, "w", encoding="utf8") as output_file: - for word in unique_trg_words1: - output_file.writelines(word+'\n') - with open(trg_words_file2, "w", encoding="utf8") as output_file: - for word in unique_trg_words2: - output_file.writelines(word+'\n') - - # Write target words missing from the alternate target file - with (lex_path1 / "unmatched_trg_words.txt").open("w", encoding="utf8") as output_file: - output_file.writelines(f'trg.txt words not found in {trg_file2}\n') - for word in trg1_only_words: - output_file.writelines(word+'\n') - with (lex_path2 / "unmatched_trg_words.txt").open("w", encoding="utf8") as output_file: - output_file.writelines(f'trg.txt words not found in {trg_file1}\n') - for word in trg2_only_words: - output_file.writelines(word+'\n') - - # Write the lex coverage stats - with (lex_path1 / "lex_coverage.txt").open("a", encoding="utf8") as output_file: - if args.src == True: - output_file.writelines(f'Unique words in src.txt: {len(unique_src_words1)}\n') - output_file.writelines( - f'Words also found in {src_words_file2}: {len(unique_src_words1)-len(src1_only_words)}\n') - output_file.writelines(f'Words missing from {src_words_file2}: {len(src1_only_words)}\n') - if args.trg == True: - output_file.writelines(f'Unique words in trg.txt: {len(unique_trg_words1)}\n') - output_file.writelines( - f'Words also found in {trg_words_file2}: {len(unique_trg_words1)-len(trg1_only_words)}\n') - output_file.writelines(f'Words missing from {trg_words_file2}: {len(trg1_only_words)}\n') - - with (lex_path2 / "lex_coverage.txt").open("a", encoding="utf8") as output_file: - if args.src == True: - output_file.writelines(f'Unique words in src.txt: {len(unique_src_words2)}\n') - output_file.writelines( - f'Words also found in {src_words_file1}: {len(unique_src_words2)-len(src2_only_words)}\n') - output_file.writelines(f'Words missing from {src_words_file1}: {len(src2_only_words)}\n') - if args.trg == True: - output_file.writelines(f'Unique words in trg.txt: {len(unique_trg_words2)}\n') - output_file.writelines( - f'Words also found in {trg_words_file1}: {len(unique_trg_words2)-len(trg2_only_words)}\n') - output_file.writelines(f'Words missing from {trg_words_file1}: {len(trg2_only_words)}\n') - - 
# Output stats if requested - if args.stats == True: - if args.src == True: - print(f'Unique words in src.txt: {len(unique_src_words1)}') - print(f'Words also found in {src_words_file2}: {len(unique_src_words1)-len(src1_only_words)}') - print(f'Words missing from {src_words_file2}: {len(src1_only_words)}') - if args.trg == True: - print(f'Unique words in {trg_words_file1}: {len(unique_trg_words1)}') - print(f'Words also found in {trg_words_file2}: {len(unique_trg_words1)-len(trg1_only_words)}') - print(f'Words missing from {trg_words_file2}: {len(trg1_only_words)}') - - -if __name__ == "__main__": - main() diff --git a/silnlp/common/compare_lex_Catapult.py b/silnlp/common/compare_lex_Catapult.py deleted file mode 100644 index 9f63bf69..00000000 --- a/silnlp/common/compare_lex_Catapult.py +++ /dev/null @@ -1,185 +0,0 @@ -import argparse -from collections import Counter -import numpy -import re -from typing import List - -from ..common.environment import SIL_NLP_ENV -from machine.tokenization import LatinWordTokenizer - -# Latin Tokenizer from machine library -def get_all_words(src_file: str) -> List: - words = [] - tokenizer = LatinWordTokenizer() - with open(src_file, "r", encoding = "utf8") as src_data_file: - for line in src_data_file: - line_words = tokenizer.tokenize(line) - for word in line_words: - word = word.strip().strip("\'\"\\;,:.!?()-[]").lower() - if word != "" and not word.isnumeric(): - words.append(word) - return words - -# Naive whitespace-based script-agnostic word splitter -#def get_all_words(src_file: str) -> List: -# words = [] -# pattern = re.compile(r",(?=\S)") # Look for commas with no following space -# with open(src_file, "r", encoding = "utf8") as src_data_file: -# for line in src_data_file: -# for word in line.split(" "): -# word = word.strip().strip("\'\"\\;,:.!?()-[]0123456789").lower() -# finder = pattern.search(word) -# if finder: # Add space after commas as needed -# word = word[:finder.span()[1]]+" "+word[finder.span()[1]:] -# if word != "": -# words.append(word) -# return words - -def find_unique(words1: List, words2: List) -> List: - unique_words = [] - for word in words1: - if word not in words2: - unique_words.append(word) - return unique_words - - -def main() -> None: - parser = argparse.ArgumentParser(description="Compares unique words in two corpora") - parser.add_argument("exp1", help="First experiment folder from path S:\\Alignment\\experiments\\") - parser.add_argument("exp2", help="Second experiment folder from path S:\\Alignment\\experiments\\") - parser.add_argument("--stats", help="True or False: Output word count and number of renderings for common words", - action='store_true') - parser.add_argument("--src", help="If set, only the source side of the two experiment lexicons is compared", - action='store_true') - parser.add_argument("--trg", help="If set, only the target side of the two experiment lexicons is compared", - action='store_true') - args = parser.parse_args() - - # If not explicitly limited, compare both source and target lexicons - if args.src == False and args.trg == False: - args.src = True - args.trg = True - - lex_path1 = SIL_NLP_ENV.align_experiments_dir / args.exp1 - lex_path2 = SIL_NLP_ENV.align_experiments_dir / args.exp2 - - # Compare source words and write results to files - if args.src == True: - src_file1 = lex_path1 / "src.txt" - src_file2 = lex_path2 / "src.txt" - - # Find all words and unique words on source side - src_words1 = get_all_words(src_file1) - unique_src_words1 = numpy.unique(numpy.array(src_words1)) - 
src_words2 = get_all_words(src_file2) - unique_src_words2 = numpy.unique(numpy.array(src_words2)) - src1_only_words = find_unique(unique_src_words1,unique_src_words2) - src2_only_words = find_unique(unique_src_words2,unique_src_words1) - src1_word_counter = Counter(src_words1).most_common() - src2_word_counter = Counter(src_words2).most_common() - - # Write unique source words to files - src_words_file1 = lex_path1 / "src_words.txt" - src_words_file2 = lex_path2 / "src_words.txt" - with open(src_words_file1, "w", encoding="utf8") as output_file: - for word in unique_src_words1: - output_file.writelines(word+'\n') - with open(src_words_file2, "w", encoding="utf8") as output_file: - for word in unique_src_words2: - output_file.writelines(word+'\n') - - # Re-write src_words files with counts - with (lex_path1 / "src_words.txt").open("w", encoding = "utf8") as output_file: - for entry in src1_word_counter: - output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n') - with (lex_path2 / "src_words.txt").open("w", encoding = "utf8") as output_file: - for entry in src2_word_counter: - output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n') - # Output src1 words missing from src2, with their counts - with (lex_path1 / "missing_word_counts.txt").open("w", encoding = "utf8") as output_file: - for entry in src1_word_counter: - if entry[0] in src1_only_words: - output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n') - - # Write source words missing from the alternate source file - with (lex_path1 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file: - output_file.writelines(f'src.txt words not found in {src_file2}\n') - for word in src1_only_words: - output_file.writelines(word+'\n') - with (lex_path2 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file: - output_file.writelines(f'src.txt words not found in {src_file1}\n') - for word in src2_only_words: - output_file.writelines(word+'\n') - - # Compare target words and write results to files - if args.trg == True: - trg_file1 = lex_path1 / "trg.txt" - trg_file2 = lex_path2 / "trg.txt" - - # Find all words and unique words on target side - trg_words1 = get_all_words(trg_file1) - unique_trg_words1 = numpy.unique(numpy.array(trg_words1)) - trg_words2 = get_all_words(trg_file2) - unique_trg_words2 = numpy.unique(numpy.array(trg_words2)) - trg1_only_words = find_unique(unique_trg_words1,unique_trg_words2) - trg2_only_words = find_unique(unique_trg_words2,unique_trg_words1) - - # Write unique target words to files - trg_words_file1 = lex_path1 / "trg_words.txt" - trg_words_file2 = lex_path2 / "trg_words.txt" - with open(trg_words_file1, "w", encoding="utf8") as output_file: - for word in unique_trg_words1: - output_file.writelines(word+'\n') - with open(trg_words_file2, "w", encoding="utf8") as output_file: - for word in unique_trg_words2: - output_file.writelines(word+'\n') - - # Write target words missing from the alternate target file - with (lex_path1 / "unmatched_trg_words.txt").open("w", encoding="utf8") as output_file: - output_file.writelines(f'trg.txt words not found in {trg_file2}\n') - for word in trg1_only_words: - output_file.writelines(word+'\n') - with (lex_path2 / "unmatched_trg_words.txt").open("w", encoding="utf8") as output_file: - output_file.writelines(f'trg.txt words not found in {trg_file1}\n') - for word in trg2_only_words: - output_file.writelines(word+'\n') - - # Write the lex coverage stats - with (lex_path1 / "lex_coverage.txt").open("a", encoding="utf8") as output_file: - if 
args.src == True: - output_file.writelines(f'Unique words in src.txt: {len(unique_src_words1)}\n') - output_file.writelines( - f'Words also found in {src_words_file2}: {len(unique_src_words1)-len(src1_only_words)}\n') - output_file.writelines(f'Words missing from {src_words_file2}: {len(src1_only_words)}\n') - if args.trg == True: - output_file.writelines(f'Unique words in trg.txt: {len(unique_trg_words1)}\n') - output_file.writelines( - f'Words also found in {trg_words_file2}: {len(unique_trg_words1)-len(trg1_only_words)}\n') - output_file.writelines(f'Words missing from {trg_words_file2}: {len(trg1_only_words)}\n') - - with (lex_path2 / "lex_coverage.txt").open("a", encoding="utf8") as output_file: - if args.src == True: - output_file.writelines(f'Unique words in src.txt: {len(unique_src_words2)}\n') - output_file.writelines( - f'Words also found in {src_words_file1}: {len(unique_src_words2)-len(src2_only_words)}\n') - output_file.writelines(f'Words missing from {src_words_file1}: {len(src2_only_words)}\n') - if args.trg == True: - output_file.writelines(f'Unique words in trg.txt: {len(unique_trg_words2)}\n') - output_file.writelines( - f'Words also found in {trg_words_file1}: {len(unique_trg_words2)-len(trg2_only_words)}\n') - output_file.writelines(f'Words missing from {trg_words_file1}: {len(trg2_only_words)}\n') - - # Output stats if requested - if args.stats == True: - if args.src == True: - print(f'Unique words in src.txt: {len(unique_src_words1)}') - print(f'Words also found in {src_words_file2}: {len(unique_src_words1)-len(src1_only_words)}') - print(f'Words missing from {src_words_file2}: {len(src1_only_words)}') - if args.trg == True: - print(f'Unique words in {trg_words_file1}: {len(unique_trg_words1)}') - print(f'Words also found in {trg_words_file2}: {len(unique_trg_words1)-len(trg1_only_words)}') - print(f'Words missing from {trg_words_file2}: {len(trg1_only_words)}') - - -if __name__ == "__main__": - main() diff --git a/silnlp/common/find_words.py b/silnlp/common/find_words.py index a5cd1a6b..d6de1782 100644 --- a/silnlp/common/find_words.py +++ b/silnlp/common/find_words.py @@ -50,6 +50,7 @@ def main(): word_count = 0 for word in word_list: #if NFD(NFD(word[1])) in NFD(NFD(verse[1])): + #if word[1] in verse[1]: # (to find all instances; not just first) if word[1] in verse[1] and word[1] not in seen_words: for entry in src_word_counts: if entry[0] == word[1]: diff --git a/silnlp/common/find_words_Catapult.py b/silnlp/common/find_words_Catapult.py deleted file mode 100644 index d6de1782..00000000 --- a/silnlp/common/find_words_Catapult.py +++ /dev/null @@ -1,71 +0,0 @@ -import argparse -from collections import Counter -import csv -import unicodedata - -from ..common.environment import SIL_NLP_ENV - -# Normalize combined characters for Devanagari (Ref: https://docs.python.org/3/howto/unicode.html#comparing-strings) -def NFD(s): - return unicodedata.normalize('NFD', s) - -def main(): - parser = argparse.ArgumentParser(description="Counts lexicon entries") - parser.add_argument("experiment", help="Experiment folder from path S:\\Alignment\\experiments\\") - parser.add_argument("--word_list", help="File containing words to find", default="unmatched_src_words.txt") - args = parser.parse_args() - - # Set up path and files - lex_path = SIL_NLP_ENV.align_experiments_dir / args.experiment - word_filename = SIL_NLP_ENV.align_experiments_dir / args.experiment / args.word_list - vref_filename = SIL_NLP_ENV.align_experiments_dir / args.experiment/ "refs.txt" - - # Get count of each word 
in the file - with (lex_path / "src_words.txt").open("r", encoding="utf8") as src_wd_file: - src_word_counts = [] - for entry in src_wd_file: - entry = list(entry.split('\t')) - if len(entry) > 1: - entry[1] = int(entry[1].strip()) - src_word_counts.append(entry) - else: - print("Error: word counts are missing. Please run count_words.py with the --count flag set.") - return 1 - - # Extract list of words - src_word_dict = dict(list(src_word_counts)) - with(word_filename).open("r", encoding = "utf8") as word_file: - words = [] - for word in word_file: - words.append(word.rstrip('\n')) - # Check for words and word count in each verse; write to output file. - with (lex_path / "src.txt").open("r", encoding = "utf8") as src_data_file: - with(vref_filename).open("r", encoding = "utf8") as ref_file: - word_list = list(enumerate(words)) - result = [] - seen_words = [] - for verse in zip(ref_file, src_data_file): - word_text = [] - word_num = [] - word_count = 0 - for word in word_list: - #if NFD(NFD(word[1])) in NFD(NFD(verse[1])): - #if word[1] in verse[1]: # (to find all instances; not just first) - if word[1] in verse[1] and word[1] not in seen_words: - for entry in src_word_counts: - if entry[0] == word[1]: - word_count += entry[1] - seen_words.append(word[1]) - word_text.append(word[1]) - word_num.append(src_word_dict[word[1]]) - result.append([verse[0].rstrip('\n'), word_count, word_num, word_text]) - with (lex_path / "unmatched_word_verses.txt").open("w", encoding = "utf8") as output_file: - writer = csv.writer(output_file, lineterminator="\n") - writer.writerow(['Reference','Novelty Score','Word Counts','Words']) - for line in result: - writer.writerow([line[0], line[1], line[2], *line[3]]) - #print(result) - - -if __name__ == '__main__': - main()
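
A note on how these scripts fit together: count_words.py --count (and the later revisions of compare_lex.py) write src_words.txt as tab-separated word/count pairs, and find_words.py reads that file back to give each verse a "Novelty Score", the summed counts of the listed unmatched words it contains. The sketch below illustrates that round trip in isolation; the file names and sample data are illustrative stand-ins, since the real scripts resolve their paths under SIL_NLP_ENV.align_experiments_dir, and the real find_words.py only credits a word to the first verse in which it appears.

    from collections import Counter
    from pathlib import Path

    # Illustrative stand-ins for the experiment data.
    src_lines = ["in the beginning", "the word was with god"]
    unmatched_words = ["beginning", "god"]

    # Step 1 (count_words.py --count): write word<TAB>count pairs, most common first.
    counts = Counter(word for line in src_lines for word in line.split())
    Path("src_words.txt").write_text(
        "".join(f"{word}\t{n}\n" for word, n in counts.most_common()), encoding="utf8"
    )

    # Step 2 (find_words.py): read the counts back into a dict ...
    word_counts = {}
    for entry in Path("src_words.txt").read_text(encoding="utf8").splitlines():
        word, n = entry.split("\t")
        word_counts[word] = int(n)

    # ... and score each verse by the counts of the listed words it contains.
    for ref, verse in enumerate(src_lines, start=1):
        score = sum(word_counts[w] for w in unmatched_words if w in verse)
        print(ref, score)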