Update lex tools #753

Open · wants to merge 9 commits into master
51 changes: 45 additions & 6 deletions silnlp/common/compare_lex.py
@@ -1,10 +1,26 @@
import argparse
from collections import Counter
import numpy
import re
from typing import List

from ..common.environment import SIL_NLP_ENV

from machine.tokenization import LatinWordTokenizer

# Latin Tokenizer from machine library
#def get_all_words(src_file: str) -> List:
# words = []
# tokenizer = LatinWordTokenizer()
# with open(src_file, "r", encoding = "utf8") as src_data_file:
# for line in src_data_file:
# line_words = tokenizer.tokenize(line)
# for word in line_words:
# word = word.strip().strip("\'\"\\;,:.!?()-[]").lower()
# if word != "" and not word.isnumeric():
# words.append(word)
# return words

# Naive whitespace-based script-agnostic word splitter
def get_all_words(src_file: str) -> List:
words = []
pattern = re.compile(r",(?=\S)") # Look for commas with no following space
@@ -59,6 +75,8 @@ def main() -> None:
unique_src_words2 = numpy.unique(numpy.array(src_words2))
src1_only_words = find_unique(unique_src_words1,unique_src_words2)
src2_only_words = find_unique(unique_src_words2,unique_src_words1)
src1_word_counter = Counter(src_words1).most_common()
src2_word_counter = Counter(src_words2).most_common()

# Write unique source words to files
src_words_file1 = lex_path1 / "src_words.txt"
@@ -70,15 +88,36 @@ def main() -> None:
for word in unique_src_words2:
output_file.writelines(word+'\n')

# Re-write src_words files with counts
with (lex_path1 / "src_words.txt").open("w", encoding = "utf8") as output_file:
for entry in src1_word_counter:
output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n')
with (lex_path2 / "src_words.txt").open("w", encoding = "utf8") as output_file:
for entry in src2_word_counter:
output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n')

# Write source words missing from the alternate source file
#with (lex_path1 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file:
# output_file.writelines(f'src.txt words not found in {src_file2}\n')
# for word in src1_only_words:
# output_file.writelines(word+'\n')
#with (lex_path2 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file:
# output_file.writelines(f'src.txt words not found in {src_file1}\n')
# for word in src2_only_words:
# output_file.writelines(word+'\n')


# Rewrite of above section to include counts in the output file:
with (lex_path1 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file:
output_file.writelines(f'src.txt words not found in {src_file2}\n')
for word in src1_only_words:
output_file.writelines(word+'\n')
for entry in src1_word_counter:
if entry[0] in src1_only_words:
output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n')
with (lex_path2 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file:
output_file.writelines(f'src.txt words not found in {src_file1}\n')
for word in src2_only_words:
output_file.writelines(word+'\n')
for entry in src2_word_counter:
if entry[0] in src2_only_words:
output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n')

# Compare target words and write results to files
if args.trg == True:
@@ -109,7 +148,7 @@ def main() -> None:
for word in trg1_only_words:
output_file.writelines(word+'\n')
with (lex_path2 / "unmatched_trg_words.txt").open("w", encoding="utf8") as output_file:
output_file.writelines(f'src.txt words not found in {trg_file1}\n')
output_file.writelines(f'trg.txt words not found in {trg_file1}\n')
for word in trg2_only_words:
output_file.writelines(word+'\n')

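For context, a minimal sketch of how the naive whitespace-based splitter introduced above might behave end to end; the function body is truncated in this diff, so the punctuation-stripping details below are assumptions based on the visible lines:

import re
from typing import List

def get_all_words_sketch(src_file: str) -> List[str]:
    # Hypothetical reconstruction: insert a space after commas that lack one,
    # split on whitespace, strip surrounding punctuation, and lowercase.
    words: List[str] = []
    pattern = re.compile(r",(?=\S)")  # commas with no following space
    with open(src_file, "r", encoding="utf8") as src_data_file:
        for line in src_data_file:
            line = pattern.sub(", ", line)
            for word in line.split():
                word = word.strip("'\"\\;,:.!?()-[]").lower()
                if word != "" and not word.isnumeric():
                    words.append(word)
    return words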
17 changes: 12 additions & 5 deletions silnlp/common/count_words.py
@@ -15,8 +15,9 @@ def main() -> None:
parser.add_argument("experiment", help="Experiment folder from path S:\\Alignment\\experiments\\")
parser.add_argument("--aligner", help="Aligner: eflomal, fast-align, hmm", default="eflomal")
parser.add_argument("--num", help="Number of most common words to include", type=int, default=100)
parser.add_argument("--stats", help="True or False: Print word count and number of renderings for common words",
parser.add_argument("--stats", help="Print word count and number of renderings for common words",
action='store_true')
parser.add_argument("--count", help="Include count in src word files", action='store_true')
args = parser.parse_args()

# Set up path and lex files
@@ -49,6 +50,8 @@ def main() -> None:
if word != "" and not word.isnumeric():
src_words.append(word)
src_data_word_counter = Counter(src_words).most_common(args.num)
if args.count:
src_word_counter = Counter(src_words).most_common()
unique_src_words = numpy.unique(numpy.array(src_words))

# Pull all the separate words from the target data. Take all unique.
@@ -65,7 +68,7 @@ def main() -> None:
trg_words.append(word)
unique_trg_words = numpy.unique(numpy.array(trg_words))

# Clean lexicon file and prep for pandas csv reader
# Prep lexicon file for pandas csv reader (escape quotes)
with (lex_path / lex_txt_file).open("r", encoding="utf8") as lexicon:
with (lex_path / new_lex_txt_file).open("w", encoding="utf8") as new_lex:
for line in lexicon.readlines():
@@ -111,9 +114,13 @@ def main() -> None:
for src_wd in common_wd:
writer.writerow([src_wd, *common_wd[src_wd]])

with (lex_path / "src_words.txt").open("w", encoding = "utf8") as output_file:
for word in unique_src_words:
output_file.writelines(word + '\n')
with (lex_path / "src_words.txt").open("w", encoding = "utf8") as output_file:
if args.count:
for entry in src_word_counter:
output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n')
else:
for word in unique_src_words:
output_file.writelines(word + '\n')

# Optionally, output a few stats
if args.stats:
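Usage note: the new --count flag writes src_words.txt as one tab-separated word/count pair per line (most common first), which is the format find_words.py parses back. A small self-contained sketch of that round trip; the sample words and file name are purely illustrative:

from collections import Counter

sample_words = ["ni", "yesu", "ni", "bwana", "ni"]  # illustrative data only

# Write: one "word<TAB>count" line per entry, most common first
with open("src_words.txt", "w", encoding="utf8") as out:
    for word, count in Counter(sample_words).most_common():
        out.write(f"{word}\t{count}\n")

# Read back the way find_words.py expects: split each line on the tab
with open("src_words.txt", "r", encoding="utf8") as f:
    counts = {w: int(c) for w, c in (line.rstrip("\n").split("\t") for line in f)}
print(counts)  # {'ni': 3, 'yesu': 1, 'bwana': 1}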
71 changes: 71 additions & 0 deletions silnlp/common/find_words.py
@@ -0,0 +1,71 @@
import argparse
from collections import Counter
import csv
import unicodedata

from ..common.environment import SIL_NLP_ENV

# Normalize combined characters for Devanagari (Ref: https://docs.python.org/3/howto/unicode.html#comparing-strings)
def NFD(s):
return unicodedata.normalize('NFD', s)

def main():
parser = argparse.ArgumentParser(description="Finds word-list words in the source verses and scores each verse")
parser.add_argument("experiment", help="Experiment folder from path S:\\Alignment\\experiments\\")
parser.add_argument("--word_list", help="File containing words to find", default="unmatched_src_words.txt")
args = parser.parse_args()

# Set up path and files
lex_path = SIL_NLP_ENV.align_experiments_dir / args.experiment
word_filename = SIL_NLP_ENV.align_experiments_dir / args.experiment / args.word_list
vref_filename = SIL_NLP_ENV.align_experiments_dir / args.experiment/ "refs.txt"

# Get count of each word in the file
with (lex_path / "src_words.txt").open("r", encoding="utf8") as src_wd_file:
src_word_counts = []
for entry in src_wd_file:
entry = list(entry.split('\t'))
if len(entry) > 1:
entry[1] = int(entry[1].strip())
src_word_counts.append(entry)
else:
print("Error: word counts are missing. Please run count_words.py with the --count flag set.")
return 1

# Extract list of words
src_word_dict = dict(list(src_word_counts))
with(word_filename).open("r", encoding = "utf8") as word_file:
words = []
for word in word_file:
words.append(word.rstrip('\n'))
# Check for words and word count in each verse; write to output file.
with (lex_path / "src.txt").open("r", encoding = "utf8") as src_data_file:
with(vref_filename).open("r", encoding = "utf8") as ref_file:
word_list = list(enumerate(words))
result = []
seen_words = []
for verse in zip(ref_file, src_data_file):
word_text = []
word_num = []
word_count = 0
for word in word_list:
#if NFD(NFD(word[1])) in NFD(NFD(verse[1])):
#if word[1] in verse[1]: # (to find all instances; not just first)
if word[1] in verse[1] and word[1] not in seen_words:
for entry in src_word_counts:
if entry[0] == word[1]:
word_count += entry[1]
seen_words.append(word[1])
word_text.append(word[1])
word_num.append(src_word_dict[word[1]])
result.append([verse[0].rstrip('\n'), word_count, word_num, word_text])
with (lex_path / "unmatched_word_verses.txt").open("w", encoding = "utf8") as output_file:
writer = csv.writer(output_file, lineterminator="\n")
writer.writerow(['Reference','Novelty Score','Word Counts','Words'])
for line in result:
writer.writerow([line[0], line[1], line[2], *line[3]])
#print(result)


if __name__ == '__main__':
main()
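A short illustration of why the NFD helper (and the commented-out NFD(NFD(...)) matching) exists: the same Devanagari letter can be stored either as a single precomposed code point or as a base letter plus a combining nukta, and raw substring checks miss that unless both sides are normalized. The characters below are only an example:

import unicodedata

def NFD(s: str) -> str:
    return unicodedata.normalize("NFD", s)

precomposed = "\u0958"        # DEVANAGARI LETTER QA as one code point
decomposed = "\u0915\u093C"   # DEVANAGARI LETTER KA + combining NUKTA
print(precomposed == decomposed)            # False: raw strings differ
print(NFD(precomposed) == NFD(decomposed))  # True: identical after normalization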