diff --git a/silnlp/common/compare_lex.py b/silnlp/common/compare_lex.py
index ffa2f8a9..fb14c90e 100644
--- a/silnlp/common/compare_lex.py
+++ b/silnlp/common/compare_lex.py
@@ -1,10 +1,26 @@
 import argparse
+from collections import Counter
 import numpy
 import re
 from typing import List
 
 from ..common.environment import SIL_NLP_ENV
-
+from machine.tokenization import LatinWordTokenizer
+
+# Latin Tokenizer from machine library
+#def get_all_words(src_file: str) -> List:
+#    words = []
+#    tokenizer = LatinWordTokenizer()
+#    with open(src_file, "r", encoding = "utf8") as src_data_file:
+#        for line in src_data_file:
+#            line_words = tokenizer.tokenize(line)
+#            for word in line_words:
+#                word = word.strip().strip("\'\"\\;,:.!?()-[]").lower()
+#                if word != "" and not word.isnumeric():
+#                    words.append(word)
+#    return words
+
+
+# Naive whitespace-based script-agnostic word splitter
 def get_all_words(src_file: str) -> List:
     words = []
     pattern = re.compile(r",(?=\S)")  # Look for commas with no following space
@@ -59,6 +75,8 @@ def main() -> None:
     unique_src_words2 = numpy.unique(numpy.array(src_words2))
     src1_only_words = find_unique(unique_src_words1,unique_src_words2)
     src2_only_words = find_unique(unique_src_words2,unique_src_words1)
+    src1_word_counter = Counter(src_words1).most_common()
+    src2_word_counter = Counter(src_words2).most_common()
 
     # Write unique source words to files
     src_words_file1 = lex_path1 / "src_words.txt"
@@ -70,15 +88,36 @@ def main() -> None:
         for word in unique_src_words2:
             output_file.writelines(word+'\n')
 
+    # Re-write src_words files with counts
+    with (lex_path1 / "src_words.txt").open("w", encoding = "utf8") as output_file:
+        for entry in src1_word_counter:
+            output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n')
+    with (lex_path2 / "src_words.txt").open("w", encoding = "utf8") as output_file:
+        for entry in src2_word_counter:
+            output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n')
+
     # Write source words missing from the alternate source file
+    #with (lex_path1 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file:
+    #    output_file.writelines(f'src.txt words not found in {src_file2}\n')
+    #    for word in src1_only_words:
+    #        output_file.writelines(word+'\n')
+    #with (lex_path2 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file:
+    #    output_file.writelines(f'src.txt words not found in {src_file1}\n')
+    #    for word in src2_only_words:
+    #        output_file.writelines(word+'\n')
+
+
+    # Rewrite of above section to include counts in the output file:
     with (lex_path1 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file:
         output_file.writelines(f'src.txt words not found in {src_file2}\n')
-        for word in src1_only_words:
-            output_file.writelines(word+'\n')
+        for entry in src1_word_counter:
+            if entry[0] in src1_only_words:
+                output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n')
     with (lex_path2 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file:
         output_file.writelines(f'src.txt words not found in {src_file1}\n')
-        for word in src2_only_words:
-            output_file.writelines(word+'\n')
+        for entry in src2_word_counter:
+            if entry[0] in src2_only_words:
+                output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n')
 
     # Compare target words and write results to files
     if args.trg == True:
@@ -109,7 +148,7 @@ def main() -> None:
             for word in trg1_only_words:
                 output_file.writelines(word+'\n')
         with (lex_path2 / "unmatched_trg_words.txt").open("w", encoding="utf8") as output_file:
-            output_file.writelines(f'src.txt words not found in {trg_file1}\n')
+            output_file.writelines(f'trg.txt words not found in {trg_file1}\n')
             for word in trg2_only_words:
                 output_file.writelines(word+'\n')
 
diff --git a/silnlp/common/count_words.py b/silnlp/common/count_words.py
index 6fd83204..f2fd2010 100644
--- a/silnlp/common/count_words.py
+++ b/silnlp/common/count_words.py
@@ -15,8 +15,9 @@ def main() -> None:
     parser.add_argument("experiment", help="Experiment folder from path S:\\Alignment\\experiments\\")
     parser.add_argument("--aligner", help="Aligner: eflomal, fast-align, hmm", default="eflomal")
     parser.add_argument("--num", help="Number of most common words to include", type=int, default=100)
-    parser.add_argument("--stats", help="True or False: Print word count and number of renderings for common words",
+    parser.add_argument("--stats", help="Print word count and number of renderings for common words",
                         action='store_true')
+    parser.add_argument("--count", help="Include count in src word files", action='store_true')
     args = parser.parse_args()
 
     # Set up path and lex files
@@ -49,6 +50,8 @@ def main() -> None:
             if word != "" and not word.isnumeric():
                 src_words.append(word)
     src_data_word_counter = Counter(src_words).most_common(args.num)
+    if args.count:
+        src_word_counter = Counter(src_words).most_common()
     unique_src_words = numpy.unique(numpy.array(src_words))
 
     # Pull all the separate words from the target data. Take all unique.
@@ -65,7 +68,7 @@ def main() -> None:
                 trg_words.append(word)
     unique_trg_words = numpy.unique(numpy.array(trg_words))
 
-    # Clean lexicon file and prep for pandas csv reader
+    # Prep lexicon file for pandas csv reader (escape quotes)
     with (lex_path / lex_txt_file).open("r", encoding="utf8") as lexicon:
         with (lex_path / new_lex_txt_file).open("w", encoding="utf8") as new_lex:
             for line in lexicon.readlines():
@@ -111,9 +114,13 @@ def main() -> None:
         for src_wd in common_wd:
             writer.writerow([src_wd, *common_wd[src_wd]])
 
-    with (lex_path / "src_words.txt").open("w", encoding = "utf8") as output_file:
-        for word in unique_src_words:
-            output_file.writelines(word + '\n')
+    with (lex_path / "src_words.txt").open("w", encoding = "utf8") as output_file:
+        if args.count:
+            for entry in src_word_counter:
+                output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n')
+        else:
+            for word in unique_src_words:
+                output_file.writelines(word + '\n')
 
     # Optionally, output a few stats
     if args.stats:
diff --git a/silnlp/common/find_words.py b/silnlp/common/find_words.py
new file mode 100644
index 00000000..d6de1782
--- /dev/null
+++ b/silnlp/common/find_words.py
@@ -0,0 +1,74 @@
+import argparse
+import csv
+import unicodedata
+
+from ..common.environment import SIL_NLP_ENV
+
+# Normalize combined characters for Devanagari (Ref: https://docs.python.org/3/howto/unicode.html#comparing-strings)
+def NFD(s):
+    return unicodedata.normalize('NFD', s)
+
+def main():
+    parser = argparse.ArgumentParser(description="Finds verses containing words from a word list")
+    parser.add_argument("experiment", help="Experiment folder from path S:\\Alignment\\experiments\\")
+    parser.add_argument("--word_list", help="File containing words to find", default="unmatched_src_words.txt")
+    args = parser.parse_args()
+
+    # Set up path and files
+    lex_path = SIL_NLP_ENV.align_experiments_dir / args.experiment
+    word_filename = SIL_NLP_ENV.align_experiments_dir / args.experiment / args.word_list
+    vref_filename = SIL_NLP_ENV.align_experiments_dir / args.experiment / "refs.txt"
+
+    # Get the count of each word from src_words.txt
+    with (lex_path / "src_words.txt").open("r", encoding="utf8") as src_wd_file:
+        src_word_counts = []
+        for entry in src_wd_file:
+            entry = entry.split('\t')
+            if len(entry) > 1:
+                entry[1] = int(entry[1].strip())
+                src_word_counts.append(entry)
+            else:
+                print("Error: word counts are missing. Please run count_words.py with the --count flag set.")
+                return 1
+
+    # Extract the list of words to find; the default word list is written by compare_lex.py
+    # with a header line and word<TAB>count entries, so skip the header and keep only the word
+    src_word_dict = dict(src_word_counts)
+    with word_filename.open("r", encoding = "utf8") as word_file:
+        words = []
+        next(word_file, None)  # skip the header line
+        for word in word_file:
+            words.append(word.rstrip('\n').split('\t')[0])
+
+    # Check for words and word count in each verse; write to output file.
+    with (lex_path / "src.txt").open("r", encoding = "utf8") as src_data_file:
+        with vref_filename.open("r", encoding = "utf8") as ref_file:
+            word_list = list(enumerate(words))
+            result = []
+            seen_words = []
+            for verse in zip(ref_file, src_data_file):
+                word_text = []
+                word_num = []
+                word_count = 0
+                for word in word_list:
+                    #if NFD(NFD(word[1])) in NFD(NFD(verse[1])):
+                    #if word[1] in verse[1]: # (to find all instances; not just first)
+                    if word[1] in verse[1] and word[1] not in seen_words:
+                        for entry in src_word_counts:
+                            if entry[0] == word[1]:
+                                word_count += entry[1]
+                        seen_words.append(word[1])
+                        word_text.append(word[1])
+                        word_num.append(src_word_dict[word[1]])
+                # Novelty score = total count of word-list words seen for the first time in this verse
+                result.append([verse[0].rstrip('\n'), word_count, word_num, word_text])
+    with (lex_path / "unmatched_word_verses.txt").open("w", encoding = "utf8") as output_file:
+        writer = csv.writer(output_file, lineterminator="\n")
+        writer.writerow(['Reference','Novelty Score','Word Counts','Words'])
+        for line in result:
+            writer.writerow([line[0], line[1], line[2], *line[3]])
+    #print(result)
+
+
+if __name__ == '__main__':
+    main()
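
Note on the new count format: these changes establish a small tab-separated interface between the scripts. count_words.py with --count (and the rewritten section of compare_lex.py) writes one word<TAB>count line per entry, most frequent first, and find_words.py parses those lines back into a word-to-count dict. Below is a minimal round-trip sketch of that format; the token list and the local file name are made up for illustration, nothing here comes from the repo.

    from collections import Counter
    from pathlib import Path

    # Made-up tokens standing in for the words pulled out of src.txt.
    src_words = ["raja", "raja", "nagar", "raja", "phal"]

    # Write one "word<TAB>count" line per entry, most frequent first,
    # mirroring what count_words.py emits when --count is set.
    counts_file = Path("src_words.txt")
    with counts_file.open("w", encoding="utf8") as f:
        for word, count in Counter(src_words).most_common():
            f.write(f"{word}\t{count}\n")

    # Read the lines back and rebuild the word -> count mapping,
    # the same shape find_words.py builds as src_word_dict.
    src_word_dict = {}
    with counts_file.open("r", encoding="utf8") as f:
        for line in f:
            word, _, count = line.rstrip("\n").partition("\t")
            if count:  # no tab/count means the file was written without --count
                src_word_dict[word] = int(count)

    assert src_word_dict == {"raja": 3, "nagar": 1, "phal": 1}

Assuming the scripts are run as modules in the usual silnlp style, the intended flow appears to be python -m silnlp.common.count_words <experiment> --count followed by python -m silnlp.common.find_words <experiment>, with compare_lex.py producing the unmatched_src_words.txt list that find_words.py consumes by default.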