Update lex tools #753

Open · wants to merge 9 commits into master
51 changes: 45 additions & 6 deletions silnlp/common/compare_lex.py
@@ -1,10 +1,26 @@
import argparse
from collections import Counter
import numpy
import re
from typing import List

from ..common.environment import SIL_NLP_ENV

from machine.tokenization import LatinWordTokenizer

# Latin Tokenizer from machine library
#def get_all_words(src_file: str) -> List:
# words = []
# tokenizer = LatinWordTokenizer()
# with open(src_file, "r", encoding = "utf8") as src_data_file:
# for line in src_data_file:
# line_words = tokenizer.tokenize(line)
# for word in line_words:
# word = word.strip().strip("\'\"\\;,:.!?()-[]").lower()
# if word != "" and not word.isnumeric():
# words.append(word)
# return words

# Naive whitespace-based script-agnostic word splitter
def get_all_words(src_file: str) -> List:
words = []
pattern = re.compile(r",(?=\S)") # Look for commas with no following space
@@ -59,6 +75,8 @@ def main() -> None:
unique_src_words2 = numpy.unique(numpy.array(src_words2))
src1_only_words = find_unique(unique_src_words1,unique_src_words2)
src2_only_words = find_unique(unique_src_words2,unique_src_words1)
src1_word_counter = Counter(src_words1).most_common()
src2_word_counter = Counter(src_words2).most_common()

# Write unique source words to files
src_words_file1 = lex_path1 / "src_words.txt"
@@ -70,15 +88,36 @@ def main() -> None:
for word in unique_src_words2:
output_file.writelines(word+'\n')

# Re-write src_words files with counts
with (lex_path1 / "src_words.txt").open("w", encoding = "utf8") as output_file:
for entry in src1_word_counter:
output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n')
with (lex_path2 / "src_words.txt").open("w", encoding = "utf8") as output_file:
for entry in src2_word_counter:
output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n')

# Write source words missing from the alternate source file
#with (lex_path1 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file:
# output_file.writelines(f'src.txt words not found in {src_file2}\n')
# for word in src1_only_words:
# output_file.writelines(word+'\n')
#with (lex_path2 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file:
# output_file.writelines(f'src.txt words not found in {src_file1}\n')
# for word in src2_only_words:
# output_file.writelines(word+'\n')


# Rewrite of above section to include counts in the output file:
with (lex_path1 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file:
output_file.writelines(f'src.txt words not found in {src_file2}\n')
for word in src1_only_words:
output_file.writelines(word+'\n')
for entry in src1_word_counter:
if entry[0] in src1_only_words:
output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n')
with (lex_path2 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file:
output_file.writelines(f'src.txt words not found in {src_file1}\n')
for word in src2_only_words:
output_file.writelines(word+'\n')
for entry in src2_word_counter:
if entry[0] in src2_only_words:
output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n')

# Compare target words and write results to files
if args.trg == True:
@@ -109,7 +148,7 @@ def main() -> None:
for word in trg1_only_words:
output_file.writelines(word+'\n')
with (lex_path2 / "unmatched_trg_words.txt").open("w", encoding="utf8") as output_file:
output_file.writelines(f'src.txt words not found in {trg_file1}\n')
output_file.writelines(f'trg.txt words not found in {trg_file1}\n')
for word in trg2_only_words:
output_file.writelines(word+'\n')

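For context, a minimal sketch of how the naive whitespace-based splitter introduced above might behave end to end; the function body is truncated in this diff, so the punctuation-stripping details below are assumptions based on the visible lines:

import re
from typing import List

def get_all_words_sketch(src_file: str) -> List[str]:
    # Hypothetical reconstruction: insert a space after commas that lack one,
    # split on whitespace, strip surrounding punctuation, and lowercase.
    words: List[str] = []
    pattern = re.compile(r",(?=\S)")  # commas with no following space
    with open(src_file, "r", encoding="utf8") as src_data_file:
        for line in src_data_file:
            line = pattern.sub(", ", line)
            for word in line.split():
                word = word.strip("'\"\\;,:.!?()-[]").lower()
                if word != "" and not word.isnumeric():
                    words.append(word)
    return words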
17 changes: 12 additions & 5 deletions silnlp/common/count_words.py
@@ -15,8 +15,9 @@ def main() -> None:
parser.add_argument("experiment", help="Experiment folder from path S:\\Alignment\\experiments\\")
parser.add_argument("--aligner", help="Aligner: eflomal, fast-align, hmm", default="eflomal")
parser.add_argument("--num", help="Number of most common words to include", type=int, default=100)
parser.add_argument("--stats", help="True or False: Print word count and number of renderings for common words",
parser.add_argument("--stats", help="Print word count and number of renderings for common words",
action='store_true')
parser.add_argument("--count", help="Include count in src word files", action='store_true')
args = parser.parse_args()

# Set up path and lex files
@@ -49,6 +50,8 @@ def main() -> None:
if word != "" and not word.isnumeric():
src_words.append(word)
src_data_word_counter = Counter(src_words).most_common(args.num)
if args.count:
src_word_counter = Counter(src_words).most_common()
unique_src_words = numpy.unique(numpy.array(src_words))

# Pull all the separate words from the target data. Take all unique.
@@ -65,7 +68,7 @@ def main() -> None:
trg_words.append(word)
unique_trg_words = numpy.unique(numpy.array(trg_words))

# Clean lexicon file and prep for pandas csv reader
# Prep lexicon file for pandas csv reader (escape quotes)
with (lex_path / lex_txt_file).open("r", encoding="utf8") as lexicon:
with (lex_path / new_lex_txt_file).open("w", encoding="utf8") as new_lex:
for line in lexicon.readlines():
@@ -111,9 +114,13 @@ def main() -> None:
for src_wd in common_wd:
writer.writerow([src_wd, *common_wd[src_wd]])

with (lex_path / "src_words.txt").open("w", encoding = "utf8") as output_file:
for word in unique_src_words:
output_file.writelines(word + '\n')
with (lex_path / "src_words.txt").open("w", encoding = "utf8") as output_file:
if args.count:
for entry in src_word_counter:
output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n')
else:
for word in unique_src_words:
output_file.writelines(word + '\n')

# Optionally, output a few stats
if args.stats:
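Usage note: the new --count flag writes src_words.txt as one tab-separated word/count pair per line (most common first), which is the format find_words.py parses back. A small self-contained sketch of that round trip; the sample words and file name are purely illustrative:

from collections import Counter

sample_words = ["ni", "yesu", "ni", "bwana", "ni"]  # illustrative data only

# Write: one "word<TAB>count" line per entry, most common first
with open("src_words.txt", "w", encoding="utf8") as out:
    for word, count in Counter(sample_words).most_common():
        out.write(f"{word}\t{count}\n")

# Read back the way find_words.py expects: split each line on the tab
with open("src_words.txt", "r", encoding="utf8") as f:
    counts = {w: int(c) for w, c in (line.rstrip("\n").split("\t") for line in f)}
print(counts)  # {'ni': 3, 'yesu': 1, 'bwana': 1}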
71 changes: 71 additions & 0 deletions silnlp/common/find_words.py
@@ -0,0 +1,71 @@
import argparse
from collections import Counter
import csv
import unicodedata

from ..common.environment import SIL_NLP_ENV

# Normalize combined characters for Devanagari (Ref: https://docs.python.org/3/howto/unicode.html#comparing-strings)
def NFD(s):
return unicodedata.normalize('NFD', s)

def main():
parser = argparse.ArgumentParser(description="Finds word-list words in the source verses and scores each verse")
parser.add_argument("experiment", help="Experiment folder from path S:\\Alignment\\experiments\\")
parser.add_argument("--word_list", help="File containing words to find", default="unmatched_src_words.txt")
args = parser.parse_args()

# Set up path and files
lex_path = SIL_NLP_ENV.align_experiments_dir / args.experiment
word_filename = SIL_NLP_ENV.align_experiments_dir / args.experiment / args.word_list
vref_filename = SIL_NLP_ENV.align_experiments_dir / args.experiment/ "refs.txt"

# Get count of each word in the file
with (lex_path / "src_words.txt").open("r", encoding="utf8") as src_wd_file:
src_word_counts = []
for entry in src_wd_file:
entry = list(entry.split('\t'))
if len(entry) > 1:
entry[1] = int(entry[1].strip())
src_word_counts.append(entry)
else:
print("Error: word counts are missing. Please run count_words.py with the --count flag set.")
return 1

# Extract list of words
src_word_dict = dict(list(src_word_counts))
with(word_filename).open("r", encoding = "utf8") as word_file:
words = []
for word in word_file:
words.append(word.rstrip('\n'))
# Check for words and word count in each verse; write to output file.
with (lex_path / "src.txt").open("r", encoding = "utf8") as src_data_file:
with(vref_filename).open("r", encoding = "utf8") as ref_file:
word_list = list(enumerate(words))
result = []
seen_words = []
for verse in zip(ref_file, src_data_file):
word_text = []
word_num = []
word_count = 0
for word in word_list:
#if NFD(NFD(word[1])) in NFD(NFD(verse[1])):
#if word[1] in verse[1]: # (to find all instances; not just first)
if word[1] in verse[1] and word[1] not in seen_words:
for entry in src_word_counts:
if entry[0] == word[1]:
word_count += entry[1]
seen_words.append(word[1])
word_text.append(word[1])
word_num.append(src_word_dict[word[1]])
result.append([verse[0].rstrip('\n'), word_count, word_num, word_text])
with (lex_path / "unmatched_word_verses.txt").open("w", encoding = "utf8") as output_file:
writer = csv.writer(output_file, lineterminator="\n")
writer.writerow(['Reference','Novelty Score','Word Counts','Words'])
for line in result:
writer.writerow([line[0], line[1], line[2], *line[3]])
#print(result)


if __name__ == '__main__':
main()
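A short illustration of why the NFD helper (and the commented-out NFD(NFD(...)) matching) exists: the same Devanagari letter can be stored either as a single precomposed code point or as a base letter plus a combining nukta, and raw substring checks miss that unless both sides are normalized. The characters below are only an example:

import unicodedata

def NFD(s: str) -> str:
    return unicodedata.normalize("NFD", s)

precomposed = "\u0958"        # DEVANAGARI LETTER QA as one code point
decomposed = "\u0915\u093C"   # DEVANAGARI LETTER KA + combining NUKTA
print(precomposed == decomposed)            # False: raw strings differ
print(NFD(precomposed) == NFD(decomposed))  # True: identical after normalization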