Skip to content

Commit 71ef3e3

Browse files
authored
Merge pull request #905 from varunkatiyar819/varunkatiyar819-patch-1
Updated crfcut.py
2 parents fa0a2ca + d1b64a7 commit 71ef3e3

File tree

1 file changed

+10
-1
lines changed

1 file changed

+10
-1
lines changed

pythainlp/tokenize/crfcut.py

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -199,11 +199,20 @@ def segment(text: str) -> List[str]:
199199
labs = _tagger.tag(feat)
200200
labs[-1] = "E" # make sure it cuts the last sentence
201201

202+
# Force a sentence break after any token that ends with terminal punctuation ("!", "." or "?").
203+
for idx, _ in enumerate(toks):
204+
if toks[idx].strip().endswith(("!", ".", "?")):
205+
labs[idx] = "E"
206+
# A whitespace-only or empty token at the start of a sentence (first token, or right after an "E") is relabelled "I" so it cannot end a sentence on its own.
207+
elif (idx == 0 or labs[idx-1] == "E") and toks[idx].strip() == "":
208+
labs[idx] = "I"
209+
202210
sentences = []
203211
sentence = ""
204212
for i, w in enumerate(toks):
205213
sentence = sentence + w
206-
if labs[i] == "E":
214+
# Empty strings should not be part of output.
215+
if labs[i] == "E" and sentence != "":
207216
sentences.append(sentence)
208217
sentence = ""
209218

0 commit comments

Comments
 (0)