Skip to content

Commit 16cc34e

Browse files
committed
feat(nori): add metadata support to Korean tokenizer
1 parent dbaf97f commit 16cc34e

File tree

12 files changed

+168
-7
lines changed

12 files changed

+168
-7
lines changed

lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/DecompoundToken.java

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
/** A token that was generated from a compound. */
2323
public class DecompoundToken extends Token {
2424
private final POS.Tag posTag;
25+
private final String metadata;
2526

2627
/**
2728
* Creates a new DecompoundToken
@@ -31,11 +32,18 @@ public class DecompoundToken extends Token {
3132
* @param startOffset The start offset of the token in the analyzed text.
3233
* @param endOffset The end offset of the token in the analyzed text.
3334
* @param type The type of this token.
35+
* @param metadata The metadata of this token.
3436
*/
3537
public DecompoundToken(
    POS.Tag posTag,
    String surfaceForm,
    int startOffset,
    int endOffset,
    TokenType type,
    String metadata) {
  // The superclass receives a fresh char[] copy of the surface form and a window
  // (offset 0, full length) that covers the whole string.
  super(surfaceForm.toCharArray(), 0, surfaceForm.length(), startOffset, endOffset, type);
  // Both fields are final; they are stored exactly as supplied (metadata may be null).
  this.metadata = metadata;
  this.posTag = posTag;
}
4048

4149
@Override
@@ -77,4 +85,9 @@ public String getReading() {
7785
public KoMorphData.Morpheme[] getMorphemes() {
7886
return null;
7987
}
88+
89+
@Override
90+
public String getMetadata() {
91+
return metadata;
92+
}
8093
}

lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/DictionaryToken.java

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
public class DictionaryToken extends Token {
2424
private final int wordId;
2525
private final KoMorphData morphAtts;
26+
private String metadata = null;
2627

2728
public DictionaryToken(
2829
TokenType type,
@@ -108,4 +109,21 @@ public String getReading() {
108109
public KoMorphData.Morpheme[] getMorphemes() {
109110
return morphAtts.getMorphemes(wordId, getSurfaceForm(), getOffset(), getLength());
110111
}
112+
113+
@Override
114+
public String getMetadata() {
115+
return this.metadata;
116+
}
117+
118+
public void setMetadata(String metadata) {
119+
this.metadata = metadata;
120+
}
121+
122+
public int getWordId() {
123+
return wordId;
124+
}
125+
126+
public KoMorphData getMorphAtts() {
127+
return morphAtts;
128+
}
111129
}

lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import org.apache.lucene.analysis.ko.dict.TokenInfoFST;
2626
import org.apache.lucene.analysis.ko.dict.UnknownDictionary;
2727
import org.apache.lucene.analysis.ko.dict.UserDictionary;
28+
import org.apache.lucene.analysis.ko.tokenattributes.MetadataAttribute;
2829
import org.apache.lucene.analysis.ko.tokenattributes.PartOfSpeechAttribute;
2930
import org.apache.lucene.analysis.ko.tokenattributes.ReadingAttribute;
3031
import org.apache.lucene.analysis.morph.GraphvizFormatter;
@@ -77,6 +78,7 @@ public enum DecompoundMode {
7778
private final Viterbi viterbi;
7879

7980
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
81+
private final MetadataAttribute metadataAtt = addAttribute(MetadataAttribute.class);
8082
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
8183
private final PositionIncrementAttribute posIncAtt =
8284
addAttribute(PositionIncrementAttribute.class);
@@ -233,6 +235,7 @@ public boolean incrementToken() throws IOException {
233235
// System.out.println("off=" + token.getOffset() + " len=" + length + " vs " +
234236
// token.getSurfaceForm().length);
235237
termAtt.copyBuffer(token.getSurfaceForm(), token.getOffset(), length);
238+
metadataAtt.setToken(token);
236239
offsetAtt.setOffset(correctOffset(token.getStartOffset()), correctOffset(token.getEndOffset()));
237240
posAtt.setToken(token);
238241
readingAtt.setToken(token);

lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/Token.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,4 +44,6 @@ protected Token(
4444
* token.
4545
*/
4646
public abstract KoMorphData.Morpheme[] getMorphemes();
47+
48+
/**
 * Returns the metadata string associated with this token.
 *
 * @return the metadata, or {@code null} when the token carries none
 */
public abstract String getMetadata();
4749
}

lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/Viterbi.java

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,13 @@
1818

1919
import java.io.IOException;
2020
import java.util.EnumMap;
21+
2122
import org.apache.lucene.analysis.ko.dict.CharacterDefinition;
2223
import org.apache.lucene.analysis.ko.dict.KoMorphData;
2324
import org.apache.lucene.analysis.ko.dict.TokenInfoDictionary;
2425
import org.apache.lucene.analysis.ko.dict.UnknownDictionary;
2526
import org.apache.lucene.analysis.ko.dict.UserDictionary;
27+
import org.apache.lucene.analysis.ko.dict.UserMorphData;
2628
import org.apache.lucene.analysis.morph.ConnectionCosts;
2729
import org.apache.lucene.analysis.morph.Dictionary;
2830
import org.apache.lucene.analysis.morph.GraphvizFormatter;
@@ -248,6 +250,10 @@ protected void backtrace(Position endPosData, int fromIDX) {
248250
if (token.getPOSType() == POS.Type.MORPHEME
249251
|| mode == KoreanTokenizer.DecompoundMode.NONE) {
250252
if (shouldFilterToken(token) == false) {
253+
if (token.getMorphAtts() instanceof UserMorphData userMorphData) {
254+
final String metadata = userMorphData.metadatas[token.getWordId()];
255+
token.setMetadata(metadata);
256+
}
251257
pending.add(token);
252258
if (VERBOSE) {
253259
System.out.println(" add token=" + pending.get(pending.size() - 1));
@@ -264,9 +270,11 @@ protected void backtrace(Position endPosData, int fromIDX) {
264270
int endOffset = backWordPos + length;
265271
int posLen = 0;
266272
// decompose the compound
273+
String metadata = null;
267274
for (int i = morphemes.length - 1; i >= 0; i--) {
268275
final KoMorphData.Morpheme morpheme = morphemes[i];
269276
final Token compoundToken;
277+
metadata = morpheme.metadata();
270278
if (token.getPOSType() == POS.Type.COMPOUND) {
271279
assert endOffset - morpheme.surfaceForm().length() >= 0;
272280
compoundToken =
@@ -275,15 +283,17 @@ protected void backtrace(Position endPosData, int fromIDX) {
275283
morpheme.surfaceForm(),
276284
endOffset - morpheme.surfaceForm().length(),
277285
endOffset,
278-
backType);
286+
backType,
287+
metadata);
279288
} else {
280289
compoundToken =
281290
new DecompoundToken(
282291
morpheme.posTag(),
283292
morpheme.surfaceForm(),
284293
token.getStartOffset(),
285294
token.getEndOffset(),
286-
backType);
295+
backType,
296+
metadata);
287297
}
288298
if (i == 0 && mode == KoreanTokenizer.DecompoundMode.MIXED) {
289299
compoundToken.setPositionIncrement(0);
@@ -297,6 +307,7 @@ protected void backtrace(Position endPosData, int fromIDX) {
297307
}
298308
if (mode == KoreanTokenizer.DecompoundMode.MIXED) {
299309
token.setPositionLength(Math.max(1, posLen));
310+
token.setMetadata(metadata);
300311
pending.add(token);
301312
if (VERBOSE) {
302313
System.out.println(" add token=" + pending.get(pending.size() - 1));

lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/KoMorphData.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,11 @@
2222
/** Represents Korean morphological information. */
2323
public interface KoMorphData extends MorphData {
2424
/** A morpheme extracted from a compound token. */
25-
record Morpheme(POS.Tag posTag, String surfaceForm) {}
25+
record Morpheme(POS.Tag posTag, String surfaceForm, String metadata) {
26+
public Morpheme(POS.Tag posTag, String surfaceForm) {
27+
this(posTag, surfaceForm, null);
28+
}
29+
}
2630

2731
/**
2832
* Get the {@link org.apache.lucene.analysis.ko.POS.Type} of specified word (morpheme, compound,

lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserDictionary.java

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
* (세종시 세종 시).
3535
*/
3636
public final class UserDictionary implements Dictionary<UserMorphData> {
37+
public static final String METADATA_SEPARATOR = " >> ";
3738
// text -> wordID
3839
private final TokenInfoFST fst;
3940

@@ -82,10 +83,19 @@ private UserDictionary(List<String> entries) throws IOException {
8283

8384
String lastToken = null;
8485
List<int[]> _segmentations = new ArrayList<>(entries.size());
86+
List<String> _metadatas = new ArrayList<>(entries.size());
8587
short[] rightIds = new short[entries.size()];
8688
long ord = 0;
8789
int entryIndex = 0;
8890
for (String entry : entries) {
91+
if (entry.contains(METADATA_SEPARATOR)) {
92+
var split = entry.split(METADATA_SEPARATOR);
93+
entry = split[0];
94+
var metadata = split[1];
95+
_metadatas.add(metadata);
96+
} else {
97+
_metadatas.add(null);
98+
}
8999
String[] splits = entry.split("\\s+");
90100
String token = splits[0];
91101
if (token.equals(lastToken)) {
@@ -138,7 +148,8 @@ private UserDictionary(List<String> entries) throws IOException {
138148
this.fst =
139149
new TokenInfoFST(FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader()));
140150
int[][] segmentations = _segmentations.toArray(new int[_segmentations.size()][]);
141-
this.morphAtts = new UserMorphData(segmentations, rightIds);
151+
String[] metadatas = _metadatas.toArray(String[]::new);
152+
this.morphAtts = new UserMorphData(segmentations, rightIds, metadatas);
142153
}
143154

144155
public TokenInfoFST getFST() {

lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserMorphData.java

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
import org.apache.lucene.analysis.ko.POS;
2020

2121
/** Morphological information for user dictionary. */
22-
final class UserMorphData implements KoMorphData {
22+
public final class UserMorphData implements KoMorphData {
2323
private static final int WORD_COST = -100000;
2424

2525
// NNG left
@@ -28,12 +28,18 @@ final class UserMorphData implements KoMorphData {
2828
// length, length... indexed by compound ID or null for simple noun
2929
private final int[][] segmentations;
3030
private final short[] rightIds;
31+
public String[] metadatas;
3132

3233
/**
 * Creates morphological data without metadata: every entry's metadata is {@code null}.
 *
 * @param segmentations segment lengths per entry, or a {@code null} element for a simple noun
 * @param rightIds right ids supplied by the user dictionary
 */
UserMorphData(int[][] segmentations, short[] rightIds) {
  // Use a null-filled array rather than a null field: readers of metadatas[morphId]
  // (such as getMorphemes) index the array unconditionally and would otherwise throw.
  this(segmentations, rightIds, new String[segmentations.length]);
}

/**
 * Creates morphological data with per-entry metadata.
 *
 * @param segmentations segment lengths per entry, or a {@code null} element for a simple noun
 * @param rightIds right ids supplied by the user dictionary
 * @param metadatas metadata strings looked up by morph id; elements may be {@code null}
 */
UserMorphData(int[][] segmentations, short[] rightIds, String[] metadatas) {
  this.segmentations = segmentations;
  this.rightIds = rightIds;
  this.metadatas = metadatas;
}
42+
3743
@Override
3844
public int getLeftId(int morphId) {
3945
return LEFT_ID;
@@ -79,10 +85,12 @@ public Morpheme[] getMorphemes(int morphId, char[] surfaceForm, int off, int len
7985
if (segs == null) {
8086
return null;
8187
}
88+
String metadata = metadatas[morphId];
8289
int offset = 0;
8390
Morpheme[] morphemes = new Morpheme[segs.length];
8491
for (int i = 0; i < segs.length; i++) {
85-
morphemes[i] = new Morpheme(POS.Tag.NNG, new String(surfaceForm, off + offset, segs[i]));
92+
morphemes[i] =
93+
new Morpheme(POS.Tag.NNG, new String(surfaceForm, off + offset, segs[i]), metadata);
8694
offset += segs[i];
8795
}
8896
return morphemes;
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
package org.apache.lucene.analysis.ko.tokenattributes;
2+
3+
import org.apache.lucene.analysis.ko.Token;
4+
import org.apache.lucene.util.Attribute;
5+
6+
/**
7+
* Attribute for Korean token metadata.
8+
*
9+
* <p>This attribute provides access to additional metadata associated with Korean tokens,
10+
* particularly from user dictionaries and compound word morphemes.
11+
*
12+
* <p>Note: in some cases this value may not be applicable, and will be null.
13+
*
14+
* @lucene.experimental
15+
*/
16+
public interface MetadataAttribute extends Attribute {
17+
/** Get the metadata string of the token. */
18+
String getMetadata();
19+
20+
/** Set the current token. */
21+
void setToken(Token token);
22+
}
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
package org.apache.lucene.analysis.ko.tokenattributes;
2+
3+
import org.apache.lucene.analysis.ko.Token;
4+
import org.apache.lucene.util.AttributeImpl;
5+
import org.apache.lucene.util.AttributeReflector;
6+
7+
/**
8+
* Attribute implementation for Korean token metadata.
9+
*
10+
* @lucene.experimental
11+
*/
12+
public class MetadataAttributeImpl extends AttributeImpl implements MetadataAttribute {
13+
private Token token;
14+
15+
@Override
16+
public String getMetadata() {
17+
if (this.token != null) {
18+
return this.token.getMetadata();
19+
}
20+
return null;
21+
}
22+
23+
@Override
24+
public void setToken(Token token) {
25+
this.token = token;
26+
}
27+
28+
@Override
29+
public void clear() {
30+
this.token = null;
31+
}
32+
33+
@Override
34+
public void copyTo(AttributeImpl target) {
35+
final MetadataAttribute t = (MetadataAttribute) target;
36+
t.setToken(this.token);
37+
}
38+
39+
@Override
40+
public void reflectWith(AttributeReflector reflector) {
41+
reflector.reflect(MetadataAttribute.class, "metadata", getMetadata());
42+
}
43+
}

0 commit comments

Comments
 (0)