feat(nori): add metadata support to Korean tokenizer

twosom · twosom · commit 16cc34e810b3 · 2025-07-20T15:32:14.000+09:00
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/DecompoundToken.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/DecompoundToken.java
@@ -22,6 +22,7 @@
 /** A token that was generated from a compound. */
 public class DecompoundToken extends Token {
   private final POS.Tag posTag;
+  private final String metadata;
 
   /**
    * Creates a new DecompoundToken
@@ -31,11 +32,18 @@ public class DecompoundToken extends Token {
    * @param startOffset The start offset of the token in the analyzed text.
    * @param endOffset The end offset of the token in the analyzed text.
    * @param type The type of this token.
+   * @param metadata The metadata of this token.
    */
   public DecompoundToken(
-      POS.Tag posTag, String surfaceForm, int startOffset, int endOffset, TokenType type) {
+      POS.Tag posTag,
+      String surfaceForm,
+      int startOffset,
+      int endOffset,
+      TokenType type,
+      String metadata) {
     super(surfaceForm.toCharArray(), 0, surfaceForm.length(), startOffset, endOffset, type);
     this.posTag = posTag;
+    this.metadata = metadata;
   }
 
   @Override
@@ -77,4 +85,9 @@ public String getReading() {
   public KoMorphData.Morpheme[] getMorphemes() {
     return null;
   }
+
+  /**
+   * Returns the metadata string that was passed to the constructor of this token. The value is
+   * returned as stored and may be {@code null}.
+   */
+  @Override
+  public String getMetadata() {
+    return metadata;
+  }
 }
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/DictionaryToken.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/DictionaryToken.java
@@ -23,6 +23,7 @@
 public class DictionaryToken extends Token {
   private final int wordId;
   private final KoMorphData morphAtts;
+  private String metadata = null;
 
   public DictionaryToken(
       TokenType type,
@@ -108,4 +109,21 @@ public String getReading() {
   public KoMorphData.Morpheme[] getMorphemes() {
     return morphAtts.getMorphemes(wordId, getSurfaceForm(), getOffset(), getLength());
   }
+
+  /**
+   * Returns the metadata last assigned through {@link #setMetadata(String)}, or {@code null} if
+   * none has been assigned (the field starts out as {@code null}).
+   */
+  @Override
+  public String getMetadata() {
+    return this.metadata;
+  }
+
+  /**
+   * Assigns the metadata reported by {@link #getMetadata()}.
+   *
+   * @param metadata the metadata for this token; {@code null} is stored as-is
+   */
+  public void setMetadata(String metadata) {
+    this.metadata = metadata;
+  }
+
+  /** Returns the word id of this token, i.e. the id used for lookups in its morph data. */
+  public int getWordId() {
+    return wordId;
+  }
+
+  /** Returns the morphological data that this token's word id is resolved against. */
+  public KoMorphData getMorphAtts() {
+    return morphAtts;
+  }
 }
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java
@@ -25,6 +25,7 @@
 import org.apache.lucene.analysis.ko.dict.TokenInfoFST;
 import org.apache.lucene.analysis.ko.dict.UnknownDictionary;
 import org.apache.lucene.analysis.ko.dict.UserDictionary;
+import org.apache.lucene.analysis.ko.tokenattributes.MetadataAttribute;
 import org.apache.lucene.analysis.ko.tokenattributes.PartOfSpeechAttribute;
 import org.apache.lucene.analysis.ko.tokenattributes.ReadingAttribute;
 import org.apache.lucene.analysis.morph.GraphvizFormatter;
@@ -77,6 +78,7 @@ public enum DecompoundMode {
   private final Viterbi viterbi;
 
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final MetadataAttribute metadataAtt = addAttribute(MetadataAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
   private final PositionIncrementAttribute posIncAtt =
       addAttribute(PositionIncrementAttribute.class);
@@ -233,6 +235,7 @@ public boolean incrementToken() throws IOException {
     // System.out.println("off=" + token.getOffset() + " len=" + length + " vs " +
     // token.getSurfaceForm().length);
     termAtt.copyBuffer(token.getSurfaceForm(), token.getOffset(), length);
+    metadataAtt.setToken(token);
     offsetAtt.setOffset(correctOffset(token.getStartOffset()), correctOffset(token.getEndOffset()));
     posAtt.setToken(token);
     readingAtt.setToken(token);
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/Token.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/Token.java
@@ -44,4 +44,6 @@ protected Token(
    * token.
    */
   public abstract KoMorphData.Morpheme[] getMorphemes();
+
+  /** Get the metadata associated with this token, or {@code null} if it has none. */
+  public abstract String getMetadata();
 }
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/Viterbi.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/Viterbi.java
@@ -18,11 +18,13 @@
 
 import java.io.IOException;
 import java.util.EnumMap;
+
 import org.apache.lucene.analysis.ko.dict.CharacterDefinition;
 import org.apache.lucene.analysis.ko.dict.KoMorphData;
 import org.apache.lucene.analysis.ko.dict.TokenInfoDictionary;
 import org.apache.lucene.analysis.ko.dict.UnknownDictionary;
 import org.apache.lucene.analysis.ko.dict.UserDictionary;
+import org.apache.lucene.analysis.ko.dict.UserMorphData;
 import org.apache.lucene.analysis.morph.ConnectionCosts;
 import org.apache.lucene.analysis.morph.Dictionary;
 import org.apache.lucene.analysis.morph.GraphvizFormatter;
@@ -248,6 +250,10 @@ protected void backtrace(Position endPosData, int fromIDX) {
         if (token.getPOSType() == POS.Type.MORPHEME
             || mode == KoreanTokenizer.DecompoundMode.NONE) {
           if (shouldFilterToken(token) == false) {
+            if (token.getMorphAtts() instanceof UserMorphData userMorphData) {
+              final String metadata = userMorphData.metadatas[token.getWordId()];
+              token.setMetadata(metadata);
+            }
             pending.add(token);
             if (VERBOSE) {
               System.out.println("    add token=" + pending.get(pending.size() - 1));
@@ -264,9 +270,11 @@ protected void backtrace(Position endPosData, int fromIDX) {
             int endOffset = backWordPos + length;
             int posLen = 0;
             // decompose the compound
+            String metadata = null;
             for (int i = morphemes.length - 1; i >= 0; i--) {
               final KoMorphData.Morpheme morpheme = morphemes[i];
               final Token compoundToken;
+              metadata = morpheme.metadata();
               if (token.getPOSType() == POS.Type.COMPOUND) {
                 assert endOffset - morpheme.surfaceForm().length() >= 0;
                 compoundToken =
@@ -275,15 +283,17 @@ protected void backtrace(Position endPosData, int fromIDX) {
                         morpheme.surfaceForm(),
                         endOffset - morpheme.surfaceForm().length(),
                         endOffset,
-                        backType);
+                        backType,
+                        metadata);
               } else {
                 compoundToken =
                     new DecompoundToken(
                         morpheme.posTag(),
                         morpheme.surfaceForm(),
                         token.getStartOffset(),
                         token.getEndOffset(),
-                        backType);
+                        backType,
+                        metadata);
               }
               if (i == 0 && mode == KoreanTokenizer.DecompoundMode.MIXED) {
                 compoundToken.setPositionIncrement(0);
@@ -297,6 +307,7 @@ protected void backtrace(Position endPosData, int fromIDX) {
             }
             if (mode == KoreanTokenizer.DecompoundMode.MIXED) {
               token.setPositionLength(Math.max(1, posLen));
+              token.setMetadata(metadata);
               pending.add(token);
               if (VERBOSE) {
                 System.out.println("    add token=" + pending.get(pending.size() - 1));
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/KoMorphData.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/KoMorphData.java
@@ -22,7 +22,11 @@
 /** Represents Korean morphological information. */
 public interface KoMorphData extends MorphData {
   /** A morpheme extracted from a compound token. */
-  record Morpheme(POS.Tag posTag, String surfaceForm) {}
+  record Morpheme(POS.Tag posTag, String surfaceForm, String metadata) {
+    /** Creates a morpheme that carries no metadata ({@code metadata} is {@code null}). */
+    public Morpheme(POS.Tag posTag, String surfaceForm) {
+      this(posTag, surfaceForm, null);
+    }
+  }
 
   /**
    * Get the {@link org.apache.lucene.analysis.ko.POS.Type} of specified word (morpheme, compound,
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserDictionary.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserDictionary.java
@@ -34,6 +34,7 @@
  * (세종시 세종 시).
  */
 public final class UserDictionary implements Dictionary<UserMorphData> {
+  /**
+   * Separator between a user dictionary entry and its optional metadata, including the
+   * surrounding spaces. The text following it is stored as the entry's metadata.
+   */
+  public static final String METADATA_SEPARATOR = " >> ";
   // text -> wordID
   private final TokenInfoFST fst;
 
@@ -82,10 +83,19 @@ private UserDictionary(List<String> entries) throws IOException {
 
     String lastToken = null;
     List<int[]> _segmentations = new ArrayList<>(entries.size());
+    List<String> _metadatas = new ArrayList<>(entries.size());
     short[] rightIds = new short[entries.size()];
     long ord = 0;
     int entryIndex = 0;
     for (String entry : entries) {
+      // NOTE(review): a metadata slot is appended for every entry, before the duplicate-token
+      // check below. If duplicates are skipped without a matching segmentation being added,
+      // _metadatas and _segmentations stop lining up by word id -- confirm.
+      if (entry.contains(METADATA_SEPARATOR)) {
+        // limit=2: the result always has two elements, even when nothing follows the separator
+        // (a plain split() drops the trailing empty string and split[1] would be out of bounds),
+        // and any later separators stay inside the metadata text.
+        var split = entry.split(METADATA_SEPARATOR, 2);
+        entry = split[0];
+        var metadata = split[1];
+        // A blank remainder means the entry carries no metadata.
+        _metadatas.add(metadata.isBlank() ? null : metadata);
+      } else {
+        _metadatas.add(null);
+      }
       String[] splits = entry.split("\\s+");
       String token = splits[0];
       if (token.equals(lastToken)) {
@@ -138,7 +148,8 @@ private UserDictionary(List<String> entries) throws IOException {
     this.fst =
         new TokenInfoFST(FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader()));
     int[][] segmentations = _segmentations.toArray(new int[_segmentations.size()][]);
-    this.morphAtts = new UserMorphData(segmentations, rightIds);
+    String[] metadatas = _metadatas.toArray(String[]::new);
+    this.morphAtts = new UserMorphData(segmentations, rightIds, metadatas);
   }
 
   public TokenInfoFST getFST() {
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserMorphData.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserMorphData.java
@@ -19,7 +19,7 @@
 import org.apache.lucene.analysis.ko.POS;
 
 /** Morphological information for user dictionary. */
-final class UserMorphData implements KoMorphData {
+public final class UserMorphData implements KoMorphData {
   private static final int WORD_COST = -100000;
 
   // NNG left
@@ -28,12 +28,18 @@ final class UserMorphData implements KoMorphData {
   // length, length... indexed by compound ID or null for simple noun
   private final int[][] segmentations;
   private final short[] rightIds;
+  // Metadata strings indexed by morph id (see getMorphemes); an element is null when the entry
+  // has no metadata. Only assigned by the three-argument constructor.
+  public String[] metadatas;
 
   UserMorphData(int[][] segmentations, short[] rightIds) {
     this.segmentations = segmentations;
     this.rightIds = rightIds;
   }
 
+  /**
+   * Creates morph data that also carries per-entry metadata.
+   *
+   * <p>NOTE(review): the two-argument constructor leaves {@code metadatas} unassigned (null),
+   * while getMorphemes reads {@code metadatas[morphId]} unconditionally for compound entries --
+   * confirm that no caller still uses the two-argument form.
+   *
+   * @param segmentations segment lengths per entry, or null for a simple noun
+   * @param rightIds right ids per entry
+   * @param metadatas metadata per entry; the array is stored as given, not copied
+   */
+  UserMorphData(int[][] segmentations, short[] rightIds, String[] metadatas) {
+    this(segmentations, rightIds);
+    this.metadatas = metadatas;
+  }
+
   @Override
   public int getLeftId(int morphId) {
     return LEFT_ID;
@@ -79,10 +85,12 @@ public Morpheme[] getMorphemes(int morphId, char[] surfaceForm, int off, int len
     if (segs == null) {
       return null;
     }
+    String metadata = metadatas[morphId];
     int offset = 0;
     Morpheme[] morphemes = new Morpheme[segs.length];
     for (int i = 0; i < segs.length; i++) {
-      morphemes[i] = new Morpheme(POS.Tag.NNG, new String(surfaceForm, off + offset, segs[i]));
+      morphemes[i] =
+          new Morpheme(POS.Tag.NNG, new String(surfaceForm, off + offset, segs[i]), metadata);
       offset += segs[i];
     }
     return morphemes;
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/tokenattributes/MetadataAttribute.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/tokenattributes/MetadataAttribute.java
@@ -0,0 +1,22 @@
+package org.apache.lucene.analysis.ko.tokenattributes;
+
+import org.apache.lucene.analysis.ko.Token;
+import org.apache.lucene.util.Attribute;
+
+/**
+ * Attribute for Korean token metadata.
+ *
+ * <p>This attribute provides access to additional metadata associated with Korean tokens,
+ * particularly from user dictionaries and compound word morphemes.
+ *
+ * <p>Note: in some cases this value may not be applicable, and will be null.
+ *
+ * @lucene.experimental
+ */
+public interface MetadataAttribute extends Attribute {
+  /**
+   * Get the metadata string of the current token.
+   *
+   * @return the metadata, or {@code null} when it is not applicable
+   */
+  String getMetadata();
+
+  /**
+   * Set the current token.
+   *
+   * @param token the token that subsequent {@link #getMetadata()} calls describe
+   */
+  void setToken(Token token);
+}
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/tokenattributes/MetadataAttributeImpl.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/tokenattributes/MetadataAttributeImpl.java
@@ -0,0 +1,43 @@
+package org.apache.lucene.analysis.ko.tokenattributes;
+
+import org.apache.lucene.analysis.ko.Token;
+import org.apache.lucene.util.AttributeImpl;
+import org.apache.lucene.util.AttributeReflector;
+
+/**
+ * Attribute implementation for Korean token metadata.
+ *
+ * @lucene.experimental
+ */
+public class MetadataAttributeImpl extends AttributeImpl implements MetadataAttribute {
+  // Token assigned through setToken(); null until then and again after clear().
+  private Token token;
+
+  @Override
+  public String getMetadata() {
+    // Without a token there is nothing to report.
+    return token == null ? null : token.getMetadata();
+  }
+
+  @Override
+  public void setToken(Token token) {
+    this.token = token;
+  }
+
+  @Override
+  public void clear() {
+    token = null;
+  }
+
+  @Override
+  public void copyTo(AttributeImpl target) {
+    // The target receives the same token reference; the token itself is not copied.
+    ((MetadataAttribute) target).setToken(token);
+  }
+
+  @Override
+  public void reflectWith(AttributeReflector reflector) {
+    final String metadata = getMetadata();
+    reflector.reflect(MetadataAttribute.class, "metadata", metadata);
+  }
+}
diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java
@@ -28,6 +28,7 @@
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.ko.KoreanTokenizer.DecompoundMode;
 import org.apache.lucene.analysis.ko.dict.UserDictionary;
+import org.apache.lucene.analysis.ko.tokenattributes.MetadataAttribute;
 import org.apache.lucene.analysis.ko.tokenattributes.PartOfSpeechAttribute;
 import org.apache.lucene.analysis.ko.tokenattributes.ReadingAttribute;
 import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
@@ -576,6 +577,27 @@ public void testDuplicate() throws IOException {
     }
   }
 
+  /**
+   * Checks that tokens produced from the metadata-bearing entries of the test user dictionary
+   * ("entry >> metadata" lines) expose that metadata through {@link MetadataAttribute}.
+   */
+  public void testMetadataAttribute() throws IOException {
+    assertMetadata(analyzer, "자바", "컴퓨터 언어");
+    assertMetadata(analyzer, "java", "컴퓨터 언어");
+    assertMetadata(analyzer, "엘라스틱서치", "검색 엔진");
+
+    // Same compound entry through the second analyzer (presumably configured with a decompound
+    // mode that keeps the original compound token -- confirm against the fixture setup).
+    assertMetadata(analyzerDecompoundKeep, "엘라스틱서치", "검색 엔진");
+  }
+
+  /**
+   * Asserts that every token produced for {@code input} carries {@code metadata}, and that at
+   * least one token is produced so the check cannot pass vacuously.
+   *
+   * @param analyzer the analyzer under test
+   * @param input the text to tokenize
+   * @param metadata the metadata expected on every emitted token
+   * @throws IOException if the token stream fails
+   */
+  private void assertMetadata(Analyzer analyzer, String input, String metadata) throws IOException {
+    try (TokenStream ts = analyzer.tokenStream("ignored", input)) {
+      final MetadataAttribute metadataAtt = ts.addAttribute(MetadataAttribute.class);
+      ts.reset();
+      int tokenCount = 0;
+      while (ts.incrementToken()) {
+        tokenCount++;
+        assertNotNull(metadataAtt.getMetadata());
+        assertEquals(metadata, metadataAtt.getMetadata());
+      }
+      // An empty stream would otherwise skip every assertion inside the loop.
+      assertTrue("no tokens produced for input: " + input, tokenCount > 0);
+      assertFalse(ts.incrementToken());
+      ts.end();
+    }
+  }
+
   private void assertReadings(Analyzer analyzer, String input, String... readings)
       throws IOException {
     try (TokenStream ts = analyzer.tokenStream("ignored", input)) {
diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/userdict.txt b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/userdict.txt
@@ -8,3 +8,7 @@ C샤프
 날씨
 21세기대한민국
 세기
+# With Metadata
+자바 >> 컴퓨터 언어
+java >> 컴퓨터 언어
+엘라스틱서치 엘라스틱 서치 >> 검색 엔진

Original file line number	Diff line number	Diff line change
`@@ -44,4 +44,6 @@ protected Token(`
`44`	`44`	`* token.`
`45`	`45`	`*/`
`46`	`46`	`public abstract KoMorphData.Morpheme[] getMorphemes();`
	`47`	`+`
	`48`	`+ public abstract String getMetadata();`
`47`	`49`	`}`