From 853ec6b1fbfb861aab3c51b5772aced4f47d8eeb Mon Sep 17 00:00:00 2001 From: Hugo Bollon Date: Thu, 12 Aug 2021 20:30:32 +0200 Subject: [PATCH] fix: bad similarity index calculated by matchingIndex func (#7) test: update tests to match with matchingIndex changes & add some test cases test: fix lcs distance test case which failed --- lcs_test.go | 1 + string-analysis.go | 11 +++++++---- string-analysis_test.go | 24 ++++++++++++++++++------ 3 files changed, 26 insertions(+), 10 deletions(-) diff --git a/lcs_test.go b/lcs_test.go index a206580..7c1540d 100644 --- a/lcs_test.go +++ b/lcs_test.go @@ -89,6 +89,7 @@ func TestLCSBacktrackAll(t *testing.T) { {"AZBYCWDX/ZAYBWCXD", args{"AZBYCWDX", "ZAYBWCXD"}, []string{"ABCD", "ABCX", "ABWD", "ABWX", "AYCD", "AYCX", "AYWD", "AYWX", "ZBCD", "ZBCX", "ZBWD", "ZBWX", "ZYCD", "ZYCX", "ZYWD", "ZYWX"}, false}, {"AATCC/ACACG", args{"AATCC", "ACACG"}, []string{"AAC", "ACC"}, false}, {"您好女士,你好吗?/先生,你好吗?", args{"您好女士 你好吗?", "先生 你好吗?"}, []string{" 你好吗?"}, false}, + {" 是ab是cde22f123g/222222是ab是cd123", args{" 是ab是cde22f123g", "222222是ab是cd123"}, []string{"是ab是cd123"}, false}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { diff --git a/string-analysis.go b/string-analysis.go index a22e53f..35f7d02 100644 --- a/string-analysis.go +++ b/string-analysis.go @@ -52,11 +52,14 @@ func StringsSimilarity(str1 string, str2 string, algo Algorithm) (float32, error // Return matching index E [0..1] from two strings and an edit distance func matchingIndex(str1 string, str2 string, distance int) float32 { - // Compare strings length and make a matching percentage between them - if len(str1) >= len(str2) { - return float32(len(str1)-distance) / float32(len(str1)) + // Convert strings to rune slices + runeStr1 := []rune(str1) + runeStr2 := []rune(str2) + // Compare rune arrays length and make a matching percentage between them + if len(runeStr1) >= len(runeStr2) { + return float32(len(runeStr1)-distance) / float32(len(runeStr1)) } - return float32(len(str2)-distance) / float32(len(str2)) + return float32(len(runeStr2)-distance) / float32(len(runeStr2)) } // FuzzySearch realize an approximate search on a string list and return the closest one compared diff --git a/string-analysis_test.go b/string-analysis_test.go index 80a1344..4a9c9a6 100644 --- a/string-analysis_test.go +++ b/string-analysis_test.go @@ -40,6 +40,10 @@ func TestStringsSimilarity(t *testing.T) { {"Levenshtein : MARTHA/MARHTA", args{"MARTHA", "MARHTA", Levenshtein}, 0.6666667, false}, {"Levenshtein : DIXON/DICKSONX", args{"DIXON", "DICKSONX", Levenshtein}, 0.50, false}, {"Levenshtein : jellyfish/smellyfish", args{"jellyfish", "smellyfish", Levenshtein}, 0.80, false}, + {"Levenshtein : abcde/бвгдж", args{"abcde", "бвгдж", Levenshtein}, 0, false}, + {"Levenshtein : abcde/fghjk", args{"abcde", "fghjk", Levenshtein}, 0, false}, + {"Levenshtein : こにんち/こんにちは", args{"こにんち", "こんにちは", Levenshtein}, 0.4, false}, + {"Levenshtein : 🙂😄🙂😄/😄🙂😄🙂", args{"🙂😄🙂😄", "😄🙂😄🙂", Levenshtein}, 0.5, false}, // DamerauLevenshtein method {"DamerauLevenshtein : First arg empty", args{"", "abcde", DamerauLevenshtein}, 0.0, false}, @@ -52,8 +56,8 @@ func TestStringsSimilarity(t *testing.T) { {"DamerauLevenshtein : a cat/an abct", args{"a cat", "an abct", DamerauLevenshtein}, 0.5714286, false}, {"DamerauLevenshtein : dixon/dicksonx", args{"dixon", "dicksonx", DamerauLevenshtein}, 0.5, false}, {"DamerauLevenshtein : jellyfish/smellyfish", args{"jellyfish", "smellyfish", DamerauLevenshtein}, 0.8, false}, - {"DamerauLevenshtein : こにんち/こんにちは", args{"こにんち", "こんにちは", DamerauLevenshtein}, 0.8666667, false}, // "Hello" in Japanese - {"DamerauLevenshtein : 🙂😄🙂😄/😄🙂😄🙂", args{"🙂😄🙂😄", "😄🙂😄🙂", DamerauLevenshtein}, 0.875, false}, + {"DamerauLevenshtein : こにんち/こんにちは", args{"こにんち", "こんにちは", DamerauLevenshtein}, 0.6, false}, + {"DamerauLevenshtein : 🙂😄🙂😄/😄🙂😄🙂", args{"🙂😄🙂😄", "😄🙂😄🙂", DamerauLevenshtein}, 0.5, false}, // OSADamerauLevenshtein method {"OSADamerauLevenshtein : First arg empty", args{"", "abcde", OSADamerauLevenshtein}, 0.0, false}, @@ -66,8 +70,8 @@ func TestStringsSimilarity(t *testing.T) { {"OSADamerauLevenshtein : a cat/an abct", args{"a cat", "an abct", OSADamerauLevenshtein}, 0.428571429, false}, {"OSADamerauLevenshtein : dixon/dicksonx", args{"dixon", "dicksonx", OSADamerauLevenshtein}, 0.5, false}, {"OSADamerauLevenshtein : jellyfish/smellyfish", args{"jellyfish", "smellyfish", OSADamerauLevenshtein}, 0.8, false}, - {"OSADamerauLevenshtein : こにんち/こんにちは", args{"こにんち", "こんにちは", OSADamerauLevenshtein}, 0.8666667, false}, // "Hello" in Japanese - {"OSADamerauLevenshtein : 🙂😄🙂😄/😄🙂😄🙂", args{"🙂😄🙂😄", "😄🙂😄🙂", OSADamerauLevenshtein}, 0.875, false}, + {"OSADamerauLevenshtein : こにんち/こんにちは", args{"こにんち", "こんにちは", OSADamerauLevenshtein}, 0.6, false}, + {"OSADamerauLevenshtein : 🙂😄🙂😄/😄🙂😄🙂", args{"🙂😄🙂😄", "😄🙂😄🙂", OSADamerauLevenshtein}, 0.5, false}, // Lcs method {"LCS : First arg empty", args{"", "abcde", Lcs}, 0.0, false}, @@ -80,6 +84,8 @@ func TestStringsSimilarity(t *testing.T) { {"LCS : MARTHA/MARHTA", args{"MARTHA", "MARHTA", Lcs}, 0.6666667, false}, {"LCS : DIXON/DICKSONX", args{"DIXON", "DICKSONX", Lcs}, 0.375, false}, {"LCS : jellyfish/smellyfish", args{"jellyfish", "smellyfish", Lcs}, 0.7, false}, + {"Lcs : こにんち/こんにちは", args{"こにんち", "こんにちは", Lcs}, 0.4, false}, // "Hello" in Japanese + {"Lcs : 🙂😄🙂😄/😄🙂😄🙂", args{"🙂😄🙂😄", "😄🙂😄🙂", Lcs}, 0.5, false}, // Hamming method {"Hamming : First arg empty", args{"", "abcde", Hamming}, 0.0, true}, @@ -93,7 +99,7 @@ func TestStringsSimilarity(t *testing.T) { {"Hamming : dixon/dicksonx", args{"dixon", "dicksonx", Hamming}, 0.0, true}, {"Hamming : jellyfish/smellyfish", args{"jellyfish", "smellyfish", Hamming}, 0.0, true}, {"Hamming : こにんち/こんにちは", args{"こにんち", "こんにちは", Hamming}, 0.0, true}, // "Hello" in Japanese - {"Hamming : 🙂😄🙂😄/😄🙂😄🙂", args{"🙂😄🙂😄", "😄🙂😄🙂", Hamming}, 0.75, false}, + {"Hamming : 🙂😄🙂😄/😄🙂😄🙂", args{"🙂😄🙂😄", "😄🙂😄🙂", Hamming}, 0.0, false}, // Jaro method {"Jaro : First arg empty", args{"", "abcde", Jaro}, 0.0, false}, @@ -104,6 +110,9 @@ func TestStringsSimilarity(t *testing.T) { {"Jaro : MARTHA/MARHTA", args{"MARTHA", "MARHTA", Jaro}, 0.9444444, false}, {"Jaro : DIXON/DICKSONX", args{"DIXON", "DICKSONX", Jaro}, 0.76666665, false}, {"Jaro : jellyfish/smellyfish", args{"jellyfish", "smellyfish", Jaro}, 0.8962963, false}, + {"Jaro : こにんち/こんにちは", args{"こにんち", "こんにちは", Jaro}, 0.84999996, false}, + {"Jaro : こんににんち/こんにちは", args{"こんににんち", "こんにちは", Jaro}, 0.82222223, false}, + {"Jaro : 🙂😄🙂😄/😄🙂😄🙂", args{"🙂😄🙂😄", "😄🙂😄🙂", Jaro}, 0.8333333, false}, // JaroWinkler method {"JaroWinkler : First arg empty", args{"", "abcde", JaroWinkler}, 0.0, false}, @@ -114,6 +123,9 @@ func TestStringsSimilarity(t *testing.T) { {"JaroWinkler : MARTHA/MARHTA", args{"MARTHA", "MARHTA", JaroWinkler}, 0.96111107, false}, {"JaroWinkler : DIXON/DICKSONX", args{"DIXON", "DICKSONX", JaroWinkler}, 0.81333333, false}, {"JaroWinkler : jellyfish/smellyfish", args{"jellyfish", "smellyfish", JaroWinkler}, 0.8962963, false}, + {"JaroWinkler : こにんち/こんにちは", args{"こにんち", "こんにちは", JaroWinkler}, 0.86499995, false}, + {"JaroWinkler : こんににんち/こんにちは", args{"こんににんち", "こんにちは", JaroWinkler}, 0.8755556, false}, + {"JaroWinkler : 🙂😄🙂😄/😄🙂😄🙂", args{"🙂😄🙂😄", "😄🙂😄🙂", JaroWinkler}, 0.8333333, false}, // Cosine method {"Cosine : First arg empty", args{"", "abcde", Cosine}, 0.0, false}, @@ -138,7 +150,7 @@ func TestStringsSimilarity(t *testing.T) { return } if got != tt.want { - t.Errorf("StringsSimilarity() = %v, want %v", got, tt.want) + t.Errorf("StringsSimilarity() = %v, want %v\nRune string 1: %v, len: %d\nRune string 2: %v, len: %d", got, tt.want, []rune(tt.args.str1), len([]rune(tt.args.str1)), []rune(tt.args.str2), len([]rune(tt.args.str2))) } }) }