Skip to content

Commit cbdf843

Browse files
committed
properly take boundary cells into account
1 parent d19dad8 commit cbdf843

File tree

4 files changed

+39
-4
lines changed

4 files changed

+39
-4
lines changed

CHANGELOG.md

+5
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
## Changelog
22

3+
## [3.0.5] - 2024-07-02
4+
### Fixed
5+
- the editops implementation didn't properly account for some boundary cells in the Levenshtein matrix.
6+
This could lead to both incorrect results and crashes.
7+
38
## [3.0.4] - 2023-04-07
49
### Fixed
510
- fix tagged version

CMakeLists.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ if (CMAKE_BINARY_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
3232
message(FATAL_ERROR "Building in-source is not supported! Create a build dir and remove ${CMAKE_SOURCE_DIR}/CMakeCache.txt")
3333
endif()
3434

35-
project(rapidfuzz LANGUAGES CXX VERSION 3.0.4)
35+
project(rapidfuzz LANGUAGES CXX VERSION 3.0.5)
3636

3737
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake")
3838
include(GNUInstallDirs)

extras/rapidfuzz_amalgamated.hpp

+17-2
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
// Licensed under the MIT License <http://opensource.org/licenses/MIT>.
22
// SPDX-License-Identifier: MIT
33
// RapidFuzz v1.0.2
4-
// Generated: 2024-04-06 15:39:26.940916
4+
// Generated: 2024-07-02 16:47:26.932914
55
// ----------------------------------------------------------
66
// This file is an amalgamation of multiple different files.
77
// You probably shouldn't edit it directly.
@@ -7719,6 +7719,9 @@ template <typename InputIt1, typename InputIt2>
77197719
HirschbergPos find_hirschberg_pos(const Range<InputIt1>& s1, const Range<InputIt2>& s2,
77207720
size_t max = std::numeric_limits<size_t>::max())
77217721
{
7722+
assert(s1.size() > 1);
7723+
assert(s2.size() > 1);
7724+
77227725
HirschbergPos hpos = {};
77237726
size_t left_size = s2.size() / 2;
77247727
size_t right_size = s2.size() - left_size;
@@ -7727,8 +7730,9 @@ HirschbergPos find_hirschberg_pos(const Range<InputIt1>& s1, const Range<InputIt
77277730
size_t best_score = std::numeric_limits<size_t>::max();
77287731
size_t right_first_pos = 0;
77297732
size_t right_last_pos = 0;
7733+
// todo: we could avoid this allocation by counting up the right score twice
7734+
// not sure what's faster though
77307735
std::vector<size_t> right_scores;
7731-
77327736
{
77337737
auto right_row = levenshtein_row(s1.reversed(), s2.reversed(), max, right_size - 1);
77347738
if (right_row.dist > max) return find_hirschberg_pos(s1, s2, max * 2);
@@ -7758,6 +7762,17 @@ HirschbergPos find_hirschberg_pos(const Range<InputIt1>& s1, const Range<InputIt
77587762
auto left_last_pos = std::min(s1_len, left_row.last_block * 64 + 64);
77597763

77607764
size_t left_score = left_row.prev_score;
7765+
// take boundary into account
7766+
if (s1_len >= left_first_pos + right_first_pos) {
7767+
size_t right_index = s1_len - left_first_pos - right_first_pos;
7768+
if (right_index < right_scores.size()) {
7769+
best_score = right_scores[right_index] + left_score;
7770+
hpos.left_score = left_score;
7771+
hpos.right_score = right_scores[right_index];
7772+
hpos.s1_mid = left_first_pos;
7773+
}
7774+
}
7775+
77617776
for (size_t i = left_first_pos; i < left_last_pos; ++i) {
77627777
size_t col_pos = i % 64;
77637778
size_t col_word = i / 64;

rapidfuzz/distance/Levenshtein_impl.hpp

+16-1
Original file line numberDiff line numberDiff line change
@@ -1055,6 +1055,9 @@ template <typename InputIt1, typename InputIt2>
10551055
HirschbergPos find_hirschberg_pos(const Range<InputIt1>& s1, const Range<InputIt2>& s2,
10561056
size_t max = std::numeric_limits<size_t>::max())
10571057
{
1058+
assert(s1.size() > 1);
1059+
assert(s2.size() > 1);
1060+
10581061
HirschbergPos hpos = {};
10591062
size_t left_size = s2.size() / 2;
10601063
size_t right_size = s2.size() - left_size;
@@ -1063,8 +1066,9 @@ HirschbergPos find_hirschberg_pos(const Range<InputIt1>& s1, const Range<InputIt
10631066
size_t best_score = std::numeric_limits<size_t>::max();
10641067
size_t right_first_pos = 0;
10651068
size_t right_last_pos = 0;
1069+
// todo: we could avoid this allocation by counting up the right score twice
1070+
// not sure what's faster though
10661071
std::vector<size_t> right_scores;
1067-
10681072
{
10691073
auto right_row = levenshtein_row(s1.reversed(), s2.reversed(), max, right_size - 1);
10701074
if (right_row.dist > max) return find_hirschberg_pos(s1, s2, max * 2);
@@ -1094,6 +1098,17 @@ HirschbergPos find_hirschberg_pos(const Range<InputIt1>& s1, const Range<InputIt
10941098
auto left_last_pos = std::min(s1_len, left_row.last_block * 64 + 64);
10951099

10961100
size_t left_score = left_row.prev_score;
1101+
// take boundary into account
1102+
if (s1_len >= left_first_pos + right_first_pos) {
1103+
size_t right_index = s1_len - left_first_pos - right_first_pos;
1104+
if (right_index < right_scores.size()) {
1105+
best_score = right_scores[right_index] + left_score;
1106+
hpos.left_score = left_score;
1107+
hpos.right_score = right_scores[right_index];
1108+
hpos.s1_mid = left_first_pos;
1109+
}
1110+
}
1111+
10971112
for (size_t i = left_first_pos; i < left_last_pos; ++i) {
10981113
size_t col_pos = i % 64;
10991114
size_t col_word = i / 64;

0 commit comments

Comments
 (0)