Skip to content

Commit b843602

Browse files
authored
Merge pull request #21 from tweag/ninioArtillero/docs
Document implementation of Myers algorithm
2 parents 9bbb91f + 97f2314 commit b843602

2 files changed

Lines changed: 199 additions & 17 deletions

File tree

src/Data/Algorithm/Diff.hs

Lines changed: 198 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,54 @@
88
-- Portability : portable
99
--
1010
-- This is an implementation of the diff algorithm as described in
11-
-- /An \( O(ND) \) Difference Algorithm and Its Variations (1986)/
12-
-- <http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.4.6927>.
11+
-- [/An \( O(ND) \) Difference Algorithm and Its Variations (1986)/
12+
-- by Eugene W. Myers](https://publications.mpi-cbg.de/Myers_1986_6330.pdf).
1313
-- For inputs of size \( O(N) \) with the number of differences \( D \)
1414
-- it has \( O(ND) \) time and \( O(D^2) \) space complexity.
15+
--
16+
-- == Algorithm overview
17+
--
18+
-- Finding the shortest edit script (SES) from a list \( as \) to a list \( bs \)
19+
-- is modelled as a shortest-path search on an /edit graph/: an
20+
-- \( (M+1) \times (N+1) \) grid of nodes \( (i, j) \),
21+
-- where \( M \) and \( N \) are the lengths of \( as \) and \( bs \) respectively,
22+
-- with \( i \) increasing rightward and \( j \) increasing downward.
23+
-- Each node represents the state of having consumed \( i \) elements of \( as \)
24+
-- and \( j \) elements of \( bs \). Three types of move are possible:
25+
--
26+
-- * A /rightward/ move \( (i,j) \to (i+1,j) \) represents
27+
-- /deleting/ \( as[i] \) and costs one edit.
28+
-- * A /downward/ move \( (i,j) \to (i,j+1) \) represents
29+
-- /inserting/ \( bs[j] \) and costs one edit.
30+
-- * A /diagonal/ move \( (i,j) \to (i+1,j+1) \) is free (zero edit cost)
31+
-- and is only available when \( as[i] = bs[j] \).
32+
--
33+
-- The SES corresponds to a path from \( (0,0) \) to \( (M,N) \) that minimises
34+
-- the number of non-diagonal moves.
35+
--
36+
-- Both input lists are 0-indexed, which leads to a slightly different
37+
-- interpretation of the edit graph than in the original paper. In the paper,
38+
-- each node represents the state of the traversal /after/ an edit, so a move
39+
-- is the edit that /produced/ that node. Here, each node represents the state
40+
-- /before/ an edit, so a move is the edit performed /on/ that node to yield its
41+
-- successor. This distinction is only relevant when reading the implementation
42+
-- alongside the paper.
43+
--
44+
-- === K-diagonals and the wave front
45+
--
46+
-- Every node \( (i,j) \) lies on the /k-diagonal/ \( k = i - j \).
47+
-- After exactly \( D \) non-diagonal moves, every reachable node lies on one of
48+
-- at most \( D+1 \) k-diagonals \( k \in \{-D,\,-D+2,\,\ldots,\,D-2,\,D\} \).
49+
-- On each diagonal it suffices to track only the /furthest-reaching/ node
50+
-- (the one with the largest \( i \)), collapsing the two-dimensional grid to a
51+
-- one-dimensional /wave front/ indexed by \( k \).
52+
--
53+
-- The algorithm performs a breadth-first search over \( D = 0, 1, 2, \ldots \),
54+
-- advancing the wave front by one edit at a time until a node reaches the goal
55+
-- \( (M, N) \). The edit trace stored in that node is the SES, which
56+
-- 'getDiffBy' reconstructs into a 'PolyDiff' list. The term /trace/ here
57+
-- differs from the paper, where it denotes the sequence of k-diagonals visited
58+
-- by the SES path; that structure is not materialised in this implementation.
1559
-----------------------------------------------------------------------------
1660

1761
{-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-}
@@ -31,12 +75,37 @@ import Prelude hiding (pi)
3175
import Data.Array (listArray, (!))
3276
import Data.Bifunctor
3377

78+
-- | /Diff Instruction/ — an internal enum recording the direction of a single
79+
-- non-diagonal edge traversed in the Myers edit graph. Every non-diagonal
80+
-- move in the edit script is one of:
81+
--
82+
-- * 'F' — /First/ — a horizontal edge \( (i,j) \to (i+1,j) \), which
83+
-- corresponds to /deleting/ the element at position \( i \) of the first input
84+
-- sequence. The consumed element appears in the 'First' branch of the
85+
-- resulting 'PolyDiff'.
86+
--
87+
-- * 'S' — /Second/ — a vertical edge \( (i,j) \to (i,j+1) \), which
88+
-- corresponds to /inserting/ the element at position \( j \) of the second
89+
-- input sequence. The consumed element appears in the 'Second' branch of
90+
-- the resulting 'PolyDiff'.
91+
--
92+
-- Diagonal edges (free moves corresponding to equal elements) are /not/
93+
-- recorded as 'DI' steps; they are followed implicitly by 'addsnake' and
94+
-- produce 'Both' entries in the final output.
3495
data DI = F | S deriving (Show, Eq)
3596

36-
-- | A value is either from the 'First' list, the 'Second' or from 'Both'.
37-
-- 'Both' contains both the left and right values, in case you are using a form
38-
-- of equality that doesn't check all data (for example, if you are using a
39-
-- newtype to only perform equality on side of a tuple).
97+
-- | A value tagged with which of two input sequences it came from.
98+
-- The type parameters @a@ and @b@ may differ, which is useful when comparing
99+
-- sequences of different element types via a custom equality predicate.
100+
--
101+
-- Each constructor corresponds to one outcome for a position in the aligned
102+
-- sequences:
103+
--
104+
-- * 'First' — the element exists only in the /first/ input (a deletion).
105+
-- * 'Second' — the element exists only in the /second/ input (an insertion).
106+
-- * 'Both' — the element is common to both inputs.
107+
-- Both the left and right values are retained so that the original
108+
-- elements can be recovered even when equality ignores some fields.
40109
data PolyDiff a b = First a | Second b | Both a b
41110
deriving (Show, Eq)
42111

@@ -53,40 +122,155 @@ instance Bifunctor PolyDiff where
53122
-- | This is 'PolyDiff' specialized so both sides are the same type.
54123
type Diff a = PolyDiff a a
55124

56-
data DL = DL {poi :: !Int, poj :: !Int, path::[DI]} deriving (Show, Eq)
125+
-- | /D-path Location/ — a node on the wave front of the Myers O(ND) diff
126+
-- algorithm.
127+
--
128+
-- Each wave front consists of one 'DL' per /k-diagonal/. A 'DL' stores the
129+
-- endpoint coordinates and the edit trace of a \( D \)-path, i.e. a path from the
130+
-- origin \( (0,0) \) that uses exactly \( D \) non-diagonal edges.
131+
data DL = DL
132+
{ poi :: !Int -- ^ /Position On I/ — the @x@-coordinate of the endpoint
133+
-- in the edit graph, i.e. the number of elements
134+
-- consumed from the /first/ input sequence so far.
135+
, poj :: !Int -- ^ /Position On J/ — the @y@-coordinate of the endpoint
136+
-- in the edit graph, i.e. the number of elements
137+
-- consumed from the /second/ input sequence so far.
138+
, path :: [DI] -- ^ The edit trace accumulated so far, stored in
139+
-- /reverse/ order (most recent step first). Diagonal
140+
-- edges (matches) are not recorded here; only 'F' and
141+
-- 'S' steps are stored.
142+
} deriving (Show, Eq)
57143

144+
-- | Ordering used by 'dstep' to select the /furthest-reaching/ D-path when
145+
-- two candidates compete for the same k-diagonal.
146+
--
147+
-- As in the Myers algorithm, it is enough to compare by 'poi': the candidate
148+
-- that has advanced further along the \( x \)-axis is the furthest-reaching
149+
-- endpoint on that diagonal.
150+
--
151+
-- When 'poi' values are equal, the instance prefers the node with the
152+
-- smaller 'poj' (equivalently, the higher k-diagonal). In practice this
153+
-- branch is never decisive within 'dstep': competing candidates always
154+
-- share a k-diagonal, so equal 'poi' implies equal 'poj'.
155+
--
156+
-- TODO: This instance is /not/ a lawful 'Ord': it violates reflexivity
157+
-- (@x '<=' x@ is 'False') because the equal-'poi' branch compares 'poj'
158+
-- with a strict @'>'@. This is harmless in the current context, since the
159+
-- only use of this instance is the 'max' call in 'dstep' — which always
160+
-- returns one of its arguments — and when both candidates occupy the same
161+
-- position, either choice is equivalent. This instance should either be
162+
-- made lawful or removed in favour of a local 'max'-like helper.
58163
instance Ord DL
59164
where x <= y = if poi x == poi y
60165
then poj x > poj y
61166
else poi x <= poi y
62167

168+
-- | Build a /diagonal predicate/ — a closure that tests whether position
169+
-- @(i, j)@ in the edit graph has a diagonal edge (a /match point/ in Myers'
170+
-- terminology).
171+
--
172+
-- Indices are 0-based (\( i \in [0, lena) \), \( j \in [0, lenb) \) ),
173+
-- unlike the 1-based convention of the original paper.
174+
--
175+
-- The first two 'Int' parameters stand for the lengths of the input lists,
176+
-- which are captured from the outer scope to compute them only once.
63177
canDiag :: (a -> b -> Bool) -> [a] -> [b] -> Int -> Int -> Int -> Int -> Bool
64178
canDiag eq as bs lena lenb = \ i j ->
65179
if i < lena && j < lenb then (arAs ! i) `eq` (arBs ! j) else False
66-
where arAs = listArray (0,lena - 1) as
67-
arBs = listArray (0,lenb - 1) bs
180+
where
181+
-- Lists are converted into arrays to have O(1) lookups.
182+
arAs = listArray (0,lena - 1) as
183+
arBs = listArray (0,lenb - 1) bs
68184

69-
dstep :: (Int -> Int -> Bool) -> [DL] -> [DL]
185+
-- | Perform one breadth-first search expansion step, advancing every wave front
186+
-- 'DL' node by one 'DI' edit (one non-diagonal edge) and then following
187+
-- any available snake.
188+
--
189+
-- For each node the 'dstep' produces two candidate successors by adding:
190+
--
191+
-- * An 'F' (delete) move: 'poi' incremented by 1.
192+
-- * An 'S' (insert) move: 'poj' incremented by 1.
193+
--
194+
-- 'addsnake' is applied to each candidate immediately to extend it along any
195+
-- available sequence of matching elements.
196+
--
197+
-- The resulting candidate list interleaves the 'F' and 'S' successors of each
198+
-- wave front node. The head ('F' successor of the first node) is kept as-is, and
199+
-- 'pairMaxes' is applied to the tail — pairing each 'S' successor with the 'F'
200+
-- successor of the next wave front node. When this function is iterated from a
201+
-- single-node seed (as in 'ses'), each such pair always lies on the same
202+
-- diagonal: an 'F' edge advances to the next higher diagonal while an 'S' edge
203+
-- retreats to the next lower one, so the two members of each pair straddle the
204+
-- same diagonal from opposite sides.
205+
dstep
206+
:: (Int -> Int -> Bool) -- ^ Diagonal predicate
207+
-> [DL] -- ^ Wave front of D-paths at edit distance D
208+
-> [DL] -- ^ Wave front of D-paths at edit distance D+1
70209
dstep cd dls = hd:pairMaxes rst
71210
where (hd:rst) = nextDLs dls
211+
-- Extend each node by one edit step in both possible directions
212+
-- and then follow any available snake from the resulting position.
72213
nextDLs [] = []
73214
nextDLs (dl:rest) = dl':dl'':nextDLs rest
74215
where dl' = addsnake cd $ dl {poi=poi dl + 1, path=(F : pdl)}
75216
dl'' = addsnake cd $ dl {poj=poj dl + 1, path=(S : pdl)}
76217
pdl = path dl
218+
-- Merge adjacent pairs of candidates to retain only the furthest-reaching.
77219
pairMaxes [] = []
78220
pairMaxes [x] = [x]
79221
pairMaxes (x:y:rest) = max x y:pairMaxes rest
80222

223+
-- | Follow a /snake/ from the current position of a 'DL' node.
224+
--
225+
-- A snake is a sequence of diagonal (cost-free) edges in the edit graph,
226+
-- i.e. a run of equal elements that can be consumed simultaneously
227+
-- from both input sequences without counting as an edit. Starting from
228+
-- @(poi dl, poj dl)@, this function advances both 'poi' and 'poj' as long
229+
-- as consecutive elements match, leaving 'path' unchanged (diagonal moves
230+
-- are not recorded as edit steps).
81231
addsnake :: (Int -> Int -> Bool) -> DL -> DL
82232
addsnake cd dl
83233
| cd pi pj = addsnake cd $
84234
dl {poi = pi + 1, poj = pj + 1, path = path dl}
85235
| otherwise = dl
86236
where pi = poi dl; pj = poj dl
87237

88-
lcs :: (a -> b -> Bool) -> [a] -> [b] -> [DI]
89-
lcs eq as bs = path . head . dropWhile (\dl -> poi dl /= lena || poj dl /= lenb) .
238+
-- | Compute shortest edit script (SES), as the minimum sequence of 'DI' edit
239+
-- steps that transforms @as@ into @bs@, returned in reverse order.
240+
--
241+
-- @ses eq as bs@ runs the Myers O(ND) diff algorithm following
242+
-- a five-step pipeline:
243+
--
244+
-- 1. __Seed__: create an initial 0-path wave front @[addsnake cd (DL 0 0 [])]@
245+
-- having a single node on the tip of the longest origin-sourced snake.
246+
-- 2. __Iterate__: apply 'dstep' repeatedly via 'iterate', producing an
247+
-- infinite list of wave fronts (one per edit distance D = 0, 1, 2, …).
248+
-- 3. __Flatten__: 'concat' all wave fronts into a single stream of 'DL' nodes.
249+
-- 4. __Find__: 'dropWhile' skips nodes until one reaches @(lena, lenb)@ — the
250+
-- bottom-right corner of the edit graph — which is the terminal node of a
251+
-- shortest edit script.
252+
-- 5. __Extract__: 'head' returns that node; its 'path' field carries the edit
253+
-- trace in reverse order.
254+
--
255+
-- This implementation is purely functional: rather than updating a shared
256+
-- diagonal frontier array in place, as in the original paper, it builds a new
257+
-- list of 'DL' nodes for each value of \( D \) and concatenates them into
258+
-- a single lazy stream. This is simpler but carries a larger per-node overhead:
259+
-- each 'DL' holds its own edit trace as a @['DI']@ list that structurally
260+
-- shares its tail with the parent node's trace (consing one step reuses the
261+
-- existing spine), rather than the paper's single-integer-per-diagonal
262+
-- representation. The asymptotic time
263+
-- and space complexity — \( O(ND) \) and \( O(D^2) \) respectively — is
264+
-- unchanged. Unlike the paper, which selects the better candidate per
265+
-- diagonal before extending its snake, 'dstep' extends snakes on /both/
266+
-- candidates before 'pairMaxes' selects the winner, discarding the other
267+
-- extension. This does not affect the time bound: on any given diagonal,
268+
-- all snake intervals — retained and discarded — are non-overlapping across
269+
-- successive values of \( D \), because each new candidate starts at or
270+
-- beyond the previous winner's endpoint. The total number of element
271+
-- comparisons across all snake extensions is therefore \( O(ND) \).
272+
ses :: (a -> b -> Bool) -> [a] -> [b] -> [DI]
273+
ses eq as bs = path . head . dropWhile (\dl -> poi dl /= lena || poj dl /= lenb) .
90274
concat . iterate (dstep cd) . (:[]) . addsnake cd $
91275
DL {poi=0,poj=0,path=[]}
92276
where cd = canDiag eq as bs lena lenb
@@ -113,13 +297,14 @@ getGroupedDiff = getGroupedDiffBy (==)
113297
-- | A form of 'getDiff' with no 'Eq' constraint. Instead, an equality predicate
114298
-- is taken as the first argument.
115299
getDiffBy :: (a -> b -> Bool) -> [a] -> [b] -> [PolyDiff a b]
116-
getDiffBy eq a b = markup a b . reverse $ lcs eq a b
300+
getDiffBy eq a b = markup a b . reverse $ ses eq a b
117301
where markup (x:xs) (y:ys) ds
118302
| eq x y = Both x y : markup xs ys ds
119303
markup (x:xs) ys (F:ds) = First x : markup xs ys ds
120304
markup xs (y:ys) (S:ds) = Second y : markup xs ys ds
121305
markup _ _ _ = []
122306

307+
-- | Like 'getGroupedDiff' but accepts a custom equality predicate.
123308
getGroupedDiffBy :: (a -> b -> Bool) -> [a] -> [b] -> [PolyDiff [a] [b]]
124309
getGroupedDiffBy eq a b = go $ getDiffBy eq a b
125310
where go (First x : xs) = let (fs, rest) = goFirsts xs in First (x:fs) : go rest

test/Test.hs

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -99,10 +99,7 @@ prop_sub xs ys = isSub xs ys == elem xs (subs ys)
9999
prop_everySubIsSub xs = all (flip isSub xs) (subs xs)
100100

101101

102-
-- | Obtains a longest common subsequence of two lists using their
103-
-- diff. Note that there is an @lcs@ function in the
104-
-- 'Data.Algorithm.Diff' module, but it's not exported. It's trivial
105-
-- to reconstruct the LCS though, just by taking the 'B' elements.
102+
-- | Obtains a longest common subsequence of two lists using their diff.
106103
diffLCS :: (Eq a) => [a] -> [a] -> [a]
107104
diffLCS xs ys = recoverLCS $ getDiff xs ys
108105

0 commit comments

Comments
 (0)