88-- Portability : portable
99--
1010-- This is an implementation of the diff algorithm as described in
11- -- /An \( O(ND) \) Difference Algorithm and Its Variations (1986)/
12- -- <http ://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.4.6927> .
11+ -- [ /An \( O(ND) \) Difference Algorithm and Its Variations (1986)/
12+ -- by Eugene W. Myers](https ://publications.mpi-cbg.de/Myers_1986_6330.pdf) .
1313-- For inputs of size \( O(N) \) with the number of differences \( D \)
1414-- it has \( O(ND) \) time and \( O(D^2) \) space complexity.
15+ --
16+ -- == Algorithm overview
17+ --
18+ -- Finding the shortest edit script (SES) from a list \( as \) to a list \( bs \)
19+ -- is modelled as a shortest-path search on an /edit graph/: an
20+ -- \( (M+1) \times (N+1) \) grid of nodes \( (i, j) \),
21+ -- where \( M \) and \( N \) are the lengths of \( as \) and \( bs \) respectively,
22+ -- with \( i \) increasing rightward and \( j \) increasing downward.
23+ -- Each node represents the state of having consumed \( i \) elements of \( as \)
24+ -- and \( j \) elements of \( bs \). Three types of move are possible:
25+ --
26+ -- * A /rightward/ move \( (i,j) \to (i+1,j) \) represents
27+ -- /deleting/ \( as[i] \) and costs one edit.
28+ -- * A /downward/ move \( (i,j) \to (i,j+1) \) represents
29+ -- /inserting/ \( bs[j] \) and costs one edit.
30+ -- * A /diagonal/ move \( (i,j) \to (i+1,j+1) \) is free (zero edit cost)
31+ -- and is only available when \( as[i] = bs[j] \).
32+ --
33+ -- The SES corresponds to a path from \( (0,0) \) to \( (M,N) \) that minimises
34+ -- the number of non-diagonal moves.
35+ --
36+ -- Both input lists are 0-indexed, which leads to a slightly different
37+ -- interpretation of the edit graph than in the original paper. In the paper,
38+ -- each node represents the state of the traversal /after/ an edit, so a move
39+ -- is the edit that /produced/ that node. Here, each node represents the state
40+ -- /before/ an edit, so a move is the edit performed /on/ that node to yield its
41+ -- successor. This distinction is only relevant when reading the implementation
42+ -- alongside the paper.
43+ --
44+ -- === K-diagonals and the wave front
45+ --
46+ -- Every node \( (i,j) \) lies on the /k-diagonal/ \( k = i - j \).
47+ -- After exactly \( D \) non-diagonal moves, every reachable node lies on one of
48+ -- at most \( D+1 \) k-diagonals \( k \in \{-D,\,-D+2,\,\ldots,\,D-2,\,D\} \).
49+ -- On each diagonal it suffices to track only the /furthest-reaching/ node
50+ -- (the one with the largest \( i \)), collapsing the two-dimensional grid to a
51+ -- one-dimensional /wave front/ indexed by \( k \).
52+ --
53+ -- The algorithm performs a breadth-first search over \( D = 0, 1, 2, \ldots \),
54+ -- advancing the wave front by one edit at a time until a node reaches the goal
55+ -- \( (M, N) \). The edit trace stored in that node is the SES, which
56+ -- 'getDiffBy' reconstructs into a 'PolyDiff' list. The term /trace/ here
57+ -- differs from the paper, where it denotes the sequence of k-diagonals visited
58+ -- by the SES path; that structure is not materialised in this implementation.
1559-----------------------------------------------------------------------------
1660
1761{-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-}
@@ -31,12 +75,37 @@ import Prelude hiding (pi)
3175import Data.Array (listArray , (!) )
3276import Data.Bifunctor
3377
78+ -- | /Diff Instruction/ — an internal enum recording the direction of a single
79+ -- non-diagonal edge traversed in the Myers edit graph. Every non-diagonal
80+ -- move in the edit script is one of:
81+ --
82+ -- * 'F' — /First/ — a horizontal edge \( (i,j) \to (i+1,j) \), which
83+ -- corresponds to /deleting/ the element at position \( i \) of the first input
84+ -- sequence. The consumed element appears in the 'First' branch of the
85+ -- resulting 'PolyDiff'.
86+ --
87+ -- * 'S' — /Second/ — a vertical edge \( (i,j) \to (i,j+1) \), which
88+ -- corresponds to /inserting/ the element at position \( j \) of the second
89+ -- input sequence. The consumed element appears in the 'Second' branch of
90+ -- the resulting 'PolyDiff'.
91+ --
92+ -- Diagonal edges (free moves corresponding to equal elements) are /not/
93+ -- recorded as 'DI' steps; they are followed implicitly by 'addsnake' and
94+ -- produce 'Both' entries in the final output.
3495data DI = F | S deriving (Show , Eq )
3596
36- -- | A value is either from the 'First' list, the 'Second' or from 'Both'.
37- -- 'Both' contains both the left and right values, in case you are using a form
38- -- of equality that doesn't check all data (for example, if you are using a
39- -- newtype to only perform equality on side of a tuple).
97+ -- | A value tagged with which of two input sequences it came from.
98+ -- The type parameters @a@ and @b@ may differ, which is useful when comparing
99+ -- sequences of different element types via a custom equality predicate.
100+ --
101+ -- Each constructor corresponds to one outcome for a position in the aligned
102+ -- sequences:
103+ --
104+ -- * 'First' — the element exists only in the /first/ input (a deletion).
105+ -- * 'Second' — the element exists only in the /second/ input (an insertion).
106+ -- * 'Both' — the element is common to both inputs.
107+ -- Both the left and right values are retained so that the original
108+ -- elements can be recovered even when equality ignores some fields.
40109data PolyDiff a b = First a | Second b | Both a b
41110 deriving (Show , Eq )
42111
@@ -53,40 +122,155 @@ instance Bifunctor PolyDiff where
53122-- | This is 'PolyDiff' specialized so both sides are the same type.
54123type Diff a = PolyDiff a a
55124
56- data DL = DL { poi :: ! Int , poj :: ! Int , path :: [DI ]} deriving (Show , Eq )
125+ -- | /D-path Location/ — a node on the wave front of the Myers O(ND) diff
126+ -- algorithm.
127+ --
128+ -- Each wave front consists of one 'DL' per /k-diagonal/. A 'DL' stores the
129+ -- endpoint coordinates and the edit trace of a \( D \)-path, i.e. a path from the
130+ -- origin \( (0,0) \) that uses exactly \( D \) non-diagonal edges.
131+ data DL = DL
132+ { poi :: ! Int -- ^ /Position On I/ — the @x@-coordinate of the endpoint
133+ -- in the edit graph, i.e. the number of elements
134+ -- consumed from the /first/ input sequence so far.
135+ , poj :: ! Int -- ^ /Position On J/ — the @y@-coordinate of the endpoint
136+ -- in the edit graph, i.e. the number of elements
137+ -- consumed from the /second/ input sequence so far.
138+ , path :: [DI ] -- ^ The edit trace accumulated so far, stored in
139+ -- /reverse/ order (most recent step first). Diagonal
140+ -- edges (matches) are not recorded here; only 'F' and
141+ -- 'S' steps are stored.
142+ } deriving (Show , Eq )
57143
144+ -- | Ordering used by 'dstep' to select the /furthest-reaching/ D-path when
145+ -- two candidates compete for the same k-diagonal.
146+ --
147+ -- As in the Myers algorithm, it is enough to compare by 'poi': the candidate
148+ -- that has advanced further along the \( x \)-axis is the furthest-reaching
149+ -- endpoint on that diagonal.
150+ --
151+ -- When 'poi' values are equal, the instance prefers the node with the
152+ -- smaller 'poj' (equivalently, the higher k-diagonal). In practice this
153+ -- branch is never decisive within 'dstep': competing candidates always
154+ -- share a k-diagonal, so equal 'poi' implies equal 'poj'.
155+ --
156+ -- TODO: This instance is /not/ a lawful 'Ord': it violates reflexivity
157+ -- (@x '<=' x@ is 'False') because the equal-'poi' branch compares 'poj'
158+ -- with a strict @'>'@. This is harmless in the current context, since the
159+ -- only use of this instance is the 'max' call in 'dstep' — which always
160+ -- returns one of its arguments — and when both candidates occupy the same
161+ -- position, either choice is equivalent. This instance should either be
162+ -- made lawful or removed in favour of a local 'max'-like helper.
58163instance Ord DL
59164 where x <= y = if poi x == poi y
60165 then poj x > poj y
61166 else poi x <= poi y
62167
168+ -- | Build a /diagonal predicate/ — a closure that tests whether position
169+ -- @(i, j)@ in the edit graph has a diagonal edge (a /match point/ in Myers'
170+ -- terminology).
171+ --
172+ -- Indices are 0-based (\( i \in [0, lena) \), \( j \in [0, lenb) \) ),
173+ -- unlike the 1-based convention of the original paper.
174+ --
175+ -- The first two 'Int' parameters stand for the lengths of the input lists,
176+ -- which are captured from the outer scope to compute them only once.
63177canDiag :: (a -> b -> Bool ) -> [a ] -> [b ] -> Int -> Int -> Int -> Int -> Bool
64178canDiag eq as bs lena lenb = \ i j ->
65179 if i < lena && j < lenb then (arAs ! i) `eq` (arBs ! j) else False
66- where arAs = listArray (0 ,lena - 1 ) as
67- arBs = listArray (0 ,lenb - 1 ) bs
180+ where
181+ -- Lists are converted into arrays to have O(1) lookups.
182+ arAs = listArray (0 ,lena - 1 ) as
183+ arBs = listArray (0 ,lenb - 1 ) bs
68184
69- dstep :: (Int -> Int -> Bool ) -> [DL ] -> [DL ]
185+ -- | Perform one breadth-first search expansion step, advancing every wave front
186+ -- 'DL' node by one 'DI' edit (one non-diagonal edge) and then following
187+ -- any available snake.
188+ --
189+ -- For each node the 'dstep' produces two candidate successors by adding:
190+ --
191+ -- * An 'F' (delete) move: 'poi' incremented by 1.
192+ -- * An 'S' (insert) move: 'poj' incremented by 1.
193+ --
194+ -- 'addsnake' is applied to each candidate immediately to extend it along any
195+ -- available sequence of matching elements.
196+ --
197+ -- The resulting candidate list interleaves the 'F' and 'S' successors of each
198+ -- wave front node. The head ('F' successor of the first node) is kept as-is, and
199+ -- 'pairMaxes' is applied to the tail — pairing each 'S' successor with the 'F'
200+ -- successor of the next wave front node. When this function is iterated from a
201+ -- single-node seed (as in 'ses'), each such pair always lies on the same
202+ -- diagonal: an 'F' edge advances to the next higher diagonal while an 'S' edge
203+ -- retreats to the next lower one, so the two members of each pair straddle the
204+ -- same diagonal from opposite sides.
205+ dstep
206+ :: (Int -> Int -> Bool ) -- ^ Diagonal predicate
207+ -> [DL ] -- ^ Wave front of D-paths at edit distance D
208+ -> [DL ] -- ^ Wave front of D-paths at edit distance D+1
70209dstep cd dls = hd: pairMaxes rst
71210 where (hd: rst) = nextDLs dls
211+ -- Extend each node by one edit step in both possible directions
212+ -- and then follow any available snake from the resulting position.
72213 nextDLs [] = []
73214 nextDLs (dl: rest) = dl': dl'': nextDLs rest
74215 where dl' = addsnake cd $ dl {poi= poi dl + 1 , path= (F : pdl)}
75216 dl'' = addsnake cd $ dl {poj= poj dl + 1 , path= (S : pdl)}
76217 pdl = path dl
218+ -- Merge adjacent pairs of candidates to retain only the furthest-reaching.
77219 pairMaxes [] = []
78220 pairMaxes [x] = [x]
79221 pairMaxes (x: y: rest) = max x y: pairMaxes rest
80222
223+ -- | Follow a /snake/ from the current position of a 'DL' node.
224+ --
225+ -- A snake is a sequence of diagonal (cost-free) edges in the edit graph,
226+ -- i.e. a run of equal elements that can be consumed simultaneously
227+ -- from both input sequences without counting as an edit. Starting from
228+ -- @(poi dl, poj dl)@, this function advances both 'poi' and 'poj' as long
229+ -- as consecutive elements match, leaving 'path' unchanged (diagonal moves
230+ -- are not recorded as edit steps).
81231addsnake :: (Int -> Int -> Bool ) -> DL -> DL
82232addsnake cd dl
83233 | cd pi pj = addsnake cd $
84234 dl {poi = pi + 1 , poj = pj + 1 , path = path dl}
85235 | otherwise = dl
86236 where pi = poi dl; pj = poj dl
87237
88- lcs :: (a -> b -> Bool ) -> [a ] -> [b ] -> [DI ]
89- lcs eq as bs = path . head . dropWhile (\ dl -> poi dl /= lena || poj dl /= lenb) .
238+ -- | Compute shortest edit script (SES), as the minimum sequence of 'DI' edit
239+ -- steps that transforms @as@ into @bs@, returned in reverse order.
240+ --
241+ -- @ses eq as bs@ runs the Myers O(ND) diff algorithm following
242+ -- a five-step pipeline:
243+ --
244+ -- 1. __Seed__: create an initial 0-path wave front @[addsnake cd (DL 0 0 [])]@
245+ -- having a single node on the tip of the longest origin-sourced snake.
246+ -- 2. __Iterate__: apply 'dstep' repeatedly via 'iterate', producing an
247+ -- infinite list of wave fronts (one per edit distance D = 0, 1, 2, …).
248+ -- 3. __Flatten__: 'concat' all wave fronts into a single stream of 'DL' nodes.
249+ -- 4. __Find__: 'dropWhile' skips nodes until one reaches @(lena, lenb)@ — the
250+ -- bottom-right corner of the edit graph — which is the terminal node of a
251+ -- shortest edit script.
252+ -- 5. __Extract__: 'head' returns that node; its 'path' field carries the edit
253+ -- trace in reverse order.
254+ --
255+ -- This implementation is purely functional: rather than updating a shared
256+ -- diagonal frontier array in place, as in the original paper, it builds a new
257+ -- list of 'DL' nodes for each value of \( D \) and concatenates them into
258+ -- a single lazy stream. This is simpler but carries a larger per-node overhead:
259+ -- each 'DL' holds its own edit trace as a @['DI']@ list that structurally
260+ -- shares its tail with the parent node's trace (consing one step reuses the
261+ -- existing spine), rather than the paper's single-integer-per-diagonal
262+ -- representation. The asymptotic time
263+ -- and space complexity — \( O(ND) \) and \( O(D^2) \) respectively — is
264+ -- unchanged. Unlike the paper, which selects the better candidate per
265+ -- diagonal before extending its snake, 'dstep' extends snakes on /both/
266+ -- candidates before 'pairMaxes' selects the winner, discarding the other
267+ -- extension. This does not affect the time bound: on any given diagonal,
268+ -- all snake intervals — retained and discarded — are non-overlapping across
269+ -- successive values of \( D \), because each new candidate starts at or
270+ -- beyond the previous winner's endpoint. The total number of element
271+ -- comparisons across all snake extensions is therefore \( O(ND) \).
272+ ses :: (a -> b -> Bool ) -> [a ] -> [b ] -> [DI ]
273+ ses eq as bs = path . head . dropWhile (\ dl -> poi dl /= lena || poj dl /= lenb) .
90274 concat . iterate (dstep cd) . (: [] ) . addsnake cd $
91275 DL {poi= 0 ,poj= 0 ,path= [] }
92276 where cd = canDiag eq as bs lena lenb
@@ -113,13 +297,14 @@ getGroupedDiff = getGroupedDiffBy (==)
113297-- | A form of 'getDiff' with no 'Eq' constraint. Instead, an equality predicate
114298-- is taken as the first argument.
115299getDiffBy :: (a -> b -> Bool ) -> [a ] -> [b ] -> [PolyDiff a b ]
116- getDiffBy eq a b = markup a b . reverse $ lcs eq a b
300+ getDiffBy eq a b = markup a b . reverse $ ses eq a b
117301 where markup (x: xs) (y: ys) ds
118302 | eq x y = Both x y : markup xs ys ds
119303 markup (x: xs) ys (F : ds) = First x : markup xs ys ds
120304 markup xs (y: ys) (S : ds) = Second y : markup xs ys ds
121305 markup _ _ _ = []
122306
307+ -- | Like 'getGroupedDiff' but accepts a custom equality predicate.
123308getGroupedDiffBy :: (a -> b -> Bool ) -> [a ] -> [b ] -> [PolyDiff [a ] [b ]]
124309getGroupedDiffBy eq a b = go $ getDiffBy eq a b
125310 where go (First x : xs) = let (fs, rest) = goFirsts xs in First (x: fs) : go rest
0 commit comments