From 46ce94ef7aee8a54acff97caeae5d332054d7b3c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20G=C3=B3ngora?= <xavier.gongora@tweag.io>
Date: Mon, 13 Apr 2026 11:52:43 -0600
Subject: [PATCH 1/5] Document implementation of Myers algorithm

---
 src/Data/Algorithm/Diff.hs | 208 +++++++++++++++++++++++++++++++++++--
 1 file changed, 198 insertions(+), 10 deletions(-)

diff --git a/src/Data/Algorithm/Diff.hs b/src/Data/Algorithm/Diff.hs
index 1722bfa..0c47ca2 100644
--- a/src/Data/Algorithm/Diff.hs
+++ b/src/Data/Algorithm/Diff.hs
@@ -8,10 +8,56 @@
 -- Portability :  portable
 --
 -- This is an implementation of the diff algorithm as described in
--- /An \( O(ND) \) Difference Algorithm and Its Variations (1986)/
--- <http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.4.6927>.
+-- [/An \( O(ND) \) Difference Algorithm and Its Variations (1986)/
+-- by Eugene W. Myers](https://publications.mpi-cbg.de/Myers_1986_6330.pdf).
 -- For inputs of size \( O(N) \) with the number of differences \( D \)
 -- it has \( O(ND) \) time and \( O(D^2) \) space complexity.
+--
+-- == Algorithm overview
+--
+-- Finding the shortest edit script (SES) from a list \( as \) to a list \( bs \)
+-- is modelled as a shortest-path search on an /edit graph/: an
+-- \( (M+1) \times (N+1) \) grid of nodes \( (i, j) \),
+-- where \( M = |as| \) and \( N = |bs| \), with \( i \) increasing rightward
+-- and \( j \) increasing downward.
+-- Each node represents the state of having consumed \( i \) elements of \( as \)
+-- and \( j \) elements of \( bs \). Three types of move are possible:
+--
+-- * A /rightward/ move \( (i,j) \to (i+1,j) \) represents
+--   /deleting/ \( as[i] \) and costs one edit.
+-- * A /downward/ move  \( (i,j) \to (i,j+1) \) represents
+--   /inserting/ \( bs[j] \) and costs one edit.
+-- * A /diagonal/ move  \( (i,j) \to (i+1,j+1) \) is free (zero edit cost)
+--   and is only available when \( as[i] = bs[j] \).
+--
+-- The SES corresponds to a path from \( (0,0) \) to \( (M,N) \) that minimises
+-- the number of non-diagonal moves. The nodes at which diagonal moves are taken
+-- — the /match points/ — form the Longest Common Subsequence (LCS) of the two
+-- input lists, as established in the paper.
+--
+-- Both input lists are 0-indexed, which leads to a slightly different
+-- interpretation of the edit graph than in the original paper. In the paper,
+-- each node represents the state of the traversal /after/ an edit, so a move
+-- is the edit that /produced/ that node. Here, each node represents the state
+-- /before/ an edit, so a move is the edit performed /on/ that node to yield its
+-- successor. This distinction is only relevant when reading the implementation
+-- alongside the paper.
+--
+-- === K-diagonals and the BFS frontier
+--
+-- Every node \( (i,j) \) lies on the /k-diagonal/ \( k = i - j \).
+-- After exactly \( D \) non-diagonal moves, every reachable node lies on one of
+-- at most \( D+1 \) k-diagonals \( k \in \{-D,\,-D+2,\,\ldots,\,D-2,\,D\} \).
+-- On each diagonal it suffices to track only the /furthest-reaching/ node
+-- (the one with the largest \( i \)), collapsing the two-dimensional grid to a
+-- one-dimensional frontier indexed by \( k \).
+--
+-- The algorithm performs a BFS over \( D = 0, 1, 2, \ldots \), advancing
+-- the frontier by one edit at a time until a frontier node reaches the goal
+-- \( (M, N) \). The edit trace stored in that node is the SES, which
+-- 'getDiffBy' reconstructs into a 'PolyDiff' list. The term /trace/ here
+-- differs from the paper, where it denotes the sequence of k-diagonals visited
+-- by the SES path; that structure is not materialised in this implementation.
 -----------------------------------------------------------------------------
 
 {-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-}
@@ -31,12 +77,37 @@ import Prelude hiding (pi)
 import Data.Array (listArray, (!))
 import Data.Bifunctor
 
+-- | /Diff Instruction/ — an internal enum recording the direction of a single
+-- non-diagonal edge traversed in the Myers edit graph. Every non-diagonal
+-- move in the edit script is one of:
+--
+-- * 'F' — /First/ — a horizontal edge \( (i,j) \to (i+1,j) \), which
+--   corresponds to /deleting/ the element at position \( i \) of the first input
+--   sequence. The consumed element appears in the 'First' branch of the
+--   resulting 'PolyDiff'.
+--
+-- * 'S' — /Second/ — a vertical edge \( (i,j) \to (i,j+1) \), which
+--   corresponds to /inserting/ the element at position \( j \) of the second
+--   input sequence. The consumed element appears in the 'Second' branch of
+--   the resulting 'PolyDiff'.
+--
+-- Diagonal edges (free moves corresponding to equal elements) are /not/
+-- recorded as 'DI' steps; they are followed implicitly by 'addsnake' and
+-- produce 'Both' entries in the final output.
 data DI = F | S deriving (Show, Eq)
 
--- | A value is either from the 'First' list, the 'Second' or from 'Both'.
--- 'Both' contains both the left and right values, in case you are using a form
--- of equality that doesn't check all data (for example, if you are using a
--- newtype to only perform equality on side of a tuple).
+-- | A value tagged with which of two input sequences it came from.
+-- The type parameters @a@ and @b@ may differ, which is useful when comparing
+-- sequences of different element types via a custom equality predicate.
+--
+-- Each constructor corresponds to one outcome for a position in the aligned
+-- sequences:
+--
+-- * 'First' — the element exists only in the /first/ input (a deletion).
+-- * 'Second' — the element exists only in the /second/ input (an insertion).
+-- * 'Both' — the element is common to both inputs (part of the LCS).
+--   Both the left and right values are retained so that the original
+--   elements can be recovered even when equality ignores some fields.
 data PolyDiff a b = First a | Second b | Both a b
     deriving (Show, Eq)
 
@@ -53,31 +124,111 @@ instance Bifunctor PolyDiff where
 -- | This is 'PolyDiff' specialized so both sides are the same type.
 type Diff a = PolyDiff a a
 
-data DL = DL {poi :: !Int, poj :: !Int, path::[DI]} deriving (Show, Eq)
+-- | /D-path Location/ — a node on the BFS frontier of the Myers O(ND) diff
+-- algorithm.
+--
+-- Each frontier consists of one 'DL' per /k-diagonal/.  A 'DL' stores the
+-- endpoint coordinates and the edit trace of a \( D \)-path, i.e. a path from the
+-- origin \( (0,0) \) that uses exactly \( D \) non-diagonal edges.
+data DL = DL
+    { poi  :: !Int   -- ^ /Position On I/ — the @x@-coordinate of the endpoint
+                     --   in the edit graph, i.e. the number of elements
+                     --   consumed from the /first/ input sequence so far.
+    , poj  :: !Int   -- ^ /Position On J/ — the @y@-coordinate of the endpoint
+                     --   in the edit graph, i.e. the number of elements
+                     --   consumed from the /second/ input sequence so far.
+    , path :: [DI]   -- ^ The edit trace accumulated so far, stored in
+                     --   /reverse/ order (most recent step first).  Diagonal
+                     --   edges (matches) are not recorded here; only 'F' and
+                     --   'S' steps are stored.
+    } deriving (Show, Eq)
 
+-- | Ordering used by 'dstep' to select the /furthest-reaching/ D-path when
+-- two candidates compete for the same k-diagonal.
+--
+-- As in the Myers algorithm, it is enough to compare by 'poi': the candidate
+-- that has advanced further along the \( x \)-axis is the furthest-reaching
+-- endpoint on that diagonal.
+--
+-- When 'poi' values are equal, the instance prefers the node with the
+-- smaller 'poj' (equivalently, the higher k-diagonal). In practice this
+-- branch is never decisive within 'dstep': competing candidates always
+-- share a k-diagonal, so equal 'poi' implies equal 'poj'.
+--
+-- TODO: This instance is /not/ a lawful 'Ord': it violates reflexivity
+-- (@x '<=' x@ is 'False') because the equal-'poi' branch compares 'poj'
+-- with a strict @'>'@. This is harmless in the current context, since the
+-- only use of this instance is the 'max' call in 'dstep' — which always
+-- returns one of its arguments — and when both candidates occupy the same
+-- position, either choice is equivalent. This instance should either be
+-- made lawful or removed in favour of a local 'max'-like helper.
 instance Ord DL
         where x <= y = if poi x == poi y
                 then  poj x > poj y
                 else poi x <= poi y
 
+-- | Build a /diagonal predicate/ — a closure that tests whether position
+-- @(i, j)@ in the edit graph has a diagonal edge (a /match point/ in Myers'
+-- terminology).
+--
+-- Indices are 0-based (\( i \in [0, lena) \), \( j \in [0, lenb) \)),
+-- unlike the 1-based convention of the original paper.
+--
+-- The first two 'Int' parameters are the lengths of the input lists; the
+-- caller passes them in so that each length is computed only once.
 canDiag :: (a -> b -> Bool) -> [a] -> [b] -> Int -> Int -> Int -> Int -> Bool
 canDiag eq as bs lena lenb = \ i j ->
    if i < lena && j < lenb then (arAs ! i) `eq` (arBs ! j) else False
-    where arAs = listArray (0,lena - 1) as
-          arBs = listArray (0,lenb - 1) bs
+   where
+     -- Lists are converted into arrays to have O(1) lookups.
+     arAs = listArray (0,lena - 1) as
+     arBs = listArray (0,lenb - 1) bs
 
-dstep :: (Int -> Int -> Bool) -> [DL] -> [DL]
+-- | Perform one BFS expansion step, advancing every frontier 'DL' node by one
+-- edit (one non-diagonal edge) and then following any available snake.
+--
+-- For each existing frontier node the step produces two candidate successors:
+--
+-- * An 'F' (delete) move: 'poi' incremented by 1.
+-- * An 'S' (insert) move: 'poj' incremented by 1.
+--
+-- 'addsnake' is applied to each candidate immediately to extend it along any
+-- available sequence of matching elements.
+--
+-- The resulting candidate list interleaves the 'F' and 'S' successors of each
+-- frontier node. The head ('F' successor of the first node) is kept as-is, and
+-- 'pairMaxes' is applied to the tail — pairing each 'S' successor with the 'F'
+-- successor of the next frontier node. When this function is iterated from a
+-- single-node seed (as in 'lcs'), each such pair always lies on the same
+-- diagonal: an 'F' edge advances to the next higher diagonal while an 'S' edge
+-- retreats to the next lower one, so the two members of each pair straddle the
+-- same diagonal from opposite sides.
+dstep
+  :: (Int -> Int -> Bool) -- ^ Diagonal predicate
+  -> [DL]                 -- ^ Frontier of D-paths at edit distance D
+  -> [DL]                 -- ^ Frontier of D-paths at edit distance D+1
 dstep cd dls = hd:pairMaxes rst
   where (hd:rst) = nextDLs dls
+        -- Extend each frontier node by one edit step in both possible directions
+        -- and then follow any available snake from the resulting position.
         nextDLs [] = []
         nextDLs (dl:rest) = dl':dl'':nextDLs rest
           where dl'  = addsnake cd $ dl {poi=poi dl + 1, path=(F : pdl)}
                 dl'' = addsnake cd $ dl {poj=poj dl + 1, path=(S : pdl)}
                 pdl = path dl
+        -- Merge adjacent pairs of candidates to retain only the furthest-reaching.
         pairMaxes [] = []
         pairMaxes [x] = [x]
         pairMaxes (x:y:rest) = max x y:pairMaxes rest
 
+-- | Follow a /snake/ from the current position of a 'DL' node.
+--
+-- A snake is a sequence of diagonal (cost-free) edges in the edit graph,
+-- i.e. a run of equal elements that can be consumed simultaneously
+-- from both input sequences without counting as an edit.  Starting from
+-- @(poi dl, poj dl)@, this function advances both 'poi' and 'poj' as long
+-- as consecutive elements match, leaving 'path' unchanged (diagonal moves
+-- are not recorded as edit steps).
 addsnake :: (Int -> Int -> Bool) -> DL -> DL
 addsnake cd dl
     | cd pi pj = addsnake cd $
@@ -85,6 +236,42 @@ addsnake cd dl
     | otherwise   = dl
     where pi = poi dl; pj = poj dl
 
+-- | Compute the minimum sequence of 'DI' edit steps that transforms @as@ into
+-- @bs@, returned in reverse order. The result is in direct correspondence with
+-- the SES: its subsequence of /match points/ is the Longest Common Subsequence
+-- (LCS).
+--
+-- @lcs eq as bs@ runs the Myers O(ND) BFS algorithm following
+-- a five-step pipeline:
+--
+-- 1. __Seed__: create the initial single-node frontier @[addsnake cd (DL 0 0 [])]@
+--    corresponding to the upper bound of the longest origin-sourced snake.
+-- 2. __Iterate__: apply 'dstep' repeatedly via 'iterate', producing an
+--    infinite list of frontiers (one per edit distance D = 0, 1, 2, …).
+-- 3. __Flatten__: 'concat' all frontiers into a single stream of 'DL' nodes.
+-- 4. __Find__: 'dropWhile' skips nodes until one reaches @(lena, lenb)@ — the
+--    bottom-right corner of the edit graph — which is the terminal node of a
+--    shortest edit script.
+-- 5. __Extract__: 'head' returns that node; its 'path' field carries the edit
+--    trace in reverse order.
+--
+-- This implementation is purely functional: rather than updating a shared
+-- frontier array in place, as in the original paper, it builds a new list of
+-- 'DL' nodes for each value of \( D \) and concatenates them into a single
+-- lazy stream. This is simpler but carries a larger per-node overhead: each
+-- 'DL' holds its own edit trace as a @['DI']@ list that structurally shares
+-- its tail with the parent node's trace (consing one step reuses the
+-- existing spine), rather than the paper's single-integer-per-diagonal
+-- representation. The asymptotic time
+-- and space complexity — \( O(ND) \) and \( O(D^2) \) respectively — is
+-- unchanged. Unlike the paper, which selects the better candidate per
+-- diagonal before extending its snake, 'dstep' extends snakes on /both/
+-- candidates before 'pairMaxes' selects the winner, discarding the other
+-- extension. This does not affect the time bound: on any given diagonal,
+-- all snake intervals — retained and discarded — are non-overlapping across
+-- successive values of \( D \), because each new candidate starts at or
+-- beyond the previous winner's endpoint. The total number of element
+-- comparisons across all snake extensions is therefore \( O(ND) \).
 lcs :: (a -> b -> Bool) -> [a] -> [b] -> [DI]
 lcs eq as bs = path . head . dropWhile (\dl -> poi dl /= lena || poj dl /= lenb) .
             concat . iterate (dstep cd) . (:[]) . addsnake cd $
@@ -120,6 +307,7 @@ getDiffBy eq a b = markup a b . reverse $ lcs eq a b
           markup   xs   (y:ys) (S:ds) = Second y : markup xs ys ds
           markup _ _ _ = []
 
+-- | Like 'getGroupedDiff' but accepts a custom equality predicate.
 getGroupedDiffBy :: (a -> b -> Bool) -> [a] -> [b] -> [PolyDiff [a] [b]]
 getGroupedDiffBy eq a b = go $ getDiffBy eq a b
     where go (First x  : xs) = let (fs, rest) = goFirsts  xs in First  (x:fs)     : go rest

From 2c050024960651eae2fcf5548c1fe82dc94ea31d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20G=C3=B3ngora?= <xavier.gongora@tweag.io>
Date: Wed, 15 Apr 2026 15:35:42 -0600
Subject: [PATCH 2/5] Define lengths in prose

---
 src/Data/Algorithm/Diff.hs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Data/Algorithm/Diff.hs b/src/Data/Algorithm/Diff.hs
index 0c47ca2..eebe3e9 100644
--- a/src/Data/Algorithm/Diff.hs
+++ b/src/Data/Algorithm/Diff.hs
@@ -18,8 +18,8 @@
 -- Finding the shortest edit script (SES) from a list \( as \) to a list \( bs \)
 -- is modelled as a shortest-path search on an /edit graph/: an
 -- \( (M+1) \times (N+1) \) grid of nodes \( (i, j) \),
--- where \( M = |as| \) and \( N = |bs| \), with \( i \) increasing rightward
--- and \( j \) increasing downward.
+-- where \( M \) and \( N \) are the lengths of \( as \) and \( bs \) respectively,
+-- with \( i \) increasing rightward and \( j \) increasing downward.
 -- Each node represents the state of having consumed \( i \) elements of \( as \)
 -- and \( j \) elements of \( bs \). Three types of move are possible:
 --

From de154587d52d7086b89fa7b78365de3d64183be1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20G=C3=B3ngora?= <xavier.gongora@tweag.io>
Date: Thu, 16 Apr 2026 16:38:34 -0600
Subject: [PATCH 3/5] Use wave front instead of BFS frontier

---
 src/Data/Algorithm/Diff.hs | 49 +++++++++++++++++++-------------------
 1 file changed, 25 insertions(+), 24 deletions(-)

diff --git a/src/Data/Algorithm/Diff.hs b/src/Data/Algorithm/Diff.hs
index eebe3e9..9cc6fa5 100644
--- a/src/Data/Algorithm/Diff.hs
+++ b/src/Data/Algorithm/Diff.hs
@@ -43,17 +43,17 @@
 -- successor. This distinction is only relevant when reading the implementation
 -- alongside the paper.
 --
--- === K-diagonals and the BFS frontier
+-- === K-diagonals and the wave front
 --
 -- Every node \( (i,j) \) lies on the /k-diagonal/ \( k = i - j \).
 -- After exactly \( D \) non-diagonal moves, every reachable node lies on one of
 -- at most \( D+1 \) k-diagonals \( k \in \{-D,\,-D+2,\,\ldots,\,D-2,\,D\} \).
 -- On each diagonal it suffices to track only the /furthest-reaching/ node
 -- (the one with the largest \( i \)), collapsing the two-dimensional grid to a
--- one-dimensional frontier indexed by \( k \).
+-- one-dimensional /wave front/ indexed by \( k \).
 --
--- The algorithm performs a BFS over \( D = 0, 1, 2, \ldots \), advancing
--- the frontier by one edit at a time until a frontier node reaches the goal
+-- The algorithm performs a bread-first search over \( D = 0, 1, 2, \ldots \),
+-- advancing the wave front by one edit at a time until a node reaches the goal
 -- \( (M, N) \). The edit trace stored in that node is the SES, which
 -- 'getDiffBy' reconstructs into a 'PolyDiff' list. The term /trace/ here
 -- differs from the paper, where it denotes the sequence of k-diagonals visited
@@ -124,10 +124,10 @@ instance Bifunctor PolyDiff where
 -- | This is 'PolyDiff' specialized so both sides are the same type.
 type Diff a = PolyDiff a a
 
--- | /D-path Location/ — a node on the BFS frontier of the Myers O(ND) diff
+-- | /D-path Location/ — a node on the wave front of the Myers O(ND) diff
 -- algorithm.
 --
--- Each frontier consists of one 'DL' per /k-diagonal/.  A 'DL' stores the
+-- Each wave front consists of one 'DL' per /k-diagonal/.  A 'DL' stores the
 -- endpoint coordinates and the edit trace of a \( D \)-path, i.e. a path from the
 -- origin \( (0,0) \) that uses exactly \( D \) non-diagonal edges.
 data DL = DL
@@ -184,10 +184,11 @@ canDiag eq as bs lena lenb = \ i j ->
      arAs = listArray (0,lena - 1) as
      arBs = listArray (0,lenb - 1) bs
 
--- | Perform one BFS expansion step, advancing every frontier 'DL' node by one
--- edit (one non-diagonal edge) and then following any available snake.
+-- | Perform one breath-first search expansion step, advancing every wave front
+-- 'DL' node by one 'DI' edit (one non-diagonal edge) and then following
+-- any available snake.
 --
--- For each existing frontier node the step produces two candidate successors:
+-- For each node, 'dstep' produces two candidate successors by adding:
 --
 -- * An 'F' (delete) move: 'poi' incremented by 1.
 -- * An 'S' (insert) move: 'poj' incremented by 1.
@@ -196,20 +197,20 @@ canDiag eq as bs lena lenb = \ i j ->
 -- available sequence of matching elements.
 --
 -- The resulting candidate list interleaves the 'F' and 'S' successors of each
--- frontier node. The head ('F' successor of the first node) is kept as-is, and
+-- wave front node. The head ('F' successor of the first node) is kept as-is, and
 -- 'pairMaxes' is applied to the tail — pairing each 'S' successor with the 'F'
--- successor of the next frontier node. When this function is iterated from a
+-- successor of the next wave front node. When this function is iterated from a
 -- single-node seed (as in 'lcs'), each such pair always lies on the same
 -- diagonal: an 'F' edge advances to the next higher diagonal while an 'S' edge
 -- retreats to the next lower one, so the two members of each pair straddle the
 -- same diagonal from opposite sides.
 dstep
   :: (Int -> Int -> Bool) -- ^ Diagonal predicate
-  -> [DL]                 -- ^ Frontier of D-paths at edit distance D
-  -> [DL]                 -- ^ Frontier of D-paths at edit distance D+1
+  -> [DL]                 -- ^ Wave front of D-paths at edit distance D
+  -> [DL]                 -- ^ Wave front of D-paths at edit distance D+1
 dstep cd dls = hd:pairMaxes rst
   where (hd:rst) = nextDLs dls
-        -- Extend each frontier node by one edit step in both possible directions
+        -- Extend each node by one edit step in both possible directions
         -- and then follow any available snake from the resulting position.
         nextDLs [] = []
         nextDLs (dl:rest) = dl':dl'':nextDLs rest
@@ -241,14 +242,14 @@ addsnake cd dl
 -- the SES: its subsequence of /match points/ is the Longest Common Subsequence
 -- (LCS).
 --
--- @lcs eq as bs@ runs the Myers O(ND) BFS algorithm following
+-- @lcs eq as bs@ runs the Myers O(ND) diff algorithm following
 -- a five-step pipeline:
 --
--- 1. __Seed__: create the initial single-node frontier @[addsnake cd (DL 0 0 [])]@
---    corresponding to the upper bound of the longest origin-sourced snake.
+-- 1. __Seed__: create an initial 0-path wave front @[addsnake cd (DL 0 0 [])]@
+--    whose single node sits at the tip of the longest origin-sourced snake.
 -- 2. __Iterate__: apply 'dstep' repeatedly via 'iterate', producing an
---    infinite list of frontiers (one per edit distance D = 0, 1, 2, …).
--- 3. __Flatten__: 'concat' all frontiers into a single stream of 'DL' nodes.
+--    infinite list of wave fronts (one per edit distance D = 0, 1, 2, …).
+-- 3. __Flatten__: 'concat' all wave fronts into a single stream of 'DL' nodes.
 -- 4. __Find__: 'dropWhile' skips nodes until one reaches @(lena, lenb)@ — the
 --    bottom-right corner of the edit graph — which is the terminal node of a
 --    shortest edit script.
@@ -256,11 +257,11 @@ addsnake cd dl
 --    trace in reverse order.
 --
 -- This implementation is purely functional: rather than updating a shared
--- frontier array in place, as in the original paper, it builds a new list of
--- 'DL' nodes for each value of \( D \) and concatenates them into a single
--- lazy stream. This is simpler but carries a larger per-node overhead: each
--- 'DL' holds its own edit trace as a @['DI']@ list that structurally shares
--- its tail with the parent node's trace (consing one step reuses the
+-- diagonal frontier array in place, as in the original paper, it builds a new
+-- list of 'DL' nodes for each value of \( D \) and concatenates them into
+-- a single lazy stream. This is simpler but carries a larger per-node overhead:
+-- each 'DL' holds its own edit trace as a @['DI']@ list that structurally
+-- shares its tail with the parent node's trace (consing one step reuses the
 -- existing spine), rather than the paper's single-integer-per-diagonal
 -- representation. The asymptotic time
 -- and space complexity — \( O(ND) \) and \( O(D^2) \) respectively — is

From 19a9566a4981b58fb547c390957381f29eb7aff1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20G=C3=B3ngora?= <xavier.gongora@tweag.io>
Date: Thu, 16 Apr 2026 18:05:06 -0600
Subject: [PATCH 4/5] Change `lcs` to `ses`

---
 src/Data/Algorithm/Diff.hs | 22 +++++++++-------------
 test/Test.hs               |  5 +----
 2 files changed, 10 insertions(+), 17 deletions(-)

diff --git a/src/Data/Algorithm/Diff.hs b/src/Data/Algorithm/Diff.hs
index 9cc6fa5..9b57d26 100644
--- a/src/Data/Algorithm/Diff.hs
+++ b/src/Data/Algorithm/Diff.hs
@@ -31,9 +31,7 @@
 --   and is only available when \( as[i] = bs[j] \).
 --
 -- The SES corresponds to a path from \( (0,0) \) to \( (M,N) \) that minimises
--- the number of non-diagonal moves. The nodes at which diagonal moves are taken
--- — the /match points/ — form the Longest Common Subsequence (LCS) of the two
--- input lists, as established in the paper.
+-- the number of non-diagonal moves.
 --
 -- Both input lists are 0-indexed, which leads to a slightly different
 -- interpretation of the edit graph than in the original paper. In the paper,
@@ -105,7 +103,7 @@ data DI = F | S deriving (Show, Eq)
 --
 -- * 'First' — the element exists only in the /first/ input (a deletion).
 -- * 'Second' — the element exists only in the /second/ input (an insertion).
--- * 'Both' — the element is common to both inputs (part of the LCS).
+-- * 'Both' — the element is common to both inputs.
 --   Both the left and right values are retained so that the original
 --   elements can be recovered even when equality ignores some fields.
 data PolyDiff a b = First a | Second b | Both a b
@@ -200,7 +198,7 @@ canDiag eq as bs lena lenb = \ i j ->
 -- wave front node. The head ('F' successor of the first node) is kept as-is, and
 -- 'pairMaxes' is applied to the tail — pairing each 'S' successor with the 'F'
 -- successor of the next wave front node. When this function is iterated from a
--- single-node seed (as in 'lcs'), each such pair always lies on the same
+-- single-node seed (as in 'ses'), each such pair always lies on the same
 -- diagonal: an 'F' edge advances to the next higher diagonal while an 'S' edge
 -- retreats to the next lower one, so the two members of each pair straddle the
 -- same diagonal from opposite sides.
@@ -237,12 +235,10 @@ addsnake cd dl
     | otherwise   = dl
     where pi = poi dl; pj = poj dl
 
--- | Compute the minimum sequence of 'DI' edit steps that transforms @as@ into
--- @bs@, returned in reverse order. The result is in direct correspondence with
--- the SES: its subsequence of /match points/ is the Longest Common Subsequence
--- (LCS).
+-- | Compute the shortest edit script (SES), i.e. the minimum sequence of 'DI'
+-- edit steps that transforms @as@ into @bs@, returned in reverse order.
 --
--- @lcs eq as bs@ runs the Myers O(ND) diff algorithm following
+-- @ses eq as bs@ runs the Myers O(ND) diff algorithm following
 -- a five-step pipeline:
 --
 -- 1. __Seed__: create an initial 0-path wave front @[addsnake cd (DL 0 0 [])]@
@@ -273,8 +269,8 @@ addsnake cd dl
 -- successive values of \( D \), because each new candidate starts at or
 -- beyond the previous winner's endpoint. The total number of element
 -- comparisons across all snake extensions is therefore \( O(ND) \).
-lcs :: (a -> b -> Bool) -> [a] -> [b] -> [DI]
-lcs eq as bs = path . head . dropWhile (\dl -> poi dl /= lena || poj dl /= lenb) .
+ses :: (a -> b -> Bool) -> [a] -> [b] -> [DI]
+ses eq as bs = path . head . dropWhile (\dl -> poi dl /= lena || poj dl /= lenb) .
             concat . iterate (dstep cd) . (:[]) . addsnake cd $
             DL {poi=0,poj=0,path=[]}
             where cd = canDiag eq as bs lena lenb
@@ -301,7 +297,7 @@ getGroupedDiff = getGroupedDiffBy (==)
 -- | A form of 'getDiff' with no 'Eq' constraint. Instead, an equality predicate
 -- is taken as the first argument.
 getDiffBy :: (a -> b -> Bool) -> [a] -> [b] -> [PolyDiff a b]
-getDiffBy eq a b = markup a b . reverse $ lcs eq a b
+getDiffBy eq a b = markup a b . reverse $ ses eq a b
     where markup (x:xs) (y:ys) ds
             | eq x y = Both x y : markup xs ys ds
           markup (x:xs)   ys   (F:ds) = First x  : markup xs ys ds
diff --git a/test/Test.hs b/test/Test.hs
index af38ab6..aafd26b 100644
--- a/test/Test.hs
+++ b/test/Test.hs
@@ -99,10 +99,7 @@ prop_sub xs ys = isSub xs ys == elem xs (subs ys)
 prop_everySubIsSub xs = all (flip isSub xs) (subs xs)
 
 
--- | Obtains a longest common subsequence of two lists using their
--- diff. Note that there is an @lcs@ function in the
--- 'Data.Algorithm.Diff' module, but it's not exported. It's trivial
--- to reconstruct the LCS though, just by taking the 'B' elements.
+-- | Obtains a longest common subsequence of two lists using their diff.
 diffLCS :: (Eq a) => [a] -> [a] -> [a]
 diffLCS xs ys = recoverLCS $ getDiff xs ys
 

From 97f2314d10d3b349d0cce295bdef438065b131ad Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20G=C3=B3ngora?= <xavier.gongora@tweag.io>
Date: Fri, 17 Apr 2026 10:21:44 -0600
Subject: [PATCH 5/5] Fix typo

---
 src/Data/Algorithm/Diff.hs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Data/Algorithm/Diff.hs b/src/Data/Algorithm/Diff.hs
index 9b57d26..42024c9 100644
--- a/src/Data/Algorithm/Diff.hs
+++ b/src/Data/Algorithm/Diff.hs
@@ -50,7 +50,7 @@
 -- (the one with the largest \( i \)), collapsing the two-dimensional grid to a
 -- one-dimensional /wave front/ indexed by \( k \).
 --
--- The algorithm performs a bread-first search over \( D = 0, 1, 2, \ldots \),
+-- The algorithm performs a breadth-first search over \( D = 0, 1, 2, \ldots \),
 -- advancing the wave front by one edit at a time until a node reaches the goal
 -- \( (M, N) \). The edit trace stored in that node is the SES, which
 -- 'getDiffBy' reconstructs into a 'PolyDiff' list. The term /trace/ here
@@ -182,7 +182,7 @@ canDiag eq as bs lena lenb = \ i j ->
      arAs = listArray (0,lena - 1) as
      arBs = listArray (0,lenb - 1) bs
 
--- | Perform one breath-first search expansion step, advancing every wave front
+-- | Perform one breadth-first search expansion step, advancing every wave front
 -- 'DL' node by one 'DI' edit (one non-diagonal edge) and then following
 -- any available snake.
 --