Longest common subsequence: folklore algorithm

Here is one Tcl implementation of the "folklore algorithm" for longest common subsequence.

Hunt and McIlroy have published a much better algorithm (used in the Unix 'diff' command) that is implemented at diff in Tcl. This page is here to hold some of the historical discussion.


namespace eval list {}

namespace eval list::longestCommonSubsequence {
    namespace export compare
}

# Internal procedure that indexes into the 2-dimensional array t,
# which corresponds to the sequence y, looking for the (i,j)th element.

proc list::longestCommonSubsequence::Index { t y i j } {
    set indx [expr { ([llength $y] + 1) * ($i + 1) + ($j + 1) }]
    return [lindex $t $indx]
}


# Internal procedure that implements Levenshtein to derive the longest
# common subsequence of two lists x and y.
proc list::longestCommonSubsequence::ComputeLCS { x y } {
    set t [list]
    for { set i -1 } { $i < [llength $y] } { incr i } {
        lappend t 0
    }
    for { set i 0 } { $i < [llength $x] } { incr i } {
        lappend t 0
        for { set j 0 } { $j < [llength $y] } { incr j } {
            if { [lindex $x $i] eq [lindex $y $j] } {
                set lastT [Index $t $y [expr { $i - 1 }] [expr {$j - 1}]]
                set nextT [expr {$lastT + 1}]
            } else {
                set lastT1 [Index $t $y $i [expr { $j - 1 }]]
                set lastT2 [Index $t $y [expr { $i - 1 }] $j]
                if { $lastT1 > $lastT2 } {
                    set nextT $lastT1
                } else {
                    set nextT $lastT2
                }
            }
            lappend t $nextT
        }
    }
    return $t
}


# Internal procedure that traces through the array built by ComputeLCS
# and finds a longest common subsequence -- specifically, the one that
# is lexicographically first.
proc list::longestCommonSubsequence::TraceLCS { t x y } {
    set trace {}
    set i [expr { [llength $x] - 1 }]
    set j [expr { [llength $y] - 1 }]
    while { $i >= 0 && $j >= 0 } {
        set im1 [expr { $i - 1 }]
        set jm1 [expr { $j - 1 }]
        if { [Index $t $y $i $j] == [Index $t $y $im1 $jm1] + 1
             && [lindex $x $i] eq [lindex $y $j] } {
            lappend trace xy [list $i $j]
            set i $im1
            set j $jm1
        } elseif { [Index $t $y $im1 $j] > [Index $t $y $i $jm1] } {
            lappend trace x $i
            set i $im1
        } else {
            lappend trace y $j
            set j $jm1
        }
    }
    while { $i >= 0 } {
        lappend trace x $i
        incr i -1
    }
    while { $j >= 0 } {
        lappend trace y $j
        incr j -1
    }
    return $trace
}


# list::longestCommonSubsequence::compare --
#
#       Compare two lists for the longest common subsequence
#
# Arguments:
#       x, y - Two lists of strings to compare
#       matched - Callback to execute on matched elements, see below
#       unmatchedX - Callback to execute on unmatched elements from the
#                    first list, see below.
#       unmatchedY - Callback to execute on unmatched elements from the
#                    second list, see below.
#
# Results:
#       None.
#
# Side effects:
#       Whatever the callbacks do.
#
# The 'compare' procedure compares the two lists of strings, x and y.
# It finds a longest common subsequence between the two.  It then walks
# the lists in order and makes the following callbacks:
#
# For an element that is common to both lists, it appends the index in
# the first list, the index in the second list, and the string value of
# the element as three parameters to the 'matched' callback, and executes
# the result.
#
# For an element that is in the first list but not the second, it appends
# the index in the first list and the string value of the element as two
# parameters to the 'unmatchedX' callback and executes the result.
#
# For an element that is in the second list but not the first, it appends
# the index in the second list and the string value of the element as two
# parameters to the 'unmatchedY' callback and executes the result.

proc list::longestCommonSubsequence::compare { x y
                                               matched
                                               unmatchedX unmatchedY } {
    set t [ComputeLCS $x $y]
    set trace [TraceLCS $t $x $y]
    set i [llength $trace]
    while { $i > 0 } {
        set indices [lindex $trace [incr i -1]]
        set type [lindex $trace [incr i -1]]
        switch -exact -- $type {
            xy {
                set c $matched
                eval lappend c $indices
                lappend c [lindex $x [lindex $indices 0]]
                uplevel 1 $c
            }
            x {
                set c $unmatchedX
                lappend c $indices
                lappend c [lindex $x $indices]
                uplevel 1 $c
            }
            y {
                set c $unmatchedY
                lappend c $indices
                lappend c [lindex $y $indices]
                uplevel 1 $c
            }
        }
    }
    return
}

# With this code in hand, we can now write the external parts of a diff command. The various options of diff alter how it displays the comparison, but not its fundamental operation. Here's an external wrapper that gives very simple-minded output.

namespace import list::longestCommonSubsequence::compare

proc umx { index value } {
    variable lastx
    variable xlines
    append xlines "< " $value \n
    set lastx $index
}


proc umy { index value } {
    variable lasty
    variable ylines
    append ylines "> " $value \n
    set lasty $index
}


proc matched { index1 index2 value } {
    variable lastx
    variable lasty
    variable xlines
    variable ylines
    if { [info exists lastx] && [info exists lasty] } {
        puts "[expr { $lastx + 1 }],${index1}c[expr {$lasty + 1 }],${index2}"
        puts -nonewline $xlines
        puts "----"
        puts -nonewline $ylines
    } elseif { [info exists lastx] } {
        puts "[expr { $lastx + 1 }],${index1}d${index2}"
        puts -nonewline $xlines
    } elseif { [info exists lasty] } {
        puts  "${index1}a[expr {$lasty + 1 }],${index2}"
        puts -nonewline $ylines
    }
    catch { unset lastx }
    catch { unset xlines }
    catch { unset lasty }
    catch { unset ylines }
}


# Really, we should read the first file in like this:
#    set f0 [open [lindex $argv 0] r]
#    set x [split [read $f0] \n]
#    close $f0
# But I'll just provide some sample lines:

set x {}
for { set i 0 } { $i < 10 } { incr i } {
    lappend x a r a d e d a b r a x
}

# The second file, too, should be read in like this:
#    set f1 [open [lindex $argv 1] r]
#    set y [split [read $f1] \n]
#    close $f1
# Once again, I'll just do some sample lines.

set y {}
for { set i 0 } { $i < 10 } { incr i } {
    lappend y a b r a c a d a b r a
}

compare $x $y matched umx umy
matched [llength $x] [llength $y] {}