Version 8 of LZ77 Compression

Updated 2004-09-10 12:01:48

RHS 09Spt2004

LZ77 compression is a precursor to LZW compression (An LZW-compressing virtual filesystem), I believe. When working on the Binary image compression challenge, I became interested in writing a pure-tcl implementation of gzip. As gzip uses LZ77 (and Huffman encoding), I decided to start by implementing such an algorithm. Sadly, its spectacularly slow, so its not worthwhile in its present form. I figured it would be worth posting, however, in case anyone wanted to play with it.

 # #############################################
 # FILE: lz77.tcl - the code for the compression
 namespace eval lz77 {
    variable lookback 255

    variable Escape1 "\x01"
    variable Escape2 "\x02"

    variable EscEsc "\x01"

    set K32 [expr {32 * 1024}]
 }

 proc ::lz77::K {a b} { return $a }

 proc ::lz77::encode {data} {
    variable Escape1
    variable Escape2
    variable EscEsc
    variable K32

    set output ""
    set dataLength [string length $data]

    append output [string index $data 0]

    for {set i 1} {$i < $dataLength} {incr i} {
        set foundAt  0
        set foundLen 0

        set found [maxSubstring $i $data length offset]

        if { $length > 0 } {
            if { $offset < 255 & $foundLen < 255 } {
                set offChar [format %c $offset]
                set lenChar [format %c $length]
                append output $Escape1$lenChar$offChar
                incr i $length
                incr i -1
            } else {
                append output $Escape2
                append output [binary format S $length]
                append output [binary format S $offset]
                incr i $length
                incr i -1
            }
        } else {
            set char [string index $data $i]
            if { [string equal $char "$Escape1"] } {
                append output "$Escape1$EscEsc"
            } else {
                append output $char
            }
        }
    }

    return $output
 }

 proc ::lz77::decode {data} {
    variable Escape1
    variable Escape2
    variable EscEsc

    set output ""
    for {set i 0} {$i < [string length $data]} {incr i} {
        set char [string index $data $i]
        if { ($char ne "$Escape1") && ($char ne "$Escape2") } {
            append output $char
        } elseif { [string index $data [expr {$i + 1}]] eq "$EscEsc" } {
            append output $char
            incr i
        } else {
            if { $char eq $Escape1 } {
                scan [string index $data [incr i]] %c length
                scan [string index $data [incr i]] %c offset
                set index [expr {[string length $output] - $offset}]
                for {set j 0} {$j < $length} {incr j} {
                    append output [string index $output $index]
                    incr index
                }
            } else {
                binary scan [string range $data [incr i] [incr i]] S length
                binary scan [string range $data [incr i] [incr i]] S offset
                set index [expr {[string length $output] - $offset}]
                for {set j 0} {$j < $length} {incr j} {
                    append output [string index $output $index]
                    incr index
                }
            }                
        }
    }
    return $output
 }

 proc ::lz77::maxSubstring {index data &matchLength &matchOffset} {
    upvar ${&matchLength} matchLength
    upvar ${&matchOffset} matchOffset

    variable K32

    set matchLength 0
    set matchOffset 0

    if { $index > $K32 } {
        set data [string range [K $data [unset data]] [expr {$index - $K32}] end]
        set index $K32
    }

    set dataLength [string length $data]
    set string [string range $data $index end]

    if { [string length $string] >= $K32 } {
        set string [string range $string 0 [expr {$K32 -2}]]
    }

    # If data is > 32k, get the part we need
    if { $index > $K32 } {
        set start [expr {$index - $K32}]
    } else {
        set start 0
    }

    if { $dataLength > ($index + $K32) } {
        set end [expr {$index + $K32}]
    } else {
        set end end
    }
    set currData [string range $data $start $end]

    while {[string length $string] >= 3} {
        set first 0
         while {
                ([set first [string first $string $currData $first]] >= 0)
            } {
            if { $first < $index } {
                set len [string length $string]
                if { ($index - $first) > 255 && $len < 5 } {
                } else {
                    if { $len > $matchLength } {
                        set matchOffset [expr {$index - $first}]
                        set matchLength $len
                    }
                }
            } else {
                if { $first > $index } {
                    break
                }
            }
            incr first
        }
        if { $matchLength > 0 } {
            return 1
        }

        set string [string range $string 0 end-1]
        if { ([string length $string] + $index) > [string length $currData] } {
            set currData [string range [K $currData [unset currData]] \
                    [expr {$index + [string length $string]}]]
        }
    }

    return 0
 }

And, the tests...

 # #############################################
 # FILE: lz77.test - tests for lz77
 package require tcltest
 namespace import tcltest::*

 source [file join [file dirname [info script]] gz.tcl]

 # ========================================
 test encode-1.1 {
    encode a simple string in using LZ77
 } -body {
    set encoded [::lz77::encode {abcdebcdef}]
    set expect "abcde\x01\x04\x04"
    append expect f
    if { [string length $expect] != [string length $encoded] } {
        binary scan $expect c* exList
        binary scan $encoded c* enList
        puts "ENCODED: $exList -> $enList"

        return "String lengths were not the same: \
                [string length $expect] != [string length $encoded]"
    }
    if { ![string equal $expect $encoded] } {
        binary scan $expect c* exList
        binary scan $encoded c* enList
        puts "EXPECTED: $exList\nENCODED : $enList"
        return "Strings were not equal"
    }
 } -result {}

 test encode-1.2 {
    encode a simple string in using LZ77
 } -body {
    set encoded [::lz77::encode {Blah blah blah blah blah!}]
    set expect "Blah b\x01\x12\x05!"
    if { [string length $expect] != [string length $encoded] } {
        return "String lengths were not the same: \
                [string length $expect] != [string length $encoded]"
    }
    if { ![string equal $expect $encoded] } {
        return "Strings were not equal"
    }
 } -result {}

 test encode-1.3 {
    encode a string with multiple matches
 } -body {
    set string {This is a string with multiple strings within it}
    set expect "This \x01\x03\x03"
    append expect "a string with multiple\x01\x07\x15"
    append expect "s\x01\x05\x16"
    append expect "in it"

    set encoded [::lz77::encode $string]
    if { [string length $expect] != [string length $encoded] } {
        binary scan $expect c* exList
        binary scan $encoded c* enList
        puts "EXPECT: $exList"
        puts "ENCODE: $enList"

        return "String lengths were not the same: \
                [string length $expect] != [string length $encoded]"
    }
    if { ![string equal $expect $encoded] } {
        binary scan $expect c* exList
        binary scan $encoded c* enList
        puts "EXPECT: $exList"
        puts "ENCODE: $enList"

        return "Strings were not equal"
    }
 } -result {}

 test encode-1.4 {
    Encode a long (>255 <255*255) string, to use second escape
 } -setup {
    set original "abcdefghij"
    for {set i 0} {$i < 254} {incr i} {
        append original [expr {$i % 10}]
    }
    append original "abcdefg"

    set expect "abcdefghij0123456789"
    append expect "\x01\xF4\x0a"
    append expect "\x02\x00\x07\x01\x08"
 } -body {
    set encoded [::lz77::encode $original]

    if { ![string equal $expect $encoded] } {
        binary scan $expect c* exList
        binary scan $encoded c* enList
        puts "EXPECT: $exList"
        puts "ENCODE: $enList"

        return "Strings were not equal"
    }
 } -result {}    

 # ========================================
 test decode-1.1 {
    decode a simple string using LZ77
 } -body {
    set decoded [::lz77::decode "Blah b\x01\x12\x05!"]
    set expect {Blah blah blah blah blah!}
    if { [string length $expect] != [string length $decoded] } {
        puts "$expect != $decoded"
        return "String lengths were not the same: \
                [string length $expect] != [string length $decoded]"
    }
    if { ![string equal $expect $decoded] } {
        puts "$expect != $decoded"
        return "Strings were not equal"
    }
 } -result {}

 # ========================================
 test cycle-1.1 {
    cycle a string through encode and decode
 } -body {
    set original "This is a string with multiple strings within it"
    set encoded [::lz77::encode $original]
    set changed [::lz77::decode $encoded]
    if { ![string equal $original $changed] } {
        puts "Not Equal:\nORIGINAL: $original\nENCODED : $changed"
        binary scan $encoded c* enList
        puts $enList
        return "The strings were not equal"
    }
    return
 } -result {}

 test cycle-1.2 {
    cycle a string through encode and decode
 } -body {
    set original "This is a string of text, \
                  whereherehereherehe parts of the string\
                  have text that is in other parts of the string"
    set encoded [::lz77::encode $original]
    set changed [::lz77::decode $encoded]
    if { ![string equal $original $changed] } {
        puts "Not Equal:\nORIGINAL: $original\nENCODED : $changed"
        binary scan $encoded c* enList
        puts $enList
        return "The strings were not equal"
    }
    return
 } -result {}

 test cycle-1.3 {
    Encode a long (>255 <255*255) string, to use second escape\
        and decode it
 } -setup {
    set original "abcdefghij"
    for {set i 0} {$i < 254} {incr i} {
        append original [expr {$i % 10}]
    }
    append original "abcdefg"
 } -body {
    set encoded [::lz77::encode $original]
    set decoded [::lz77::decode $encoded]

    if { ![string equal $original $decoded] } {
        binary scan $original c* orList
        binary scan $decoded c* deList
        puts "ORIGINAL: $orList"
        puts "DECODED : $deList"

        return "Strings were not equal"
    }
 } -result {}

 # ========================================
 # Special cases
 # When there's an escape in the input text
 #   Encode it as \x01\x01, since we can't have a repeat length of 1
 test escape-1.1 {
    An escape in the input data is coded as the escape, followed\
        by \x01
 } -setup {
    unset -nocomplain original expect encoded exList enList
 } -body {
    set original "ab\x01" ; append original cd
    set expect "ab\x01\x01" ; append expect cd
    set encoded [::lz77::encode $original]

    if { ![string equal $expect $encoded] } {
        binary scan $expect c* exList
        binary scan $encoded c* enList
        puts "$exList ->\n$enList"
        return "The strings were not equal"
    }
 } -result {}

 test escape-1.2 {
    An escape in the input data is coded as the escape, followed\
        by \x01
 } -setup {
    unset -nocomplain original expect encoded exList enList
 } -body {
    set original "ab\x01" ; append original cd
    set encoded [::lz77::encode $original]
    set decoded [::lz77::decode $encoded]

    if { ![string equal $original $decoded] } {
        binary scan $original c* orList
        binary scan $decoded c* deList
        puts "$orList ->\n$deList"
        return "The strings were not equal"
    }
 } -result {}

 test escape-1.2 {
    An escaped escape should not interfere with runs surrounding it
 } -setup {
    unset -nocomplain original expect encoded exList enList
 } -body {
    set original "abcdebcde\x01" ; append original cd
    set expect "abcde\x01\x04\x04\x01\x01" ; append expect cd
    set encoded [::lz77::encode $original]

    if { ![string equal $expect $encoded] } {
        binary scan $expect c* exList
        binary scan $encoded c* enList
        puts "$exList ->\n$enList"
        return "The strings were not equal"
    }
 } -result {}

 # If there's multiple matches, get the longest one possible
 #   "These blah is blah blah blah!"
 #                       ^ the match for here        
 #                  ^ should start here
 #          ^ not here
 test longest-1.1 {
    Get the longest match possible
 } -setup {
    unset -nocomplain original expect encoded exList enList
    set original {These blah is blah blah blah!}
    set expect "These blah is\x01\x06\x08"
    append expect "\x01\x09\x05!"
 } -body {
    set encoded [::lz77::encode $original]

    if { ![string equal $expect $encoded] } {
        binary scan $expect c* exList
        binary scan $encoded c* enList
        puts "Expect: $exList ->\nEncode: $enList"
        puts "DECODE: [::lz77::decode $encoded]"
        return "The strings were not equal"
    }
 } -result {}

 # ========================================
 test maxSubstring-1.1 {
    Find the max substring for a string with only one match
 } -setup {
    unset -nocomplain length offset
    set string {abcdefcdefg}
    set index 6
 } -body {
    set flag [::lz77::maxSubstring $index $string length offset]
    list $flag $length $offset
 } -result {1 4 4}

 test maxSubstring-1.2 {
    Return 0 if no match
 } -setup {
    unset -nocomplain length offset
    set string {abcdefghijk}
    set index 6
 } -body {
    ::lz77::maxSubstring $index $string length offset
 } -result {0}

 test maxSubstring-1.3 {
    Find the max substring for a string with multiple matches
 } -setup {
    unset -nocomplain length offset
    set string {aaaabbbbaaaaaaaa}
    set index 9
 } -body {
    set flag [::lz77::maxSubstring $index $string length offset]
    list $flag $length $offset
 } -result {1 7 1}

 test maxSubstring-1.4 {
    For a very long string (>32k), remove everything 32k \
        past the index
 } -setup {
    unset -nocomplain length offset
    set original "abcdef"
    for {set i 0} {$i < 1024} {incr i} {
        append original "0123456789012345678901234567890123456789"
    }
    append original "abcdef"
    set index 17
 } -body {
    set flag [::lz77::maxSubstring $index $original length offset]
    list $flag $length $offset
 } -result {1 32767 10}

 test maxSubstring-1.5 {
    For a very long string (>32k), remove anything more than 32k\
        before the index
 } -setup {
    unset -nocomplain length offset
    set original "abcdef"
    for {set i 0} {$i < 1024} {incr i} {
        append original "0123456789012345678901234567890123456789"
        append original "0123456789012345678901234567890123456789"
        append original "0123456789012345678901234567890123456789"
    }
    append original "abcdef"
    set index 40000
 } -body {
    set flag [::lz77::maxSubstring $index $original length offset]
    list $flag $length $offset
 } -result {1 32767 32760}

 # ========================================
 cleanupTests

PS 10Sep04: This brings us terribly close to pure tcl deflate/gzip. Reading http://www.gzip.org/deflate.html suggests that the Huffman coding is not terribly hard to do.

DKF: I'd estimate that the easiest way to speed this up would be to make [maxSubstring] return a list of its three result values instead of using upvar. Splitting the result list with foreach or lassign is much faster...

MS The usage of [K] can also be sped up noticeably: use the idiom [K $x [set x {}]]] instead of [K $x [unset x]]]. Even faster is the bytecompiled variant [lindex [list $x [set x{}]]] 0 ]


Category Compression