Version 1 of Spell correcter

Updated 2011-07-26 11:03:23 by ferrieux

Prompted by a question from 'AurovilleRadio' on comp.lang.tcl, here is a direct port to Tcl of Norvig's Python spell-correcter at http://norvig.com/spell-correct.html .

Disclaimer1: no effort is made to mimic Python's very expressive (should I say terse ?) style based on its powerful list comprehensions and other functional tools. Only the task at hand is aimed for.

Disclaimer2: yes, it is slow. Feel free to optimize.

Usage: spell-correct SOME-TEXT-FULL-OF-WORDS

Then type misspelled words, one per line (non-letters are collapsed anyway), and see the one-line outputs:

* DICT : word was in dictionary * EDIT1: word was at edit distance 1 from dictionary * EDIT2: word was at edit distance 2 from dictionary * NO-LUCK

#! /bin/sh
#\
exec tclsh $0 "$@"

if {$argc<1} {puts stderr "Usage: [file tail $::argv0] <wordsfile> \[<wordsfile> ...\]";exit 1}

foreach fn $argv {
        puts stderr " (loading $fn)"
        set ff [open $fn r]
        set x [read $ff]
        close $ff
        regsub -all {[^a-z]+} [string tolower $x] \  x
        foreach w $x {
                if {[info exists model($w)]} {
                        incr model($w)
                } else {
                        set model($w) 1
                }
        }
}

set alphabet [split abcdefghijklmnopqrstuvwxyz ""]

proc splits w {
        set out {}
        set n [string length $w]
        for {set i 0} {$i<=$n} {incr i} {
                lappend out [list [string range $w 0 $i-1] [string range $w $i $n]]
        }
        return $out
}

proc deletes w {
        set out {}
        set n [string length $w]
        for {set i 0} {$i<$n} {incr i} {
                lappend out [string replace $w $i $i]
        }
        return $out
}        

proc transposes w {
        set out {}
        set n [expr {[string length $w]-1}]
        for {set i 0} {$i<$n} {incr i} {
                lappend out [string replace $w $i $i+1 [string index $w $i+1][string index $w $i]]
        }
        return $out
}        

proc replaces w {
        set out {}
        set n [string length $w]
        for {set i 0} {$i<$n} {incr i} {
                foreach a $::alphabet {
                        lappend out [string replace $w $i $i $a]
                }
        }
        return $out
}

proc inserts w {
        set out {}
        set n [string length $w]
        for {set i 0} {$i<=$n} {incr i} {
                foreach a $::alphabet {
                        lappend out [string range $w 0 $i-1]$a[string range $w $i $n]
                }
        }
        return $out
}

proc edit1 {w vtab} {
        upvar $vtab tab

        foreach x [deletes $w] {set tab($x) 1}
        foreach x [transposes $w] {set tab($x) 1}
        foreach x [replaces $w] {set tab($x) 1}
        foreach x [inserts $w] {set tab($x) 1}
}


fconfigure stdin -translation binary
fconfigure stdout -translation binary -buffering line

puts stderr " (now processing stdin)"
while {[gets stdin line]>=0} {
        regsub -all {[^a-z]+} [string tolower $line] "" w
        if {[info exists model($w)]} {
                puts "DICT        $w"
                continue
        }
        array unset tab
        edit1 $w tab
        set ok 0
        foreach x [array names tab] {
                if {[info exists model($x)]} {
                        puts "EDIT1        $x"
                        set ok 1
                        break
                }
        }
        if {$ok} continue
        array unset tab2
        foreach x [array names tab] {
                edit1 $x tab2
        }
        set ok 0
        foreach x [array names tab2] {
                if {[info exists model($x)]} {
                        puts "EDIT2        $x"
                        set ok 1
                        break
                }
        }
        if {$ok} continue
        puts "NO-LUCK"
}