This page is based on http://www.paulgraham.com/spam.html but it is still missing some functionality (like the code to build the frequency tables from the message corpora, etc.) Enjoy! ''[DKF]'' Note that Graham's work is based, perhaps indirectly, on such earlier efforts as appear in [http://citeseer.nj.nec.com/397172.html] and [http://www-stat.stanford.edu/~tibs/ElemStatLearn]. ---- switch $tcl_platform(platform) { unix - mac { set ConfigFile ~/.tclSpamFilter/config.tcl } windows { # This seems to be the right spot on Win98... set ConfigFile "c:/windows/application data/Tcl Spam Filter/config.tcl" } } proc extendTable {type string {direction 1}} { global WordRE access upvar #0 ${type}Table t ${type}Count c set i 0 set now [clock seconds] while {[regexp -indices -start $i $WordRE $string match]} { foreach {j i} $match {} set word [string range $string $j $i] if {[catch { if {[incr t($word) $direction] == 0} { unset t($word) } else { set access($word) $now } }]} then { set t($word) $direction set access($word) $now } } incr c $direction } proc generateProbability {word} { global goodTable goodCount badTable badCount set g 0 catch { set g [expr {$goodTable($word) * 2}] } set b 0 catch { set b $badTable($word) } if {$g == 0 && $b == 0} { # Not seen before return 0.2 } if {$g+$b < 5} { # Not frequent enough return 0.0 } set bfreq [min 1.0 [expr {double($b)/$badCount}]] set gfreq [min 1.0 [expr {double($g)/$goodCount}]] return [max 0.01 [min 0.99 [expr {$bfreq / ($gfreq + $bfreq)}]]] } proc combine {probs} { set p1 1.0 set p2 1.0 foreach prob $probs { set p1 [expr {$p1 * $prob}] set p2 [expr {$p2 * (1.0 - $prob)}] } return [expr {$p1 / ($p1 + $p2)}] } proc min {x y} {expr {$x<$y ? $x : $y}} proc max {x y} {expr {$x>$y ? $x : $y}} proc isSpam {message} { global WordRE reasons while {[regexp -indices -start $i $WordRE $message match]} { foreach {j i} $match {} set t([string range $string $j $i]) {} } foreach word [array names t] { set p [generateProbability $word] lappend magic [list [expr {abs($p-0.5)}] $p $word] } foreach l [lrange [lsort -decreasing -real -index 0 $magic] 0 15] { append reasons "[lindex $l 2] (score=[lindex $l 1]) " lappend interesting [lindex $l 1] } set score [combine $interesting] append reasons "=> Overall Score $score" return [expr {$score > 0.9}] } proc saveTables {} { global TableFile goodTable goodCount badTable badCount set f [open $TableFile w] puts $f [list \ [array get goodTable] $goodCount \ [array get badTable] $badCount \ [array set access]] close $f } proc loadTables {} { global TableFile goodTable goodCount badTable badCount access set list {} catch { set f [open $TableFile r] set list [read $f] close $f } array unset goodTable array unset badTable set done 0; # Flag because of catch! catch { if {[llength $list] == 5} { foreach {gt gc bt bc ac} $list {} if { !([llength $gt] & 1) && !([llength $bt] & 1) && !([llength $ac] & 1) && [string is integer -strict $gc] && [string is integer -strict $bc] } then { array set goodTable $gt set goodCount $gc array set badTable $bt set badCount $bc array set access $ac set done 1 } } } if {!$done} { array set goodTable {} set goodCount 0 array set badTable {} set badCount 0 array set access {} } } proc expireTables {} { global goodTable badTable access Expiry expires if {!$Expiry(Enabled)} { return } set expires [expr {[clock seconds]-$Expiry(Interval)}] foreach {word time} [array get access] { if {$time > $expires} { # Not expired yet! continue } set total 0 catch {incr total $goodTable($word)} catch {incr total $badTable($word)} if {$total > $Expiry(InhibitCount)} { # Too common anyway continue } catch {unset goodTable($word)} catch {unset badTable($word)} unset access($word) } } proc log {string infoMsg {optMsg {}}} { global Log if {!($Log(Enabled) || [string length $optMsg])} { return } set s [clock format [clock seconds]] if {[string length $string] && $Log(Subject)} { if {[regexp -line {^Subject:\s+(.*)} $string -> subject]} { append s ": subject=$subject" } else { append s ": no subject" } } else { append s ":" } if {[string length $string] && $Log(Source)} { if {[regexp -line {^(?:Sender|From):\s+(.*)} $string -> source]} { append s ": source=$source" } else { append s ": no source" } } else { append s ":" } append s $infoMsg if {[string length $optMsg]} { append s "\n$optMsg" } set fid [open $Log(File) a] puts $fid $s close $fid } # Basic functionality interfaces proc addSpam {{fid stdin}} { set message [read $fid] extendTable bad $message log $message "added as spam" saveTables } proc addNonspam {{fid stdin}} { set message [read $fid] extendTable good $message log $message "added as non-spam" saveTables } proc removeSpam {{fid stdin}} { set message [read $fid] extendTable bad $message -1 log $message "removed from spam" saveTables } proc removeNonspam {{fid stdin}} { set message [read $fid] extendTable good $message -1 log $message "removed as non-spam" saveTables } # Transfer message from one table to the other proc convertToSpam {{fid stdin}} { set message [read $fid] extendTable good $message -1 extendTable bad $message log $message "converted to spam" saveTables } proc convertToNonspam {{fid stdin}} { set message [read $fid] extendTable bad $message -1 extendTable good $message log $message "converted to non-spam" saveTables } # Filtering interfaces proc filterSpam {{fid stdin}} { global reasons set flag [isSpam [set message [read $fin]]] log $message "${reasons}: [expr {$flag ? {spam} : {non-spam}}]" exit $flag } proc aggressiveFilterSpam {{fid stdin}} { ## This procedure not only reports via the process exit code ## whether the message is spam, but also updates its internal ## database accordingly. Like that, it should be able to maintain ## the database in the face of slowly changing spam with ## absolutely no user intervention (except in the case of wholly ## new classes of spam.) global reasons set message [read $fid] set flag [isSpam $message] extendTable [expr {$flag ? "bad" : "good"}] $message log $message "${reasons}: [expr {$flag ? {spam} : {non-spam}}]: added" saveTables exit $flag } proc initialize {} { global WordRE ConfigFile set WordRE {[-\w'$]+} if {![file exists $ConfigFile]} { set dir [file dirname $ConfigFile] if {![file exists $dir]} { file mkdir $dir } set cfg {### Tcl Spam Filter Configuration File ### Where to load and save the tables of word frequencies set TableFile @@APPDIR@@/tables.db ### Rarely-used word expiry set Expiry(Enabled) 1 # If a word is not entered into the database for a month # and a half (measured in seconds) it should be removed. set Expiry(Interval) 3888000 # However, if the word has come up at least this number of # times, don't bother. set Expiry(InhibitCount) 10 ### Logging set Log(Enabled) 1 set Log(File) @@APPDIR@@/decisions.log # Log message subjects? set Log(Subject) 1 # Log message senders? set Log(Source) 1 } set fid [open $ConfigFile w] puts $fid [string map [list @@APPDIR@@ $dir] $cfg] close $fid } uplevel #0 {source $ConfigFile} } proc main {} { global argv errorInfo initialize loadTables if {[catch {[lindex $argv 0]} msg]} { set ei $errorInfo catch { log {} $msg $ei exit 2 } # Logging system is stuffed! :^/ puts stderr $ei puts stderr $errorInfo exit 3 } } main # This program must not be run multiple times simultaneously; when # installing it as a mail filter assistant, you *must* provide an # adequate level of locking yourself! ---- [[Nice work.]] There's something in PERL which uses Bayes here [http://www.ai.mit.edu/~jrennie/ifile/] - it interfaces to exmh, too. Looks like some *very* interesting code, but a few comments here and there would certainly be helpful to those of us who would like to help out!