Markov Chain for Words

  # Markov Chains in Tcl

  set matrix [dict create]
  set eolchars {. ? ; ! @}
  set breaks {: ,}
  set junk {/}
  set start {}
  set startword 1
  set whatever {well 1 "I think" 1 could 1 may 1 wonder 1}

  set inp [open "Midsummer Night's Dream.txt"]
  set filebuf {}
  proc getword {} {
    global filebuf inp
    if {[llength $filebuf] == 0} {gets $inp filebuf; set filebuf [split $filebuf " "]}
    set result [lindex $filebuf 0]
    set filebuf [lrange $filebuf 1 end]
    return $result
  }

  set w1 [getword]
  set w2 [getword]
  set w3 [getword]
  set w4 [getword]
  set pocket ""
  set actors {}
  while {![eof $inp]} {
  puts -nonewline .
    if {$pocket eq ""} {
      set next [getword]
      if {[set ch [string index $next end]] in $breaks} {
        set pocket $ch\n
        set next [string range $next 0 end-1]
      }
    } else {
      set next $pocket
      set pocket ""
    }
    set next [string trim $next]
  #puts $next
    # skip blank lines or lines with just one word
    if {$next eq ""} continue
    if {($next eq "ACT")||($next eq "SCENE")||([string index $next 0] eq "/")} continue
    if {[string index $next end] in $junk} continue
    if {[string index $next 0] eq "*"} {
      set next [string tolower $next]; set next [string toupper $next 0]
      if {[lsearch $actors $next]==-1} {lappend actors $next}
      continue
    }
    regsub \; $next @ next
    # linklen 4
    if {[catch {set _index [dict get $matrix "$w1 $w2 $w3 $w4"]}]!=0} {set _index [dict create]}
    dict incr _index $; dict incr _index _total; dict set matrix "$w1 $w2 $w3 $w4" ${_index}
    # linklen 3
    if {[catch {set _index [dict get $matrix "$w2 $w3 $w4"]}]!=0} {set _index [dict create]}
    dict incr _index $next; dict incr _index _total; dict set matrix "$w2 $w3 $w4" ${_index}
    # linklen 2
    if {[catch {set _index [dict get $matrix "$w3 $w4"]}]!=0} {set _index [dict create]}
    dict incr _index $next; dict incr _index _total; dict set matrix "$w3 $w4" ${_index}
    # linklen 1
    if {[catch {set _index [dict get $matrix $w4]}]!=0} {set _index [dict create]}
    dict incr _index $next; dict incr _index _total; dict set matrix $w4 ${_index}
    if {$startword} {
      if {[string index $w1 0] in {A B C D E F G H I J K L M N O P Q R S T U V W X Y Z}} {
        if {[string index $w1 end] ni  $eolchars} {
          if {[lsearch $start $w1] == -1} {lappend start $w1}
        }
      }
      set startword 0
    }
    set w1 $w2
    set w2 $w3
    set w3 $w4
    set w4 $next
    if {[string index $next end] in $eolchars} {set startword 1}
  }
  close $inp
  puts ""
  set startlen [llength $start]

  puts "MATRIX IS\n$matrix\n\n\n"
  #puts "Start: $start"
  proc gen {} {
    global start matrix startlen workstr whatever eolchars breaks
    set workstr [lindex $start [expr {int(rand()*$startlen)}]]
    set result $workstr
    while 1 {
      while 1 {
        if {[catch {set choices [dict get $matrix $workstr]}] != 0} {
          if {[llength $workstr]>0} {
            set workstr [lrange $workstr 1 end]
            continue
          }
          set choices $whatever
        }
        break
      }
  #puts "got hit on '$workstr', choices='$choices'"
      # choices is our dict with words and counts
      # _index contains the number of choices total
      set max [dict get $choices _total]
  #puts "max=$max"
      dict unset choices _total
      set choice [expr {int(rand()*$max)}]
  #puts "select $choice"
      dict for {word number} $choices {
  #puts "word=$word number=$number"
        incr choice -$number
  #puts "choice=$choice"
        if {$choice <= 0} {
          lappend result $word
  #puts "result now '$result'"
          if {[string index $word end] in $eolchars} {
            foreach ch $breaks {regsub -all " $ch" $result $ch result}
            regsub -all @ $result \; result
  #puts "returning '$result'";
            return $result
          }
          lappend workstr $word
          if {[llength $workstr]>4} {set workstr [lrange $workstr 1 end]}
        }
      }
    }
  }

  for {set i 0} {$i < 10} {incr i} {puts "$i: [gen]"}
  exit