Here's a little script which turns a set of HTML/XML tags into a series of array changes. By setting a trace on the entire array or on specific elements, one can turn this into a SAX-like series of events:
# htmlutil.tcl - by Jean-Claude Wippler, September 2001
package provide htmlutil 0.1
# Parse HTML/XML-like text, setting array elements along the way.
#
# Arguments:
#   text        markup to scan
#   aref        name of the array (in the caller's scope) that receives the
#               assignments; defaults to "html"
#   ignorecase  when true, tag names are upper-cased before being used as
#               array keys; defaults to 1
#
# Array elements written, in document order:
#   avar()      the text found in front of each tag (also set to "" once
#               before scanning starts)
#   avar(TAG)   the attribute string of an opening tag
#   avar(/TAG)  the attribute string that was recorded when TAG was opened,
#               written when TAG is closed
#
# A closing tag closes every still-open tag up to and including the nearest
# opener with the same name; if no open tag has that name, everything that is
# open gets closed. At end of input all remaining open tags are closed.
#
# Returns the empty string (the result of the scanning loop).
proc htmlparse {text {aref html} {ignorecase 1}} {
    upvar $aref avar
    set avar() ""
    # strip comments first so that markup inside them is not seen as tags
    regsub -all {<!--.*?-->} $text {} text
    # sentinel: an empty closing tag makes the scanner report any text that
    # follows the last real tag, and unwinds all tags that are still open
    append text </>
    set tags ""
    set hist ""
    # each match yields: a = whole match, b = text before the tag,
    # c = everything between "<" and ">"
    foreach {a b c} [regexp -all -inline {(.*?)<(.*?)>} $text] {
        set avar() $b
        set d ""
        # split "name attributes" into tag name (c) and attribute string (d)
        regexp {^(\w+)\s(.*)} $c - c d
        if {$ignorecase} {
            set c [string toupper $c]
        }
        if {[regexp {^/(.*)} $c - e]} {
            # closing tag: pop open tags until the matching one is found
            set t "/"
            while {[llength $tags]} {
                set t [lindex $tags end]
                set avar(/$t) [lindex $hist end]
                set tags [lreplace $tags end end]
                set hist [lreplace $hist end end]
                if {[string equal $t $e]} break
            }
            # comment out line below to ignore unbalanced closing tags
            #if {![string equal $t $e]} { set avar($c) {} }
        } else {
            # opening tag: report it and remember it for the matching close
            set avar($c) $d
            lappend tags $c
            lappend hist $d
        }
    }
}
# code below runs when this is launched as the main script
# (i.e. when the script file's name, minus directory and extension, is "htmlutil")
if {[file root [file tail $argv0]] == "htmlutil"} {
    # Write-trace callback: prints every assignment to the traced array as a
    # Tcl "set" command.
    #   r   name of the traced array
    #   e   name of the element that was written
    #   op  trace operation (unused)
    proc show {r e op} {
        upvar $r a
        puts [list set html($e) $a($e)]
    }
    trace var html w show
    # sample markup: nested tags with attributes, a </b> that is seen while an
    # inner <e> is still open, and a </o> that has no opening tag
    set in {a<b c>d<e f>g<e h>i</e>j</e>k<e l>m</b>n</o>p}
    puts "Parsing: $in"
    puts [htmlparse $in]
}
Output:
Parsing: a<b c>d<e f>g<e h>i</e>j</e>k<e l>m</b>n</o>p
set html() {}
set html() a
set html(B) c
set html() d
set html(E) f
set html() g
set html(E) h
set html() i
set html(/E) h
set html() j
set html(/E) f
set html() k
set html(E) l
set html() m
set html(/E) l
set html(/B) c
set html() n
set html() p