YAXMLP an XML parser

George Peter Staplin Oct 10, 2007 - I created an XML parser that is re-entrant, and is designed to not use regexp or string map. It may be faster, or it might be slower than TAX: A Tiny API for XML. I'll eventually time it and post results here.

Here's revision 2:

 #By George Peter Staplin
 
 set yaxmlp_count 0
 proc yaxmlp {} {
         global yaxmlp_count
         while 1 {
                 incr yaxmlp_count
                 set token yaxmlp$yaxmlp_count
                 if {[info commands $token] eq ""} {
                         break
                 }
         }
 
         proc $token args "[list yaxmlp-instance $token] \$args"
         return $token
 }
 
 proc yaxmlp-instance {token arglist} {
         global $token
         switch -- [lindex $arglist 0] {
                 handler {
                         if {3 != [llength $arglist]} {
                                 return -code error "invalid # args: should be: $token handler tag handler-callback"
                         }
                         set [set token](handler,[lindex $arglist 1]) [lindex $arglist 2] 
                 }
                 parse {
                         yaxmlp-parse $token [lindex $arglist 1]
                 }
         }
 }
 
 proc yaxmlp-dispatch {token tagname props body} {
         global $token 
         set cmd [set [set token](handler,$tagname)]
         set cmd [linsert $cmd end $token $tagname $props $body]
         uplevel #0 $cmd
 }
 
 proc yaxmlp-parse-prop-area {token script ivar endvar} {
         upvar $ivar i
         upvar $endvar end
 
         set GATHERPROP 1
         set GATHERPROPNAME 2
         set GATHERPROPVALUE 3
         set GATHERPROPQUOTE 4
         set state $GATHERPROP
         set props [list]
 
          for {} {$i < [string length $script]} {incr i} {
                 set c [string index $script $i] 
                 #puts "PROPAREA:$c  STATE:$state"
 
                 if {$GATHERPROPVALUE == $state} {
                         if {"\"" eq $c} {
                                 lappend props $propname $propvalue
                                 set state $GATHERPROP
                         } else {
                                 append propvalue $c
                         }
                 } elseif {$GATHERPROPQUOTE == $state} {
                         if {[string is space $c]} continue
                         if {"\"" eq $c} {
                                 set state $GATHERPROPVALUE
                         } 
                 } elseif {$GATHERPROPNAME == $state} {
                         if {[string is space $c]} {
                                 continue
                         } elseif {">" eq $c} {
                                 return $props
                         } elseif {"=" eq $c} {
                                 set state $GATHERPROPQUOTE
                         } else {
                                 append propname $c
                         }
                 } elseif {$GATHERPROP ==$state} {
                         if {[string is space $c]} { 
                                 set state $GATHERPROPNAME
                                 set propname ""
                                 set propvalue ""
                         } elseif {"/" eq $c && ">" eq [string index $script [expr {$i + 1}]]} {
                                 set end 1
                                 return $props        
                         } elseif {">" eq $c} {
                                 return $props
                         }
                 }
         } 
         return -code error "property area without completing > or />"
 }
 
 #Return [list tagname props]
 proc yaxmlp-parse-tag-area {token script ivar} {
         upvar $ivar i
         set GATHERTAG 1
         set state $GATHERTAG
         set tagname ""
         set props ""
         set end 0
 
         for {} {$i < [string length $script]} {incr i} {
                 set c [string index $script $i]
                 #puts C:$c
                 if {$GATHERTAG == $state} {
                         if {">" eq $c} {
                                 return [list $tagname $props $end]
                         } elseif {[string is space $c]} {
                                 set props [yaxmlp-parse-prop-area $token $script i end]
                                 return [list $tagname $props $end]
                         } elseif {"/" eq $c && ">" eq [string index $script [expr {$i + 1}]]} {
                                 set end 1
                                 incr i 2
                                 if {[string length $tagname]} {
                                         return [list $tagname $props $end]
                                 }
                         } else {
                                 append tagname $c
                         }
                 } 
         }
         return -code error "tag without closing: > or />"
 }
 
 proc yaxmlp-future-match {script i string} {
         set subscript [string range $script $i [expr {$i + [string length $string] - 1}]]
         return [expr {$subscript eq $string}]
 }
 
 proc yaxmlp-parse {token script} {
         global $token
         #puts "PARSE:$token"
 
         set GATHERTAG 1
         set GATHERBODY 2
         set state $GATHERTAG
         set tagname ""
         set line 1
         set scriptlen [string length $script]
         for {set i 0} {$i < $scriptlen} {incr i} {
                 set c [string index $script $i]
                 #puts PARSEC:$c
                 if {"\n" eq $c} {
                         incr line
                 }
 
                 if {$GATHERBODY == $state} {
                         if {"<" eq $c} {
                                 if {[yaxmlp-future-match $script [expr {$i + 1}] /$tagname>]} {
                                         yaxmlp-dispatch $token $tagname $props $body
                                         set tagname ""
                                         set props ""
                                         incr i [string length /$tagname]
                                         set state $GATHERTAG
                                 }
                         }
 
                         if {[string is space -strict [string index $body end]] && [string is space $c]} {
                                 continue                                        
                         } else {
                                 append body $c
                         }
                 } elseif {$GATHERTAG == $state} {
                         if {"<" eq $c} {
                                 incr i
                                 lassign [yaxmlp-parse-tag-area $token $script i] tagname props end
                                 if {$end} {
                                         #The tag was something like <foo bar="something"/>
                                         yaxmlp-dispatch $token $tagname $props ""
                                         set tagname ""
                                         set props ""
                                         set state $GATHERTAG
                                 } else {
                                         set body "" 
                                         set state $GATHERBODY
                                 }
                         }
                 }
         }
 }
 
 
 
 #----
 #Test code (from the TAX page)
 set input {
     <meta author="Anne Onymous"/>
     <meta>
         Composed in haste for purposes of demonstration.
     </meta>
     <para indent="3">
       This is an indented paragraph. Only the first line
       is indented, which you can tell if the paragraph goes
       on long enough. <![CDATA[<exampletag "Hi!">]]>
       <![CDATA[\example\path]]>
     </para>
     <para>
       This is an ordinary paragraph. No line is indented. Not
       one. None at all, which you can tell if the paragraph
       goes on long enough.
     </para>
 }
 
 proc meta-handler {token tagname props body} {
         #puts "$tagname $props $body" 
         puts "META:$tagname PROPS:$props BODY:$body ENDBODY"
 }
 
 proc para-handler {token tagname props body} {
         array set par $props
 
         puts PARA
 
         if {[info exists par(indent)]} {
                 foreach line [split [string trim $body] \n] {
                         puts [string repeat " " $par(indent)]$line
                 }
         } else {
                 puts BODY:$body
         }
 }
 
 set h [yaxmlp]
 $h handler meta meta-handler
 $h handler para para-handler
 $h parse $input