An important part of any Scripted Compiler is the ability actually process the system language underlying the scripting language. In the case of Tcl this is the C Language.The first step is always to separate the input stream into tokens, each representing one semantic atom. In compiler speak, lexing.The following script lexes a string containing C source into a list of tokens. It assumes that the sources are free of preprocessor statements like "#include", "#define", etc.Also note that the script is built upon the base package provided in Scripted Lexing. While this means the code shown here is quite tailored to parsing for a compiler the general principle used is broad enough to allow for many variations. Examples:
- Keep the whitespace as tokens. Might be required for a pretty-printer.
- Treat comments as whitespace and remove them. True compiler. Keeping the comments, but not other whitespace as in the script below is more something for a code analyzer looking for additional data (meta-data) in comments. See Source Navigator for a tool in this area.
- Modify the definitions, convert the keywords and punctuation into single byte codes, and refrain from splitting/listifying the result. Sort of a special method for compressing C sources.
[andreask@pliers trans]$ ./driver -noraw -notoken tclIO.c __________________________________________________ tclIO.c: 242918 characters Lexing in 13446065 microseconds = 13.446065 seconds = 55.35227937 usec/char __________________________________________________Not bad for a lexer written in a scripting language IMHO.TODO
- Read up on C syntax. I believe that I currently do not recognize all possible types of numbers.
clex.tcl (The code, finally :)
# -*- tcl -*-
# Lexing C
package require lexbase
package provide clex 2.0
namespace eval clex {
# Define the lexer symbols for the language 'C', as an example.
namespace import ::lexbase::*
DefStart
DefP ( LPAREN ; DefP ) RPAREN ; DefP -> DEREF
DefP < LT ; DefP <= LE ; DefP == EQ
DefP > GT ; DefP >= GE ; DefP != NE
DefP \[ LBRACKET ; DefP \] RBRACKET ; DefP = ASSIGN
DefP \{ LBRACE ; DefP \} RBRACE ; DefP *= MUL_ASSIGN
DefP . DOT ; DefP , COMMA ; DefP /= DIV_ASSIGN
DefP ++ INCR_OP ; DefP -- DECR_OP ; DefP %= REM_ASSIGN
DefP & ADDR_BITAND ; DefP * MULT_STAR ; DefP += PLUS_ASSIGN
DefP + PLUS ; DefP - MINUS ; DefP -= MINUS_ASSIGN
DefP ~ BITNOT ; DefP ! LOGNOT ; DefP <<= LSHIFT_ASSIGN
DefP / DIV ; DefP % REM ; DefP >>= RSHIFT_ASSIGN
DefP << LSHIFT ; DefP >> RSHIFT ; DefP &= BITAND_ASSIGN
DefP ^ BITEOR ; DefP && LOGAND ; DefP ^= BITEOR_ASSIGN
DefP | BITOR ; DefP || LOGOR ; DefP |= BITOR_ASSIGN
DefP ? QUERY ; DefP : COLON ; DefP \; SEMICOLON
DefP ... ELLIPSIS ; DefP ~= BITNOT_ASSIGN
DefK typedef ; DefK extern ; DefK static ; DefK auto ; DefK register
DefK void ; DefK char ; DefK short ; DefK int ; DefK long
DefK float ; DefK double ; DefK signed ; DefK unsigned
DefK goto ; DefK continue ; DefK break ; DefK return
DefK case ; DefK default ; DefK switch
DefK struct ; DefK union ; DefK enum
DefK while ; DefK do ; DefK for
DefK const ; DefK volatile
DefK if ; DefK else
DefK sizeof
DefM COMMENT ::clex::C_comment_begin ::clex::C_comment_end
DefM STRING_LITERAL ::clex::C_string_begin ::clex::C_string_end
DefM STRING_LITERAL ::clex::C_char_begin ::clex::C_char_end
# Floats containing '.'s have to be matched early because the '.'
# is later seen as punctuation.
DefM CONSTANT ::clex::C_floatA_begin ::clex::C_floatA_end
DefM CONSTANT ::clex::C_floatB_begin ::clex::C_floatB_end
DefI IDENT
DefWS "\[\000-\040\]+"
DefRxM "^0x\[0-9a-fA-F\]+" CONSTANT
DefRxM "^\[0-9\]+" CONSTANT
DefEnd
}
proc ::clex::C_comment_begin {string start} {
return [string first "/*" $string $start]
}
proc ::clex::C_comment_end {string start} {
incr start 2 ; # Skip behind /*
set stop [string first "*/" $string $start]
incr stop 1 ; # Skip to /
return $stop
}
proc ::clex::C_string_begin {string start} {
return [string first "\"" $string $start]
}
proc ::clex::C_string_end {string start} {
# The next vari-sized thing is a "-quoted string.
# Finding its end is bit more difficult, because we have
# to accept \" as one character inside of the string. "
set from $start
while 1 {
incr from
set stop [string first "\"" $string $from]
# Note that we do not use [string first] to look for a \",
# but simply check the preceding character. That is less
# expensive than possibly running through the whole string.
incr stop -1
if {[string equal [string index $string $stop] "\\"]} {
incr stop 2
set from $stop
continue
}
incr stop
break
}
return $stop
}
proc ::clex::C_char_begin {string start} {
return [string first "'" $string $start]
}
proc ::clex::C_char_end {string start} {
# The next vari-sized thing is a '-quoted string.
# Finding its end is bit more difficult, because we have
# to accept \' as one character inside of the string. "
set from $start
while 1 {
incr from
set stop [string first "'" $string $from]
# Note that we do not use [string first] to look for a \",
# but simply check the preceding character. That is less
# expensive than possibly running through the whole string.
incr stop -1
if {[string equal [string index $string $stop] "\\"]} {
incr stop 2
set from $stop
continue
}
incr stop
break
}
return $stop
}
proc ::clex::C_floatA_begin {string start} {
upvar stash stash
if {[regexp -indices -start $start {\W([0-9]*\.[0-9]+([eEdD][+-]?[0-9]+)?)\W} $string -> match]} {
#puts a==[string range $string [lindex $match 0] [lindex $match 1]]
set stash(float-a) [lindex $match 1]
return [lindex $match 0]
}
return -1
}
proc ::clex::C_floatA_end {string start} {
upvar stash stash
return $stash(float-a)
}
proc ::clex::C_floatB_begin {string start} {
upvar stash stash
if {[regexp -indices -start $start {\W([0-9]+\.[0-9]*([eEdD][+-]?[0-9]+)?)\W} $string -> match]} {
#puts b==[string range $string [lindex $match 0] [lindex $match 1]]
set stash(float-b) [lindex $match 1]
return [lindex $match 0]
}
return -1
}
proc ::clex::C_floatB_end {string start} {
upvar stash stash
return $stash(float-b)
return -1
}driver
#!/bin/sh
# -*- tcl -*- \
exec tclsh "$0" ${1+"$@"}
set time 1
set token 1
set raw 1
while {1} {
switch -exact -- [lindex $argv 0] {
-notime {set time 0}
-notoken {set token 0}
-noraw {set raw 0}
default {break}
}
set argv [lrange $argv 1 end]
}
source lexbase.tcl
source clex.tcl
# Read file, lex it, time the execution to measure performance
set data [read [set fh [open [set fname [lindex $argv 0]]]]][close $fh]
set len [string length $data]
set usec [lindex [time {set res [lexbase::lex $data]}] 0]
foreach {sym attr} $res break
foreach {aidx aval} $attr break
if {$time} {
# Write performance statistics.
puts __________________________________________________
puts "$fname:"
puts "\t$len characters"
puts "\tLexing in $usec microseconds"
puts "\t = [expr {double($usec)/1000000}] seconds"
puts "\t = [expr {double($usec)/$len}] usec/char"
}
if {$token} {
# Generate tokenized listing of the input, using the lexing results as input.
puts __________________________________________________
set av 0
foreach s $sym {
switch -glob -- $s {
*- {puts "$s <<[lindex $aval [lindex $aidx $av]]>>" ; incr av}
* {puts "$s"}
}
}
}
if {$raw} {
# Dump the raw lexer result.
puts __________________________________________________
puts Symbols___________________________________________
puts $sym
puts ""
puts Attribute-Indices_________________________________
puts $aidx
puts ""
puts Attribute-Data____________________________________
puts \{[join $aval "\} \{"]\}
puts ""
puts __________________________________________________
}
puts __________________________________________________
exit| Category Parsing | Scripted Compiler :: Lexing C :: --> Parsing C |
|---|
