SOIF

Michael Schlenker:

A very simple package i wrote to parse SOIF Objects (RFC 2655, [L1 ]) as used by the harvest search engine [L2 ].

Drop me a note, if you find it useful:


  # Parse SOIF Objects as defined by RFC 2655 
  #
  # (c) 2002 Michael Schlenker <[email protected]>
  #
  #
  #
  # License: Use under the same license as the tcl core.
  #
  #

  # uses uri package from tcllib
  package require Tcl 8.2
  package require uri
  package provide SOIF 0.1

  namespace eval SOIF {
        set version 0.1
        set notalnumregexp {[^a-zA-Z0-9\-_]}
        set identifier     {([a-zA-Z0-9\-_]+)\{([0-9]+)\}(:\t)}
  }

  # -- SOIF::parse 
  #
  # Description:                Parse a SOIF object into a 
  #                             list of values. 
  #
  # Input:                        single SOIF object
  #
  # Output:                        list of the form 
  #                             "TEMPLATE-TYPE URL ATTRIBUTE-VALUELIST"
  #
  #

  proc SOIF::parse {obj} {
        variable notalnumregexp
        variable identifier

        # check for @ symbol
        set k [string first @ $obj]
        if {$k == -1} {
                error "No SOIF Object"
        }

        # check for template type
        set l [string first \{ $obj ]
        if {$l == -1} {
                error "No SOIF Template Type"
        }
        set template_type ""
        set template_type [string trim \
            [string range $obj [expr {$k+1}] [expr {$l-1}]]]

        # validate, that it is alphanumeric
        if {[regexp $notalnumregexp $template_type]} {
                error "Template Type \"$template_type\" not valid \
                       alphanumeric template-type."
        }

        # check for URL
        set m $l
        set url_candidate ""
        while {[string length $url_candidate]==0} {
                # the rfc is unclear how to identify 
                # the url, trying this
                set n [string first "\n" $obj $m]

                set url_candidate [string trim \
                       [string range $obj [expr {$m+1}] $n]]
                set m [expr {$n+1}]
        }
        # handle the special case that no url is given
        if {![string equal $url_candidate "-"]} {

                # check if this is a URL here, 
                # this should throw an error
                # if no valid url is found
                if {[catch {uri::split $url_candidate} clist] == 1} {
                        error "URL \"$url_candidate\" not of \
                               known type."
                }

        }
        set url $url_candidate        
        set attvalue ""
        # header is done, now parse attribute value pairs
        set start $n
        while {[regexp -indices -start $start --\
                 $identifier $obj -> id length delimeter]} {
                set id_text [string range $obj \
                              [lindex $id 0] [lindex $id 1]]
                set length [string range $obj \
                              [lindex $length 0] [lindex $length 1]]
                set offset [expr [lindex $delimeter 1] +1]
                set value [string range \
                             $obj $offset [expr {$offset+$length-1}]]
                lappend attvalue $id_text $value
                set start [expr {$offset+$length}]
        }

        # all identifiers and values have been parsed
        # check for closing \}

        if {![regexp -indices -start $start -- {\}} $obj -> dummy]} {
                error "Missing close brace on obj"
        }

        set result [list $template_type $url $attvalue]
        return $result
  }


  # -- SOIF::readObjectFromFile 
  #
  # Description:                Reads a SOIF Object from File 
  #                       (only one object per file should be used)
  #
  # Input:                Filename
  #
  # Output:                SOIF Object
  #

  proc SOIF::readObjectFromFile { filename } {

        if {![file exists $filename]} {
                error "No file \"${filename}\" exists."
        }

        if {[catch {open $filename r} fid]} {
                error "Opening \"$filename\" failed."
        }
        # set the translation to binary, 
        # as SOIF can contain arbitrary data
        fconfigure $fid -translation binary 

        set obj [read $fid]
        if {[catch {close $fid}]} {
                error "Closing \"$filename\" with \
                       channel ID \"$fid\" failed."
        }
        return $obj
  }

  # -- SOIF::writeObjectToFile
  #
  # Description:        Writes the string rep of a SOIF Object to Disk
  #                        The string rep should be built 
  #                     with SOIF::create.
  #
  # Input:                Filename
  #                        SOIF-Object
  # 
  # Output:                --
  #

  proc SOIF::writeObjectToFile { filename obj } {

        if {![file exists $filename]} {
                error "File \"$filename\" exists, cannot write"
        }

        if {[catch {open $filename w+} fid]} {
                error "File \"$filename\" could not be \
                       opened for writing."
        }

        # set the translation to binary, 
        # as SOIF can contain arbitrary data

        fconfigure $fid -translation binary

        puts -nonewline $fid $obj

        if {[catch {close $fid}]} {
                error "Closing \"$filename\" with channel \
                       ID \"$fid\" failed."
        }
  }


  # -- SOIF::create
  #
  # Description:         Creates a string rep from the parts 
  #                      of a SOIF object.
  #                        
  # Bugs:                Does not check, if the data it gets 
  #                     is well formed. 
  #
  # Input:                Template-Type
  #                        URL
  #                        Identifier-Value List
  #
  # Output:                SOIF String rep
  #

  proc SOIF::create {template-type url attvaluelist} {
        set obj ""
        append obj "@${template-type} \{ "
        append obj $url
        append obj "\r\n"
        foreach {attribute value} $attlist {
                set length [string length $value]
                set identifier "$attribute\{$length\}:\t"
                append obj $identifier $value
                # prettyprinting with extra newlines
                append obj "\r\n"
        }
        append obj "\}\n"

        return $obj
  }