purse NG

This is an improved version of purse, orginally by CMcC. The code below attempts to minimise the amount of file descriptors that are kept open at one time. The original implementation would only close file descriptors on array removals, which can be problematic in virtual machines (I've had problems on Ubuntu in OpenVZ-based machines). Closing of file descriptors uses the following two heuristics:

  1. Journalling files are kept open for few seconds after an initial write to the array, then they get automatically closed. This supposes that your code will "burst" modifications to arrays.
  2. Only a restricted number of journalling files are kept open at all times. When a maximum is reached, the "oldest" ones are closed. This is usefull if you create and purse many arrays at once, for example when restarting from disk state.

The API is exactly the same as the one of purse, except that there are more parameters that can be set for a purse (and good defaults, highlighted at the beginning of the code).

I have been also been looking at tie, which is part of tcllib. But the implementation also keeps the file descriptors opened forever, to what I can understand.

# purse makes tcl arrays purse-istant
package require Tcl 8.5
package provide Purse 0.3
 
namespace eval ::purse {
    namespace eval vars {
        variable nofile 128;        # Default max number of journalling file to keep open
        variable keep 5000;         # Default number of milliseconds to keep journalling files open
        variable debug "";          # File descritor for debug output (empty for none)
        variable dtfmt "%Y%m%d %T"; # Format for date output in log (empty for none)
    }
}

# param - sets a purse's parameters
# dir - directory in which purses are stored (default [pwd])
proc ::purse::param {var val {control purse}} {
    variable $control
    upvar 0 $control cont

    if {![info exists cont]} {
        #::purse::purse $control $control
        set cont() [dict create dir [pwd]]
    }
    
    if {$var == "dir"} {
        file mkdir $val
    }
    dict set cont() $var $val
    Debug "Set parameter $var to $val in $control"
}

# Restart from current state on disk
proc ::purse::restart { {glob ::*} {control purse}} {
    variable $control
    upvar 0 $control cont

    init $control

    set arrays [glob -directory [dict get $cont() dir] \
                    -nocomplain \
                    -tails [string map [dict get $cont() todisk] $glob]]
    foreach array $arrays {
        set array [FromDisk $array]
        # creates namespace ondemand, if necessary
        set nspace [namespace qualifiers $array]
        if { $nspace ne "" } {
            namespace eval $nspace {}
        }
        
        R $control $array $array 
    }
    return $arrays
}

# Initialise purse storage in control variable
proc ::purse::init { {control purse} } {
    global tcl_platform
    variable $control
    upvar 0 $control cont

    if {![info exists cont]} {
        #::purse::purse $control $control
        set cont() [dict create]
    }

    # Make sure we have a "dir" parameter, this will be the directory
    # where we will be persisting arrays.
    if {![dict exists $cont() dir]} {
        param dir [pwd] $control
    }

    # Make sure we know how to map unallowed characters onto
    # characters that can be used for file names.
    if { ![dict exists $cont() todisk] } {
        if { $tcl_platform(platform) eq "windows" } {
            param todisk [list ":" "¨"] $control
        } else {
            param todisk {} $control
        }
    }
    
    # The opposite...
    if { ![dict exists $cont() fromdisk] } {
        if { $tcl_platform(platform) eq "windows" } {
            param fromdisk [list "¨" ":"] $control
        } else {
            param fromdisk {} $control
        }
    }
    
    if {![dict exists $cont() nofile]} {
        param nofile $vars::nofile $control
    }

    if {![dict exists $cont() keep]} {
        param keep $vars::keep $control
    }
}


# purse - purses an array
proc ::purse::purse {array {control purse}} {
    #puts stderr "purse $array $control"
    #set array [namespace which -variable $array]
    upvar $array arr
    variable $control
    upvar 0 $control cont
    
    init $control;   # Initialise storage.
    
    trace add variable arr read  [list ::purse::R $control $array]        ;# one shot load file
    trace add variable arr array [list ::purse::R $control $array]        ;# one shot load file
    trace add variable arr write [list ::purse::W $control $array]
    trace add variable arr unset [list ::purse::U $control $array]
    
    # register a single purse flush at exit
    set pfx ::exit_purse_[string map [list ":" ""] $control]_
    if { [info commands ${pfx}*] eq "" } {
        set newex ${pfx}[expr rand()]
        rename ::exit $newex
        proc ::exit {} [subst {
            #puts stderr "flush $control"
            ::purse::flush * $control
            $newex
        }]
    }
}

# flush arrays matching glob
proc ::purse::flush {{glob *} {control purse}} {
    variable $control
    upvar 0 $control cont
    
    foreach {array d} [array get cont $glob] {
        if { $array != "" } {
            Serialize $array $control
        }
    }
}

# Return full path to where to store an array on disk.
proc ::purse::ToDisk { name {control purse}} {
    variable $control
    upvar 0 $control cont
    init $control
    return  [file join [dict get $cont() dir] \
                 [string map [dict get $cont() todisk] $name]]
}

# Return name of array, given its name on disk.
proc ::purse::FromDisk { name {control purse}} {
    variable $control
    upvar 0 $control cont
    init $control
    return [string map [dict get $cont() fromdisk] $name]
}

# initializes pursed array - one shot
proc ::purse::R {control name array args} {
    #puts stderr "read $control $name $array $args"
    upvar $array arr
    variable $control
    upvar 0 $control cont
    
    trace remove variable arr read  [list ::purse::R $control $name]        ;# one shot load file
    trace remove variable arr array [list ::purse::R $control $name]        ;# one shot load file
    trace remove variable arr write [list ::purse::W $control $name]
    trace remove variable arr unset [list ::purse::U $control $name]
    
    set file [ToDisk $name $control]
    
    if {[file exists $file]} {
        # if the purse exists, load its contents to array
        Debug "Loading content of array $name from $file"
        set fd [open $file r+]
        while {![eof $fd]} {
            set content [gets $fd]
            set len [llength $content]
            if { [expr {$len%2}] } {
                puts "Possibly corrupt data when reading $name"
                set content [lrange $content 0 end-1]
            }
            array set arr $content
        }
        close $fd
        
        Serialize $name $control
    } else {
        # brand new purse - create the file
        Debug "Shadowing content of array $name to $file"
        set fd [open $file w]
        fconfigure $fd -buffering line
        dict set cont($name) fd $fd
        Spacer $name $control
        AutoClose $name $control
    }
    
    # we no longer need a read trace
    trace add variable arr write [list ::purse::W $control $name]
    trace add variable arr unset [list ::purse::U $control $name]
}

# trace unset - writes an element to purse
proc ::purse::W {control name array el op} {
    #puts stderr "write $control $name $array $el $op"
    upvar $array arr
    variable $control
    upvar 0 $control cont
    
    if {![info exists cont($name)]} {
        R $control $name arr
    }
    
    # Append to existing file, reserialize all content if it had been closed
    set fd [dict get $cont($name) fd]
    if { $fd eq "" } {
        Serialize $name $control
    }
    set fd [dict get $cont($name) fd]
    array set junk [list $el $arr($el)]
    puts $fd [array get junk]
    Spacer $name $control
    AutoClose $name $control
}

# trace unset - unsets an element in a pursed array
proc ::purse::U {control name array el op} {
    #puts stderr "unset $control $name $array $el $op"
    upvar $array arr
    variable $control
    upvar 0 $control cont
    
    if {![info exists cont($name)]} {
        r $control $name arr
        if {$el != "" && [info exists arr($el)]} {
            unset arr($el)        ;# we have recreated the element - recurse
        }
        return
    }
    
    if {$el == ""} {
        # removing the entire array - destroy the purse
        set file [ToDisk $name $control]
        CloseCmd $name $control
        unset cont($name)
        if {[file exists $file]} {
            file delete -force -- $file
        }

        # we no longer need traces
        trace remove variable arr write [list ::purse::W $control $name]
        trace remove variable arr unset [list ::purse::U $control $name]

    } else {
        # removing an element - flush the purse
        Serialize $name $control
    }
}

# Conditional debug output, this cost close to zero.
proc ::purse::Debug {msg} {
    if {$vars::debug ne ""} {
        if { $vars::dtfmt eq "" } {
            puts $vars::debug $msg
        } else {
            set dt [clock format [clock seconds] -format $vars::dtfmt]
            puts $vars::debug "\[$dt\] $msg"
        }
    }
}

# Serialize the content of an array to the journalling file, i.e. write a
# complete copy of the array, loosing the history.
proc ::purse::Serialize {array {control purse}} {
    variable $control
    upvar 0 $control cont
    upvar $array arr

    # Close current file descriptor to journalling file
    catch {
        set fd [dict get $cont($array) fd]
        close $fd
    }
    
    # Write complete copy of current content of array to the file
    set file [ToDisk $array $control]
    Debug "Serialize $array to $file"
    set fd [open $file w]
    fconfigure $fd -buffering line
    puts $fd [array get arr]
    ::flush $fd
    dict set cont($array) fd $fd;  # Remember the file descriptor
    
    # Make space for the array by closing older arrays and arrange to close the
    # journal after a while.
    Spacer $array $control
    AutoClose $array $control
}


# Record latest write timestamp to an array, this is used to automatically close
# journalling files of arrays that we haven't written to for a while.
proc ::purse::LatestWrite { name {control purse}} {
    variable $control
    upvar 0 $control cont

    set nofile [dict get $cont() nofile]
    if { $nofile > 0 } {
        dict set cont($name) access [clock clicks]
    }
}

# Make space to write to an array, this will close the journalling files of the
# arrays that we haven't written to for a while, or rather the "oldest" ones in
# that list.
proc ::purse::Spacer { name { control purse} } {
    variable $control
    upvar 0 $control cont

    set nofile [dict get $cont() nofile]
    if { $nofile > 0 } {
        # Make sure we keep the array that we want to make space for as the
        # latest accessed one.
        LatestWrite $name $control

        # Construct a list with the names of the arrays that are under our
        # control, paired to their last access time. 
        set accesses {}
        foreach nm [array names cont] {
            if {$nm ne ""} {
                if {[dict exists $cont($nm) access]} {
                    lappend accesses [list $nm [dict get $cont($nm) access]]
                }
            }
        }
        
        # Sort the list so that latest accessed are first.
        set accesses [lsort -index 1 -integer -decreasing $accesses]
        
        # Remove all the ones that are at the end of the list, which now are the
        # oldest ones.
        set removals [lrange $accesses $nofile end]
        if { [llength $removals] > 0 } {
            Debug "Making space for array $name among the oldest ones"
            foreach nfo $removals {
                foreach {nm access} $nfo break
                CloseCmd $nm $control
            }            
        }
    }
}


# Arrange to automatically close journalling file after a while, if relevant for
# the purse options.
proc ::purse::AutoClose { name {control purse}} {
    variable $control
    upvar 0 $control cont
    
    # Cancel current timer, if any
    if { [dict exists $cont($name) timer] } {
        after cancel [dict get $cont($name) timer]
        dict unset cont($name) timer
    }
    
    # Arrange to close the journalling file to which we write in a little while
    set period [dict get $cont() keep]
    if { $period > 0 } {
        dict set cont($name) \
            timer [after $period [list [namespace current]::CloseCmd $name $control]]
    }
}

# Forcedly close the journalling file for an array at once. This is to ensure
# that we can keep low the number of resources alloted to the program as a
# whole.
proc ::purse::CloseCmd { name {control purse}} {
    variable $control
    upvar 0 $control cont

    # Return at once in case we don't have any information for the array
    # (anymore?)
    if {![info exists cont($name)]} {
        return
    }
    
    # Cancel current timer, if any
    if { [dict exists $cont($name) timer] } {
        after cancel [dict get $cont($name) timer]
        dict unset cont($name) timer
    }
    
    # Close the file to which we are writing
    if { [dict exists $cont($name) fd] } {
        set fd [dict get $cont($name) fd]
        if { $fd ne "" } {
            Debug "Closing journalling output file for array $name"
            catch {close $fd}
        }
    }
    
    # Remember that we are now not writing to any file anymore.
    dict set cont($name) fd ""
}

namespace export -clear purse