[EKB] I have just been introduced to susbstantively weighted analytical techniques (SWAT) and in particular substantively weighted least squares (SWLS) through the book ''What Works: A New Approach to Program and Policy Analysis'' [http://www.amazon.com/gp/product/0813397820/qid=958574327/sr=1-1/002-4268802-3980816?n=283155]. It's also discussed in an online paper "Best Practices Research: A Methodological Guide for the Perplexed" [http://www.maxwell.syr.edu/ctip/Working%20Papers/0102/A%20Guide%20to%20the%20Perplexed-How%20to%20do%20Best%20Practices%20Research%203.0%20Revised1.htm]. It's a very interesting technique for policy analysis, and I created a tcl implementation for it. (There's also an [R language] implementation available from the author at [http://psblade.ucdavis.edu/].) Here's the code: package require Tcl 8.4 package require math::linearalgebra 1.0 package require math::statistics 0.1.1 namespace eval swat { variable epsilon 1.0e-7 variable swls_threshold 0.7 variable swls_beta 10 namespace import \ ::math::linearalgebra::mkMatrix \ ::math::linearalgebra::mkVector \ ::math::linearalgebra::mkIdentity \ ::math::linearalgebra::mkDiagonal \ ::math::linearalgebra::getrow \ ::math::linearalgebra::setrow \ ::math::linearalgebra::getcol \ ::math::linearalgebra::setcol \ ::math::linearalgebra::getelem \ ::math::linearalgebra::setelem \ ::math::linearalgebra::dotproduct \ ::math::linearalgebra::matmul \ ::math::linearalgebra::add \ ::math::linearalgebra::sub \ ::math::linearalgebra::solveGauss # NOTE: The authors of math::linearalgebra forgot to # export ::math::linearalgebra::transpose ######################################## # # t-stats # ######################################## # swat::tstat n # Returns inverse of the single-tailed t distribution # Defaults to alpha = 0.05 # Iterates until result is within epsilon of the target proc tstat {n {alpha 0.05}} { variable epsilon variable tvals if [info exists tvals($n:$alpha)] { return $tvals($n:$alpha) } set deltat [expr {100 * $epsilon}] # For one-tailed distribution, set ptarg [expr {1.000 - $alpha/2.0}] set maxiter 100 # Initial value for t set t 2.0 set niter 0 while {abs([::math::statistics::cdf-students-t $n $t] - $ptarg) > $epsilon} { set pstar [::math::statistics::cdf-students-t $n $t] set pl [::math::statistics::cdf-students-t $n [expr {$t - $deltat}]] set ph [::math::statistics::cdf-students-t $n [expr {$t + $deltat}]] set t [expr {$t + 2.0 * $deltat * ($ptarg - $pstar)/($ph - $pl)}] incr niter if {$niter == $maxiter} { error "swat::tstat: Did not converge after $niter iterations" return {} } } # Cache the result to shorten the call in future set tvals($n:$alpha) $t return $t } ######################################## # # Weighted Least Squares # ######################################## # swat::wls w [list y x's] w [list y x's] ... # Returns: # R-squared # coefficients of x's in fit # jacknifed residuals # standard errors of the coefficients # 95% confidence bounds for coefficients proc wls {args} { # Fill the matrices of x & y values, and weights # For n points, k coefficients # The number of points is equal to half the arguments (n weights, n points) set n [expr {[llength $args]/2}] set firstloop true # Sum up all y values to take an average set ysum 0 # Add up the weights set wtsum 0 # Count over rows (points) as you go set point 0 foreach {wt pt} $args { # Check inputs if {[string is double $wt] == 0} { error "swat::wls: Weight \"$wt\" is not a number" return {} } ## -- Check dimensions, initialize if $firstloop { # k = num of vals in pt = 1 + number of x's (because of constant) set k [llength $pt] if {$n <= [expr {$k + 1}]} { error "swat::wls: Too few degrees of freedom: $k variables but only $n points" return {} } set X [mkMatrix $n $k] set y [mkVector $n] set I_x [mkIdentity $k] set I_y [mkIdentity $n] set firstloop false } else { # Have to have same number of x's for all points if {$k != [llength $pt]} { error "swat::wls: Point \"$pt\" has wrong number of values (expected $k)" # Clean up return {} } } ## -- Extract values from set of points # Make a list of y values set yval [expr {double([lindex $pt 0])}] setelem y $point $yval set ysum [expr {$ysum + $wt * $yval}] set wtsum [expr {$wtsum + $wt}] # Add x-values to the x-matrix set xrow [lrange $pt 1 end] # Add the constant (value = 1.0) lappend xrow 1.0 setrow X $point $xrow # Create list of weights & square root of weights lappend w [expr {double($wt)}] lappend sqrtw [expr {sqrt(double($wt))}] incr point } set ymean [expr {double($ysum)/$wtsum}] set W [mkDiagonal $w] set sqrtW [mkDiagonal $sqrtw] # Calculate sum os square differences for x's for {set r 0} {$r < $k} {incr r} { set xsqrsum 0.0 set xmeansum 0.0 # Calculate sum of squared differences as: sum(x^2) - (sum x)^2/n for {set t 0} {$t < $n} {incr t} { set xval [getelem $X $t $r] set xmeansum [expr {$xmeansum + double($xval)}] set xsqrsum [expr {$xsqrsum + double($xval * $xval)}] } lappend xsqr [expr {$xsqrsum - $xmeansum * $xmeansum/$n}] } ## -- Set up the X'WX matrix set XtW [matmul [::math::linearalgebra::transpose $X] $W] set XtWX [matmul $XtW $X] # Invert set M [solveGauss $XtWX $I_x] set beta [matmul $M [matmul $XtW $y]] ### -- Residuals & jacknife statistics # 1) Generate list of diagonals of the hat matrix set H [matmul $X [matmul $M $XtW]] for {set i 0} {$i < $n} {incr i} { lappend h_ii [getelem $H $i $i] } set R [matmul $sqrtW [matmul [sub $I_y $H] $y]] set yhat [matmul $H $y] # 2) Generate list of residuals, sum of squared residuals, r-squared set sstot 0.0 set ssreg 0.0 # Note: Relying on representation of Vector as a list for y, yhat foreach yval $y wt $w yhatval $yhat { set sstot [expr {$sstot + $wt * ($yval - $ymean) * ($yval - $ymean)}] set ssreg [expr {$ssreg + $wt * ($yhatval - $ymean) * ($yhatval - $ymean)}] } set r2 [expr {double($ssreg)/$sstot}] set adjr2 [expr {1.0 - (1.0 - $r2) * ($n - 1)/($n - $k)}] set sumsqresid [dotproduct $R $R] # 3) Calculate the overall variance, the variance per point & jacknifed stderr set s2 [expr {double($sumsqresid) / double($n - $k)}] for {set r 0} {$r < $n} {incr r} { set wt [lindex $w $r] set hprime [expr {sqrt($wt) * (1.0 - [lindex $h_ii $r])}] set resid [getelem $R $r] set newjns2i [expr {(($n - $k) * $s2 - double($resid * $resid)/$hprime)/($n - $k - 1)}] lappend jns2i $newjns2i lappend jacknife [expr {double($resid)/sqrt($newjns2i * $hprime)}] } ### -- Confidence intervals for coefficients set tvalue [tstat [expr {$n - $k}]] for {set i 0} {$i < $k} {incr i} { set stderr [expr {sqrt($s2 * [getelem $M $i $i])}] set mid [lindex $beta $i] lappend stderrs $stderr lappend confinterval [list [expr {$mid - $tvalue * $stderr}] [expr {$mid + $tvalue * $stderr}]] } return [list $adjr2 $beta $jacknife $stderrs $confinterval] } ######################################## # # Substantively Weighted Least Squares # ######################################## # swat::swls [list y x's] [list y x's] ... # Returns: # - # points with wt 1 # - R-squared # - coefficients of x's for entire sequence of fits # - 95% confidence intervals for coefficients # - weighted average of x values proc swls {args} { variable swls_threshold variable swls_beta set numpoints [llength $args] # Initialize weights to 1.0 for all points for {set i 0} {$i < $numpoints} {incr i} { set wt($i) 1.0 } # Start of big loop over weightings for {set i 0} {$i < $swls_beta} {incr i} { # Prepare the arg to send to wls set wtpts {} set nhigh 0 set sumwts 0 for {set j 0} {$j < $numpoints} {incr j} { set pts [lindex $args $j] if {![info exists numxs]} { set numxs [expr {[llength $pts] - 1}] } lappend wtpts $wt($j) $pts set sumwts [expr {$sumwts + $wt($j)}] for {set n 1} {$n <= $numxs} {incr n} { if {$j > 0} { set sumx($n) [expr {$sumx($n) + $wt($j) * [lindex $pts $n]}] } else { set sumx($n) [lindex $pts $n] } } if {$wt($j) == 1.0} { incr nhigh } } set xave {} for {set n 1} {$n <= $numxs} {incr n} { lappend xave [expr {double($sumx($n))/$sumwts}] } set retval [eval wls $wtpts] # R-squared is first, then coeffs, then jacknifed residuals set r2 [lindex $retval 0] lappend coeffs [list $nhigh $r2 [lindex $retval 1] [lindex $retval 3] [lindex $retval 4] $xave] set jacknife [lindex $retval 2] # Figure out the next set of weights for {set j 0} {$j < $numpoints} {incr j} { if {$i == 0 && [lindex $jacknife $j] < $swls_threshold} { set wt($j) [expr {1.0 - 1.0/double($swls_beta)}] } if {$i > 0 && $wt($j) < 1.0} { set wt($j) [expr {$wt($j) - 1.0/double($swls_beta)}] } } } return $coeffs } } ########################## # # GUI # ########################## set fileinfo(dir) [file dirname $argv0] set fileinfo(name) "" proc fileopen {} { global fileinfo set fname [tk_getOpenFile -initialdir $fileinfo(dir) -initialfile $fileinfo(name) \ -defaultextension ".dat" -filetypes { {{Data files} {.dat}} {{Text Files} {.txt} } {{All Files} *} }] if {$fname == ""} {return} if [catch {open $fname r} fhndl] { tk_messageBox -icon error -message "Error opening file [file tail $fname]: $fhndl" return } set names {} set points {} set firstline true while {[gets $fhndl line] != -1} { # Ignore blank lines & comments if {[string trim $line] == ""} {continue} if {[string index [string trimleft $line] 0] == "#"} {continue} if $firstline { set names [split $line "\t"] set firstline false continue } lappend points [split $line "\t"] } close $fhndl set fileinfo(dir) [file dirname $fname] set fileinfo(name) [file tail $fname] wm title . "$fileinfo(name) - SWLS" filltext $names $points } proc filltext {names points} { .t delete 1.0 end # Go ahead and update, since there can be a delay in loading update set coeffs [eval swat::swls $points] set nstart [lindex [lindex $coeffs 0] 0] set nend [lindex [lindex $coeffs end] 0] .t insert end "Explaining: [lindex $names 0]\n\n" .t insert end "Number of data points: $nstart\n" .t insert end "Number in set of interest: $nend\n\n" set i 1 foreach val $coeffs { set r2 [lindex $val 1] set xcoeffs [lindex $val 2] set stderrs [lindex $val 3] set cis [lindex $val 4] set xaves [lindex $val 5] .t insert end "== Run $i ==\n" incr i set j 1 foreach xave $xaves xc $xcoeffs ci $cis se $stderrs { if {$j == [llength $xcoeffs]} { set lbl "Constant:" set avestring "" } else { set lbl "[lindex $names $j] coeff:" set avestring [format " (ave = %.3f)" $xave] } .t insert end [format "%-20s\t% 12.3f\t\u00B1% 12.3f\t\[% .3f : % .3f\]\t%s\n" $lbl $xc $se [lindex $ci 0] [lindex $ci 1] $avestring] incr j } .t insert end [format "Adj. R-squared: %.3f\n\n" $r2] } } wm title . "SWLS" . configure -menu .mainmenu menu .mainmenu .mainmenu add cascade -label "File" -menu .mainmenu.f menu .mainmenu.f -tearoff false .mainmenu.f add command -label "Open" -command fileopen -underline 0 scrollbar .sby -command {.t yview} -orient vertical scrollbar .sbx -command {.t xview} -orient horizontal text .t -width 70 -height 20 -wrap none -yscrollcommand {.sby set} \ -xscrollcommand {.sbx set} -font "Courier 9" grid .t .sby -sticky nsew grid .sbx -sticky nsew grid columnconfigure . 0 -weight 1 grid rowconfigure . 0 -weight 1 focus -force . ---- [Category Mathematics] - [Category Statistics]