Version 0 of Substantively Weighted Least Squares (SWLS)

Updated 2006-01-19 15:54:49

EKB I have just been introduced to susbstantively weighted analytical techniques (SWAT) and in particular substantively weighted least squares (SWLS) through the book What Works: A New Approach to Program and Policy Analysis [L1 ]. It's also discussed in an online paper "Best Practices Research: A Methodological Guide for the Perplexed" [L2 ]. It's a very interesting technique for policy analysis, and I created a tcl implementation for it. (There's also an R language implementation available from the author at [L3 ].)

Here's the code:

 package require Tcl 8.4
 package require math::linearalgebra 1.0
 package require math::statistics 0.1.1


 namespace eval swat {
    variable epsilon 1.0e-7
    variable swls_threshold 0.7
    variable swls_beta 10

    namespace import \
        ::math::linearalgebra::mkMatrix \
        ::math::linearalgebra::mkVector \
        ::math::linearalgebra::mkIdentity \
        ::math::linearalgebra::mkDiagonal \
        ::math::linearalgebra::getrow \
        ::math::linearalgebra::setrow \
        ::math::linearalgebra::getcol \
        ::math::linearalgebra::setcol \
        ::math::linearalgebra::getelem \
        ::math::linearalgebra::setelem \
        ::math::linearalgebra::dotproduct \
        ::math::linearalgebra::matmul \
        ::math::linearalgebra::add \
        ::math::linearalgebra::sub \
        ::math::linearalgebra::solveGauss

    # NOTE: The authors of math::linearalgebra forgot to
    #   export ::math::linearalgebra::transpose

    ########################################
    #
    # t-stats
    #
    ########################################
    # swat::tstat n
    # Returns inverse of the single-tailed t distribution
    # Defaults to alpha = 0.05
    # Iterates until result is within epsilon of the target
    proc tstat {n {alpha 0.05}} {
        variable epsilon
        variable tvals

        if [info exists tvals($n:$alpha)] {
            return $tvals($n:$alpha)
        }

        set deltat [expr {100 * $epsilon}]
        # For one-tailed distribution, 
        set ptarg [expr {1.000 - $alpha/2.0}]
        set maxiter 100

        # Initial value for t
        set t 2.0

        set niter 0
        while {abs([::math::statistics::cdf-students-t $n $t] - $ptarg) > $epsilon} {
            set pstar [::math::statistics::cdf-students-t $n $t]
            set pl [::math::statistics::cdf-students-t $n [expr {$t - $deltat}]]
            set ph [::math::statistics::cdf-students-t $n [expr {$t + $deltat}]]

            set t [expr {$t + 2.0 * $deltat * ($ptarg - $pstar)/($ph - $pl)}]

            incr niter
            if {$niter == $maxiter} {
                error "swat::tstat: Did not converge after $niter iterations"
                return {}
            }
        }

        # Cache the result to shorten the call in future
        set tvals($n:$alpha) $t

        return $t
    }

    ########################################
    #
    # Weighted Least Squares
    #
    ########################################
    # swat::wls w [list y x's] w [list y x's] ...
    # Returns:
    #   R-squared
    #   coefficients of x's in fit
    #   jacknifed residuals
    #   standard errors of the coefficients
    #   95% confidence bounds for coefficients
    proc wls {args} {

        # Fill the matrices of x & y values, and weights
        # For n points, k coefficients

        # The number of points is equal to half the arguments (n weights, n points)
        set n [expr {[llength $args]/2}]

        set firstloop true
        # Sum up all y values to take an average
        set ysum 0
        # Add up the weights
        set wtsum 0
        # Count over rows (points) as you go
        set point 0
        foreach {wt pt} $args {

            # Check inputs
            if {[string is double $wt] == 0} {
                error "swat::wls: Weight \"$wt\" is not a number"
                return {}
            }

            ## -- Check dimensions, initialize
            if $firstloop {
                # k = num of vals in pt = 1 + number of x's (because of constant)
                set k [llength $pt]
                if {$n <= [expr {$k + 1}]} {
                    error "swat::wls: Too few degrees of freedom: $k variables but only $n points"
                    return {}
                }
                set X [mkMatrix $n $k]
                set y [mkVector $n]
                set I_x [mkIdentity $k]
                set I_y [mkIdentity $n]

                set firstloop false

            } else {
                # Have to have same number of x's for all points
                if {$k != [llength $pt]} {
                    error "swat::wls: Point \"$pt\" has wrong number of values (expected $k)"
                    # Clean up
                    return {}
                }
            }

            ## -- Extract values from set of points
            # Make a list of y values
            set yval [expr {double([lindex $pt 0])}]
            setelem y $point $yval
            set ysum [expr {$ysum + $wt * $yval}]
            set wtsum [expr {$wtsum + $wt}]
            # Add x-values to the x-matrix
            set xrow [lrange $pt 1 end]
            # Add the constant (value = 1.0)
            lappend xrow 1.0
            setrow X $point $xrow
            # Create list of weights & square root of weights
            lappend w [expr {double($wt)}]
            lappend sqrtw [expr {sqrt(double($wt))}]

            incr point

        }

        set ymean [expr {double($ysum)/$wtsum}]
        set W [mkDiagonal $w]
        set sqrtW [mkDiagonal $sqrtw]

        # Calculate sum os square differences for x's
        for {set r 0} {$r < $k} {incr r} {
            set xsqrsum 0.0
            set xmeansum 0.0
            # Calculate sum of squared differences as: sum(x^2) - (sum x)^2/n
            for {set t 0} {$t < $n} {incr t} {
                set xval [getelem $X $t $r]
                set xmeansum [expr {$xmeansum + double($xval)}]
                set xsqrsum [expr {$xsqrsum + double($xval * $xval)}]
            }
            lappend xsqr [expr {$xsqrsum - $xmeansum * $xmeansum/$n}]
        }

        ## -- Set up the X'WX matrix
        set XtW [matmul [::math::linearalgebra::transpose $X] $W]
        set XtWX [matmul $XtW $X]

        # Invert
        set M [solveGauss $XtWX $I_x]

        set beta [matmul $M [matmul $XtW $y]]

        ### -- Residuals & jacknife statistics
        # 1) Generate list of diagonals of the hat matrix
        set H [matmul $X [matmul $M $XtW]]
        for {set i 0} {$i < $n} {incr i} {
            lappend h_ii [getelem $H $i $i]
        }

        set R [matmul $sqrtW [matmul [sub $I_y $H] $y]]
        set yhat [matmul $H $y]

        # 2) Generate list of residuals, sum of squared residuals, r-squared
        set sstot 0.0
        set ssreg 0.0
        # Note: Relying on representation of Vector as a list for y, yhat
        foreach yval $y wt $w yhatval $yhat {
            set sstot [expr {$sstot + $wt * ($yval - $ymean) * ($yval - $ymean)}]
            set ssreg [expr {$ssreg + $wt * ($yhatval - $ymean) * ($yhatval - $ymean)}]
        }
        set r2 [expr {double($ssreg)/$sstot}]
        set adjr2 [expr {1.0 - (1.0 - $r2) * ($n - 1)/($n - $k)}]
        set sumsqresid [dotproduct $R $R]

        # 3) Calculate the overall variance, the variance per point & jacknifed stderr
        set s2 [expr {double($sumsqresid) / double($n - $k)}]
        for {set r 0} {$r < $n} {incr r} {
            set wt [lindex $w $r]
            set hprime [expr {sqrt($wt) * (1.0 - [lindex $h_ii $r])}]
            set resid [getelem $R $r]
            set newjns2i [expr {(($n - $k) * $s2 - double($resid * $resid)/$hprime)/($n - $k - 1)}]
            lappend jns2i $newjns2i
            lappend jacknife [expr {double($resid)/sqrt($newjns2i * $hprime)}]
        }

        ### -- Confidence intervals for coefficients
        set tvalue [tstat [expr {$n - $k}]]
        for {set i 0} {$i < $k} {incr i} {
            set stderr [expr {sqrt($s2 * [getelem $M $i $i])}]
            set mid [lindex $beta $i]
            lappend stderrs $stderr
            lappend confinterval [list [expr {$mid - $tvalue * $stderr}] [expr {$mid + $tvalue * $stderr}]]
        }

        return [list $adjr2 $beta $jacknife $stderrs $confinterval]
    }

    ########################################
    #
    # Substantively Weighted Least Squares
    #
    ########################################
    # swat::swls [list y x's] [list y x's] ...
    # Returns:
    #  - # points with wt 1
    #  - R-squared
    #  - coefficients of x's for entire sequence of fits
    #  - 95% confidence intervals for coefficients
    #  - weighted average of x values
    proc swls {args} {
        variable swls_threshold
        variable swls_beta

        set numpoints [llength $args]

        # Initialize weights to 1.0 for all points
        for {set i 0} {$i < $numpoints} {incr i} {
            set wt($i) 1.0
        }

        # Start of big loop over weightings
        for {set i 0} {$i < $swls_beta} {incr i} {
            # Prepare the arg to send to wls
            set wtpts {}
            set nhigh 0
            set sumwts 0
            for {set j 0} {$j < $numpoints} {incr j} {
                set pts [lindex $args $j]
                if {![info exists numxs]} {
                    set numxs [expr {[llength $pts] - 1}]
                }
                lappend wtpts $wt($j) $pts
                set sumwts [expr {$sumwts + $wt($j)}]
                for {set n 1} {$n <= $numxs} {incr n} {
                    if {$j > 0} {
                        set sumx($n) [expr {$sumx($n) + $wt($j) * [lindex $pts $n]}]
                    } else {
                        set sumx($n) [lindex $pts $n]
                    }
                }
                if {$wt($j) == 1.0} {
                    incr nhigh
                }
            }
            set xave {}
            for {set n 1} {$n <= $numxs} {incr n} {
                lappend xave [expr {double($sumx($n))/$sumwts}]
            }
            set retval [eval wls $wtpts]
            # R-squared is first, then coeffs, then jacknifed residuals
            set r2 [lindex $retval 0]
            lappend coeffs [list $nhigh $r2 [lindex $retval 1] [lindex $retval 3] [lindex $retval 4] $xave]
            set jacknife [lindex $retval 2]
            # Figure out the next set of weights
            for {set j 0} {$j < $numpoints} {incr j} {
                if {$i == 0 && [lindex $jacknife $j] < $swls_threshold} {
                    set wt($j) [expr {1.0 - 1.0/double($swls_beta)}]
                }
                if {$i > 0 && $wt($j) < 1.0} {
                    set wt($j) [expr {$wt($j) - 1.0/double($swls_beta)}]
                }
            }
        }
        return $coeffs
    }
 }

 ##########################
 #
 # GUI
 #
 ##########################

 set fileinfo(dir) [file dirname $argv0]
 set fileinfo(name) ""

 proc fileopen {} {
    global fileinfo

    set fname [tk_getOpenFile -initialdir $fileinfo(dir) -initialfile $fileinfo(name) \
            -defaultextension ".dat" -filetypes { {{Data files} {.dat}} {{Text Files} {.txt} } {{All Files} *} }]

    if {$fname == ""} {return}

    if [catch {open $fname r} fhndl] {
        tk_messageBox -icon error -message "Error opening file [file tail $fname]: $fhndl"
        return
    }

    set names {}
    set points {}

    set firstline true
    while {[gets $fhndl line] != -1} {
        # Ignore blank lines & comments
        if {[string trim $line] == ""} {continue}
        if {[string index [string trimleft $line] 0] == "#"} {continue}

        if $firstline {
            set names [split $line "\t"]
            set firstline false
            continue
        }

        lappend points [split $line "\t"]
    } 

    close $fhndl

    set fileinfo(dir) [file dirname $fname]
    set fileinfo(name) [file tail $fname]

    wm title . "$fileinfo(name) - SWLS"

    filltext $names $points

 }

 proc filltext {names points} {
    .t delete 1.0 end
    # Go ahead and update, since there can be a delay in loading
    update

    set coeffs [eval swat::swls $points]

    set nstart [lindex [lindex $coeffs 0] 0]
    set nend [lindex [lindex $coeffs end] 0]

    .t insert end "Explaining: [lindex $names 0]\n\n"
    .t insert end "Number of data points: $nstart\n"
    .t insert end "Number in set of interest: $nend\n\n"

    set i 1
    foreach val $coeffs {
        set r2 [lindex $val 1]
        set xcoeffs [lindex $val 2]
        set stderrs [lindex $val 3]
        set cis [lindex $val 4]
        set xaves [lindex $val 5]
        .t insert end "== Run $i ==\n"
        incr i
        set j 1
        foreach xave $xaves xc $xcoeffs ci $cis se $stderrs {
            if {$j == [llength $xcoeffs]} {
                set lbl "Constant:"
                set avestring ""
            } else {
                set lbl "[lindex $names $j] coeff:"
                set avestring [format " (ave = %.3f)" $xave]
            }
            .t insert end [format "%-20s\t% 12.3f\t\u00B1% 12.3f\t\[% .3f : % .3f\]\t%s\n" $lbl $xc $se [lindex $ci 0] [lindex $ci 1] $avestring]
            incr j
        }
        .t insert end [format "Adj. R-squared: %.3f\n\n" $r2]
    }
 }

 wm title . "SWLS"

 . configure -menu .mainmenu
 menu .mainmenu
 .mainmenu add cascade -label "File" -menu .mainmenu.f
 menu .mainmenu.f -tearoff false
 .mainmenu.f add command -label "Open" -command fileopen -underline 0

 scrollbar .sby -command {.t yview} -orient vertical
 scrollbar .sbx -command {.t xview} -orient horizontal
 text .t -width 70 -height 20 -wrap none -yscrollcommand {.sby set} \
    -xscrollcommand {.sbx set} -font "Courier 9"
 grid .t .sby -sticky nsew
 grid .sbx -sticky nsew
 grid columnconfigure . 0 -weight 1
 grid rowconfigure . 0 -weight 1

 focus -force .

Category Mathematics - Category Statistics