Version 5 of BlastFile

Updated 2005-02-21 12:47:14 by DDG

snittype for parsing of Blast [L1 ] output files

 package require snit 0.97
 package require oomk 
 package require md5pure

 snit::type BlastFile {
    # public options
    # BLAST resultfile 
    option -filename ""
    # metakit indexfile optional argument
    # defaults to $options(-filename).mk
    option -indexfile ""
    # number of hits stored inside the indexfile
    option -hits 10
    # private variables 
    variable db
    # the views inside the database
    variable pvPositions
    variable pvTop1
    variable pvTopX
    constructor {args} {
        $self configurelist $args
        if {$options(-filename) eq ""} {
            error "`-filename blastfile' argument is required"
        }
        if {$options(-indexfile) eq ""} {
            set options(-indexfile) $options(-filename).mk
        }
        set db [mkstorage %AUTO% $options(-indexfile)]
        if {[file size $options(-indexfile)] < 2 } {
            $self CreateIndex
        } else {
            [$db view positions] as pvPositions
            [$db view top1] as pvTop1
            [$db view topX] as pvTopX
        }
    } 
    destructor {
        $db close
    }


    # public methods (are lowercase)
    method getQueryIds {} {
        # returns a list with query ids 
        # each item is a list with the 
        # * query id 
        # * a flag indicating if this query id has a hit 

        #[$db view top1] as pvTop1
        set res [list]
        $pvTop1 loop c {
            lappend res [array get c]
        }
        return $res
    }
    method getMatchIds {id args} {
        set arg(-md5) false
        array set arg $args
        set res [list]
        if {$arg(-md5)} {
            [$pvPositions select -exact md5 $id] as pSel
            if {[$pSel size] > 0} { 
                set id [$pSel get 0 id]
            } else { 
                return [list]
            }
        }
        [$pvTopX select -exact id $id] as pSel
        $pSel loop c {
            lappend res [array get c]
        }
        return $res
    }
    method hasHit {id args} {
        # return true or false 
        # if we have BlastHits for a certain queryid
        set arg(-md5) false
        array set arg $args
        if {$arg(-md5)} {
            [$pvPositions select -exact md5 $id] as pSel
            if {[$pSel size] > 0} { 
                set id [$pSel get 0 id]
            } else { 
                return false
            }
        }
        [$pvTop1 select -exact id $id] as pSel
        if {[$pSel size] > 0} { 
            set res [$pSel get 0 match]
        } else {
            set res 0
        }
        if {$res eq "0"} {
            return false
        } else {
            return true
        }
    }
    method getBlast {id args} {
        set arg(-md5) false
        array set arg $args
        set blast ""
        if {$arg(-md5)} {
            [$pvPositions select -exact md5 $id] as pSel
        }  else {
            [$pvPositions select -exact id $id] as pSel
        }
        if {[$pSel size] > 0} { 
            set pos [$pSel get 0 pos]
        } else { 
            return "" 
        }
        if [catch {open $options(-filename) r} infh] {
            error "Cannot open $options(-filename): $infh"
        } else { 
            seek $infh $pos
            set flag false
            while {[gets $infh line] >= 0} {
                if {[regexp {^T?BLAST[XNP]} $line] && $flag} {
                    break
                } elseif {[regexp {^T?BLAST[XNP]} $line]} {
                    set flag true
                }
                append blast "$line\n"
            }
            close $infh
        }
        return $blast
    }
    # private methods (are uppercase)
    method CreateIndex {} {
        if [catch {open $options(-filename) r} infh] {
            error "Cannot open $options(-filename): $infh"
        } else {
            $db layout positions {id md5 pos}
            $db layout top1 {id match score:I evalue}
            $db layout topX {id match score:I evalue}
            [$db view positions] as pvPositions
            [$db view top1] as pvTop1
            [$db view topX] as pvTopX
            set hit -1
            set prog [Progress %AUTO% -file $options(-filename)]
            while {[gets $infh line] >= 0} {
                if {[regexp {^T?BLAST[XNP]} $line]} {
                    set pos [tell $infh]
                    $prog progress $pos
                    incr pos [expr [string length $line] * -1 - 1]
                } elseif {[regexp {^Query= ([^\s]+)} $line -> id]} {
                    set md5 [md5pure::md5 $id]
                    $pvPositions append id $id md5 $md5 pos $pos
                } elseif {[regexp {No hits found} $line]} {
                    $pvTop1 append id $id match 0 score 0 evalue 1
                    set hit -1
                } elseif {[regexp {^Sequences producing significant alignments} $line]} { 
                    set hit 0 
                } elseif {[regexp {^>([^\s]+)(.+)} $line]} { 
                    set hit -1 ;
                } elseif {$hit >= 0 && [regexp {^([^\s]+).+\s([0-9]+)\s+([-e0-9\.]+)} $line -> matchID score evalue]} { 
                    incr hit 
                    regsub {^e} $evalue {1e} evalue
                    if {$hit == 1} {
                        $pvTop1 append id $id match $matchID score $score evalue $evalue
                    }
                    if {$hit <= $options(-hits)} {
                        $pvTopX append id $id match $matchID score $score evalue $evalue
                    }
                }


            }
            close $infh
            $db commit
         }
    }
 }

 # test
 proc test {} {
    source [file join [file dirname [info script]] Progress.tcl]
    BlastFile bfp -filename E:/links/project/goblet/data/goblet-results/20050209-143332--nrprot-test_blastp_nrprot
    #puts [bfp getBlast 27ccfed00cacd48a458a2057eec4e3c8 -md5 true]
    foreach x {0 1 2 3 4} {
        puts "ci010013000$x [bfp hasHit ci010013000$x]"
    }
    puts "27ccfed00cacd48a458a2057eec4e3c8 [bfp hasHit 27ccfed00cacd48a458a2057eec4e3c8 -md5 true]"
    puts [bfp getQueryIds]          
    puts [bfp getMatchIds ci0100130000]
 }

schlenk Those BLAST files are nice, but how about using the XML and/or ASN.1 output of the blastall program. Makes parsing some things easier, if you produce those instead of the plaintext version. (sadly the NCBI DTD's are all totally out of date and incorrect).


DDG Never used XML and ASN.1. XML seems to me to verbose for large files. But ASN.1 should be an urgent requirement. So an option -format asn.1|text might be required. May be we need then methods like getBlastASN1', getBlastText' where `getBlast' just forwards in dependence of the option -format.