snittype for parsing of Blast [L1 ] output files
package require snit 0.97 package require oomk package require md5pure snit::type BlastFile { # public options # BLAST resultfile option -filename "" # metakit indexfile optional argument # defaults to $options(-filename).mk option -indexfile "" # number of hits stored inside the indexfile option -hits 10 # private variables variable db # the views inside the database variable pvPositions variable pvTop1 variable pvTopX constructor {args} { $self configurelist $args if {$options(-filename) eq ""} { error "`-filename blastfile' argument is required" } if {$options(-indexfile) eq ""} { set options(-indexfile) $options(-filename).mk } set db [mkstorage %AUTO% $options(-indexfile)] if {[file size $options(-indexfile)] < 2 } { $self CreateIndex } else { [$db view positions] as pvPositions [$db view top1] as pvTop1 [$db view topX] as pvTopX } } destructor { $db close } # public methods (are lowercase) method getQueryIds {} { # returns a list with query ids # each item is a list with the # * query id # * a flag indicating if this query id has a hit #[$db view top1] as pvTop1 set res [list] $pvTop1 loop c { lappend res [array get c] } return $res } method getMatchIds {id args} { set arg(-md5) false array set arg $args set res [list] if {$arg(-md5)} { [$pvPositions select -exact md5 $id] as pSel if {[$pSel size] > 0} { set id [$pSel get 0 id] } else { return [list] } } [$pvTopX select -exact id $id] as pSel $pSel loop c { lappend res [array get c] } return $res } method hasHit {id args} { # return true or false # if we have BlastHits for a certain queryid set arg(-md5) false array set arg $args if {$arg(-md5)} { [$pvPositions select -exact md5 $id] as pSel if {[$pSel size] > 0} { set id [$pSel get 0 id] } else { return false } } [$pvTop1 select -exact id $id] as pSel if {[$pSel size] > 0} { set res [$pSel get 0 match] } else { set res 0 } if {$res eq "0"} { return false } else { return true } } method getBlast {id args} { set arg(-md5) false array set arg $args set blast "" if {$arg(-md5)} { [$pvPositions select -exact md5 $id] as pSel } else { [$pvPositions select -exact id $id] as pSel } if {[$pSel size] > 0} { set pos [$pSel get 0 pos] } else { return "" } if [catch {open $options(-filename) r} infh] { error "Cannot open $options(-filename): $infh" } else { seek $infh $pos set flag false while {[gets $infh line] >= 0} { if {[regexp {^T?BLAST[XNP]} $line] && $flag} { break } elseif {[regexp {^T?BLAST[XNP]} $line]} { set flag true } append blast "$line\n" } close $infh } return $blast } # private methods (are uppercase) method CreateIndex {} { if [catch {open $options(-filename) r} infh] { error "Cannot open $options(-filename): $infh" } else { $db layout positions {id md5 pos} $db layout top1 {id match score:I evalue} $db layout topX {id match score:I evalue} [$db view positions] as pvPositions [$db view top1] as pvTop1 [$db view topX] as pvTopX set hit -1 set prog [Progress %AUTO% -file $options(-filename)] while {[gets $infh line] >= 0} { if {[regexp {^T?BLAST[XNP]} $line]} { set pos [tell $infh] $prog progress $pos incr pos [expr [string length $line] * -1 - 1] } elseif {[regexp {^Query= ([^\s]+)} $line -> id]} { set md5 [md5pure::md5 $id] $pvPositions append id $id md5 $md5 pos $pos } elseif {[regexp {No hits found} $line]} { $pvTop1 append id $id match 0 score 0 evalue 1 set hit -1 } elseif {[regexp {^Sequences producing significant alignments} $line]} { set hit 0 } elseif {[regexp {^>([^\s]+)(.+)} $line]} { set hit -1 ; } elseif {$hit >= 0 && [regexp {^([^\s]+).+\s([0-9]+)\s+([-e0-9\.]+)} $line -> matchID score evalue]} { incr hit regsub {^e} $evalue {1e} evalue if {$hit == 1} { $pvTop1 append id $id match $matchID score $score evalue $evalue } if {$hit <= $options(-hits)} { $pvTopX append id $id match $matchID score $score evalue $evalue } } } close $infh $db commit } } } # test proc test {} { source [file join [file dirname [info script]] Progress.tcl] BlastFile bfp -filename E:/links/project/goblet/data/goblet-results/20050209-143332--nrprot-test_blastp_nrprot #puts [bfp getBlast 27ccfed00cacd48a458a2057eec4e3c8 -md5 true] foreach x {0 1 2 3 4} { puts "ci010013000$x [bfp hasHit ci010013000$x]" } puts "27ccfed00cacd48a458a2057eec4e3c8 [bfp hasHit 27ccfed00cacd48a458a2057eec4e3c8 -md5 true]" puts [bfp getQueryIds] puts [bfp getMatchIds ci0100130000] }
schlenk Those BLAST files are nice, but how about using the XML and/or ASN.1 output of the blastall program. Makes parsing some things easier, if you produce those instead of the plaintext version. (sadly the NCBI DTD's are all totally out of date and incorrect).