sqlite3 full text search example

JBR - 20101022

fts.tcl - index a directory tree of random docs.

I have been using swish-e and swish++ for a while but they are too slow and don't have very nice incremental index features. Here is a full text search system written in tcl using sqlite3's fts3 virtual tables. I'm a rank novice at sql and implemented the searchrank function as simply as possible so any suggestions are welcome. This entire program was written in 1 day, inspiration to wiki in 6 hours, enjoy.

The search system in configured with a configuration file similar to swish++. There are several high level commands, here are some example command lines:

 ./fts index myindex.conf
 ./fts search myindex.conf "search terms"

A little debugging output has been included and is selectable with a final arg to "fts index". The command gives a short summary if run with no args.

 john@tdc : ./fts
 fts: unknown subcommand ""


                fts index    <index.db>   <conf> <verb>                - index a set of directories indicated in <conf>

                    verbosity is a comma separated list of the message types insert, update, unindexed
                    , unchanged,exclude or a unique prefix of them.

                fts search   <index.db>  <query>                - seach the index for query
                fts excludes <index.db> <conf>                        - display the exclude patterns from <conf>
                fts filters  <index.db> <conf>                        - display the filter  patterns from <conf>
                fts docs                                        - display a table of documents in the index.

                fts.conf is the default config file.

fts.conf:

 set tmp /var/tmp

 stopwords stopwords

 filter *.bz2   bunzip2    -c %f > @%F
 filter *.gz    gunzip     -c %f > @%F
 filter *.Z     uncompress -c %f > @%F

 filter *.doc   catdoc    %f
 filter *.xls   xls2csv   %f
 filter *.ppt   catppt    %f
 filter *.prn   pdftotext %f -
 filter *.pdf   pdftotext %f -
 filter *.PS    ps2ascii  %f
 filter *.ps    ps2ascii  %f
 filter *.eps   ps2ascii  %f
 filter *.txt   cat       %f

 filter *.htm   lynx -dump %f
 filter *.html  lynx -dump %f

 filter */page/*        cat       %f

 exclude                *.dxf

 exclude                *.exe
 exclude                *.gif *.jpg *.png .jpeg
 exclude                *.mpeg *.mov .avi

 exclude                *.mf1 *.mf2
 exclude                *.sym
 exclude                *.zip

 exclude                *~

 exclude                *.files/

 index-path /data/cgijohn/paige/file
 index-path /data/cgijohn/paige/page
 index-path /data/wdocs/john/idoc-db
 index-path /data/wdocs/dfabricant/www-docs

fts.tcl:


#!/home/john/bin/tclkit8.6
#
 load /home/john/lib/libtclsqlite3.so

 set  verb QUIET
 proc verb { type message } { if { [regexp $::verb $type] } { puts "[format %10.10s $type] : $message" } }
 
 proc database { database } { set ::database $database }

 set excludes {}
 proc exclude  { args } {
    foreach pattern $args {
        lappend ::excludes $pattern
        if { [regexp {^\*\.[a-z0-9]+$} $pattern] } {
            lappend ::excludes [string toupper $pattern]
        }
    }
 }
 proc exclude? { file } {
    foreach pattern $::excludes {
        if { [string match $pattern $file] } { return 1 }
    }

    return 0
 }

 proc filter  { pattern args } {
    lappend ::filters $pattern $args
    if { [regexp {^\*\.[a-z0-9]+$} $pattern] } {
        lappend ::filters [string toupper $pattern] $args
    }
 }
 proc filter? { file } {
    foreach { pattern action } $::filters {
        if { [string match $pattern $file] } {
            if { [string first @%F $action] == -1 } {
                return [list [list [string map [list %f $file] $action]]]
            } else {
                set f $file
                set F [string map [list [string range $pattern 1 end] {}] $::tmp/[file tail $file]]
                set actions [list [string map [list %f $f @%F $F] $action] $F]

                if { [set action [filter? $F]] eq {} } {
                    return {}
                } else { 
                    return [list $actions {*}$action]
                }
            }
        }
    }

    return {}
 }
 proc filter! { actions } {
    set tmpfiles {}

    foreach action [lrange $actions 0 end-1] {
        foreach { action tmpfile } $action {}
        try { exec -ignorestderr {*}$action
        } on error message {
            puts $message
        }

        lappend tmpfiles $tmpfile
    }

    set indx {}
    set body {}
    try {
        set body [exec -ignorestderr {*}[lindex [lindex $actions end] 0]] 
    } on error message {
        puts $message
    } finally {
        if { $tmpfiles ne {} } { exec rm -f $tmpfiles }
    }

    return $body
 }

 proc stopwords { file } {
    foreach word [split [read [set fp [open $file]]]] {
        set ::stops($word) 1
    }
    close $fp
 }

 proc index-file { file } {
    if { ![llength [set actions [filter? $file]]] } {
        verb unindexed $file
        return
    }

    try {
        set xtime [file mtime $file]
    } on error message {
        puts $message
        return
    } 

    db eval { select mtime from documents where file = $file } {
        if { $mtime >= $xtime } {
            verb unchanged $file
            return
        }
    }

    set body [filter! $actions]
    set body [regsub -all {\m[-+]?([0-9]+\.?[0-9]*|\.[0-9]+)([eE][-+]?[0-9]+)?\M} $body { }]
    set body [regsub -all {[,<>?\\~!@#$%^&*()_+]} $body { }]

    foreach word [split $body] {
        if { [regexp {[[:lower:]]} $word] } {
            if { [string length $word] <= 3 } { continue }
            set word [string tolower $word]
        }
        if { [info exists ::stops($word)] } { continue }

        append indx $word " "
    }

    if { [db eval { select rowid from documents where file = @file }] eq {} } {
        verb insert $file
        db eval {
            begin  transaction  ;
            insert into documents  ( file, mtime ) values ( $file, $xtime ) ;
            insert into searchtext ( docid, body ) values ( last_insert_rowid(), $indx ) ;
            commit transaction
        }
    } else { 
        verb update $file
        db eval {
            begin  transaction  ;
            update documents set mtime = $xtime where file = $file ;
            update searchtext set body = @body
                    where docid = ( select rowid from documents where file = $file ) ;
            commit transaction
        } 
    }
 }

 proc index-path  { path } { lappend ::paths $path }
 proc index-path! { path } {
    foreach dir [glob -type d -directory $path -nocomplain *] {
        if { [exclude? $dir/] } { continue }

        index-path! $dir
    }
    foreach file [glob -type f -directory $path -nocomplain *] {
        if { [exclude? $file] } { continue }

        index-file  $file
    }
 }

 set command  [lindex $argv 0]
 set config   [lindex $argv 1]
 set query    [lrange $argv 2 end]

 if { $config eq {} } {
    puts "fts: no config file"
    exit 1
 } else {
    try { source $::config } on error message { puts "fts: $message"; exit 1 }
 }

 proc searchrank { matchinfo } {
    binary scan $matchinfo iiiii nphrase ncol 1 2 3
    return $1
 }

 switch $command {
  docs   -
  index  -
  search {
    sqlite3 db $database
    db enable_load_extension 1
    db function searchrank searchrank

    catch { db eval { create virtual table searchtext using fts3(tokenize=porter, body text); } }
    catch { db eval { create table documents ( file, mtime ) } }
    catch { db eval { create index on documents ( file ) } }
  }
  default {
    puts "fts: unknown subcommand \"$command\""
    puts {

                fts index    <conf> <verb>                - index a set of directories indicated in <conf>

                    verbosity is a comma separated list of the message types insert, update, unindexed
                    , unchanged,exclude or a unique prefix of them.

                fts search   <conf> <query>                - seach the index for query
                fts excludes <conf>                        - display the exclude patterns from <conf>
                fts filters  <conf>                        - display the filter  patterns from <conf>
                fts docs                                - display a table of documents in the index.

                fts.conf is the default config file.
        }
    exit 1
  } 
 }

 switch $command {
  excludes { foreach exclude $::excludes { puts $exclude } }
  filters  { foreach { pattern action } $::filters { puts "[format %8.8s $pattern] : $action" } }

  index  {
            if { [llength $argv] == 3 } { set ::verb ^(([string map { , .*)|( } [lindex $argv 2]].*))$ }

         set ::itime [file mtime $database]

            foreach path $::paths { index-path! $path }
  }
  search { 
    db eval { select docid, searchrank(matchinfo(searchtext)) as rank from searchtext
              where body match @query
               order by searchrank(matchinfo(searchtext)) desc; } {
        db eval { select rowid, file from documents where rowid = $docid ; } {
            puts "$rank        $file"
        }
    }
  }
  docs {
    puts "rowid        mtime        file"
    puts "-----        -----        ----"
    db eval { select rowid, file, mtime from documents } {
        puts "$rowid        $mtime        $file"
    }
  }
 }