# websearch.tcl -- # # Procedures to help automate searching of search engines. # # # Michael A. Cleverly, 1 Jan 2002 # # Distributed under the same terms as the Tcl core. package require Tcl 8.2 package require http 2.3 package provide websearch 0.1 namespace eval ::websearch:: { variable search# 0 variable subcommands variable entities array set subcommands { all_results {0 1 t "?keys?"} cleanup {0 0 f ""} configure {1 2 f "key ?value?"} more_p {0 0 t ""} next_result {0 1 t "?keys?"} curr_result {0 1 t "?keys?"} query {1 1 f "query"} reset {0 0 f ""} variables {0 0 f ""} } array set defaults { limit 100 timeout 120 useragent "Tcl websearch" queried? no } # These escapes borrowed from tcllib's htmlparse package array set escapes { lt < gt > amp & quot \" copy \xa9 reg \xae ob \x7b cb \x7d nbsp \xa0 } array set escapes { iexcl \xa1 cent \xa2 pound \xa3 curren \xa4 yen \xa5 brvbar \xa6 sect \xa7 uml \xa8 copy \xa9 ordf \xaa laquo \xab not \xac shy \xad reg \xae hibar \xaf deg \xb0 plusmn \xb1 sup2 \xb2 sup3 \xb3 acute \xb4 micro \xb5 para \xb6 middot \xb7 cedil \xb8 sup1 \xb9 ordm \xba raquo \xbb frac14 \xbc frac12 \xbd frac34 \xbe iquest \xbf Agrave \xc0 Aacute \xc1 Acirc \xc2 Atilde \xc3 Auml \xc4 Aring \xc5 AElig \xc6 Ccedil \xc7 Egrave \xc8 Eacute \xc9 Ecirc \xca Euml \xcb Igrave \xcc Iacute \xcd Icirc \xce Iuml \xcf ETH \xd0 Ntilde \xd1 Ograve \xd2 Oacute \xd3 Ocirc \xd4 Otilde \xd5 Ouml \xd6 times \xd7 Oslash \xd8 Ugrave \xd9 Uacute \xda Ucirc \xdb Uuml \xdc Yacute \xdd THORN \xde szlig \xdf agrave \xe0 aacute \xe1 acirc \xe2 atilde \xe3 auml \xe4 aring \xe5 aelig \xe6 ccedil \xe7 egrave \xe8 eacute \xe9 ecirc \xea euml \xeb igrave \xec iacute \xed icirc \xee iuml \xef eth \xf0 ntilde \xf1 ograve \xf2 oacute \xf3 ocirc \xf4 otilde \xf5 ouml \xf6 divide \xf7 oslash \xf8 ugrave \xf9 uacute \xfa ucirc \xfb uuml \xfc yacute \xfd thorn \xfe yuml \xff } # Convert into a form that we can use with [string map] foreach key [array names escapes] { set entities(&$key\;) $escapes($key) } } proc ::websearch::new {{search_engine google}} { variable search# if [catch { package require websearch::[string tolower $search_engine] }] { error "Do not know how to search \"$search_engine\"" } else { set command ::websearch::[incr search#] interp alias {} $command \ {} ::websearch::Search $search_engine \ [set search#] variable defaults variable [set search#] array set [set search#] [list search_engine $search_engine] array set [set search#] [array get defaults] array set [set search#] [array get ${search_engine}::defaults] return $command } } proc ::websearch::defaults {args} { variable defaults switch [llength $args] { 0 { return [array get defaults] } 1 { return $defaults([lindex $args 0]) } 2 { return [set $defaults([lindex $args 0]) [lindex $args 1]] } } error "Invalid syntax. Should be:\ websearch::defaults ?key? ?value?" } proc ::websearch::Search {search_engine search# command args} { variable subcommands if {![info exists subcommands($command)]} { error "Unknown option \"$command\". Should be one of:\ [join [lsort [array names subcommands]] ", "]." } foreach {min max queried_p parameters} $subcommands($command) break set explanation "::websearch::[set search#] $command $parameters" set argc [llength $args] if {$argc < $min} { error "Insufficient parameters. Should be:\ [string trim $explanation]" } if {$argc > $max} { error "Too many parameters. Should be:\ [string trim $explanation]" } if {[Boolean $queried_p]} { ::websearch::AssertQueryPerformed [set search#] } set cmd [list ::websearch::[string totitle $command] [set search#]] foreach arg $args { lappend cmd $arg } return [eval $cmd] } proc ::websearch::AssertQueryPerformed {search#} { variable [set search#] if {![info exists [set search#]]} { return -code error "Search not found!" } if {![Boolean [set [set search#](queried?)]]} { return -code error "No query has been performed yet!" } } proc ::websearch::Boolean {bool} { if {![string is boolean -strict $bool] && ![string is digit -strict $bool]} { return -code error "Could not interpret \"$bool\" as\ a boolean value." } switch -glob [string trimleft [string tolower $bool] 0] { "" - 0 - n* - of* - f* { return 0 } default { return 1 } } } proc ::websearch::Cleanup {search#} { variable [set search#] Reset [set search#] interp alias {} ::websearch::[set search#] {} unset [set search#] } proc ::websearch::Configure {search# key args} { variable [set search#] upvar 0 [set search#] token # Query a value if {[llength $args] == 0} { if {[info exists token($key)]} { return $token($key) } else { return } } # Set a value if {![string is word -strict $key]} { error "$key can be queried, but not set!" } set $token($key) [lindex $args 0] } proc ::websearch::More_p {search#} { variable [set search#] upvar 0 [set search#] token if {$token(@pos) < ($token(#results) - 1)} { return 1 } else { return 0 } } proc ::websearch::All_results {search# args} { variable [set search#] upvar 0 [set search#] token # save our current position set saved_pos $token(@pos) set token(@pos) -1 set data {} while {[More_p [set search#]]} { if {[llength $args] == 0} { lappend data [Next_result [set search#]] } else { lappend data [Next_result [set search#] [lindex $args 0]] } } # restore our previous position set token(@pos) $saved_pos return $data } proc ::websearch::Curr_result {search# args} { variable [set search#] upvar 0 [set search#] token # If we haven't gotten our first result, # then there is no "current" result if {$token(@pos) == -1} { return } # we'll cheat by decrementing our position and then # calling Next_result which will increment it back to where # we currently are at incr token(@pos) -1 if {[llength $args] == 0} { return [Next_result [set search#]] } else { return [Next_result [set search#] [lindex $args 0]] } } proc ::websearch::Next_result {search# args} { variable [set search#] upvar 0 [set search#] token upvar 0 ::websearch::$token(search_engine)::results results upvar 0 ::websearch::$token(search_engine)::variables variables # have we run out of results? if {$token(@pos) >= ($token(#results) - 1)} { return } incr token(@pos) set result [lindex $results([set search#]) $token(@pos)] switch [llength $args] { 0 { } 1 { set keys [lindex $args 0] } default { error "Should be impossible to get here." } } if {[info exists keys]} { foreach key $keys { set positions($key) [lsearch -exact $variables $key] } if {[llength $keys] == 1} { set data [lindex $result $positions($keys)] } else { foreach key $keys { lappend data [lindex $result $positions($key)] } } } else { foreach key $variables \ val [lindex $results([set search#]) $token(@pos)] { set array($key) $val } set data [array get array] } return $data } proc ::websearch::Query {search# query} { variable [set search#] upvar 0 [set search#] token Reset [set search#] return [::websearch::$token(search_engine)::Query \ [set search#] [UrlEncode $query]] } proc ::websearch::Reset {search#} { variable [set search#] upvar 0 [set search#] token if {[Boolean $token(queried?)]} { unset \ ::websearch::$token(search_engine)::results([set search#]) unset token(query) unset token(@pos) unset token(#results) set token(queried?) no } } proc ::websearch::Variables {search#} { variable [set search#] upvar 0 [set search#] token return [join [set ::websearch::$token(search_engine)::variables]] } proc ::websearch::GetUrl {search# url {headers ""}} { variable [set search#] upvar 0 [set search#] token array set config_old [http::config] array set config_new {-accept text/html,text/plain} foreach key [list proxyhost proxyport useragent] { if {[info exists token($key)]} { set config_new(-$key) $token($key) } } foreach key [array names config_old] { if {[info exists config_new($key)] && ![string equal $config_new($key) $config_old($key)]} { http::config $key $config_new($key) } } set timeout [expr {$token(timeout) * 1000}] set conn [http::geturl $url -headers $headers \ -timeout $timeout] set html [http::data $conn] http::cleanup $conn foreach key [array names config_new] { http::config $key $config_old($key) } return $html } proc ::websearch::PostUrl {search# url formvars {headers ""}} { variable [set search#] upvar 0 [set search#] token array set config_old [http::config] array set config_new {-accept text/html,text/plain} foreach key [list proxyhost proxyport useragent] { if {[info exists token($key)]} { set config_new(-$key) $token($key) } } foreach key [array names config_old] { array set config_new {-accept text/html,text/plain} foreach key [list proxyhost proxyport useragent] { if {[info exists token($key)]} { set config_new(-$key) $token($key) } } foreach key [array names config_old] { if {[info exists config_new($key)] && ![string equal $config_new($key) $config_old($key)]} { http::config $key $config_new($key) } } set timeout [expr {$token(timeout) * 1000}] set conn [http::geturl $url -headers $headers \ -query $formvars \ -timeout $timeout] set html [http::data $conn] http::cleanup $conn foreach key [array names config_new] { http::config $key $config_old($key) } return $html } proc ::websearch::UrlEncode {string} { set encoded "" foreach char [split $string ""] { if {[regexp {[A-Za-z0-9]} $char]} { append encoded $char } else { scan $char %c ascii append encoded %[format %x $ascii] } } return $encoded } proc ::websearch::StripHtml {html} { variable entities regsub -all -- {<[^>]+>} $html "" text set text [string map [array get entities] $text] regsub -all -- {&[^ ]+;} $text "" text regsub -all -- {\s+} $text " " text return $text }