Version 1 of Tcl chatroom snaphost history (2)

Updated 2001-09-26 12:11:14

MS

Here is a pure Tcl script that builds up the history of the chatroom. See also Larry Virden's ksh chatroom snaphost history. Note that Larry's script requires (ksh + lynx + sed + diff + egrep).

Notes:

  • the script will build daily snapshots of the chatroom activity
  • this may still be buggy ...
  • these scripts produces invalid html - it is missing the end-tags </BODY></HTML>. Most browsers are tolerant of this (I hope, at least netscape is ...)

A first version, which sometimes stutters and is slower, is at Tcl chatroom snaphost history; I left it there so that I can reconstruct my mistakes ... This version has the following improvements:

  • it does not need an auxiliary file
  • it does not stutter
  • it uses string ops instead of list ops, moving some processing from the script to [string]
  • it is better factored (findOverlap as new proc)
  • it has a default time setting (180 secs)
  • does not put time stamps - they proved to be a nuisance; there is now an independent robot that timestamps the chat every half hour
  • uses better regexps that reduce the cleanup time by a factor 10

 #! /usr/local/bin/tclsh
 # Author: Miguel Sofer < mailto:[email protected] >
 # Date:    Sept. 23, 2001
 # Version: 0.5 (remove redirects)
 # Adapted from Larry W. Virden's program (ksh+lynx+sed+diff+egrep)
 # Purpose: to scrape tcl'ers chat log pages and accumulate them
 # NOTE: This script is going to record private messages and memos to your
 #   id, so you should be careful to review the information before making it
 #   public.

 #############################################################
 # USAGE
 #############################################################
 #
 # First define the variables URL and chatPath in this section.
 #
 # Then call this program giving a time (in seconds) between checks,
 # it will run forever and update the history file periodically.
 #

 #
 #       Set URL to your chat information
 #
 set URL {http://mini.net/cgi-bin/chat.cgi?action=chat&name=miguel&password=I_WONT_TELL_YOU&updatefrequency=45&color=800000&new_msg_on_top=0&ls=&pause=#end}
 # To get this URL, use your web browser to visit the
 # chat room, and then check the URL information for the dialog frame.
 # WARNING: The URL contains your chat password - so this file needs to be
 # protected appropriately.

 # 
 # Set the directory for history files histFile_date.htm
 # 
 set chatPath /SCRAPE

 #############################################################

 set len [llength $argv]
 if {$len != 1} {
     if {$len} {
         error "USAGE: $argv0 seconds_between_snapshots"
     } else {
         set argv 180
     }
 }
 set time [expr {1000 * [lindex $argv 0]}]

 package require http

 proc repeatRecord {time} {
     recordData
     after $time repeatRecord $time
 }


 #
 # Build a regexp to clean up; this is very dependent on the
 # particular formatting of the chat program
 # The cleaned-up data has to start with a "real data" line
 #

 # match headers, up to <BODY ...> tag
 set re1 {^.*<BO[^>]*>}

 # match entered/left lines
 set re2 {<B>[^ :]* h[^R]*R>\n}

 # match trailers, after <A ...> tag
 set re3 {<A NAME.*}

 # remove URL redirection (hopefully this text does not appear in messages)
 set re4 {chat2.cgi\?action=gotourl&url=}

 set re "${re1}|${re2}|${re3}|${re4}"

 #
 # getData gets and cleans up the data from the chat.
 #
 proc getChat {} {
     set token {}
     if {[catch {set token [::http::geturl $::URL -timeout 30000]}]\
             || ([::http::status $token] != "ok")\
             || ([::http::ncode $token] != "200")} {
         ::http::cleanup $token
         return {}
     }
     regsub -all $::re [::http::data $token] {} data
     ::http::cleanup $token
     set data
 }


 #
 # the workhorse: record the data
 #
 variable oldData {} 

 proc recordData {} {
     variable oldData

     set data [getChat]
     set dataLen [string length $data]
     if {$dataLen} {
         set new $data
     } else {
         set new "<B>******* [clock format [clock seconds]]: ... no connection</B>\n<BR>\n"
     }

     set date [clock format [clock seconds] -format %y%m%d]
     set histFile [file join $::chatPath histfile_$date.htm]

     #
     # If there is no history file, start one with the current data
     #
     if {![file isfile $histFile]} {
         set hist [open $histFile w]
         puts $hist {<HTML><HEAD><STYLE TYPE="text/css"> </STYLE></HEAD><BODY BGColor=#ffffff>}
         puts $hist "<B>******* [clock format [clock seconds]]: START RECORDING</B>\n<BR>\n<BR>\n"
         puts -nonewline $hist $new
         close $hist
         set oldData $data
         return
     } 

     #
     # If old data is not in memory, get it from the history file;
     # only get as many bytes as you need
     #
     if {![string length $oldData]} {
         set hist [open $histFile]
         set oldData [read $hist]
        set oldData [string range $oldData end-[string length $data] end]
         close $hist
     }

     #
     # Record the new messages
     #
     set hist [open $histFile a]
     if {$dataLen} {
         set pos [findOverlap $oldData $data]
         if {$pos} {
             puts -nonewline $hist [string range $data $pos end]
         } else {
             puts $hist "<B>....... possibly missing data</B>\n<BR>\n"
             puts -nonewline $hist $data
         }
         set oldData $data
     } else {
         puts $hist $new
     }
     close $hist
 }


 #
 # This proc finds the new messages; specialized from
 #        http://mini.net/tcl/2184.html
 #
 proc findOverlap {str1 str2} {
     set first2 [string range $str2 0 [string first "\n" $str2]]
     set firstLen [string length $first2]

     while {1} {
         set index [string first $first2 $str1]
          if {$index == -1} {
             return 0
         }
         set str1 [string range $str1 $index end]
         set len [string length $str1]
         if {[string equal -length $len $str1 $str2]} {
             return $len
         }
         set str1 [string range $str1 $firstLen end]
     }
 }        

 repeatRecord $time
 vwait forever