A little Slashdot comment parser

Ro started this page on July 8th, 2003

This parses Slashdot (http://slashdot.org/) comments. It reads in the HTML source of a comment page that was saved in "nested" display mode.

  #!/usr/bin/tclsh

  # @ij -- return a slice of a string held in the caller's scope.
  #
  # Arguments:
  #   c_name  name of the caller's variable that holds the string.
  #   i_j     two-element list: first and last character index (inclusive),
  #           in the form produced by regexp -indices.
  #
  # Results:
  #   The characters of that string from the first index through the last.
  proc @ij {c_name i_j} {
    upvar $c_name text
    set first [lindex $i_j 0]
    set last  [lindex $i_j 1]
    return [string range $text $first $last]
  }
  # =@ -- parse every comment out of a "nested" Slashdot comment page.
  #
  # Arguments:
  #   c         HTML source of the comment page.
  #   arr_name  name of an array in the caller's scope that receives the result.
  #
  # Results:
  #   For each comment matched, its id is appended to the array element
  #   "comments" and these elements are set, keyed by comment id:
  #     <cid>,title  <cid>,body  <cid>,mod_num  <cid>,mod_type  <cid>,parent
  #     <cid>,u_slash_home  <cid>,u_name  <cid>,u_id  <cid>,time
  #   A field whose pattern does not match is stored as the string "no"
  #   (mod_num instead receives the whole score text).
  #
  # Side effects:
  #   Every parsed comment is dumped to stdout, followed by a separator line.
  proc =@ {c arr_name} {
    upvar $arr_name v

    # One comment block: anchor id, title, score, header lines, body, trailer links.
    set comment_re {<TR><TD BGCOLOR="#CCCCCC">\s+?<FONT SIZE="3" COLOR="#000000">\s+<A NAME="(\d+)"><B>(.+?)</B></A> \(Score\:(.+?)\)\s+</FONT>\s+<BR>(.+?)\s+</TD></TR>\s+<TR><TD>\s+(.+?)\s+</TD></TR>\s+<TR><TD>\s+<FONT SIZE="2">.+?\[(.+?)\]}

    set n 0
    while {[regexp -indices -start $n $comment_re $c MATCH i_cid i_title i_rating i_header i_body i_trailer]} {
      # regexp -indices yields index pairs; turn each i_<field> pair into the
      # matched text stored in a local variable named <field>.
      foreach field {cid title rating header body trailer} {
        set $field [@ij c [set i_$field]]
      }

      lappend v(comments) $cid
      set v($cid,title) $title
      set v($cid,body) $body

      # Score text is either "N, Type" or just "N".
      if {![regexp {(.+)\, (.+)} $rating -> v($cid,mod_num) v($cid,mod_type)]} {
        set v($cid,mod_num) $rating
        set v($cid,mod_type) no
      }

      # Comments without a Parent link in the trailer get parent "no".
      if {![regexp {(\d+)">Parent</A>} $trailer -> v($cid,parent)]} {
        set v($cid,parent) no
      }

      # First header line carries the author link, second one the timestamp.
      set header_lines [split $header \n]
      if {![regexp {by <A HREF="(.+?)">(.+?) \((\d+)\)</A> } [lindex $header_lines 0] -> v($cid,u_slash_home) v($cid,u_name) v($cid,u_id)]} {
        foreach el {u_slash_home u_name u_id} {set v($cid,$el) no}
      }
      # Default the timestamp like every other field so that reading
      # v(<cid>,time) never fails on an unset element.
      if {![regexp {on (.+?) \(} [lindex $header_lines 1] -> v($cid,time)]} {
        set v($cid,time) no
      }

      # Resume the search just past the end of this match.
      set n [expr {[lindex $MATCH 1] + 1}]

      foreach el [lsort [array names v $cid,*]] {
        puts [format {%20s  %s} $el $v($el)]
      }
      puts ---------------------
      flush stdout
    }
  }

  # Read the saved comment page into c, parse it into the global array v,
  # then report completion.
  set chan [open nested_comments.html]
  set c [read $chan]
  close $chan
  unset chan
  =@ $c v
  puts done!

The output looks like this. Everything is stored in an array, so you can do anything you want with the comments:

  ...
          6372291,body  Actually: Mass Destruction + Stupidity = Globalization... or something<nobr> <wbr></nobr>;o)
       6372291,mod_num  1
      6372291,mod_type  no
        6372291,parent  6372196
          6372291,time  Saturday July 05, @10:58AM
         6372291,title  Re:Encryption...
          6372291,u_id  681945
        6372291,u_name  darth_silliarse
  6372291,u_slash_home  //slashdot.org/~darth_silliarse
  ---------------------
          6373621,body  You call that a troll?<BR><BR>You moderators need to get out of the house a bit too...watch that sunlight, it'll do your head in if you're not expecting it.<BR><BR>
       6373621,mod_num  2
      6373621,mod_type  no
        6373621,parent  6372196
          6373621,time  Saturday July 05, @05:02PM
         6373621,title  Re:Encryption...
          6373621,u_id  601553
        6373621,u_name  ShieldW0lf
  6373621,u_slash_home  //slashdot.org/~ShieldW0lf
  ....

Enjoy!


PT 8-Jul-2003: This wins my vote for the Obfuscated Tcl contest. '=@' eeeewwww! :)