[WJR] - There was a discussion on the [TclHttpd] mailing list about processing [RSS]. Here's a variant I came up with using [tDOM] and [The Tcler's Wiki] RSS feed (it should work with any RSS feed):
[Doc_Dynamic]
[
package require http
package require tdom
html::set token [http::geturl http://wiki.tcl.tk/rss.xml]
html::set rss_xml [http::data $token]
html::set doc [dom parse $rss_xml]
html::set root [$doc documentElement]
html::set stories [$root selectNodes /rss/channel/item]
]
[html::head {RSS Example}]
[html::bodyTag]
[html::h1 {RSS Example}]
[html::foreach story $stories {
-
[[$story selectNodes title/text()] nodeValue]
([[$story selectNodes pubDate/text()] nodeValue])
- [[$story selectNodes description/text()] nodeValue]
}]
[html::end]
[tDOM] and [TclHttpd] makes this pretty simple!
----
[DG] -- That little starter script got me going. I was messing with [TclXML] at first, but [tDOM] is a whole lot easier to use. See it in action [http://www.pobox.com/~davygrvy/news.tml]
# tmlrss.tcl --
#
# Process RSS (0.91, 0.92, 0.93, 1.0, 2.0) newsfeeds into
# 4.01 HTML. For use in .tml templates with tclhttpd.
#
# http://www.xml.com/pub/a/2002/12/18/dive-into-xml.html
# http://blogs.law.harvard.edu/tech/rss
# http://www.tdom.org/
#
# Instructions:
# 1) place this file in your tml library directory so
# the server sources it automatically at startup.
# 2) Call TmlRss_GenHTML with the url of the RSS feed
# from your .tml file (set to dynamic generation).
# This function returns the formatted HTML of the feed.
#
# By David Gravereaux
#
package provide tmlrss 0.2
package require httpd
package require http
package require html
package require uri
if {[catch {package require tdom} err]} {
eval [subst {proc TmlRss_GenHTML {uri {attr {}}} {
return ""}
}]
return -code error $err
}
namespace eval tmlrss {
variable rss_xml_cache
array set rss_xml_cache {}
}
# TmlRss_GenHTML --
#
# Generates 4.01 HTML given a URI to an RSS feed
#
# uri - uri of rss feed
# attr - additional attributes for the table (optional)
#
proc TmlRss_GenHTML {uri {attr {}}} {
append html "\n"
if {[catch {
set doc [tmlrss::GetDOM $uri]
append html [tmlrss::genTitleBlock $doc]
append html [tmlrss::genContent $doc]
} err]} {
append html "$uri | [tmlrss::encTxt $err] |
"
}
append html "
\n"
catch {$doc delete}
return $html
}
proc tmlrss::GetDOM {uri} {
# Is it cached locally?
if {[isCachedXMLExpired $uri]} {
set doc [fetchXML $uri]
} else {
set doc [dom parse -baseurl [uriBase $uri] $rss_xml_cache($uri)]
}
return $doc
}
# returns the DOM object of the RSS feed.
proc tmlrss::fetchXML {uri} {
variable rss_xml_cache
set token [http::geturl $uri]
if {[http::status $token] != "ok" || [http::ncode $token] != 200} {
set err [http::code $token]
http::cleanup $token
return -code error $err
}
set xml [http::data $token]
upvar #0 $token state
array set meta $state(meta)
http::cleanup $token
set doc [dom parse -baseurl [uriBase $uri] $xml]
# Append a download time to the DOM as a comment.
set comment [$doc createComment \
"downloaded: [clock format [clock seconds] -format {%a, %e %b %Y %T GMT} -gmt 1]"]
[$doc documentElement] appendChild $comment
# Append when the server considers it expired to the DOM.
catch {
set comment [$doc createComment "expires: $meta(Expires)"]
[$doc documentElement] appendChild $comment
}
# Save it in the cache.
set rss_xml_cache($uri) [$doc asXML -indent 4]
return $doc
}
proc tmlrss::isCachedXMLExpired {uri} {
variable rss_xml_cache
#TODO: make this work
if {[info exist rss_xml_cache($uri)]} {
set xml $rss_xml_cache($uri)
set doc [dom parse -baseurl [uriBase $uri] $xml]
# TODO: Is it past the TTL (if supported)?
# just return yes, for now.
if {1} {
$doc delete
unset rss_xml_cache($uri)
return true
}
return false
} else {
return true
}
}
proc tmlrss::uriBase {uri} {
array set info [uri::split $uri]
set info(path) [file dirname $info(path)]
return [eval uri::join [array get info]]
}
proc tmlrss::getRSSVersion {doc} {
set root [$doc documentElement]
switch [$root nodeName] {
rss {
if {[$root hasAttribute version]} {
return [$root getAttribute version]
}
# Best guess as most stuff is optional...
return 0.92
}
rdf:RDF {
return 1.0
}
default {
return 0
}
}
}
proc tmlrss::genTitleBlock {doc} {
set root [$doc documentElement]
append html "\n\n"
switch [getRSSVersion $doc] {
0.91 - 0.92 - 0.93 - 2.0 {
set titleXpath {/rss/channel/title/text()}
set linkXpath {/rss/channel/link/text()}
set imgNodeXpath {/rss/channel/image/title}
set imgTitleXpath {/rss/channel/image/title/text()}
set imgLinkXpath {/rss/channel/image/url/text()}
set imgWidthXpath {/rss/channel/image/width/text()}
set imgHeightXpath {/rss/channel/image/height/text()}
}
1.0 {
set titleXpath {/rdf:RDF/*[local-name()='channel']/*[local-name()='title']/text()}
set linkXpath {/rdf:RDF/*[local-name()='channel']/*[local-name()='link']/text()}
set imgNodeXpath {/rdf:RDF/*[local-name()='image']}
set imgTitleXpath {/rdf:RDF/*[local-name()='image']/*[local-name()='title']/text()}
set imgLinkXpath {/rdf:RDF/*[local-name()='image']/*[local-name()='url']/text()}
set imgWidthXpath {/rdf:RDF/*[local-name()='image']/*[local-name()='width']/text()}
set imgHeightXpath {/rdf:RDF/*[local-name()='image']/*[local-name()='height']/text()}
}
}
append html "[nodeTxt $root $titleXpath] | \n"
if {[$root selectNode $imgNodeXpath] != ""} {
append html " | \n"
}
append html "
\n\n"
return $html
}
proc tmlrss::genContent {doc} {
set root [$doc documentElement]
append html "\n"
switch [getRSSVersion $doc] {
0.91 - 0.92 - 0.93 - 2.0 {
set storiesXpath {/rss/channel/item}
set titleXpath {title/text()}
set linkXpath {link/text()}
set pubDateXpath {pubDate/text()}
set descXpath {description/text()}
}
1.0 {
set storiesXpath {/rdf:RDF/*[local-name()='item']}
set titleXpath {*[local-name()='title']/text()}
set linkXpath {*[local-name()='link']/text()}
set pubDateXpath {*[local-name()='pubDate']/text()}
set descXpath {*[local-name()='description']/text()}
}
}
set stories [$root selectNodes $storiesXpath]
append html [html::foreach story $stories {
[nodeTxt $story $descXpath]
|
}]
append html "\n"
return $html
}
proc tmlrss::encUri {uri} {
return [string map { & %26 } $uri]
}
proc tmlrss::encTxt {txt} {
return [string map { & & < < > > } $txt]
}
proc tmlrss::nodeUri {node xpath} {
if {[$node selectNode $xpath] != ""} {
# Only if there is a lonely &, quote it back to an entity.
return [encUri [[$node selectNode $xpath] nodeValue]]
} else {
return ""
}
}
# TODO: there's a big problem here... Sometimes feeds include entities (ie. )
# that are outside the range of the claimed charset. Sometimes feeds include
# URIs, but they aren't properly encoded..
#
# I tried ::htmlparse::mapEscapes, but for lonely ampersands that aren't encoded
# to entities ('&' not '&'), ::htmlparse::mapEscapes mangles them and the four
# chars that follow.
#
# TODO: possible solution would be to pre encode lonely ampersands, then
# ::htmlparse::mapEscapes ??? But how to fix the bad cp1252 entities when the
# charset is being claimed as iso8859-1 in the XML header?
# http://www.cs.tut.fi/~jkorpela/www/windows-chars.html#list
#
proc tmlrss::nodeTxt {node xpath} {
if {[$node selectNode $xpath] != ""} {
return [[$node selectNode $xpath] nodeValue]
} else {
return ""
}
}
proc tmlrss::ShutDown {} {
}
Httpd_RegisterShutdown tmlrss::ShutDown
A .tml file to call it would look something like this:
[Doc_Dynamic]
SNews, you lose!
[html::foreach rss_source {
http://www.npr.org/rss/rss.php?topicId=2
http://rss.news.yahoo.com/rss/highestrated
http://wiki.tcl.tk/rss.xml
http://sourceforge.net/export/rss2_projsummary.php?group_id=10894
http://dwlt.net/tapestry/dilbert.rdf
} {[TmlRss_GenHTML $rss_source "border=1"]}]
[WJR] - Nice, works great on my system! You should consider making this one of TclHttpd's sample apps (a number of apps come with the distribution in the sampleapp subdir).
[DG] ''Thanks..'' It's almost done. I had some problems with RSS v1.0, but just got them fixed. XML namespaces are no fun.
[Carsten Zerbst] I have problems with encoding with some malformed files, e.g. by the main german tv magazin http://www.tagesschau.de/newsticker.rdf.
I got rid off this changing the code a bit
# write to an temporary file
set fd [ open tmp.xml w]
puts $fd [http::data $token]
close $fd
upvar #0 $token state
array set meta $state(meta)
http::cleanup $token
set doc [ dom parse -baseurl [uriBase $uri] -channel [tDOM::xmlOpenFile tmp.xml] ]
file delete tmp.xml
It works, but has the disadvantage of needing a temporary file. But it was too late to modify tDOM:xmlOpenFile to work on strings.
[DG] -- See [XML/tDOM encoding issues with the http package]
----
[XO] - As pointed out by [Keith Vetter] in [Following Redirects], "One feature lacking
in the http package is the ability to automatically handle redirects". A '''geturl_followRedirects''' routine was used to solve the issue.
As of now (2006/03/04), the first (NPR) & the last (Dilbert) RSS feed in the above sample template have been redirected. Replacing '''''http::geturl $uri''''' in line#4 of tmlrss::fetchXML routine, by '''''geturl_followRedirects $uri''''' will fix the problem.
BTW, when I ran the script, I got an error on the second (Yahoo News) & the 4th (SourceForge) RSS feed, the error says something like ...
''error "syntax error" at line 1 character 0 "1 <--Error-- a7f