HTML to DOM via http/htmlparse/struct::tree packages

This is some code to convert HTML data to a tDOM DOM.

Why

male: I needed to parse a web site pages, but the HTML provided by this web site is so malformed, that tDOM has no chance to parse these pages.

So I tried successfully to use htmlparse, which uses struct::tree to store the parsed data in. So I use the struct::tree routines to convert its data into a tDOM DOM tree.

The source below is tested in the context of the my needs to parse pages from a specific web site, but the used packages should allow to use this source a few more websites.

So I published the source here on the wiki.

Functionality

Using this code it is possible to:

  • specify an URL to download the HTML data from (following even redirections)
  • specify a path to a HTML file
  • specify already loaded/built/... HTML data

... to convert this HTML data into a tDOM DOM.

Syntax

Download of the HTML data via URL:

 html2dom converturl url

Where "url" points to a web resource of one the MIME types "text/html", "text/xhtml", "application/xhtml+xml".

The command returns the tDOM DOM object.

Loading the HTML data from a file:

 html2dom convertfile fileName

The command returns the tDOM DOM object.

Converting directly given HTML data:

 html2dom convertdata htmlData

The command returns the tDOM DOM object.

Dependencies

  • tcl 8.6 (because of the usage of try/error/finally - tcl 8.5, if try/error/finally would be eleminated)
  • http package (from within tcl)
  • tcllib pagckages
  • tDOM - the XML DOM package

The source

 package require http;
 package require uri;
 package require tdom;
 package require htmlparse;
 package require struct::tree;
 
 namespace eval html2dom {
     proc Attributes {data} {
         regsub -all {\s*=\s*} $data {=} data;

         set attributes  [dict create];
         set valueRanges [list];

         foreach {=> nameRange extValueRange valueRange} [regexp -inline -indices -all -- {([a-zA-Z]\w+)=(\"([^\"]*)\")} $data] {
             # emacs: "
             lappend valueRanges $extValueRange;

             set name    [string range $data {*}$nameRange];
             set value   [string range $data {*}$valueRange];

             dict set attributes $name [expr {$value eq "" ? $name : $value}];
         }

         foreach {=> nameRange valueRange} [regexp -inline -indices -all -- {([a-zA-Z]\w+)(?:=?(\S*))?} $data] {
             lassign $valueRange curFirst curLast;

             set forget 0;

             foreach range $valueRanges {
                 lassign $range first last;

                 if {$first <= $curFirst && $curLast <= $last} {
                     set forget 1;
                     break;
                 }
             }

             if {$forget} {
                 continue;
             }

             lappend valuesRange $valueRange;

             set name    [string range $data {*}$nameRange];
             set value   [string range $data {*}$valueRange];

             dict set attributes $name [expr {$value eq "" ? $name : $value}];
         }

         return $attributes;
     }

     proc Walk {tree parentNode node dom parentDomNode} {
         set type    [$tree get $node type];
         set hasData 0;

         if {[$tree keyexists $node data]} {
             set hasData 1;
         }

         if {![regexp {[a-zA-Z]\w*} $type]} {
             if {$hasData} {
                 set data [$tree get $node data];
             }

             $tree set $node data [concat $type $data];

             set type PCDATA;
         }

         if {$parentNode eq $node} {
             set domNode $parentDomNode;
         } else {
             if {$type eq "PCDATA"} {
                 set domNode [$dom createTextNode $type];

                 $domNode nodeValue [$tree get $node data];
                 $parentDomNode appendChild $domNode;

                 return;
             }

             set domNode [$dom createElement $type];

             $parentDomNode appendChild $domNode;
         }

         if {$hasData} {
             set data [$tree get $node data];

             if {$data ne ""} {
                 set attributes [Attributes $data];

                 if {$attributes ne ""} {
                     $domNode setAttribute {*}$attributes;
                 }
             }
         }

         foreach childNode [$tree children $node] {
             [lindex [info level 0] 0] $tree $node $childNode $dom $domNode;
         }

         return;
     }

     proc converturl {url} {
         http::config -accept "text/html, text/xhtml, application/xhtml+xml";
 
         set ncode 0;
 
         while {$ncode < 200 || $ncode >= 300} {
             # test if the url is valid
             #
             if {[catch {set token [http::geturl $url -validate 1];} reason options] == 1} {
                 http::config -accept "*/*";
 
                 return -options $options $reason;
             }
 
             # error and redirection handling
             #
             set ncode   [http::ncode $token];
             set meta    [http::meta $token];
 
             # error handling
             #
             if {[http::status $token] eq "error" || $ncode >= 400} {
                 if {[http::status $token] eq "error"} {
                     set reason "[http::error $token] ([http::code $token])";
                 } else {
                     set reason "[http::code $token]";
                 }
 
                 set errorCode [list $ncode [http::code $token] $meta];
 
                 http::cleanup $token;
                 http::config -accept "*/*";
 
                 return -code error -errorcode $errorCode $reason;
             }
 
             # redirection handling
             #
             if {$ncode >= 300} {
                 set url [uri::resolve $url [dict get [http::meta $token] Location]];
 
                 http::cleanup $token;
 
                 continue;
             }
 
             http::cleanup $token;
 
             # fetching the HTML data from the url
             #
             if {[catch {set token [http::geturl $url];} reason options] == 1} {
                 http::config -accept "*/*";
 
                 return -options $options $reason;
             }
 
             set htmlData [http::data $token];
 
             http::cleanup $token;
         }
 
         http::config -accept "*/*";
 
         # converting the HTML data to the tDOM DOM
         #
         if {[catch {set dom [convertdata $htmlData];} reason options] == 1} {
             return -options $options $reason;
         }
 
         set titleNode [[$dom documentElement] selectNodes {/html/head/title}];
 
         if {$titleNode ne "" && [regexp {[[:<:]]404[[:>:]]} [$titleNode text]]} {
             $dom delete;
 
             return -code error -errorcode [list 404 "HTTP/1.1 404 Not Found" $meta] "HTTP/1.1 404 Not Found"
         }
 
         return $dom;
     }
 
     proc convertfile {htmlFileName} {
         if {[catch {set dom [convertdata [tDOM::xmlReadFile $htmlFileName]];} reason options] == 1} {
             return -options $options $reason;
         }
 
         return $dom;
     }
 
     proc convertdata {htmlData} {
         try {
             # create a tree to be filled by htmlparse
             #
             set tree [struct::tree];
 
             # parse the HTML data and remove "unwanted" data
             #
             htmlparse::2tree $htmlData $tree;
             htmlparse::removeVisualFluff $tree;
             htmlparse::removeFormDefs $tree;
 
             # convert the HTML data tree into a tDOM DOM tree
             #
             set dom         [dom createDocument html];
             set domDoc      [$dom documentElement];
             set treeRoot    [$tree rootname];
 
             Walk $tree $treeRoot $treeRoot $dom $domDoc;
         } on error {reason options} {
             if {[info exists tree]} {
                 $tree destroy;
             }
 
             if {[info exists dom]} {
                 $dom delete;
             }
 
             return -options $options $reason;
         } finally {
             $tree destroy;
         }
 
         return $dom
     }
 
     namespace export -clear {[a-z]*};
     namespace ensemble create;
 }

The usage (an example)

 set url         http://www.nirsoft.net/utils/index.html;
 set dom         [html2dom converturl $url];
 set domDoc      [$dom documentElement];
 set baseNode    [$domDoc selectNodes {/html/head/base}];

 if {$baseNode eq ""} {
     set uri     [uri::split $url];
     set baseUrl [uri::join {*}[dict set uri path [file dirname [dict get $uri path]]]]/;
 } else {
     set baseUrl [$baseNode @href]/;
 }
 
 set utils [dict create];
 
 foreach sectionNode [$domDoc selectNodes {//table[@class="title2"]}] {
     set abbreviation    [[$sectionNode selectNodes {preceding-sibling::a}] @name];
     set title           [[$sectionNode selectNodes {.//text()[1]}] nodeValue];
     set descriptionNode [$sectionNode selectNodes {../span[@class="special3"]/text()}];
 
     if {$descriptionNode ne ""} {
         set description [$descriptionNode nodeValue];
     } else {
         set description "";
     }
 
     set filesTableNode      [$sectionNode selectNodes {../table[@class="filestable"]}];
     set section             [dict create abbr $abbreviation title $title desc $description utils [dict create]];
 
     puts section=$abbreviation;
 
     dict update section utils sectionUtils {
         foreach node [$filesTableNode selectNodes {.//node()[@class="filetitle"]/..}] {
             set longDescription [$node text];
             set anchorNode      [$node selectNodes {a[1]}];
 
             regexp {((?:\w+\s*)*\w+)(?:\s+v(\d+\.\d+))?(?: . (.+))?$} [$anchorNode text] => name version shortDescription;
 
             set relativeUrl [$anchorNode @href]
             set relativeUri [uri::split $relativeUrl];
             set absoluteUrl [uri::resolve $baseUrl $relativeUrl];
             set abbr        [file rootname [file tail [dict get $relativeUri path]]]
 
             dict set sectionUtils $abbr [dict create \
                 abbr        $abbr \
                 name        $name \
                 version     $version \
                 shortdesc   $shortDescription \
                 longdesc    $longDescription \
                 relurl      $relativeUrl \
                 absurl      $absoluteUrl \
             ];
 
             puts \tutil=$abbr
         }
     }
 
     dict set utils $abbreviation $section;
 }