Version 10 of char2ent

Updated 2004-11-18 13:19:02 by lwv

Posted by LES on November 18, 2004

 #!/usr/local/bin/tclsh

 # char2ent.tcl - Opens an HTML or XML file with special characters 
 #                         (diacritics) written in plain text, replaces these special 
 #                         characters with appropriate HTML or XML entities and writes the 
 #                        output to a new file. 
 #
 # Author: Luciano Espirito Santo 
 #
 # History 
 # 
 #         Version 1.0        2004-11-18        Luciano Espirito Santo
 #                 First version. Alpha stage.
 #
 #                 KNOWN ISSUES: 
 #                - No user-proof measures, no error or exception handling, no nothing! 
 #                   No guarantees! Use it at your own risk!
 #                - Tested on Windows 98 only. Permission issues are likely to come up 
 #                  in other operating systems. Permission issues are harmless. The 
 #                  program will just not be able to read the input file and/or write to 
 #                  the output file.
 #
 #                TODO: 
 #                - Add the ability to do the INVERSE operation (that would include the 
 #                  ability to tell non-escaped entities from escaped entities and NOT 
 #                  replace the escaped entities).
 #                - Make it handle STDIN.
 #
 #                LICENSE: BSD
 #
 # How to use it:
 # 
 # char2ent.tcl  --help

 # ----------------------------------------------------------------
 # Do not change anything below this point unless you know what you're doing.


 # Print the usage text and exit when '--help' is the only argument.
 #
 # $argc is a plain integer, so it is compared directly. [ llength $argc ]
 # is 1 for every argument count and therefore never restricted this check
 # to the single-argument case.
 if          { $argc == 1  &&  [ string equal [ lindex $argv 0 ] "--help" ] }          {
         # One list element per output line; {} prints an empty line.
         foreach  myHelpLine  {
                 {}
                 {char2ent, by Luciano Espirito Santo - 2004}
                 {}
                 {Usage: char2ent -[option]  "input file"  "output file"}
                 {}
                 {Possible options:}
                 {-h: convert special characters to HTML entities}
                 {-x: convert special characters to XML entities}
                 {}
                 {"input file" MUST exist}
                 {"output file" is created automatically if it does not exist}
                 {"input file" and "output file" MUST NOT be the same file}
                 {}
                 {Example: char2ent -x  "sample.xml"  "converted.xml"}
                 {}
         }  {
                 puts  $myHelpLine
         }
         exit
 }

 # Validate the command line before any file is opened.
 #
 # Every failure is reported on stderr and ends the program with exit
 # status 1, so that calling scripts can detect the error. The message
 # texts themselves are unchanged.

 # Complain and exit unless there are exactly 3 arguments (option, input,
 # output) and the option is either '-h' or '-x'.
 set  myOption  [ lindex $argv 0 ]
 if          { $argc != 3  ||  ( ! [ string equal $myOption "-h" ]  &&  ! [ string equal $myOption "-x" ] ) }          {
         puts  stderr  "Error! Try 'char2ent --help' to see how to use this program.\n"
         exit  1
 }

 # Complain and exit if input file does not exist 
 if          {! [ file exists [ lindex $argv 1 ] ] }          {
         puts  stderr  "Error! File \"[ lindex $argv 1 ]\" not found!\n"
         exit  1
 }

 # Complain and exit if input file is not readable 
 if          {! [ file readable [ lindex $argv 1 ] ] }          {
         puts  stderr  "Error! Permission denied to read [ lindex $argv 1 ]!\n"
         exit  1
 }

 # Complain and exit if input file and output file are the same.
 # 'string equal' compares the names as text; the expr operator '=='
 # compares numerically when both operands look like numbers, so the
 # distinct file names "1" and "1.0" would be reported as the same file.
 if          { [ string equal [ lindex $argv 1 ] [ lindex $argv 2 ] ] }          {
         puts  stderr  "Error! \"input file\" and \"output file\" must not be the same.\n"
         exit  1
 }

 # Open both files. A failure is reported on stderr and ends the program
 # with exit status 1 so that callers can tell the conversion did not run.

 # Try to open input file for reading. 
 # Complain and exit in case of errors. 
 if          { [ catch { set myIF [ open [ lindex $argv 1 ] r ] } myIFError ] }  {
         puts  stderr  "Error! $myIFError\n"
         exit  1
 }

 # Try to open output file for writing. 
 # Complain, close input file and exit in case of errors. 
 if          { [ catch { set myOF [ open [ lindex $argv 2 ] w ] } myOFError ] }  {
         close  $myIF
         puts  stderr  "Error! $myOFError\n"
         exit  1
 }


 # ===============================================
 # Two files open. No errors so far. Let's replace. 

 # Special characters to be replaced, one list element per character.
 # The element at index N is replaced by the element at index N of the
 # entity table selected by the command-line option, so the order and the
 # number of elements (67) must stay in step with those tables.
 # Every element must be a single character: a multi-letter stand-in such
 # as "OE" or a plain "Y" would also rewrite ordinary text.
 set  myChars  {
         ª        º        À        Á        Â        Ã        Ä        Å        Æ        Ç        È        É        Ê        Ë        Ì        Í        Î        Ï        Ð        
         Ñ        Ò        Ó        Ô        Õ        Ö        Ø        Ù        Ú        Û        Ü        Ý        Þ        ß        à        á        â        ã        ä        
         å        æ        ç        è        é        ê        ë        ì        í        î        ï        ð        ñ        ò        ó        ô        õ        ö        ø        
         ù        ú        û        ü        ý        þ        ÿ        Œ        œ        Ÿ
 }

 # HTML named character entities used with the '-h' option.
 # The element at index N replaces the special character at index N of the
 # character table, so this list must keep the same order and length (67).
 # Entity names are case-sensitive: the ligatures are spelled &AElig; and
 # &OElig;.
 set  myHTML  {
         &ordf;        &ordm;        &Agrave;        &Aacute;        &Acirc;        &Atilde;        &Auml;        
         &Aring;        &AElig;        &Ccedil;        &Egrave;        &Eacute;        &Ecirc;        &Euml;        
         &Igrave;        &Iacute;        &Icirc;        &Iuml;        &ETH;        &Ntilde;        &Ograve;        
         &Oacute;        &Ocirc;        &Otilde;        &Ouml;        &Oslash;        &Ugrave;        &Uacute;        
         &Ucirc;        &Uuml;        &Yacute;        &THORN;        &szlig;        &agrave;        &aacute;        
         &acirc;        &atilde;        &auml;        &aring;        &aelig;        &ccedil;        &egrave;        
         &eacute;        &ecirc;        &euml;        &igrave;        &iacute;        &icirc;        &iuml;        &eth;        
         &ntilde;        &ograve;        &oacute;        &ocirc;        &otilde;        &ouml;        &oslash;        
         &ugrave;        &uacute;        &ucirc;        &uuml;        &yacute;        &thorn;        &yuml;        
         &OElig;        &oelig;        &Yuml;
 }

 # XML numeric character references used with the '-x' option.
 # The element at index N replaces the special character at index N of the
 # character table, so this list must keep the same order and length (67).
 # Each reference is the decimal Unicode code point of its character.
 set  myXML  {
         &#170;        &#186;        &#192;        &#193;        &#194;        &#195;        &#196;        &#197;        &#198;        
         &#199;        &#200;        &#201;        &#202;        &#203;        &#204;        &#205;        &#206;        &#207;        
         &#208;        &#209;        &#210;        &#211;        &#212;        &#213;        &#214;        &#216;        &#217;        
         &#218;        &#219;        &#220;        &#221;        &#222;        &#223;        &#224;        &#225;        &#226;        
         &#227;        &#228;        &#229;        &#230;        &#231;        &#232;        &#233;        &#234;        &#235;        
         &#236;        &#237;        &#238;        &#239;        &#240;        &#241;        &#242;        &#243;        &#244;        
         &#245;        &#246;        &#248;        &#249;        &#250;        &#251;        &#252;        &#253;        &#254;        
         &#255;        &#338;        &#339;        &#376;
 }


 # Replace every special character of the input with its entity and write
 # the result to the output file.

 # The command-line option selects the entity table once, up front.
 switch  --  [ lindex $argv 0 ]          {
         "-h"        { set  myEntities  $myHTML }
         "-x"        { set  myEntities  $myXML }
 }

 # Build one flat {character entity character entity ...} map by pairing
 # the elements of the two tables index by index.
 set  myMap  {}
 foreach  myChar  $myChars  myEntity  $myEntities          {
         lappend  myMap  $myChar  $myEntity
 }

 # The input is fully read here, so its channel can be closed right away.
 set  myText  [ read $myIF ]
 close  $myIF

 # A single 'string map' call substitutes all pairs in one pass over the
 # text, so text produced by one substitution is never substituted again.
 # 'read' keeps the file's trailing newline, hence 'puts -nonewline'.
 puts -nonewline  $myOF  [ string map  $myMap  $myText ]
 close  $myOF

 exit

LES: Unfortunately, I can't make wikit display the numeric codes in the "set myXML" line... Jean-Claude?


RS Note that read includes the trailing newline; to write such a file-string out it is best to use puts -nonewline so you don't get an extra one. Also, string map is happy with long maps, so you can avoid the for loop by just coding

 set XMLmap {& &amp; < &lt; > &gt; ...}
 set HTMLmap {Ä &Auml; ...}
 ...
 set myMap $XMLmap
 ...
 set myText [string map $myMap $myText]

Category Application Category Dev. Tools Category XML