Version 16 of char2ent

Updated 2005-01-08 11:56:10

Posted by LES on November 18, 2004

 #!/usr/local/bin/tclsh

 # char2ent.tcl - Opens an HTML or XML file with special characters 
 #                         (diacritics) written in plain text, replaces these special 
 #                         characters with appropriate HTML or XML entities and writes 
 #                        the output to a new file. 
 #
 # Author: Luciano Espirito Santo 
 #
 # History 
 # 
 #         Version 1.0        2004-11-18        Luciano Espirito Santo
 #                 First version. Alpha stage.
 #
 #                 KNOWN ISSUES: 
 #                - No user-proof measures, no error or exception handling, no nothing! 
 #                   No guarantees! Use it at your own risk!
 #                - Tested on Windows 98 only. Permission issues are likely to come up 
 #                  in other operating systems. Permission issues are harmless. The 
 #                  program will just not be able to read the input file and/or write 
 #                  to the output file.
 #
 #                TODO: 
 #                - Add the ability to do the INVERSE operation (that would include 
 #                  the ability to tell non-escaped entities from escaped entities and 
 #                  NOT replace the escaped entities).
 #                - Make it handle STDIN.
 #
 #                LICENSE: BSD
 #
 # How to use it:
 # 
 # char2ent.tcl  --help

 # ----------------------------------------------------------------
 # Do not change anything below this point unless you know what you're doing.


 # Print the usage text and exit when '--help' is the one and only argument.
 # $argc already holds the argument count, so it is compared directly: the
 # former [ llength $argc ] was always 1 (an integer is a one-element list),
 # which made the "only argument" part of the test a no-op.
 # [ string equal ] is used so the argument is always compared as a string.
 if { $argc == 1  &&  [ string equal [ lindex $argv 0 ] "--help" ] } {
     puts  ""
     puts  "char2ent, by Luciano Espirito Santo - 2004"
     puts  ""
     puts  {Usage: char2ent -[option]  "input file"  "output file"}
     puts  ""
     puts  "Possible options:"
     puts  "-h: convert special characters to HTML entities"
     puts  "-x: convert special characters to XML entities"
     puts  ""
     puts  {"input file" MUST exist}
     puts  {"output file" is created automatically if it does not exist}
     puts  {"input file" and "output file" MUST NOT be the same file}
     puts  ""
     puts  {Example: char2ent -x  "sample.xml"  "converted.xml"}
     puts  ""
     exit
 }

 # Reject malformed command lines: exactly 3 arguments are required
 # (option, input file, output file) and the option must be '-h' or '-x'.
 # Both conditions produce the same message, so they are tested together.
 # The complaint goes to stderr and the exit status is non-zero so that a
 # calling script can tell the run failed.
 if { $argc != 3  ||  [ lsearch -exact {-h -x} [ lindex $argv 0 ] ] < 0 } {
     puts  stderr  "Error! Try 'char2ent --help' to see how to use this program.\n"
     exit  1
 }

 # Complain and exit if the input file (second argument) does not exist.
 # Errors are written to stderr and reported through a non-zero exit status.
 if { ![ file exists [ lindex $argv 1 ] ] } {
     puts  stderr  "Error! File \"[ lindex $argv 1 ]\" not found!\n"
     exit  1
 }
 # Complain and exit if the input file exists but is not readable.
 if { ![ file readable [ lindex $argv 1 ] ] } {
     puts  stderr  "Error! Permission denied to read [ lindex $argv 1 ]!\n"
     exit  1
 }

 # Complain and exit if input file and output file are the same.
 # The names are normalized first so that different spellings of one path
 # (e.g. "a.xml" and "./a.xml") are recognised, and they are compared with
 # [ string equal ] because the expr operator == compares numeric-looking
 # names as numbers (it would treat "1" and "1.0" as the same file).
 if { [ string equal [ file normalize [ lindex $argv 1 ] ] [ file normalize [ lindex $argv 2 ] ] ] } {
     puts  stderr  "Error! \"input file\" and \"output file\" must not be the same.\n"
     exit  1
 }

 # Try to open the input file for reading; the channel is kept in myIF.
 # On failure the message from [ open ] is reported on stderr and the
 # program exits with a non-zero status.
 if { [ catch { set myIF [ open [ lindex $argv 1 ] r ] } myIFError ] } {
     puts  stderr  "Error! $myIFError\n"
     exit  1
 }
 # Try to open the output file for writing; the channel is kept in myOF.
 # On failure the already-open input channel is closed before reporting
 # the error on stderr and exiting with a non-zero status.
 if { [ catch { set myOF [ open [ lindex $argv 2 ] w ] } myOFError ] } {
     close  $myIF
     puts  stderr  "Error! $myOFError\n"
     exit  1
 }


 # ===============================================
 # Two files open. No errors  this far. Let's replace. 

 # Character/entity tables.
 # myEntityTable is a flat list of "code-point entity-name" pairs covering
 # the feminine/masculine ordinal indicators, the Latin-1 letters 192-255
 # (the multiplication and division signs 215 and 247 are not included) and
 # the OE ligatures and Y-diaeresis (338, 339, 376).
 # It is written in plain ASCII on purpose: the script then does not depend
 # on the encoding of its own source file, and neither literal special
 # characters nor literal entities can be mangled when the source is pasted
 # or rendered somewhere else.
 set  myEntityTable  {
     170 ordf     186 ordm
     192 Agrave   193 Aacute   194 Acirc    195 Atilde   196 Auml     197 Aring
     198 AElig    199 Ccedil   200 Egrave   201 Eacute   202 Ecirc    203 Euml
     204 Igrave   205 Iacute   206 Icirc    207 Iuml     208 ETH      209 Ntilde
     210 Ograve   211 Oacute   212 Ocirc    213 Otilde   214 Ouml
     216 Oslash   217 Ugrave   218 Uacute   219 Ucirc    220 Uuml     221 Yacute
     222 THORN    223 szlig
     224 agrave   225 aacute   226 acirc    227 atilde   228 auml     229 aring
     230 aelig    231 ccedil   232 egrave   233 eacute   234 ecirc    235 euml
     236 igrave   237 iacute   238 icirc    239 iuml     240 eth      241 ntilde
     242 ograve   243 oacute   244 ocirc    245 otilde   246 ouml
     248 oslash   249 ugrave   250 uacute   251 ucirc    252 uuml     253 yacute
     254 thorn    255 yuml
     338 OElig    339 oelig    376 Yuml
 }
 # Derive the three parallel lists used by the replacement code:
 #   myChars - the special characters themselves
 #   myHTML  - the named HTML entity for each character (e.g. &AElig;)
 #   myXML   - the numeric character reference for each character
 #             NOTE(review): decimal references are assumed here; the
 #             original numeric codes were not visible - confirm.
 # Element i of each list refers to the same character.
 set  myChars  {}
 set  myHTML   {}
 set  myXML    {}
 foreach  { myCode myName }  $myEntityTable  {
     lappend  myChars  [ format %c $myCode ]
     lappend  myHTML   "&$myName;"
     lappend  myXML    "&#$myCode;"
 }


 # Read the whole input file into memory; the input channel is not needed
 # afterwards, so it is closed right away.
 set  myText  [ read $myIF ]
 close  $myIF
 # Select the replacement list once instead of once per character.
 # The option was already validated to be '-h' or '-x' further up.
 switch  --  [ lindex $argv 0 ]  {
     "-h"    { set  myEntities  $myHTML }
     "-x"    { set  myEntities  $myXML }
 }
 # Build one "character entity character entity ..." map with lappend, so
 # it is a well-formed list whatever the elements contain, and apply it in
 # a single [ string map ] pass instead of one pass per character.
 set  myMap  {}
 foreach  myChar $myChars  myEntity $myEntities  {
     lappend  myMap  $myChar  $myEntity
 }
 # [ read ] kept the file's trailing newline, so none is added on output.
 puts -nonewline  $myOF  [ string map $myMap $myText ]
 close  $myOF
 exit

LES: Unfortunately, I can't make wikit display the numeric codes in the "set myXML" line... Jean-Claude?


RS Note that read includes the trailing newline; to write such a file-string out it is best to use puts -nonewline so you don't get an extra one. Also, string map is happy with long maps, so you can avoid the for loop by just coding

 set XMLmap {& &amp; < &lt; > &gt; ...}
 set HTMLmap {Ä &Auml; ...}
 ...
 set myMap $XMLmap
 ...
 set myText [string map $myMap $myText]

Category Application Category Dev. Tools Category XML Hello everyone This is a Translation service website which can supply machine translation! [L1 ] [L2 ] [L3 ] [L4 ] [L5 ]