Chinlish

Richard Suchenwirth 2000-04-10 - Chinlish ("Chinese from English (alphabet)") is a converter that translates a number of Chinese words written in Pinyin transcription (exception: use y for u-umlaut) to the corresponding Unicode characters. In contrast to the other members of The Lish family, this is (and can only be) a partial solution: some 4000 .. 6000 Chinese characters share only 400 Pinyin syllables, so even with context the mapping from Pinyin back to characters cannot be perfect. So if you use Chinlish, two things might happen:

  • The word you wanted was not in the dictionary. The Pinyin string will come back unchanged. Remedy: if you need it more than once, add it to the dictionary (and put it on the Wiki, for the rest of us ;-). For single occurrences, use the \u notation, which always works.
  • The word you wanted was not the one retrieved, e.g. you wanted a different shi than the most frequent one, the copula. Again: edit the dictionary, if you feel the need, or write \u....

Simplified (PRC) and traditional character forms (Hong Kong, Taiwan) are supported. Default, and recommended for dictionary entries, is Simplified. If you call the main proc chinlish (or the short name cn) with the -fan switch, traditional codes are substituted where appropriate, i.e. as defined in the i18n_jian2fan list. If you add words with jian/fan characters to the dictionary, update the jian2fan map also. Maybe some database can be found that completely covers this mapping, but I was offline over the weekend ;-) - RS 2007-09-04: many years later, a more comprehensive jian/fan converter is finally at fan2jian and jian2fan.

 # Pinyin -> Unicode dictionary used by chinlish and cn:dic.
 # Keys are Pinyin words and are matched exactly (case-sensitive); values are
 # the simplified-form characters written in \u notation. Everything between
 # the braces is list data, so no comments may be placed inside.
 # yuanlai is the modern word \u539f\u6765.
 array set i18n_pin2u {
        ba \u628a
        Beijing \u5317\u4eac
        bu \u4e0d
        canguan \u53c2\u89c2
        chengxu \u7a0b\u5e8f
        da \u5927
        daxue \u5927\u5b66
        de \u7684
        erqie \u800c\u4e14
        feichang \u975e\u5e38
        ge \u4e2a
        gongzuo \u5de5\u4f5c
        hao \u597d
        he  \u548c
        hen \u5f88
        huanying \u6b22\u8fce
        huida \u56de\u7b54
        jintian \u4eca\u5929
        jisuanji \u8ba1\u7b97\u673a
        kexue \u79d1\u5b66
        lao \u8001
        laoshi \u8001\u5e08
        le \u4e86
        Nanjing \u5357\u4eac
        neng \u80fd
        nenggou \u80fd\u591f
        ni \u4f60
        nimen \u4f60\u4eec
        pengyou \u670b\u53cb
        relie \u70ed\u70c8
        ren \u4eba
        Shanghai \u4e0a\u6d77
        shi \u662f
        suoyi \u6240\u4ee5
        ta \u4ed6
        tamen \u4ed6\u4eec
        Tianjin \u5929\u6d25
        wenti \u95ee\u9898
        wo \u6211
        women \u6211\u4eec
        Xianggang \u9999\u6e2f
        xiao \u5c0f
        xiaoxue \u5c0f\u5b66
        xuesheng \u5b66\u751f
        yanjiu \u7814\u7a76
        yi \u4e00
        yinwei \u56e0\u4e3a
        you \u6709
        yuanlai \u539f\u6765
        zai \u5728
        zhe \u8fd9
        zheyang \u8fd9\u6837
        zhongguo \u4e2d\u56fd
        zhongwen \u4e2d\u6587
        zhongxue \u4e2d\u5b66
        zhuanhuan \u8f6c\u6362
 } ;#--------------------- above: the dictionary - extend as required
 # chinlish -- convert Pinyin words to Chinese characters.
 #
 # Arguments:
 #   args  Pinyin words and punctuation, given as separate words and/or as
 #         whitespace-separated strings. The word -fan, anywhere among them,
 #         requests traditional characters. With no arguments at all a
 #         built-in demo sentence is converted.
 # Returns:
 #   The words found in ::i18n_pin2u replaced by their characters, all pieces
 #   concatenated without spaces; unknown words are passed through unchanged.
 #   A final "." becomes the ideographic full stop (U+3002).
 proc chinlish {args} {
        if {[llength $args] == 0} {
                set args [list huanying, zhe shi zhongwen zhuanhuan chengxu]
        }
        # Flatten to plain words first, so that "ni hao." passed as one string
        # is handled exactly like the separate words ni hao.
        set words [regexp -all -inline {\S+} [join $args " "]]
        # -fan counts as a switch only when it is a word of its own, not when
        # the letters merely occur inside another word
        set fan 0
        set pos [lsearch -exact $words -fan]
        if {$pos >= 0} {
                set fan 1
                set words [lreplace $words $pos $pos]
        }
        set text [join $words " "]
        regsub {[.]$} $text " \u3002" text
        # Isolate punctuation on both sides so neighbouring words can be looked up
        regsub -all {[.,:;!?]+} $text { & } text
        set res ""
        foreach word [regexp -all -inline {\S+} $text] {
                if {[info exists ::i18n_pin2u($word)]} {
                        append res $::i18n_pin2u($word)
                } else {
                        append res $word
                }
        }
        if {$fan} {set res [jian2fan $res]}
        return $res
 }
 # cn -- short name for chinlish: passes every argument on unchanged and
 # returns whatever chinlish returns.
 proc cn {args} {
        return [eval [linsert $args 0 chinlish]]
 }
 # cn:dic -- list dictionary entries whose Pinyin key matches a glob pattern.
 #   s: glob pattern handed to [array names] on ::i18n_pin2u
 #   Returns a flat list "key value key value ..." ordered by [lsort] of the keys.
 proc cn:dic s {
        set pairs {}
        set keys [array names ::i18n_pin2u $s]
        foreach key [lsort $keys] {
                lappend pairs $key [set ::i18n_pin2u($key)]
        }
        return $pairs
 }
 # Flat list of character pairs "simplified traditional ...", one pair per line.
 # jian2fan and fan2jian walk it two elements at a time. Everything between the
 # braces is list data, so no comments may be placed inside.
 set i18n_jian2fan {
        \u4e2a \u500b
        \u4e3a \u7232
        \u4eec \u5011
        \u53c2 \u53c3
        \u56fd \u570b
        \u5b66 \u5b78
        \u5e08 \u5e2b
        \u673a \u6a5f
        \u6765 \u4f86
        \u6b22 \u6b61
        \u70ed \u71b1
        \u89c2 \u89c0
        \u8ba1 \u8a08
        \u8f6c \u8f49
        \u8fd9 \u9019
        \u95ee \u554f
        \u9898 \u984c
 } 

#--------- simplified(jian) - traditional(fan) mapping (incomplete) - see fan2jian and jian2fan for better data

 # jian2fan -- replace simplified characters by their traditional forms.
 #   s: text to convert
 #   Returns s with every first-of-pair entry of ::i18n_jian2fan replaced by
 #   its partner; all other characters are left untouched.
 proc jian2fan s {
        # string map replaces literally and in a single pass, so table entries
        # are never interpreted as regular expressions or substitution specs
        return [string map $::i18n_jian2fan $s]
 }
 # fan2jian -- replace traditional characters by their simplified forms.
 #   s: text to convert
 #   Returns s with every second-of-pair entry of ::i18n_jian2fan replaced by
 #   its partner; all other characters are left untouched.
 proc fan2jian s {
        # The table is stored simplified-first, so swap each pair for this direction
        set reversed {}
        foreach {jian fan} $::i18n_jian2fan {
                lappend reversed $fan $jian
        }
        # string map replaces literally and in a single pass, so table entries
        # are never interpreted as regular expressions or substitution specs
        return [string map $reversed $s]
 }