Updated 2016-11-18 15:43:35 by bll

Richard Suchenwirth 2005-10-04 - Collation is "the logical ordering of character or wide-character strings according to defined precedence rules. These rules identify a collation sequence between the collating elements, and such additional rules that can be used to order strings consisting of multiple collating elements."

bll 2016-1-9

For example, in the German locale (de_DE), this is properly sorted: A Ä B Z. But for Swiss German (de_CH) this should be: A B Z Ä.

The following code can be used on Linux/FreeBSD/Mac OS X to create a collate command which can then be used as an argument to lsort's -command option (lsort -command collate $mylist). Example compilation:
Linux
${CC:-cc} -O -shared -o collate.so -I/usr/include/tcl8.6 -DUSE_TCL_STUBS collate.c -ltclstub8.6
Mac OS X (MacPorts Tcl/Tk)
${CC:-cc} -O -shared -o collate.dylib -I/opt/local/include -DUSE_TCL_STUBS collate.c -L/opt/local/lib -ltclstub8.6
FreeBSD
${CC:-cc} -O -shared -o collate.so -I/usr/local/include/tcl8.6 -DUSE_TCL_STUBS collate.c -L/usr/local/lib -ltclstub86
Windows
gcc -m64 -shared -static-libgcc -o collate64.dll -I$HOME/local-64/include -DUSE_TCL_STUBS collatembs.c -L$HOME/local-64/lib -ltclstub86 # with tcl compiled and installed to $HOME/local-64

( bll 2016-11-18 I can make available the collate and setlocale Tcl interfaces for anyone who wants them. E-mail me. )

Windows' strcoll function cannot handle multibyte utf-8 characters, so wcscoll or _mbscoll should be used instead. wcscoll can be accessed using the Ffidl package. Note that Windows does not use the standard locale names.

Of course Ffidl may be used on Linux/FreeBSD/Mac OS X also instead of the collate code below.

If you want to really go crazy, the complete set of collation rules and data can be found here: http://cldr.unicode.org/. I started playing with this a little (I got some of the parsing done, not much further), but it is quite complicated.

Windows using Ffidl (wcscoll)
# windows example
# Binds the C runtime's setlocale and wcscoll through Ffidl and uses the
# latter as the comparison command for lsort.
package require Ffidl

# libname will need to be changed to match the correct .dll for the compiler in use.
set libname MSVCRT.dll ; # matches ActiveState Tcl/Tk (microsoft VC runtime)
# Numeric values of the setlocale category constants:
# windows setlocale: LC_ALL = 0, LC_COLLATE = 1
# posix setlocale: LC_ALL = 6, LC_COLLATE = 3
# ::setlocale takes (int category, utf-8 string) and returns a utf-8 string.
ffidl::callout ::setlocale {int pointer-utf8} pointer-utf8 [ffidl::symbol $libname setlocale]
# ::collate takes two utf-16 strings and returns the int result of wcscoll.
ffidl::callout ::collate {pointer-utf16 pointer-utf16} int [ffidl::symbol $libname wcscoll]

# The first argument, 1, is LC_COLLATE in the windows numbering listed above.
set rv [::setlocale 1 deu]  ; # set the windows collating locale to de_DE
puts $rv ; # windows locale name (long version)
# mylist is not defined in this snippet; the caller supplies the list to sort.
set newlist [lsort -command ::collate $mylist]

Windows (_mbscoll)
#include <tcl.h>
#include <string.h>
#include <locale.h>
#include <mbstring.h>
#include <windows.h>

/*
 * collateObjCmd --
 *
 *   Implements the Tcl command "collate string1 string2".  The two strings
 *   are compared with _mbscoll() and the integer result is stored as the
 *   interpreter result, so the command can be used with [lsort -command].
 *
 *   cd     - the utf-8 Tcl_Encoding registered by Collate_Init.
 *   interp - interpreter receiving the result or the error message.
 *   objc   - number of words in objv; must be 3.
 *   objv   - command name followed by the two strings to compare.
 *
 *   Returns TCL_OK with the comparison result set, or TCL_ERROR when the
 *   argument count is wrong.
 */
int collateObjCmd (
  ClientData cd,
  Tcl_Interp* interp,
  int objc,
  Tcl_Obj * const objv[]
  )
{
  Tcl_Encoding utf = (Tcl_Encoding) cd;
  Tcl_Encoding sys;
  const char* s1;
  const char* s2;
  int lt1, lt2;
  int rc;

  if (objc != 3) {
    Tcl_WrongNumArgs (interp, 1, objv, "string1 string2");
    return TCL_ERROR;
  }
  s1 = Tcl_GetStringFromObj (objv[1], &lt1);
  s2 = Tcl_GetStringFromObj (objv[2], &lt2);

  /* Acquired only after argument validation so the error path holds no
   * encoding reference.  Tcl_GetEncoding() takes a reference that must be
   * released with Tcl_FreeEncoding() below. */
  sys = Tcl_GetEncoding (interp, NULL);
  if (utf == sys) {
    /* System encoding is utf-8: Tcl's string bytes can be compared as-is. */
    rc = _mbscoll ((const unsigned char *) s1, (const unsigned char *) s2);
  } else { /* for other locales */
    Tcl_DString s1d, s2d;
    Tcl_DStringInit (&s1d);
    Tcl_DStringInit (&s2d);
    /* Convert both strings to the system encoding before comparing. */
    Tcl_UtfToExternalDString (sys, s1, lt1, &s1d);
    Tcl_UtfToExternalDString (sys, s2, lt2, &s2d);
    rc = _mbscoll ((const unsigned char *) Tcl_DStringValue (&s1d),
                   (const unsigned char *) Tcl_DStringValue (&s2d));
    Tcl_DStringFree (&s1d);
    Tcl_DStringFree (&s2d);
  }
  Tcl_FreeEncoding (sys);

  Tcl_SetObjResult (interp, Tcl_NewIntObj (rc));
  return TCL_OK;
}

int Collate_Init(Tcl_Interp *interp)
{
  Tcl_Encoding utf;
#ifdef USE_TCL_STUBS
  if (!Tcl_InitStubs(interp,"8.3",0)) {
    return TCL_ERROR;
  }
#else
  if (!Tcl_PkgRequire(interp,"Tcl","8.3",0)) {
    return TCL_ERROR;
  }
#endif
  utf = Tcl_GetEncoding(interp,"utf-8");
  if (!utf) {
    return TCL_ERROR;
  }
  Tcl_CreateObjCommand(interp,"collate", collateObjCmd, (ClientData)utf, NULL);
  Tcl_PkgProvide(interp,"collate","0.1");
  setlocale(LC_COLLATE, "");

  return TCL_OK;
}

Unix and Mac OS X
/*
 * collate.c
 *
 * This code can be used to create a collate command for tcl.  It
 * is a simple interface to strcoll().  Works for anything non-Windows.
 *
 * http://www.siftsoft.com/inprogress/tclcoll.c
 * https://groups.google.com/d/msg/comp.lang.tcl/2JeJgvwwlFo/N-4ne_EU2CAJ
 */

#include <tcl.h>
#include <string.h>
#include <locale.h>

/*
 * collateObjCmd --
 *
 *   Implements the Tcl command "collate string1 string2".  The two strings
 *   are compared with strcoll() and the integer result is stored as the
 *   interpreter result, so the command can be used with [lsort -command].
 *
 *   cd     - the utf-8 Tcl_Encoding registered by Collate_Init.
 *   interp - interpreter receiving the result or the error message.
 *   objc   - number of words in objv; must be 3.
 *   objv   - command name followed by the two strings to compare.
 *
 *   Returns TCL_OK with the comparison result set, or TCL_ERROR when the
 *   argument count is wrong.
 */
int collateObjCmd (
  ClientData cd,
  Tcl_Interp* interp,
  int objc,
  Tcl_Obj * const objv[]
  )
{
  Tcl_Encoding utf = (Tcl_Encoding)cd;
  Tcl_Encoding sys;
  const char* s1;
  const char* s2;
  int lt1, lt2;
  int rc;

  if (objc != 3) {
    Tcl_WrongNumArgs(interp, 1, objv, "string1 string2");
    return TCL_ERROR;
  }
  s1 = Tcl_GetStringFromObj(objv[1], &lt1);
  s2 = Tcl_GetStringFromObj(objv[2], &lt2);

  /* Acquired only after argument validation so the error path holds no
   * encoding reference.  Tcl_GetEncoding() takes a reference that must be
   * released with Tcl_FreeEncoding() below. */
  sys = Tcl_GetEncoding(interp, NULL);
  if (utf == sys) {
    /* System encoding is utf-8: Tcl's string bytes can be compared as-is. */
    rc = strcoll(s1, s2);
  } else { /* for other locales */
    Tcl_DString s1d, s2d;
    Tcl_DStringInit(&s1d);
    Tcl_DStringInit(&s2d);
    /* Convert both strings to the system encoding before comparing. */
    Tcl_UtfToExternalDString(sys, s1, lt1, &s1d);
    Tcl_UtfToExternalDString(sys, s2, lt2, &s2d);
    rc = strcoll(Tcl_DStringValue(&s1d), Tcl_DStringValue(&s2d));
    Tcl_DStringFree(&s1d);
    Tcl_DStringFree(&s2d);
  }
  Tcl_FreeEncoding(sys);

  Tcl_SetObjResult(interp, Tcl_NewIntObj(rc));
  return TCL_OK;
}

int Collate_Init(Tcl_Interp *interp)
{
  Tcl_Encoding utf;
#ifdef USE_TCL_STUBS
  if (!Tcl_InitStubs(interp,"8.3",0)) {
    return TCL_ERROR;
  }
#else
  if (!Tcl_PkgRequire(interp,"Tcl","8.3",0)) {
    return TCL_ERROR;
  }
#endif
  utf = Tcl_GetEncoding(interp,"utf-8");
  if (!utf) {
    return TCL_ERROR;
  }
  Tcl_CreateObjCommand(interp,"collate", collateObjCmd, (ClientData)utf, NULL);
  Tcl_PkgProvide(interp,"collate","0.1");
  setlocale(LC_COLLATE, "");

  return TCL_OK;
}

Discussion of a quick and simple way to do collated sorts

Richard Suchenwirth 2005-10-04 - Collation is "the logical ordering of character or wide-character strings according to defined precedence rules. These rules identify a collation sequence between the collating elements, and such additional rules that can be used to order strings consisting of multiple collating elements."

Tcl's lsort sorts according to numerical Unicode values, which may not be correct in some locales. For instance, in Portuguese, accented letters should sort as if they weren't, but in Unicode sequence come after "z".

The following oversimplified code takes a map in which collation differences can be listed as {from to from to...}, sorts the mapped items, and retrieves only the original elements:
 # collatesort list map
 #   Returns the elements of list ordered by the key obtained from applying
 #   [string map $map] to each element; the original elements are returned.
 proc collatesort {list map} {
    # Pair every element with its mapped sort key.
    set pairs {}
    foreach item $list {
       set key [string map $map $item]
       lappend pairs [list $item $key]
    }
    # Order the pairs by key, then keep only the original element of each.
    set sorted {}
    foreach pair [lsort -index 1 $pairs] {
       lappend sorted [lindex $pair 0]
    }
    return $sorted
 }

Testing, Portuguese:
 % collatesort {ab ãc ãd ae} {ã a}
 ab ãc ãd ae

Spanish (ll sorts after lz):
 % collatesort {llano luxación leche} {ll lzz}
 leche luxación llano

German (umlauts sorted as if "ä" was "ae"):
 % lsort {Bar Bär Bor}
 Bar Bor Bär
 % collatesort {Bar Bär Bor} {ä ae}
 Bär Bar Bor

jima:To be precise, in what is normally known outside Spain as Spanish language (don't want to mess things up with Catalán or any other tongue spoken there) there is no letter accentuated with the ` character. Our tilde (that is the term we use for the graphical notation of an accent) is ´. Therefore, it should be luxación.

If my precision is somewhat annoying to anyone, please just delete it from this page. - RS: No, every correction is welcome. Fixed above - thanks!

RS: Even English data may have collation problems - if they contain ff, fi, fl, ffi, ffl ligatures. Then it might help to do
 # Braces suppress backslash substitution, so the map is built with [list]
 # to make the keys the actual ligature characters rather than the literal
 # text "\uFB00" etc.
 set sorted [collatesort $input [list \uFB00 ff \uFB01 fi \uFB02 fl \uFB03 ffi \uFB04 ffl]]

See also custom sorting