Unicode 4.0 special casing. (#114681)

author Noah Levitt <nlevitt@columbia.edu>

Wed, 10 Sep 2003 16:55:36 +0000 (16:55 +0000)

committer Noah Levitt <nlevitt@src.gnome.org>

Wed, 10 Sep 2003 16:55:36 +0000 (16:55 +0000)
author Noah Levitt <nlevitt@columbia.edu>
Wed, 10 Sep 2003 16:55:36 +0000 (16:55 +0000)
committer Noah Levitt <nlevitt@src.gnome.org>
Wed, 10 Sep 2003 16:55:36 +0000 (16:55 +0000)
diff --git a/ChangeLog b/ChangeLog

index 0dd6877deceacb6a45ddf017ecf281727dd0cf92..05b27882ebbbe0c177d4a6fc3f1cd8251c6d49ca 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,16 @@
+2003-09-10  Noah Levitt  <nlevitt@columbia.edu>
+
+       * glib/gunicodeprivate.h:
+       * glib/gunicollate.c:
+       * glib/gunidecomp.c:
+       * glib/guniprop.c:
+       * tests/casemap.txt:
+       * tests/gen-casemap-txt.pl: Unicode 4.0 special casing. (#114681)
+
+       * glib/gunicodeprivate.h: Use a private header instead of extern
+       function declarations (_g_utf8_normalize_wc,
+       _g_unichar_combining_class).
+
  Mon Sep  8 00:31:10 2003  Stefan Westerfeld  <stefan@space.twc.de>
  
          * glib/gbsearcharray.h: inserted casts for C++.
diff --git a/ChangeLog.pre-2-10 b/ChangeLog.pre-2-10

index 0dd6877deceacb6a45ddf017ecf281727dd0cf92..05b27882ebbbe0c177d4a6fc3f1cd8251c6d49ca 100644 (file)
--- a/ChangeLog.pre-2-10
+++ b/ChangeLog.pre-2-10
@@ -1,3 +1,16 @@
+2003-09-10  Noah Levitt  <nlevitt@columbia.edu>
+
+       * glib/gunicodeprivate.h:
+       * glib/gunicollate.c:
+       * glib/gunidecomp.c:
+       * glib/guniprop.c:
+       * tests/casemap.txt:
+       * tests/gen-casemap-txt.pl: Unicode 4.0 special casing. (#114681)
+
+       * glib/gunicodeprivate.h: Use a private header instead of extern
+       function declarations (_g_utf8_normalize_wc,
+       _g_unichar_combining_class).
+
  Mon Sep  8 00:31:10 2003  Stefan Westerfeld  <stefan@space.twc.de>
  
          * glib/gbsearcharray.h: inserted casts for C++.
diff --git a/ChangeLog.pre-2-12 b/ChangeLog.pre-2-12

index 0dd6877deceacb6a45ddf017ecf281727dd0cf92..05b27882ebbbe0c177d4a6fc3f1cd8251c6d49ca 100644 (file)
--- a/ChangeLog.pre-2-12
+++ b/ChangeLog.pre-2-12
@@ -1,3 +1,16 @@
+2003-09-10  Noah Levitt  <nlevitt@columbia.edu>
+
+       * glib/gunicodeprivate.h:
+       * glib/gunicollate.c:
+       * glib/gunidecomp.c:
+       * glib/guniprop.c:
+       * tests/casemap.txt:
+       * tests/gen-casemap-txt.pl: Unicode 4.0 special casing. (#114681)
+
+       * glib/gunicodeprivate.h: Use a private header instead of extern
+       function declarations (_g_utf8_normalize_wc,
+       _g_unichar_combining_class).
+
  Mon Sep  8 00:31:10 2003  Stefan Westerfeld  <stefan@space.twc.de>
  
          * glib/gbsearcharray.h: inserted casts for C++.
diff --git a/ChangeLog.pre-2-4 b/ChangeLog.pre-2-4

index 0dd6877deceacb6a45ddf017ecf281727dd0cf92..05b27882ebbbe0c177d4a6fc3f1cd8251c6d49ca 100644 (file)
--- a/ChangeLog.pre-2-4
+++ b/ChangeLog.pre-2-4
@@ -1,3 +1,16 @@
+2003-09-10  Noah Levitt  <nlevitt@columbia.edu>
+
+       * glib/gunicodeprivate.h:
+       * glib/gunicollate.c:
+       * glib/gunidecomp.c:
+       * glib/guniprop.c:
+       * tests/casemap.txt:
+       * tests/gen-casemap-txt.pl: Unicode 4.0 special casing. (#114681)
+
+       * glib/gunicodeprivate.h: Use a private header instead of extern
+       function declarations (_g_utf8_normalize_wc,
+       _g_unichar_combining_class).
+
  Mon Sep  8 00:31:10 2003  Stefan Westerfeld  <stefan@space.twc.de>
  
          * glib/gbsearcharray.h: inserted casts for C++.
diff --git a/ChangeLog.pre-2-6 b/ChangeLog.pre-2-6

index 0dd6877deceacb6a45ddf017ecf281727dd0cf92..05b27882ebbbe0c177d4a6fc3f1cd8251c6d49ca 100644 (file)
--- a/ChangeLog.pre-2-6
+++ b/ChangeLog.pre-2-6
@@ -1,3 +1,16 @@
+2003-09-10  Noah Levitt  <nlevitt@columbia.edu>
+
+       * glib/gunicodeprivate.h:
+       * glib/gunicollate.c:
+       * glib/gunidecomp.c:
+       * glib/guniprop.c:
+       * tests/casemap.txt:
+       * tests/gen-casemap-txt.pl: Unicode 4.0 special casing. (#114681)
+
+       * glib/gunicodeprivate.h: Use a private header instead of extern
+       function declarations (_g_utf8_normalize_wc,
+       _g_unichar_combining_class).
+
  Mon Sep  8 00:31:10 2003  Stefan Westerfeld  <stefan@space.twc.de>
  
          * glib/gbsearcharray.h: inserted casts for C++.
diff --git a/ChangeLog.pre-2-8 b/ChangeLog.pre-2-8

index 0dd6877deceacb6a45ddf017ecf281727dd0cf92..05b27882ebbbe0c177d4a6fc3f1cd8251c6d49ca 100644 (file)
--- a/ChangeLog.pre-2-8
+++ b/ChangeLog.pre-2-8
@@ -1,3 +1,16 @@
+2003-09-10  Noah Levitt  <nlevitt@columbia.edu>
+
+       * glib/gunicodeprivate.h:
+       * glib/gunicollate.c:
+       * glib/gunidecomp.c:
+       * glib/guniprop.c:
+       * tests/casemap.txt:
+       * tests/gen-casemap-txt.pl: Unicode 4.0 special casing. (#114681)
+
+       * glib/gunicodeprivate.h: Use a private header instead of extern
+       function declarations (_g_utf8_normalize_wc,
+       _g_unichar_combining_class).
+
  Mon Sep  8 00:31:10 2003  Stefan Westerfeld  <stefan@space.twc.de>
  
          * glib/gbsearcharray.h: inserted casts for C++.
diff --git a/glib/gunicodeprivate.h b/glib/gunicodeprivate.h

new file mode 100644 (file)

index 0000000..84ebad3
--- /dev/null
+++ b/glib/gunicodeprivate.h
@@ -0,0 +1,35 @@
+/* gunicodeprivate.h
+ *
+ * Copyright (C) 2003 Noah Levitt
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.         See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef __G_UNICODE_PRIVATE_H__
+#define __G_UNICODE_PRIVATE_H__
+
+#include "glib.h"
+
+G_BEGIN_DECLS
+
+/* Declared here so that other glib source files can include this header
+ * instead of carrying their own extern declaration.  Takes a string, a
+ * length limit and a normalization mode and returns a gunichar pointer;
+ * presumably the normalized text as wide characters -- confirm against
+ * the definition, which is not part of this header. */
+gunichar *_g_utf8_normalize_wc (const gchar   *str,
+                                gssize         max_len,
+                                GNormalizeMode mode);
+
+/* Returns the combining class of the Unicode character uc. */
+gint _g_unichar_combining_class (gunichar uc);
+
+G_END_DECLS
+
+#endif /* __G_UNICODE_PRIVATE_H__ */
diff --git a/glib/gunicollate.c b/glib/gunicollate.c

index 9dcc6d2057a9af332044c319c23dd1b8d7bbc033..63dc73045aad82c29e46acc06ae6b9c38f6daedf 100644 (file)
--- a/glib/gunicollate.c
+++ b/glib/gunicollate.c
@@ -27,10 +27,7 @@
  #endif
  
  #include "glib.h"
-
-extern gunichar *_g_utf8_normalize_wc (const gchar    *str,
-                                      gssize          max_len,
-                                      GNormalizeMode  mode);
+#include "gunicodeprivate.h"
  
  /**
   * g_utf8_collate:
diff --git a/glib/gunidecomp.c b/glib/gunidecomp.c

index 8419564fc309325807889f6d6e894bb67010298c..a5373f9bc9c503f58149aa149f9bf2a7f0c86915 100644 (file)
--- a/glib/gunidecomp.c
+++ b/glib/gunidecomp.c
@@ -26,6 +26,7 @@
  #include "glib.h"
  #include "gunidecomp.h"
  #include "gunicomp.h"
+#include "gunicodeprivate.h"
  
  
  #define CC_PART1(Page, Char) \
@@ -45,6 +46,12 @@
        ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
        : 0))
  
+/* Function wrapper around the COMBINING_CLASS() macro, so that code outside
+ * this file can obtain the combining class of a character through a call.
+ *
+ * uc: the character to look up.
+ *
+ * Returns the value COMBINING_CLASS() yields for uc. */
+gint
+_g_unichar_combining_class (gunichar uc)
+{
+  const gint combining_class = COMBINING_CLASS (uc);
+
+  return combining_class;
+}
+
  /**
   * g_unicode_canonical_ordering:
   * @string: a UCS-4 encoded string.
diff --git a/glib/guniprop.c b/glib/guniprop.c

index 9cef6e7cdbc706a3a21cbbfea05a7bdb334d3134..df0067633f4516e1c29030ed22eb8a957c71f2f9 100644 (file)
--- a/glib/guniprop.c
+++ b/glib/guniprop.c
@@ -27,6 +27,7 @@
  
  #include "glib.h"
  #include "gunichartables.h"
+#include "gunicodeprivate.h"
  
  #define ATTR_TABLE(Page) (((Page) <= G_UNICODE_LAST_PAGE_PART1) \
                            ? attr_table_part1[Page] \
@@ -737,6 +738,28 @@ g_utf8_strup (const gchar *str,
    return result;
  }
  
+/* Reports whether a character with combining class 230 occurs in str before
+ * the next character with combining class 0.
+ *
+ * Starting at str, each character is decoded with g_utf8_get_char() and
+ * classified with _g_unichar_combining_class():
+ *   - class 230: return TRUE;
+ *   - class 0 (a base character): stop and return FALSE;
+ *   - anything else: skip it and keep looking.
+ * Reaching the terminating NUL also yields FALSE.
+ *
+ * str: NUL-terminated UTF-8 text to scan.  The text is only read, so the
+ *      pointer is const; this lets callers that hold a const string pass it
+ *      without discarding the qualifier.
+ *
+ * NOTE: the scan is bounded only by the NUL terminator; it has no knowledge
+ * of any length limit the caller may be applying. */
+static gboolean
+has_more_above (const gchar *str)
+{
+  const gchar *p;
+
+  for (p = str; *p; p = g_utf8_next_char (p))
+    {
+      gint combining_class = _g_unichar_combining_class (g_utf8_get_char (p));
+
+      if (combining_class == 230)
+        return TRUE;
+      if (combining_class == 0)
+        break;
+    }
+
+  return FALSE;
+}
+
  static gsize
  real_tolower (const gchar *str,
               gssize       max_len,
@@ -758,9 +781,46 @@ real_tolower (const gchar *str,
  
        if (locale_type == LOCALE_TURKIC && c == 'I')
         {
-         /* I => LATIN SMALL LETTER DOTLESS I */
-         len += g_unichar_to_utf8 (0x131, out_buffer ? out_buffer + len : NULL); 
-       }
+          if (g_utf8_get_char (p) == 0x0307)
+            {
+              /* I + COMBINING DOT ABOVE => i (U+0069) */
+              len += g_unichar_to_utf8 (0x0069, out_buffer ? out_buffer + len : NULL); 
+              p = g_utf8_next_char (p);
+            }
+          else
+            {
+              /* I => LATIN SMALL LETTER DOTLESS I */
+              len += g_unichar_to_utf8 (0x131, out_buffer ? out_buffer + len : NULL); 
+            }
+        }
+      /* Introduce an explicit dot above when lowercasing capital I's and J's
+       * whenever there are more accents above. [SpecialCasing.txt] */
+      else if (locale_type == LOCALE_LITHUANIAN && 
+               (c == 0x00cc || c == 0x00cd || c == 0x0128))
+        {
+          len += g_unichar_to_utf8 (0x0069, out_buffer ? out_buffer + len : NULL); 
+          len += g_unichar_to_utf8 (0x0307, out_buffer ? out_buffer + len : NULL); 
+
+          switch (c)
+            {
+            case 0x00cc: 
+              len += g_unichar_to_utf8 (0x0300, out_buffer ? out_buffer + len : NULL); 
+              break;
+            case 0x00cd: 
+              len += g_unichar_to_utf8 (0x0301, out_buffer ? out_buffer + len : NULL); 
+              break;
+            case 0x0128: 
+              len += g_unichar_to_utf8 (0x0303, out_buffer ? out_buffer + len : NULL); 
+              break;
+            }
+        }
+      else if (locale_type == LOCALE_LITHUANIAN && 
+               (c == 'I' || c == 'J' || c == 0x012e) && 
+               has_more_above (p))
+        {
+          len += g_unichar_to_utf8 (g_unichar_tolower (c), out_buffer ? out_buffer + len : NULL); 
+          len += g_unichar_to_utf8 (0x0307, out_buffer ? out_buffer + len : NULL); 
+        }
        else if (c == 0x03A3)    /* GREEK CAPITAL LETTER SIGMA */
         {
           if ((max_len < 0 || p < str + max_len) && *p)
diff --git a/tests/casemap.txt b/tests/casemap.txt

index 7bc2f40d0e2b5470ed8a6a98b7a15d899aab6dcf..696998221cb906e554fcc2ace75e7dbc26e2b627 100644 (file)
--- a/tests/casemap.txt
+++ b/tests/casemap.txt
@@ -5,6 +5,10 @@
  #
  tr_TR  i       i       İ      İ      # i => LATIN CAPITAL LETTER I WITH DOT ABOVE
  tr_TR  I       ı      I       I       # I => LATIN SMALL LETTER DOTLESS I
+tr_TR  İ     i       İ     İ     # I => LATIN SMALL LETTER DOTLESS I
+tr_TR.UTF-8    i       i       İ      İ      # i => LATIN CAPITAL LETTER I WITH DOT ABOVE
+tr_TR.UTF-8    I       ı      I       I       # I => LATIN SMALL LETTER DOTLESS I
+tr_TR.UTF-8    İ     i       İ     İ     # I + COMBINING DOT ABOVE => LATIN SMALL LETTER I
  # Test reordering of YPOGEGRAMMENI across other accents
         ᾁ  ᾁ  ᾉ  ἉΙ  
         ᾁ  ᾁ  ᾉ  ἉΙ  
@@ -16,6 +20,26 @@ tr_TR        I       ı      I       I       # I => LATIN SMALL LETTER DOTLESS I
  # about the titlecase part here
  lt_LT  iė     iė     Ie      IE      
  lt_LT  iė    iė    Ie      IE      
+lt_LT  Ì      i̇̀   Ì      Ì       # LATIN CAPITAL LETTER I WITH GRAVE
+lt_LT  Í      i̇́   Í      Í       # LATIN CAPITAL LETTER I WITH ACUTE
+lt_LT  Ĩ      i̇̃   Ĩ      Ĩ       # LATIN CAPITAL LETTER I WITH TILDE
+lt_LT  Í     i̇́   Í     Í      # LATIN CAPITAL LETTER I (with acute accent)
+lt_LT  Ì     i̇̀   Ì     Ì      # LATIN CAPITAL LETTER I (with grave accent)
+lt_LT  Ĩ     i̇̃   Ĩ     Ĩ      # LATIN CAPITAL LETTER I (with tilde above)
+lt_LT  Į́   į̇́ Į́   Į́    # LATIN CAPITAL LETTER I (with ogonek and acute accent)
+lt_LT  J́     j̇́   J́     J́      # LATIN CAPITAL LETTER J (with acute accent)
+lt_LT  Į́    į̇́  Į́    Į́     # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
+lt_LT.UTF-8    iė     iė     Ie      IE      
+lt_LT.UTF-8    iė    iė    Ie      IE      
+lt_LT.UTF-8    Ì      i̇̀   Ì      Ì       # LATIN CAPITAL LETTER I WITH GRAVE
+lt_LT.UTF-8    Í      i̇́   Í      Í       # LATIN CAPITAL LETTER I WITH ACUTE
+lt_LT.UTF-8    Ĩ      i̇̃   Ĩ      Ĩ       # LATIN CAPITAL LETTER I WITH TILDE
+lt_LT.UTF-8    Í     i̇́   Í     Í      # LATIN CAPITAL LETTER I (with acute accent)
+lt_LT.UTF-8    Ì     i̇̀   Ì     Ì      # LATIN CAPITAL LETTER I (with grave accent)
+lt_LT.UTF-8    Ĩ     i̇̃   Ĩ     Ĩ      # LATIN CAPITAL LETTER I (with tilde above)
+lt_LT.UTF-8    Į́   į̇́ Į́   Į́    # LATIN CAPITAL LETTER I (with ogonek and acute accent)
+lt_LT.UTF-8    J́     j̇́   J́     J́      # LATIN CAPITAL LETTER J (with acute accent)
+lt_LT.UTF-8    Į́    į̇́  Į́    Į́     # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
  # Special case not at initial position
         aﬄ    aﬄ    Affl    AFFL    # FB04
  #
diff --git a/tests/gen-casemap-txt.pl b/tests/gen-casemap-txt.pl

index ae06ded41749c5cc5f90836390ef5ffda32b9de1..0b9fc1d6b49d47ff85316697c6978d8c45119253 100755 (executable)
--- a/tests/gen-casemap-txt.pl
+++ b/tests/gen-casemap-txt.pl
@@ -148,6 +148,10 @@ print <<EOT;
  #
  tr_TR\ti\ti\t\x{0130}\t\x{0130}\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
  tr_TR\tI\t\x{0131}\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
+tr_TR\tI\x{0307}\ti\tI\x{0307}\tI\x{0307}\t# I + COMBINING DOT ABOVE => LATIN SMALL LETTER I
+tr_TR.UTF-8\ti\ti\t\x{0130}\t\x{0130}\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
+tr_TR.UTF-8\tI\t\x{0131}\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
+tr_TR.UTF-8\tI\x{0307}\ti\tI\x{0307}\tI\x{0307}\t# I + COMBINING DOT ABOVE => LATIN SMALL LETTER I
  # Test reordering of YPOGEGRAMMENI across other accents
  \t\x{03b1}\x{0345}\x{0314}\t\x{03b1}\x{0345}\x{314}\t\x{0391}\x{0345}\x{0314}\t\x{0391}\x{0314}\x{0399}\t
  \t\x{03b1}\x{0314}\x{0345}\t\x{03b1}\x{314}\x{0345}\t\x{0391}\x{0314}\x{0345}\t\x{0391}\x{0314}\x{0399}\t
@@ -159,6 +163,26 @@ tr_TR\tI\t\x{0131}\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
  # about the titlecase part here
  lt_LT\ti\x{117}\ti\x{117}\tIe\tIE\t
  lt_LT\tie\x{307}\tie\x{307}\tIe\tIE\t
+lt_LT\t\x{00cc}\ti\x{0307}\x{0300}\t\x{00cc}\t\x{00cc}\t # LATIN CAPITAL LETTER I WITH GRAVE
+lt_LT\t\x{00CD}\ti\x{0307}\x{0301}\t\x{00CD}\t\x{00CD}\t # LATIN CAPITAL LETTER I WITH ACUTE
+lt_LT\t\x{0128}\ti\x{0307}\x{0303}\t\x{0128}\t\x{0128}\t # LATIN CAPITAL LETTER I WITH TILDE
+lt_LT\tI\x{0301}\ti\x{0307}\x{0301}\tI\x{0301}\tI\x{0301}\t # LATIN CAPITAL LETTER I (with acute accent)
+lt_LT\tI\x{0300}\ti\x{0307}\x{0300}\tI\x{0300}\tI\x{0300}\t # LATIN CAPITAL LETTER I (with grave accent)
+lt_LT\tI\x{0303}\ti\x{0307}\x{0303}\tI\x{0303}\tI\x{0303}\t # LATIN CAPITAL LETTER I (with tilde above)
+lt_LT\tI\x{0328}\x{0301}\ti\x{0307}\x{0328}\x{0301}\tI\x{0328}\x{0301}\tI\x{0328}\x{0301}\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
+lt_LT\tJ\x{0301}\tj\x{0307}\x{0301}\tJ\x{0301}\tJ\x{0301}\t # LATIN CAPITAL LETTER J (with acute accent)
+lt_LT\t\x{012e}\x{0301}\t\x{012f}\x{0307}\x{0301}\t\x{012e}\x{0301}\t\x{012e}\x{0301}\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
+lt_LT.UTF-8\ti\x{117}\ti\x{117}\tIe\tIE\t
+lt_LT.UTF-8\tie\x{307}\tie\x{307}\tIe\tIE\t
+lt_LT.UTF-8\t\x{00cc}\ti\x{0307}\x{0300}\t\x{00cc}\t\x{00cc}\t # LATIN CAPITAL LETTER I WITH GRAVE
+lt_LT.UTF-8\t\x{00CD}\ti\x{0307}\x{0301}\t\x{00CD}\t\x{00CD}\t # LATIN CAPITAL LETTER I WITH ACUTE
+lt_LT.UTF-8\t\x{0128}\ti\x{0307}\x{0303}\t\x{0128}\t\x{0128}\t # LATIN CAPITAL LETTER I WITH TILDE
+lt_LT.UTF-8\tI\x{0301}\ti\x{0307}\x{0301}\tI\x{0301}\tI\x{0301}\t # LATIN CAPITAL LETTER I (with acute accent)
+lt_LT.UTF-8\tI\x{0300}\ti\x{0307}\x{0300}\tI\x{0300}\tI\x{0300}\t # LATIN CAPITAL LETTER I (with grave accent)
+lt_LT.UTF-8\tI\x{0303}\ti\x{0307}\x{0303}\tI\x{0303}\tI\x{0303}\t # LATIN CAPITAL LETTER I (with tilde above)
+lt_LT.UTF-8\tI\x{0328}\x{0301}\ti\x{0307}\x{0328}\x{0301}\tI\x{0328}\x{0301}\tI\x{0328}\x{0301}\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
+lt_LT.UTF-8\tJ\x{0301}\tj\x{0307}\x{0301}\tJ\x{0301}\tJ\x{0301}\t # LATIN CAPITAL LETTER J (with acute accent)
+lt_LT.UTF-8\t\x{012e}\x{0301}\t\x{012f}\x{0307}\x{0301}\t\x{012e}\x{0301}\t\x{012e}\x{0301}\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
  # Special case not at initial position
  \ta\x{fb04}\ta\x{fb04}\tAffl\tAFFL\t# FB04
  #
author	Noah Levitt <nlevitt@columbia.edu>
	Wed, 10 Sep 2003 16:55:36 +0000 (16:55 +0000)
committer	Noah Levitt <nlevitt@src.gnome.org>
	Wed, 10 Sep 2003 16:55:36 +0000 (16:55 +0000)
ChangeLog		patch \| blob \| history
ChangeLog.pre-2-10		patch \| blob \| history
ChangeLog.pre-2-12		patch \| blob \| history
ChangeLog.pre-2-4		patch \| blob \| history
ChangeLog.pre-2-6		patch \| blob \| history
ChangeLog.pre-2-8		patch \| blob \| history
glib/gunicodeprivate.h	[new file with mode: 0644]	patch \| blob
glib/gunicollate.c		patch \| blob \| history
glib/gunidecomp.c		patch \| blob \| history
glib/guniprop.c		patch \| blob \| history
tests/casemap.txt		patch \| blob \| history
tests/gen-casemap-txt.pl		patch \| blob \| history