From: Noah Levitt Date: Wed, 10 Sep 2003 16:55:36 +0000 (+0000) Subject: Unicode 4.0 special casing. (#114681) X-Git-Url: http://git.openbox.org/?a=commitdiff_plain;h=8d91ba8c585f269229c768415b04fdd222e6fa72;p=dana%2Fcg-glib.git Unicode 4.0 special casing. (#114681) 2003-09-10 Noah Levitt * glib/gunicodeprivate.h: * glib/gunicollate.c: * glib/gunidecomp.c: * glib/guniprop.c: * tests/casemap.txt: * tests/gen-casemap-txt.pl: Unicode 4.0 special casing. (#114681) * glib/gunicodeprivate.h: Use a private header instead of extern function declarations (_g_utf8_normalize_wc, _g_unichar_combining_class). --- diff --git a/ChangeLog b/ChangeLog index 0dd6877d..05b27882 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,16 @@ +2003-09-10 Noah Levitt + + * glib/gunicodeprivate.h: + * glib/gunicollate.c: + * glib/gunidecomp.c: + * glib/guniprop.c: + * tests/casemap.txt: + * tests/gen-casemap-txt.pl: Unicode 4.0 special casing. (#114681) + + * glib/gunicodeprivate.h: Use a private header instead of extern + function declarations (_g_utf8_normalize_wc, + _g_unichar_combining_class). + Mon Sep 8 00:31:10 2003 Stefan Westerfeld * glib/gbsearcharray.h: inserted casts for C++. diff --git a/ChangeLog.pre-2-10 b/ChangeLog.pre-2-10 index 0dd6877d..05b27882 100644 --- a/ChangeLog.pre-2-10 +++ b/ChangeLog.pre-2-10 @@ -1,3 +1,16 @@ +2003-09-10 Noah Levitt + + * glib/gunicodeprivate.h: + * glib/gunicollate.c: + * glib/gunidecomp.c: + * glib/guniprop.c: + * tests/casemap.txt: + * tests/gen-casemap-txt.pl: Unicode 4.0 special casing. (#114681) + + * glib/gunicodeprivate.h: Use a private header instead of extern + function declarations (_g_utf8_normalize_wc, + _g_unichar_combining_class). + Mon Sep 8 00:31:10 2003 Stefan Westerfeld * glib/gbsearcharray.h: inserted casts for C++. diff --git a/ChangeLog.pre-2-12 b/ChangeLog.pre-2-12 index 0dd6877d..05b27882 100644 --- a/ChangeLog.pre-2-12 +++ b/ChangeLog.pre-2-12 @@ -1,3 +1,16 @@ +2003-09-10 Noah Levitt + + * glib/gunicodeprivate.h: + * glib/gunicollate.c: + * glib/gunidecomp.c: + * glib/guniprop.c: + * tests/casemap.txt: + * tests/gen-casemap-txt.pl: Unicode 4.0 special casing. (#114681) + + * glib/gunicodeprivate.h: Use a private header instead of extern + function declarations (_g_utf8_normalize_wc, + _g_unichar_combining_class). + Mon Sep 8 00:31:10 2003 Stefan Westerfeld * glib/gbsearcharray.h: inserted casts for C++. diff --git a/ChangeLog.pre-2-4 b/ChangeLog.pre-2-4 index 0dd6877d..05b27882 100644 --- a/ChangeLog.pre-2-4 +++ b/ChangeLog.pre-2-4 @@ -1,3 +1,16 @@ +2003-09-10 Noah Levitt + + * glib/gunicodeprivate.h: + * glib/gunicollate.c: + * glib/gunidecomp.c: + * glib/guniprop.c: + * tests/casemap.txt: + * tests/gen-casemap-txt.pl: Unicode 4.0 special casing. (#114681) + + * glib/gunicodeprivate.h: Use a private header instead of extern + function declarations (_g_utf8_normalize_wc, + _g_unichar_combining_class). + Mon Sep 8 00:31:10 2003 Stefan Westerfeld * glib/gbsearcharray.h: inserted casts for C++. diff --git a/ChangeLog.pre-2-6 b/ChangeLog.pre-2-6 index 0dd6877d..05b27882 100644 --- a/ChangeLog.pre-2-6 +++ b/ChangeLog.pre-2-6 @@ -1,3 +1,16 @@ +2003-09-10 Noah Levitt + + * glib/gunicodeprivate.h: + * glib/gunicollate.c: + * glib/gunidecomp.c: + * glib/guniprop.c: + * tests/casemap.txt: + * tests/gen-casemap-txt.pl: Unicode 4.0 special casing. (#114681) + + * glib/gunicodeprivate.h: Use a private header instead of extern + function declarations (_g_utf8_normalize_wc, + _g_unichar_combining_class). + Mon Sep 8 00:31:10 2003 Stefan Westerfeld * glib/gbsearcharray.h: inserted casts for C++. diff --git a/ChangeLog.pre-2-8 b/ChangeLog.pre-2-8 index 0dd6877d..05b27882 100644 --- a/ChangeLog.pre-2-8 +++ b/ChangeLog.pre-2-8 @@ -1,3 +1,16 @@ +2003-09-10 Noah Levitt + + * glib/gunicodeprivate.h: + * glib/gunicollate.c: + * glib/gunidecomp.c: + * glib/guniprop.c: + * tests/casemap.txt: + * tests/gen-casemap-txt.pl: Unicode 4.0 special casing. (#114681) + + * glib/gunicodeprivate.h: Use a private header instead of extern + function declarations (_g_utf8_normalize_wc, + _g_unichar_combining_class). + Mon Sep 8 00:31:10 2003 Stefan Westerfeld * glib/gbsearcharray.h: inserted casts for C++. diff --git a/glib/gunicodeprivate.h b/glib/gunicodeprivate.h new file mode 100644 index 00000000..84ebad30 --- /dev/null +++ b/glib/gunicodeprivate.h @@ -0,0 +1,35 @@ +/* gunicodeprivate.h + * + * Copyright (C) 2003 Noah Levitt + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +#ifndef __G_UNICODE_PRIVATE_H__ +#define __G_UNICODE_PRIVATE_H__ + +#include "glib.h" + +G_BEGIN_DECLS + +gunichar *_g_utf8_normalize_wc (const gchar *str, + gssize max_len, + GNormalizeMode mode); +gint _g_unichar_combining_class (gunichar uc); + +G_END_DECLS + +#endif /* __G_UNICODE_PRIVATE_H__ */ diff --git a/glib/gunicollate.c b/glib/gunicollate.c index 9dcc6d20..63dc7304 100644 --- a/glib/gunicollate.c +++ b/glib/gunicollate.c @@ -27,10 +27,7 @@ #endif #include "glib.h" - -extern gunichar *_g_utf8_normalize_wc (const gchar *str, - gssize max_len, - GNormalizeMode mode); +#include "gunicodeprivate.h" /** * g_utf8_collate: diff --git a/glib/gunidecomp.c b/glib/gunidecomp.c index 8419564f..a5373f9b 100644 --- a/glib/gunidecomp.c +++ b/glib/gunidecomp.c @@ -26,6 +26,7 @@ #include "glib.h" #include "gunidecomp.h" #include "gunicomp.h" +#include "gunicodeprivate.h" #define CC_PART1(Page, Char) \ @@ -45,6 +46,12 @@ ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \ : 0)) +gint +_g_unichar_combining_class (gunichar uc) +{ + return COMBINING_CLASS (uc); +} + /** * g_unicode_canonical_ordering: * @string: a UCS-4 encoded string. diff --git a/glib/guniprop.c b/glib/guniprop.c index 9cef6e7c..df006763 100644 --- a/glib/guniprop.c +++ b/glib/guniprop.c @@ -27,6 +27,7 @@ #include "glib.h" #include "gunichartables.h" +#include "gunicodeprivate.h" #define ATTR_TABLE(Page) (((Page) <= G_UNICODE_LAST_PAGE_PART1) \ ? attr_table_part1[Page] \ @@ -737,6 +738,28 @@ g_utf8_strup (const gchar *str, return result; } +/* traverses the string checking for characters with combining class == 230 + * until a base character is found */ +static gboolean +has_more_above (gchar *str) +{ + gchar *p = str; + gint combining_class; + + while (*p) + { + combining_class = _g_unichar_combining_class (g_utf8_get_char (p)); + if (combining_class == 230) + return TRUE; + else if (combining_class == 0) + break; + + p = g_utf8_next_char (p); + } + + return FALSE; +} + static gsize real_tolower (const gchar *str, gssize max_len, @@ -758,9 +781,46 @@ real_tolower (const gchar *str, if (locale_type == LOCALE_TURKIC && c == 'I') { - /* I => LATIN SMALL LETTER DOTLESS I */ - len += g_unichar_to_utf8 (0x131, out_buffer ? out_buffer + len : NULL); - } + if (g_utf8_get_char (p) == 0x0307) + { + /* I + COMBINING DOT ABOVE => i (U+0069) */ + len += g_unichar_to_utf8 (0x0069, out_buffer ? out_buffer + len : NULL); + p = g_utf8_next_char (p); + } + else + { + /* I => LATIN SMALL LETTER DOTLESS I */ + len += g_unichar_to_utf8 (0x131, out_buffer ? out_buffer + len : NULL); + } + } + /* Introduce an explicit dot above when lowercasing capital I's and J's + * whenever there are more accents above. [SpecialCasing.txt] */ + else if (locale_type == LOCALE_LITHUANIAN && + (c == 0x00cc || c == 0x00cd || c == 0x0128)) + { + len += g_unichar_to_utf8 (0x0069, out_buffer ? out_buffer + len : NULL); + len += g_unichar_to_utf8 (0x0307, out_buffer ? out_buffer + len : NULL); + + switch (c) + { + case 0x00cc: + len += g_unichar_to_utf8 (0x0300, out_buffer ? out_buffer + len : NULL); + break; + case 0x00cd: + len += g_unichar_to_utf8 (0x0301, out_buffer ? out_buffer + len : NULL); + break; + case 0x0128: + len += g_unichar_to_utf8 (0x0303, out_buffer ? out_buffer + len : NULL); + break; + } + } + else if (locale_type == LOCALE_LITHUANIAN && + (c == 'I' || c == 'J' || c == 0x012e) && + has_more_above (p)) + { + len += g_unichar_to_utf8 (g_unichar_tolower (c), out_buffer ? out_buffer + len : NULL); + len += g_unichar_to_utf8 (0x0307, out_buffer ? out_buffer + len : NULL); + } else if (c == 0x03A3) /* GREEK CAPITAL LETTER SIGMA */ { if ((max_len < 0 || p < str + max_len) && *p) diff --git a/tests/casemap.txt b/tests/casemap.txt index 7bc2f40d..69699822 100644 --- a/tests/casemap.txt +++ b/tests/casemap.txt @@ -5,6 +5,10 @@ # tr_TR i i İ İ # i => LATIN CAPITAL LETTER I WITH DOT ABOVE tr_TR I ı I I # I => LATIN SMALL LETTER DOTLESS I +tr_TR İ i İ İ # I => LATIN SMALL LETTER DOTLESS I +tr_TR.UTF-8 i i İ İ # i => LATIN CAPITAL LETTER I WITH DOT ABOVE +tr_TR.UTF-8 I ı I I # I => LATIN SMALL LETTER DOTLESS I +tr_TR.UTF-8 İ i İ İ # I => LATIN SMALL LETTER DOTLESS I # Test reordering of YPOGEGRAMMENI across other accents ᾁ ᾁ ᾉ ἉΙ ᾁ ᾁ ᾉ ἉΙ @@ -16,6 +20,26 @@ tr_TR I ı I I # I => LATIN SMALL LETTER DOTLESS I # about the titlecase part here lt_LT iė iė Ie IE lt_LT iė iė Ie IE +lt_LT Ì i̇̀ Ì Ì # LATIN CAPITAL LETTER I WITH GRAVE +lt_LT Í i̇́ Í Í # LATIN CAPITAL LETTER I WITH ACUTE +lt_LT Ĩ i̇̃ Ĩ Ĩ # LATIN CAPITAL LETTER I WITH TILDE +lt_LT Í i̇́ Í Í # LATIN CAPITAL LETTER I (with acute accent) +lt_LT Ì i̇̀ Ì Ì # LATIN CAPITAL LETTER I (with grave accent) +lt_LT Ĩ i̇̃ Ĩ Ĩ # LATIN CAPITAL LETTER I (with tilde above) +lt_LT Į́ į̇́ Į́ Į́ # LATIN CAPITAL LETTER I (with ogonek and acute accent) +lt_LT J́ j̇́ J́ J́ # LATIN CAPITAL LETTER J (with acute accent) +lt_LT Į́ į̇́ Į́ Į́ # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent) +lt_LT.UTF-8 iė iė Ie IE +lt_LT.UTF-8 iė iė Ie IE +lt_LT.UTF-8 Ì i̇̀ Ì Ì # LATIN CAPITAL LETTER I WITH GRAVE +lt_LT.UTF-8 Í i̇́ Í Í # LATIN CAPITAL LETTER I WITH ACUTE +lt_LT.UTF-8 Ĩ i̇̃ Ĩ Ĩ # LATIN CAPITAL LETTER I WITH TILDE +lt_LT.UTF-8 Í i̇́ Í Í # LATIN CAPITAL LETTER I (with acute accent) +lt_LT.UTF-8 Ì i̇̀ Ì Ì # LATIN CAPITAL LETTER I (with grave accent) +lt_LT.UTF-8 Ĩ i̇̃ Ĩ Ĩ # LATIN CAPITAL LETTER I (with tilde above) +lt_LT.UTF-8 Į́ į̇́ Į́ Į́ # LATIN CAPITAL LETTER I (with ogonek and acute accent) +lt_LT.UTF-8 J́ j̇́ J́ J́ # LATIN CAPITAL LETTER J (with acute accent) +lt_LT.UTF-8 Į́ į̇́ Į́ Į́ # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent) # Special case not at initial position affl affl Affl AFFL # FB04 # diff --git a/tests/gen-casemap-txt.pl b/tests/gen-casemap-txt.pl index ae06ded4..0b9fc1d6 100755 --- a/tests/gen-casemap-txt.pl +++ b/tests/gen-casemap-txt.pl @@ -148,6 +148,10 @@ print < LATIN CAPITAL LETTER I WITH DOT ABOVE tr_TR\tI\t\x{0131}\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I +tr_TR\tI\x{0307}\ti\tI\x{0307}\tI\x{0307}\t# I => LATIN SMALL LETTER DOTLESS I +tr_TR.UTF-8\ti\ti\t\x{0130}\t\x{0130}\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE +tr_TR.UTF-8\tI\t\x{0131}\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I +tr_TR.UTF-8\tI\x{0307}\ti\tI\x{0307}\tI\x{0307}\t# I => LATIN SMALL LETTER DOTLESS I # Test reordering of YPOGEGRAMMENI across other accents \t\x{03b1}\x{0345}\x{0314}\t\x{03b1}\x{0345}\x{314}\t\x{0391}\x{0345}\x{0314}\t\x{0391}\x{0314}\x{0399}\t \t\x{03b1}\x{0314}\x{0345}\t\x{03b1}\x{314}\x{0345}\t\x{0391}\x{0314}\x{0345}\t\x{0391}\x{0314}\x{0399}\t @@ -159,6 +163,26 @@ tr_TR\tI\t\x{0131}\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I # about the titlecase part here lt_LT\ti\x{117}\ti\x{117}\tIe\tIE\t lt_LT\tie\x{307}\tie\x{307}\tIe\tIE\t +lt_LT\t\x{00cc}\ti\x{0307}\x{0300}\t\x{00cc}\t\x{00cc}\t # LATIN CAPITAL LETTER I WITH GRAVE +lt_LT\t\x{00CD}\ti\x{0307}\x{0301}\t\x{00CD}\t\x{00CD}\t # LATIN CAPITAL LETTER I WITH ACUTE +lt_LT\t\x{0128}\ti\x{0307}\x{0303}\t\x{0128}\t\x{0128}\t # LATIN CAPITAL LETTER I WITH TILDE +lt_LT\tI\x{0301}\ti\x{0307}\x{0301}\tI\x{0301}\tI\x{0301}\t # LATIN CAPITAL LETTER I (with acute accent) +lt_LT\tI\x{0300}\ti\x{0307}\x{0300}\tI\x{0300}\tI\x{0300}\t # LATIN CAPITAL LETTER I (with grave accent) +lt_LT\tI\x{0303}\ti\x{0307}\x{0303}\tI\x{0303}\tI\x{0303}\t # LATIN CAPITAL LETTER I (with tilde above) +lt_LT\tI\x{0328}\x{0301}\ti\x{0307}\x{0328}\x{0301}\tI\x{0328}\x{0301}\tI\x{0328}\x{0301}\t # LATIN CAPITAL LETTER I (with ogonek and acute accent) +lt_LT\tJ\x{0301}\tj\x{0307}\x{0301}\tJ\x{0301}\tJ\x{0301}\t # LATIN CAPITAL LETTER J (with acute accent) +lt_LT\t\x{012e}\x{0301}\t\x{012f}\x{0307}\x{0301}\t\x{012e}\x{0301}\t\x{012e}\x{0301}\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent) +lt_LT.UTF-8\ti\x{117}\ti\x{117}\tIe\tIE\t +lt_LT.UTF-8\tie\x{307}\tie\x{307}\tIe\tIE\t +lt_LT.UTF-8\t\x{00cc}\ti\x{0307}\x{0300}\t\x{00cc}\t\x{00cc}\t # LATIN CAPITAL LETTER I WITH GRAVE +lt_LT.UTF-8\t\x{00CD}\ti\x{0307}\x{0301}\t\x{00CD}\t\x{00CD}\t # LATIN CAPITAL LETTER I WITH ACUTE +lt_LT.UTF-8\t\x{0128}\ti\x{0307}\x{0303}\t\x{0128}\t\x{0128}\t # LATIN CAPITAL LETTER I WITH TILDE +lt_LT.UTF-8\tI\x{0301}\ti\x{0307}\x{0301}\tI\x{0301}\tI\x{0301}\t # LATIN CAPITAL LETTER I (with acute accent) +lt_LT.UTF-8\tI\x{0300}\ti\x{0307}\x{0300}\tI\x{0300}\tI\x{0300}\t # LATIN CAPITAL LETTER I (with grave accent) +lt_LT.UTF-8\tI\x{0303}\ti\x{0307}\x{0303}\tI\x{0303}\tI\x{0303}\t # LATIN CAPITAL LETTER I (with tilde above) +lt_LT.UTF-8\tI\x{0328}\x{0301}\ti\x{0307}\x{0328}\x{0301}\tI\x{0328}\x{0301}\tI\x{0328}\x{0301}\t # LATIN CAPITAL LETTER I (with ogonek and acute accent) +lt_LT.UTF-8\tJ\x{0301}\tj\x{0307}\x{0301}\tJ\x{0301}\tJ\x{0301}\t # LATIN CAPITAL LETTER J (with acute accent) +lt_LT.UTF-8\t\x{012e}\x{0301}\t\x{012f}\x{0307}\x{0301}\t\x{012e}\x{0301}\t\x{012e}\x{0301}\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent) # Special case not at initial position \ta\x{fb04}\ta\x{fb04}\tAffl\tAFFL\t# FB04 #