From 6deac027c98f5d99e1805f9ddc21ff2dbebe0fb7 Mon Sep 17 00:00:00 2001 From: Remi Collet Date: Wed, 20 Mar 2013 10:29:29 +0100 Subject: compat-icu36: new package (for EL-5) --- Makefile | 4 + canonicalize.patch | 11 + compat-icu36.spec | 189 +++++++ icu-3.4-multiarchdevel.patch | 70 +++ icu-config | 387 +++++++++++++++ icu.icu5365.dependantvowels.patch | 11 + icu.icu5418.malayam.patch | 39 ++ icu.icu5431.malayam.patch | 107 ++++ icu.icu5433.oriya.patch | 31 ++ icu.icu5465.telegu.patch | 29 ++ icu.icu5483.backport.patch | 874 +++++++++++++++++++++++++++++++++ icu.icu5488.assamese.patch | 11 + icu.icu5500.devicetablecrash.patch | 11 + icu.icu5501.sinhala.biggerexpand.patch | 11 + icu.icu5506.multiplevowels.patch | 61 +++ icu.icu5557.safety.patch | 14 + icu.icu5594.gujarati.patch | 14 + icu.icu5691.backport.patch | 730 +++++++++++++++++++++++++++ icu.icu5797.backport.patch | 749 ++++++++++++++++++++++++++++ icu.icu6001.backport.patch | 741 ++++++++++++++++++++++++++++ icu.icu6002.backport.patch | 397 +++++++++++++++ icu.icu6175.emptysegments.patch | 535 ++++++++++++++++++++ icu.icuXXXX.malayalam.bysyllable.patch | 250 ++++++++++ icu.icuXXXX.rollbackabi.patch | 131 +++++ icu.icuXXXX.virama.prevnext.patch | 98 ++++ icu.rh429023.regexp.patch | 307 ++++++++++++ 26 files changed, 5812 insertions(+) create mode 100644 Makefile create mode 100644 canonicalize.patch create mode 100644 compat-icu36.spec create mode 100644 icu-3.4-multiarchdevel.patch create mode 100755 icu-config create mode 100644 icu.icu5365.dependantvowels.patch create mode 100644 icu.icu5418.malayam.patch create mode 100644 icu.icu5431.malayam.patch create mode 100644 icu.icu5433.oriya.patch create mode 100644 icu.icu5465.telegu.patch create mode 100644 icu.icu5483.backport.patch create mode 100644 icu.icu5488.assamese.patch create mode 100644 icu.icu5500.devicetablecrash.patch create mode 100644 icu.icu5501.sinhala.biggerexpand.patch create mode 100644 icu.icu5506.multiplevowels.patch create mode 100644 
icu.icu5557.safety.patch create mode 100644 icu.icu5594.gujarati.patch create mode 100644 icu.icu5691.backport.patch create mode 100644 icu.icu5797.backport.patch create mode 100644 icu.icu6001.backport.patch create mode 100644 icu.icu6002.backport.patch create mode 100644 icu.icu6175.emptysegments.patch create mode 100644 icu.icuXXXX.malayalam.bysyllable.patch create mode 100644 icu.icuXXXX.rollbackabi.patch create mode 100644 icu.icuXXXX.virama.prevnext.patch create mode 100644 icu.rh429023.regexp.patch diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..1e65467 --- /dev/null +++ b/Makefile @@ -0,0 +1,4 @@ +SRCDIR := $(shell pwd) +NAME := $(shell basename $(SRCDIR)) +include ../common/Makefile + diff --git a/canonicalize.patch b/canonicalize.patch new file mode 100644 index 0000000..3ff9c33 --- /dev/null +++ b/canonicalize.patch @@ -0,0 +1,11 @@ +--- source/common/uloc.c 2011-12-12 04:50:00.601092000 -0500 ++++ source/common/uloc.c 2011-12-12 04:56:18.503570000 -0500 +@@ -1712,7 +1712,7 @@ + /* Check for EURO variants. 
*/ + sawEuro = _deleteVariant(variant, variantSize, "EURO", 4); + len -= sawEuro; +- if (sawEuro > 0 && name[len-1] == '_') { /* delete trailing '_' */ ++ if (sawEuro > 0 && len > 0 && name[len-1] == '_') { /* delete trailing '_' */ + --len; + } + diff --git a/compat-icu36.spec b/compat-icu36.spec new file mode 100644 index 0000000..d8a820e --- /dev/null +++ b/compat-icu36.spec @@ -0,0 +1,189 @@ +Name: compat-icu36 +Version: 3.6 +Release: 5.16.1 +Summary: International Components for Unicode + +Group: System Environment/Libraries +License: X License +URL: http://www.ibm.com/software/globalization/icu/ +Source0: ftp://ftp.software.ibm.com/software/globalization/icu/icu4c-3_6-src.tgz +BuildRoot: %{_tmppath}/%{name}-%{version}-root + +BuildRequires: doxygen, autoconf +Patch1: icu-3.4-multiarchdevel.patch +Patch2: icu-config +Patch3: icu.icu5365.dependantvowels.patch +Patch4: icu.icu5418.malayam.patch +Patch5: icu.icu5431.malayam.patch +Patch6: icu.icu5433.oriya.patch +Patch7: icu.icuXXXX.virama.prevnext.patch +Patch8: icu.icu5465.telegu.patch +Patch9: icu.icu5488.assamese.patch +Patch10: icu.icu5500.devicetablecrash.patch +Patch11: icu.icu5501.sinhala.biggerexpand.patch +Patch12: icu.icu5557.safety.patch +Patch13: icu.icu5594.gujarati.patch +Patch14: icu.icu5506.multiplevowels.patch +Patch15: icu.icuXXXX.malayalam.bysyllable.patch +Patch16: icu.rh429023.regexp.patch +Patch17: icu.icu5483.backport.patch +Patch18: icu.icu5797.backport.patch +Patch19: icu.icu6001.backport.patch +Patch20: icu.icu6002.backport.patch +Patch21: icu.icu6175.emptysegments.patch +Patch22: icu.icu5691.backport.patch +Patch23: icu.icuXXXX.rollbackabi.patch +Patch24: canonicalize.patch +Conflicts: icu + +%description +The International Components for Unicode (ICU) libraries provide +robust and full-featured Unicode services on a wide variety of +platforms. 
ICU supports the most current version of the Unicode +standard, and provides support for supplementary Unicode +characters (needed for GB 18030 repertoire support). +As computing environments become more heterogeneous, software +portability becomes more important. ICU lets you produce the same +results across all the various platforms you support, without +sacrificing performance. It offers great flexibility to extend and +customize the supplied services. + + +%package -n compat-libicu36 +Summary: International Components for Unicode - libraries +Group: System Environment/Libraries + +%description -n compat-libicu36 +%{summary}. + +This package provides the ICU libraries for packages built +against version %{version}. + +%package -n compat-libicu36-devel +Summary: Development files for International Components for Unicode +Group: Development/Libraries +Requires: compat-libicu36 = %{version}-%{release} +Requires: pkgconfig +Conflicts: libicu-devel + +%description -n compat-libicu36-devel +%{summary}. + +%package -n compat-libicu36-doc +Summary: Documentation for International Components for Unicode +Group: Documentation + +%description -n compat-libicu36-doc +%{summary}. 
+ + +%prep +%setup -q -n icu +%patch1 -p1 -b .multiarchdevel +%patch3 -p1 -b .dependantvowels +%patch4 -p1 -b .icu5418.malayam.patch +%patch5 -p1 -b .icu5431.malayam.patch +%patch6 -p1 -b .icu5433.oriya.patch +%patch7 -p1 -b .icuXXXX.virama.prevnext.patch +%patch8 -p1 -b .icu5465.telegu.patch +%patch9 -p1 -b .icu5488.assamese.patch +%patch10 -p1 -b .icu5500.devicetablecrash.patch +%patch11 -p1 -b .icu5501.sinhala.biggerexpand.patch +%patch12 -p1 -b .icu5557.safety.patch +%patch13 -p1 -b .icu5594.gujarati.patch +%patch14 -p1 -b .icu5506.multiplevowels.patch +%patch15 -p1 -b .icuXXXX.malayalam.bysyllable.patch +%patch16 -p1 -b .rh429023.regexp.patch +%patch17 -p1 -b .icu5483.backport.patch +%patch18 -p1 -b .icu5797.backport.patch +%patch19 -p1 -b .icu6001.backport.patch +%patch20 -p1 -b .icu6002.backport.patch +%patch21 -p1 -b .icu6175.emptysegments.patch +%patch22 -p1 -b .icu5691.backport.patch +%patch23 -p1 -b .icuXXXX.rollbackabi.patch +%patch24 -p0 -b .canonicalize.patch + +%build +cd source +export CFLAGS="$RPM_OPT_FLAGS -fno-strict-aliasing" +export CXXFLAGS="$RPM_OPT_FLAGS -fno-strict-aliasing" +autoconf +%configure --with-data-packaging=library --disable-samples +#rhbz#654590 +sed -i -- "s/-nodefaultlibs -nostdlib//" config/mh-linux +make # %{?_smp_mflags} # -j(X>1) may "break" man pages as of 3.2, b.f.u #2357 +make doc + +%install +rm -rf $RPM_BUILD_ROOT source/__docs +make -C source install DESTDIR=$RPM_BUILD_ROOT +make -C source install-doc docdir=__docs +chmod +x $RPM_BUILD_ROOT%{_libdir}/*.so.* +cp %{PATCH2} $RPM_BUILD_ROOT%{_bindir}/icu-config +chmod a+x $RPM_BUILD_ROOT%{_bindir}/icu-config +sed -i s/\\\$\(THREADSCXXFLAGS\)// $RPM_BUILD_ROOT/%{_libdir}/pkgconfig/icu.pc +sed -i s/\\\$\(THREADSCPPFLAGS\)/-D_REENTRANT/ $RPM_BUILD_ROOT/%{_libdir}/pkgconfig/icu.pc + +%check +make -C source check + + +%clean +rm -rf $RPM_BUILD_ROOT + + +%post -n compat-libicu36 -p /sbin/ldconfig + +%postun -n compat-libicu36 -p /sbin/ldconfig + + +%files 
+%defattr(-,root,root,-) +%doc license.html readme.html +%{_bindir}/derb +%{_bindir}/genbrk +%{_bindir}/gencnval +%{_bindir}/genctd +%{_bindir}/genrb +%{_bindir}/makeconv +%{_bindir}/pkgdata +%{_bindir}/uconv +%{_sbindir}/* +%{_mandir}/man1/derb.1* +%{_mandir}/man1/gencnval.1* +%{_mandir}/man1/genrb.1* +%{_mandir}/man1/genbrk.1* +%{_mandir}/man1/genctd.1* +%{_mandir}/man1/makeconv.1* +%{_mandir}/man1/pkgdata.1* +%{_mandir}/man1/uconv.1* +%{_mandir}/man8/*.8* + +%files -n compat-libicu36 +%defattr(-,root,root,-) +%{_libdir}/*.so.* + +%files -n compat-libicu36-devel +%defattr(-,root,root,-) +%{_bindir}/icu-config +%{_mandir}/man1/icu-config.1* +%{_includedir}/layout +%{_includedir}/unicode +%{_libdir}/*.so +%{_libdir}/icu +%{_libdir}/pkgconfig/icu.pc +%dir %{_datadir}/icu +%dir %{_datadir}/icu/3.6 +%{_datadir}/icu/3.6/mkinstalldirs +%{_datadir}/icu/3.6/config +%doc %{_datadir}/icu/3.6/license.html + +%files -n compat-libicu36-doc +%defattr(-,root,root,-) +%doc source/__docs/icu/html/* + + +%changelog +* Wed Mar 20 2013 Remi Collet - 3.6-5.16.1 +- new package from RHEL-5 spec of icu. 
+ diff --git a/icu-3.4-multiarchdevel.patch b/icu-3.4-multiarchdevel.patch new file mode 100644 index 0000000..a7839aa --- /dev/null +++ b/icu-3.4-multiarchdevel.patch @@ -0,0 +1,70 @@ +--- icu/source/configure.in.orig 2006-05-02 12:10:31.000000000 +0100 ++++ icu/source/configure.in 2006-05-02 15:06:07.000000000 +0100 +@@ -1011,6 +1011,7 @@ + Makefile \ + data/icupkg.inc \ + config/Makefile.inc \ ++ config/icu.pc \ + data/Makefile \ + stubdata/Makefile \ + common/Makefile \ +--- /dev/null 2006-04-29 13:38:37.035974750 +0100 ++++ icu/source/config/icu.pc.in 2006-05-02 15:03:14.000000000 +0100 +@@ -0,0 +1,46 @@ ++prefix = @prefix@ ++bindir = @bindir@ ++exec_prefix = @exec_prefix@ ++libdir = @libdir@ ++includedir = @includedir@ ++datadir = @datadir@ ++sbindir = @sbindir@ ++mandir = @mandir@ ++sysconfdir = @sysconfdir@ ++CFLAGS = @CFLAGS@ ++CXXFLAGS = @CXXFLAGS@ ++DEFS = @DEFS@ ++UNICODE_VERSION=@UNICODE_VERSION@ ++ICUPREFIX=icu ++ICULIBSUFFIX=@ICULIBSUFFIX@ ++LIBICU=lib${ICUPREFIX} ++LIBCPPFLAGS=-D_REENTRANT ++CPPFLAGS=@CPPFLAGS@ ${LIBCPPFLAGS} -I${prefix}/include ++SHAREDLIBCPPFLAGS=-DPIC ++SHAREDLIBCXXFLAGS=-fPIC ++SHAREDLIBCFLAGS=-fPIC ++pkglibdir=${libdir}/@PACKAGE@${ICULIBSUFFIX}/@VERSION@ ++pkgdatadir=${datadir}/@PACKAGE@${ICULIBSUFFIX}/@VERSION@ ++ICUDATA_NAME = icudt@LIB_VERSION_MAJOR@@ICUDATA_CHAR@ ++ICUPKGDATA_DIR=@libdir@ ++ICUDATA_DIR=${pkgdatadir} ++SO=so ++ICULIBS_COMMON_LIB_NAME=${LIBICU}uc${ICULIBSUFFIX}.${SO} ++SHLIB_cc=cxx ${DEFS} ${CPPFLAGS} ${CXXFLAGS} @LDFLAGS@ -shared ++SHLIB_c=cc ${DEFS} ${CPPFLAGS} ${CFLAGS} @LDFLAGS@ -shared ++ICULIBS_LAYOUT = -l${ICUPREFIX}le${ICULIBSUFFIX} -l${ICUPREFIX}lx${ICULIBSUFFIX} ++ICULIBS_TOOLUTIL = -l${ICUPREFIX}tu${ICULIBSUFFIX} ++ICULIBS_OBSOLETE = -l${ICUPREFIX}obsolete${ICULIBSUFFIX} ++ICULIBS_ICUIO = -l${ICUPREFIX}io${ICULIBSUFFIX} ++ICULIBS_I18N = -l${ICUPREFIX}i18n${ICULIBSUFFIX} ++ICULIBS_COMMON = -l${ICUPREFIX}uc${ICULIBSUFFIX} ++ICULIBS_DATA = -l${ICUPREFIX}data${ICULIBSUFFIX} ++ICULIBS_LIBSONLY = 
${ICULIBS_I18N} ${ICULIBS_COMMON} ${ICULIBS_DATA} ++ICULIBS_SYSTEMLIBS = @LIBS@ ++ICULIBS_BASE = @LIBS@ -L${libdir} ++ICULIBS = ${ICULIBS_BASE} ${ICULIBS_I18N} ${ICULIBS_COMMON} ${ICULIBS_DATA} ++ ++Name: @PACKAGE@ ++Description: International Components for Unicode ++Version: @VERSION@ ++Libs: @LDFLAGS@ ${ICULIBS} @LIBS@ +--- icu/source/Makefile.in.orig 2006-05-02 12:10:31.000000000 +0100 ++++ icu/source/Makefile.in 2006-05-02 15:18:15.000000000 +0100 +@@ -125,6 +125,8 @@ + @$(MKINSTALLDIRS) $(DESTDIR)$(sbindir) + $(INSTALL_DATA) @platform_make_fragment@ $(DESTDIR)$(pkgdatadir)/config/@platform_make_fragment_name@ + $(INSTALL_SCRIPT) $(top_srcdir)/mkinstalldirs $(DESTDIR)$(pkgdatadir)/mkinstalldirs ++ @$(MKINSTALLDIRS) $(DESTDIR)$(libdir)/pkgconfig ++ $(INSTALL_DATA) $(top_srcdir)/config/icu.pc $(DESTDIR)$(libdir)/pkgconfig/icu.pc + $(INSTALL_DATA) $(top_srcdir)/../license.html $(DESTDIR)$(pkgdatadir)/license.html + $(INSTALL_SCRIPT) $(top_builddir)/config/icu-config $(DESTDIR)$(bindir)/icu-config + $(INSTALL_DATA) $(top_builddir)/config/Makefile.inc $(DESTDIR)$(pkglibdir)/Makefile.inc diff --git a/icu-config b/icu-config new file mode 100755 index 0000000..08f9ce8 --- /dev/null +++ b/icu-config @@ -0,0 +1,387 @@ +#!/bin/sh +## -*-sh-*- +#set -x +# BEGIN of icu-config-top +#****************************************************************************** +# Copyright (C) 1999-2004, International Business Machines +# Corporation and others. All Rights Reserved. +#****************************************************************************** +# This script is designed to aid configuration of ICU. +# rpath links a library search path right into the binaries. + + +### END of icu-config-top + +## Zero out prefix. 
+exec_prefix=`pkg-config --variable=exec_prefix icu` +execprefix=$exec_prefix +prefix=`pkg-config --variable=prefix icu` + + +loaddefs() +{ +LDLIBRARYPATH_ENVVAR="LD_LIBRARY_PATH" +bindir=`pkg-config --variable=bindir icu` +sbindir=`pkg-config --variable=sbindir icu` +libdir=`pkg-config --variable=libdir icu` +sysconfdir=`pkg-config --variable=sysconfdir icu` +mandir=`pkg-config --variable=mandir icu` +datadir=`pkg-config --variable=datadir icu` +pkglibdir=`pkg-config --variable=pkglibdir icu` +ICULIBS_COMMON_LIB_NAME=`pkg-config --variable=ICULIBS_COMMON_LIB_NAME icu` +UNICODE_VERSION=`pkg-config --variable=UNICODE_VERSION icu` +VERSION=`pkg-config --modversion icu` +SO=`pkg-config --variable=SO icu` + +## -*-sh-*- +## BEGIN of icu-config-bottom. +## Copyright (c) 2002-2004, International Business Machines Corporation and +## others. All Rights Reserved. + +ICUUC_FILE=${libdir}/${ICULIBS_COMMON_LIB_NAME} + +# echo ENABLE RPATH $ENABLE_RPATH and RPATHLDFLAGS=${RPATH_LDFLAGS} +if [ "x$PKGDATA_MODE" = "x" ]; then + PKGDATA_MODE=dll +fi + +} + +## The actual code of icu-config goes here. + +ME=`basename $0` + +allflags() +{ + echo " --bindir Print binary directory path (bin)" + echo " --cc Print C compiler used [CC]" + echo " --cflags Print C compiler flags [CFLAGS]" + echo " --cflags-dynamic Print additional C flags for" + echo " building shared libraries." + echo " --cppflags Print C Preprocessor flags [CPPFLAGS]" + echo " --cppflags-dynamic Print additional C Preprocessor flags for" + echo " building shared libraries." + echo " --cppflags-searchpath Print only -I include directives (-Iinclude)" + echo " --cxx Print C++ compiler used [CXX]" + echo " --cxxflags Print C++ compiler flags [CXXFLAGS]" + echo " --cxxflags-dynamic Print additional C++ flags for" + echo " building shared libraries." 
+ echo " --detect-prefix Attempt to detect prefix based on PATH" + echo " --exec-prefix Print prefix for executables (/bin)" + echo " --exists Return with 0 status if ICU exists else fail" + echo " --help, -?, --usage Print this message" + echo " --icudata Print shortname of ICU data file (icudt21l)" + echo " --icudata-install-dir Print path to install data to - use as --install option to pkgdata(1)" + echo " --icudata-mode Print default ICU pkgdata mode (dll) - use as --mode option to pkgdata(1)." + echo " --icudatadir Print path to packaged archive data. Can set as [ICU_DATA]" + echo " --invoke Print commands to invoke an ICU program" + echo " --invoke= Print commands to invoke an ICU program named (ex: genrb)" + echo " --ldflags Print -L search path and -l libraries to link with ICU [LDFLAGS]. This is for the data, uc (common), and i18n libraries only. " + echo " --ldflags-layout Print ICU layout engine link directive. Use in addition to --ldflags" + echo " --ldflags-libsonly Same as --ldflags, but only the -l directives" + echo " --ldflags-searchpath Print only -L (search path) directive" + echo " --ldflags-system Print only system libs ICU links with (-lpthread, -lm)" + echo " --ldflags-icuio Print ICU icuio link directive. Use in addition to --ldflags " + echo " --ldflags-obsolete Print ICU obsolete link directive. Use in addition to --ldflags. (requires icuapps/obsolete to be built and installed.) " + echo " --mandir Print manpage (man) path" + echo " --prefix Print PREFIX to icu install (/usr/local)" + echo " --prefix=XXX Set prefix to XXX for remainder of command" + echo " --sbindir Print system binary path (sbin) " + echo " --shared-datadir Print shared data (share) path. This is NOT the ICU data dir." 
+ echo " --shlib-c Print the command to compile and build C shared libraries with ICU" + echo " --shlib-cc Print the command to compile and build C++ shared libraries with ICU" + echo " --sysconfdir Print system config (etc) path" + echo " --unicode-version Print version of Unicode data used in ICU ($UNICODE_VERSION)" + echo " --version Print ICU version ($VERSION)" + echo " --incfile Print path to Makefile.inc (for -O option of pkgdata)" +} + +## Print the normal usage message +shortusage() +{ + echo "usage: ${ME} " `allflags | cut -c-25 | sed -e 's%.*%[ & ]%'` +} + + +usage() +{ + echo "${ME}: icu-config: ICU configuration helper script" + echo + echo "The most commonly used options will be --cflags, --cxxflags, --cppflags, and --ldflags." + echo 'Example (in make): CPPFLAGS=$(shell icu-config --cppflags)' + echo ' LDFLAGS=$(shell icu-config --ldflags)' + echo " (etc).." + echo + echo "Usage:" + allflags + + echo + echo " [Brackets] show MAKE variable equivalents, (parenthesis) show example output" + echo + echo "Copyright (c) 2002, International Business Machines Corporation and others. All Rights Reserved." +} + +## Check the sanity of current variables: exit 2 with diagnostics on stderr if ${ICUUC_FILE} is not a regular file +sanity() +{ + if [ ! -f "${ICUUC_FILE}" ]; + then + echo "### $ME: Can't find ${ICUUC_FILE} - ICU prefix is wrong." 1>&2 + echo "### Try the --prefix= or --exec-prefix= options " 1>&2 + echo "### or --detect-prefix" 1>&2 # keep every diagnostic on stderr so captured stdout stays clean + echo "### $ME: Exitting." 1>&2 + exit 2 + fi +} + +## Main starts here. + +if [ $# -lt 1 ]; then + shortusage + exit 1 +fi + + +# Load our variables from autoconf +# ALWAYS load twice because of dependencies +loaddefs +loaddefs +sanity + +while [ $# -gt 0 ]; +do + arg="$1" + var=`echo $arg | sed -e 's/^[^=]*=//'` +# echo "### processing $arg" 1>&2 + case "$arg" in + + # undocumented. 
+ --debug) + set -x + ;; + + --so) + echo $SO + ;; + + --bindir) + echo $bindir + ;; + + --libdir) + echo $libdir + ;; + + --exists) + sanity + ;; + + --sbindir) + echo $sbindir + ;; + + --invoke=*) + QUOT="'" + CMD="${var}" + + # If it's not a locally executable command (1st choice) then + # search for it in the ICU directories. + if [ ! -x ${CMD} ]; then + if [ -x ${bindir}/${var} ]; then + CMD="${bindir}/${var}" + fi + if [ -x ${sbindir}/${var} ]; then + CMD="${sbindir}/${var}" + fi + fi + + echo "env ${QUOT}${LDLIBRARYPATH_ENVVAR}=${libdir}:"'${'"${LDLIBRARYPATH_ENVVAR}"'}'${QUOT} ${CMD} + ;; + + --invoke) + QUOT="'" + echo "env ${QUOT}${LDLIBRARYPATH_ENVVAR}=${libdir}:"'${'"${LDLIBRARYPATH_ENVVAR}"'}'${QUOT} + ;; + + --cflags) + pkg-config --variable=CFLAGS icu + ;; + + --cc) + echo cc + ;; + + --cxx) + echo c++ + ;; + + --cxxflags) + pkg-config --variable=CXXFLAGS icu + ;; + + --cppflags) + # Don't echo the -I. - it's unneeded. + CPPFLAGS=`pkg-config --variable=CPPFLAGS icu` + echo $CPPFLAGS | sed -e 's/-I. //' + ;; + + --cppflags-searchpath) + echo -I${prefix}/include + ;; + + --cppflags-dynamic) + pkg-config --variable=SHAREDLIBCPPFLAGS icu + ;; + + --cxxflags-dynamic) + pkg-config --variable=SHAREDLIBCXXFLAGS icu + ;; + + --cflags-dynamic) + pkg-config --variable=SHAREDLIBCFLAGS icu + ;; + + --ldflags-system) + pkg-config --variable=ICULIBS_SYSTEMLIBS icu + ;; + + --ldflags) + pkg-config --libs icu +# $RPATH_LDFLAGS + ;; + + --ldflags-libsonly) + pkg-config --variable=ICULIBS_LIBSONLY icu + ;; + + --ldflags-icuio) + pkg-config --variable=ICULIBS_ICUIO icu + ;; + + --ldflags-obsolete) + pkg-config --variable=ICULIBS_OBSOLETE icu + ;; + + --ldflags-toolutil) + pkg-config --variable=ICULIBS_TOOLUTIL icu + ;; + + --ldflags-layout) + pkg-config --variable=ICULIBS_LAYOUT icu + ;; + + --ldflags-searchpath) + echo -L${libdir} + ;; + + --detect-prefix) + HERE=`echo $0 | sed -e "s/$ME//g"` + if [ -f $HERE/../lib/${ICULIBS_COMMON_LIB_NAME} ]; then + prefix=$HERE/.. 
+ echo "## Using --prefix=${prefix}" 1>&2 + fi + loaddefs + loaddefs + sanity + ;; + + --exec-prefix) + echo $exec_prefix + ;; + + --prefix) + echo $prefix + ;; + + --prefix=*) + prefix=$var + loaddefs + loaddefs + sanity + ;; + + --sysconfdir) + echo $sysconfdir + ;; + + --mandir) + echo $mandir + ;; + + --shared-datadir) + echo $datadir + ;; + + --incfile) + echo $pkglibdir/Makefile.inc + ;; + + --icudata) + pkg-config --variable=ICUDATA_NAME icu + ;; + + --icudata-mode) + echo $PKGDATA_MODE + ;; + + --icudata-install-dir) + pkg-config --variable=ICUPKGDATA_DIR icu + ;; + + --icudatadir) + pkg-config --variable=ICUDATA_DIR icu + ;; + + --shlib-c) + pkg-config --variable=SHLIB_c icu + ;; + + --shlib-cc) + pkg-config --variable=SHLIB_cc icu + ;; + + --version) + echo $VERSION + ;; + + --unicode-version) + echo $UNICODE_VERSION + ;; + + --help) + usage + exit 0 + ;; + + --usage) + usage + exit 0 + ;; + +# --enable-rpath=*) +# ENABLE_RPATH=$var +# loaddefs +# ;; + + -?) + usage + exit 0 + ;; + + *) + echo ${ME}: ERROR Unknown Option $arg 1>&2 + echo 1>&2 + shortusage 1>&2 + echo "### $ME: Exitting." 
1>&2 + exit 1; + ;; + esac + shift +done + +# Check once before we quit (will check last used prefix) +sanity +## END of icu-config-bottom + +exit 0 + diff --git a/icu.icu5365.dependantvowels.patch b/icu.icu5365.dependantvowels.patch new file mode 100644 index 0000000..5708018 --- /dev/null +++ b/icu.icu5365.dependantvowels.patch @@ -0,0 +1,11 @@ +--- icu/source/layout/IndicReordering.cpp.orig 2006-09-05 17:01:15.000000000 +0100 ++++ icu/source/layout/IndicReordering.cpp 2006-09-05 17:01:19.000000000 +0100 +@@ -377,7 +377,7 @@ + {-1, 6, 1, -1, -1, -1, -1, -1, -1, 5, 9, 5, 5, 4, 12}, // 2 - consonant with nukta + {-1, 6, 1, -1, -1, -1, -1, -1, 2, 5, 9, 5, 5, 4, 12}, // 3 - consonant + {-1, -1, -1, -1, -1, -1, 3, 2, -1, -1, -1, -1, -1, -1, 7}, // 4 - consonant virama +- {-1, 6, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, // 5 - dependent vowels ++ {-1, 6, 1, -1, -1, -1, -1, -1, -1, 5, -1, -1, -1, -1, -1}, // 5 - dependent vowels + {-1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, // 6 - vowel mark + {-1, -1, -1, -1, -1, -1, 3, 2, -1, -1, -1, -1, -1, -1, -1}, // 7 - consonant virama ZWJ, consonant ZWJ virama + {-1, 6, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 4, -1}, // 8 - independent vowels that can take a virama diff --git a/icu.icu5418.malayam.patch b/icu.icu5418.malayam.patch new file mode 100644 index 0000000..03fbe63 --- /dev/null +++ b/icu.icu5418.malayam.patch @@ -0,0 +1,39 @@ +--- icu/source/layout/IndicClassTables.cpp.orig 2006-08-23 01:12:40.000000000 +0100 ++++ icu/source/layout/IndicClassTables.cpp 2006-09-25 09:06:38.000000000 +0100 +@@ -173,6 +173,19 @@ + _iv, _iv, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx // 0CE0 - 0CEF + }; + ++#if 1 ++//use the pango char class table here ++static const IndicClassTable::CharClass mlymCharClasses[] = ++{ ++ _xx, _xx, _mp, _mp, _xx, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _xx, _iv, _iv, /* 0D00 - 0D0F */ ++ _iv, _xx, _iv, _iv, _iv, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, 
_ct, _ct, _ct, /* 0D10 - 0D1F */ ++ _ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _xx, _ct, _ct, _ct, _ct, _ct, _pb, /* 0D20 - 0D2F */ ++ _pb, _cn, _ct, _ct, _ct, _pb, _ct, _ct, _ct, _ct, _xx, _xx, _xx, _xx, _dr, _dr, /* 0D30 - 0D3F */ ++ _dr, _dr, _dr, _dr, _xx, _xx, _dl, _dl, _dl, _xx, _s1, _s2, _s3, _vr, _xx, _xx, /* 0D40 - 0D4F */ ++ _xx, _xx, _xx, _xx, _xx, _xx, _xx, _dr, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, /* 0D50 - 0D5F */ ++ _iv, _iv, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx /* 0D60 - 0D6F */ ++}; ++#else + // FIXME: this is correct for old-style Malayalam (MAL) but not for reformed Malayalam (MLR) + // FIXME: should there be a REPH for old-style Malayalam? + static const IndicClassTable::CharClass mlymCharClasses[] = +@@ -185,6 +198,7 @@ + _xx, _xx, _xx, _xx, _xx, _xx, _xx, _m2, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, // 0D50 - 0D5F + _iv, _iv, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx // 0D60 - 0D6F + }; ++#endif + + static const IndicClassTable::CharClass sinhCharClasses[] = + { +@@ -232,7 +246,7 @@ + #define TAML_SCRIPT_FLAGS (SF_MPRE_FIXUP | SF_NO_POST_BASE_LIMIT | SF_FILTER_ZERO_WIDTH) + #define TELU_SCRIPT_FLAGS (SF_MATRAS_AFTER_BASE | SF_FILTER_ZERO_WIDTH | 3) + #define KNDA_SCRIPT_FLAGS (SF_MATRAS_AFTER_BASE | SF_FILTER_ZERO_WIDTH | 3) +-#define MLYM_SCRIPT_FLAGS (SF_MPRE_FIXUP | SF_NO_POST_BASE_LIMIT | SF_FILTER_ZERO_WIDTH) ++#define MLYM_SCRIPT_FLAGS (SF_MPRE_FIXUP | SF_NO_POST_BASE_LIMIT) + #define SINH_SCRIPT_FLAGS (SF_MPRE_FIXUP | SF_NO_POST_BASE_LIMIT) + + // diff --git a/icu.icu5431.malayam.patch b/icu.icu5431.malayam.patch new file mode 100644 index 0000000..48a549d --- /dev/null +++ b/icu.icu5431.malayam.patch @@ -0,0 +1,107 @@ +--- icu.orig/source/layout/IndicReordering.cpp 2006-12-21 09:24:42.000000000 +0000 ++++ icu/source/layout/IndicReordering.cpp 2006-12-21 09:16:15.000000000 +0000 +@@ -50,6 +50,14 @@ + #define distFeatureMask 0x00010000UL + #define initFeatureMask 
0x00008000UL + ++// TODO: Find better names for these! ++#define tagArray4 (loclFeatureMask | nuktFeatureMask | akhnFeatureMask | vatuFeatureMask | presFeatureMask | blwsFeatureMask | abvsFeatureMask | pstsFeatureMask | halnFeatureMask | blwmFeatureMask | abvmFeatureMask | distFeatureMask) ++#define tagArray3 (pstfFeatureMask | tagArray4) ++#define tagArray2 (halfFeatureMask | tagArray3) ++#define tagArray1 (blwfFeatureMask | tagArray2) ++#define tagArray0 (rphfFeatureMask | tagArray1) ++ ++ + class IndicReorderingOutput : public UMemory { + private: + le_int32 fOutIndex; +@@ -154,6 +162,27 @@ + fSMabove = fSMbelow = 0; + } + ++ void swapChars(int a, int b) ++ { ++ LEErrorCode success = LE_NO_ERROR; ++ LEUnicode temp_char; ++ le_uint32 temp_index; ++ FeatureMask temp_tag; ++ ++ temp_char = fOutChars[fOutIndex + b]; ++ temp_index = fGlyphStorage.getCharIndex(fOutIndex + b, success); ++ temp_tag = fGlyphStorage.getAuxData(fOutIndex + b, success); ++ ++ fOutChars[fOutIndex + b] = fOutChars[fOutIndex + a]; ++ le_uint32 toswap = fGlyphStorage.getCharIndex(fOutIndex + a, success); ++ fGlyphStorage.setCharIndex(fOutIndex + b, toswap, success); ++ fGlyphStorage.setAuxData(fOutIndex + b, tagArray3, success); ++ ++ fOutChars[fOutIndex + a] = temp_char; ++ fGlyphStorage.setCharIndex(fOutIndex + a, temp_index, success); ++ fGlyphStorage.setAuxData(fOutIndex + a, temp_tag, success); ++ } ++ + void writeChar(LEUnicode ch, le_uint32 charIndex, FeatureMask charFeatures) + { + LEErrorCode success = LE_NO_ERROR; +@@ -335,13 +364,6 @@ + C_DOTTED_CIRCLE = 0x25CC + }; + +-// TODO: Find better names for these! 
+-#define tagArray4 (loclFeatureMask | nuktFeatureMask | akhnFeatureMask | vatuFeatureMask | presFeatureMask | blwsFeatureMask | abvsFeatureMask | pstsFeatureMask | halnFeatureMask | blwmFeatureMask | abvmFeatureMask | distFeatureMask) +-#define tagArray3 (pstfFeatureMask | tagArray4) +-#define tagArray2 (halfFeatureMask | tagArray3) +-#define tagArray1 (blwfFeatureMask | tagArray2) +-#define tagArray0 (rphfFeatureMask | tagArray1) +- + static const FeatureMap featureMap[] = + { + {loclFeatureTag, loclFeatureMask}, +@@ -629,6 +651,21 @@ + output.writeChar(chars[i], i, tagArray4); + } + ++ /* for the special conjuction of Cons+0x0d4d+0x0d31 or Cons+0x0d4d+0x0d30 of Malayalam */ ++ if ((baseConsonant - 2 >= 0) && ++ (chars[baseConsonant - 1] == 0x0d4d) && ++ ((chars[baseConsonant] == 0x0d31) || ++ (chars[baseConsonant] == 0x0d30)) && ++ ((chars[baseConsonant - 2] >= 0x0d15) && ++ (chars[baseConsonant - 2] <= 0x0d39))) { ++ if (baseConsonant < 3 || chars[baseConsonant - 3] != 0x0d4d) { ++ output.swapChars(-1, -3); ++ ++ if (mpreFixups) ++ mpreFixups->reduce(); ++ } ++ } ++ + if ((classTable->scriptFlags & SF_MATRAS_AFTER_BASE) != 0) { + output.writeMbelow(); + output.writeSMbelow(); // FIXME: there are no SMs in these scripts... 
+--- icu.orig/source/layout/MPreFixups.h 2006-11-10 09:42:47.000000000 +0000 ++++ icu/source/layout/MPreFixups.h 2006-12-21 09:13:47.000000000 +0000 +@@ -31,6 +31,8 @@ + + void apply(LEGlyphStorage &glyphStorage); + ++ void reduce(); ++ + private: + FixupData *fFixupData; + le_int32 fFixupCount; +--- icu.orig/source/layout/MPreFixups.cpp 2006-11-10 09:42:47.000000000 +0000 ++++ icu/source/layout/MPreFixups.cpp 2006-12-21 09:16:33.000000000 +0000 +@@ -40,6 +40,12 @@ + } + } + ++void MPreFixups::reduce() ++{ ++ if (fFixupCount > 0) ++ fFixupCount--; ++} ++ + void MPreFixups::apply(LEGlyphStorage &glyphStorage) + { + for (le_int32 fixup = 0; fixup < fFixupCount; fixup += 1) { diff --git a/icu.icu5433.oriya.patch b/icu.icu5433.oriya.patch new file mode 100644 index 0000000..f35f5a2 --- /dev/null +++ b/icu.icu5433.oriya.patch @@ -0,0 +1,31 @@ +diff -ru icu.orig/source/layout/IndicClassTables.cpp icu/source/layout/IndicClassTables.cpp +--- icu.orig/source/layout/IndicClassTables.cpp 2006-10-03 14:27:47.000000000 +0100 ++++ icu/source/layout/IndicClassTables.cpp 2006-10-03 14:30:07.000000000 +0100 +@@ -120,6 +120,19 @@ + _iv, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx // 0AE0 - 0AEF + }; + ++#if 1 ++static const IndicClassTable::CharClass oryaCharClasses[] = ++{ ++ _xx, _ma, _mp, _mp, _xx, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _xx, _xx, _iv, /* 0B00 - 0B0F */ ++ _iv, _xx, _xx, _iv, _iv, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _ct, _bb, /* 0B10 - 0B1F */ ++ _bb, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _xx, _bb, _bb, _bb, _bb, _bb, _pb, /* 0B20 - 0B2F */ ++ _rb, _xx, _bb, _bb, _xx, _bb, _bb, _bb, _bb, _bb, _xx, _xx, _nu, _xx, _dr, _da, /* 0B30 - 0B3F */ ++ _dr, _db, _db, _db, _xx, _xx, _xx, _dl, _s1, _xx, _xx, _s2, _s3, _vr, _xx, _xx, /* 0B40 - 0B4F */ ++ _xx, _xx, _xx, _xx, _xx, _xx, _da, _dr, _xx, _xx, _xx, _xx, _cn, _cn, _xx, _pb, /* 0B50 - 0B5F */ ++ _iv, _iv, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, /* 
0B60 - 0B6F */ ++ _xx, _bb /* 0B70 - 0B71 */ ++}; ++#else + static const IndicClassTable::CharClass oryaCharClasses[] = + { + _xx, _ma, _mp, _mp, _xx, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _xx, _xx, _iv, // 0B00 - 0B0F +@@ -131,6 +144,7 @@ + _iv, _iv, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, // 0B60 - 0B6F + _xx, _ct // 0B70 - 0B71 + }; ++#endif + + static const IndicClassTable::CharClass tamlCharClasses[] = + { diff --git a/icu.icu5465.telegu.patch b/icu.icu5465.telegu.patch new file mode 100644 index 0000000..7e80103 --- /dev/null +++ b/icu.icu5465.telegu.patch @@ -0,0 +1,29 @@ +--- icu.orig/source/layout/IndicClassTables.cpp 2007-02-05 14:44:17.000000000 +0000 ++++ icu/source/layout/IndicClassTables.cpp 2007-02-05 14:47:49.000000000 +0000 +@@ -145,6 +145,7 @@ + }; + + // FIXME: Should some of the bb's be pb's? (KA, NA, MA, YA, VA, etc. (approx 13)) ++#if 0 + static const IndicClassTable::CharClass teluCharClasses[] = + { + _xx, _mp, _mp, _mp, _xx, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _xx, _iv, _iv, // 0C00 - 0C0F +@@ -155,6 +156,18 @@ + _xx, _xx, _xx, _xx, _xx, _da, _m2, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, // 0C50 - 0C5F + _iv, _iv, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx // 0C60 - 0C6F + }; ++#else ++static const IndicClassTable::CharClass teluCharClasses[] = ++{ ++ _xx, _mp, _mp, _mp, _xx, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _xx, _iv, _iv, /* 0C00 - 0C0F */ ++ _iv, _xx, _iv, _iv, _iv, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _bb, /* 0C10 - 0C1F */ ++ _bb, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _xx, _bb, _bb, _bb, _bb, _bb, _bb, /* 0C20 - 0C2F */ ++ _bb, _bb, _bb, _bb, _xx, _bb, _bb, _bb, _bb, _bb, _xx, _xx, _xx, _xx, _da, _da, /* 0C30 - 0C3F */ ++ _da, _dr, _dr, _dr, _dr, _xx, _da, _da, _s1, _xx, _da, _da, _da, _vr, _xx, _xx, /* 0C40 - 0C4F */ ++ _xx, _xx, _xx, _xx, _xx, _da, _db, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, /* 0C50 - 0C5F */ ++ _iv, _iv, _xx, _xx, _xx, _xx, 
_xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx /* 0C60 - 0C6F */ ++}; ++#endif + + // U+CC3 and U+CC4 are _lm here not _dr since the Kannada rendering + // rules want them below and to the right of the entire cluster diff --git a/icu.icu5483.backport.patch b/icu.icu5483.backport.patch new file mode 100644 index 0000000..039dee2 --- /dev/null +++ b/icu.icu5483.backport.patch @@ -0,0 +1,874 @@ +diff -ru icu.orig/source/common/ucnv2022.c icu/source/common/ucnv2022.c +--- icu.orig/source/common/ucnv2022.c 2009-06-02 11:48:38.000000000 +0100 ++++ icu/source/common/ucnv2022.c 2009-06-02 12:30:29.000000000 +0100 +@@ -84,6 +84,26 @@ + #define V_TAB 0x0B + #define SPACE 0x20 + ++enum { ++ HWKANA_START=0xff61, ++ HWKANA_END=0xff9f ++}; ++ ++/* ++ * 94-character sets with native byte values A1..FE are encoded in ISO 2022 ++ * as bytes 21..7E. (Subtract 0x80.) ++ * 96-character sets with native byte values A0..FF are encoded in ISO 2022 ++ * as bytes 20..7F. (Subtract 0x80.) ++ * Do not encode C1 control codes with native bytes 80..9F ++ * as bytes 00..1F (C0 control codes). ++ */ ++enum { ++ GR94_START=0xa1, ++ GR94_END=0xfe, ++ GR96_START=0xa0, ++ GR96_END=0xff ++}; ++ + /* + * ISO 2022 control codes must not be converted from Unicode + * because they would mess up the byte stream. +@@ -981,22 +1001,27 @@ + + + /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c +- * any future change in _MBCSFromUChar32() function should be reflected in +- * this macro ++ * any future change in _MBCSFromUChar32() function should be reflected here. 
++ * @return number of bytes in *value; negative number if fallback; 0 if no mapping + */ +-static U_INLINE void ++static U_INLINE int32_t + MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData, + UChar32 c, + uint32_t* value, + UBool useFallback, +- int32_t *length, + int outputType) + { + const int32_t *cx; + const uint16_t *table; + uint32_t stage2Entry; + uint32_t myValue; ++ int32_t length; + const uint8_t *p; ++ /* ++ * TODO(markus): Use and require new, faster MBCS conversion table structures. ++ * Use internal version of ucnv_open() that verifies that the new structures are available, ++ * else U_INTERNAL_PROGRAM_ERROR. ++ */ + /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ + if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { + table=sharedData->mbcs.fromUnicodeTable; +@@ -1005,51 +1030,60 @@ + if(outputType==MBCS_OUTPUT_2){ + myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); + if(myValue<=0xff) { +- *length=1; ++ length=1; + } else { +- *length=2; ++ length=2; + } + } else /* outputType==MBCS_OUTPUT_3 */ { + p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); + myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; + if(myValue<=0xff) { +- *length=1; ++ length=1; + } else if(myValue<=0xffff) { +- *length=2; ++ length=2; + } else { +- *length=3; ++ length=3; + } + } ++ /* ++ * TODO(markus): Use Shift-JIS table for JIS X 0208, to save mapping table space. ++ * Pass in parameter for type of output bytes, for validation and shifting: ++ * - Direct: Pass bytes through, but forbid control codes 00-1F (except SI/SO/ESC) and space 20? ++ * (Need to allow some (TAB/LF/CR) or most of them for ASCII and maybe JIS X 0201.) ++ * - A1-FE: Subtract 80 after range check. ++ * - SJIS: Shift DBCS result to 21-7E x 21-7E. ++ */ + /* is this code point assigned, or do we use fallbacks? 
*/ +- if( (stage2Entry&(1<<(16+(c&0xf))))!=0 || +- (FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) +- ) { ++ if((stage2Entry&(1<<(16+(c&0xf))))!=0) { ++ /* assigned */ ++ *value=myValue; ++ return length; ++ } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) { + /* + * We allow a 0 byte output if the "assigned" bit is set for this entry. + * There is no way with this data structure for fallback output + * to be a zero byte. + */ +- /* assigned */ + *value=myValue; +- return; ++ return -length; + } + } + + cx=sharedData->mbcs.extIndexes; + if(cx!=NULL) { +- *length=ucnv_extSimpleMatchFromU(cx, c, value, useFallback); +- return; ++ return ucnv_extSimpleMatchFromU(cx, c, value, useFallback); + } + + /* unassigned */ +- *length=0; ++ return 0; + } + + /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c +- * any future change in _MBCSSingleFromUChar32() function should be reflected in +- * this macro ++ * any future change in _MBCSSingleFromUChar32() function should be reflected here. ++ * @param retval pointer to output byte ++ * @return 1 roundtrip byte 0 no mapping -1 fallback byte + */ +-static U_INLINE void ++static U_INLINE int32_t + MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData, + UChar32 c, + uint32_t* retval, +@@ -1059,20 +1093,21 @@ + int32_t value; + /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ + if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { +- *retval=(uint16_t)-1; +- return; ++ return 0; + } + /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ + table=sharedData->mbcs.fromUnicodeTable; + /* get the byte for the output */ + value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); + /* is this code point assigned, or do we use fallbacks? */ +- if(useFallback ? 
value>=0x800 : value>=0xc00) { +- value &=0xff; ++ *retval=(uint32_t)(value&0xff); ++ if(value>=0xf00) { ++ return 1; /* roundtrip */ ++ } else if(useFallback ? value>=0x800 : value>=0xc00) { ++ return -1; /* fallback taken */ + } else { +- value= -1; ++ return 0; /* no mapping */ + } +- *retval=(uint16_t) value; + } + + #ifdef U_ENABLE_GENERIC_ISO_2022 +@@ -1316,6 +1351,7 @@ + + static void + UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) { ++ UConverter *cnv = args->converter; + UConverterDataISO2022 *converterData; + ISO2022State *pFromU2022State; + uint8_t *target = (uint8_t *) args->target; +@@ -1335,14 +1371,13 @@ + int8_t cs, g; + + /* set up the state */ +- converterData = (UConverterDataISO2022*)args->converter->extraInfo; ++ converterData = (UConverterDataISO2022*)cnv->extraInfo; + pFromU2022State = &converterData->fromU2022State; +- useFallback = args->converter->useFallback; + + choiceCount = 0; + + /* check if the last codepoint of previous buffer was a lead surrogate*/ +- if((sourceChar = args->converter->fromUChar32)!=0 && target< targetLimit) { ++ if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) { + goto getTrail; + } + +@@ -1361,26 +1396,26 @@ + if(UTF_IS_SECOND_SURROGATE(trail)) { + source++; + sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail); +- args->converter->fromUChar32=0x00; ++ cnv->fromUChar32=0x00; + /* convert this supplementary code point */ + /* exit this condition tree */ + } else { + /* this is an unmatched lead code unit (1st surrogate) */ + /* callback(illegal) */ + *err=U_ILLEGAL_CHAR_FOUND; +- args->converter->fromUChar32=sourceChar; ++ cnv->fromUChar32=sourceChar; + break; + } + } else { + /* no more input */ +- args->converter->fromUChar32=sourceChar; ++ cnv->fromUChar32=sourceChar; + break; + } + } else { + /* this is an unmatched trail code unit (2nd surrogate) */ + /* callback(illegal) */ + *err=U_ILLEGAL_CHAR_FOUND; +- args->converter->fromUChar32=sourceChar; 
++ cnv->fromUChar32=sourceChar; + break; + } + } +@@ -1389,7 +1424,7 @@ + if(IS_2022_CONTROL(sourceChar)) { + /* callback(illegal) */ + *err=U_ILLEGAL_CHAR_FOUND; +- args->converter->fromUChar32=sourceChar; ++ cnv->fromUChar32=sourceChar; + break; + } + +@@ -1407,9 +1442,10 @@ + + /* JIS7/8: try single-byte half-width Katakana before JISX208 */ + if(converterData->version == 3 || converterData->version == 4) { +- choices[choiceCount++] = cs = (int8_t)HWKANA_7BIT; +- csm &= ~CSM(cs); ++ choices[choiceCount++] = (int8_t)HWKANA_7BIT; + } ++ /* Do not try single-byte half-width Katakana for other versions. */ ++ csm &= ~CSM(HWKANA_7BIT); + + /* try the current G0 charset */ + choices[choiceCount++] = cs = pFromU2022State->cs[0]; +@@ -1432,86 +1468,134 @@ + } + + cs = g = 0; ++ /* ++ * len==0: no mapping found yet ++ * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks ++ * len>0: found a roundtrip result, done ++ */ + len = 0; ++ /* ++ * We will turn off useFallback after finding a fallback, ++ * but we still get fallbacks from PUA code points as usual. ++ * Therefore, we will also need to check that we don't overwrite ++ * an early fallback with a later one. 
++ */ ++ useFallback = cnv->useFallback; + +- for(i = 0; i < choiceCount && len == 0; ++i) { +- cs = choices[i]; +- switch(cs) { ++ for(i = 0; i < choiceCount && len <= 0; ++i) { ++ uint32_t value; ++ int32_t len2; ++ int8_t cs0 = choices[i]; ++ switch(cs0) { + case ASCII: + if(sourceChar <= 0x7f) { + targetValue = (uint32_t)sourceChar; + len = 1; ++ cs = cs0; ++ g = 0; + } + break; + case ISO8859_1: +- if(0x80 <= sourceChar && sourceChar <= 0xff) { ++ if(GR96_START <= sourceChar && sourceChar <= GR96_END) { + targetValue = (uint32_t)sourceChar - 0x80; + len = 1; ++ cs = cs0; + g = 2; + } + break; + case HWKANA_7BIT: +- if((uint32_t)(0xff9f-sourceChar)<=(0xff9f-0xff61)) { +- targetValue = (uint32_t)(sourceChar - (0xff61 - 0x21)); +- len = 1; +- ++ if((uint32_t)(HWKANA_END-sourceChar)<=(HWKANA_END-HWKANA_START)) { + if(converterData->version==3) { + /* JIS7: use G1 (SO) */ +- pFromU2022State->cs[1] = cs; /* do not output an escape sequence */ ++ /* Shift U+FF61..U+FF9F to bytes 21..5F. */ ++ targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21)); ++ len = 1; ++ pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */ + g = 1; + } else if(converterData->version==4) { + /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */ +- int8_t cs0; +- +- targetValue += 0x80; ++ /* Shift U+FF61..U+FF9F to bytes A1..DF. 
*/ ++ targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1)); ++ len = 1; + +- cs0 = pFromU2022State->cs[0]; +- if(IS_JP_DBCS(cs0)) { ++ cs = pFromU2022State->cs[0]; ++ if(IS_JP_DBCS(cs)) { + /* switch from a DBCS charset to JISX201 */ + cs = (int8_t)JISX201; +- } else { +- /* stay in the current G0 charset */ +- cs = cs0; + } ++ /* else stay in the current G0 charset */ ++ g = 0; + } ++ /* else do not use HWKANA_7BIT with other versions */ + } + break; + case JISX201: + /* G0 SBCS */ +- MBCS_SINGLE_FROM_UCHAR32( +- converterData->myConverterArray[cs], +- sourceChar, &targetValue, +- useFallback); +- if(targetValue <= 0x7f) { +- len = 1; ++ len2 = MBCS_SINGLE_FROM_UCHAR32( ++ converterData->myConverterArray[cs0], ++ sourceChar, &value, ++ useFallback); ++ if(len2 != 0 && !(len2 < 0 && len != 0) && value <= 0x7f) { ++ targetValue = value; ++ len = len2; ++ cs = cs0; ++ g = 0; ++ useFallback = FALSE; + } + break; + case ISO8859_7: + /* G0 SBCS forced to 7-bit output */ +- MBCS_SINGLE_FROM_UCHAR32( +- converterData->myConverterArray[cs], +- sourceChar, &targetValue, +- useFallback); +- if(0x80 <= targetValue && targetValue <= 0xff) { +- targetValue -= 0x80; +- len = 1; ++ len2 = MBCS_SINGLE_FROM_UCHAR32( ++ converterData->myConverterArray[cs0], ++ sourceChar, &value, ++ useFallback); ++ if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) { ++ targetValue = value - 0x80; ++ len = len2; ++ cs = cs0; + g = 2; ++ useFallback = FALSE; + } + break; + default: + /* G0 DBCS */ +- MBCS_FROM_UCHAR32_ISO2022( +- converterData->myConverterArray[cs], +- sourceChar, &targetValue, +- useFallback, &len, MBCS_OUTPUT_2); +- if(len != 2) { +- len = 0; ++ len2 = MBCS_FROM_UCHAR32_ISO2022( ++ converterData->myConverterArray[cs0], ++ sourceChar, &value, ++ useFallback, MBCS_OUTPUT_2); ++ if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ ++ if(cs0 == KSC5601) { ++ /* ++ * Check for valid bytes for the encoding scheme. 
++ * This is necessary because the sub-converter (windows-949) ++ * has a broader encoding scheme than is valid for 2022. ++ * ++ * Check that the result is a 2-byte value with each byte in the range A1..FE ++ * (strict EUC-KR DBCS) before accepting it and subtracting 0x80 from each byte ++ * to move it to the ISO 2022 range 21..7E. ++ */ ++ if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) && ++ (uint8_t)(value - 0xa1) <= (0xfe - 0xa1) ++ ) { ++ value -= 0x8080; /* shift down to 21..7e byte range */ ++ } else { ++ break; /* not valid for ISO 2022 */ ++ } ++ } ++ targetValue = value; ++ len = len2; ++ cs = cs0; ++ g = 0; ++ useFallback = FALSE; + } + break; + } + } + +- if(len > 0) { ++ if(len != 0) { ++ if(len < 0) { ++ len = -len; /* fallback */ ++ } + outLen = 0; /* count output bytes */ + + /* write SI if necessary (only for JIS7) */ +@@ -1560,7 +1644,7 @@ + * then this is an error + */ + *err = U_INVALID_CHAR_FOUND; +- args->converter->fromUChar32=sourceChar; ++ cnv->fromUChar32=sourceChar; + break; + } + +@@ -1586,7 +1670,7 @@ + } + } else { + fromUWriteUInt8( +- args->converter, ++ cnv, + buffer, outLen, + &target, (const char *)targetLimit, + &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), +@@ -1615,7 +1699,7 @@ + */ + if( U_SUCCESS(*err) && + (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) && +- args->flush && source>=sourceLimit && args->converter->fromUChar32==0 ++ args->flush && source>=sourceLimit && cnv->fromUChar32==0 + ) { + int32_t sourceIndex; + +@@ -1654,7 +1738,7 @@ + } + + fromUWriteUInt8( +- args->converter, ++ cnv, + buffer, outLen, + &target, (const char *)targetLimit, + &offsets, sourceIndex, +@@ -1777,7 +1861,7 @@ + !IS_JP_DBCS(cs) + ) { + /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */ +- targetUniChar = mySourceChar + (0xff61 - 0xa1); ++ targetUniChar = mySourceChar + (HWKANA_START - 0xa1); + + /* return from a single-shift state to the previous one */ + if(pToU2022State->g >= 2) { +@@ 
-1818,7 +1902,7 @@ + case HWKANA_7BIT: + if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) { + /* 7-bit halfwidth Katakana */ +- targetUniChar = mySourceChar + (0xff61 - 0x21); ++ targetUniChar = mySourceChar + (HWKANA_START - 0x21); + } + break; + default: +@@ -1965,9 +2049,10 @@ + break; + } + +- /* length= ucnv_MBCSFromUChar32(converterData->currentConverter->sharedData, +- sourceChar,&targetByteUnit,args->converter->useFallback);*/ +- MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,&length,MBCS_OUTPUT_2); ++ length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2); ++ if(length < 0) { ++ length = -length; /* fallback */ ++ } + /* only DBCS or SBCS characters are expected*/ + /* DB characters with high bit set to 1 are expected */ + if(length > 2 || length==0 ||(((targetByteUnit & 0x8080) != 0x8080)&& length==2)){ +@@ -2449,7 +2534,7 @@ + + static void + UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){ +- ++ UConverter *cnv = args->converter; + UConverterDataISO2022 *converterData; + ISO2022State *pFromU2022State; + uint8_t *target = (uint8_t *) args->target; +@@ -2466,14 +2551,13 @@ + UBool useFallback; + + /* set up the state */ +- converterData = (UConverterDataISO2022*)args->converter->extraInfo; ++ converterData = (UConverterDataISO2022*)cnv->extraInfo; + pFromU2022State = &converterData->fromU2022State; +- useFallback = args->converter->useFallback; + + choiceCount = 0; + + /* check if the last codepoint of previous buffer was a lead surrogate*/ +- if((sourceChar = args->converter->fromUChar32)!=0 && target< targetLimit) { ++ if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) { + goto getTrail; + } + +@@ -2492,26 +2576,26 @@ + if(UTF_IS_SECOND_SURROGATE(trail)) { + source++; + sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail); +- args->converter->fromUChar32=0x00; ++ cnv->fromUChar32=0x00; + /* convert this supplementary 
code point */ + /* exit this condition tree */ + } else { + /* this is an unmatched lead code unit (1st surrogate) */ + /* callback(illegal) */ + *err=U_ILLEGAL_CHAR_FOUND; +- args->converter->fromUChar32=sourceChar; ++ cnv->fromUChar32=sourceChar; + break; + } + } else { + /* no more input */ +- args->converter->fromUChar32=sourceChar; ++ cnv->fromUChar32=sourceChar; + break; + } + } else { + /* this is an unmatched trail code unit (2nd surrogate) */ + /* callback(illegal) */ + *err=U_ILLEGAL_CHAR_FOUND; +- args->converter->fromUChar32=sourceChar; ++ cnv->fromUChar32=sourceChar; + break; + } + } +@@ -2522,7 +2606,7 @@ + if(IS_2022_CONTROL(sourceChar)) { + /* callback(illegal) */ + *err=U_ILLEGAL_CHAR_FOUND; +- args->converter->fromUChar32=sourceChar; ++ cnv->fromUChar32=sourceChar; + break; + } + +@@ -2545,7 +2629,6 @@ + } + else{ + /* convert U+0080..U+10ffff */ +- UConverterSharedData *cnv; + int32_t i; + int8_t cs, g; + +@@ -2593,17 +2676,41 @@ + } + + cs = g = 0; ++ /* ++ * len==0: no mapping found yet ++ * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks ++ * len>0: found a roundtrip result, done ++ */ + len = 0; ++ /* ++ * We will turn off useFallback after finding a fallback, ++ * but we still get fallbacks from PUA code points as usual. ++ * Therefore, we will also need to check that we don't overwrite ++ * an early fallback with a later one. 
++ */ ++ useFallback = cnv->useFallback; + +- for(i = 0; i < choiceCount && len == 0; ++i) { +- cs = choices[i]; +- if(cs > 0) { +- if(cs > CNS_11643_0) { +- cnv = converterData->myConverterArray[CNS_11643]; +- MBCS_FROM_UCHAR32_ISO2022(cnv,sourceChar,&targetValue,useFallback,&len,MBCS_OUTPUT_3); +- if(len==3) { +- cs = (int8_t)(CNS_11643_0 + (targetValue >> 16) - 0x80); +- len = 2; ++ for(i = 0; i < choiceCount && len <= 0; ++i) { ++ int8_t cs0 = choices[i]; ++ if(cs0 > 0) { ++ uint32_t value; ++ int32_t len2; ++ if(cs0 > CNS_11643_0) { ++ len2 = MBCS_FROM_UCHAR32_ISO2022( ++ converterData->myConverterArray[CNS_11643], ++ sourceChar, ++ &value, ++ useFallback, ++ MBCS_OUTPUT_3); ++ if(len2 == 3 || (len2 == -3 && len == 0)) { ++ targetValue = value; ++ cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80); ++ if(len2 >= 0) { ++ len = 2; ++ } else { ++ len = -2; ++ useFallback = FALSE; ++ } + if(cs == CNS_11643_1) { + g = 1; + } else if(cs == CNS_11643_2) { +@@ -2617,15 +2724,25 @@ + } + } else { + /* GB2312_1 or ISO-IR-165 */ +- cnv = converterData->myConverterArray[cs]; +- MBCS_FROM_UCHAR32_ISO2022(cnv,sourceChar,&targetValue,useFallback,&len,MBCS_OUTPUT_2); +- g = 1; /* used if len == 2 */ ++ len2 = MBCS_FROM_UCHAR32_ISO2022( ++ converterData->myConverterArray[cs0], ++ sourceChar, ++ &value, ++ useFallback, ++ MBCS_OUTPUT_2); ++ if(len2 == 2 || (len2 == -2 && len == 0)) { ++ targetValue = value; ++ len = len2; ++ cs = cs0; ++ g = 1; ++ useFallback = FALSE; ++ } + } + } + } + +- if(len > 0) { +- len = 0; /* count output bytes; it must have been len == 2 */ ++ if(len != 0) { ++ len = 0; /* count output bytes; it must have been abs(len) == 2 */ + + /* write the designation sequence if necessary */ + if(cs != pFromU2022State->cs[g]) { +@@ -2670,7 +2787,7 @@ + * then this is an error + */ + *err = U_INVALID_CHAR_FOUND; +- args->converter->fromUChar32=sourceChar; ++ cnv->fromUChar32=sourceChar; + break; + } + } +@@ -2691,7 +2808,7 @@ + } + } else { + fromUWriteUInt8( +- 
args->converter, ++ cnv, + buffer, len, + &target, (const char *)targetLimit, + &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), +@@ -2720,7 +2837,7 @@ + */ + if( U_SUCCESS(*err) && + pFromU2022State->g!=0 && +- args->flush && source>=sourceLimit && args->converter->fromUChar32==0 ++ args->flush && source>=sourceLimit && cnv->fromUChar32==0 + ) { + int32_t sourceIndex; + +@@ -2748,7 +2865,7 @@ + } + + fromUWriteUInt8( +- args->converter, ++ cnv, + SHIFT_IN_STR, 1, + &target, (const char *)targetLimit, + &offsets, sourceIndex, +@@ -3146,7 +3263,7 @@ + } + if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) { + /* include half-width Katakana for JP */ +- sa->addRange(sa->set, 0xff61, 0xff9f); ++ sa->addRange(sa->set, HWKANA_START, HWKANA_END); + } + break; + case 'c': +diff -ru icu.orig/source/common/ucnv_ext.c icu/source/common/ucnv_ext.c +--- icu.orig/source/common/ucnv_ext.c 2009-06-02 11:48:38.000000000 +0100 ++++ icu/source/common/ucnv_ext.c 2009-06-02 12:14:20.000000000 +0100 +@@ -551,6 +551,12 @@ + return 0; + } + ++ /* ++ * Tests for (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0: ++ * Do not interpret values with reserved bits used, for forward compatibility, ++ * and do not even remember intermediate results with reserved bits used. 
++ */ ++ + if(UCNV_EXT_TO_U_IS_PARTIAL(value)) { + /* partial match, enter the loop below */ + index=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value); +@@ -575,7 +581,8 @@ + value=*fromUSectionValues++; + if( value!=0 && + (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || +- FROM_U_USE_FALLBACK(useFallback, firstCP)) ++ FROM_U_USE_FALLBACK(useFallback, firstCP)) && ++ (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 + ) { + /* remember longest match so far */ + matchValue=value; +@@ -613,8 +620,9 @@ + /* partial match, continue */ + index=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value); + } else { +- if( UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || +- FROM_U_USE_FALLBACK(useFallback, firstCP) ++ if( (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || ++ FROM_U_USE_FALLBACK(useFallback, firstCP)) && ++ (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 + ) { + /* full match, stop with result */ + matchValue=value; +@@ -632,8 +640,9 @@ + return 0; + } + } else /* result from firstCP trie lookup */ { +- if( UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || +- FROM_U_USE_FALLBACK(useFallback, firstCP) ++ if( (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || ++ FROM_U_USE_FALLBACK(useFallback, firstCP)) && ++ (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 + ) { + /* full match, stop with result */ + matchValue=value; +@@ -644,20 +653,18 @@ + } + } + +- if(matchValue&UCNV_EXT_FROM_U_RESERVED_MASK) { +- /* do not interpret values with reserved bits used, for forward compatibility */ +- return 0; +- } +- + /* return result */ + if(matchValue==UCNV_EXT_FROM_U_SUBCHAR1) { + return 1; /* assert matchLength==2 */ + } + +- *pMatchValue=UCNV_EXT_FROM_U_MASK_ROUNDTRIP(matchValue); ++ *pMatchValue=matchValue; + return matchLength; + } + ++/* ++ * @param value fromUnicode mapping table value; ignores roundtrip and reserved bits ++ */ + static U_INLINE void + ucnv_extWriteFromU(UConverter *cnv, const int32_t *cx, + uint32_t value, +@@ -792,6 +799,10 @@ + } + } + ++/* ++ * Used by ISO 2022 implementation. 
++ * @return number of bytes in *pValue; negative number if fallback; 0 for no mapping ++ */ + U_CFUNC int32_t + ucnv_extSimpleMatchFromU(const int32_t *cx, + UChar32 cp, uint32_t *pValue, +@@ -809,13 +820,15 @@ + if(match>=2) { + /* write result for simple, single-character conversion */ + int32_t length; +- ++ int isRoundtrip; ++ ++ isRoundtrip=UCNV_EXT_FROM_U_IS_ROUNDTRIP(value); + length=UCNV_EXT_FROM_U_GET_LENGTH(value); + value=(uint32_t)UCNV_EXT_FROM_U_GET_DATA(value); + + if(length<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH) { + *pValue=value; +- return length; ++ return isRoundtrip ? length : -length; + #if 0 /* not currently used */ + } else if(length==4) { + /* de-serialize a 4-byte result */ +@@ -825,7 +838,7 @@ + ((uint32_t)result[1]<<16)| + ((uint32_t)result[2]<<8)| + result[3]; +- return 4; ++ return isRoundtrip ? 4 : -4; + #endif + } + } +diff -ru icu.orig/source/common/ucnv_ext.h icu/source/common/ucnv_ext.h +--- icu.orig/source/common/ucnv_ext.h 2009-06-02 11:48:38.000000000 +0100 ++++ icu/source/common/ucnv_ext.h 2009-06-02 12:14:20.000000000 +0100 +@@ -452,7 +452,7 @@ + #define UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) (((value)&UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)!=0) + #define UCNV_EXT_FROM_U_MASK_ROUNDTRIP(value) ((value)&~UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) + +-/* use after masking off the roundtrip flag */ ++/* get length; masks away all other bits */ + #define UCNV_EXT_FROM_U_GET_LENGTH(value) (int32_t)(((value)>>UCNV_EXT_FROM_U_LENGTH_SHIFT)&UCNV_EXT_MAX_BYTES) + + /* get bytes or bytes index */ +diff -ru icu.orig/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c +--- icu.orig/source/common/ucnvmbcs.c 2009-06-02 11:48:38.000000000 +0100 ++++ icu/source/common/ucnvmbcs.c 2009-06-02 12:14:20.000000000 +0100 +@@ -3785,7 +3785,8 @@ + + cx=sharedData->mbcs.extIndexes; + if(cx!=NULL) { +- return ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback); ++ length=ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback); ++ return length>=0 ? 
length : -length; /* return abs(length); */ + } + + /* unassigned */ +diff -ru icu.orig/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt +--- icu.orig/source/test/testdata/conversion.txt 2009-06-02 11:48:26.000000000 +0100 ++++ icu/source/test/testdata/conversion.txt 2009-06-02 12:14:20.000000000 +0100 +@@ -495,6 +495,46 @@ + } + { "UTF-16BE", :bin{ 00 }, "", :intvector{}, :int{1}, :int{0}, "truncated", ".", :bin{ 00 } } + { "UTF-16BE", :bin{ d800dc }, "", :intvector{}, :int{1}, :int{0}, "truncated", ".", :bin{ d800dc } } ++ // Verify that mappings that would result in byte values outside 20..7F (for SBCS) ++ // or 21..7E (for DBCS) are not used. ++ // ibm-9005_X110-2007.ucm (ISO 8859-7, .F=1b2e46): ++ // \x9F |0 (also in ISO 8859-1) ++ // \xB7 |1 ++ // windows-949-2000 (KSC_5601, $(C=1b242843): ++ // \xA0\xA1 |0 ++ // \xC0\x41 |0 ++ // \xC8\xFE |0 ++ { ++ "JIS8", // =ISO_2022,locale=ja,version=4 ++ "\u009f\u0387\uc829\ud4fe\ud79d", ++ :bin{ 1a1b2e461b4e371a1a1b242843487e1b2842 }, ++ :intvector{ 0,1,1,1,1,1,1,2,3,4,4,4,4,4,4,4,4,4 }, ++ :int{1}, :int{1}, "", "?", "" ++ } ++ // Ticket 5483: ISO 2022 converter incorrectly using fallback mapping ++ // Verify that a roundtrip mapping is used even when a fallback mapping is ++ // available in the current state. ++ // U+FF61 is handled in code ++ // jisx-208.ucm ($B=1b2442): ++ // \x21\x34 |0 ++ // \x21\x51 |0 and ++ // ibm-897_P100-1995.ucm (JIS X 0201, (J=1b284a): ++ // \x7D |1 ++ // ibm-9005_X110-2007.ucm (ISO 8859-7, .F=1b2e46): ++ // \xF6 |1 ++ // \xAF |0 ++ // \x7D |1 (not legal for ISO 2022) ++ // windows-949-2000 (KSC_5601, $(C=1b242843): ++ // \xB0\xA1 |0 ++ // \xA3\xFD |0 ++ // \xA1\xAD |0 (in extension table) ++ { ++ "JIS8", // =ISO_2022,locale=ja,version=4 ++ "a\uff61\u03d5\uff5d\uac00\u223c\uff5d\u30fe\uff5d", // Make it switch to ISO-8859-7, KSC 5601 and JIS X 0208. 
++ :bin{ 61a11b2e461b4e761b244221511b2428433021212d237d1b2442213421511b2842 }, ++ :intvector{ 0,1,2,2,2,2,2,2,3,3,3,3,3,4,4,4,4,4,4,5,5,6,6,7,7,7,7,7,8,8,8,8,8 }, ++ :int{1}, :int{1}, "", "?", "" ++ } + + // e4b8 is a partial sequence + { "UTF-8", :bin{ 31e4ba8ce4b8 }, "1\u4e8c", :intvector{ 0, 1 }, :int{1}, :int{0}, "truncated", ".", :bin{ e4b8 } } diff --git a/icu.icu5488.assamese.patch b/icu.icu5488.assamese.patch new file mode 100644 index 0000000..8b5d773 --- /dev/null +++ b/icu.icu5488.assamese.patch @@ -0,0 +1,11 @@ +--- icu.orig/source/layout/IndicClassTables.cpp 2006-10-18 09:05:20.000000000 +0100 ++++ icu/source/layout/IndicClassTables.cpp 2006-11-01 09:26:58.000000000 +0000 +@@ -94,7 +94,7 @@ + _dr, _db, _db, _db, _db, _xx, _xx, _l1, _dl, _xx, _xx, _s1, _s2, _vr, _xx, _xx, // 09C0 - 09CF + _xx, _xx, _xx, _xx, _xx, _xx, _xx, _m2, _xx, _xx, _xx, _xx, _cn, _cn, _xx, _cn, // 09D0 - 09DF + _iv, _iv, _dv, _dv, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, // 09E0 - 09EF +- _ct, _ct, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx // 09F0 - 09FA ++ _rv, _ct, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx /* 09F0 - 09FA */ + }; + + static const IndicClassTable::CharClass punjCharClasses[] = diff --git a/icu.icu5500.devicetablecrash.patch b/icu.icu5500.devicetablecrash.patch new file mode 100644 index 0000000..16ea5b7 --- /dev/null +++ b/icu.icu5500.devicetablecrash.patch @@ -0,0 +1,11 @@ +--- icu.orig/source/layout/DeviceTables.cpp 2006-10-18 09:05:20.000000000 +0100 ++++ icu/source/layout/DeviceTables.cpp 2006-11-08 09:08:09.000000000 +0000 +@@ -22,7 +22,7 @@ + le_uint16 format = SWAPW(deltaFormat) - 1; + le_int16 result = 0; + +- if (ppem >= start && ppem <= SWAPW(endSize)) { ++ if (ppem >= start && ppem <= SWAPW(endSize) && format < sizeof(fieldBits)/sizeof(fieldBits[0])) { + le_uint16 sizeIndex = ppem - start; + le_uint16 bits = fieldBits[format]; + le_uint16 count = 16 / bits; diff --git a/icu.icu5501.sinhala.biggerexpand.patch 
b/icu.icu5501.sinhala.biggerexpand.patch new file mode 100644 index 0000000..6013780 --- /dev/null +++ b/icu.icu5501.sinhala.biggerexpand.patch @@ -0,0 +1,11 @@ +--- icu.orig/source/layout/IndicClassTables.cpp 2006-10-18 09:05:20.000000000 +0100 ++++ icu/source/layout/IndicClassTables.cpp 2006-11-08 11:20:55.000000000 +0000 +@@ -284,7 +284,7 @@ + + static const IndicClassTable mlymClassTable = {0x0D00, 0x0D6F, 3, MLYM_SCRIPT_FLAGS, mlymCharClasses, mlymSplitTable}; + +-static const IndicClassTable sinhClassTable = {0x0D80, 0x0DF4, 3, SINH_SCRIPT_FLAGS, sinhCharClasses, sinhSplitTable}; ++static const IndicClassTable sinhClassTable = {0x0D80, 0x0DF4, 4, SINH_SCRIPT_FLAGS, sinhCharClasses, sinhSplitTable}; + + // + // IndicClassTable addresses diff --git a/icu.icu5506.multiplevowels.patch b/icu.icu5506.multiplevowels.patch new file mode 100644 index 0000000..a58ec64 --- /dev/null +++ b/icu.icu5506.multiplevowels.patch @@ -0,0 +1,61 @@ +diff -ur icu.orig/source/layout/IndicReordering.cpp icu/source/layout/IndicReordering.cpp +--- icu.orig/source/layout/IndicReordering.cpp 2006-11-10 09:42:44.000000000 +0000 ++++ icu/source/layout/IndicReordering.cpp 2006-11-10 09:47:05.000000000 +0000 +@@ -395,7 +395,7 @@ + {-1, 6, 1, -1, -1, -1, -1, -1, -1, 5, 9, 5, 5, 4, 12}, // 2 - consonant with nukta + {-1, 6, 1, -1, -1, -1, -1, -1, 2, 5, 9, 5, 5, 4, 12}, // 3 - consonant + {-1, -1, -1, -1, -1, -1, 3, 2, -1, -1, -1, -1, -1, -1, 7}, // 4 - consonant virama +- {-1, 6, 1, -1, -1, -1, -1, -1, -1, 5, -1, -1, -1, -1, -1}, // 5 - dependent vowels ++ {-1, 6, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, // 5 - dependent vowels + {-1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, // 6 - vowel mark + {-1, -1, -1, -1, -1, -1, 3, 2, -1, -1, -1, -1, -1, -1, -1}, // 7 - consonant virama ZWJ, consonant ZWJ virama + {-1, 6, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 4, -1}, // 8 - independent vowels that can take a virama +@@ -423,6 +423,48 @@ + + state = 
stateTable[state][charClass & CF_CLASS_MASK]; + ++ /*for the components of split matra*/ ++ if ((charCount >= cursor + 3) && ++ (chars[cursor] == 0x0DD9 && chars[cursor + 1] == 0x0DCF && chars[cursor + 2] == 0x0DCA)) { /*for 3 split matra of Sinhala*/ ++ return cursor + 3; ++ } ++ else if ((charCount >= cursor + 3) && ++ (chars[cursor] == 0x0CC6 && chars[cursor + 1] == 0x0CC2 && chars[cursor + 2] == 0x0CD5)) { /*for 3 split matra of Kannada*/ ++ return cursor + 3; ++ } ++ /*for 2 split matra*/ ++ else if (charCount >= cursor + 2) { ++ /*for Bengali*/ ++ if ((chars[cursor] == 0x09C7 && chars[cursor + 1] == 0x09BE) || ++ (chars[cursor] == 0x09C7 && chars[cursor + 1] == 0x09D7) || ++ /*for Oriya*/ ++ (chars[cursor] == 0x0B47 && chars[cursor + 1] == 0x0B3E) || ++ (chars[cursor] == 0x0B47 && chars[cursor + 1] == 0x0B56) || ++ (chars[cursor] == 0x0B47 && chars[cursor + 1] == 0x0B57) || ++ /*for Tamil*/ ++ (chars[cursor] == 0x0BC6 && chars[cursor + 1] == 0x0BBE) || ++ (chars[cursor] == 0x0BC6 && chars[cursor + 1] == 0x0BD7) || ++ (chars[cursor] == 0x0BC7 && chars[cursor + 1] == 0x0BBE) || ++ /*for Malayalam*/ ++ (chars[cursor] == 0x0D46 && chars[cursor + 1] == 0x0D3E) || ++ (chars[cursor] == 0x0D46 && chars[cursor + 1] == 0x0D57) || ++ (chars[cursor] == 0x0D47 && chars[cursor + 1] == 0x0D3E) || ++ /*for Sinhala*/ ++ (chars[cursor] == 0x0DD9 && chars[cursor + 1] == 0x0DCA) || ++ (chars[cursor] == 0x0DD9 && chars[cursor + 1] == 0x0DCF) || ++ (chars[cursor] == 0x0DD9 && chars[cursor + 1] == 0x0DDF) || ++ (chars[cursor] == 0x0DDC && chars[cursor + 1] == 0x0DCA) || ++ /*for Telugu*/ ++ (chars[cursor] == 0x0C46 && chars[cursor + 1] == 0x0C56) || ++ /*for Kannada*/ ++ (chars[cursor] == 0x0CBF && chars[cursor + 1] == 0x0CD5) || ++ (chars[cursor] == 0x0CC6 && chars[cursor + 1] == 0x0CD5) || ++ (chars[cursor] == 0x0CC6 && chars[cursor + 1] == 0x0CD6) || ++ (chars[cursor] == 0x0CC6 && chars[cursor + 1] == 0x0CC2) || ++ (chars[cursor] == 0x0CCA && chars[cursor + 1] == 0x0CD5)) ++ 
return cursor + 2; ++ } ++ + if (state < 0) { + break; + } diff --git a/icu.icu5557.safety.patch b/icu.icu5557.safety.patch new file mode 100644 index 0000000..682caa1 --- /dev/null +++ b/icu.icu5557.safety.patch @@ -0,0 +1,14 @@ +--- icu.orig/source/layout/CoverageTables.cpp 2007-01-09 12:57:41.000000000 +0000 ++++ icu/source/layout/CoverageTables.cpp 2007-01-09 12:59:09.000000000 +0000 +@@ -44,6 +44,11 @@ + le_uint16 count = SWAPW(glyphCount); + le_uint8 bit = OpenTypeUtilities::highBit(count); + le_uint16 power = 1 << bit; ++ ++ if (count == 0) { ++ return -1; ++ } ++ + le_uint16 extra = count - power; + le_uint16 probe = power; + le_uint16 index = 0; diff --git a/icu.icu5594.gujarati.patch b/icu.icu5594.gujarati.patch new file mode 100644 index 0000000..b21418d --- /dev/null +++ b/icu.icu5594.gujarati.patch @@ -0,0 +1,14 @@ +--- icu.orig/source/layout/IndicClassTables.cpp 2007-02-09 14:26:04.000000000 +0000 ++++ icu/source/layout/IndicClassTables.cpp 2007-02-13 15:41:52.000000000 +0000 +@@ -117,7 +117,11 @@ + _rv, _xx, _ct, _ct, _xx, _ct, _ct, _ct, _ct, _ct, _xx, _xx, _nu, _xx, _dr, _dl, // 0AB0 - 0ABF + _dr, _db, _db, _db, _db, _da, _xx, _da, _da, _dr, _xx, _dr, _dr, _vr, _xx, _xx, // 0AC0 - 0ACF + _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, // 0AD0 - 0ADF ++#if 1 ++ _iv, _xx, _db, _db, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx // 0AE0 - 0AEF ++#else + _iv, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx // 0AE0 - 0AEF ++#endif + }; + + #if 1 diff --git a/icu.icu5691.backport.patch b/icu.icu5691.backport.patch new file mode 100644 index 0000000..906ecd3 --- /dev/null +++ b/icu.icu5691.backport.patch @@ -0,0 +1,730 @@ +diff -ru icu.6175/source/common/ucnv2022.c icu/source/common/ucnv2022.c +--- icu.6175/source/common/ucnv2022.c 2009-06-02 15:47:31.000000000 +0100 ++++ icu/source/common/ucnv2022.c 2009-06-02 16:03:15.000000000 +0100 +@@ -754,6 +754,7 @@ + UConverterDataISO2022* 
myData2022 = ((UConverterDataISO2022*)_this->extraInfo); + uint32_t key = myData2022->key; + int32_t offset = 0; ++ int8_t initialToULength = _this->toULength; + char c; + + value = VALID_NON_TERMINAL_2022; +@@ -806,7 +807,6 @@ + return; + } else if (value == INVALID_2022 ) { + *err = U_ILLEGAL_ESCAPE_SEQUENCE; +- return; + } else /* value == VALID_TERMINAL_2022 */ { + switch(var){ + #ifdef U_ENABLE_GENERIC_ISO_2022 +@@ -938,6 +938,35 @@ + } + if(U_SUCCESS(*err)) { + _this->toULength = 0; ++ } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) { ++ if(_this->toULength>1) { ++ /* ++ * Ticket 5691: consistent illegal sequences: ++ * - We include at least the first byte (ESC) in the illegal sequence. ++ * - If any of the non-initial bytes could be the start of a character, ++ * we stop the illegal sequence before the first one of those. ++ * In escape sequences, all following bytes are "printable", that is, ++ * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS), ++ * they are valid single/lead bytes. ++ * For simplicity, we always only report the initial ESC byte as the ++ * illegal sequence and back out all other bytes we looked at. ++ */ ++ /* Back out some bytes. */ ++ int8_t backOutDistance=_this->toULength-1; ++ int8_t bytesFromThisBuffer=_this->toULength-initialToULength; ++ if(backOutDistance<=bytesFromThisBuffer) { ++ /* same as initialToULength<=1 */ ++ *source-=backOutDistance; ++ } else { ++ /* Back out bytes from the previous buffer: Need to replay them. */ ++ _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance); ++ /* same as -(initialToULength-1) */ ++ /* preToULength is negative! 
*/ ++ uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength); ++ *source-=bytesFromThisBuffer; ++ } ++ _this->toULength=1; ++ } + } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) { + _this->toUCallbackReason = UCNV_UNASSIGNED; + } +@@ -1973,6 +2002,7 @@ + mySourceChar = args->converter->toUBytes[0]; + args->converter->toULength = 0; + cs = (StateEnum)pToU2022State->cs[pToU2022State->g]; ++ targetUniChar = missingCharMarker; + goto getTrailByte; + } + +@@ -2102,17 +2132,44 @@ + default: + /* G0 DBCS */ + if(mySource < mySourceLimit) { +- char trailByte; ++ int leadIsOk, trailIsOk; ++ uint8_t trailByte; + getTrailByte: +- trailByte = *mySource++; +- if(cs == JISX208) { +- _2022ToSJIS((uint8_t)mySourceChar, (uint8_t)trailByte, tempBuf); +- } else { +- tempBuf[0] = (char)mySourceChar; +- tempBuf[1] = trailByte; ++ trailByte = (uint8_t)*mySource; ++ /* ++ * Ticket 5691: consistent illegal sequences: ++ * - We include at least the first byte in the illegal sequence. ++ * - If any of the non-initial bytes could be the start of a character, ++ * we stop the illegal sequence before the first one of those. ++ * ++ * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is ++ * an ESC/SO/SI, we report only the first byte as the illegal sequence. ++ * Otherwise we convert or report the pair of bytes. ++ */ ++ leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); ++ trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); ++ if (leadIsOk && trailIsOk) { ++ ++mySource; ++ uint32_t tmpSourceChar = (mySourceChar << 8) | trailByte; ++ if(cs == JISX208) { ++ _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf); ++ mySourceChar = tmpSourceChar; ++ } else { ++ /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. 
*/ ++ mySourceChar = tmpSourceChar; ++ if (cs == KSC5601) { ++ tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */ ++ } ++ tempBuf[0] = (char)(tmpSourceChar >> 8); ++ tempBuf[1] = (char)(tmpSourceChar); ++ } ++ targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE); ++ } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { ++ /* report a pair of illegal bytes if the second byte is not a DBCS starter */ ++ ++mySource; ++ /* add another bit so that the code below writes 2 bytes in case of error */ ++ mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; + } +- mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte); +- targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE); + } else { + args->converter->toUBytes[0] = (uint8_t)mySourceChar; + args->converter->toULength = 1; +@@ -2254,7 +2311,12 @@ + } + /* only DBCS or SBCS characters are expected*/ + /* DB characters with high bit set to 1 are expected */ +- if(length > 2 || length==0 ||(((targetByteUnit & 0x8080) != 0x8080)&& length==2)){ ++ if( length > 2 || length==0 || ++ (length == 1 && targetByteUnit > 0x7f) || ++ (length == 2 && ++ ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) || ++ (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1))) ++ ) { + targetByteUnit=missingCharMarker; + } + if (targetByteUnit != missingCharMarker){ +@@ -2583,17 +2645,34 @@ + myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */ + if(myData->toU2022State.g == 1) { + if(mySource < mySourceLimit) { +- char trailByte; ++ int leadIsOk, trailIsOk; ++ uint8_t trailByte; + getTrailByte: +- trailByte = *mySource++; +- tempBuf[0] = (char)(mySourceChar + 0x80); +- tempBuf[1] = (char)(trailByte + 0x80); +- mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte); +- if((mySourceChar & 0x8080) == 0) { ++ targetUniChar = missingCharMarker; ++ trailByte = (uint8_t)*mySource; ++ /* ++ * 
Ticket 5691: consistent illegal sequences: ++ * - We include at least the first byte in the illegal sequence. ++ * - If any of the non-initial bytes could be the start of a character, ++ * we stop the illegal sequence before the first one of those. ++ * ++ * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is ++ * an ESC/SO/SI, we report only the first byte as the illegal sequence. ++ * Otherwise we convert or report the pair of bytes. ++ */ ++ leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); ++ trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); ++ if (leadIsOk && trailIsOk) { ++ ++mySource; ++ tempBuf[0] = (char)(mySourceChar + 0x80); ++ tempBuf[1] = (char)(trailByte + 0x80); + targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback); +- } else { +- /* illegal bytes > 0x7f */ +- targetUniChar = missingCharMarker; ++ mySourceChar = (mySourceChar << 8) | trailByte; ++ } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { ++ /* report a pair of illegal bytes if the second byte is not a DBCS starter */ ++ ++mySource; ++ /* add another bit so that the code below writes 2 bytes in case of error */ ++ mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; + } + } else { + args->converter->toUBytes[0] = (uint8_t)mySourceChar; +@@ -2601,8 +2680,10 @@ + break; + } + } +- else{ ++ else if(mySourceChar <= 0x7f) { + targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback); ++ } else { ++ targetUniChar = 0xffff; + } + if(targetUniChar < 0xfffe){ + if(args->offsets) { +@@ -3099,6 +3180,7 @@ + /* continue with a partial double-byte character */ + mySourceChar = args->converter->toUBytes[0]; + args->converter->toULength = 0; ++ targetUniChar = missingCharMarker; + goto getTrailByte; + } + +@@ -3178,29 +3260,50 @@ + UConverterSharedData *cnv; + StateEnum tempState; + int32_t tempBufLen; +- char trailByte; ++ int leadIsOk, trailIsOk; ++ uint8_t trailByte; + getTrailByte: +- trailByte = 
*mySource++; +- tempState = (StateEnum)pToU2022State->cs[pToU2022State->g]; +- if(tempState > CNS_11643_0) { +- cnv = myData->myConverterArray[CNS_11643]; +- tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0)); +- tempBuf[1] = (char) (mySourceChar); +- tempBuf[2] = trailByte; +- tempBufLen = 3; +- +- }else{ +- cnv = myData->myConverterArray[tempState]; +- tempBuf[0] = (char) (mySourceChar); +- tempBuf[1] = trailByte; +- tempBufLen = 2; ++ trailByte = (uint8_t)*mySource; ++ /* ++ * Ticket 5691: consistent illegal sequences: ++ * - We include at least the first byte in the illegal sequence. ++ * - If any of the non-initial bytes could be the start of a character, ++ * we stop the illegal sequence before the first one of those. ++ * ++ * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is ++ * an ESC/SO/SI, we report only the first byte as the illegal sequence. ++ * Otherwise we convert or report the pair of bytes. ++ */ ++ leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); ++ trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); ++ if (leadIsOk && trailIsOk) { ++ ++mySource; ++ tempState = (StateEnum)pToU2022State->cs[pToU2022State->g]; ++ if(tempState >= CNS_11643_0) { ++ cnv = myData->myConverterArray[CNS_11643]; ++ tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0)); ++ tempBuf[1] = (char) (mySourceChar); ++ tempBuf[2] = (char) trailByte; ++ tempBufLen = 3; ++ ++ }else{ ++ cnv = myData->myConverterArray[tempState]; ++ tempBuf[0] = (char) (mySourceChar); ++ tempBuf[1] = (char) trailByte; ++ tempBufLen = 2; ++ } ++ targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE); ++ mySourceChar = (mySourceChar << 8) | trailByte; ++ } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { ++ /* report a pair of illegal bytes if the second byte is not a DBCS starter */ ++ ++mySource; ++ /* add another bit so that the code below writes 2 bytes in case of error */ ++ mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; + } +- 
mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte); + if(pToU2022State->g>=2) { + /* return from a single-shift state to the previous one */ + pToU2022State->g=pToU2022State->prevG; + } +- targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE); + } else { + args->converter->toUBytes[0] = (uint8_t)mySourceChar; + args->converter->toULength = 1; +diff -ru icu.6175/source/common/ucnvhz.c icu/source/common/ucnvhz.c +--- icu.6175/source/common/ucnvhz.c 2009-06-02 15:47:31.000000000 +0100 ++++ icu/source/common/ucnvhz.c 2009-06-02 15:57:18.000000000 +0100 +@@ -196,10 +196,30 @@ + /* if the first byte is equal to TILDE and the trail byte + * is not a valid byte then it is an error condition + */ +- mySourceChar = 0x7e00 | mySourceChar; +- targetUniChar = 0xffff; ++ /* ++ * Ticket 5691: consistent illegal sequences: ++ * - We include at least the first byte in the illegal sequence. ++ * - If any of the non-initial bytes could be the start of a character, ++ * we stop the illegal sequence before the first one of those. ++ */ + myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */ +- break; ++ *err = U_ILLEGAL_ESCAPE_SEQUENCE; ++ args->converter->toUBytes[0] = UCNV_TILDE; ++ if( myData->isStateDBCS ? ++ (0x21 <= mySourceChar && mySourceChar <= 0x7e) : ++ mySourceChar <= 0x7f ++ ) { ++ /* The current byte could be the start of a character: Back it out. */ ++ args->converter->toULength = 1; ++ --mySource; ++ } else { ++ /* Include the current byte in the illegal sequence. 
*/ ++ args->converter->toUBytes[1] = mySourceChar; ++ args->converter->toULength = 2; ++ } ++ args->target = myTarget; ++ args->source = mySource; ++ return; + } + } else if(myData->isStateDBCS) { + if(args->converter->toUnicodeStatus == 0x00){ +@@ -215,19 +235,36 @@ + } + else{ + /* trail byte */ ++ int leadIsOk, trailIsOk; + uint32_t leadByte = args->converter->toUnicodeStatus & 0xff; +- if( (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21) && +- (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21) +- ) { ++ targetUniChar = 0xffff; ++ /* ++ * Ticket 5691: consistent illegal sequences: ++ * - We include at least the first byte in the illegal sequence. ++ * - If any of the non-initial bytes could be the start of a character, ++ * we stop the illegal sequence before the first one of those. ++ * ++ * In HZ DBCS, if the second byte is in the 21..7e range, ++ * we report only the first byte as the illegal sequence. ++ * Otherwise we convert or report the pair of bytes. ++ */ ++ leadIsOk = (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21); ++ trailIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); ++ if (leadIsOk && trailIsOk) { + tempBuf[0] = (char) (leadByte+0x80) ; + tempBuf[1] = (char) (mySourceChar+0x80); + targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData, + tempBuf, 2, args->converter->useFallback); ++ mySourceChar= (leadByte << 8) | mySourceChar; ++ } else if (trailIsOk) { ++ /* report a single illegal byte and continue with the following DBCS starter byte */ ++ --mySource; ++ mySourceChar = (int32_t)leadByte; + } else { +- targetUniChar = 0xffff; ++ /* report a pair of illegal bytes if the second byte is not a DBCS starter */ ++ /* add another bit so that the code below writes 2 bytes in case of error */ ++ mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar; + } +- /* add another bit so that the code below writes 2 bytes in case of error */ +- mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar; + args->converter->toUnicodeStatus =0x00; + } 
+ } +diff -ru icu.6175/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c +--- icu.6175/source/common/ucnvmbcs.c 2009-06-02 15:47:31.000000000 +0100 ++++ icu/source/common/ucnvmbcs.c 2009-06-02 15:56:07.000000000 +0100 +@@ -1697,6 +1697,65 @@ + pArgs->offsets=offsets; + } + ++static UBool ++hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) { ++ const int32_t *row=stateTable[state]; ++ int32_t b, entry; ++ /* First test for final entries in this state for some commonly valid byte values. */ ++ entry=row[0xa1]; ++ if( !MBCS_ENTRY_IS_TRANSITION(entry) && ++ MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL ++ ) { ++ return TRUE; ++ } ++ entry=row[0x41]; ++ if( !MBCS_ENTRY_IS_TRANSITION(entry) && ++ MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL ++ ) { ++ return TRUE; ++ } ++ /* Then test for final entries in this state. */ ++ for(b=0; b<=0xff; ++b) { ++ entry=row[b]; ++ if( !MBCS_ENTRY_IS_TRANSITION(entry) && ++ MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL ++ ) { ++ return TRUE; ++ } ++ } ++ /* Then recurse for transition entries. */ ++ for(b=0; b<=0xff; ++b) { ++ entry=row[b]; ++ if( MBCS_ENTRY_IS_TRANSITION(entry) && ++ hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry)) ++ ) { ++ return TRUE; ++ } ++ } ++ return FALSE; ++} ++ ++/* ++ * Is byte b a single/lead byte in this state? ++ * Recurse for transition states, because here we don't want to say that ++ * b is a lead byte if all byte sequences that start with b are illegal. 
++ */ ++static UBool ++isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnly, uint8_t b) { ++ const int32_t *row=stateTable[state]; ++ int32_t entry=row[b]; ++ if(MBCS_ENTRY_IS_TRANSITION(entry)) { /* lead byte */ ++ return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry)); ++ } else { ++ uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); ++ if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) { ++ return FALSE; /* SI/SO are illegal for DBCS-only conversion */ ++ } else { ++ return action!=MBCS_STATE_ILLEGAL; ++ } ++ } ++} ++ + U_CFUNC void + ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, + UErrorCode *pErrorCode) { +@@ -2052,6 +2111,34 @@ + sourceIndex=nextSourceIndex; + } else if(U_FAILURE(*pErrorCode)) { + /* callback(illegal) */ ++ if(byteIndex>1) { ++ /* ++ * Ticket 5691: consistent illegal sequences: ++ * - We include at least the first byte in the illegal sequence. ++ * - If any of the non-initial bytes could be the start of a character, ++ * we stop the illegal sequence before the first one of those. ++ */ ++ UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0); ++ int8_t i; ++ for(i=1; ++ isource); ++ byteIndex=i; /* length of reported illegal byte sequence */ ++ if(backOutDistance<=bytesFromThisBuffer) { ++ source-=backOutDistance; ++ } else { ++ /* Back out bytes from the previous buffer: Need to replay them. */ ++ cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance); ++ /* preToULength is negative! 
*/ ++ uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength); ++ source=(const uint8_t *)pArgs->source; ++ } ++ } ++ } + break; + } else /* unassigned sequences indicated with byteIndex>0 */ { + /* try an extension mapping */ +@@ -2062,7 +2149,7 @@ + &offsets, sourceIndex, + pArgs->flush, + pErrorCode); +- sourceIndex=nextSourceIndex+(int32_t)(source-(const uint8_t *)pArgs->source); ++ sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArgs->source); + + if(U_FAILURE(*pErrorCode)) { + /* not mappable or buffer overflow */ +@@ -2353,15 +2440,37 @@ + + if(c<0) { + if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSourcetoUBytes; + cnv->toULength=(int8_t)(source-lastSource); + do { + *bytes++=*lastSource++; + } while(lastSourcesharedData->mbcs.dbcsOnlyState!=0); ++ uint8_t *bytes=cnv->toUBytes; ++ *bytes++=*lastSource++; /* first byte */ ++ if(lastSource==source) { ++ cnv->toULength=1; ++ } else /* lastSourcetoULength=i; ++ source=lastSource; ++ } + } else { + /* no output because of empty input or only state changes */ + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; +diff -ru icu.6175/source/test/cintltst/nccbtst.c icu/source/test/cintltst/nccbtst.c +--- icu.6175/source/test/cintltst/nccbtst.c 2009-06-02 15:47:18.000000000 +0100 ++++ icu/source/test/cintltst/nccbtst.c 2009-06-02 15:47:38.000000000 +0100 +@@ -2497,13 +2497,13 @@ + + + static const uint8_t text943[] = { +- 0x82, 0xa9, 0x82, 0x20, /*0xc8,*/ 0x61, 0x8a, 0xbf, 0x8e, 0x9a }; +- static const UChar toUnicode943sub[] = { 0x304b, 0xfffd, /*0xff88,*/ 0x0061, 0x6f22, 0x5b57}; +- static const UChar toUnicode943skip[]= { 0x304b, /*0xff88,*/ 0x0061, 0x6f22, 0x5b57}; ++ 0x82, 0xa9, 0x82, 0x20, 0x61, 0x8a, 0xbf, 0x8e, 0x9a }; ++ static const UChar toUnicode943sub[] = { 0x304b, 0x1a, 0x20, 0x0061, 0x6f22, 0x5b57 }; ++ static const UChar toUnicode943skip[]= { 0x304b, 0x20, 0x0061, 0x6f22, 0x5b57 }; + static const UChar toUnicode943stop[]= { 0x304b}; + +- static const int32_t fromIBM943Offssub[] = {0, 2, 4, 
5, 7}; +- static const int32_t fromIBM943Offsskip[] = { 0, 4, 5, 7}; ++ static const int32_t fromIBM943Offssub[] = { 0, 2, 3, 4, 5, 7 }; ++ static const int32_t fromIBM943Offsskip[] = { 0, 3, 4, 5, 7 }; + static const int32_t fromIBM943Offsstop[] = { 0}; + + gInBufferSize = inputsize; +@@ -2537,9 +2537,9 @@ + { + static const uint8_t sampleText[] = { + 0x82, 0xa9, 0x61, 0x62, 0x63 , 0x82, +- 0xff, /*0x82, 0xa9,*/ 0x32, 0x33}; +- static const UChar toUnicode943sub[] = {0x304b, 0x0061, 0x0062, 0x0063, 0xfffd,/*0x304b,*/ 0x0032, 0x0033}; +- static const int32_t fromIBM943Offssub[] = {0, 2, 3, 4, 5, 7, 8}; ++ 0xff, 0x32, 0x33}; ++ static const UChar toUnicode943sub[] = { 0x304b, 0x0061, 0x0062, 0x0063, 0x1a, 0x1a, 0x0032, 0x0033 }; ++ static const int32_t fromIBM943Offssub[] = { 0, 2, 3, 4, 5, 6, 7, 8 }; + /*checking illegal value for ibm-943 with substitute*/ + gInBufferSize = inputsize; + gOutBufferSize = outputsize; +diff -ru icu.6175/source/test/cintltst/nucnvtst.c icu/source/test/cintltst/nucnvtst.c +--- icu.6175/source/test/cintltst/nucnvtst.c 2009-06-02 15:47:18.000000000 +0100 ++++ icu/source/test/cintltst/nucnvtst.c 2009-06-02 15:47:38.000000000 +0100 +@@ -2606,7 +2606,7 @@ + TestNextUCharError(cnv, source, source, U_INDEX_OUTOFBOUNDS_ERROR, "sourceLimit <= source"); + /*Test for the condition where there is an invalid character*/ + { +- static const uint8_t source2[]={0xa1, 0x01}; ++ static const uint8_t source2[]={0xa1, 0x80}; + TestNextUCharError(cnv, (const char*)source2, (const char*)source2+sizeof(source2), U_ZERO_ERROR, "an invalid character"); + } + /*Test for the condition where we have a truncated char*/ +@@ -3899,11 +3899,11 @@ + TestISO_2022_KR() { + /* test input */ + static const uint16_t in[]={ +- 0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F66,0x9F67,0x9F6A,0x000A,0x000D +- ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xAC02,0xAC04 ++ 0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F67,0x9F6A,0x000A,0x000D ++ 
,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xAC04 + ,0xAC07,0xAC08,0xAC09,0x0025,0x0026,0x0027,0x000A,0x000D,0x0028,0x0029 + ,0x002A,0x002B,0x002C,0x002D,0x002E,0x53C3,0x53C8,0x53C9,0x53CA,0x53CB +- ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53DF,0x53E1,0x53E2 ++ ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53E1,0x53E2 + ,0x53E3,0x53E4,0x000A,0x000D}; + const UChar* uSource; + const UChar* uSourceLimit; +diff -ru icu.6175/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt +--- icu.6175/source/test/testdata/conversion.txt 2009-06-02 15:47:18.000000000 +0100 ++++ icu/source/test/testdata/conversion.txt 2009-06-02 15:57:41.000000000 +0100 +@@ -48,12 +48,144 @@ + toUnicode { + Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" } + Cases { ++ // Test ticket 5691: consistent illegal sequences ++ // The following test cases are for illegal character byte sequences. ++ // ++ // Unfortunately, we cannot use the Shift-JIS examples from the ticket ++ // comments because our Shift-JIS table is Windows-compatible and ++ // therefore has no illegal single bytes. Same for GBK. ++ // Instead, we use the stricter GB 18030 also for 2-byte examples. ++ // The byte sequences are generally slightly different from the ticket ++ // comment, simply using assigned characters rather than just ++ // theoretically valid sequences. 
++ { ++ "gb18030", ++ :bin{ 618140813c81ff7a }, ++ "a\u4e02\\x81<\\x81\\xFFz", ++ :intvector{ 0,1,3,3,3,3,4,5,5,5,5,5,5,5,5,7 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "EUC-JP", ++ :bin{ 618fb0a98fb03c8f3cb0a97a }, ++ "a\u4e28\\x8F\\xB0<\\x8F<\u9022z", ++ :intvector{ 0,1,4,4,4,4,5,5,5,5,6,7,7,7,7,8,9,11 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "gb18030", ++ :bin{ 618130fc318130fc8181303c3e813cfc817a }, ++ "a\u05ed\\x810\u9f07\\x810<>\\x81<\u9f07z", ++ :intvector{ 0,1,5,5,5,5,6,7,9,9,9,9,10,11,12,13,13,13,13,14,15,17 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "UTF-8", ++ :bin{ 61f1808182f180813cf18081fff180ff3cf1ff3c3e7a }, ++ "a\U00040042\\xF1\\x80\\x81<\\xF1\\x80\\x81\\xFF\\xF1\\x80\\xFF<\\xF1\\xFF<>z", ++ :intvector{ 0,1,1,5,5,5,5,5,5,5,5,5,5,5,5,8,9,9,9,9,9,9,9,9,9,9,9,9,12,12,12,12,13,13,13,13,13,13,13,13,15,15,15,15,16,17,17,17,17,18,18,18,18,19,20,21 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "ISO-2022-JP", ++ :bin{ 1b24424141af4142affe41431b2842 }, ++ "\u758f\\xAF\u758e\\xAF\\xFE\u790e", ++ :intvector{ 3,5,5,5,5,6,8,8,8,8,8,8,8,8,10 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "ibm-25546", ++ :bin{ 411b242943420e4141af4142affe41430f5a }, ++ "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ", ++ :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "ISO-2022-KR", ++ :bin{ 411b242943420e4141af4142affe41430f5a }, ++ "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ", ++ :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "ISO-2022-CN", ++ :bin{ 411b242941420e4141af4142affe41430f5a }, ++ "AB\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z", ++ :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "HZ", ++ :bin{ 417e7b4141af4142affe41437e7d5a }, ++ "A\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z", ++ :intvector{ 0,3,5,5,5,5,6,8,8,8,8,8,8,8,8,10,14 }, ++ :int{1}, 
:int{0}, "", "&C", :bin{""} ++ } ++ // Test ticket 5691: consistent illegal sequences ++ // The following test cases are for illegal escape/designator/shift sequences. ++ // ++ // ISO-2022-JP and -CN with illegal escape sequences. ++ { ++ "ISO-2022-JP", ++ :bin{ 611b24201b244241411b283f1b28427a }, ++ "a\\x1B$ \u758f\\x1B\u2538z", ++ :intvector{ 0,1,1,1,1,2,3,7,9,9,9,9,10,15 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "ISO-2022-CN", ++ :bin{ 611b2429201b2429410e41410f7a }, ++ "a\\x1B$) \u4eaez", ++ :intvector{ 0,1,1,1,1,2,3,4,10,13 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ // Test ticket 5691: ISO-2022-JP-2 with illegal single-shift SS2 and SS3 sequences. ++ // The first ESC N comes before its designator sequence, the last sequence is ESC+space. ++ { ++ "ISO-2022-JP-2", ++ :bin{ 4e1b4e4e1b2e414e1b4e4e4e1b204e }, ++ "N\\x1BNNN\xceN\\x1B N", ++ :intvector{ 0,1,1,1,1,2,3,7,10,11,12,12,12,12,13,14 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "ISO-2022-CN-EXT", ++ :bin{ 4e1b4e4e1b242a484e1b4e4e4e4e1b204e }, ++ "N\\x1BNNN\u8f0eN\\x1B N", ++ :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "ISO-2022-CN-EXT", ++ :bin{ 4f1b4f4f1b242b494f1b4f4f4f4f1b204f }, ++ "O\\x1BOOO\u492bO\\x1B O", ++ :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ // Test ticket 5691: HZ with illegal tilde sequences. ++ { ++ "HZ", ++ :bin{ 417e20427e21437e80447e7b41417e207e41427e7f41437e7d5a }, ++ "A\\x7E B\\x7E!C\\x7E\\x80D\u4eae\\x7E\\x20\\x7E\u8c05\\x7E\\x7F\u64a9Z", ++ :intvector{ 0,1,1,1,1,2,3,4,4,4,4,5,6,7,7,7,7,7,7,7,7,9, // SBCS ++ 12,14,14,14,14,14,14,14,14,16,16,16,16,17,19,19,19,19,19,19,19,19,21, // DBCS ++ 25 }, // SBCS ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ // Test ticket 5691: Example from Peter Edberg. 
++ { ++ "ISO-2022-JP", ++ :bin{ 1b244230212f7e742630801b284a621b2458631b2842648061 }, ++ "\u4e9c\ufffd\u7199\ufffdb\ufffd$Xcd\ufffda", ++ :intvector{ 3,5,7,9,14,15,16,17,18,22,23,24 }, ++ :int{1}, :int{0}, "", "?", :bin{""} ++ } + // test that HZ limits its byte values to lead bytes 21..7d and trail bytes 21..7e + { + "HZ", +- :bin{ 7e7b21212120217e217f772100007e217e7d207e7e807e0a2b }, +- "\u3000\ufffd\u3013\ufffd\u9ccc\ufffd\ufffd ~\ufffd+", +- :intvector{ 2,4,6,8,10,12,14,18,19,21,24 }, ++ :bin{ 7e7b21212120217e217f772100007e217e7e7d207e7e807e0a2b }, ++ "\u3000\ufffd\u3013\ufffd\u9ccc\ufffd\ufffd\u3013 ~\ufffd+", ++ :intvector{ 2,4,6,8,10,12,14,15,19,20,22,25 }, + :int{1}, :int{1}, "", "?", :bin{""} + } + // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and +@@ -61,8 +193,8 @@ + { + "ISO-2022-JP", + :bin{ 1b284a7d7e801b2442306c20217f7e21202160217f22202225227f5f211b2842 }, +- "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e", +- :intvector{ 3,4,5,9,11,13,15,17,19,21,23,25,27 }, ++ "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e", ++ :intvector{ 3,4,5,9,11,12,14,16,17,19,21,23,25,27 }, + :int{1}, :int{1}, "", "?", :bin{""} + } + // improve coverage of unrolled loops in ucnvmbcs.c/ucnv_MBCSSingleToBMPWithOffsets() +@@ -341,7 +473,7 @@ + { + "ISO-2022-CN-EXT", + :bin{ 411b4e2121 }, "\x41", :intvector{ 0 }, +- :int{1}, :int{1}, "illesc", ".", :bin{ 1b4e } ++ :int{1}, :int{1}, "illesc", ".", :bin{ 1b } + } + // G3 designator: recognized, but not supported for -CN (only for -CN-EXT) + { diff --git a/icu.icu5797.backport.patch b/icu.icu5797.backport.patch new file mode 100644 index 0000000..39e3f77 --- /dev/null +++ b/icu.icu5797.backport.patch @@ -0,0 +1,749 @@ +diff -ru icu.5483/source/common/ucnv2022.c icu/source/common/ucnv2022.c +--- icu.5483/source/common/ucnv2022.c 2009-06-02 12:47:41.000000000 +0100 ++++ icu/source/common/ucnv2022.c 2009-06-02 13:18:23.000000000 +0100 +@@ -473,8 +473,7 
@@ + if(jpCharsetMasks[version]&CSM(ISO8859_7)) { + myConverterData->myConverterArray[ISO8859_7]= ucnv_loadSharedData("ISO8859_7", NULL, errorCode); + } +- myConverterData->myConverterArray[JISX201] = ucnv_loadSharedData("JISX0201", NULL, errorCode); +- myConverterData->myConverterArray[JISX208] = ucnv_loadSharedData("jisx-208", NULL, errorCode); ++ myConverterData->myConverterArray[JISX208] = ucnv_loadSharedData("Shift-JIS", NULL, errorCode); + if(jpCharsetMasks[version]&CSM(JISX212)) { + myConverterData->myConverterArray[JISX212] = ucnv_loadSharedData("jisx-212", NULL, errorCode); + } +@@ -1045,14 +1044,6 @@ + length=3; + } + } +- /* +- * TODO(markus): Use Shift-JIS table for JIS X 0208, to save mapping table space. +- * Pass in parameter for type of output bytes, for validation and shifting: +- * - Direct: Pass bytes through, but forbid control codes 00-1F (except SI/SO/ESC) and space 20? +- * (Need to allow some (TAB/LF/CR) or most of them for ASCII and maybe JIS X 0201.) +- * - A1-FE: Subtract 80 after range check. +- * - SJIS: Shift DBCS result to 21-7E x 21-7E. +- */ + /* is this code point assigned, or do we use fallbacks? */ + if((stage2Entry&(1<<(16+(c&0xf))))!=0) { + /* assigned */ +@@ -1110,6 +1101,23 @@ + } + } + ++/* ++ * Check that the result is a 2-byte value with each byte in the range A1..FE ++ * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte ++ * to move it to the ISO 2022 range 21..7E. ++ * Return 0 if out of range. 
++ */ ++static U_INLINE uint32_t ++_2022FromGR94DBCS(uint32_t value) { ++ if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) && ++ (uint8_t)(value - 0xa1) <= (0xfe - 0xa1) ++ ) { ++ return value - 0x8080; /* shift down to 21..7e byte range */ ++ } else { ++ return 0; /* not valid for ISO 2022 */ ++ } ++} ++ + #ifdef U_ENABLE_GENERIC_ISO_2022 + + /********************************************************************************** +@@ -1238,7 +1246,7 @@ + } + else{ + cnv->toUBytes[0] =(char) sourceChar; +- cnv->toULength = 2; ++ cnv->toULength = 1; + } + + if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){ +@@ -1332,6 +1340,181 @@ + 3 /* length of (I HWKANA_7BIT */ + }; + ++/* Map 00..7F to Unicode according to JIS X 0201. */ ++static U_INLINE uint32_t ++jisx201ToU(uint32_t value) { ++ if(value < 0x5c) { ++ return value; ++ } else if(value == 0x5c) { ++ return 0xa5; ++ } else if(value == 0x7e) { ++ return 0x203e; ++ } else /* value <= 0x7f */ { ++ return value; ++ } ++} ++ ++/* Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable. */ ++static U_INLINE uint32_t ++jisx201FromU(uint32_t value) { ++ if(value<=0x7f) { ++ if(value!=0x5c && value!=0x7e) { ++ return value; ++ } ++ } else if(value==0xa5) { ++ return 0x5c; ++ } else if(value==0x203e) { ++ return 0x7e; ++ } ++ return 0xfffe; ++} ++ ++/* ++ * Take a valid Shift-JIS byte pair, check that it is in the range corresponding ++ * to JIS X 0208, and convert it to a pair of 21..7E bytes. ++ * Return 0 if the byte pair is out of range. 
++ */ ++static U_INLINE uint32_t ++_2022FromSJIS(uint32_t value) { ++ uint8_t trail; ++ ++ if(value > 0xEFFC) { ++ return 0; /* beyond JIS X 0208 */ ++ } ++ ++ trail = (uint8_t)value; ++ ++ value &= 0xff00; /* lead byte */ ++ if(value <= 0x9f00) { ++ value -= 0x7000; ++ } else /* 0xe000 <= value <= 0xef00 */ { ++ value -= 0xb000; ++ } ++ value <<= 1; ++ ++ if(trail <= 0x9e) { ++ value -= 0x100; ++ if(trail <= 0x7e) { ++ value |= trail - 0x1f; ++ } else { ++ value |= trail - 0x20; ++ } ++ } else /* trail <= 0xfc */ { ++ value |= trail - 0x7e; ++ } ++ return value; ++} ++ ++/* ++ * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS. ++ * If either byte is outside 21..7E make sure that the result is not valid ++ * for Shift-JIS so that the converter catches it. ++ * Some invalid byte values already turn into equally invalid Shift-JIS ++ * byte values and need not be tested explicitly. ++ */ ++static U_INLINE void ++_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) { ++ if(c1&1) { ++ ++c1; ++ if(c2 <= 0x5f) { ++ c2 += 0x1f; ++ } else if(c2 <= 0x7e) { ++ c2 += 0x20; ++ } else { ++ c2 = 0; /* invalid */ ++ } ++ } else { ++ if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) { ++ c2 += 0x7e; ++ } else { ++ c2 = 0; /* invalid */ ++ } ++ } ++ c1 >>= 1; ++ if(c1 <= 0x2f) { ++ c1 += 0x70; ++ } else if(c1 <= 0x3f) { ++ c1 += 0xb0; ++ } else { ++ c1 = 0; /* invalid */ ++ } ++ bytes[0] = (char)c1; ++ bytes[1] = (char)c2; ++} ++ ++/* ++ * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS) ++ * Katakana. ++ * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks ++ * because Shift-JIS roundtrips half-width Katakana to single bytes. ++ * These were the only fallbacks in ICU's jisx-208.ucm file. 
++ */ ++static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = { ++ 0x2123, /* U+FF61 */ ++ 0x2156, ++ 0x2157, ++ 0x2122, ++ 0x2126, ++ 0x2572, ++ 0x2521, ++ 0x2523, ++ 0x2525, ++ 0x2527, ++ 0x2529, ++ 0x2563, ++ 0x2565, ++ 0x2567, ++ 0x2543, ++ 0x213C, /* U+FF70 */ ++ 0x2522, ++ 0x2524, ++ 0x2526, ++ 0x2528, ++ 0x252A, ++ 0x252B, ++ 0x252D, ++ 0x252F, ++ 0x2531, ++ 0x2533, ++ 0x2535, ++ 0x2537, ++ 0x2539, ++ 0x253B, ++ 0x253D, ++ 0x253F, /* U+FF80 */ ++ 0x2541, ++ 0x2544, ++ 0x2546, ++ 0x2548, ++ 0x254A, ++ 0x254B, ++ 0x254C, ++ 0x254D, ++ 0x254E, ++ 0x254F, ++ 0x2552, ++ 0x2555, ++ 0x2558, ++ 0x255B, ++ 0x255E, ++ 0x255F, /* U+FF90 */ ++ 0x2560, ++ 0x2561, ++ 0x2562, ++ 0x2564, ++ 0x2566, ++ 0x2568, ++ 0x2569, ++ 0x256A, ++ 0x256B, ++ 0x256C, ++ 0x256D, ++ 0x256F, ++ 0x2573, ++ 0x212B, ++ 0x212C /* U+FF9F */ ++}; ++ + /* + * The iteration over various code pages works this way: + * i) Get the currentState from myConverterData->currentState +@@ -1504,7 +1687,7 @@ + } + break; + case HWKANA_7BIT: +- if((uint32_t)(HWKANA_END-sourceChar)<=(HWKANA_END-HWKANA_START)) { ++ if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { + if(converterData->version==3) { + /* JIS7: use G1 (SO) */ + /* Shift U+FF61..U+FF9F to bytes 21..5F. 
*/ +@@ -1531,13 +1714,34 @@ + break; + case JISX201: + /* G0 SBCS */ +- len2 = MBCS_SINGLE_FROM_UCHAR32( ++ value = jisx201FromU(sourceChar); ++ if(value <= 0x7f) { ++ targetValue = value; ++ len = 1; ++ cs = cs0; ++ g = 0; ++ useFallback = FALSE; ++ } ++ break; ++ case JISX208: ++ /* G0 DBCS from Shift-JIS table */ ++ len2 = MBCS_FROM_UCHAR32_ISO2022( + converterData->myConverterArray[cs0], + sourceChar, &value, +- useFallback); +- if(len2 != 0 && !(len2 < 0 && len != 0) && value <= 0x7f) { +- targetValue = value; +- len = len2; ++ useFallback, MBCS_OUTPUT_2); ++ if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ ++ value = _2022FromSJIS(value); ++ if(value != 0) { ++ targetValue = value; ++ len = len2; ++ cs = cs0; ++ g = 0; ++ useFallback = FALSE; ++ } ++ } else if(len == 0 && useFallback && ++ (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { ++ targetValue = hwkana_fb[sourceChar - HWKANA_START]; ++ len = -2; + cs = cs0; + g = 0; + useFallback = FALSE; +@@ -1569,17 +1773,10 @@ + * Check for valid bytes for the encoding scheme. + * This is necessary because the sub-converter (windows-949) + * has a broader encoding scheme than is valid for 2022. +- * +- * Check that the result is a 2-byte value with each byte in the range A1..FE +- * (strict EUC-KR DBCS) before accepting it and subtracting 0x80 from each byte +- * to move it to the ISO 2022 range 21..7E. 
+ */ +- if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) && +- (uint8_t)(value - 0xa1) <= (0xfe - 0xa1) +- ) { +- value -= 0x8080; /* shift down to 21..7e byte range */ +- } else { +- break; /* not valid for ISO 2022 */ ++ value = _2022FromGR94DBCS(value); ++ if(value == 0) { ++ break; + } + } + targetValue = value; +@@ -1755,7 +1952,7 @@ + static void + UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, + UErrorCode* err){ +- char tempBuf[3]; ++ char tempBuf[2]; + const char *mySource = (char *) args->source; + UChar *myTarget = args->target; + const char *mySourceLimit = args->sourceLimit; +@@ -1893,10 +2090,7 @@ + break; + case JISX201: + if(mySourceChar <= 0x7f) { +- targetUniChar = +- _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP( +- myData->myConverterArray[cs], +- mySourceChar); ++ targetUniChar = jisx201ToU(mySourceChar); + } + break; + case HWKANA_7BIT: +@@ -1910,8 +2104,13 @@ + if(mySource < mySourceLimit) { + char trailByte; + getTrailByte: +- tempBuf[0] = (char) (mySourceChar); +- tempBuf[1] = trailByte = *mySource++; ++ trailByte = *mySource++; ++ if(cs == JISX208) { ++ _2022ToSJIS((uint8_t)mySourceChar, (uint8_t)trailByte, tempBuf); ++ } else { ++ tempBuf[0] = (char)mySourceChar; ++ tempBuf[1] = trailByte; ++ } + mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte); + targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE); + } else { +@@ -3254,6 +3453,9 @@ + /* open a set and initialize it with code points that are algorithmically round-tripped */ + switch(cnvData->locale[0]){ + case 'j': ++ /* include JIS X 0201 which is hardcoded */ ++ sa->add(sa->set, 0xa5); ++ sa->add(sa->set, 0x203e); + if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) { + /* include Latin-1 for some variants of JP */ + sa->addRange(sa->set, 0, 0xff); +@@ -3262,6 +3464,11 @@ + sa->addRange(sa->set, 0, 0x7f); + } + if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) { ++ /* ++ * TODO(markus): If and when 
ucnv_getUnicodeSet() supports fallbacks, ++ * we need to include half-width Katakana for all JP variants because ++ * JIS X 0208 has hardcoded fallbacks for them. ++ */ + /* include half-width Katakana for JP */ + sa->addRange(sa->set, HWKANA_START, HWKANA_END); + } +@@ -3281,15 +3488,7 @@ + break; + } + +- /* +- * Version-specific for CN: +- * CN version 0 does not map CNS planes 3..7 although +- * they are all available in the CNS conversion table; +- * CN version 1 does map them all. +- * The two versions create different Unicode sets. +- */ +- for (i=0; imyConverterArray[i]!=NULL) { ++#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ + if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && + cnvData->version==0 && i==CNS_11643 + ) { +@@ -3299,9 +3498,33 @@ + sa, UCNV_ROUNDTRIP_SET, + 0, 0x81, 0x82, + pErrorCode); ++ } ++#endif ++ ++ for (i=0; imyConverterArray[i]!=NULL) { ++ if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && ++ cnvData->version==0 && i==CNS_11643 ++ ) { ++ /* ++ * Version-specific for CN: ++ * CN version 0 does not map CNS planes 3..7 although ++ * they are all available in the CNS conversion table; ++ * CN version 1 (-EXT) does map them all. ++ * The two versions create different Unicode sets. ++ */ ++ filter=UCNV_SET_FILTER_2022_CN; ++ } else if(cnvData->locale[0]=='j' && i==JISX208) { ++ /* ++ * Only add code points that map to Shift-JIS codes ++ * corresponding to JIS X 0208. 
++ */ ++ filter=UCNV_SET_FILTER_SJIS; + } else { +- ucnv_MBCSGetUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, pErrorCode); ++ filter=UCNV_SET_FILTER_NONE; + } ++ ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode); + } + } + +diff -ru icu.5483/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c +--- icu.5483/source/common/ucnvmbcs.c 2009-06-02 12:47:41.000000000 +0100 ++++ icu/source/common/ucnvmbcs.c 2009-06-02 12:48:08.000000000 +0100 +@@ -340,6 +340,8 @@ + + /* Miscellaneous ------------------------------------------------------------ */ + ++#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ ++ + /* similar to ucnv_MBCSGetNextUChar() but recursive */ + static void + _getUnicodeSetForBytes(const UConverterSharedData *sharedData, +@@ -432,11 +434,14 @@ + pErrorCode); + } + ++#endif ++ + U_CFUNC void +-ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData, +- const USetAdder *sa, +- UConverterUnicodeSet which, +- UErrorCode *pErrorCode) { ++ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData, ++ const USetAdder *sa, ++ UConverterUnicodeSet which, ++ UConverterSetFilter filter, ++ UErrorCode *pErrorCode) { + const UConverterMBCSTable *mbcsTable; + const uint16_t *table; + +@@ -490,50 +495,26 @@ + c+=1024; /* empty stage 2 block */ + } + } +- } else if(mbcsTable->outputType==MBCS_OUTPUT_DBCS_ONLY) { +- /* ignore single-byte results */ ++ } else { + const uint32_t *stage2; +- const uint16_t *stage3, *results; +- +- results=(const uint16_t *)mbcsTable->fromUnicodeBytes; +- +- for(st1=0; st1(maxStage1>>1)) { +- stage2=(const uint32_t *)table+st2; +- for(st2=0; st2<64; ++st2) { +- if((st3=stage2[st2])!=0) { +- /* read the stage 3 block */ +- stage3=results+16*(uint32_t)(uint16_t)st3; ++ const uint8_t *stage3, *bytes; ++ uint32_t st3Multiplier; ++ uint32_t value; + +- /* get the 
roundtrip flags for the stage 3 block */ +- st3>>=16; ++ bytes=mbcsTable->fromUnicodeBytes; + +- /* +- * Add code points for which the roundtrip flag is set. +- * Once we get a set for fallback mappings, we have to check +- * non-roundtrip stage 3 results for whether they are 0. +- * See ucnv_MBCSFromUnicodeWithOffsets() for details. +- * +- * Ignore single-byte results (<0x100). +- */ +- do { +- if((st3&1)!=0 && *stage3>=0x100) { +- sa->add(sa->set, c); +- } +- st3>>=1; +- ++stage3; +- } while((++c&0xf)!=0); +- } else { +- c+=16; /* empty stage 3 block */ +- } +- } +- } else { +- c+=1024; /* empty stage 2 block */ +- } ++ switch(mbcsTable->outputType) { ++ case MBCS_OUTPUT_3: ++ case MBCS_OUTPUT_4_EUC: ++ st3Multiplier=3; ++ break; ++ case MBCS_OUTPUT_4: ++ st3Multiplier=4; ++ break; ++ default: ++ st3Multiplier=2; ++ break; + } +- } else { +- const uint32_t *stage2; + + for(st1=0; st1>=16; + +@@ -550,12 +534,49 @@ + * non-roundtrip stage 3 results for whether they are 0. + * See ucnv_MBCSFromUnicodeWithOffsets() for details. + */ +- do { +- if(st3&1) { +- sa->add(sa->set, c); +- } +- st3>>=1; +- } while((++c&0xf)!=0); ++ switch(filter) { ++ case UCNV_SET_FILTER_NONE: ++ do { ++ if(st3&1) { ++ sa->add(sa->set, c); ++ } ++ st3>>=1; ++ } while((++c&0xf)!=0); ++ break; ++ case UCNV_SET_FILTER_DBCS_ONLY: ++ /* Ignore single-byte results (<0x100). */ ++ do { ++ if((st3&1)!=0 && *((const uint16_t *)stage3)>=0x100) { ++ sa->add(sa->set, c); ++ } ++ st3>>=1; ++ stage3+=2; /* +=st3Multiplier */ ++ } while((++c&0xf)!=0); ++ break; ++ case UCNV_SET_FILTER_2022_CN: ++ /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */ ++ do { ++ if((st3&1)!=0 && ((value=*stage3)==0x81 || value==0x82)) { ++ sa->add(sa->set, c); ++ } ++ st3>>=1; ++ stage3+=3; /* +=st3Multiplier */ ++ } while((++c&0xf)!=0); ++ break; ++ case UCNV_SET_FILTER_SJIS: ++ /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. 
*/ ++ do { ++ if((st3&1)!=0 && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) { ++ sa->add(sa->set, c); ++ } ++ st3>>=1; ++ stage3+=2; /* +=st3Multiplier */ ++ } while((++c&0xf)!=0); ++ break; ++ default: ++ *pErrorCode=U_INTERNAL_PROGRAM_ERROR; ++ return; ++ } + } else { + c+=16; /* empty stage 3 block */ + } +@@ -569,6 +590,19 @@ + ucnv_extGetUnicodeSet(sharedData, sa, which, pErrorCode); + } + ++U_CFUNC void ++ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData, ++ const USetAdder *sa, ++ UConverterUnicodeSet which, ++ UErrorCode *pErrorCode) { ++ ucnv_MBCSGetFilteredUnicodeSetForUnicode( ++ sharedData, sa, which, ++ sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ? ++ UCNV_SET_FILTER_DBCS_ONLY : ++ UCNV_SET_FILTER_NONE, ++ pErrorCode); ++} ++ + static void + ucnv_MBCSGetUnicodeSet(const UConverter *cnv, + const USetAdder *sa, +diff -ru icu.5483/source/common/ucnvmbcs.h icu/source/common/ucnvmbcs.h +--- icu.5483/source/common/ucnvmbcs.h 2009-06-02 12:47:41.000000000 +0100 ++++ icu/source/common/ucnvmbcs.h 2009-06-02 12:48:08.000000000 +0100 +@@ -363,6 +363,7 @@ + ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, + UErrorCode *pErrorCode); + ++#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ + /* + * Internal function returning a UnicodeSet for toUnicode() conversion. + * Currently only used for ISO-2022-CN, and only handles roundtrip mappings. +@@ -377,6 +378,7 @@ + UConverterUnicodeSet which, + uint8_t state, int32_t lowByte, int32_t highByte, + UErrorCode *pErrorCode); ++#endif + + /* + * Internal function returning a UnicodeSet for toUnicode() conversion. 
+@@ -388,9 +390,30 @@ + */ + U_CFUNC void + ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData, +- const USetAdder *sa, +- UConverterUnicodeSet which, +- UErrorCode *pErrorCode); ++ const USetAdder *sa, ++ UConverterUnicodeSet which, ++ UErrorCode *pErrorCode); ++ ++typedef enum UConverterSetFilter { ++ UCNV_SET_FILTER_NONE, ++ UCNV_SET_FILTER_DBCS_ONLY, ++ UCNV_SET_FILTER_2022_CN, ++ UCNV_SET_FILTER_SJIS, ++ UCNV_SET_FILTER_COUNT ++} UConverterSetFilter; ++ ++/* ++ * Same as ucnv_MBCSGetUnicodeSetForUnicode() but ++ * the set can be filtered by encoding scheme. ++ * Used by stateful converters which share regular conversion tables ++ * but only use a subset of their mappings. ++ */ ++U_CFUNC void ++ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData, ++ const USetAdder *sa, ++ UConverterUnicodeSet which, ++ UConverterSetFilter filter, ++ UErrorCode *pErrorCode); + + #endif + +diff -ru icu.5483/source/test/cintltst/nucnvtst.c icu/source/test/cintltst/nucnvtst.c +--- icu.5483/source/test/cintltst/nucnvtst.c 2009-06-02 12:47:25.000000000 +0100 ++++ icu/source/test/cintltst/nucnvtst.c 2009-06-02 12:58:02.000000000 +0100 +@@ -3202,7 +3202,7 @@ + 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004A, 0x000D, 0x000A, + 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, 0x0050, 0x0051, 0x0052, 0x000D, 0x000A, + 0x3005, 0x3006, 0x3007, 0x30FC, 0x2015, 0x2010, 0xFF0F, 0x005C, 0x000D, 0x000A, +- 0x301C, 0x2016, 0x2026, 0x2025, 0x2018, 0x2019, 0x201C, 0x000D, 0x000A, ++ 0x3013, 0x2018, 0x2026, 0x2025, 0x2018, 0x2019, 0x201C, 0x000D, 0x000A, + 0x201D, 0x3014, 0x000D, 0x000A, + 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x000D, 0x000A, + 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x000D, 0x000A, +@@ -3730,7 +3730,7 @@ + 0x52C8, 0x52CC, 0x52CF, 0x52D1, 0x52D4, 0x52D6, 0x52DB, 0x52DC, 0x000D, 0x000A, + 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, 0x0050, 0x0051, 0x0052, 0x000D, 0x000A, + 0x3005, 
0x3006, 0x3007, 0x30FC, 0x2015, 0x2010, 0xFF0F, 0x005C, 0x000D, 0x000A, +- 0x301C, 0x2016, 0x2026, 0x2025, 0x2018, 0x2019, 0x201C, 0x000D, 0x000A, ++ 0x3013, 0x2018, 0x2026, 0x2025, 0x2018, 0x2019, 0x201C, 0x000D, 0x000A, + 0x201D, 0x000D, 0x000A, + 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x000D, 0x000A, + 0x4F94, 0x4F97, 0x52BA, 0x52BB, 0x52BD, 0x52C0, 0x52C4, 0x52C6, 0x000D, 0x000A, +diff -ru icu.5483/source/test/cintltst/udatatst.c icu/source/test/cintltst/udatatst.c +--- icu.5483/source/test/cintltst/udatatst.c 2009-06-02 12:47:25.000000000 +0100 ++++ icu/source/test/cintltst/udatatst.c 2009-06-02 13:09:15.000000000 +0100 +@@ -1260,6 +1260,11 @@ + {"gb18030", "cnv", ucnv_swap}, + /* MBCS conversion table file with extension */ + {"*test4x", "cnv", ucnv_swap}, ++ /* ++ * MBCS conversion table file without extension, ++ * to test swapping and preflighting of UTF-8-friendly mbcsIndex[]. ++ */ ++ {"jisx-212", "cnv", ucnv_swap}, + #endif + + #if !UCONFIG_NO_CONVERSION +diff -ru icu.5483/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt +--- icu.5483/source/test/testdata/conversion.txt 2009-06-02 12:47:25.000000000 +0100 ++++ icu/source/test/testdata/conversion.txt 2009-06-02 12:49:51.000000000 +0100 +@@ -48,6 +48,15 @@ + toUnicode { + Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" } + Cases { ++ // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and ++ // using the Shift-JIS table for JIS X 0208 (ticket #5797) ++ { ++ "ISO-2022-JP", ++ :bin{ 1b284a7d7e801b2442306c20217f7e21202160217f22202225227f5f211b2842 }, ++ "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e", ++ :intvector{ 3,4,5,9,11,13,15,17,19,21,23,25,27 }, ++ :int{1}, :int{1}, "", "?", :bin{""} ++ } + // improve coverage of unrolled loops in ucnvmbcs.c/ucnv_MBCSSingleToBMPWithOffsets() + { + "ISO-8859-3", +@@ -495,6 +504,15 @@ + } + { "UTF-16BE", 
:bin{ 00 }, "", :intvector{}, :int{1}, :int{0}, "truncated", ".", :bin{ 00 } } + { "UTF-16BE", :bin{ d800dc }, "", :intvector{}, :int{1}, :int{0}, "truncated", ".", :bin{ d800dc } } ++ // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and ++ // using the Shift-JIS table for JIS X 0208 (ticket #5797) ++ { ++ "ISO-2022-JP", ++ "\u203e\xa5\u4e00\ufa10\u6f3e\u0391", ++ :bin{ 1b284a7e5c1b2442306c222e5f2126211b2842 }, ++ :intvector{ 0,0,0,0,1,2,2,2,2,2,3,3,4,4,5,5,5,5,5 }, ++ :int{1}, :int{0}, "", "?=\u3013", "" // U+3013 Geta Mark converts to 222e ++ } + // Verify that mappings that would result in byte values outside 20..7F (for SBCS) + // or 21..7E (for DBCS) are not used. + // ibm-9005_X110-2007.ucm (ISO 8859-7, .F=1b2e46): +@@ -1273,13 +1291,13 @@ + // versions of ISO-2022-JP + { + "ISO-2022-JP", +- "[\x00-\x0d\x10-\x1a\x1c-\x7f\u0391-\u03a1\uff61-\uff9f\u4e00\u4e01\uffe5]", +- "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\uffe6-\U0010ffff]", ++ "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u203e\uff61-\uff9f\u4e00\u4e01\uffe5]", ++ "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\ufa0e-\ufa2d\uffe6-\U0010ffff]", + :int{0} + } + { + "ISO-2022-JP-2", +- "[\x00-\x0d\x10-\x1a\x1c-\u0113\u0385-\u038a\u0390-\u03a1\uff61-\uff9f\u4e00-\u4e05\uffe6]", ++ "[\x00-\x0d\x10-\x1a\x1c-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\uff61-\uff9f\u4e00-\u4e05\uffe6]", + "[\x0e\x0f\x1b\uffe7-\U0010ffff]", + :int{0} + } diff --git a/icu.icu6001.backport.patch b/icu.icu6001.backport.patch new file mode 100644 index 0000000..11b2ee3 --- /dev/null +++ b/icu.icu6001.backport.patch @@ -0,0 +1,741 @@ +diff -ru icu.5797/source/common/ucnv2022.c icu/source/common/ucnv2022.c +--- icu.5797/source/common/ucnv2022.c 2009-06-02 14:45:31.000000000 +0100 ++++ icu/source/common/ucnv2022.c 2009-06-02 15:05:10.000000000 +0100 +@@ -3399,11 +3399,19 @@ + /* include ASCII for JP */ + sa->addRange(sa->set, 0, 0x7f); + } +- 
if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) { ++ if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) { + /* +- * TODO(markus): If and when ucnv_getUnicodeSet() supports fallbacks, +- * we need to include half-width Katakana for all JP variants because +- * JIS X 0208 has hardcoded fallbacks for them. ++ * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0 ++ * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8) ++ * use half-width Katakana. ++ * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode) ++ * half-width Katakana via the ESC ( I sequence. ++ * However, we only emit (fromUnicode) half-width Katakana according to the ++ * definition of each variant. ++ * ++ * When including fallbacks, ++ * we need to include half-width Katakana Unicode code points for all JP variants because ++ * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana). + */ + /* include half-width Katakana for JP */ + sa->addRange(sa->set, HWKANA_START, HWKANA_END); +@@ -3457,6 +3465,12 @@ + * corresponding to JIS X 0208. + */ + filter=UCNV_SET_FILTER_SJIS; ++ } else if(i==KSC5601) { ++ /* ++ * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables) ++ * are broader than GR94. 
++ */ ++ filter=UCNV_SET_FILTER_GR94DBCS; + } else { + filter=UCNV_SET_FILTER_NONE; + } +@@ -3472,6 +3486,9 @@ + sa->remove(sa->set, 0x0e); + sa->remove(sa->set, 0x0f); + sa->remove(sa->set, 0x1b); ++ ++ /* ISO 2022 converters do not convert C1 controls either */ ++ sa->removeRange(sa->set, 0x80, 0x9f); + } + + static const UConverterImpl _ISO2022Impl={ +diff -ru icu.5797/source/common/ucnv_ext.c icu/source/common/ucnv_ext.c +--- icu.5797/source/common/ucnv_ext.c 2009-06-02 14:45:31.000000000 +0100 ++++ icu/source/common/ucnv_ext.c 2009-06-02 15:12:21.000000000 +0100 +@@ -946,7 +946,7 @@ + ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData, + const int32_t *cx, + const USetAdder *sa, +- UConverterUnicodeSet which, ++ UBool useFallback, + int32_t minLength, + UChar32 c, + UChar s[UCNV_EXT_MAX_UCHARS], int32_t length, +@@ -966,7 +966,7 @@ + value=*fromUSectionValues++; + + if( value!=0 && +- UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) && ++ (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || useFallback) && + UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength + ) { + if(c>=0) { +@@ -987,12 +987,14 @@ + /* no mapping, do nothing */ + } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) { + ucnv_extGetUnicodeSetString( +- sharedData, cx, sa, which, minLength, ++ sharedData, cx, sa, useFallback, minLength, + U_SENTINEL, s, length+1, + (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value), + pErrorCode); +- } else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))== +- UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) && ++ } else if((useFallback ? 
++ (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 : ++ ((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))== ++ UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) && + UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength + ) { + sa->addString(sa->set, s, length+1); +@@ -1004,6 +1006,7 @@ + ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData, + const USetAdder *sa, + UConverterUnicodeSet which, ++ UConverterSetFilter filter, + UErrorCode *pErrorCode) { + const int32_t *cx; + const uint16_t *stage12, *stage3, *ps2, *ps3; +@@ -1011,6 +1014,7 @@ + + uint32_t value; + int32_t st1, stage1Length, st2, st3, minLength; ++ UBool useFallback; + + UChar s[UCNV_EXT_MAX_UCHARS]; + UChar32 c; +@@ -1027,12 +1031,20 @@ + + stage1Length=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH]; + ++ useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET); ++ + /* enumerate the from-Unicode trie table */ + c=0; /* keep track of the current code point while enumerating */ + +- if(sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY) { ++ if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY || ++ filter==UCNV_SET_FILTER_DBCS_ONLY || ++ filter==UCNV_SET_FILTER_SJIS || ++ filter==UCNV_SET_FILTER_GR94DBCS ++ ) { + /* DBCS-only, ignore single-byte results */ + minLength=2; ++ } else if(filter==UCNV_SET_FILTER_2022_CN) { ++ minLength=3; + } else { + minLength=1; + } +@@ -1064,14 +1076,41 @@ + length=0; + U16_APPEND_UNSAFE(s, length, c); + ucnv_extGetUnicodeSetString( +- sharedData, cx, sa, which, minLength, ++ sharedData, cx, sa, useFallback, minLength, + c, s, length, + (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value), + pErrorCode); +- } else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))== +- UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) && ++ } else if((useFallback ? 
++ (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 : ++ ((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))== ++ UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) && + UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength + ) { ++ switch(filter) { ++ case UCNV_SET_FILTER_2022_CN: ++ if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==3 && UCNV_EXT_FROM_U_GET_DATA(value)<=0x82ffff)) { ++ continue; ++ } ++ break; ++ case UCNV_SET_FILTER_SJIS: ++ if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && (value=UCNV_EXT_FROM_U_GET_DATA(value))>=0x8140 && value<=0xeffc)) { ++ continue; ++ } ++ break; ++ case UCNV_SET_FILTER_GR94DBCS: ++ if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && ++ (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value)) - 0xa1a1)<=(0xfefe - 0xa1a1) && ++ (uint8_t)(value - 0xa1)<=(0xfe - 0xa1))) { ++ continue; ++ } ++ break; ++ default: ++ /* ++ * UCNV_SET_FILTER_NONE, ++ * or UCNV_SET_FILTER_DBCS_ONLY which is handled via minLength ++ */ ++ break; ++ } + sa->add(sa->set, c); + } + } while((++c&0xf)!=0); +diff -ru icu.5797/source/common/ucnv_ext.h icu/source/common/ucnv_ext.h +--- icu.5797/source/common/ucnv_ext.h 2009-06-02 14:45:31.000000000 +0100 ++++ icu/source/common/ucnv_ext.h 2009-06-02 15:05:10.000000000 +0100 +@@ -382,10 +382,20 @@ + UConverterFromUnicodeArgs *pArgs, int32_t srcIndex, + UErrorCode *pErrorCode); + ++/* ++ * Add code points and strings to the set according to the extension mappings. ++ * Limitation on the UConverterSetFilter: ++ * The filters currently assume that they are used with 1:1 mappings. ++ * They only apply to single input code points, and then they pass through ++ * only mappings with single-charset-code results. ++ * For example, the Shift-JIS filter only works for 2-byte results and tests ++ * that those 2 bytes are in the JIS X 0208 range of Shift-JIS. 
++ */ + U_CFUNC void + ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData, + const USetAdder *sa, + UConverterUnicodeSet which, ++ UConverterSetFilter filter, + UErrorCode *pErrorCode); + + /* toUnicode helpers -------------------------------------------------------- */ +diff -ru icu.5797/source/common/ucnvhz.c icu/source/common/ucnvhz.c +--- icu.5797/source/common/ucnvhz.c 2009-06-02 14:45:31.000000000 +0100 ++++ icu/source/common/ucnvhz.c 2009-06-02 15:05:10.000000000 +0100 +@@ -528,6 +528,7 @@ + sa->add(sa->set, 0x7e); + + /* add all of the code points that the sub-converter handles */ ++ /* ucnv_MBCSGetFilteredUnicodeSetForUnicode(((UConverterDataHZ*)cnv->extraInfo)->gbConverter->sharedData, sa, which, UCNV_SET_FILTER_GR94DBCS, pErrorCode); */ + ((UConverterDataHZ*)cnv->extraInfo)-> + gbConverter->sharedData->impl-> + getUnicodeSet(((UConverterDataHZ*)cnv->extraInfo)->gbConverter, +diff -ru icu.5797/source/common/ucnv_lmb.c icu/source/common/ucnv_lmb.c +--- icu.5797/source/common/ucnv_lmb.c 2009-06-02 14:45:31.000000000 +0100 ++++ icu/source/common/ucnv_lmb.c 2009-06-02 15:09:13.000000000 +0100 +@@ -536,7 +536,7 @@ + NULL,\ + NULL,\ + _LMBCSSafeClone,\ +- _LMBCSGetUnicodeSet\ ++ ucnv_getCompleteUnicodeSet\ + };\ + static const UConverterStaticData _LMBCSStaticData##n={\ + sizeof(UConverterStaticData),\ +@@ -662,15 +662,14 @@ + return &newLMBCS->cnv; + } + +-static void +-_LMBCSGetUnicodeSet(const UConverter *cnv, +- const USetAdder *sa, +- UConverterUnicodeSet which, +- UErrorCode *pErrorCode) { +- /* all but U+F6xx, see LMBCS explanation above (search for F6xx) */ +- sa->addRange(sa->set, 0, 0xf5ff); +- sa->addRange(sa->set, 0xf700, 0x10ffff); +-} ++/* ++ * There used to be a _LMBCSGetUnicodeSet() function here (up to svn revision 20117) ++ * which added all code points except for U+F6xx ++ * because those cannot be represented in the Unicode group. 
++ * However, it turns out that windows-950 has roundtrips for all of U+F6xx ++ * which means that LMBCS can convert all Unicode code points after all. ++ * We now simply use ucnv_getCompleteUnicodeSet(). ++ */ + + /* + Here's the basic helper function that we use when converting from +diff -ru icu.5797/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c +--- icu.5797/source/common/ucnvmbcs.c 2009-06-02 14:45:31.000000000 +0100 ++++ icu/source/common/ucnvmbcs.c 2009-06-02 15:12:40.000000000 +0100 +@@ -463,9 +463,23 @@ + + if(mbcsTable->outputType==MBCS_OUTPUT_1) { + const uint16_t *stage2, *stage3, *results; ++ uint16_t minValue; + + results=(const uint16_t *)mbcsTable->fromUnicodeBytes; + ++ /* ++ * Set a threshold variable for selecting which mappings to use. ++ * See ucnv_MBCSSingleFromBMPWithOffsets() and ++ * MBCS_SINGLE_RESULT_FROM_U() for details. ++ */ ++ if(which==UCNV_ROUNDTRIP_SET) { ++ /* use only roundtrips */ ++ minValue=0xf00; ++ } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ { ++ /* use all roundtrip and fallback results */ ++ minValue=0x800; ++ } ++ + for(st1=0; st1maxStage1) { +@@ -475,15 +489,8 @@ + /* read the stage 3 block */ + stage3=results+st3; + +- /* +- * Add code points for which the roundtrip flag is set. +- * Once we get a set for fallback mappings, we have to use +- * a threshold variable with a value of 0x800. +- * See ucnv_MBCSSingleFromBMPWithOffsets() and +- * MBCS_SINGLE_RESULT_FROM_U() for details. +- */ + do { +- if(*stage3++>=0xf00) { ++ if(*stage3++>=minValue) { + sa->add(sa->set, c); + } + } while((++c&0xf)!=0); +@@ -500,9 +507,12 @@ + const uint8_t *stage3, *bytes; + uint32_t st3Multiplier; + uint32_t value; ++ UBool useFallback; + + bytes=mbcsTable->fromUnicodeBytes; + ++ useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET); ++ + switch(mbcsTable->outputType) { + case MBCS_OUTPUT_3: + case MBCS_OUTPUT_4_EUC: +@@ -529,9 +539,8 @@ + st3>>=16; + + /* +- * Add code points for which the roundtrip flag is set. 
+- * Once we get a set for fallback mappings, we have to check +- * non-roundtrip stage 3 results for whether they are 0. ++ * Add code points for which the roundtrip flag is set, ++ * or which map to non-zero bytes if we use fallbacks. + * See ucnv_MBCSFromUnicodeWithOffsets() for details. + */ + switch(filter) { +@@ -539,6 +548,23 @@ + do { + if(st3&1) { + sa->add(sa->set, c); ++ stage3+=st3Multiplier; ++ } else if(useFallback) { ++ uint8_t b=0; ++ switch(st3Multiplier) { ++ case 4: ++ b|=*stage3++; ++ case 3: ++ b|=*stage3++; ++ case 2: ++ b|=stage3[0]|stage3[1]; ++ stage3+=2; ++ default: ++ break; ++ } ++ if(b!=0) { ++ sa->add(sa->set, c); ++ } + } + st3>>=1; + } while((++c&0xf)!=0); +@@ -546,7 +572,7 @@ + case UCNV_SET_FILTER_DBCS_ONLY: + /* Ignore single-byte results (<0x100). */ + do { +- if((st3&1)!=0 && *((const uint16_t *)stage3)>=0x100) { ++ if(((st3&1)!=0 || useFallback) && *((const uint16_t *)stage3)>=0x100) { + sa->add(sa->set, c); + } + st3>>=1; +@@ -556,7 +582,7 @@ + case UCNV_SET_FILTER_2022_CN: + /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */ + do { +- if((st3&1)!=0 && ((value=*stage3)==0x81 || value==0x82)) { ++ if(((st3&1)!=0 || useFallback) && ((value=*stage3)==0x81 || value==0x82)) { + sa->add(sa->set, c); + } + st3>>=1; +@@ -566,7 +592,20 @@ + case UCNV_SET_FILTER_SJIS: + /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */ + do { +- if((st3&1)!=0 && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) { ++ if(((st3&1)!=0 || useFallback) && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) { ++ sa->add(sa->set, c); ++ } ++ st3>>=1; ++ stage3+=2; /* +=st3Multiplier */ ++ } while((++c&0xf)!=0); ++ break; ++ case UCNV_SET_FILTER_GR94DBCS: ++ /* Only add code points that map to ISO 2022 GR 94 DBCS codes (each byte A1..FE). 
*/ ++ do { ++ if( ((st3&1)!=0 || useFallback) && ++ (uint16_t)((value=*((const uint16_t *)stage3)) - 0xa1a1)<=(0xfefe - 0xa1a1) && ++ (uint8_t)(value - 0xa1)<=(0xfe - 0xa1) ++ ) { + sa->add(sa->set, c); + } + st3>>=1; +@@ -587,7 +626,7 @@ + } + } + +- ucnv_extGetUnicodeSet(sharedData, sa, which, pErrorCode); ++ ucnv_extGetUnicodeSet(sharedData, sa, which, filter, pErrorCode); + } + + U_CFUNC void +diff -ru icu.5797/source/common/ucnvmbcs.h icu/source/common/ucnvmbcs.h +--- icu.5797/source/common/ucnvmbcs.h 2009-06-02 14:45:31.000000000 +0100 ++++ icu/source/common/ucnvmbcs.h 2009-06-02 15:05:10.000000000 +0100 +@@ -399,6 +399,7 @@ + UCNV_SET_FILTER_DBCS_ONLY, + UCNV_SET_FILTER_2022_CN, + UCNV_SET_FILTER_SJIS, ++ UCNV_SET_FILTER_GR94DBCS, + UCNV_SET_FILTER_COUNT + } UConverterSetFilter; + +diff -ru icu.5797/source/common/ucnv_set.c icu/source/common/ucnv_set.c +--- icu.5797/source/common/ucnv_set.c 2009-06-02 14:45:31.000000000 +0100 ++++ icu/source/common/ucnv_set.c 2009-06-02 15:05:10.000000000 +0100 +@@ -1,7 +1,7 @@ + /* + ******************************************************************************* + * +-* Copyright (C) 2003-2005, International Business Machines ++* Copyright (C) 2003-2007, International Business Machines + * Corporation and others. All Rights Reserved. + * + ******************************************************************************* +@@ -52,7 +52,8 @@ + uset_add, + uset_addRange, + uset_addString, +- uset_remove ++ uset_remove, ++ uset_removeRange + }; + sa.set=setFillIn; + +diff -ru icu.5797/source/common/unicode/ucnv.h icu/source/common/unicode/ucnv.h +--- icu.5797/source/common/unicode/ucnv.h 2009-06-02 14:45:30.000000000 +0100 ++++ icu/source/common/unicode/ucnv.h 2009-06-02 15:05:10.000000000 +0100 +@@ -870,6 +870,8 @@ + typedef enum UConverterUnicodeSet { + /** Select the set of roundtrippable Unicode code points. 
@stable ICU 2.6 */ + UCNV_ROUNDTRIP_SET, ++ /** Select the set of Unicode code points with roundtrip or fallback mappings. @draft ICU 4.0 */ ++ UCNV_ROUNDTRIP_AND_FALLBACK_SET, + /** Number of UConverterUnicodeSet selectors. @stable ICU 2.6 */ + UCNV_SET_COUNT + } UConverterUnicodeSet; +@@ -878,11 +880,16 @@ + /** + * Returns the set of Unicode code points that can be converted by an ICU converter. + * +- * The current implementation returns only one kind of set (UCNV_ROUNDTRIP_SET): ++ * Returns one of several kinds of set: ++ * ++ * 1. UCNV_ROUNDTRIP_SET ++ * + * The set of all Unicode code points that can be roundtrip-converted +- * (converted without any data loss) with the converter. ++ * (converted without any data loss) with the converter (ucnv_fromUnicode()). + * This set will not include code points that have fallback mappings + * or are only the result of reverse fallback mappings. ++ * This set will also not include PUA code points with fallbacks, although ++ * ucnv_fromUnicode() will always uses those mappings despite ucnv_setFallback(). + * See UTR #22 "Character Mapping Markup Language" + * at http://www.unicode.org/reports/tr22/ + * +@@ -893,6 +900,12 @@ + * by comparing its roundtrip set with the set of ExemplarCharacters from + * ICU's locale data or other sources + * ++ * 2. UCNV_ROUNDTRIP_AND_FALLBACK_SET ++ * ++ * The set of all Unicode code points that can be converted with the converter (ucnv_fromUnicode()) ++ * when fallbacks are turned on (see ucnv_setFallback()). ++ * This set includes all code points with roundtrips and fallbacks (but not reverse fallbacks). ++ * + * In the future, there may be more UConverterUnicodeSet choices to select + * sets with different properties. 
+ * +diff -ru icu.5797/source/common/uset_imp.h icu/source/common/uset_imp.h +--- icu.5797/source/common/uset_imp.h 2009-06-02 14:45:31.000000000 +0100 ++++ icu/source/common/uset_imp.h 2009-06-02 15:05:10.000000000 +0100 +@@ -36,6 +36,9 @@ + typedef void U_CALLCONV + USetRemove(USet *set, UChar32 c); + ++typedef void U_CALLCONV ++USetRemoveRange(USet *set, UChar32 start, UChar32 end); ++ + /** + * Interface for adding items to a USet, to keep low-level code from + * statically depending on the USet implementation. +@@ -47,6 +50,7 @@ + USetAddRange *addRange; + USetAddString *addString; + USetRemove *remove; ++ USetRemoveRange *removeRange; + }; + typedef struct USetAdder USetAdder; + +diff -ru icu.5797/source/test/intltest/convtest.cpp icu/source/test/intltest/convtest.cpp +--- icu.5797/source/test/intltest/convtest.cpp 2009-06-02 14:45:18.000000000 +0100 ++++ icu/source/test/intltest/convtest.cpp 2009-06-02 15:09:31.000000000 +0100 +@@ -59,6 +59,7 @@ + case 0: name="TestToUnicode"; if (exec) TestToUnicode(); break; + case 1: name="TestFromUnicode"; if (exec) TestFromUnicode(); break; + case 2: name="TestGetUnicodeSet"; if (exec) TestGetUnicodeSet(); break; ++ case 3: name="TestGetUnicodeSet2"; if (exec) TestGetUnicodeSet2(); break; + default: name=""; break; //needed to end loop + } + } +@@ -454,6 +455,183 @@ + } + } + ++U_CDECL_BEGIN ++static void U_CALLCONV ++getUnicodeSetCallback(const void *context, ++ UConverterFromUnicodeArgs *fromUArgs, ++ const UChar* codeUnits, ++ int32_t length, ++ UChar32 codePoint, ++ UConverterCallbackReason reason, ++ UErrorCode *pErrorCode) { ++ if(reason<=UCNV_IRREGULAR) { ++ ((UnicodeSet *)context)->remove(codePoint); // the converter cannot convert this code point ++ *pErrorCode=U_ZERO_ERROR; // skip ++ } // else ignore the reset, close and clone calls. ++} ++U_CDECL_END ++ ++// Compare ucnv_getUnicodeSet() with the set of characters that can be converted. 
++void ++ConversionTest::TestGetUnicodeSet2() { ++ // Build a string with all code points. ++ UChar32 cpLimit; ++ int32_t s0Length; ++ if(quick) { ++ cpLimit=s0Length=0x10000; // BMP only ++ } else { ++ cpLimit=0x110000; ++ s0Length=0x10000+0x200000; // BMP + surrogate pairs ++ } ++ UChar *s0=new UChar[s0Length]; ++ if(s0==NULL) { ++ return; ++ } ++ UChar *s=s0; ++ UChar32 c; ++ UChar c2; ++ // low BMP ++ for(c=0; c<=0xd7ff; ++c) { ++ *s++=(UChar)c; ++ } ++ // trail surrogates ++ for(c=0xdc00; c<=0xdfff; ++c) { ++ *s++=(UChar)c; ++ } ++ // lead surrogates ++ // (after trails so that there is not even one surrogate pair in between) ++ for(c=0xd800; c<=0xdbff; ++c) { ++ *s++=(UChar)c; ++ } ++ // high BMP ++ for(c=0xe000; c<=0xffff; ++c) { ++ *s++=(UChar)c; ++ } ++ // supplementary code points = surrogate pairs ++ if(cpLimit==0x110000) { ++ for(c=0xd800; c<=0xdbff; ++c) { ++ for(c2=0xdc00; c2<=0xdfff; ++c2) { ++ *s++=(UChar)c; ++ *s++=c2; ++ } ++ } ++ } ++ ++ static const char *const cnvNames[]={ ++ "UTF-8", ++ "UTF-7", ++ "UTF-16", ++ "US-ASCII", ++ "ISO-8859-1", ++ "windows-1252", ++ "Shift-JIS", ++ "ibm-1390", // EBCDIC_STATEFUL table ++ "ibm-16684", // DBCS-only extension table based on EBCDIC_STATEFUL table ++ // "HZ", TODO(markus): known bug, the set incorrectly contains [\u02CA\u02CB\u02D9\u2010\u2013\u2015...] ++ "ISO-2022-JP", ++ "JIS7", ++ "ISO-2022-CN", ++ "ISO-2022-CN-EXT", ++ "LMBCS" ++ }; ++ char buffer[1024]; ++ int32_t i; ++ for(i=0; i100) { ++ out.replace(100, 0x7fffffff, ellipsis, LENGTHOF(ellipsis)); ++ } ++ errln("error: ucnv_getUnicodeSet(\"%s\") is missing items - which set: %d", ++ cnvNames[i], which); ++ errln(out); ++ } ++ ++ // are there items that must not be in the set but are? 
++ (diffSet=set).removeAll(expected); ++ if(!diffSet.isEmpty()) { ++ diffSet.toPattern(out, TRUE); ++ if(out.length()>100) { ++ out.replace(100, 0x7fffffff, ellipsis, LENGTHOF(ellipsis)); ++ } ++ errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpected items - which set: %d", ++ cnvNames[i], which); ++ errln(out); ++ } ++ } ++ } ++ } ++ ++ delete [] s0; ++} ++ + // open testdata or ICU data converter ------------------------------------- *** + + UConverter * +diff -ru icu.5797/source/test/intltest/convtest.h icu/source/test/intltest/convtest.h +--- icu.5797/source/test/intltest/convtest.h 2009-06-02 14:45:18.000000000 +0100 ++++ icu/source/test/intltest/convtest.h 2009-06-02 15:05:10.000000000 +0100 +@@ -64,6 +64,7 @@ + void TestToUnicode(); + void TestFromUnicode(); + void TestGetUnicodeSet(); ++ void TestGetUnicodeSet2(); + + private: + UBool +diff -ru icu.5797/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt +--- icu.5797/source/test/testdata/conversion.txt 2009-06-02 14:45:18.000000000 +0100 ++++ icu/source/test/testdata/conversion.txt 2009-06-02 15:25:04.000000000 +0100 +@@ -1198,16 +1198,29 @@ + // versions of ISO-2022-JP + { + "ISO-2022-JP", +- "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u203e\uff61-\uff9f\u4e00\u4e01\uffe5]", +- "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\ufa0e-\ufa2d\uffe6-\U0010ffff]", ++ "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u2015\u203e\u4e00\u4e01\uffe5]", ++ "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u2014\u301c\u4e02\u4e27-\u4e29\u4fe0\u663b\u9eb5\ufa0e-\ufa2d\uff61-\uff9f\uffe4\uffe6-\U0010ffff]", + :int{0} + } + { + "ISO-2022-JP-2", +- "[\x00-\x0d\x10-\x1a\x1c-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\uff61-\uff9f\u4e00-\u4e05\uffe6]", +- "[\x0e\x0f\x1b\uffe7-\U0010ffff]", ++ "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa0-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\u4e00-\u4e05\u4fe0\u663b\uffe6]", ++ "[\x0e\x0f\x1b\uff61-\uff9f\uffe4\uffe7-\U0010ffff]", + :int{0} + } ++ 
{ ++ "JIS7", ++ "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa0-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\u4e00-\u4e05\u4fe0\u663b\uff61-\uff9f\uffe6]", ++ "[\x0e\x0f\x1b\uffe4\uffe7-\U0010ffff]", ++ :int{0} ++ } ++ // with fallbacks ++ { ++ "ISO-2022-JP", ++ "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u2014\u2015\u203e\u301c\u4e00\u4e01\u4fe0\u9eb5\uff61-\uff9f\uffe5]", ++ "[\x0e\x0f\x1b\xa6\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\u663b\ufa0e-\ufa2d\uffe4\uffe6-\U0010ffff]", ++ :int{1} ++ } + + // versions of ISO-2022-CN + { +@@ -1223,6 +1236,14 @@ + :int{0} + } + ++ // LMBCS ++ { ++ "LMBCS", ++ "[\x00-\U0010ffff]", ++ "[]", ++ :int{0} ++ } ++ + // DBCS-only + { + "ibm-971", diff --git a/icu.icu6002.backport.patch b/icu.icu6002.backport.patch new file mode 100644 index 0000000..51f0d75 --- /dev/null +++ b/icu.icu6002.backport.patch @@ -0,0 +1,397 @@ +diff -ru icu.6001/source/common/ucnv_ext.c icu/source/common/ucnv_ext.c +--- icu.6001/source/common/ucnv_ext.c 2009-06-02 15:29:01.000000000 +0100 ++++ icu/source/common/ucnv_ext.c 2009-06-02 15:29:18.000000000 +0100 +@@ -1036,15 +1036,13 @@ + /* enumerate the from-Unicode trie table */ + c=0; /* keep track of the current code point while enumerating */ + +- if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY || +- filter==UCNV_SET_FILTER_DBCS_ONLY || +- filter==UCNV_SET_FILTER_SJIS || +- filter==UCNV_SET_FILTER_GR94DBCS ++ if(filter==UCNV_SET_FILTER_2022_CN) { ++ minLength=3; ++ } else if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY || ++ filter!=UCNV_SET_FILTER_NONE + ) { + /* DBCS-only, ignore single-byte results */ + minLength=2; +- } else if(filter==UCNV_SET_FILTER_2022_CN) { +- minLength=3; + } else { + minLength=1; + } +@@ -1104,6 +1102,13 @@ + continue; + } + break; ++ case UCNV_SET_FILTER_HZ: ++ if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && ++ (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfdfe - 0xa1a1) && ++ (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) { ++ continue; ++ } ++ break; + 
default: + /* + * UCNV_SET_FILTER_NONE, +diff -ru icu.6001/source/common/ucnvhz.c icu/source/common/ucnvhz.c +--- icu.6001/source/common/ucnvhz.c 2009-06-02 15:29:01.000000000 +0100 ++++ icu/source/common/ucnvhz.c 2009-06-02 15:29:15.000000000 +0100 +@@ -72,7 +72,7 @@ + cnv->extraInfo = uprv_malloc(sizeof(UConverterDataHZ)); + if(cnv->extraInfo != NULL){ + uprv_memset(cnv->extraInfo, 0, sizeof(UConverterDataHZ)); +- ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = ucnv_open("ibm-1386",errorCode); ++ ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = ucnv_open("GBK",errorCode); + } + else { + *errorCode = U_MEMORY_ALLOCATION_ERROR; +@@ -141,7 +141,7 @@ + UChar *myTarget = args->target; + const char *mySourceLimit = args->sourceLimit; + UChar32 targetUniChar = 0x0000; +- UChar mySourceChar = 0x0000; ++ int32_t mySourceChar = 0x0000; + UConverterDataHZ* myData=(UConverterDataHZ*)(args->converter->extraInfo); + tempBuf[0]=0; + tempBuf[1]=0; +@@ -156,90 +156,71 @@ + + mySourceChar= (unsigned char) *mySource++; + +- switch(mySourceChar){ ++ if(args->converter->mode == UCNV_TILDE) { ++ /* second byte after ~ */ ++ args->converter->mode=0; ++ switch(mySourceChar) { + case 0x0A: +- if(args->converter->mode ==UCNV_TILDE){ +- args->converter->mode=0; +- +- } +- *(myTarget++)=(UChar)mySourceChar; ++ /* no output for ~\n (line-continuation marker) */ + continue; +- + case UCNV_TILDE: +- if(args->converter->mode ==UCNV_TILDE){ +- *(myTarget++)=(UChar)mySourceChar; +- args->converter->mode=0; +- continue; +- ++ if(args->offsets) { ++ args->offsets[myTarget - args->target]=(int32_t)(mySource - args->source - 2); + } +- else if(args->converter->toUnicodeStatus !=0){ +- args->converter->mode=0; +- break; +- } +- else{ +- args->converter->mode = UCNV_TILDE; +- continue; +- } +- +- ++ *(myTarget++)=(UChar)mySourceChar; ++ continue; + case UCNV_OPEN_BRACE: +- if(args->converter->mode == UCNV_TILDE){ +- args->converter->mode=0; +- myData->isStateDBCS = TRUE; +- continue; +- } +- else{ 
+- break; +- } +- +- ++ myData->isStateDBCS = TRUE; ++ continue; + case UCNV_CLOSE_BRACE: +- if(args->converter->mode == UCNV_TILDE){ +- args->converter->mode=0; +- myData->isStateDBCS = FALSE; +- continue; +- } +- else{ +- break; +- } +- ++ myData->isStateDBCS = FALSE; ++ continue; + default: + /* if the first byte is equal to TILDE and the trail byte + * is not a valid byte then it is an error condition + */ +- if(args->converter->mode == UCNV_TILDE){ +- args->converter->mode=0; +- mySourceChar= (UChar)(((UCNV_TILDE+0x80) << 8) | ((mySourceChar & 0x00ff)+0x80)); +- goto SAVE_STATE; +- } +- ++ mySourceChar = 0x7e00 | mySourceChar; ++ targetUniChar = 0xffff; + break; +- +- } +- +- if(myData->isStateDBCS){ ++ } ++ } else if(myData->isStateDBCS) { + if(args->converter->toUnicodeStatus == 0x00){ +- args->converter->toUnicodeStatus = (UChar) mySourceChar; ++ /* lead byte */ ++ if(mySourceChar == UCNV_TILDE) { ++ args->converter->mode = UCNV_TILDE; ++ } else { ++ /* add another bit to distinguish a 0 byte from not having seen a lead byte */ ++ args->converter->toUnicodeStatus = (uint32_t) (mySourceChar | 0x100); ++ } + continue; + } + else{ +- tempBuf[0] = (char) (args->converter->toUnicodeStatus+0x80) ; +- tempBuf[1] = (char) (mySourceChar+0x80); +- mySourceChar= (UChar)(((args->converter->toUnicodeStatus+0x80) << 8) | ((mySourceChar & 0x00ff)+0x80)); ++ /* trail byte */ ++ uint32_t leadByte = args->converter->toUnicodeStatus & 0xff; ++ if( (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21) && ++ (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21) ++ ) { ++ tempBuf[0] = (char) (leadByte+0x80) ; ++ tempBuf[1] = (char) (mySourceChar+0x80); ++ targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData, ++ tempBuf, 2, args->converter->useFallback); ++ } else { ++ targetUniChar = 0xffff; ++ } ++ /* add another bit so that the code below writes 2 bytes in case of error */ ++ mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar; + args->converter->toUnicodeStatus 
=0x00; +- targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData, +- tempBuf, 2, args->converter->useFallback); + } + } + else{ +- if(args->converter->fromUnicodeStatus == 0x00){ +- targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData, +- mySource - 1, 1, args->converter->useFallback); +- } +- else{ +- goto SAVE_STATE; ++ if(mySourceChar == UCNV_TILDE) { ++ args->converter->mode = UCNV_TILDE; ++ continue; ++ } else if(mySourceChar <= 0x7f) { ++ targetUniChar = (UChar)mySourceChar; /* ASCII */ ++ } else { ++ targetUniChar = 0xffff; + } +- + } + if(targetUniChar < 0xfffe){ + if(args->offsets) { +@@ -248,26 +229,17 @@ + + *(myTarget++)=(UChar)targetUniChar; + } +- else if(targetUniChar>=0xfffe){ +-SAVE_STATE: ++ else /* targetUniChar>=0xfffe */ { + if(targetUniChar == 0xfffe){ + *err = U_INVALID_CHAR_FOUND; + } + else{ + *err = U_ILLEGAL_CHAR_FOUND; + } +- if(myData->isStateDBCS){ +- /* this should never occur since isStateDBCS is set to true +- * only after tempBuf[0] and tempBuf[1] +- * are set to the input .. 
just to please BEAM +- */ +- if(tempBuf[0]==0 || tempBuf[1]==0){ +- *err = U_INTERNAL_PROGRAM_ERROR; +- }else{ +- args->converter->toUBytes[0] = (uint8_t)(tempBuf[0]-0x80); +- args->converter->toUBytes[1] = (uint8_t)(tempBuf[1]-0x80); +- args->converter->toULength=2; +- } ++ if(mySourceChar > 0xff){ ++ args->converter->toUBytes[0] = (uint8_t)(mySourceChar >> 8); ++ args->converter->toUBytes[1] = (uint8_t)mySourceChar; ++ args->converter->toULength=2; + } + else{ + args->converter->toUBytes[0] = (uint8_t)mySourceChar; +@@ -328,16 +300,21 @@ + escSeq = TILDE_ESCAPE; + CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex); + continue; +- } +- else{ ++ } else if(mySourceChar <= 0x7f) { ++ length = 1; ++ targetUniChar = mySourceChar; ++ } else { + length= ucnv_MBCSFromUChar32(myConverterData->gbConverter->sharedData, + mySourceChar,&targetUniChar,args->converter->useFallback); +- +- } +- /* only DBCS or SBCS characters are expected*/ +- /* DB haracters with high bit set to 1 are expected */ +- if(length > 2 || length==0 ||(((targetUniChar & 0x8080) != 0x8080)&& length==2)){ +- targetUniChar= missingCharMarker; ++ /* we can only use lead bytes 21..7D and trail bytes 21..7E */ ++ if( length == 2 && ++ (uint16_t)(targetUniChar - 0xa1a1) <= (0xfdfe - 0xa1a1) && ++ (uint8_t)(targetUniChar - 0xa1) <= (0xfe - 0xa1) ++ ) { ++ targetUniChar -= 0x8080; ++ } else { ++ targetUniChar = missingCharMarker; ++ } + } + if (targetUniChar != missingCharMarker){ + myConverterData->isTargetUCharDBCS = isTargetUCharDBCS = (UBool)(targetUniChar>0x00FF); +@@ -360,22 +337,22 @@ + + if(isTargetUCharDBCS){ + if( myTargetIndex > 8) -0x80); ++ myTarget[myTargetIndex++] =(char) (targetUniChar >> 8); + if(offsets){ + *(offsets++) = mySourceIndex-1; + } + if(myTargetIndex < targetLength){ +- myTarget[myTargetIndex++] =(char) ((targetUniChar & 0x00FF) -0x80); ++ myTarget[myTargetIndex++] =(char) targetUniChar; + if(offsets){ + *(offsets++) = mySourceIndex-1; + } + }else{ 
+- args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) ((targetUniChar & 0x00FF) -0x80); ++ args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar; + *err = U_BUFFER_OVERFLOW_ERROR; + } + }else{ +- args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =(char) ((targetUniChar >> 8) -0x80); +- args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) ((targetUniChar & 0x00FF) -0x80); ++ args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =(char) (targetUniChar >> 8); ++ args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar; + *err = U_BUFFER_OVERFLOW_ERROR; + } + +@@ -524,15 +501,14 @@ + const USetAdder *sa, + UConverterUnicodeSet which, + UErrorCode *pErrorCode) { +- /* the tilde '~' is hardcoded in the converter */ +- sa->add(sa->set, 0x7e); ++ /* HZ converts all of ASCII */ ++ sa->addRange(sa->set, 0, 0x7f); + + /* add all of the code points that the sub-converter handles */ +- /* ucnv_MBCSGetFilteredUnicodeSetForUnicode(((UConverterDataHZ*)cnv->extraInfo)->gbConverter->sharedData, sa, which, UCNV_SET_FILTER_GR94DBCS, pErrorCode); */ +- ((UConverterDataHZ*)cnv->extraInfo)-> +- gbConverter->sharedData->impl-> +- getUnicodeSet(((UConverterDataHZ*)cnv->extraInfo)->gbConverter, +- sa, which, pErrorCode); ++ ucnv_MBCSGetFilteredUnicodeSetForUnicode( ++ ((UConverterDataHZ*)cnv->extraInfo)->gbConverter->sharedData, ++ sa, which, UCNV_SET_FILTER_HZ, ++ pErrorCode); + } + + static const UConverterImpl _HZImpl={ +diff -ru icu.6001/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c +--- icu.6001/source/common/ucnvmbcs.c 2009-06-02 15:29:01.000000000 +0100 ++++ icu/source/common/ucnvmbcs.c 2009-06-02 15:35:01.000000000 +0100 +@@ -612,6 +612,19 @@ + stage3+=2; /* +=st3Multiplier */ + } while((++c&0xf)!=0); + break; ++ case UCNV_SET_FILTER_HZ: ++ /* Only add code points that are suitable 
for HZ DBCS (lead byte A1..FD). */ ++ do { ++ if( ((st3&1)!=0 || useFallback) && ++ (uint16_t)((value=*((const uint16_t *)stage3))-0xa1a1)<=(0xfdfe - 0xa1a1) && ++ (uint8_t)(value - 0xa1)<=(0xfe - 0xa1) ++ ) { ++ sa->add(sa->set, c); ++ } ++ st3>>=1; ++ stage3+=2; /* +=st3Multiplier */ ++ } while((++c&0xf)!=0); ++ break; + default: + *pErrorCode=U_INTERNAL_PROGRAM_ERROR; + return; +diff -ru icu.6001/source/common/ucnvmbcs.h icu/source/common/ucnvmbcs.h +--- icu.6001/source/common/ucnvmbcs.h 2009-06-02 15:29:01.000000000 +0100 ++++ icu/source/common/ucnvmbcs.h 2009-06-02 15:29:15.000000000 +0100 +@@ -400,6 +400,7 @@ + UCNV_SET_FILTER_2022_CN, + UCNV_SET_FILTER_SJIS, + UCNV_SET_FILTER_GR94DBCS, ++ UCNV_SET_FILTER_HZ, + UCNV_SET_FILTER_COUNT + } UConverterSetFilter; + +diff -ru icu.6001/source/test/cintltst/ncnvtst.c icu/source/test/cintltst/ncnvtst.c +--- icu.6001/source/test/cintltst/ncnvtst.c 2009-06-02 15:28:46.000000000 +0100 ++++ icu/source/test/cintltst/ncnvtst.c 2009-06-02 15:29:15.000000000 +0100 +@@ -1928,7 +1928,7 @@ + #if !UCONFIG_NO_LEGACY_CONVERSION + { "UTF-8", 0, 0xd7ff, 0xe000, 0x10ffff, 0xd800, 0xdfff }, + { "windows-1251", 0, 0x7f, 0x410, 0x44f, 0x3000, 0xd7ff }, +- { "HZ", 0x410, 0x44f, 0x4e00, 0x4eff, 0xac00, 0xd7ff }, ++ /* HZ test case fixed and moved to intltest's conversion.txt, ticket #6002 */ + { "shift-jis", 0x3041, 0x3093, 0x30a1, 0x30f3, 0x900, 0x1cff } + #else + { "UTF-8", 0, 0xd7ff, 0xe000, 0x10ffff, 0xd800, 0xdfff } +diff -ru icu.6001/source/test/intltest/convtest.cpp icu/source/test/intltest/convtest.cpp +--- icu.6001/source/test/intltest/convtest.cpp 2009-06-02 15:28:46.000000000 +0100 ++++ icu/source/test/intltest/convtest.cpp 2009-06-02 15:29:15.000000000 +0100 +@@ -527,7 +527,7 @@ + "Shift-JIS", + "ibm-1390", // EBCDIC_STATEFUL table + "ibm-16684", // DBCS-only extension table based on EBCDIC_STATEFUL table +- // "HZ", TODO(markus): known bug, the set incorrectly contains [\u02CA\u02CB\u02D9\u2010\u2013\u2015...] 
++ "HZ", + "ISO-2022-JP", + "JIS7", + "ISO-2022-CN", +diff -ru icu.6001/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt +--- icu.6001/source/test/testdata/conversion.txt 2009-06-02 15:28:46.000000000 +0100 ++++ icu/source/test/testdata/conversion.txt 2009-06-02 15:29:15.000000000 +0100 +@@ -48,6 +48,14 @@ + toUnicode { + Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" } + Cases { ++ // test that HZ limits its byte values to lead bytes 21..7d and trail bytes 21..7e ++ { ++ "HZ", ++ :bin{ 7e7b21212120217e217f772100007e217e7d207e7e807e0a2b }, ++ "\u3000\ufffd\u3013\ufffd\u9ccc\ufffd\ufffd ~\ufffd+", ++ :intvector{ 2,4,6,8,10,12,14,18,19,21,24 }, ++ :int{1}, :int{1}, "", "?", :bin{""} ++ } + // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and + // using the Shift-JIS table for JIS X 0208 (ticket #5797) + { +@@ -1244,6 +1252,14 @@ + :int{0} + } + ++ // HZ ++ { ++ "HZ", ++ "[\u0410-\u044f\u4e00\u4e01\u4e03]", ++ "[\u4e02\u4e04-\u4e06\uac00-\ud7ff]", ++ :int{0} ++ } ++ + // DBCS-only + { + "ibm-971", diff --git a/icu.icu6175.emptysegments.patch b/icu.icu6175.emptysegments.patch new file mode 100644 index 0000000..bb40bd5 --- /dev/null +++ b/icu.icu6175.emptysegments.patch @@ -0,0 +1,535 @@ +diff -ru icu.6002/source/common/ucnv2022.c icu/source/common/ucnv2022.c +--- icu.6002/source/common/ucnv2022.c 2009-06-02 15:38:08.000000000 +0100 ++++ icu/source/common/ucnv2022.c 2009-06-02 15:40:20.000000000 +0100 +@@ -201,6 +201,7 @@ + #ifdef U_ENABLE_GENERIC_ISO_2022 + UBool isFirstBuffer; + #endif ++ UBool isEmptySegment; + char name[30]; + char locale[3]; + }UConverterDataISO2022; +@@ -609,6 +610,7 @@ + if(choice<=UCNV_RESET_TO_UNICODE) { + uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State)); + myConverterData->key = 0; ++ myConverterData->isEmptySegment = FALSE; + } + if(choice!=UCNV_RESET_TO_UNICODE) { + 
uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State)); +@@ -814,6 +816,7 @@ + if(chosenConverterName == NULL) { + /* SS2 or SS3 */ + *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; ++ _this->toUCallbackReason = UCNV_UNASSIGNED; + return; + } + +@@ -935,6 +938,8 @@ + } + if(U_SUCCESS(*err)) { + _this->toULength = 0; ++ } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) { ++ _this->toUCallbackReason = UCNV_UNASSIGNED; + } + } + +@@ -1986,6 +1991,7 @@ + continue; + } else { + /* only JIS7 uses SI/SO, not ISO-2022-JP-x */ ++ myData->isEmptySegment = FALSE; /* reset this, we have a different error */ + break; + } + +@@ -1997,21 +2003,39 @@ + continue; + } else { + /* only JIS7 uses SI/SO, not ISO-2022-JP-x */ ++ myData->isEmptySegment = FALSE; /* reset this, we have a different error */ + break; + } + + case ESC_2022: + mySource--; + escape: +- changeState_2022(args->converter,&(mySource), +- mySourceLimit, ISO_2022_JP,err); ++ { ++ const char * mySourceBefore = mySource; ++ int8_t toULengthBefore = args->converter->toULength; ++ ++ changeState_2022(args->converter,&(mySource), ++ mySourceLimit, ISO_2022_JP,err); ++ ++ /* If in ISO-2022-JP only and we successully completed an escape sequence, but previous segment was empty, create an error */ ++ if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) { ++ *err = U_ILLEGAL_ESCAPE_SEQUENCE; ++ args->converter->toUCallbackReason = UCNV_IRREGULAR; ++ args->converter->toULength = toULengthBefore + (mySource - mySourceBefore); ++ } ++ } + + /* invalid or illegal escape sequence */ + if(U_FAILURE(*err)){ + args->target = myTarget; + args->source = mySource; ++ myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */ + return; + } ++ /* If we successfully completed an escape sequence, we begin a new segment, empty so far */ ++ if(myData->key==0) { ++ myData->isEmptySegment = TRUE; ++ } + continue; + + /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */ +@@ -2028,6 
+2052,7 @@ + /* falls through */ + default: + /* convert one or two bytes */ ++ myData->isEmptySegment = FALSE; + cs = (StateEnum)pToU2022State->cs[pToU2022State->g]; + if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 && + !IS_JP_DBCS(cs) +@@ -2524,15 +2549,27 @@ + + if(mySourceChar==UCNV_SI){ + myData->toU2022State.g = 0; ++ if (myData->isEmptySegment) { ++ myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ ++ *err = U_ILLEGAL_ESCAPE_SEQUENCE; ++ args->converter->toUCallbackReason = UCNV_IRREGULAR; ++ args->converter->toUBytes[0] = mySourceChar; ++ args->converter->toULength = 1; ++ args->target = myTarget; ++ args->source = mySource; ++ return; ++ } + /*consume the source */ + continue; + }else if(mySourceChar==UCNV_SO){ + myData->toU2022State.g = 1; ++ myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ + /*consume the source */ + continue; + }else if(mySourceChar==ESC_2022){ + mySource--; + escape: ++ myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */ + changeState_2022(args->converter,&(mySource), + mySourceLimit, ISO_2022_KR, err); + if(U_FAILURE(*err)){ +@@ -2543,6 +2580,7 @@ + continue; + } + ++ myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */ + if(myData->toU2022State.g == 1) { + if(mySource < mySourceLimit) { + char trailByte; +@@ -3075,27 +3113,52 @@ + switch(mySourceChar){ + case UCNV_SI: + pToU2022State->g=0; ++ if (myData->isEmptySegment) { ++ myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ ++ *err = U_ILLEGAL_ESCAPE_SEQUENCE; ++ args->converter->toUCallbackReason = UCNV_IRREGULAR; ++ args->converter->toUBytes[0] = mySourceChar; ++ args->converter->toULength = 1; ++ args->target = myTarget; ++ args->source = mySource; ++ return; ++ } + continue; + + case UCNV_SO: + if(pToU2022State->cs[1] != 0) { 
+ pToU2022State->g=1; ++ myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ + continue; + } else { + /* illegal to have SO before a matching designator */ ++ myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */ + break; + } + + case ESC_2022: + mySource--; + escape: +- changeState_2022(args->converter,&(mySource), +- mySourceLimit, ISO_2022_CN,err); ++ { ++ const char * mySourceBefore = mySource; ++ int8_t toULengthBefore = args->converter->toULength; ++ ++ changeState_2022(args->converter,&(mySource), ++ mySourceLimit, ISO_2022_CN,err); ++ ++ /* After SO there must be at least one character before a designator (designator error handled separately) */ ++ if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) { ++ *err = U_ILLEGAL_ESCAPE_SEQUENCE; ++ args->converter->toUCallbackReason = UCNV_IRREGULAR; ++ args->converter->toULength = toULengthBefore + (mySource - mySourceBefore); ++ } ++ } + + /* invalid or illegal escape sequence */ + if(U_FAILURE(*err)){ + args->target = myTarget; + args->source = mySource; ++ myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */ + return; + } + continue; +@@ -3109,6 +3172,7 @@ + /* falls through */ + default: + /* convert one or two bytes */ ++ myData->isEmptySegment = FALSE; + if(pToU2022State->g != 0) { + if(mySource < mySourceLimit) { + UConverterSharedData *cnv; +diff -ru icu.6002/source/common/ucnv_bld.c icu/source/common/ucnv_bld.c +--- icu.6002/source/common/ucnv_bld.c 2009-06-02 15:38:05.000000000 +0100 ++++ icu/source/common/ucnv_bld.c 2009-06-02 15:38:31.000000000 +0100 +@@ -914,6 +914,7 @@ + myUConverter->subCharLen = mySharedConverterData->staticData->subCharLen; + myUConverter->subChars = (uint8_t *)myUConverter->subUChars; + uprv_memcpy(myUConverter->subChars, mySharedConverterData->staticData->subChar, myUConverter->subCharLen); ++ myUConverter->toUCallbackReason = UCNV_ILLEGAL; /* default reason to invoke 
(*fromCharErrorBehaviour) */ + + if(mySharedConverterData->impl->open != NULL) { + mySharedConverterData->impl->open(myUConverter, realName, locale, options, err); +diff -ru icu.6002/source/common/ucnv_bld.h icu/source/common/ucnv_bld.h +--- icu.6002/source/common/ucnv_bld.h 2009-06-02 15:38:08.000000000 +0100 ++++ icu/source/common/ucnv_bld.h 2009-06-02 15:38:31.000000000 +0100 +@@ -226,6 +226,9 @@ + char preToU[UCNV_EXT_MAX_BYTES]; + int8_t preFromULength, preToULength; /* negative: replay */ + int8_t preToUFirstLength; /* length of first character */ ++ ++ /* new fields for ICU 4.0 */ ++ UConverterCallbackReason toUCallbackReason; /* (*fromCharErrorBehaviour) reason, set when error is detected */ + }; + + U_CDECL_END /* end of UConverter */ +diff -ru icu.6002/source/common/ucnv.c icu/source/common/ucnv.c +--- icu.6002/source/common/ucnv.c 2009-06-02 15:38:05.000000000 +0100 ++++ icu/source/common/ucnv.c 2009-06-02 15:38:31.000000000 +0100 +@@ -1473,11 +1473,14 @@ + cnv->toULength=0; + + /* call the callback function */ ++ if(cnv->toUCallbackReason==UCNV_ILLEGAL && *err==U_INVALID_CHAR_FOUND) { ++ cnv->toUCallbackReason = UCNV_UNASSIGNED; ++ } + cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, + cnv->invalidCharBuffer, errorInputLength, +- (*err==U_INVALID_CHAR_FOUND || *err==U_UNSUPPORTED_ESCAPE_SEQUENCE) ? 
+- UCNV_UNASSIGNED : UCNV_ILLEGAL, ++ cnv->toUCallbackReason, + err); ++ cnv->toUCallbackReason = UCNV_ILLEGAL; /* reset to default value */ + + /* + * loop back to the offset handling +diff -ru icu.6002/source/common/ucnvhz.c icu/source/common/ucnvhz.c +--- icu.6002/source/common/ucnvhz.c 2009-06-02 15:38:08.000000000 +0100 ++++ icu/source/common/ucnvhz.c 2009-06-02 15:38:31.000000000 +0100 +@@ -59,6 +59,7 @@ + UBool isEscapeAppended; + UBool isStateDBCS; + UBool isTargetUCharDBCS; ++ UBool isEmptySegment; + }UConverterDataHZ; + + +@@ -98,6 +99,7 @@ + cnv->mode=0; + if(cnv->extraInfo != NULL){ + ((UConverterDataHZ*)cnv->extraInfo)->isStateDBCS = FALSE; ++ ((UConverterDataHZ*)cnv->extraInfo)->isEmptySegment = FALSE; + } + } + if(choice!=UCNV_RESET_TO_UNICODE) { +@@ -130,6 +132,10 @@ + * from-GB code '~}' ($7E7D) is outside the defined GB range.) + * + * Source: RFC 1842 ++* ++* Note that the formal syntax in RFC 1842 is invalid. I assume that the ++* intended definition of single-byte-segment is as follows (pedberg): ++* single-byte-segment = single-byte-seq 1*single-byte-char + */ + + +@@ -168,12 +174,23 @@ + args->offsets[myTarget - args->target]=(int32_t)(mySource - args->source - 2); + } + *(myTarget++)=(UChar)mySourceChar; ++ myData->isEmptySegment = FALSE; + continue; + case UCNV_OPEN_BRACE: +- myData->isStateDBCS = TRUE; +- continue; + case UCNV_CLOSE_BRACE: +- myData->isStateDBCS = FALSE; ++ myData->isStateDBCS = (mySourceChar == UCNV_OPEN_BRACE); ++ if (myData->isEmptySegment) { ++ myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ ++ *err = U_ILLEGAL_ESCAPE_SEQUENCE; ++ args->converter->toUCallbackReason = UCNV_IRREGULAR; ++ args->converter->toUBytes[0] = UCNV_TILDE; ++ args->converter->toUBytes[1] = mySourceChar; ++ args->converter->toULength = 2; ++ args->target = myTarget; ++ args->source = mySource; ++ return; ++ } ++ myData->isEmptySegment = TRUE; + continue; + default: + /* if the first byte is equal to 
TILDE and the trail byte +@@ -181,6 +198,7 @@ + */ + mySourceChar = 0x7e00 | mySourceChar; + targetUniChar = 0xffff; ++ myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */ + break; + } + } else if(myData->isStateDBCS) { +@@ -191,6 +209,7 @@ + } else { + /* add another bit to distinguish a 0 byte from not having seen a lead byte */ + args->converter->toUnicodeStatus = (uint32_t) (mySourceChar | 0x100); ++ myData->isEmptySegment = FALSE; /* the segment has something, either valid or will produce a different error, so reset this */ + } + continue; + } +@@ -218,8 +237,10 @@ + continue; + } else if(mySourceChar <= 0x7f) { + targetUniChar = (UChar)mySourceChar; /* ASCII */ ++ myData->isEmptySegment = FALSE; /* the segment has something valid */ + } else { + targetUniChar = 0xffff; ++ myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */ + } + } + if(targetUniChar < 0xfffe){ +diff -ru icu.6002/source/test/cintltst/nucnvtst.c icu/source/test/cintltst/nucnvtst.c +--- icu.6002/source/test/cintltst/nucnvtst.c 2009-06-02 15:37:53.000000000 +0100 ++++ icu/source/test/cintltst/nucnvtst.c 2009-06-02 15:40:52.000000000 +0100 +@@ -81,6 +81,7 @@ + static void TestJitterbug2411(void); + #endif + ++static void TestJitterbug6175(void); + static void TestRoundTrippingAllUTF(void); + static void TestConv(const uint16_t in[], + int len, +@@ -294,6 +295,7 @@ + #if !UCONFIG_NO_LEGACY_CONVERSION + addTest(root, &TestJitterbug2346, "tsconv/nucnvtst/TestJitterbug2346"); + addTest(root, &TestJitterbug2411, "tsconv/nucnvtst/TestJitterbug2411"); ++ addTest(root, &TestJitterbug6175, "tsconv/nucnvtst/TestJitterbug6175"); + #endif + + } +@@ -4454,6 +4456,70 @@ + free(offsets); + } + ++/* Tests for empty segments in ISO-2022-JP/KR/CN, HZ, check that UConverterCallbackReason is UCNV_IRREGULAR */ ++typedef struct { ++ const char * converterName; ++ const char * inputText; ++ int inputTextLength; ++} 
EmptySegmentTest; ++ ++/* Callback for TestJitterbug6175, should only get called for empty segment errors */ ++static void UCNV_TO_U_CALLBACK_EMPTYSEGMENT( const void *context, UConverterToUnicodeArgs *toArgs, const char* codeUnits, ++ int32_t length, UConverterCallbackReason reason, UErrorCode * err ) { ++ if (reason > UCNV_IRREGULAR) { ++ return; ++ } ++ if (reason != UCNV_IRREGULAR) { ++ log_err("toUnicode callback invoked for empty segment but reason is not UCNV_IRREGULAR\n"); ++ } ++ /* Standard stuff below from UCNV_TO_U_CALLBACK_SUBSTITUTE */ ++ *err = U_ZERO_ERROR; ++ ucnv_cbToUWriteSub(toArgs,0,err); ++} ++ ++enum { kEmptySegmentToUCharsMax = 64 }; ++static void TestJitterbug6175(void) { ++ static const char iso2022jp_a[] = { 0x61, 0x62, 0x1B,0x24,0x42, 0x1B,0x28,0x42, 0x63, 0x64, 0x0D, 0x0A }; ++ static const char iso2022kr_a[] = { 0x1B,0x24,0x29,0x43, 0x61, 0x0E, 0x0F, 0x62, 0x0D, 0x0A }; ++ static const char iso2022cn_a[] = { 0x61, 0x1B,0x24,0x29,0x41, 0x62, 0x0E, 0x0F, 0x1B,0x24,0x2A,0x48, 0x1B,0x4E, 0x6A,0x65, 0x63, 0x0D, 0x0A }; ++ static const char iso2022cn_b[] = { 0x61, 0x1B,0x24,0x29,0x41, 0x62, 0x0E, 0x1B,0x24,0x29,0x47, 0x68,0x64, 0x0F, 0x63, 0x0D, 0x0A }; ++ static const char hzGB2312_a[] = { 0x61, 0x62, 0x7E,0x7B, 0x7E,0x7D, 0x63, 0x64 }; ++ static const EmptySegmentTest emptySegmentTests[] = { ++ /* converterName inputText inputTextLength */ ++ { "ISO-2022-JP", iso2022jp_a, sizeof(iso2022jp_a) }, ++ { "ISO-2022-KR", iso2022kr_a, sizeof(iso2022kr_a) }, ++ { "ISO-2022-CN", iso2022cn_a, sizeof(iso2022cn_a) }, ++ { "ISO-2022-CN", iso2022cn_b, sizeof(iso2022cn_b) }, ++ { "HZ-GB-2312", hzGB2312_a, sizeof(hzGB2312_a) }, ++ /* terminator: */ ++ { NULL, NULL, 0, } ++ }; ++ const EmptySegmentTest * testPtr; ++ for (testPtr = emptySegmentTests; testPtr->converterName != NULL; ++testPtr) { ++ UErrorCode err = U_ZERO_ERROR; ++ UConverter * cnv = ucnv_open(testPtr->converterName, &err); ++ if (U_FAILURE(err)) { ++ log_data_err("Unable to open %s 
converter: %s\n", testPtr->converterName, u_errorName(err)); ++ return; ++ } ++ ucnv_setToUCallBack(cnv, UCNV_TO_U_CALLBACK_EMPTYSEGMENT, NULL, NULL, NULL, &err); ++ if (U_FAILURE(err)) { ++ log_data_err("Unable to setToUCallBack for %s converter: %s\n", testPtr->converterName, u_errorName(err)); ++ ucnv_close(cnv); ++ return; ++ } ++ { ++ UChar toUChars[kEmptySegmentToUCharsMax]; ++ UChar * toUCharsPtr = toUChars; ++ const UChar * toUCharsLimit = toUCharsPtr + kEmptySegmentToUCharsMax; ++ const char * inCharsPtr = testPtr->inputText; ++ const char * inCharsLimit = inCharsPtr + testPtr->inputTextLength; ++ ucnv_toUnicode(cnv, &toUCharsPtr, toUCharsLimit, &inCharsPtr, inCharsLimit, NULL, TRUE, &err); ++ } ++ ucnv_close(cnv); ++ } ++} ++ + static void + TestEBCDIC_STATEFUL() { + /* test input */ +diff -ru icu.6002/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt +--- icu.6002/source/test/testdata/conversion.txt 2009-06-02 15:37:54.000000000 +0100 ++++ icu/source/test/testdata/conversion.txt 2009-06-02 15:40:52.000000000 +0100 +@@ -199,6 +199,21 @@ + :intvector{ 0, 5, 7, 9, 9, 9, 9, 9, 9, 9, 9, 12 }, + :int{1}, :int{1}, "", "&", :bin{""} + } ++ // empty segment (using substitution and stop) ++ { ++ "ISO-2022-KR", ++ :bin{ 1b242943610e0f620d0a }, ++ "a\uFFFDb\u000D\u000A", ++ :intvector{ 4, 6, 7, 8, 9 }, ++ :int{1}, :int{1}, "", "?", :bin{""} ++ } ++ { ++ "ISO-2022-KR", ++ :bin{ 1b242943610e0f620d0a }, ++ "a", ++ :intvector{ 4 }, ++ :int{1}, :int{1}, "illesc", ".", :bin{"0f"} ++ } + + // ISO-2022-JP + +@@ -249,6 +264,21 @@ + :bin{ 41c15c1b284a5cc242 }, "A\uff81\\\xa5\uff82B", :intvector{ 0, 1, 2, 6, 7, 8 }, + :int{1}, :int{1}, "", ".", :bin{""} + } ++ // empty segment (using substitution and stop) ++ { ++ "ISO-2022-JP", ++ :bin{ 61621b24421b284263640d0a }, ++ "ab\uFFFDcd\u000D\u000A", ++ :intvector{ 0, 1, 5, 8, 9, 10, 11 }, ++ :int{1}, :int{1}, "", "?", :bin{""} ++ } ++ { ++ "ISO-2022-JP", ++ :bin{ 61621b24421b284263640d0a }, ++ "ab", ++ 
:intvector{ 0, 1 }, ++ :int{1}, :int{1}, "illesc", ".", :bin{"1b2842"} ++ } + + // ISO-2022-CN + +@@ -319,6 +349,36 @@ + :bin{ 411b242b491b4f2121 }, "\x41", :intvector{ 0 }, + :int{1}, :int{1}, "unsuppesc", ".", :bin{ 1b242b49 } + } ++ // empty segment 1 (using substitution and stop) ++ { ++ "ISO-2022-CN", ++ :bin{ 611b242941620e0f1b242a481b4e6a65630d0a }, ++ "ab\uFFFD\u994Cc\u000D\u000A", ++ :intvector{ 0, 5, 7, 14, 16, 17, 18 }, ++ :int{1}, :int{1}, "", "?", :bin{""} ++ } ++ { ++ "ISO-2022-CN", ++ :bin{ 611b242941620e0f1b242a481b4e6a65630d0a }, ++ "ab", ++ :intvector{ 0, 5 }, ++ :int{1}, :int{1}, "illesc", ".", :bin{"0f"} ++ } ++ // empty segment 2 (using substitution and stop) ++ { ++ "ISO-2022-CN", ++ :bin{ 611b242941620e1b24294768640f630d0a }, ++ "ab\uFFFD\u5F70c\u000D\u000A", ++ :intvector{ 0, 5, 7, 11, 14, 15, 16 }, ++ :int{1}, :int{1}, "", "?", :bin{""} ++ } ++ { ++ "ISO-2022-CN", ++ :bin{ 611b242941620e1b24294768640f630d0a }, ++ "ab", ++ :intvector{ 0, 5 }, ++ :int{1}, :int{1}, "illesc", ".", :bin{"1b242947"} ++ } + + // ISO-2022 SBCS + // [U_ENABLE_GENERIC_ISO_2022] +@@ -333,6 +393,39 @@ + // :int{1}, :int{1}, "", ".", :bin{""} + //} + ++ // HZ-GB-2312 ++ ++ // empty segment 1 (using substitution and stop) ++ { ++ "HZ-GB-2312", ++ :bin{ 61627e7b7e7d6364 }, ++ "ab\uFFFDcd", ++ :intvector{ 0, 1, 4, 6, 7 }, ++ :int{1}, :int{1}, "", "?", :bin{""} ++ } ++ { ++ "HZ-GB-2312", ++ :bin{ 61627e7b7e7d63640d0a }, ++ "ab", ++ :intvector{ 0, 1 }, ++ :int{1}, :int{1}, "illesc", ".", :bin{"7e7d"} ++ } ++ // empty segment 2 & legal redundant switches (using substitution and stop) ++ { ++ "HZ-GB-2312", ++ :bin{ 61627e7b323b3f557e7b7e7b523b7e7d63647e7d65667e7d7e7d }, ++ "ab\u4E0D\u7A7A\uFFFD\u4E00cdef\uFFFD", ++ :intvector{ 0, 1, 4, 6, 10, 12, 16, 17, 20, 21, 24 }, ++ :int{1}, :int{1}, "", "?", :bin{""} ++ } ++ { ++ "HZ-GB-2312", ++ :bin{ 61627e7b323b3f557e7b7e7b523b7e7d63647e7d65667e7d7e7d }, ++ "ab\u4E0D\u7A7A", ++ :intvector{ 0, 1, 4, 6 }, ++ :int{1}, :int{1}, "illesc", 
".", :bin{"7e7b"} ++ } ++ + // DBCS-only extensions + { + "ibm-970", diff --git a/icu.icuXXXX.malayalam.bysyllable.patch b/icu.icuXXXX.malayalam.bysyllable.patch new file mode 100644 index 0000000..d0cd1b1 --- /dev/null +++ b/icu.icuXXXX.malayalam.bysyllable.patch @@ -0,0 +1,250 @@ +diff -ruN icu.orig/source/layout/IndicReordering.h icu/source/layout/IndicReordering.h +--- icu.orig/source/layout/IndicReordering.h 2007-04-27 10:28:22.000000000 +0100 ++++ icu/source/layout/IndicReordering.h 2007-04-27 10:39:22.000000000 +0100 +@@ -142,6 +142,7 @@ + // do not instantiate + IndicReordering(); + ++public: + static le_int32 findSyllable(const IndicClassTable *classTable, const LEUnicode *chars, le_int32 prev, le_int32 charCount); + + }; +diff -ruN icu.orig/source/layout/LayoutEngine.cpp icu/source/layout/LayoutEngine.cpp +--- icu.orig/source/layout/LayoutEngine.cpp 2007-04-27 10:28:22.000000000 +0100 ++++ icu/source/layout/LayoutEngine.cpp 2007-04-27 10:39:22.000000000 +0100 +@@ -14,6 +14,7 @@ + #include "CanonShaping.h" + #include "HanLayoutEngine.h" + #include "HangulLayoutEngine.h" ++#include "MalayalamLayoutEngine.h" + #include "IndicLayoutEngine.h" + #include "KhmerLayoutEngine.h" + #include "ThaiLayoutEngine.h" +@@ -451,11 +452,13 @@ + + if (gsubTable != NULL && gsubTable->coversScript(scriptTag = OpenTypeLayoutEngine::getScriptTag(scriptCode))) { + switch (scriptCode) { ++ case mlymScriptCode: ++ result = new MalayalamOpenTypeLayoutEngine(fontInstance, scriptCode, languageCode, typoFlags, gsubTable); ++ break; + case bengScriptCode: + case devaScriptCode: + case gujrScriptCode: + case kndaScriptCode: +- case mlymScriptCode: + case oryaScriptCode: + case guruScriptCode: + case tamlScriptCode: +@@ -512,11 +515,13 @@ + result = new GXLayoutEngine(fontInstance, scriptCode, languageCode, morphTable); + } else { + switch (scriptCode) { ++ case mlymScriptCode: ++ result = new MalayalamOpenTypeLayoutEngine(fontInstance, scriptCode, languageCode, typoFlags); ++ break; + 
case bengScriptCode: + case devaScriptCode: + case gujrScriptCode: + case kndaScriptCode: +- case mlymScriptCode: + case oryaScriptCode: + case guruScriptCode: + case tamlScriptCode: +diff -ruN icu.orig/source/layout/LEGlyphStorage.h icu/source/layout/LEGlyphStorage.h +--- icu.orig/source/layout/LEGlyphStorage.h 2007-04-27 10:28:22.000000000 +0100 ++++ icu/source/layout/LEGlyphStorage.h 2007-04-27 10:43:54.000000000 +0100 +@@ -413,6 +413,8 @@ + */ + void adoptGlyphArray(LEGlyphStorage &from); + ++ void appendGlyphStorage(LEGlyphStorage &from); ++ + /** + * Delete the char indices array and replace it with the one + * in from. Set the char indices array pointer +diff -ruN icu.orig/source/layout/Makefile.in icu/source/layout/Makefile.in +--- icu.orig/source/layout/Makefile.in 2007-04-27 10:28:22.000000000 +0100 ++++ icu/source/layout/Makefile.in 2007-04-27 10:39:22.000000000 +0100 +@@ -66,6 +66,7 @@ + ArabicLayoutEngine.o \ + GXLayoutEngine.o \ + HanLayoutEngine.o \ ++MalayalamLayoutEngine.o \ + IndicLayoutEngine.o \ + LayoutEngine.o \ + ContextualGlyphSubstProc.o \ +diff -ruN icu.orig/source/layout/MalayalamLayoutEngine.cpp icu/source/layout/MalayalamLayoutEngine.cpp +--- icu.orig/source/layout/MalayalamLayoutEngine.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ icu/source/layout/MalayalamLayoutEngine.cpp 2007-04-27 10:44:26.000000000 +0100 +@@ -0,0 +1,126 @@ ++ ++/* ++ * ++ * (C) Copyright IBM Corp. 
1998-2005 - All Rights Reserved ++ * ++ */ ++ ++#include "LETypes.h" ++#include "LayoutEngine.h" ++#include "OpenTypeLayoutEngine.h" ++#include "MalayalamLayoutEngine.h" ++#include "ScriptAndLanguageTags.h" ++ ++#include "GlyphSubstitutionTables.h" ++#include "GlyphDefinitionTables.h" ++#include "GlyphPositioningTables.h" ++ ++#include "GDEFMarkFilter.h" ++#include "LEGlyphStorage.h" ++ ++#include "IndicReordering.h" ++ ++#include ++ ++U_NAMESPACE_BEGIN ++ ++UOBJECT_DEFINE_RTTI_IMPLEMENTATION(MalayalamOpenTypeLayoutEngine) ++ ++void LEGlyphStorage::appendGlyphStorage(LEGlyphStorage &from) ++{ ++ if (fInsertionList) applyInsertions(); ++ if (from.fInsertionList) from.applyInsertions(); ++ if ((!fInsertionList) && (from.fInsertionList)) ++ { ++ fInsertionList = from.fInsertionList; ++ from.fInsertionList = NULL; ++ } ++ ++ if (!from.fGlyphCount) ++ return; ++ ++ le_int32 newGlyphCount = fGlyphCount + from.fGlyphCount; ++ ++ fGlyphs = (LEGlyphID*)LE_GROW_ARRAY(fGlyphs, newGlyphCount); ++ LE_ARRAY_COPY(fGlyphs+fGlyphCount, from.fGlyphs, from.fGlyphCount); ++ ++ le_int32 nLargestIndex = 0; ++ if (fGlyphCount) ++ { ++ for (le_int32 i = 0; i < fGlyphCount; ++i) ++ { ++ if (fCharIndices[i] > nLargestIndex) ++ nLargestIndex = fCharIndices[i]; ++ } ++ nLargestIndex+=1; ++ } ++ fCharIndices = (le_int32 *)LE_GROW_ARRAY(fCharIndices, newGlyphCount); ++ for (le_int32 i = 0; i < from.fGlyphCount; ++i) ++ fCharIndices[fGlyphCount+i] = from.fCharIndices[i] + nLargestIndex; ++ ++ fAuxData = (le_uint32 *)LE_GROW_ARRAY(fAuxData, newGlyphCount); ++ LE_ARRAY_COPY(fAuxData+fGlyphCount, from.fAuxData, from.fGlyphCount); ++ ++ fGlyphCount = newGlyphCount; ++} ++ ++le_int32 MalayalamOpenTypeLayoutEngine::glyphPostProcessing(LEGlyphStorage &tempGlyphStorage, LEGlyphStorage &glyphStorage, LEErrorCode &success) ++{ ++ if (LE_FAILURE(success)) { ++ return 0; ++ } ++ ++ glyphStorage.appendGlyphStorage(tempGlyphStorage); ++ ++ return glyphStorage.getGlyphCount(); ++} ++ ++ ++le_int32 
MalayalamOpenTypeLayoutEngine::computeGlyphs(const LEUnicode chars[], le_int32 offset, le_int32 count, le_int32 max, le_bool rightToLeft, LEGlyphStorage &glyphStorage, LEErrorCode &success) ++{ ++ if (LE_FAILURE(success)) { ++ return 0; ++ } ++ ++ if (chars == NULL || offset < 0 || count < 0 || max < 0 || offset >= max || offset + count > max) { ++ success = LE_ILLEGAL_ARGUMENT_ERROR; ++ return 0; ++ } ++ ++ le_int32 outGlyphCount=0; ++ ++ const IndicClassTable *classTable = IndicClassTable::getScriptClassTable(fScriptCode); ++ le_int32 prev = 0; ++ while (prev < count) ++ { ++ le_int32 outCharCount=0, fakeGlyphCount=0; ++ LEUnicode *outChars = NULL; ++ LEGlyphStorage fakeGlyphStorage; ++ ++ le_int32 syllable = IndicReordering::findSyllable(classTable, chars+offset, prev, count); ++ outCharCount = characterProcessing(chars+prev, offset, syllable-prev, max, rightToLeft, outChars, fakeGlyphStorage, success); ++ ++ if (LE_FAILURE(success)) { ++ return 0; ++ } ++ ++ if (outChars != NULL) { ++ fakeGlyphCount = glyphProcessing(outChars, 0, outCharCount, outCharCount, rightToLeft, fakeGlyphStorage, success); ++ LE_DELETE_ARRAY(outChars); // FIXME: a subclass may have allocated this, in which case this delete might not work... ++ } else { ++ fakeGlyphCount = glyphProcessing(chars+prev, offset, syllable-prev, max, rightToLeft, fakeGlyphStorage, success); ++ } ++ ++ if (LE_FAILURE(success)) { ++ return 0; ++ } ++ ++ outGlyphCount = glyphPostProcessing(fakeGlyphStorage, glyphStorage, success); ++ ++ prev = syllable; ++ } ++ ++ return outGlyphCount; ++} ++ ++U_NAMESPACE_END +diff -ruN icu.orig/source/layout/MalayalamLayoutEngine.h icu/source/layout/MalayalamLayoutEngine.h +--- icu.orig/source/layout/MalayalamLayoutEngine.h 1970-01-01 01:00:00.000000000 +0100 ++++ icu/source/layout/MalayalamLayoutEngine.h 2007-04-27 10:39:52.000000000 +0100 +@@ -0,0 +1,41 @@ ++ ++/* ++ * ++ * (C) Copyright IBM Corp. 
1998-2005 - All Rights Reserved ++ * ++ */ ++ ++#ifndef __MALAYALAMLAYOUTENGINE_H ++#define __MALAYALAMLAYOUTENGINE_H ++ ++#include "IndicLayoutEngine.h" ++ ++U_NAMESPACE_BEGIN ++ ++class MalayalamOpenTypeLayoutEngine : public IndicOpenTypeLayoutEngine ++{ ++public: ++ MalayalamOpenTypeLayoutEngine(const LEFontInstance *fontInstance, le_int32 scriptCode, le_int32 languageCode, ++ le_int32 typoFlags, const GlyphSubstitutionTableHeader *gsubTable) : ++ IndicOpenTypeLayoutEngine(fontInstance, scriptCode, languageCode, typoFlags, gsubTable) ++ ++ {} ++ ++ MalayalamOpenTypeLayoutEngine(const LEFontInstance *fontInstance, le_int32 scriptCode, le_int32 languageCode, ++ le_int32 typoFlags) : ++ IndicOpenTypeLayoutEngine(fontInstance, scriptCode, languageCode, typoFlags) ++ ++ {} ++ ++ virtual UClassID getDynamicClassID() const; ++ static UClassID getStaticClassID(); ++ ++protected: ++ virtual le_int32 glyphPostProcessing(LEGlyphStorage &tempGlyphStorage, LEGlyphStorage &glyphStorage, LEErrorCode &success); ++ ++ virtual le_int32 computeGlyphs(const LEUnicode chars[], le_int32 offset, le_int32 count, le_int32 max, le_bool rightToLeft, LEGlyphStorage &glyphStorage, LEErrorCode &success); ++}; ++ ++U_NAMESPACE_END ++#endif ++ diff --git a/icu.icuXXXX.rollbackabi.patch b/icu.icuXXXX.rollbackabi.patch new file mode 100644 index 0000000..038d4b6 --- /dev/null +++ b/icu.icuXXXX.rollbackabi.patch @@ -0,0 +1,131 @@ +diff -ru icu.5691/source/common/ucnv2022.c icu/source/common/ucnv2022.c +--- icu.5691/source/common/ucnv2022.c 2009-06-02 16:07:36.000000000 +0100 ++++ icu/source/common/ucnv2022.c 2009-06-02 16:21:56.000000000 +0100 +@@ -3566,7 +3566,7 @@ + /* include ASCII for JP */ + sa->addRange(sa->set, 0, 0x7f); + } +- if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) { ++ if(cnvData->version==3 || cnvData->version==4) { + /* + * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0 + * because the bit is on for all JP versions 
although only versions 3 & 4 (JIS7 & JIS8) +diff -ru icu.5691/source/common/ucnv_ext.c icu/source/common/ucnv_ext.c +--- icu.5691/source/common/ucnv_ext.c 2009-06-02 16:07:36.000000000 +0100 ++++ icu/source/common/ucnv_ext.c 2009-06-02 16:23:12.000000000 +0100 +@@ -1031,7 +1031,7 @@ + + stage1Length=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH]; + +- useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET); ++ useFallback=(UBool)(FALSE); + + /* enumerate the from-Unicode trie table */ + c=0; /* keep track of the current code point while enumerating */ +diff -ru icu.5691/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c +--- icu.5691/source/common/ucnvmbcs.c 2009-06-02 16:07:36.000000000 +0100 ++++ icu/source/common/ucnvmbcs.c 2009-06-02 16:23:50.000000000 +0100 +@@ -340,7 +340,7 @@ + + /* Miscellaneous ------------------------------------------------------------ */ + +-#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ ++/* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. 
*/ + + /* similar to ucnv_MBCSGetNextUChar() but recursive */ + static void +@@ -434,8 +434,6 @@ + pErrorCode); + } + +-#endif +- + U_CFUNC void + ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData, + const USetAdder *sa, +@@ -511,7 +509,7 @@ + + bytes=mbcsTable->fromUnicodeBytes; + +- useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET); ++ useFallback=(UBool)(FALSE); + + switch(mbcsTable->outputType) { + case MBCS_OUTPUT_3: +diff -ru icu.5691/source/common/ucnvmbcs.h icu/source/common/ucnvmbcs.h +--- icu.5691/source/common/ucnvmbcs.h 2009-06-02 16:07:36.000000000 +0100 ++++ icu/source/common/ucnvmbcs.h 2009-06-02 16:23:50.000000000 +0100 +@@ -363,7 +363,8 @@ + ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, + UErrorCode *pErrorCode); + +-#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ ++/* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ ++ + /* + * Internal function returning a UnicodeSet for toUnicode() conversion. + * Currently only used for ISO-2022-CN, and only handles roundtrip mappings. +@@ -378,7 +379,6 @@ + UConverterUnicodeSet which, + uint8_t state, int32_t lowByte, int32_t highByte, + UErrorCode *pErrorCode); +-#endif + + /* + * Internal function returning a UnicodeSet for toUnicode() conversion. +diff -ru icu.5691/source/common/unicode/ucnv.h icu/source/common/unicode/ucnv.h +--- icu.5691/source/common/unicode/ucnv.h 2009-06-02 16:07:32.000000000 +0100 ++++ icu/source/common/unicode/ucnv.h 2009-06-02 16:20:18.000000000 +0100 +@@ -870,8 +870,6 @@ + typedef enum UConverterUnicodeSet { + /** Select the set of roundtrippable Unicode code points. @stable ICU 2.6 */ + UCNV_ROUNDTRIP_SET, +- /** Select the set of Unicode code points with roundtrip or fallback mappings. 
@draft ICU 4.0 */ +- UCNV_ROUNDTRIP_AND_FALLBACK_SET, + /** Number of UConverterUnicodeSet selectors. @stable ICU 2.6 */ + UCNV_SET_COUNT + } UConverterUnicodeSet; +@@ -880,16 +878,11 @@ + /** + * Returns the set of Unicode code points that can be converted by an ICU converter. + * +- * Returns one of several kinds of set: +- * +- * 1. UCNV_ROUNDTRIP_SET +- * ++ * The current implementation returns only one kind of set (UCNV_ROUNDTRIP_SET): + * The set of all Unicode code points that can be roundtrip-converted +- * (converted without any data loss) with the converter (ucnv_fromUnicode()). ++ * (converted without any data loss) with the converter. + * This set will not include code points that have fallback mappings + * or are only the result of reverse fallback mappings. +- * This set will also not include PUA code points with fallbacks, although +- * ucnv_fromUnicode() will always uses those mappings despite ucnv_setFallback(). + * See UTR #22 "Character Mapping Markup Language" + * at http://www.unicode.org/reports/tr22/ + * +@@ -900,12 +893,6 @@ + * by comparing its roundtrip set with the set of ExemplarCharacters from + * ICU's locale data or other sources + * +- * 2. UCNV_ROUNDTRIP_AND_FALLBACK_SET +- * +- * The set of all Unicode code points that can be converted with the converter (ucnv_fromUnicode()) +- * when fallbacks are turned on (see ucnv_setFallback()). +- * This set includes all code points with roundtrips and fallbacks (but not reverse fallbacks). +- * + * In the future, there may be more UConverterUnicodeSet choices to select + * sets with different properties. 
+ * +diff -ru icu.5691/source/test/intltest/convtest.cpp icu/source/test/intltest/convtest.cpp +--- icu.5691/source/test/intltest/convtest.cpp 2009-06-02 16:07:21.000000000 +0100 ++++ icu/source/test/intltest/convtest.cpp 2009-06-02 16:24:08.000000000 +0100 +@@ -552,7 +552,7 @@ + } + UConverterUnicodeSet which; + for(which=UCNV_ROUNDTRIP_SET; which= 0x0901 && (wc) <= 0x17FF) ++#define VIRAMA(wc) ((wc) == 0x094D || \ ++ (wc) == 0x09CD || \ ++ (wc) == 0x0A4D || \ ++ (wc) == 0x0ACD || \ ++ (wc) == 0x0B4D || \ ++ (wc) == 0x0BCD || \ ++ (wc) == 0x0C4D || \ ++ (wc) == 0x0CCD || \ ++ (wc) == 0x0D4D || \ ++ (wc) == 0x0DCA || \ ++ (wc) == 0x0E3A || \ ++ (wc) == 0x0F84 || \ ++ (wc) == 0x1039 || \ ++ (wc) == 0x17D2 || \ ++ (wc) == 0x200D) + + //----------------------------------------------------------------------------------- + // +@@ -896,6 +911,7 @@ + RBBIRunMode mode; + + RBBIStateTableRow *row; ++ UChar32 prevchar; + UChar32 c; + int32_t lookaheadStatus = 0; + int32_t lookaheadTagIdx = 0; +@@ -919,6 +935,7 @@ + // if we're already at the end of the text, return DONE. + initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText); + result = initialPosition; ++ prevchar = 0; + c = UTEXT_NEXT32(fText); + if (fData == NULL || c==U_SENTINEL) { + return BreakIterator::DONE; +@@ -1001,6 +1018,11 @@ + + // State Transition - move machine to its next state + // ++ if (VIRAMA_SCRIPT(c) && VIRAMA(prevchar)) ++ { ++ state = START_STATE; ++ row = (RBBIStateTableRow *) (tableData + tableRowLen * state); ++ } + state = row->fNextState[category]; + row = (RBBIStateTableRow *) + // (statetable->fTableData + (statetable->fRowLen * state)); +@@ -1059,6 +1081,7 @@ + // the input position. The next iteration will be processing the + // first real input character. 
+ if (mode == RBBI_RUN) { ++ prevchar = c; + c = UTEXT_NEXT32(fText); + } else { + if (mode == RBBI_START) { +@@ -1107,6 +1130,7 @@ + int16_t category = 0; + RBBIRunMode mode; + RBBIStateTableRow *row; ++ UChar32 prevchar; + UChar32 c; + int32_t lookaheadStatus = 0; + int32_t result = 0; +@@ -1135,6 +1159,7 @@ + // Set up the starting char. + initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText); + result = initialPosition; ++ prevchar = 0; + c = UTEXT_PREVIOUS32(fText); + + // Set the initial state for the state machine +@@ -1218,6 +1243,11 @@ + + // State Transition - move machine to its next state + // ++ if (VIRAMA_SCRIPT(prevchar) && VIRAMA(c)) ++ { ++ state = START_STATE; ++ row = (RBBIStateTableRow *) (statetable->fTableData + (statetable->fRowLen * state)); ++ } + state = row->fNextState[category]; + row = (RBBIStateTableRow *) + (statetable->fTableData + (statetable->fRowLen * state)); +@@ -1269,6 +1299,7 @@ + // the input position. The next iteration will be processing the + // first real input character. + if (mode == RBBI_RUN) { ++ prevchar = c; + c = UTEXT_PREVIOUS32(fText); + } else { + if (mode == RBBI_START) { diff --git a/icu.rh429023.regexp.patch b/icu.rh429023.regexp.patch new file mode 100644 index 0000000..ef8eded --- /dev/null +++ b/icu.rh429023.regexp.patch @@ -0,0 +1,307 @@ +diff -ru icu.orig/source/common/uvectr32.cpp icu/source/common/uvectr32.cpp +--- icu.orig/source/common/uvectr32.cpp 2003-08-27 02:01:30.000000000 +0100 ++++ icu/source/common/uvectr32.cpp 2008-01-22 08:37:06.000000000 +0000 +@@ -1,6 +1,6 @@ + /* + ****************************************************************************** +-* Copyright (C) 1999-2003, International Business Machines Corporation and * ++* Copyright (C) 1999-2008, International Business Machines Corporation and * + * others. All Rights Reserved. 
* + ****************************************************************************** + * Date Name Description +@@ -26,6 +26,7 @@ + UVector32::UVector32(UErrorCode &status) : + count(0), + capacity(0), ++ maxCapacity(0), + elements(NULL) + { + _init(DEFUALT_CAPACITY, status); +@@ -34,6 +35,7 @@ + UVector32::UVector32(int32_t initialCapacity, UErrorCode &status) : + count(0), + capacity(0), ++ maxCapacity(0), + elements(0) + { + _init(initialCapacity, status); +@@ -46,6 +48,9 @@ + if (initialCapacity < 1) { + initialCapacity = DEFUALT_CAPACITY; + } ++ if (maxCapacity>0 && maxCapacity= minimumCapacity) { + return TRUE; +- } else { +- int32_t newCap = capacity * 2; +- if (newCap < minimumCapacity) { +- newCap = minimumCapacity; +- } +- int32_t* newElems = (int32_t *)uprv_malloc(sizeof(int32_t)*newCap); +- if (newElems == 0) { +- status = U_MEMORY_ALLOCATION_ERROR; +- return FALSE; +- } +- uprv_memcpy(newElems, elements, sizeof(elements[0]) * count); +- uprv_free(elements); +- elements = newElems; +- capacity = newCap; +- return TRUE; ++ } ++ if (maxCapacity>0 && minimumCapacity>maxCapacity) { ++ status = U_BUFFER_OVERFLOW_ERROR; ++ return FALSE; ++ } ++ int32_t newCap = capacity * 2; ++ if (newCap < minimumCapacity) { ++ newCap = minimumCapacity; ++ } ++ if (maxCapacity > 0 && newCap > maxCapacity) { ++ newCap = maxCapacity; ++ } ++ int32_t* newElems = (int32_t *)uprv_malloc(sizeof(int32_t)*newCap); ++ if (newElems == 0) { ++ status = U_MEMORY_ALLOCATION_ERROR; ++ return FALSE; ++ } ++ uprv_memcpy(newElems, elements, sizeof(elements[0]) * count); ++ uprv_free(elements); ++ elements = newElems; ++ capacity = newCap; ++ return TRUE; ++} ++ ++void UVector32::setMaxCapacity(int32_t limit) { ++ U_ASSERT(limit >= 0); ++ maxCapacity = limit; ++ if (maxCapacity < 0) { ++ maxCapacity = 0; + } + } + +diff -ru icu.orig/source/common/uvectr32.h icu/source/common/uvectr32.h +--- icu.orig/source/common/uvectr32.h 2006-01-18 03:52:04.000000000 +0000 ++++ icu/source/common/uvectr32.h 
2008-01-22 08:37:07.000000000 +0000 +@@ -1,6 +1,6 @@ + /* + ********************************************************************** +-* Copyright (C) 1999-2006, International Business Machines ++* Copyright (C) 1999-2008, International Business Machines + * Corporation and others. All Rights Reserved. + ********************************************************************** + */ +@@ -61,6 +61,8 @@ + int32_t count; + + int32_t capacity; ++ ++ int32_t maxCapacity; // Limit beyond which capacity is not permitted to grow. + + int32_t* elements; + +@@ -162,6 +164,14 @@ + int32_t *getBuffer() const; + + /** ++ * Set the maximum allowed buffer capacity for this vector/stack. ++ * Default with no limit set is unlimited, go until malloc() fails. ++ * A Limit of zero means unlimited capacity. ++ * Units are vector elements (32 bits each), not bytes. ++ */ ++ void setMaxCapacity(int32_t limit); ++ ++ /** + * ICU "poor man's RTTI", returns a UClassID for this class. + */ + static UClassID U_EXPORT2 getStaticClassID(); +@@ -221,7 +231,9 @@ + } + + inline int32_t *UVector32::reserveBlock(int32_t size, UErrorCode &status) { +- ensureCapacity(count+size, status); ++ if (ensureCapacity(count+size, status) == FALSE) { ++ return NULL; ++ } + int32_t *rp = elements+count; + count += size; + return rp; +diff -ru icu.orig/source/i18n/regexcmp.cpp icu/source/i18n/regexcmp.cpp +--- icu.orig/source/i18n/regexcmp.cpp 2006-02-02 04:37:14.000000000 +0000 ++++ icu/source/i18n/regexcmp.cpp 2008-01-22 08:37:06.000000000 +0000 +@@ -1187,14 +1187,17 @@ + // Because capture groups can be forward-referenced by back-references, + // we fill the operand with the capture group number. At the end + // of compilation, it will be changed to the variable's location. 
+- U_ASSERT(groupNum > 0); +- int32_t op; +- if (fModeFlags & UREGEX_CASE_INSENSITIVE) { +- op = URX_BUILD(URX_BACKREF_I, groupNum); ++ if (groupNum < 1) { ++ error(U_REGEX_INVALID_BACK_REF); + } else { +- op = URX_BUILD(URX_BACKREF, groupNum); ++ int32_t op; ++ if (fModeFlags & UREGEX_CASE_INSENSITIVE) { ++ op = URX_BUILD(URX_BACKREF_I, groupNum); ++ } else { ++ op = URX_BUILD(URX_BACKREF, groupNum); ++ } ++ fRXPat->fCompiledPat->addElement(op, *fStatus); + } +- fRXPat->fCompiledPat->addElement(op, *fStatus); + } + break; + +diff -ru icu.orig/source/i18n/rematch.cpp icu/source/i18n/rematch.cpp +--- icu.orig/source/i18n/rematch.cpp 2005-08-25 19:02:20.000000000 +0100 ++++ icu/source/i18n/rematch.cpp 2008-01-22 08:37:44.000000000 +0000 +@@ -30,6 +30,15 @@ + + U_NAMESPACE_BEGIN + ++// Limit the size of the back track stack, to avoid system failures caused ++// by heap exhaustion. Units are in 32 bit words, not bytes. ++// This value puts ICU's limits higher than most other regexp implementations, ++// which use recursion rather than the heap, and take more storage per ++// backtrack point. ++// This constant is _temporary_. Proper API to control the value will be added.
++// ++static const int32_t BACKTRACK_STACK_CAPACITY = 8000000; ++ + //----------------------------------------------------------------------------- + // + // Constructor and Destructor +@@ -53,6 +62,8 @@ + } + if (fStack == NULL || fData == NULL) { + fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; ++ } else { ++ fStack->setMaxCapacity(BACKTRACK_STACK_CAPACITY); + } + + reset(*RegexStaticSets::gStaticSets->fEmptyString); +@@ -78,6 +89,8 @@ + } + if (fStack == NULL || fData == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; ++ } else { ++ fStack->setMaxCapacity(BACKTRACK_STACK_CAPACITY); + } + reset(input); + } +@@ -102,6 +115,8 @@ + } + if (fStack == NULL || fData == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; ++ } else { ++ fStack->setMaxCapacity(BACKTRACK_STACK_CAPACITY); + } + reset(*RegexStaticSets::gStaticSets->fEmptyString); + } +@@ -1015,6 +1030,14 @@ + inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int32_t savePatIdx, int32_t frameSize, UErrorCode &status) { + // push storage for a new frame. + int32_t *newFP = fStack->reserveBlock(frameSize, status); ++ if (newFP == NULL) { ++ // Heap allocation error on attempted stack expansion. ++ // We need to return a writable stack frame, so just return the ++ // previous frame. The match operation will stop quickly ++ // because of the error status, after which the frame will never ++ // be looked at again. ++ return fp; ++ } + fp = (REStackFrame *)(newFP - frameSize); // in case of realloc of stack. + + // New stack frame = copy of old top frame. +@@ -1030,8 +1053,8 @@ + fp->fPatIdx = savePatIdx; + return (REStackFrame *)newFP; + } +- +- ++ ++ + //-------------------------------------------------------------------------------- + // + // MatchAt This is the actual matching engine.
+@@ -2262,6 +2285,7 @@ + } + + if (U_FAILURE(status)) { ++ isMatch = FALSE; + break; + } + } +diff -ru icu.orig/source/test/intltest/regextst.cpp icu/source/test/intltest/regextst.cpp +--- icu.orig/source/test/intltest/regextst.cpp 2005-07-05 19:39:00.000000000 +0100 ++++ icu/source/test/intltest/regextst.cpp 2008-01-22 08:38:21.000000000 +0000 +@@ -66,6 +66,10 @@ + case 6: name = "PerlTests"; + if (exec) PerlTests(); + break; ++ case 7: name = "Bug 6149"; ++ if (exec) Bug6149(); ++ break; ++ + + + default: name = ""; +@@ -1637,6 +1641,13 @@ + // UnicodeSet containing a string + REGEX_ERR("abc[{def}]xyz", 1, 10, U_REGEX_SET_CONTAINS_STRING); + ++ ++ // Invalid Back Reference \0 ++ // For ICU 3.8 and earlier ++ // For ICU versions newer than 3.8, \0 introduces an octal escape. ++ // ++ REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_INVALID_BACK_REF); ++ + } + + +@@ -2119,6 +2130,26 @@ + } + + ++//-------------------------------------------------------------- ++// ++// Bug6149 Verify limits to heap expansion for backtrack stack. ++// Use this pattern, ++// "(a?){1,}" ++// The zero-length match will repeat forever. 
++// (That this goes into a loop is another bug) ++// ++//--------------------------------------------------------------- ++void RegexTest::Bug6149() { ++ UnicodeString pattern("(a?){1,}"); ++ UnicodeString s("xyz"); ++ uint32_t flags = 0; ++ UErrorCode status = U_ZERO_ERROR; ++ ++ RegexMatcher matcher(pattern, s, flags, status); ++ UBool result = false; ++ REGEX_ASSERT_FAIL(result=matcher.matches(status), U_BUFFER_OVERFLOW_ERROR); ++ REGEX_ASSERT(result == FALSE); ++ } + + #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ + +diff -ru icu.orig/source/test/intltest/regextst.h icu/source/test/intltest/regextst.h +--- icu.orig/source/test/intltest/regextst.h 2003-12-03 06:58:28.000000000 +0000 ++++ icu/source/test/intltest/regextst.h 2008-01-22 08:37:06.000000000 +0000 +@@ -30,6 +30,7 @@ + virtual void Extended(); + virtual void Errors(); + virtual void PerlTests(); ++ virtual void Bug6149(); + + // The following functions are internal to the regexp tests. + virtual UBool doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int line); -- cgit