diff options
Diffstat (limited to 'icu.icu5483.backport.patch')
-rw-r--r-- | icu.icu5483.backport.patch | 874 |
1 files changed, 874 insertions, 0 deletions
diff --git a/icu.icu5483.backport.patch b/icu.icu5483.backport.patch new file mode 100644 index 0000000..039dee2 --- /dev/null +++ b/icu.icu5483.backport.patch @@ -0,0 +1,874 @@ +diff -ru icu.orig/source/common/ucnv2022.c icu/source/common/ucnv2022.c +--- icu.orig/source/common/ucnv2022.c 2009-06-02 11:48:38.000000000 +0100 ++++ icu/source/common/ucnv2022.c 2009-06-02 12:30:29.000000000 +0100 +@@ -84,6 +84,26 @@ + #define V_TAB 0x0B + #define SPACE 0x20 + ++enum { ++ HWKANA_START=0xff61, ++ HWKANA_END=0xff9f ++}; ++ ++/* ++ * 94-character sets with native byte values A1..FE are encoded in ISO 2022 ++ * as bytes 21..7E. (Subtract 0x80.) ++ * 96-character sets with native byte values A0..FF are encoded in ISO 2022 ++ * as bytes 20..7F. (Subtract 0x80.) ++ * Do not encode C1 control codes with native bytes 80..9F ++ * as bytes 00..1F (C0 control codes). ++ */ ++enum { ++ GR94_START=0xa1, ++ GR94_END=0xfe, ++ GR96_START=0xa0, ++ GR96_END=0xff ++}; ++ + /* + * ISO 2022 control codes must not be converted from Unicode + * because they would mess up the byte stream. +@@ -981,22 +1001,27 @@ + + + /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c +- * any future change in _MBCSFromUChar32() function should be reflected in +- * this macro ++ * any future change in _MBCSFromUChar32() function should be reflected here. ++ * @return number of bytes in *value; negative number if fallback; 0 if no mapping + */ +-static U_INLINE void ++static U_INLINE int32_t + MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData, + UChar32 c, + uint32_t* value, + UBool useFallback, +- int32_t *length, + int outputType) + { + const int32_t *cx; + const uint16_t *table; + uint32_t stage2Entry; + uint32_t myValue; ++ int32_t length; + const uint8_t *p; ++ /* ++ * TODO(markus): Use and require new, faster MBCS conversion table structures. ++ * Use internal version of ucnv_open() that verifies that the new structures are available, ++ * else U_INTERNAL_PROGRAM_ERROR. ++ */ + /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ + if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { + table=sharedData->mbcs.fromUnicodeTable; +@@ -1005,51 +1030,60 @@ + if(outputType==MBCS_OUTPUT_2){ + myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); + if(myValue<=0xff) { +- *length=1; ++ length=1; + } else { +- *length=2; ++ length=2; + } + } else /* outputType==MBCS_OUTPUT_3 */ { + p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); + myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; + if(myValue<=0xff) { +- *length=1; ++ length=1; + } else if(myValue<=0xffff) { +- *length=2; ++ length=2; + } else { +- *length=3; ++ length=3; + } + } ++ /* ++ * TODO(markus): Use Shift-JIS table for JIS X 0208, to save mapping table space. ++ * Pass in parameter for type of output bytes, for validation and shifting: ++ * - Direct: Pass bytes through, but forbid control codes 00-1F (except SI/SO/ESC) and space 20? ++ * (Need to allow some (TAB/LF/CR) or most of them for ASCII and maybe JIS X 0201.) ++ * - A1-FE: Subtract 80 after range check. ++ * - SJIS: Shift DBCS result to 21-7E x 21-7E. ++ */ + /* is this code point assigned, or do we use fallbacks? */ +- if( (stage2Entry&(1<<(16+(c&0xf))))!=0 || +- (FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) +- ) { ++ if((stage2Entry&(1<<(16+(c&0xf))))!=0) { ++ /* assigned */ ++ *value=myValue; ++ return length; ++ } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) { + /* + * We allow a 0 byte output if the "assigned" bit is set for this entry. + * There is no way with this data structure for fallback output + * to be a zero byte. + */ +- /* assigned */ + *value=myValue; +- return; ++ return -length; + } + } + + cx=sharedData->mbcs.extIndexes; + if(cx!=NULL) { +- *length=ucnv_extSimpleMatchFromU(cx, c, value, useFallback); +- return; ++ return ucnv_extSimpleMatchFromU(cx, c, value, useFallback); + } + + /* unassigned */ +- *length=0; ++ return 0; + } + + /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c +- * any future change in _MBCSSingleFromUChar32() function should be reflected in +- * this macro ++ * any future change in _MBCSSingleFromUChar32() function should be reflected here. ++ * @param retval pointer to output byte ++ * @return 1 roundtrip byte 0 no mapping -1 fallback byte + */ +-static U_INLINE void ++static U_INLINE int32_t + MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData, + UChar32 c, + uint32_t* retval, +@@ -1059,20 +1093,21 @@ + int32_t value; + /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ + if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { +- *retval=(uint16_t)-1; +- return; ++ return 0; + } + /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ + table=sharedData->mbcs.fromUnicodeTable; + /* get the byte for the output */ + value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); + /* is this code point assigned, or do we use fallbacks? */ +- if(useFallback ? value>=0x800 : value>=0xc00) { +- value &=0xff; ++ *retval=(uint32_t)(value&0xff); ++ if(value>=0xf00) { ++ return 1; /* roundtrip */ ++ } else if(useFallback ? value>=0x800 : value>=0xc00) { ++ return -1; /* fallback taken */ + } else { +- value= -1; ++ return 0; /* no mapping */ + } +- *retval=(uint16_t) value; + } + + #ifdef U_ENABLE_GENERIC_ISO_2022 +@@ -1316,6 +1351,7 @@ + + static void + UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) { ++ UConverter *cnv = args->converter; + UConverterDataISO2022 *converterData; + ISO2022State *pFromU2022State; + uint8_t *target = (uint8_t *) args->target; +@@ -1335,14 +1371,13 @@ + int8_t cs, g; + + /* set up the state */ +- converterData = (UConverterDataISO2022*)args->converter->extraInfo; ++ converterData = (UConverterDataISO2022*)cnv->extraInfo; + pFromU2022State = &converterData->fromU2022State; +- useFallback = args->converter->useFallback; + + choiceCount = 0; + + /* check if the last codepoint of previous buffer was a lead surrogate*/ +- if((sourceChar = args->converter->fromUChar32)!=0 && target< targetLimit) { ++ if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) { + goto getTrail; + } + +@@ -1361,26 +1396,26 @@ + if(UTF_IS_SECOND_SURROGATE(trail)) { + source++; + sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail); +- args->converter->fromUChar32=0x00; ++ cnv->fromUChar32=0x00; + /* convert this supplementary code point */ + /* exit this condition tree */ + } else { + /* this is an unmatched lead code unit (1st surrogate) */ + /* callback(illegal) */ + *err=U_ILLEGAL_CHAR_FOUND; +- args->converter->fromUChar32=sourceChar; ++ cnv->fromUChar32=sourceChar; + break; + } + } else { + /* no more input */ +- args->converter->fromUChar32=sourceChar; ++ cnv->fromUChar32=sourceChar; + break; + } + } else { + /* this is an unmatched trail code unit (2nd surrogate) */ + /* callback(illegal) */ + *err=U_ILLEGAL_CHAR_FOUND; +- args->converter->fromUChar32=sourceChar; ++ cnv->fromUChar32=sourceChar; + break; + } + } +@@ -1389,7 +1424,7 @@ + if(IS_2022_CONTROL(sourceChar)) { + /* callback(illegal) */ + *err=U_ILLEGAL_CHAR_FOUND; +- args->converter->fromUChar32=sourceChar; ++ cnv->fromUChar32=sourceChar; + break; + } + +@@ -1407,9 +1442,10 @@ + + /* JIS7/8: try single-byte half-width Katakana before JISX208 */ + if(converterData->version == 3 || converterData->version == 4) { +- choices[choiceCount++] = cs = (int8_t)HWKANA_7BIT; +- csm &= ~CSM(cs); ++ choices[choiceCount++] = (int8_t)HWKANA_7BIT; + } ++ /* Do not try single-byte half-width Katakana for other versions. */ ++ csm &= ~CSM(HWKANA_7BIT); + + /* try the current G0 charset */ + choices[choiceCount++] = cs = pFromU2022State->cs[0]; +@@ -1432,86 +1468,134 @@ + } + + cs = g = 0; ++ /* ++ * len==0: no mapping found yet ++ * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks ++ * len>0: found a roundtrip result, done ++ */ + len = 0; ++ /* ++ * We will turn off useFallback after finding a fallback, ++ * but we still get fallbacks from PUA code points as usual. ++ * Therefore, we will also need to check that we don't overwrite ++ * an early fallback with a later one. ++ */ ++ useFallback = cnv->useFallback; + +- for(i = 0; i < choiceCount && len == 0; ++i) { +- cs = choices[i]; +- switch(cs) { ++ for(i = 0; i < choiceCount && len <= 0; ++i) { ++ uint32_t value; ++ int32_t len2; ++ int8_t cs0 = choices[i]; ++ switch(cs0) { + case ASCII: + if(sourceChar <= 0x7f) { + targetValue = (uint32_t)sourceChar; + len = 1; ++ cs = cs0; ++ g = 0; + } + break; + case ISO8859_1: +- if(0x80 <= sourceChar && sourceChar <= 0xff) { ++ if(GR96_START <= sourceChar && sourceChar <= GR96_END) { + targetValue = (uint32_t)sourceChar - 0x80; + len = 1; ++ cs = cs0; + g = 2; + } + break; + case HWKANA_7BIT: +- if((uint32_t)(0xff9f-sourceChar)<=(0xff9f-0xff61)) { +- targetValue = (uint32_t)(sourceChar - (0xff61 - 0x21)); +- len = 1; +- ++ if((uint32_t)(HWKANA_END-sourceChar)<=(HWKANA_END-HWKANA_START)) { + if(converterData->version==3) { + /* JIS7: use G1 (SO) */ +- pFromU2022State->cs[1] = cs; /* do not output an escape sequence */ ++ /* Shift U+FF61..U+FF9F to bytes 21..5F. */ ++ targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21)); ++ len = 1; ++ pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */ + g = 1; + } else if(converterData->version==4) { + /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */ +- int8_t cs0; +- +- targetValue += 0x80; ++ /* Shift U+FF61..U+FF9F to bytes A1..DF. */ ++ targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1)); ++ len = 1; + +- cs0 = pFromU2022State->cs[0]; +- if(IS_JP_DBCS(cs0)) { ++ cs = pFromU2022State->cs[0]; ++ if(IS_JP_DBCS(cs)) { + /* switch from a DBCS charset to JISX201 */ + cs = (int8_t)JISX201; +- } else { +- /* stay in the current G0 charset */ +- cs = cs0; + } ++ /* else stay in the current G0 charset */ ++ g = 0; + } ++ /* else do not use HWKANA_7BIT with other versions */ + } + break; + case JISX201: + /* G0 SBCS */ +- MBCS_SINGLE_FROM_UCHAR32( +- converterData->myConverterArray[cs], +- sourceChar, &targetValue, +- useFallback); +- if(targetValue <= 0x7f) { +- len = 1; ++ len2 = MBCS_SINGLE_FROM_UCHAR32( ++ converterData->myConverterArray[cs0], ++ sourceChar, &value, ++ useFallback); ++ if(len2 != 0 && !(len2 < 0 && len != 0) && value <= 0x7f) { ++ targetValue = value; ++ len = len2; ++ cs = cs0; ++ g = 0; ++ useFallback = FALSE; + } + break; + case ISO8859_7: + /* G0 SBCS forced to 7-bit output */ +- MBCS_SINGLE_FROM_UCHAR32( +- converterData->myConverterArray[cs], +- sourceChar, &targetValue, +- useFallback); +- if(0x80 <= targetValue && targetValue <= 0xff) { +- targetValue -= 0x80; +- len = 1; ++ len2 = MBCS_SINGLE_FROM_UCHAR32( ++ converterData->myConverterArray[cs0], ++ sourceChar, &value, ++ useFallback); ++ if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) { ++ targetValue = value - 0x80; ++ len = len2; ++ cs = cs0; + g = 2; ++ useFallback = FALSE; + } + break; + default: + /* G0 DBCS */ +- MBCS_FROM_UCHAR32_ISO2022( +- converterData->myConverterArray[cs], +- sourceChar, &targetValue, +- useFallback, &len, MBCS_OUTPUT_2); +- if(len != 2) { +- len = 0; ++ len2 = MBCS_FROM_UCHAR32_ISO2022( ++ converterData->myConverterArray[cs0], ++ sourceChar, &value, ++ useFallback, MBCS_OUTPUT_2); ++ if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ ++ if(cs0 == KSC5601) { ++ /* ++ * Check for valid bytes for the encoding scheme. ++ * This is necessary because the sub-converter (windows-949) ++ * has a broader encoding scheme than is valid for 2022. ++ * ++ * Check that the result is a 2-byte value with each byte in the range A1..FE ++ * (strict EUC-KR DBCS) before accepting it and subtracting 0x80 from each byte ++ * to move it to the ISO 2022 range 21..7E. ++ */ ++ if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) && ++ (uint8_t)(value - 0xa1) <= (0xfe - 0xa1) ++ ) { ++ value -= 0x8080; /* shift down to 21..7e byte range */ ++ } else { ++ break; /* not valid for ISO 2022 */ ++ } ++ } ++ targetValue = value; ++ len = len2; ++ cs = cs0; ++ g = 0; ++ useFallback = FALSE; + } + break; + } + } + +- if(len > 0) { ++ if(len != 0) { ++ if(len < 0) { ++ len = -len; /* fallback */ ++ } + outLen = 0; /* count output bytes */ + + /* write SI if necessary (only for JIS7) */ +@@ -1560,7 +1644,7 @@ + * then this is an error + */ + *err = U_INVALID_CHAR_FOUND; +- args->converter->fromUChar32=sourceChar; ++ cnv->fromUChar32=sourceChar; + break; + } + +@@ -1586,7 +1670,7 @@ + } + } else { + fromUWriteUInt8( +- args->converter, ++ cnv, + buffer, outLen, + &target, (const char *)targetLimit, + &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), +@@ -1615,7 +1699,7 @@ + */ + if( U_SUCCESS(*err) && + (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) && +- args->flush && source>=sourceLimit && args->converter->fromUChar32==0 ++ args->flush && source>=sourceLimit && cnv->fromUChar32==0 + ) { + int32_t sourceIndex; + +@@ -1654,7 +1738,7 @@ + } + + fromUWriteUInt8( +- args->converter, ++ cnv, + buffer, outLen, + &target, (const char *)targetLimit, + &offsets, sourceIndex, +@@ -1777,7 +1861,7 @@ + !IS_JP_DBCS(cs) + ) { + /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */ +- targetUniChar = mySourceChar + (0xff61 - 0xa1); ++ targetUniChar = mySourceChar + (HWKANA_START - 0xa1); + + /* return from a single-shift state to the previous one */ + if(pToU2022State->g >= 2) { +@@ -1818,7 +1902,7 @@ + case HWKANA_7BIT: + if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) { + /* 7-bit halfwidth Katakana */ +- targetUniChar = mySourceChar + (0xff61 - 0x21); ++ targetUniChar = mySourceChar + (HWKANA_START - 0x21); + } + break; + default: +@@ -1965,9 +2049,10 @@ + break; + } + +- /* length= ucnv_MBCSFromUChar32(converterData->currentConverter->sharedData, +- sourceChar,&targetByteUnit,args->converter->useFallback);*/ +- MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,&length,MBCS_OUTPUT_2); ++ length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2); ++ if(length < 0) { ++ length = -length; /* fallback */ ++ } + /* only DBCS or SBCS characters are expected*/ + /* DB characters with high bit set to 1 are expected */ + if(length > 2 || length==0 ||(((targetByteUnit & 0x8080) != 0x8080)&& length==2)){ +@@ -2449,7 +2534,7 @@ + + static void + UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){ +- ++ UConverter *cnv = args->converter; + UConverterDataISO2022 *converterData; + ISO2022State *pFromU2022State; + uint8_t *target = (uint8_t *) args->target; +@@ -2466,14 +2551,13 @@ + UBool useFallback; + + /* set up the state */ +- converterData = (UConverterDataISO2022*)args->converter->extraInfo; ++ converterData = (UConverterDataISO2022*)cnv->extraInfo; + pFromU2022State = &converterData->fromU2022State; +- useFallback = args->converter->useFallback; + + choiceCount = 0; + + /* check if the last codepoint of previous buffer was a lead surrogate*/ +- if((sourceChar = args->converter->fromUChar32)!=0 && target< targetLimit) { ++ if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) { + goto getTrail; + } + +@@ -2492,26 +2576,26 @@ + if(UTF_IS_SECOND_SURROGATE(trail)) { + source++; + sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail); +- args->converter->fromUChar32=0x00; ++ cnv->fromUChar32=0x00; + /* convert this supplementary code point */ + /* exit this condition tree */ + } else { + /* this is an unmatched lead code unit (1st surrogate) */ + /* callback(illegal) */ + *err=U_ILLEGAL_CHAR_FOUND; +- args->converter->fromUChar32=sourceChar; ++ cnv->fromUChar32=sourceChar; + break; + } + } else { + /* no more input */ +- args->converter->fromUChar32=sourceChar; ++ cnv->fromUChar32=sourceChar; + break; + } + } else { + /* this is an unmatched trail code unit (2nd surrogate) */ + /* callback(illegal) */ + *err=U_ILLEGAL_CHAR_FOUND; +- args->converter->fromUChar32=sourceChar; ++ cnv->fromUChar32=sourceChar; + break; + } + } +@@ -2522,7 +2606,7 @@ + if(IS_2022_CONTROL(sourceChar)) { + /* callback(illegal) */ + *err=U_ILLEGAL_CHAR_FOUND; +- args->converter->fromUChar32=sourceChar; ++ cnv->fromUChar32=sourceChar; + break; + } + +@@ -2545,7 +2629,6 @@ + } + else{ + /* convert U+0080..U+10ffff */ +- UConverterSharedData *cnv; + int32_t i; + int8_t cs, g; + +@@ -2593,17 +2676,41 @@ + } + + cs = g = 0; ++ /* ++ * len==0: no mapping found yet ++ * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks ++ * len>0: found a roundtrip result, done ++ */ + len = 0; ++ /* ++ * We will turn off useFallback after finding a fallback, ++ * but we still get fallbacks from PUA code points as usual. ++ * Therefore, we will also need to check that we don't overwrite ++ * an early fallback with a later one. ++ */ ++ useFallback = cnv->useFallback; + +- for(i = 0; i < choiceCount && len == 0; ++i) { +- cs = choices[i]; +- if(cs > 0) { +- if(cs > CNS_11643_0) { +- cnv = converterData->myConverterArray[CNS_11643]; +- MBCS_FROM_UCHAR32_ISO2022(cnv,sourceChar,&targetValue,useFallback,&len,MBCS_OUTPUT_3); +- if(len==3) { +- cs = (int8_t)(CNS_11643_0 + (targetValue >> 16) - 0x80); +- len = 2; ++ for(i = 0; i < choiceCount && len <= 0; ++i) { ++ int8_t cs0 = choices[i]; ++ if(cs0 > 0) { ++ uint32_t value; ++ int32_t len2; ++ if(cs0 > CNS_11643_0) { ++ len2 = MBCS_FROM_UCHAR32_ISO2022( ++ converterData->myConverterArray[CNS_11643], ++ sourceChar, ++ &value, ++ useFallback, ++ MBCS_OUTPUT_3); ++ if(len2 == 3 || (len2 == -3 && len == 0)) { ++ targetValue = value; ++ cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80); ++ if(len2 >= 0) { ++ len = 2; ++ } else { ++ len = -2; ++ useFallback = FALSE; ++ } + if(cs == CNS_11643_1) { + g = 1; + } else if(cs == CNS_11643_2) { +@@ -2617,15 +2724,25 @@ + } + } else { + /* GB2312_1 or ISO-IR-165 */ +- cnv = converterData->myConverterArray[cs]; +- MBCS_FROM_UCHAR32_ISO2022(cnv,sourceChar,&targetValue,useFallback,&len,MBCS_OUTPUT_2); +- g = 1; /* used if len == 2 */ ++ len2 = MBCS_FROM_UCHAR32_ISO2022( ++ converterData->myConverterArray[cs0], ++ sourceChar, ++ &value, ++ useFallback, ++ MBCS_OUTPUT_2); ++ if(len2 == 2 || (len2 == -2 && len == 0)) { ++ targetValue = value; ++ len = len2; ++ cs = cs0; ++ g = 1; ++ useFallback = FALSE; ++ } + } + } + } + +- if(len > 0) { +- len = 0; /* count output bytes; it must have been len == 2 */ ++ if(len != 0) { ++ len = 0; /* count output bytes; it must have been abs(len) == 2 */ + + /* write the designation sequence if necessary */ + if(cs != pFromU2022State->cs[g]) { +@@ -2670,7 +2787,7 @@ + * then this is an error + */ + *err = U_INVALID_CHAR_FOUND; +- args->converter->fromUChar32=sourceChar; ++ cnv->fromUChar32=sourceChar; + break; + } + } +@@ -2691,7 +2808,7 @@ + } + } else { + fromUWriteUInt8( +- args->converter, ++ cnv, + buffer, len, + &target, (const char *)targetLimit, + &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), +@@ -2720,7 +2837,7 @@ + */ + if( U_SUCCESS(*err) && + pFromU2022State->g!=0 && +- args->flush && source>=sourceLimit && args->converter->fromUChar32==0 ++ args->flush && source>=sourceLimit && cnv->fromUChar32==0 + ) { + int32_t sourceIndex; + +@@ -2748,7 +2865,7 @@ + } + + fromUWriteUInt8( +- args->converter, ++ cnv, + SHIFT_IN_STR, 1, + &target, (const char *)targetLimit, + &offsets, sourceIndex, +@@ -3146,7 +3263,7 @@ + } + if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) { + /* include half-width Katakana for JP */ +- sa->addRange(sa->set, 0xff61, 0xff9f); ++ sa->addRange(sa->set, HWKANA_START, HWKANA_END); + } + break; + case 'c': +diff -ru icu.orig/source/common/ucnv_ext.c icu/source/common/ucnv_ext.c +--- icu.orig/source/common/ucnv_ext.c 2009-06-02 11:48:38.000000000 +0100 ++++ icu/source/common/ucnv_ext.c 2009-06-02 12:14:20.000000000 +0100 +@@ -551,6 +551,12 @@ + return 0; + } + ++ /* ++ * Tests for (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0: ++ * Do not interpret values with reserved bits used, for forward compatibility, ++ * and do not even remember intermediate results with reserved bits used. ++ */ ++ + if(UCNV_EXT_TO_U_IS_PARTIAL(value)) { + /* partial match, enter the loop below */ + index=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value); +@@ -575,7 +581,8 @@ + value=*fromUSectionValues++; + if( value!=0 && + (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || +- FROM_U_USE_FALLBACK(useFallback, firstCP)) ++ FROM_U_USE_FALLBACK(useFallback, firstCP)) && ++ (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 + ) { + /* remember longest match so far */ + matchValue=value; +@@ -613,8 +620,9 @@ + /* partial match, continue */ + index=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value); + } else { +- if( UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || +- FROM_U_USE_FALLBACK(useFallback, firstCP) ++ if( (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || ++ FROM_U_USE_FALLBACK(useFallback, firstCP)) && ++ (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 + ) { + /* full match, stop with result */ + matchValue=value; +@@ -632,8 +640,9 @@ + return 0; + } + } else /* result from firstCP trie lookup */ { +- if( UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || +- FROM_U_USE_FALLBACK(useFallback, firstCP) ++ if( (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || ++ FROM_U_USE_FALLBACK(useFallback, firstCP)) && ++ (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 + ) { + /* full match, stop with result */ + matchValue=value; +@@ -644,20 +653,18 @@ + } + } + +- if(matchValue&UCNV_EXT_FROM_U_RESERVED_MASK) { +- /* do not interpret values with reserved bits used, for forward compatibility */ +- return 0; +- } +- + /* return result */ + if(matchValue==UCNV_EXT_FROM_U_SUBCHAR1) { + return 1; /* assert matchLength==2 */ + } + +- *pMatchValue=UCNV_EXT_FROM_U_MASK_ROUNDTRIP(matchValue); ++ *pMatchValue=matchValue; + return matchLength; + } + ++/* ++ * @param value fromUnicode mapping table value; ignores roundtrip and reserved bits ++ */ + static U_INLINE void + ucnv_extWriteFromU(UConverter *cnv, const int32_t *cx, + uint32_t value, +@@ -792,6 +799,10 @@ + } + } + ++/* ++ * Used by ISO 2022 implementation. ++ * @return number of bytes in *pValue; negative number if fallback; 0 for no mapping ++ */ + U_CFUNC int32_t + ucnv_extSimpleMatchFromU(const int32_t *cx, + UChar32 cp, uint32_t *pValue, +@@ -809,13 +820,15 @@ + if(match>=2) { + /* write result for simple, single-character conversion */ + int32_t length; +- ++ int isRoundtrip; ++ ++ isRoundtrip=UCNV_EXT_FROM_U_IS_ROUNDTRIP(value); + length=UCNV_EXT_FROM_U_GET_LENGTH(value); + value=(uint32_t)UCNV_EXT_FROM_U_GET_DATA(value); + + if(length<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH) { + *pValue=value; +- return length; ++ return isRoundtrip ? length : -length; + #if 0 /* not currently used */ + } else if(length==4) { + /* de-serialize a 4-byte result */ +@@ -825,7 +838,7 @@ + ((uint32_t)result[1]<<16)| + ((uint32_t)result[2]<<8)| + result[3]; +- return 4; ++ return isRoundtrip ? 4 : -4; + #endif + } + } +diff -ru icu.orig/source/common/ucnv_ext.h icu/source/common/ucnv_ext.h +--- icu.orig/source/common/ucnv_ext.h 2009-06-02 11:48:38.000000000 +0100 ++++ icu/source/common/ucnv_ext.h 2009-06-02 12:14:20.000000000 +0100 +@@ -452,7 +452,7 @@ + #define UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) (((value)&UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)!=0) + #define UCNV_EXT_FROM_U_MASK_ROUNDTRIP(value) ((value)&~UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) + +-/* use after masking off the roundtrip flag */ ++/* get length; masks away all other bits */ + #define UCNV_EXT_FROM_U_GET_LENGTH(value) (int32_t)(((value)>>UCNV_EXT_FROM_U_LENGTH_SHIFT)&UCNV_EXT_MAX_BYTES) + + /* get bytes or bytes index */ +diff -ru icu.orig/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c +--- icu.orig/source/common/ucnvmbcs.c 2009-06-02 11:48:38.000000000 +0100 ++++ icu/source/common/ucnvmbcs.c 2009-06-02 12:14:20.000000000 +0100 +@@ -3785,7 +3785,8 @@ + + cx=sharedData->mbcs.extIndexes; + if(cx!=NULL) { +- return ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback); ++ length=ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback); ++ return length>=0 ? length : -length; /* return abs(length); */ + } + + /* unassigned */ +diff -ru icu.orig/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt +--- icu.orig/source/test/testdata/conversion.txt 2009-06-02 11:48:26.000000000 +0100 ++++ icu/source/test/testdata/conversion.txt 2009-06-02 12:14:20.000000000 +0100 +@@ -495,6 +495,46 @@ + } + { "UTF-16BE", :bin{ 00 }, "", :intvector{}, :int{1}, :int{0}, "truncated", ".", :bin{ 00 } } + { "UTF-16BE", :bin{ d800dc }, "", :intvector{}, :int{1}, :int{0}, "truncated", ".", :bin{ d800dc } } ++ // Verify that mappings that would result in byte values outside 20..7F (for SBCS) ++ // or 21..7E (for DBCS) are not used. ++ // ibm-9005_X110-2007.ucm (ISO 8859-7, <ESC>.F=1b2e46): ++ // <U009F> \x9F |0 (also in ISO 8859-1) ++ // <U0387> \xB7 |1 ++ // windows-949-2000 (KSC_5601, <ESC>$(C=1b242843): ++ // <UC829> \xA0\xA1 |0 ++ // <UD4FE> \xC0\x41 |0 ++ // <UD79D> \xC8\xFE |0 ++ { ++ "JIS8", // =ISO_2022,locale=ja,version=4 ++ "\u009f\u0387\uc829\ud4fe\ud79d", ++ :bin{ 1a1b2e461b4e371a1a1b242843487e1b2842 }, ++ :intvector{ 0,1,1,1,1,1,1,2,3,4,4,4,4,4,4,4,4,4 }, ++ :int{1}, :int{1}, "", "?", "" ++ } ++ // Ticket 5483: ISO 2022 converter incorrectly using fallback mapping ++ // Verify that a roundtrip mapping is used even when a fallback mapping is ++ // available in the current state. ++ // U+FF61 is handled in code ++ // jisx-208.ucm (<ESC>$B=1b2442): ++ // <U30FE> \x21\x34 |0 ++ // <UFF5D> \x21\x51 |0 and ++ // ibm-897_P100-1995.ucm (JIS X 0201, <ESC>(J=1b284a): ++ // <UFF5D> \x7D |1 ++ // ibm-9005_X110-2007.ucm (ISO 8859-7, <ESC>.F=1b2e46): ++ // <U03D5> \xF6 |1 ++ // <U2015> \xAF |0 ++ // <UFF5D> \x7D |1 (not legal for ISO 2022) ++ // windows-949-2000 (KSC_5601, <ESC>$(C=1b242843): ++ // <UAC00> \xB0\xA1 |0 ++ // <UFF5D> \xA3\xFD |0 ++ // <U223C> \xA1\xAD |0 (in extension table) ++ { ++ "JIS8", // =ISO_2022,locale=ja,version=4 ++ "a\uff61\u03d5\uff5d\uac00\u223c\uff5d\u30fe\uff5d", // Make it switch to ISO-8859-7, KSC 5601 and JIS X 0208. ++ :bin{ 61a11b2e461b4e761b244221511b2428433021212d237d1b2442213421511b2842 }, ++ :intvector{ 0,1,2,2,2,2,2,2,3,3,3,3,3,4,4,4,4,4,4,5,5,6,6,7,7,7,7,7,8,8,8,8,8 }, ++ :int{1}, :int{1}, "", "?", "" ++ } + + // e4b8 is a partial sequence + { "UTF-8", :bin{ 31e4ba8ce4b8 }, "1\u4e8c", :intvector{ 0, 1 }, :int{1}, :int{0}, "truncated", ".", :bin{ e4b8 } } |