diff -ru icu.orig/source/common/ucnv2022.c icu/source/common/ucnv2022.c --- icu.orig/source/common/ucnv2022.c 2009-06-02 11:48:38.000000000 +0100 +++ icu/source/common/ucnv2022.c 2009-06-02 12:30:29.000000000 +0100 @@ -84,6 +84,26 @@ #define V_TAB 0x0B #define SPACE 0x20 +enum { + HWKANA_START=0xff61, + HWKANA_END=0xff9f +}; + +/* + * 94-character sets with native byte values A1..FE are encoded in ISO 2022 + * as bytes 21..7E. (Subtract 0x80.) + * 96-character sets with native byte values A0..FF are encoded in ISO 2022 + * as bytes 20..7F. (Subtract 0x80.) + * Do not encode C1 control codes with native bytes 80..9F + * as bytes 00..1F (C0 control codes). + */ +enum { + GR94_START=0xa1, + GR94_END=0xfe, + GR96_START=0xa0, + GR96_END=0xff +}; + /* * ISO 2022 control codes must not be converted from Unicode * because they would mess up the byte stream. @@ -981,22 +1001,27 @@ /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c - * any future change in _MBCSFromUChar32() function should be reflected in - * this macro + * any future change in _MBCSFromUChar32() function should be reflected here. + * @return number of bytes in *value; negative number if fallback; 0 if no mapping */ -static U_INLINE void +static U_INLINE int32_t MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData, UChar32 c, uint32_t* value, UBool useFallback, - int32_t *length, int outputType) { const int32_t *cx; const uint16_t *table; uint32_t stage2Entry; uint32_t myValue; + int32_t length; const uint8_t *p; + /* + * TODO(markus): Use and require new, faster MBCS conversion table structures. + * Use internal version of ucnv_open() that verifies that the new structures are available, + * else U_INTERNAL_PROGRAM_ERROR. + */ /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { table=sharedData->mbcs.fromUnicodeTable; @@ -1005,51 +1030,60 @@ if(outputType==MBCS_OUTPUT_2){ myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); if(myValue<=0xff) { - *length=1; + length=1; } else { - *length=2; + length=2; } } else /* outputType==MBCS_OUTPUT_3 */ { p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; if(myValue<=0xff) { - *length=1; + length=1; } else if(myValue<=0xffff) { - *length=2; + length=2; } else { - *length=3; + length=3; } } + /* + * TODO(markus): Use Shift-JIS table for JIS X 0208, to save mapping table space. + * Pass in parameter for type of output bytes, for validation and shifting: + * - Direct: Pass bytes through, but forbid control codes 00-1F (except SI/SO/ESC) and space 20? + * (Need to allow some (TAB/LF/CR) or most of them for ASCII and maybe JIS X 0201.) + * - A1-FE: Subtract 80 after range check. + * - SJIS: Shift DBCS result to 21-7E x 21-7E. + */ /* is this code point assigned, or do we use fallbacks? */ - if( (stage2Entry&(1<<(16+(c&0xf))))!=0 || - (FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) - ) { + if((stage2Entry&(1<<(16+(c&0xf))))!=0) { + /* assigned */ + *value=myValue; + return length; + } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) { /* * We allow a 0 byte output if the "assigned" bit is set for this entry. * There is no way with this data structure for fallback output * to be a zero byte. */ - /* assigned */ *value=myValue; - return; + return -length; } } cx=sharedData->mbcs.extIndexes; if(cx!=NULL) { - *length=ucnv_extSimpleMatchFromU(cx, c, value, useFallback); - return; + return ucnv_extSimpleMatchFromU(cx, c, value, useFallback); } /* unassigned */ - *length=0; + return 0; } /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c - * any future change in _MBCSSingleFromUChar32() function should be reflected in - * this macro + * any future change in _MBCSSingleFromUChar32() function should be reflected here. + * @param retval pointer to output byte + * @return 1 roundtrip byte 0 no mapping -1 fallback byte */ -static U_INLINE void +static U_INLINE int32_t MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData, UChar32 c, uint32_t* retval, @@ -1059,20 +1093,21 @@ int32_t value; /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { - *retval=(uint16_t)-1; - return; + return 0; } /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ table=sharedData->mbcs.fromUnicodeTable; /* get the byte for the output */ value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); /* is this code point assigned, or do we use fallbacks? */ - if(useFallback ? value>=0x800 : value>=0xc00) { - value &=0xff; + *retval=(uint32_t)(value&0xff); + if(value>=0xf00) { + return 1; /* roundtrip */ + } else if(useFallback ? value>=0x800 : value>=0xc00) { + return -1; /* fallback taken */ } else { - value= -1; + return 0; /* no mapping */ } - *retval=(uint16_t) value; } #ifdef U_ENABLE_GENERIC_ISO_2022 @@ -1316,6 +1351,7 @@ static void UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) { + UConverter *cnv = args->converter; UConverterDataISO2022 *converterData; ISO2022State *pFromU2022State; uint8_t *target = (uint8_t *) args->target; @@ -1335,14 +1371,13 @@ int8_t cs, g; /* set up the state */ - converterData = (UConverterDataISO2022*)args->converter->extraInfo; + converterData = (UConverterDataISO2022*)cnv->extraInfo; pFromU2022State = &converterData->fromU2022State; - useFallback = args->converter->useFallback; choiceCount = 0; /* check if the last codepoint of previous buffer was a lead surrogate*/ - if((sourceChar = args->converter->fromUChar32)!=0 && target< targetLimit) { + if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) { goto getTrail; } @@ -1361,26 +1396,26 @@ if(UTF_IS_SECOND_SURROGATE(trail)) { source++; sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail); - args->converter->fromUChar32=0x00; + cnv->fromUChar32=0x00; /* convert this supplementary code point */ /* exit this condition tree */ } else { /* this is an unmatched lead code unit (1st surrogate) */ /* callback(illegal) */ *err=U_ILLEGAL_CHAR_FOUND; - args->converter->fromUChar32=sourceChar; + cnv->fromUChar32=sourceChar; break; } } else { /* no more input */ - args->converter->fromUChar32=sourceChar; + cnv->fromUChar32=sourceChar; break; } } else { /* this is an unmatched trail code unit (2nd surrogate) */ /* callback(illegal) */ *err=U_ILLEGAL_CHAR_FOUND; - args->converter->fromUChar32=sourceChar; + cnv->fromUChar32=sourceChar; break; } } @@ -1389,7 +1424,7 @@ if(IS_2022_CONTROL(sourceChar)) { /* callback(illegal) */ *err=U_ILLEGAL_CHAR_FOUND; - args->converter->fromUChar32=sourceChar; + cnv->fromUChar32=sourceChar; break; } @@ -1407,9 +1442,10 @@ /* JIS7/8: try single-byte half-width Katakana before JISX208 */ if(converterData->version == 3 || converterData->version == 4) { - choices[choiceCount++] = cs = (int8_t)HWKANA_7BIT; - csm &= ~CSM(cs); + choices[choiceCount++] = (int8_t)HWKANA_7BIT; } + /* Do not try single-byte half-width Katakana for other versions. */ + csm &= ~CSM(HWKANA_7BIT); /* try the current G0 charset */ choices[choiceCount++] = cs = pFromU2022State->cs[0]; @@ -1432,86 +1468,134 @@ } cs = g = 0; + /* + * len==0: no mapping found yet + * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks + * len>0: found a roundtrip result, done + */ len = 0; + /* + * We will turn off useFallback after finding a fallback, + * but we still get fallbacks from PUA code points as usual. + * Therefore, we will also need to check that we don't overwrite + * an early fallback with a later one. + */ + useFallback = cnv->useFallback; - for(i = 0; i < choiceCount && len == 0; ++i) { - cs = choices[i]; - switch(cs) { + for(i = 0; i < choiceCount && len <= 0; ++i) { + uint32_t value; + int32_t len2; + int8_t cs0 = choices[i]; + switch(cs0) { case ASCII: if(sourceChar <= 0x7f) { targetValue = (uint32_t)sourceChar; len = 1; + cs = cs0; + g = 0; } break; case ISO8859_1: - if(0x80 <= sourceChar && sourceChar <= 0xff) { + if(GR96_START <= sourceChar && sourceChar <= GR96_END) { targetValue = (uint32_t)sourceChar - 0x80; len = 1; + cs = cs0; g = 2; } break; case HWKANA_7BIT: - if((uint32_t)(0xff9f-sourceChar)<=(0xff9f-0xff61)) { - targetValue = (uint32_t)(sourceChar - (0xff61 - 0x21)); - len = 1; - + if((uint32_t)(HWKANA_END-sourceChar)<=(HWKANA_END-HWKANA_START)) { if(converterData->version==3) { /* JIS7: use G1 (SO) */ - pFromU2022State->cs[1] = cs; /* do not output an escape sequence */ + /* Shift U+FF61..U+FF9F to bytes 21..5F. */ + targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21)); + len = 1; + pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */ g = 1; } else if(converterData->version==4) { /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */ - int8_t cs0; - - targetValue += 0x80; + /* Shift U+FF61..U+FF9F to bytes A1..DF. */ + targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1)); + len = 1; - cs0 = pFromU2022State->cs[0]; - if(IS_JP_DBCS(cs0)) { + cs = pFromU2022State->cs[0]; + if(IS_JP_DBCS(cs)) { /* switch from a DBCS charset to JISX201 */ cs = (int8_t)JISX201; - } else { - /* stay in the current G0 charset */ - cs = cs0; } + /* else stay in the current G0 charset */ + g = 0; } + /* else do not use HWKANA_7BIT with other versions */ } break; case JISX201: /* G0 SBCS */ - MBCS_SINGLE_FROM_UCHAR32( - converterData->myConverterArray[cs], - sourceChar, &targetValue, - useFallback); - if(targetValue <= 0x7f) { - len = 1; + len2 = MBCS_SINGLE_FROM_UCHAR32( + converterData->myConverterArray[cs0], + sourceChar, &value, + useFallback); + if(len2 != 0 && !(len2 < 0 && len != 0) && value <= 0x7f) { + targetValue = value; + len = len2; + cs = cs0; + g = 0; + useFallback = FALSE; } break; case ISO8859_7: /* G0 SBCS forced to 7-bit output */ - MBCS_SINGLE_FROM_UCHAR32( - converterData->myConverterArray[cs], - sourceChar, &targetValue, - useFallback); - if(0x80 <= targetValue && targetValue <= 0xff) { - targetValue -= 0x80; - len = 1; + len2 = MBCS_SINGLE_FROM_UCHAR32( + converterData->myConverterArray[cs0], + sourceChar, &value, + useFallback); + if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) { + targetValue = value - 0x80; + len = len2; + cs = cs0; g = 2; + useFallback = FALSE; } break; default: /* G0 DBCS */ - MBCS_FROM_UCHAR32_ISO2022( - converterData->myConverterArray[cs], - sourceChar, &targetValue, - useFallback, &len, MBCS_OUTPUT_2); - if(len != 2) { - len = 0; + len2 = MBCS_FROM_UCHAR32_ISO2022( + converterData->myConverterArray[cs0], + sourceChar, &value, + useFallback, MBCS_OUTPUT_2); + if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ + if(cs0 == KSC5601) { + /* + * Check for valid bytes for the encoding scheme. + * This is necessary because the sub-converter (windows-949) + * has a broader encoding scheme than is valid for 2022. + * + * Check that the result is a 2-byte value with each byte in the range A1..FE + * (strict EUC-KR DBCS) before accepting it and subtracting 0x80 from each byte + * to move it to the ISO 2022 range 21..7E. + */ + if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) && + (uint8_t)(value - 0xa1) <= (0xfe - 0xa1) + ) { + value -= 0x8080; /* shift down to 21..7e byte range */ + } else { + break; /* not valid for ISO 2022 */ + } + } + targetValue = value; + len = len2; + cs = cs0; + g = 0; + useFallback = FALSE; } break; } } - if(len > 0) { + if(len != 0) { + if(len < 0) { + len = -len; /* fallback */ + } outLen = 0; /* count output bytes */ /* write SI if necessary (only for JIS7) */ @@ -1560,7 +1644,7 @@ * then this is an error */ *err = U_INVALID_CHAR_FOUND; - args->converter->fromUChar32=sourceChar; + cnv->fromUChar32=sourceChar; break; } @@ -1586,7 +1670,7 @@ } } else { fromUWriteUInt8( - args->converter, + cnv, buffer, outLen, &target, (const char *)targetLimit, &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), @@ -1615,7 +1699,7 @@ */ if( U_SUCCESS(*err) && (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) && - args->flush && source>=sourceLimit && args->converter->fromUChar32==0 + args->flush && source>=sourceLimit && cnv->fromUChar32==0 ) { int32_t sourceIndex; @@ -1654,7 +1738,7 @@ } fromUWriteUInt8( - args->converter, + cnv, buffer, outLen, &target, (const char *)targetLimit, &offsets, sourceIndex, @@ -1777,7 +1861,7 @@ !IS_JP_DBCS(cs) ) { /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */ - targetUniChar = mySourceChar + (0xff61 - 0xa1); + targetUniChar = mySourceChar + (HWKANA_START - 0xa1); /* return from a single-shift state to the previous one */ if(pToU2022State->g >= 2) { @@ -1818,7 +1902,7 @@ case HWKANA_7BIT: if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) { /* 7-bit halfwidth Katakana */ - targetUniChar = mySourceChar + (0xff61 - 0x21); + targetUniChar = mySourceChar + (HWKANA_START - 0x21); } break; default: @@ -1965,9 +2049,10 @@ break; } - /* length= ucnv_MBCSFromUChar32(converterData->currentConverter->sharedData, - sourceChar,&targetByteUnit,args->converter->useFallback);*/ - MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,&length,MBCS_OUTPUT_2); + length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2); + if(length < 0) { + length = -length; /* fallback */ + } /* only DBCS or SBCS characters are expected*/ /* DB characters with high bit set to 1 are expected */ if(length > 2 || length==0 ||(((targetByteUnit & 0x8080) != 0x8080)&& length==2)){ @@ -2449,7 +2534,7 @@ static void UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){ - + UConverter *cnv = args->converter; UConverterDataISO2022 *converterData; ISO2022State *pFromU2022State; uint8_t *target = (uint8_t *) args->target; @@ -2466,14 +2551,13 @@ UBool useFallback; /* set up the state */ - converterData = (UConverterDataISO2022*)args->converter->extraInfo; + converterData = (UConverterDataISO2022*)cnv->extraInfo; pFromU2022State = &converterData->fromU2022State; - useFallback = args->converter->useFallback; choiceCount = 0; /* check if the last codepoint of previous buffer was a lead surrogate*/ - if((sourceChar = args->converter->fromUChar32)!=0 && target< targetLimit) { + if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) { goto getTrail; } @@ -2492,26 +2576,26 @@ if(UTF_IS_SECOND_SURROGATE(trail)) { source++; sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail); - args->converter->fromUChar32=0x00; + cnv->fromUChar32=0x00; /* convert this supplementary code point */ /* exit this condition tree */ } else { /* this is an unmatched lead code unit (1st surrogate) */ /* callback(illegal) */ *err=U_ILLEGAL_CHAR_FOUND; - args->converter->fromUChar32=sourceChar; + cnv->fromUChar32=sourceChar; break; } } else { /* no more input */ - args->converter->fromUChar32=sourceChar; + cnv->fromUChar32=sourceChar; break; } } else { /* this is an unmatched trail code unit (2nd surrogate) */ /* callback(illegal) */ *err=U_ILLEGAL_CHAR_FOUND; - args->converter->fromUChar32=sourceChar; + cnv->fromUChar32=sourceChar; break; } } @@ -2522,7 +2606,7 @@ if(IS_2022_CONTROL(sourceChar)) { /* callback(illegal) */ *err=U_ILLEGAL_CHAR_FOUND; - args->converter->fromUChar32=sourceChar; + cnv->fromUChar32=sourceChar; break; } @@ -2545,7 +2629,6 @@ } else{ /* convert U+0080..U+10ffff */ - UConverterSharedData *cnv; int32_t i; int8_t cs, g; @@ -2593,17 +2676,41 @@ } cs = g = 0; + /* + * len==0: no mapping found yet + * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks + * len>0: found a roundtrip result, done + */ len = 0; + /* + * We will turn off useFallback after finding a fallback, + * but we still get fallbacks from PUA code points as usual. + * Therefore, we will also need to check that we don't overwrite + * an early fallback with a later one. + */ + useFallback = cnv->useFallback; - for(i = 0; i < choiceCount && len == 0; ++i) { - cs = choices[i]; - if(cs > 0) { - if(cs > CNS_11643_0) { - cnv = converterData->myConverterArray[CNS_11643]; - MBCS_FROM_UCHAR32_ISO2022(cnv,sourceChar,&targetValue,useFallback,&len,MBCS_OUTPUT_3); - if(len==3) { - cs = (int8_t)(CNS_11643_0 + (targetValue >> 16) - 0x80); - len = 2; + for(i = 0; i < choiceCount && len <= 0; ++i) { + int8_t cs0 = choices[i]; + if(cs0 > 0) { + uint32_t value; + int32_t len2; + if(cs0 > CNS_11643_0) { + len2 = MBCS_FROM_UCHAR32_ISO2022( + converterData->myConverterArray[CNS_11643], + sourceChar, + &value, + useFallback, + MBCS_OUTPUT_3); + if(len2 == 3 || (len2 == -3 && len == 0)) { + targetValue = value; + cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80); + if(len2 >= 0) { + len = 2; + } else { + len = -2; + useFallback = FALSE; + } if(cs == CNS_11643_1) { g = 1; } else if(cs == CNS_11643_2) { @@ -2617,15 +2724,25 @@ } } else { /* GB2312_1 or ISO-IR-165 */ - cnv = converterData->myConverterArray[cs]; - MBCS_FROM_UCHAR32_ISO2022(cnv,sourceChar,&targetValue,useFallback,&len,MBCS_OUTPUT_2); - g = 1; /* used if len == 2 */ + len2 = MBCS_FROM_UCHAR32_ISO2022( + converterData->myConverterArray[cs0], + sourceChar, + &value, + useFallback, + MBCS_OUTPUT_2); + if(len2 == 2 || (len2 == -2 && len == 0)) { + targetValue = value; + len = len2; + cs = cs0; + g = 1; + useFallback = FALSE; + } } } } - if(len > 0) { - len = 0; /* count output bytes; it must have been len == 2 */ + if(len != 0) { + len = 0; /* count output bytes; it must have been abs(len) == 2 */ /* write the designation sequence if necessary */ if(cs != pFromU2022State->cs[g]) { @@ -2670,7 +2787,7 @@ * then this is an error */ *err = U_INVALID_CHAR_FOUND; - args->converter->fromUChar32=sourceChar; + cnv->fromUChar32=sourceChar; break; } } @@ -2691,7 +2808,7 @@ } } else { fromUWriteUInt8( - args->converter, + cnv, buffer, len, &target, (const char *)targetLimit, &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), @@ -2720,7 +2837,7 @@ */ if( U_SUCCESS(*err) && pFromU2022State->g!=0 && - args->flush && source>=sourceLimit && args->converter->fromUChar32==0 + args->flush && source>=sourceLimit && cnv->fromUChar32==0 ) { int32_t sourceIndex; @@ -2748,7 +2865,7 @@ } fromUWriteUInt8( - args->converter, + cnv, SHIFT_IN_STR, 1, &target, (const char *)targetLimit, &offsets, sourceIndex, @@ -3146,7 +3263,7 @@ } if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) { /* include half-width Katakana for JP */ - sa->addRange(sa->set, 0xff61, 0xff9f); + sa->addRange(sa->set, HWKANA_START, HWKANA_END); } break; case 'c': diff -ru icu.orig/source/common/ucnv_ext.c icu/source/common/ucnv_ext.c --- icu.orig/source/common/ucnv_ext.c 2009-06-02 11:48:38.000000000 +0100 +++ icu/source/common/ucnv_ext.c 2009-06-02 12:14:20.000000000 +0100 @@ -551,6 +551,12 @@ return 0; } + /* + * Tests for (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0: + * Do not interpret values with reserved bits used, for forward compatibility, + * and do not even remember intermediate results with reserved bits used. + */ + if(UCNV_EXT_TO_U_IS_PARTIAL(value)) { /* partial match, enter the loop below */ index=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value); @@ -575,7 +581,8 @@ value=*fromUSectionValues++; if( value!=0 && (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || - FROM_U_USE_FALLBACK(useFallback, firstCP)) + FROM_U_USE_FALLBACK(useFallback, firstCP)) && + (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 ) { /* remember longest match so far */ matchValue=value; @@ -613,8 +620,9 @@ /* partial match, continue */ index=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value); } else { - if( UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || - FROM_U_USE_FALLBACK(useFallback, firstCP) + if( (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || + FROM_U_USE_FALLBACK(useFallback, firstCP)) && + (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 ) { /* full match, stop with result */ matchValue=value; @@ -632,8 +640,9 @@ return 0; } } else /* result from firstCP trie lookup */ { - if( UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || - FROM_U_USE_FALLBACK(useFallback, firstCP) + if( (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || + FROM_U_USE_FALLBACK(useFallback, firstCP)) && + (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 ) { /* full match, stop with result */ matchValue=value; @@ -644,20 +653,18 @@ } } - if(matchValue&UCNV_EXT_FROM_U_RESERVED_MASK) { - /* do not interpret values with reserved bits used, for forward compatibility */ - return 0; - } - /* return result */ if(matchValue==UCNV_EXT_FROM_U_SUBCHAR1) { return 1; /* assert matchLength==2 */ } - *pMatchValue=UCNV_EXT_FROM_U_MASK_ROUNDTRIP(matchValue); + *pMatchValue=matchValue; return matchLength; } +/* + * @param value fromUnicode mapping table value; ignores roundtrip and reserved bits + */ static U_INLINE void ucnv_extWriteFromU(UConverter *cnv, const int32_t *cx, uint32_t value, @@ -792,6 +799,10 @@ } } +/* + * Used by ISO 2022 implementation. + * @return number of bytes in *pValue; negative number if fallback; 0 for no mapping + */ U_CFUNC int32_t ucnv_extSimpleMatchFromU(const int32_t *cx, UChar32 cp, uint32_t *pValue, @@ -809,13 +820,15 @@ if(match>=2) { /* write result for simple, single-character conversion */ int32_t length; - + int isRoundtrip; + + isRoundtrip=UCNV_EXT_FROM_U_IS_ROUNDTRIP(value); length=UCNV_EXT_FROM_U_GET_LENGTH(value); value=(uint32_t)UCNV_EXT_FROM_U_GET_DATA(value); if(length<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH) { *pValue=value; - return length; + return isRoundtrip ? length : -length; #if 0 /* not currently used */ } else if(length==4) { /* de-serialize a 4-byte result */ @@ -825,7 +838,7 @@ ((uint32_t)result[1]<<16)| ((uint32_t)result[2]<<8)| result[3]; - return 4; + return isRoundtrip ? 4 : -4; #endif } } diff -ru icu.orig/source/common/ucnv_ext.h icu/source/common/ucnv_ext.h --- icu.orig/source/common/ucnv_ext.h 2009-06-02 11:48:38.000000000 +0100 +++ icu/source/common/ucnv_ext.h 2009-06-02 12:14:20.000000000 +0100 @@ -452,7 +452,7 @@ #define UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) (((value)&UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)!=0) #define UCNV_EXT_FROM_U_MASK_ROUNDTRIP(value) ((value)&~UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) -/* use after masking off the roundtrip flag */ +/* get length; masks away all other bits */ #define UCNV_EXT_FROM_U_GET_LENGTH(value) (int32_t)(((value)>>UCNV_EXT_FROM_U_LENGTH_SHIFT)&UCNV_EXT_MAX_BYTES) /* get bytes or bytes index */ diff -ru icu.orig/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c --- icu.orig/source/common/ucnvmbcs.c 2009-06-02 11:48:38.000000000 +0100 +++ icu/source/common/ucnvmbcs.c 2009-06-02 12:14:20.000000000 +0100 @@ -3785,7 +3785,8 @@ cx=sharedData->mbcs.extIndexes; if(cx!=NULL) { - return ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback); + length=ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback); + return length>=0 ? length : -length; /* return abs(length); */ } /* unassigned */ diff -ru icu.orig/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt --- icu.orig/source/test/testdata/conversion.txt 2009-06-02 11:48:26.000000000 +0100 +++ icu/source/test/testdata/conversion.txt 2009-06-02 12:14:20.000000000 +0100 @@ -495,6 +495,46 @@ } { "UTF-16BE", :bin{ 00 }, "", :intvector{}, :int{1}, :int{0}, "truncated", ".", :bin{ 00 } } { "UTF-16BE", :bin{ d800dc }, "", :intvector{}, :int{1}, :int{0}, "truncated", ".", :bin{ d800dc } } + // Verify that mappings that would result in byte values outside 20..7F (for SBCS) + // or 21..7E (for DBCS) are not used. + // ibm-9005_X110-2007.ucm (ISO 8859-7, .F=1b2e46): + // \x9F |0 (also in ISO 8859-1) + // \xB7 |1 + // windows-949-2000 (KSC_5601, $(C=1b242843): + // \xA0\xA1 |0 + // \xC0\x41 |0 + // \xC8\xFE |0 + { + "JIS8", // =ISO_2022,locale=ja,version=4 + "\u009f\u0387\uc829\ud4fe\ud79d", + :bin{ 1a1b2e461b4e371a1a1b242843487e1b2842 }, + :intvector{ 0,1,1,1,1,1,1,2,3,4,4,4,4,4,4,4,4,4 }, + :int{1}, :int{1}, "", "?", "" + } + // Ticket 5483: ISO 2022 converter incorrectly using fallback mapping + // Verify that a roundtrip mapping is used even when a fallback mapping is + // available in the current state. + // U+FF61 is handled in code + // jisx-208.ucm ($B=1b2442): + // \x21\x34 |0 + // \x21\x51 |0 and + // ibm-897_P100-1995.ucm (JIS X 0201, (J=1b284a): + // \x7D |1 + // ibm-9005_X110-2007.ucm (ISO 8859-7, .F=1b2e46): + // \xF6 |1 + // \xAF |0 + // \x7D |1 (not legal for ISO 2022) + // windows-949-2000 (KSC_5601, $(C=1b242843): + // \xB0\xA1 |0 + // \xA3\xFD |0 + // \xA1\xAD |0 (in extension table) + { + "JIS8", // =ISO_2022,locale=ja,version=4 + "a\uff61\u03d5\uff5d\uac00\u223c\uff5d\u30fe\uff5d", // Make it switch to ISO-8859-7, KSC 5601 and JIS X 0208. + :bin{ 61a11b2e461b4e761b244221511b2428433021212d237d1b2442213421511b2842 }, + :intvector{ 0,1,2,2,2,2,2,2,3,3,3,3,3,4,4,4,4,4,4,5,5,6,6,7,7,7,7,7,8,8,8,8,8 }, + :int{1}, :int{1}, "", "?", "" + } // e4b8 is a partial sequence { "UTF-8", :bin{ 31e4ba8ce4b8 }, "1\u4e8c", :intvector{ 0, 1 }, :int{1}, :int{0}, "truncated", ".", :bin{ e4b8 } }