diff -ru icu.5483/source/common/ucnv2022.c icu/source/common/ucnv2022.c --- icu.5483/source/common/ucnv2022.c 2009-06-02 12:47:41.000000000 +0100 +++ icu/source/common/ucnv2022.c 2009-06-02 13:18:23.000000000 +0100 @@ -473,8 +473,7 @@ if(jpCharsetMasks[version]&CSM(ISO8859_7)) { myConverterData->myConverterArray[ISO8859_7]= ucnv_loadSharedData("ISO8859_7", NULL, errorCode); } - myConverterData->myConverterArray[JISX201] = ucnv_loadSharedData("JISX0201", NULL, errorCode); - myConverterData->myConverterArray[JISX208] = ucnv_loadSharedData("jisx-208", NULL, errorCode); + myConverterData->myConverterArray[JISX208] = ucnv_loadSharedData("Shift-JIS", NULL, errorCode); if(jpCharsetMasks[version]&CSM(JISX212)) { myConverterData->myConverterArray[JISX212] = ucnv_loadSharedData("jisx-212", NULL, errorCode); } @@ -1045,14 +1044,6 @@ length=3; } } - /* - * TODO(markus): Use Shift-JIS table for JIS X 0208, to save mapping table space. - * Pass in parameter for type of output bytes, for validation and shifting: - * - Direct: Pass bytes through, but forbid control codes 00-1F (except SI/SO/ESC) and space 20? - * (Need to allow some (TAB/LF/CR) or most of them for ASCII and maybe JIS X 0201.) - * - A1-FE: Subtract 80 after range check. - * - SJIS: Shift DBCS result to 21-7E x 21-7E. - */ /* is this code point assigned, or do we use fallbacks? */ if((stage2Entry&(1<<(16+(c&0xf))))!=0) { /* assigned */ @@ -1110,6 +1101,23 @@ } } +/* + * Check that the result is a 2-byte value with each byte in the range A1..FE + * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte + * to move it to the ISO 2022 range 21..7E. + * Return 0 if out of range. 
+ */ +static U_INLINE uint32_t +_2022FromGR94DBCS(uint32_t value) { + if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) && + (uint8_t)(value - 0xa1) <= (0xfe - 0xa1) + ) { + return value - 0x8080; /* shift down to 21..7e byte range */ + } else { + return 0; /* not valid for ISO 2022 */ + } +} + #ifdef U_ENABLE_GENERIC_ISO_2022 /********************************************************************************** @@ -1238,7 +1246,7 @@ } else{ cnv->toUBytes[0] =(char) sourceChar; - cnv->toULength = 2; + cnv->toULength = 1; } if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){ @@ -1332,6 +1340,181 @@ 3 /* length of (I HWKANA_7BIT */ }; +/* Map 00..7F to Unicode according to JIS X 0201. */ +static U_INLINE uint32_t +jisx201ToU(uint32_t value) { + if(value < 0x5c) { + return value; + } else if(value == 0x5c) { + return 0xa5; + } else if(value == 0x7e) { + return 0x203e; + } else /* value <= 0x7f */ { + return value; + } +} + +/* Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable. */ +static U_INLINE uint32_t +jisx201FromU(uint32_t value) { + if(value<=0x7f) { + if(value!=0x5c && value!=0x7e) { + return value; + } + } else if(value==0xa5) { + return 0x5c; + } else if(value==0x203e) { + return 0x7e; + } + return 0xfffe; +} + +/* + * Take a valid Shift-JIS byte pair, check that it is in the range corresponding + * to JIS X 0208, and convert it to a pair of 21..7E bytes. + * Return 0 if the byte pair is out of range. 
+ */ +static U_INLINE uint32_t +_2022FromSJIS(uint32_t value) { + uint8_t trail; + + if(value > 0xEFFC) { + return 0; /* beyond JIS X 0208 */ + } + + trail = (uint8_t)value; + + value &= 0xff00; /* lead byte */ + if(value <= 0x9f00) { + value -= 0x7000; + } else /* 0xe000 <= value <= 0xef00 */ { + value -= 0xb000; + } + value <<= 1; + + if(trail <= 0x9e) { + value -= 0x100; + if(trail <= 0x7e) { + value |= trail - 0x1f; + } else { + value |= trail - 0x20; + } + } else /* trail <= 0xfc */ { + value |= trail - 0x7e; + } + return value; +} + +/* + * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS. + * If either byte is outside 21..7E make sure that the result is not valid + * for Shift-JIS so that the converter catches it. + * Some invalid byte values already turn into equally invalid Shift-JIS + * byte values and need not be tested explicitly. + */ +static U_INLINE void +_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) { + if(c1&1) { + ++c1; + if(c2 <= 0x5f) { + c2 += 0x1f; + } else if(c2 <= 0x7e) { + c2 += 0x20; + } else { + c2 = 0; /* invalid */ + } + } else { + if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) { + c2 += 0x7e; + } else { + c2 = 0; /* invalid */ + } + } + c1 >>= 1; + if(c1 <= 0x2f) { + c1 += 0x70; + } else if(c1 <= 0x3f) { + c1 += 0xb0; + } else { + c1 = 0; /* invalid */ + } + bytes[0] = (char)c1; + bytes[1] = (char)c2; +} + +/* + * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS) + * Katakana. + * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks + * because Shift-JIS roundtrips half-width Katakana to single bytes. + * These were the only fallbacks in ICU's jisx-208.ucm file. 
+ */ +static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = { + 0x2123, /* U+FF61 */ + 0x2156, + 0x2157, + 0x2122, + 0x2126, + 0x2572, + 0x2521, + 0x2523, + 0x2525, + 0x2527, + 0x2529, + 0x2563, + 0x2565, + 0x2567, + 0x2543, + 0x213C, /* U+FF70 */ + 0x2522, + 0x2524, + 0x2526, + 0x2528, + 0x252A, + 0x252B, + 0x252D, + 0x252F, + 0x2531, + 0x2533, + 0x2535, + 0x2537, + 0x2539, + 0x253B, + 0x253D, + 0x253F, /* U+FF80 */ + 0x2541, + 0x2544, + 0x2546, + 0x2548, + 0x254A, + 0x254B, + 0x254C, + 0x254D, + 0x254E, + 0x254F, + 0x2552, + 0x2555, + 0x2558, + 0x255B, + 0x255E, + 0x255F, /* U+FF90 */ + 0x2560, + 0x2561, + 0x2562, + 0x2564, + 0x2566, + 0x2568, + 0x2569, + 0x256A, + 0x256B, + 0x256C, + 0x256D, + 0x256F, + 0x2573, + 0x212B, + 0x212C /* U+FF9F */ +}; + /* * The iteration over various code pages works this way: * i) Get the currentState from myConverterData->currentState @@ -1504,7 +1687,7 @@ } break; case HWKANA_7BIT: - if((uint32_t)(HWKANA_END-sourceChar)<=(HWKANA_END-HWKANA_START)) { + if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { if(converterData->version==3) { /* JIS7: use G1 (SO) */ /* Shift U+FF61..U+FF9F to bytes 21..5F. 
*/ @@ -1531,13 +1714,34 @@ break; case JISX201: /* G0 SBCS */ - len2 = MBCS_SINGLE_FROM_UCHAR32( + value = jisx201FromU(sourceChar); + if(value <= 0x7f) { + targetValue = value; + len = 1; + cs = cs0; + g = 0; + useFallback = FALSE; + } + break; + case JISX208: + /* G0 DBCS from Shift-JIS table */ + len2 = MBCS_FROM_UCHAR32_ISO2022( converterData->myConverterArray[cs0], sourceChar, &value, - useFallback); - if(len2 != 0 && !(len2 < 0 && len != 0) && value <= 0x7f) { - targetValue = value; - len = len2; + useFallback, MBCS_OUTPUT_2); + if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ + value = _2022FromSJIS(value); + if(value != 0) { + targetValue = value; + len = len2; + cs = cs0; + g = 0; + useFallback = FALSE; + } + } else if(len == 0 && useFallback && + (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { + targetValue = hwkana_fb[sourceChar - HWKANA_START]; + len = -2; cs = cs0; g = 0; useFallback = FALSE; @@ -1569,17 +1773,10 @@ * Check for valid bytes for the encoding scheme. * This is necessary because the sub-converter (windows-949) * has a broader encoding scheme than is valid for 2022. - * - * Check that the result is a 2-byte value with each byte in the range A1..FE - * (strict EUC-KR DBCS) before accepting it and subtracting 0x80 from each byte - * to move it to the ISO 2022 range 21..7E. 
*/ - if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) && - (uint8_t)(value - 0xa1) <= (0xfe - 0xa1) - ) { - value -= 0x8080; /* shift down to 21..7e byte range */ - } else { - break; /* not valid for ISO 2022 */ + value = _2022FromGR94DBCS(value); + if(value == 0) { + break; } } targetValue = value; @@ -1755,7 +1952,7 @@ static void UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, UErrorCode* err){ - char tempBuf[3]; + char tempBuf[2]; const char *mySource = (char *) args->source; UChar *myTarget = args->target; const char *mySourceLimit = args->sourceLimit; @@ -1893,10 +2090,7 @@ break; case JISX201: if(mySourceChar <= 0x7f) { - targetUniChar = - _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP( - myData->myConverterArray[cs], - mySourceChar); + targetUniChar = jisx201ToU(mySourceChar); } break; case HWKANA_7BIT: @@ -1910,8 +2104,13 @@ if(mySource < mySourceLimit) { char trailByte; getTrailByte: - tempBuf[0] = (char) (mySourceChar); - tempBuf[1] = trailByte = *mySource++; + trailByte = *mySource++; + if(cs == JISX208) { + _2022ToSJIS((uint8_t)mySourceChar, (uint8_t)trailByte, tempBuf); + } else { + tempBuf[0] = (char)mySourceChar; + tempBuf[1] = trailByte; + } mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte); targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE); } else { @@ -3254,6 +3453,9 @@ /* open a set and initialize it with code points that are algorithmically round-tripped */ switch(cnvData->locale[0]){ case 'j': + /* include JIS X 0201 which is hardcoded */ + sa->add(sa->set, 0xa5); + sa->add(sa->set, 0x203e); if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) { /* include Latin-1 for some variants of JP */ sa->addRange(sa->set, 0, 0xff); @@ -3262,6 +3464,11 @@ sa->addRange(sa->set, 0, 0x7f); } if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) { + /* + * TODO(markus): If and when ucnv_getUnicodeSet() supports fallbacks, + * we need to include half-width Katakana for all JP variants 
because + * JIS X 0208 has hardcoded fallbacks for them. + */ /* include half-width Katakana for JP */ sa->addRange(sa->set, HWKANA_START, HWKANA_END); } @@ -3281,15 +3488,7 @@ break; } - /* - * Version-specific for CN: - * CN version 0 does not map CNS planes 3..7 although - * they are all available in the CNS conversion table; - * CN version 1 does map them all. - * The two versions create different Unicode sets. - */ - for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { - if(cnvData->myConverterArray[i]!=NULL) { +#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && cnvData->version==0 && i==CNS_11643 ) { @@ -3299,9 +3498,33 @@ sa, UCNV_ROUNDTRIP_SET, 0, 0x81, 0x82, pErrorCode); + } +#endif + + for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { + UConverterSetFilter filter; + if(cnvData->myConverterArray[i]!=NULL) { + if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && + cnvData->version==0 && i==CNS_11643 + ) { + /* + * Version-specific for CN: + * CN version 0 does not map CNS planes 3..7 although + * they are all available in the CNS conversion table; + * CN version 1 (-EXT) does map them all. + * The two versions create different Unicode sets. + */ + filter=UCNV_SET_FILTER_2022_CN; + } else if(cnvData->locale[0]=='j' && i==JISX208) { + /* + * Only add code points that map to Shift-JIS codes + * corresponding to JIS X 0208. 
+ */ + filter=UCNV_SET_FILTER_SJIS; } else { - ucnv_MBCSGetUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, pErrorCode); + filter=UCNV_SET_FILTER_NONE; } + ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode); } } diff -ru icu.5483/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c --- icu.5483/source/common/ucnvmbcs.c 2009-06-02 12:47:41.000000000 +0100 +++ icu/source/common/ucnvmbcs.c 2009-06-02 12:48:08.000000000 +0100 @@ -340,6 +340,8 @@ /* Miscellaneous ------------------------------------------------------------ */ +#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ + /* similar to ucnv_MBCSGetNextUChar() but recursive */ static void _getUnicodeSetForBytes(const UConverterSharedData *sharedData, @@ -432,11 +434,14 @@ pErrorCode); } +#endif + U_CFUNC void -ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData, - const USetAdder *sa, - UConverterUnicodeSet which, - UErrorCode *pErrorCode) { +ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData, + const USetAdder *sa, + UConverterUnicodeSet which, + UConverterSetFilter filter, + UErrorCode *pErrorCode) { const UConverterMBCSTable *mbcsTable; const uint16_t *table; @@ -490,50 +495,26 @@ c+=1024; /* empty stage 2 block */ } } - } else if(mbcsTable->outputType==MBCS_OUTPUT_DBCS_ONLY) { - /* ignore single-byte results */ + } else { const uint32_t *stage2; - const uint16_t *stage3, *results; - - results=(const uint16_t *)mbcsTable->fromUnicodeBytes; - - for(st1=0; st1<maxStage1; ++st1) { - st2=table[st1]; - if(st2>(maxStage1>>1)) { - stage2=(const uint32_t *)table+st2; - for(st2=0; st2<64; ++st2) { - if((st3=stage2[st2])!=0) { - /* read the stage 3 block */ - stage3=results+16*(uint32_t)(uint16_t)st3; + const uint8_t *stage3, *bytes; + uint32_t st3Multiplier; + uint32_t value; - /* get the roundtrip flags for the stage 3 block */ - st3>>=16; + bytes=mbcsTable->fromUnicodeBytes; 
- /* - * Add code points for which the roundtrip flag is set. - * Once we get a set for fallback mappings, we have to check - * non-roundtrip stage 3 results for whether they are 0. - * See ucnv_MBCSFromUnicodeWithOffsets() for details. - * - * Ignore single-byte results (<0x100). - */ - do { - if((st3&1)!=0 && *stage3>=0x100) { - sa->add(sa->set, c); - } - st3>>=1; - ++stage3; - } while((++c&0xf)!=0); - } else { - c+=16; /* empty stage 3 block */ - } - } - } else { - c+=1024; /* empty stage 2 block */ - } + switch(mbcsTable->outputType) { + case MBCS_OUTPUT_3: + case MBCS_OUTPUT_4_EUC: + st3Multiplier=3; + break; + case MBCS_OUTPUT_4: + st3Multiplier=4; + break; + default: + st3Multiplier=2; + break; } - } else { - const uint32_t *stage2; for(st1=0; st1<maxStage1; ++st1) { st2=table[st1]; @@ -541,6 +522,9 @@ stage2=(const uint32_t *)table+st2; for(st2=0; st2<64; ++st2) { if((st3=stage2[st2])!=0) { + /* read the stage 3 block */ + stage3=bytes+st3Multiplier*16*(uint32_t)(uint16_t)st3; + /* get the roundtrip flags for the stage 3 block */ st3>>=16; @@ -550,12 +534,49 @@ * non-roundtrip stage 3 results for whether they are 0. * See ucnv_MBCSFromUnicodeWithOffsets() for details. */ - do { - if(st3&1) { - sa->add(sa->set, c); - } - st3>>=1; - } while((++c&0xf)!=0); + switch(filter) { + case UCNV_SET_FILTER_NONE: + do { + if(st3&1) { + sa->add(sa->set, c); + } + st3>>=1; + } while((++c&0xf)!=0); + break; + case UCNV_SET_FILTER_DBCS_ONLY: + /* Ignore single-byte results (<0x100). */ + do { + if((st3&1)!=0 && *((const uint16_t *)stage3)>=0x100) { + sa->add(sa->set, c); + } + st3>>=1; + stage3+=2; /* +=st3Multiplier */ + } while((++c&0xf)!=0); + break; + case UCNV_SET_FILTER_2022_CN: + /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */ + do { + if((st3&1)!=0 && ((value=*stage3)==0x81 || value==0x82)) { + sa->add(sa->set, c); + } + st3>>=1; + stage3+=3; /* +=st3Multiplier */ + } while((++c&0xf)!=0); + break; + case UCNV_SET_FILTER_SJIS: + /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. 
*/ + do { + if((st3&1)!=0 && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) { + sa->add(sa->set, c); + } + st3>>=1; + stage3+=2; /* +=st3Multiplier */ + } while((++c&0xf)!=0); + break; + default: + *pErrorCode=U_INTERNAL_PROGRAM_ERROR; + return; + } } else { c+=16; /* empty stage 3 block */ } @@ -569,6 +590,19 @@ ucnv_extGetUnicodeSet(sharedData, sa, which, pErrorCode); } +U_CFUNC void +ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData, + const USetAdder *sa, + UConverterUnicodeSet which, + UErrorCode *pErrorCode) { + ucnv_MBCSGetFilteredUnicodeSetForUnicode( + sharedData, sa, which, + sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ? + UCNV_SET_FILTER_DBCS_ONLY : + UCNV_SET_FILTER_NONE, + pErrorCode); +} + static void ucnv_MBCSGetUnicodeSet(const UConverter *cnv, const USetAdder *sa, diff -ru icu.5483/source/common/ucnvmbcs.h icu/source/common/ucnvmbcs.h --- icu.5483/source/common/ucnvmbcs.h 2009-06-02 12:47:41.000000000 +0100 +++ icu/source/common/ucnvmbcs.h 2009-06-02 12:48:08.000000000 +0100 @@ -363,6 +363,7 @@ ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, UErrorCode *pErrorCode); +#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ /* * Internal function returning a UnicodeSet for toUnicode() conversion. * Currently only used for ISO-2022-CN, and only handles roundtrip mappings. @@ -377,6 +378,7 @@ UConverterUnicodeSet which, uint8_t state, int32_t lowByte, int32_t highByte, UErrorCode *pErrorCode); +#endif /* * Internal function returning a UnicodeSet for toUnicode() conversion. 
@@ -388,9 +390,30 @@ */ U_CFUNC void ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData, - const USetAdder *sa, - UConverterUnicodeSet which, - UErrorCode *pErrorCode); + const USetAdder *sa, + UConverterUnicodeSet which, + UErrorCode *pErrorCode); + +typedef enum UConverterSetFilter { + UCNV_SET_FILTER_NONE, + UCNV_SET_FILTER_DBCS_ONLY, + UCNV_SET_FILTER_2022_CN, + UCNV_SET_FILTER_SJIS, + UCNV_SET_FILTER_COUNT +} UConverterSetFilter; + +/* + * Same as ucnv_MBCSGetUnicodeSetForUnicode() but + * the set can be filtered by encoding scheme. + * Used by stateful converters which share regular conversion tables + * but only use a subset of their mappings. + */ +U_CFUNC void +ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData, + const USetAdder *sa, + UConverterUnicodeSet which, + UConverterSetFilter filter, + UErrorCode *pErrorCode); #endif diff -ru icu.5483/source/test/cintltst/nucnvtst.c icu/source/test/cintltst/nucnvtst.c --- icu.5483/source/test/cintltst/nucnvtst.c 2009-06-02 12:47:25.000000000 +0100 +++ icu/source/test/cintltst/nucnvtst.c 2009-06-02 12:58:02.000000000 +0100 @@ -3202,7 +3202,7 @@ 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004A, 0x000D, 0x000A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, 0x0050, 0x0051, 0x0052, 0x000D, 0x000A, 0x3005, 0x3006, 0x3007, 0x30FC, 0x2015, 0x2010, 0xFF0F, 0x005C, 0x000D, 0x000A, - 0x301C, 0x2016, 0x2026, 0x2025, 0x2018, 0x2019, 0x201C, 0x000D, 0x000A, + 0x3013, 0x2018, 0x2026, 0x2025, 0x2018, 0x2019, 0x201C, 0x000D, 0x000A, 0x201D, 0x3014, 0x000D, 0x000A, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x000D, 0x000A, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x000D, 0x000A, @@ -3730,7 +3730,7 @@ 0x52C8, 0x52CC, 0x52CF, 0x52D1, 0x52D4, 0x52D6, 0x52DB, 0x52DC, 0x000D, 0x000A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, 0x0050, 0x0051, 0x0052, 0x000D, 0x000A, 0x3005, 0x3006, 0x3007, 0x30FC, 0x2015, 0x2010, 0xFF0F, 0x005C, 0x000D, 
0x000A, - 0x301C, 0x2016, 0x2026, 0x2025, 0x2018, 0x2019, 0x201C, 0x000D, 0x000A, + 0x3013, 0x2018, 0x2026, 0x2025, 0x2018, 0x2019, 0x201C, 0x000D, 0x000A, 0x201D, 0x000D, 0x000A, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x000D, 0x000A, 0x4F94, 0x4F97, 0x52BA, 0x52BB, 0x52BD, 0x52C0, 0x52C4, 0x52C6, 0x000D, 0x000A, diff -ru icu.5483/source/test/cintltst/udatatst.c icu/source/test/cintltst/udatatst.c --- icu.5483/source/test/cintltst/udatatst.c 2009-06-02 12:47:25.000000000 +0100 +++ icu/source/test/cintltst/udatatst.c 2009-06-02 13:09:15.000000000 +0100 @@ -1260,6 +1260,11 @@ {"gb18030", "cnv", ucnv_swap}, /* MBCS conversion table file with extension */ {"*test4x", "cnv", ucnv_swap}, + /* + * MBCS conversion table file without extension, + * to test swapping and preflighting of UTF-8-friendly mbcsIndex[]. + */ + {"jisx-212", "cnv", ucnv_swap}, #endif #if !UCONFIG_NO_CONVERSION diff -ru icu.5483/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt --- icu.5483/source/test/testdata/conversion.txt 2009-06-02 12:47:25.000000000 +0100 +++ icu/source/test/testdata/conversion.txt 2009-06-02 12:49:51.000000000 +0100 @@ -48,6 +48,15 @@ toUnicode { Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" } Cases { + // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and + // using the Shift-JIS table for JIS X 0208 (ticket #5797) + { + "ISO-2022-JP", + :bin{ 1b284a7d7e801b2442306c20217f7e21202160217f22202225227f5f211b2842 }, + "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e", + :intvector{ 3,4,5,9,11,13,15,17,19,21,23,25,27 }, + :int{1}, :int{1}, "", "?", :bin{""} + } // improve coverage of unrolled loops in ucnvmbcs.c/ucnv_MBCSSingleToBMPWithOffsets() { "ISO-8859-3", @@ -495,6 +504,15 @@ } { "UTF-16BE", :bin{ 00 }, "", :intvector{}, :int{1}, :int{0}, "truncated", ".", :bin{ 00 } } { "UTF-16BE", :bin{ d800dc }, "", :intvector{}, 
:int{1}, :int{0}, "truncated", ".", :bin{ d800dc } } + // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and + // using the Shift-JIS table for JIS X 0208 (ticket #5797) + { + "ISO-2022-JP", + "\u203e\xa5\u4e00\ufa10\u6f3e\u0391", + :bin{ 1b284a7e5c1b2442306c222e5f2126211b2842 }, + :intvector{ 0,0,0,0,1,2,2,2,2,2,3,3,4,4,5,5,5,5,5 }, + :int{1}, :int{0}, "", "?=\u3013", "" // U+3013 Geta Mark converts to 222e + } // Verify that mappings that would result in byte values outside 20..7F (for SBCS) // or 21..7E (for DBCS) are not used. // ibm-9005_X110-2007.ucm (ISO 8859-7, .F=1b2e46): @@ -1273,13 +1291,13 @@ // versions of ISO-2022-JP { "ISO-2022-JP", - "[\x00-\x0d\x10-\x1a\x1c-\x7f\u0391-\u03a1\uff61-\uff9f\u4e00\u4e01\uffe5]", - "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\uffe6-\U0010ffff]", + "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u203e\uff61-\uff9f\u4e00\u4e01\uffe5]", + "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\ufa0e-\ufa2d\uffe6-\U0010ffff]", :int{0} } { "ISO-2022-JP-2", - "[\x00-\x0d\x10-\x1a\x1c-\u0113\u0385-\u038a\u0390-\u03a1\uff61-\uff9f\u4e00-\u4e05\uffe6]", + "[\x00-\x0d\x10-\x1a\x1c-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\uff61-\uff9f\u4e00-\u4e05\uffe6]", "[\x0e\x0f\x1b\uffe7-\U0010ffff]", :int{0} }