From 6deac027c98f5d99e1805f9ddc21ff2dbebe0fb7 Mon Sep 17 00:00:00 2001 From: Remi Collet Date: Wed, 20 Mar 2013 10:29:29 +0100 Subject: compat-icu36: new package (for EL-5) --- icu.icu5691.backport.patch | 730 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 730 insertions(+) create mode 100644 icu.icu5691.backport.patch (limited to 'icu.icu5691.backport.patch') diff --git a/icu.icu5691.backport.patch b/icu.icu5691.backport.patch new file mode 100644 index 0000000..906ecd3 --- /dev/null +++ b/icu.icu5691.backport.patch @@ -0,0 +1,730 @@ +diff -ru icu.6175/source/common/ucnv2022.c icu/source/common/ucnv2022.c +--- icu.6175/source/common/ucnv2022.c 2009-06-02 15:47:31.000000000 +0100 ++++ icu/source/common/ucnv2022.c 2009-06-02 16:03:15.000000000 +0100 +@@ -754,6 +754,7 @@ + UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo); + uint32_t key = myData2022->key; + int32_t offset = 0; ++ int8_t initialToULength = _this->toULength; + char c; + + value = VALID_NON_TERMINAL_2022; +@@ -806,7 +807,6 @@ + return; + } else if (value == INVALID_2022 ) { + *err = U_ILLEGAL_ESCAPE_SEQUENCE; +- return; + } else /* value == VALID_TERMINAL_2022 */ { + switch(var){ + #ifdef U_ENABLE_GENERIC_ISO_2022 +@@ -938,6 +938,35 @@ + } + if(U_SUCCESS(*err)) { + _this->toULength = 0; ++ } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) { ++ if(_this->toULength>1) { ++ /* ++ * Ticket 5691: consistent illegal sequences: ++ * - We include at least the first byte (ESC) in the illegal sequence. ++ * - If any of the non-initial bytes could be the start of a character, ++ * we stop the illegal sequence before the first one of those. ++ * In escape sequences, all following bytes are "printable", that is, ++ * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS), ++ * they are valid single/lead bytes. ++ * For simplicity, we always only report the initial ESC byte as the ++ * illegal sequence and back out all other bytes we looked at. ++ */ ++ /* Back out some bytes. */ ++ int8_t backOutDistance=_this->toULength-1; ++ int8_t bytesFromThisBuffer=_this->toULength-initialToULength; ++ if(backOutDistance<=bytesFromThisBuffer) { ++ /* same as initialToULength<=1 */ ++ *source-=backOutDistance; ++ } else { ++ /* Back out bytes from the previous buffer: Need to replay them. */ ++ _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance); ++ /* same as -(initialToULength-1) */ ++ /* preToULength is negative! */ ++ uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength); ++ *source-=bytesFromThisBuffer; ++ } ++ _this->toULength=1; ++ } + } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) { + _this->toUCallbackReason = UCNV_UNASSIGNED; + } +@@ -1973,6 +2002,7 @@ + mySourceChar = args->converter->toUBytes[0]; + args->converter->toULength = 0; + cs = (StateEnum)pToU2022State->cs[pToU2022State->g]; ++ targetUniChar = missingCharMarker; + goto getTrailByte; + } + +@@ -2102,17 +2132,44 @@ + default: + /* G0 DBCS */ + if(mySource < mySourceLimit) { +- char trailByte; ++ int leadIsOk, trailIsOk; ++ uint8_t trailByte; + getTrailByte: +- trailByte = *mySource++; +- if(cs == JISX208) { +- _2022ToSJIS((uint8_t)mySourceChar, (uint8_t)trailByte, tempBuf); +- } else { +- tempBuf[0] = (char)mySourceChar; +- tempBuf[1] = trailByte; ++ trailByte = (uint8_t)*mySource; ++ /* ++ * Ticket 5691: consistent illegal sequences: ++ * - We include at least the first byte in the illegal sequence. ++ * - If any of the non-initial bytes could be the start of a character, ++ * we stop the illegal sequence before the first one of those. ++ * ++ * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is ++ * an ESC/SO/SI, we report only the first byte as the illegal sequence. ++ * Otherwise we convert or report the pair of bytes. ++ */ ++ leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); ++ trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); ++ if (leadIsOk && trailIsOk) { ++ ++mySource; ++ uint32_t tmpSourceChar = (mySourceChar << 8) | trailByte; ++ if(cs == JISX208) { ++ _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf); ++ mySourceChar = tmpSourceChar; ++ } else { ++ /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */ ++ mySourceChar = tmpSourceChar; ++ if (cs == KSC5601) { ++ tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */ ++ } ++ tempBuf[0] = (char)(tmpSourceChar >> 8); ++ tempBuf[1] = (char)(tmpSourceChar); ++ } ++ targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE); ++ } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { ++ /* report a pair of illegal bytes if the second byte is not a DBCS starter */ ++ ++mySource; ++ /* add another bit so that the code below writes 2 bytes in case of error */ ++ mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; + } +- mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte); +- targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE); + } else { + args->converter->toUBytes[0] = (uint8_t)mySourceChar; + args->converter->toULength = 1; +@@ -2254,7 +2311,12 @@ + } + /* only DBCS or SBCS characters are expected*/ + /* DB characters with high bit set to 1 are expected */ +- if(length > 2 || length==0 ||(((targetByteUnit & 0x8080) != 0x8080)&& length==2)){ ++ if( length > 2 || length==0 || ++ (length == 1 && targetByteUnit > 0x7f) || ++ (length == 2 && ++ ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) || ++ (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1))) ++ ) { + targetByteUnit=missingCharMarker; + } + if (targetByteUnit != missingCharMarker){ +@@ -2583,17 +2645,34 @@ + myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */ + if(myData->toU2022State.g == 1) { + if(mySource < mySourceLimit) { +- char trailByte; ++ int leadIsOk, trailIsOk; ++ uint8_t trailByte; + getTrailByte: +- trailByte = *mySource++; +- tempBuf[0] = (char)(mySourceChar + 0x80); +- tempBuf[1] = (char)(trailByte + 0x80); +- mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte); +- if((mySourceChar & 0x8080) == 0) { ++ targetUniChar = missingCharMarker; ++ trailByte = (uint8_t)*mySource; ++ /* ++ * Ticket 5691: consistent illegal sequences: ++ * - We include at least the first byte in the illegal sequence. ++ * - If any of the non-initial bytes could be the start of a character, ++ * we stop the illegal sequence before the first one of those. ++ * ++ * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is ++ * an ESC/SO/SI, we report only the first byte as the illegal sequence. ++ * Otherwise we convert or report the pair of bytes. ++ */ ++ leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); ++ trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); ++ if (leadIsOk && trailIsOk) { ++ ++mySource; ++ tempBuf[0] = (char)(mySourceChar + 0x80); ++ tempBuf[1] = (char)(trailByte + 0x80); + targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback); +- } else { +- /* illegal bytes > 0x7f */ +- targetUniChar = missingCharMarker; ++ mySourceChar = (mySourceChar << 8) | trailByte; ++ } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { ++ /* report a pair of illegal bytes if the second byte is not a DBCS starter */ ++ ++mySource; ++ /* add another bit so that the code below writes 2 bytes in case of error */ ++ mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; + } + } else { + args->converter->toUBytes[0] = (uint8_t)mySourceChar; +@@ -2601,8 +2680,10 @@ + break; + } + } +- else{ ++ else if(mySourceChar <= 0x7f) { + targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback); ++ } else { ++ targetUniChar = 0xffff; + } + if(targetUniChar < 0xfffe){ + if(args->offsets) { +@@ -3099,6 +3180,7 @@ + /* continue with a partial double-byte character */ + mySourceChar = args->converter->toUBytes[0]; + args->converter->toULength = 0; ++ targetUniChar = missingCharMarker; + goto getTrailByte; + } + +@@ -3178,29 +3260,50 @@ + UConverterSharedData *cnv; + StateEnum tempState; + int32_t tempBufLen; +- char trailByte; ++ int leadIsOk, trailIsOk; ++ uint8_t trailByte; + getTrailByte: +- trailByte = *mySource++; +- tempState = (StateEnum)pToU2022State->cs[pToU2022State->g]; +- if(tempState > CNS_11643_0) { +- cnv = myData->myConverterArray[CNS_11643]; +- tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0)); +- tempBuf[1] = (char) (mySourceChar); +- tempBuf[2] = trailByte; +- tempBufLen = 3; +- +- }else{ +- cnv = myData->myConverterArray[tempState]; +- tempBuf[0] = (char) (mySourceChar); +- tempBuf[1] = trailByte; +- tempBufLen = 2; ++ trailByte = (uint8_t)*mySource; ++ /* ++ * Ticket 5691: consistent illegal sequences: ++ * - We include at least the first byte in the illegal sequence. ++ * - If any of the non-initial bytes could be the start of a character, ++ * we stop the illegal sequence before the first one of those. ++ * ++ * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is ++ * an ESC/SO/SI, we report only the first byte as the illegal sequence. ++ * Otherwise we convert or report the pair of bytes. ++ */ ++ leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); ++ trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); ++ if (leadIsOk && trailIsOk) { ++ ++mySource; ++ tempState = (StateEnum)pToU2022State->cs[pToU2022State->g]; ++ if(tempState >= CNS_11643_0) { ++ cnv = myData->myConverterArray[CNS_11643]; ++ tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0)); ++ tempBuf[1] = (char) (mySourceChar); ++ tempBuf[2] = (char) trailByte; ++ tempBufLen = 3; ++ ++ }else{ ++ cnv = myData->myConverterArray[tempState]; ++ tempBuf[0] = (char) (mySourceChar); ++ tempBuf[1] = (char) trailByte; ++ tempBufLen = 2; ++ } ++ targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE); ++ mySourceChar = (mySourceChar << 8) | trailByte; ++ } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { ++ /* report a pair of illegal bytes if the second byte is not a DBCS starter */ ++ ++mySource; ++ /* add another bit so that the code below writes 2 bytes in case of error */ ++ mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; + } +- mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte); + if(pToU2022State->g>=2) { + /* return from a single-shift state to the previous one */ + pToU2022State->g=pToU2022State->prevG; + } +- targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE); + } else { + args->converter->toUBytes[0] = (uint8_t)mySourceChar; + args->converter->toULength = 1; +diff -ru icu.6175/source/common/ucnvhz.c icu/source/common/ucnvhz.c +--- icu.6175/source/common/ucnvhz.c 2009-06-02 15:47:31.000000000 +0100 ++++ icu/source/common/ucnvhz.c 2009-06-02 15:57:18.000000000 +0100 +@@ -196,10 +196,30 @@ + /* if the first byte is equal to TILDE and the trail byte + * is not a valid byte then it is an error condition + */ +- mySourceChar = 0x7e00 | mySourceChar; +- targetUniChar = 0xffff; ++ /* ++ * Ticket 5691: consistent illegal sequences: ++ * - We include at least the first byte in the illegal sequence. ++ * - If any of the non-initial bytes could be the start of a character, ++ * we stop the illegal sequence before the first one of those. ++ */ + myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */ +- break; ++ *err = U_ILLEGAL_ESCAPE_SEQUENCE; ++ args->converter->toUBytes[0] = UCNV_TILDE; ++ if( myData->isStateDBCS ? ++ (0x21 <= mySourceChar && mySourceChar <= 0x7e) : ++ mySourceChar <= 0x7f ++ ) { ++ /* The current byte could be the start of a character: Back it out. */ ++ args->converter->toULength = 1; ++ --mySource; ++ } else { ++ /* Include the current byte in the illegal sequence. */ ++ args->converter->toUBytes[1] = mySourceChar; ++ args->converter->toULength = 2; ++ } ++ args->target = myTarget; ++ args->source = mySource; ++ return; + } + } else if(myData->isStateDBCS) { + if(args->converter->toUnicodeStatus == 0x00){ +@@ -215,19 +235,36 @@ + } + else{ + /* trail byte */ ++ int leadIsOk, trailIsOk; + uint32_t leadByte = args->converter->toUnicodeStatus & 0xff; +- if( (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21) && +- (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21) +- ) { ++ targetUniChar = 0xffff; ++ /* ++ * Ticket 5691: consistent illegal sequences: ++ * - We include at least the first byte in the illegal sequence. ++ * - If any of the non-initial bytes could be the start of a character, ++ * we stop the illegal sequence before the first one of those. ++ * ++ * In HZ DBCS, if the second byte is in the 21..7e range, ++ * we report only the first byte as the illegal sequence. ++ * Otherwise we convert or report the pair of bytes. ++ */ ++ leadIsOk = (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21); ++ trailIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); ++ if (leadIsOk && trailIsOk) { + tempBuf[0] = (char) (leadByte+0x80) ; + tempBuf[1] = (char) (mySourceChar+0x80); + targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData, + tempBuf, 2, args->converter->useFallback); ++ mySourceChar= (leadByte << 8) | mySourceChar; ++ } else if (trailIsOk) { ++ /* report a single illegal byte and continue with the following DBCS starter byte */ ++ --mySource; ++ mySourceChar = (int32_t)leadByte; + } else { +- targetUniChar = 0xffff; ++ /* report a pair of illegal bytes if the second byte is not a DBCS starter */ ++ /* add another bit so that the code below writes 2 bytes in case of error */ ++ mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar; + } +- /* add another bit so that the code below writes 2 bytes in case of error */ +- mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar; + args->converter->toUnicodeStatus =0x00; + } + } +diff -ru icu.6175/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c +--- icu.6175/source/common/ucnvmbcs.c 2009-06-02 15:47:31.000000000 +0100 ++++ icu/source/common/ucnvmbcs.c 2009-06-02 15:56:07.000000000 +0100 +@@ -1697,6 +1697,65 @@ + pArgs->offsets=offsets; + } + ++static UBool ++hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) { ++ const int32_t *row=stateTable[state]; ++ int32_t b, entry; ++ /* First test for final entries in this state for some commonly valid byte values. */ ++ entry=row[0xa1]; ++ if( !MBCS_ENTRY_IS_TRANSITION(entry) && ++ MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL ++ ) { ++ return TRUE; ++ } ++ entry=row[0x41]; ++ if( !MBCS_ENTRY_IS_TRANSITION(entry) && ++ MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL ++ ) { ++ return TRUE; ++ } ++ /* Then test for final entries in this state. */ ++ for(b=0; b<=0xff; ++b) { ++ entry=row[b]; ++ if( !MBCS_ENTRY_IS_TRANSITION(entry) && ++ MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL ++ ) { ++ return TRUE; ++ } ++ } ++ /* Then recurse for transition entries. */ ++ for(b=0; b<=0xff; ++b) { ++ entry=row[b]; ++ if( MBCS_ENTRY_IS_TRANSITION(entry) && ++ hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry)) ++ ) { ++ return TRUE; ++ } ++ } ++ return FALSE; ++} ++ ++/* ++ * Is byte b a single/lead byte in this state? ++ * Recurse for transition states, because here we don't want to say that ++ * b is a lead byte if all byte sequences that start with b are illegal. ++ */ ++static UBool ++isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnly, uint8_t b) { ++ const int32_t *row=stateTable[state]; ++ int32_t entry=row[b]; ++ if(MBCS_ENTRY_IS_TRANSITION(entry)) { /* lead byte */ ++ return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry)); ++ } else { ++ uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); ++ if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) { ++ return FALSE; /* SI/SO are illegal for DBCS-only conversion */ ++ } else { ++ return action!=MBCS_STATE_ILLEGAL; ++ } ++ } ++} ++ + U_CFUNC void + ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, + UErrorCode *pErrorCode) { +@@ -2052,6 +2111,34 @@ + sourceIndex=nextSourceIndex; + } else if(U_FAILURE(*pErrorCode)) { + /* callback(illegal) */ ++ if(byteIndex>1) { ++ /* ++ * Ticket 5691: consistent illegal sequences: ++ * - We include at least the first byte in the illegal sequence. ++ * - If any of the non-initial bytes could be the start of a character, ++ * we stop the illegal sequence before the first one of those. ++ */ ++ UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0); ++ int8_t i; ++ for(i=1; ++ isource); ++ byteIndex=i; /* length of reported illegal byte sequence */ ++ if(backOutDistance<=bytesFromThisBuffer) { ++ source-=backOutDistance; ++ } else { ++ /* Back out bytes from the previous buffer: Need to replay them. */ ++ cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance); ++ /* preToULength is negative! */ ++ uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength); ++ source=(const uint8_t *)pArgs->source; ++ } ++ } ++ } + break; + } else /* unassigned sequences indicated with byteIndex>0 */ { + /* try an extension mapping */ +@@ -2062,7 +2149,7 @@ + &offsets, sourceIndex, + pArgs->flush, + pErrorCode); +- sourceIndex=nextSourceIndex+(int32_t)(source-(const uint8_t *)pArgs->source); ++ sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArgs->source); + + if(U_FAILURE(*pErrorCode)) { + /* not mappable or buffer overflow */ +@@ -2353,15 +2440,37 @@ + + if(c<0) { + if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSourcetoUBytes; + cnv->toULength=(int8_t)(source-lastSource); + do { + *bytes++=*lastSource++; + } while(lastSourcesharedData->mbcs.dbcsOnlyState!=0); ++ uint8_t *bytes=cnv->toUBytes; ++ *bytes++=*lastSource++; /* first byte */ ++ if(lastSource==source) { ++ cnv->toULength=1; ++ } else /* lastSourcetoULength=i; ++ source=lastSource; ++ } + } else { + /* no output because of empty input or only state changes */ + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; +diff -ru icu.6175/source/test/cintltst/nccbtst.c icu/source/test/cintltst/nccbtst.c +--- icu.6175/source/test/cintltst/nccbtst.c 2009-06-02 15:47:18.000000000 +0100 ++++ icu/source/test/cintltst/nccbtst.c 2009-06-02 15:47:38.000000000 +0100 +@@ -2497,13 +2497,13 @@ + + + static const uint8_t text943[] = { +- 0x82, 0xa9, 0x82, 0x20, /*0xc8,*/ 0x61, 0x8a, 0xbf, 0x8e, 0x9a }; +- static const UChar toUnicode943sub[] = { 0x304b, 0xfffd, /*0xff88,*/ 0x0061, 0x6f22, 0x5b57}; +- static const UChar toUnicode943skip[]= { 0x304b, /*0xff88,*/ 0x0061, 0x6f22, 0x5b57}; ++ 0x82, 0xa9, 0x82, 0x20, 0x61, 0x8a, 0xbf, 0x8e, 0x9a }; ++ static const UChar toUnicode943sub[] = { 0x304b, 0x1a, 0x20, 0x0061, 0x6f22, 0x5b57 }; ++ static const UChar toUnicode943skip[]= { 0x304b, 0x20, 0x0061, 0x6f22, 0x5b57 }; + static const UChar toUnicode943stop[]= { 0x304b}; + +- static const int32_t fromIBM943Offssub[] = {0, 2, 4, 5, 7}; +- static const int32_t fromIBM943Offsskip[] = { 0, 4, 5, 7}; ++ static const int32_t fromIBM943Offssub[] = { 0, 2, 3, 4, 5, 7 }; ++ static const int32_t fromIBM943Offsskip[] = { 0, 3, 4, 5, 7 }; + static const int32_t fromIBM943Offsstop[] = { 0}; + + gInBufferSize = inputsize; +@@ -2537,9 +2537,9 @@ + { + static const uint8_t sampleText[] = { + 0x82, 0xa9, 0x61, 0x62, 0x63 , 0x82, +- 0xff, /*0x82, 0xa9,*/ 0x32, 0x33}; +- static const UChar toUnicode943sub[] = {0x304b, 0x0061, 0x0062, 0x0063, 0xfffd,/*0x304b,*/ 0x0032, 0x0033}; +- static const int32_t fromIBM943Offssub[] = {0, 2, 3, 4, 5, 7, 8}; ++ 0xff, 0x32, 0x33}; ++ static const UChar toUnicode943sub[] = { 0x304b, 0x0061, 0x0062, 0x0063, 0x1a, 0x1a, 0x0032, 0x0033 }; ++ static const int32_t fromIBM943Offssub[] = { 0, 2, 3, 4, 5, 6, 7, 8 }; + /*checking illegal value for ibm-943 with substitute*/ + gInBufferSize = inputsize; + gOutBufferSize = outputsize; +diff -ru icu.6175/source/test/cintltst/nucnvtst.c icu/source/test/cintltst/nucnvtst.c +--- icu.6175/source/test/cintltst/nucnvtst.c 2009-06-02 15:47:18.000000000 +0100 ++++ icu/source/test/cintltst/nucnvtst.c 2009-06-02 15:47:38.000000000 +0100 +@@ -2606,7 +2606,7 @@ + TestNextUCharError(cnv, source, source, U_INDEX_OUTOFBOUNDS_ERROR, "sourceLimit <= source"); + /*Test for the condition where there is an invalid character*/ + { +- static const uint8_t source2[]={0xa1, 0x01}; ++ static const uint8_t source2[]={0xa1, 0x80}; + TestNextUCharError(cnv, (const char*)source2, (const char*)source2+sizeof(source2), U_ZERO_ERROR, "an invalid character"); + } + /*Test for the condition where we have a truncated char*/ +@@ -3899,11 +3899,11 @@ + TestISO_2022_KR() { + /* test input */ + static const uint16_t in[]={ +- 0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F66,0x9F67,0x9F6A,0x000A,0x000D +- ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xAC02,0xAC04 ++ 0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F67,0x9F6A,0x000A,0x000D ++ ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xAC04 + ,0xAC07,0xAC08,0xAC09,0x0025,0x0026,0x0027,0x000A,0x000D,0x0028,0x0029 + ,0x002A,0x002B,0x002C,0x002D,0x002E,0x53C3,0x53C8,0x53C9,0x53CA,0x53CB +- ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53DF,0x53E1,0x53E2 ++ ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53E1,0x53E2 + ,0x53E3,0x53E4,0x000A,0x000D}; + const UChar* uSource; + const UChar* uSourceLimit; +diff -ru icu.6175/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt +--- icu.6175/source/test/testdata/conversion.txt 2009-06-02 15:47:18.000000000 +0100 ++++ icu/source/test/testdata/conversion.txt 2009-06-02 15:57:41.000000000 +0100 +@@ -48,12 +48,144 @@ + toUnicode { + Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" } + Cases { ++ // Test ticket 5691: consistent illegal sequences ++ // The following test cases are for illegal character byte sequences. ++ // ++ // Unfortunately, we cannot use the Shift-JIS examples from the ticket ++ // comments because our Shift-JIS table is Windows-compatible and ++ // therefore has no illegal single bytes. Same for GBK. ++ // Instead, we use the stricter GB 18030 also for 2-byte examples. ++ // The byte sequences are generally slightly different from the ticket ++ // comment, simply using assigned characters rather than just ++ // theoretically valid sequences. ++ { ++ "gb18030", ++ :bin{ 618140813c81ff7a }, ++ "a\u4e02\\x81<\\x81\\xFFz", ++ :intvector{ 0,1,3,3,3,3,4,5,5,5,5,5,5,5,5,7 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "EUC-JP", ++ :bin{ 618fb0a98fb03c8f3cb0a97a }, ++ "a\u4e28\\x8F\\xB0<\\x8F<\u9022z", ++ :intvector{ 0,1,4,4,4,4,5,5,5,5,6,7,7,7,7,8,9,11 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "gb18030", ++ :bin{ 618130fc318130fc8181303c3e813cfc817a }, ++ "a\u05ed\\x810\u9f07\\x810<>\\x81<\u9f07z", ++ :intvector{ 0,1,5,5,5,5,6,7,9,9,9,9,10,11,12,13,13,13,13,14,15,17 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "UTF-8", ++ :bin{ 61f1808182f180813cf18081fff180ff3cf1ff3c3e7a }, ++ "a\U00040042\\xF1\\x80\\x81<\\xF1\\x80\\x81\\xFF\\xF1\\x80\\xFF<\\xF1\\xFF<>z", ++ :intvector{ 0,1,1,5,5,5,5,5,5,5,5,5,5,5,5,8,9,9,9,9,9,9,9,9,9,9,9,9,12,12,12,12,13,13,13,13,13,13,13,13,15,15,15,15,16,17,17,17,17,18,18,18,18,19,20,21 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "ISO-2022-JP", ++ :bin{ 1b24424141af4142affe41431b2842 }, ++ "\u758f\\xAF\u758e\\xAF\\xFE\u790e", ++ :intvector{ 3,5,5,5,5,6,8,8,8,8,8,8,8,8,10 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "ibm-25546", ++ :bin{ 411b242943420e4141af4142affe41430f5a }, ++ "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ", ++ :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "ISO-2022-KR", ++ :bin{ 411b242943420e4141af4142affe41430f5a }, ++ "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ", ++ :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "ISO-2022-CN", ++ :bin{ 411b242941420e4141af4142affe41430f5a }, ++ "AB\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z", ++ :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "HZ", ++ :bin{ 417e7b4141af4142affe41437e7d5a }, ++ "A\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z", ++ :intvector{ 0,3,5,5,5,5,6,8,8,8,8,8,8,8,8,10,14 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ // Test ticket 5691: consistent illegal sequences ++ // The following test cases are for illegal escape/designator/shift sequences. ++ // ++ // ISO-2022-JP and -CN with illegal escape sequences. ++ { ++ "ISO-2022-JP", ++ :bin{ 611b24201b244241411b283f1b28427a }, ++ "a\\x1B$ \u758f\\x1B\u2538z", ++ :intvector{ 0,1,1,1,1,2,3,7,9,9,9,9,10,15 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "ISO-2022-CN", ++ :bin{ 611b2429201b2429410e41410f7a }, ++ "a\\x1B$) \u4eaez", ++ :intvector{ 0,1,1,1,1,2,3,4,10,13 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ // Test ticket 5691: ISO-2022-JP-2 with illegal single-shift SS2 and SS3 sequences. ++ // The first ESC N comes before its designator sequence, the last sequence is ESC+space. ++ { ++ "ISO-2022-JP-2", ++ :bin{ 4e1b4e4e1b2e414e1b4e4e4e1b204e }, ++ "N\\x1BNNN\xceN\\x1B N", ++ :intvector{ 0,1,1,1,1,2,3,7,10,11,12,12,12,12,13,14 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "ISO-2022-CN-EXT", ++ :bin{ 4e1b4e4e1b242a484e1b4e4e4e4e1b204e }, ++ "N\\x1BNNN\u8f0eN\\x1B N", ++ :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "ISO-2022-CN-EXT", ++ :bin{ 4f1b4f4f1b242b494f1b4f4f4f4f1b204f }, ++ "O\\x1BOOO\u492bO\\x1B O", ++ :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ // Test ticket 5691: HZ with illegal tilde sequences. ++ { ++ "HZ", ++ :bin{ 417e20427e21437e80447e7b41417e207e41427e7f41437e7d5a }, ++ "A\\x7E B\\x7E!C\\x7E\\x80D\u4eae\\x7E\\x20\\x7E\u8c05\\x7E\\x7F\u64a9Z", ++ :intvector{ 0,1,1,1,1,2,3,4,4,4,4,5,6,7,7,7,7,7,7,7,7,9, // SBCS ++ 12,14,14,14,14,14,14,14,14,16,16,16,16,17,19,19,19,19,19,19,19,19,21, // DBCS ++ 25 }, // SBCS ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ // Test ticket 5691: Example from Peter Edberg. ++ { ++ "ISO-2022-JP", ++ :bin{ 1b244230212f7e742630801b284a621b2458631b2842648061 }, ++ "\u4e9c\ufffd\u7199\ufffdb\ufffd$Xcd\ufffda", ++ :intvector{ 3,5,7,9,14,15,16,17,18,22,23,24 }, ++ :int{1}, :int{0}, "", "?", :bin{""} ++ } + // test that HZ limits its byte values to lead bytes 21..7d and trail bytes 21..7e + { + "HZ", +- :bin{ 7e7b21212120217e217f772100007e217e7d207e7e807e0a2b }, +- "\u3000\ufffd\u3013\ufffd\u9ccc\ufffd\ufffd ~\ufffd+", +- :intvector{ 2,4,6,8,10,12,14,18,19,21,24 }, ++ :bin{ 7e7b21212120217e217f772100007e217e7e7d207e7e807e0a2b }, ++ "\u3000\ufffd\u3013\ufffd\u9ccc\ufffd\ufffd\u3013 ~\ufffd+", ++ :intvector{ 2,4,6,8,10,12,14,15,19,20,22,25 }, + :int{1}, :int{1}, "", "?", :bin{""} + } + // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and +@@ -61,8 +193,8 @@ + { + "ISO-2022-JP", + :bin{ 1b284a7d7e801b2442306c20217f7e21202160217f22202225227f5f211b2842 }, +- "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e", +- :intvector{ 3,4,5,9,11,13,15,17,19,21,23,25,27 }, ++ "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e", ++ :intvector{ 3,4,5,9,11,12,14,16,17,19,21,23,25,27 }, + :int{1}, :int{1}, "", "?", :bin{""} + } + // improve coverage of unrolled loops in ucnvmbcs.c/ucnv_MBCSSingleToBMPWithOffsets() +@@ -341,7 +473,7 @@ + { + "ISO-2022-CN-EXT", + :bin{ 411b4e2121 }, "\x41", :intvector{ 0 }, +- :int{1}, :int{1}, "illesc", ".", :bin{ 1b4e } ++ :int{1}, :int{1}, "illesc", ".", :bin{ 1b } + } + // G3 designator: recognized, but not supported for -CN (only for -CN-EXT) + { -- cgit