compat-icu36: new package (for EL-5) HEAD master

author: Remi Collet <fedora@famillecollet.com> 2013-03-20 10:29:29 +0100
committer: Remi Collet <fedora@famillecollet.com> 2013-03-20 10:29:29 +0100
commit: 6deac027c98f5d99e1805f9ddc21ff2dbebe0fb7 (patch)
tree: 008990c48199f2d517fc9b1a4b47c6b162ec30ef /icu.icu5691.backport.patch
1 files changed, 730 insertions, 0 deletions
diff --git a/icu.icu5691.backport.patch b/icu.icu5691.backport.patch
new file mode 100644
index 0000000..906ecd3
--- /dev/null
+++ b/icu.icu5691.backport.patch
@@ -0,0 +1,730 @@
+diff -ru icu.6175/source/common/ucnv2022.c icu/source/common/ucnv2022.c
+--- icu.6175/source/common/ucnv2022.c	2009-06-02 15:47:31.000000000 +0100
++++ icu/source/common/ucnv2022.c	2009-06-02 16:03:15.000000000 +0100
+@@ -754,6 +754,7 @@
+     UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
+     uint32_t key = myData2022->key;
+     int32_t offset = 0;
++    int8_t initialToULength = _this->toULength;
+     char c;
+ 
+     value = VALID_NON_TERMINAL_2022;
+@@ -806,7 +807,6 @@
+         return;
+     } else if (value == INVALID_2022 ) {
+         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
+-        return;
+     } else /* value == VALID_TERMINAL_2022 */ {
+         switch(var){
+ #ifdef U_ENABLE_GENERIC_ISO_2022
+@@ -938,6 +938,35 @@
+     }
+     if(U_SUCCESS(*err)) {
+         _this->toULength = 0;
++    } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
++        if(_this->toULength>1) {
++            /*
++             * Ticket 5691: consistent illegal sequences:
++             * - We include at least the first byte (ESC) in the illegal sequence.
++             * - If any of the non-initial bytes could be the start of a character,
++             *   we stop the illegal sequence before the first one of those.
++             *   In escape sequences, all following bytes are "printable", that is,
++             *   unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
++             *   they are valid single/lead bytes.
++             *   For simplicity, we always only report the initial ESC byte as the
++             *   illegal sequence and back out all other bytes we looked at.
++             */
++            /* Back out some bytes. */
++            int8_t backOutDistance=_this->toULength-1;
++            int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
++            if(backOutDistance<=bytesFromThisBuffer) {
++                /* same as initialToULength<=1 */
++                *source-=backOutDistance;
++            } else {
++                /* Back out bytes from the previous buffer: Need to replay them. */
++                _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
++                /* same as -(initialToULength-1) */
++                /* preToULength is negative! */
++                uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
++                *source-=bytesFromThisBuffer;
++            }
++            _this->toULength=1;
++        }
+     } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
+         _this->toUCallbackReason = UCNV_UNASSIGNED;
+     }
+@@ -1973,6 +2002,7 @@
+         mySourceChar = args->converter->toUBytes[0];
+         args->converter->toULength = 0;
+         cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
++        targetUniChar = missingCharMarker;
+         goto getTrailByte;
+     }
+ 
+@@ -2102,17 +2132,44 @@
+                 default:
+                     /* G0 DBCS */
+                     if(mySource < mySourceLimit) {
+-                        char trailByte;
++                        int leadIsOk, trailIsOk;
++                        uint8_t trailByte;
+ getTrailByte:
+-                        trailByte = *mySource++;
+-                        if(cs == JISX208) {
+-                            _2022ToSJIS((uint8_t)mySourceChar, (uint8_t)trailByte, tempBuf);
+-                        } else {
+-                            tempBuf[0] = (char)mySourceChar;
+-                            tempBuf[1] = trailByte;
++                        trailByte = (uint8_t)*mySource;
++                        /*
++                         * Ticket 5691: consistent illegal sequences:
++                         * - We include at least the first byte in the illegal sequence.
++                         * - If any of the non-initial bytes could be the start of a character,
++                         *   we stop the illegal sequence before the first one of those.
++                         *
++                         * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
++                         * an ESC/SO/SI, we report only the first byte as the illegal sequence.
++                         * Otherwise we convert or report the pair of bytes.
++                         */
++                        leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
++                        trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
++                        if (leadIsOk && trailIsOk) {
++                            uint32_t tmpSourceChar = (mySourceChar << 8) | trailByte;  /* declared at block start: C89 forbids declarations after statements */
++                            ++mySource;
++                            if(cs == JISX208) {
++                                _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
++                                mySourceChar = tmpSourceChar;
++                            } else {
++                                /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
++                                mySourceChar = tmpSourceChar;
++                                if (cs == KSC5601) {
++                                    tmpSourceChar += 0x8080;  /* = _2022ToGR94DBCS(tmpSourceChar) */
++                                }
++                                tempBuf[0] = (char)(tmpSourceChar >> 8);
++                                tempBuf[1] = (char)(tmpSourceChar);
++                            }
++                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
++                        } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
++                            /* report a pair of illegal bytes if the second byte is not a DBCS starter */
++                            ++mySource;
++                            /* add another bit so that the code below writes 2 bytes in case of error */
++                            mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
+                         }
+-                        mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
+-                        targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
+                     } else {
+                         args->converter->toUBytes[0] = (uint8_t)mySourceChar;
+                         args->converter->toULength = 1;
+@@ -2254,7 +2311,12 @@
+             }
+             /* only DBCS or SBCS characters are expected*/
+             /* DB characters with high bit set to 1 are expected */
+-            if(length > 2 || length==0 ||(((targetByteUnit & 0x8080) != 0x8080)&& length==2)){
++            if( length > 2 || length==0 ||
++                (length == 1 && targetByteUnit > 0x7f) ||
++                (length == 2 &&
++                    ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
++                    (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
++            ) {
+                 targetByteUnit=missingCharMarker;
+             }
+             if (targetByteUnit != missingCharMarker){
+@@ -2583,17 +2645,34 @@
+             myData->isEmptySegment = FALSE;	/* Any invalid char errors will be detected separately, so just reset this */
+             if(myData->toU2022State.g == 1) {
+                 if(mySource < mySourceLimit) {
+-                    char trailByte;
++                    int leadIsOk, trailIsOk;
++                    uint8_t trailByte;
+ getTrailByte:
+-                    trailByte = *mySource++;
+-                    tempBuf[0] = (char)(mySourceChar + 0x80);
+-                    tempBuf[1] = (char)(trailByte + 0x80);
+-                    mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
+-                    if((mySourceChar & 0x8080) == 0) {
++                    targetUniChar = missingCharMarker;
++                    trailByte = (uint8_t)*mySource;
++                    /*
++                     * Ticket 5691: consistent illegal sequences:
++                     * - We include at least the first byte in the illegal sequence.
++                     * - If any of the non-initial bytes could be the start of a character,
++                     *   we stop the illegal sequence before the first one of those.
++                     *
++                     * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
++                     * an ESC/SO/SI, we report only the first byte as the illegal sequence.
++                     * Otherwise we convert or report the pair of bytes.
++                     */
++                    leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
++                    trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
++                    if (leadIsOk && trailIsOk) {
++                        ++mySource;
++                        tempBuf[0] = (char)(mySourceChar + 0x80);
++                        tempBuf[1] = (char)(trailByte + 0x80);
+                         targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
+-                    } else {
+-                        /* illegal bytes > 0x7f */
+-                        targetUniChar = missingCharMarker;
++                        mySourceChar = (mySourceChar << 8) | trailByte;
++                    } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
++                        /* report a pair of illegal bytes if the second byte is not a DBCS starter */
++                        ++mySource;
++                        /* add another bit so that the code below writes 2 bytes in case of error */
++                        mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
+                     }
+                 } else {
+                     args->converter->toUBytes[0] = (uint8_t)mySourceChar;
+@@ -2601,8 +2680,10 @@
+                     break;
+                 }
+             }
+-            else{
++            else if(mySourceChar <= 0x7f) {
+                 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
++            } else {
++                targetUniChar = 0xffff;
+             }
+             if(targetUniChar < 0xfffe){
+                 if(args->offsets) {
+@@ -3099,6 +3180,7 @@
+         /* continue with a partial double-byte character */
+         mySourceChar = args->converter->toUBytes[0];
+         args->converter->toULength = 0;
++        targetUniChar = missingCharMarker;
+         goto getTrailByte;
+     }
+ 
+@@ -3178,29 +3260,50 @@
+                         UConverterSharedData *cnv;
+                         StateEnum tempState;
+                         int32_t tempBufLen;
+-                        char trailByte;
++                        int leadIsOk, trailIsOk;
++                        uint8_t trailByte;
+ getTrailByte:
+-                        trailByte = *mySource++;
+-                        tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
+-                        if(tempState > CNS_11643_0) {
+-                            cnv = myData->myConverterArray[CNS_11643];
+-                            tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
+-                            tempBuf[1] = (char) (mySourceChar);
+-                            tempBuf[2] = trailByte;
+-                            tempBufLen = 3;
+-
+-                        }else{
+-                            cnv = myData->myConverterArray[tempState];
+-                            tempBuf[0] = (char) (mySourceChar);
+-                            tempBuf[1] = trailByte;
+-                            tempBufLen = 2;
++                        trailByte = (uint8_t)*mySource;
++                        /*
++                         * Ticket 5691: consistent illegal sequences:
++                         * - We include at least the first byte in the illegal sequence.
++                         * - If any of the non-initial bytes could be the start of a character,
++                         *   we stop the illegal sequence before the first one of those.
++                         *
++                         * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
++                         * an ESC/SO/SI, we report only the first byte as the illegal sequence.
++                         * Otherwise we convert or report the pair of bytes.
++                         */
++                        leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
++                        trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
++                        if (leadIsOk && trailIsOk) {
++                            ++mySource;
++                            tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
++                            if(tempState >= CNS_11643_0) {
++                                cnv = myData->myConverterArray[CNS_11643];
++                                tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
++                                tempBuf[1] = (char) (mySourceChar);
++                                tempBuf[2] = (char) trailByte;
++                                tempBufLen = 3;
++
++                            }else{
++                                cnv = myData->myConverterArray[tempState];
++                                tempBuf[0] = (char) (mySourceChar);
++                                tempBuf[1] = (char) trailByte;
++                                tempBufLen = 2;
++                            }
++                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
++                            mySourceChar = (mySourceChar << 8) | trailByte;
++                        } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
++                            /* report a pair of illegal bytes if the second byte is not a DBCS starter */
++                            ++mySource;
++                            /* add another bit so that the code below writes 2 bytes in case of error */
++                            mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
+                         }
+-                        mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
+                         if(pToU2022State->g>=2) {
+                             /* return from a single-shift state to the previous one */
+                             pToU2022State->g=pToU2022State->prevG;
+                         }
+-                        targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
+                     } else {
+                         args->converter->toUBytes[0] = (uint8_t)mySourceChar;
+                         args->converter->toULength = 1;
+diff -ru icu.6175/source/common/ucnvhz.c icu/source/common/ucnvhz.c
+--- icu.6175/source/common/ucnvhz.c	2009-06-02 15:47:31.000000000 +0100
++++ icu/source/common/ucnvhz.c	2009-06-02 15:57:18.000000000 +0100
+@@ -196,10 +196,30 @@
+                      /* if the first byte is equal to TILDE and the trail byte
+                      * is not a valid byte then it is an error condition
+                      */
+-                    mySourceChar = 0x7e00 | mySourceChar;
+-                    targetUniChar = 0xffff;
++                    /*
++                     * Ticket 5691: consistent illegal sequences:
++                     * - We include at least the first byte in the illegal sequence.
++                     * - If any of the non-initial bytes could be the start of a character,
++                     *   we stop the illegal sequence before the first one of those.
++                     */
+                     myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */
+-                    break;
++                    *err = U_ILLEGAL_ESCAPE_SEQUENCE;
++                    args->converter->toUBytes[0] = UCNV_TILDE;
++                    if( myData->isStateDBCS ?
++                            (0x21 <= mySourceChar && mySourceChar <= 0x7e) :
++                            mySourceChar <= 0x7f
++                    ) {
++                        /* The current byte could be the start of a character: Back it out. */
++                        args->converter->toULength = 1;
++                        --mySource;
++                    } else {
++                        /* Include the current byte in the illegal sequence. */
++                        args->converter->toUBytes[1] = mySourceChar;
++                        args->converter->toULength = 2;
++                    }
++                    args->target = myTarget;
++                    args->source = mySource;
++                    return;
+                 }
+             } else if(myData->isStateDBCS) {
+                 if(args->converter->toUnicodeStatus == 0x00){
+@@ -215,19 +235,36 @@
+                 }
+                 else{
+                     /* trail byte */
++                    int leadIsOk, trailIsOk;
+                     uint32_t leadByte = args->converter->toUnicodeStatus & 0xff;
+-                    if( (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21) &&
+-                        (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21)
+-                    ) {
++                    targetUniChar = 0xffff;
++                    /*
++                     * Ticket 5691: consistent illegal sequences:
++                     * - We include at least the first byte in the illegal sequence.
++                     * - If any of the non-initial bytes could be the start of a character,
++                     *   we stop the illegal sequence before the first one of those.
++                     *
++                     * In HZ DBCS, if the second byte is in the 21..7e range,
++                     * we report only the first byte as the illegal sequence.
++                     * Otherwise we convert or report the pair of bytes.
++                     */
++                    leadIsOk = (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21);
++                    trailIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
++                    if (leadIsOk && trailIsOk) {
+                         tempBuf[0] = (char) (leadByte+0x80) ;
+                         tempBuf[1] = (char) (mySourceChar+0x80);
+                         targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
+                             tempBuf, 2, args->converter->useFallback);
++                        mySourceChar= (leadByte << 8) | mySourceChar;
++                    } else if (trailIsOk) {
++                        /* report a single illegal byte and continue with the following DBCS starter byte */
++                        --mySource;
++                        mySourceChar = (int32_t)leadByte;
+                     } else {
+-                        targetUniChar = 0xffff;
++                        /* report a pair of illegal bytes if the second byte is not a DBCS starter */
++                        /* add another bit so that the code below writes 2 bytes in case of error */
++                        mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar;
+                     }
+-                    /* add another bit so that the code below writes 2 bytes in case of error */
+-                    mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar;
+                     args->converter->toUnicodeStatus =0x00;
+                 }
+             }
+diff -ru icu.6175/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c
+--- icu.6175/source/common/ucnvmbcs.c	2009-06-02 15:47:31.000000000 +0100
++++ icu/source/common/ucnvmbcs.c	2009-06-02 15:56:07.000000000 +0100
+@@ -1697,6 +1697,65 @@
+     pArgs->offsets=offsets;
+ }
+ 
++static UBool
++hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) {
++    const int32_t *row=stateTable[state];   /* the 256 entries of this state, indexed by byte value */
++    int32_t b, entry;
++    /* Returns TRUE if some byte sequence starting in this state reaches a final entry that is not MBCS_STATE_ILLEGAL. */
++    entry=row[0xa1];   /* quick probes first: 0xa1 and 0x41 are commonly valid byte values */
++    if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
++        MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
++    ) {
++        return TRUE;
++    }
++    entry=row[0x41];
++    if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
++        MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
++    ) {
++        return TRUE;
++    }
++    /* Then scan all 256 byte values for a final entry in this state that is not illegal. */
++    for(b=0; b<=0xff; ++b) {
++        entry=row[b];
++        if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
++            MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
++        ) {
++            return TRUE;
++        }
++    }
++    /* Then recurse for transition entries. NOTE(review): assumes the table has no transition cycles -- confirm. */
++    for(b=0; b<=0xff; ++b) {
++        entry=row[b];
++        if( MBCS_ENTRY_IS_TRANSITION(entry) &&
++            hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry))
++        ) {
++            return TRUE;
++        }
++    }
++    return FALSE;   /* no legal continuation in this state or in any state reached through its transitions */
++}
++
++/*
++ * Is byte b a single/lead byte in this state? Returns TRUE for a final entry unless its action is
++ * MBCS_STATE_ILLEGAL (or MBCS_STATE_CHANGE_ONLY while isDBCSOnly is set). Recurses for transition
++ * entries, because b should not count as a lead byte if all byte sequences that start with b are illegal.
++ */
++static UBool
++isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnly, uint8_t b) {
++    const int32_t *row=stateTable[state];
++    int32_t entry=row[b];
++    if(MBCS_ENTRY_IS_TRANSITION(entry)) {   /* lead byte */
++        return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry));
++    } else {
++        uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
++        if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) {
++            return FALSE;   /* SI/SO are illegal for DBCS-only conversion */
++        } else {
++            return action!=MBCS_STATE_ILLEGAL;   /* any other final action makes b a usable single byte */
++        }
++    }
++}
++
+ U_CFUNC void
+ ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
+                           UErrorCode *pErrorCode) {
+@@ -2052,6 +2111,34 @@
+             sourceIndex=nextSourceIndex;
+         } else if(U_FAILURE(*pErrorCode)) {
+             /* callback(illegal) */
++            if(byteIndex>1) {
++                /*
++                 * Ticket 5691: consistent illegal sequences:
++                 * - We include at least the first byte in the illegal sequence.
++                 * - If any of the non-initial bytes could be the start of a character,
++                 *   we stop the illegal sequence before the first one of those.
++                 */
++                UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
++                int8_t i;
++                for(i=1;
++                    i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, bytes[i]);
++                    ++i) {}
++                if(i<byteIndex) {
++                    /* Back out some bytes. */
++                    int8_t backOutDistance=byteIndex-i;
++                    int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source);
++                    byteIndex=i;  /* length of reported illegal byte sequence */
++                    if(backOutDistance<=bytesFromThisBuffer) {
++                        source-=backOutDistance;
++                    } else {
++                        /* Back out bytes from the previous buffer: Need to replay them. */
++                        cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
++                        /* preToULength is negative! */
++                        uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength);
++                        source=(const uint8_t *)pArgs->source;
++                    }
++                }
++            }
+             break;
+         } else /* unassigned sequences indicated with byteIndex>0 */ {
+             /* try an extension mapping */
+@@ -2062,7 +2149,7 @@
+                               &offsets, sourceIndex,
+                               pArgs->flush,
+                               pErrorCode);
+-            sourceIndex=nextSourceIndex+(int32_t)(source-(const uint8_t *)pArgs->source);
++            sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArgs->source);
+ 
+             if(U_FAILURE(*pErrorCode)) {
+                 /* not mappable or buffer overflow */
+@@ -2353,15 +2440,37 @@
+ 
+     if(c<0) {
+         if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) {
+-            *pErrorCode=U_TRUNCATED_CHAR_FOUND;
+-        }
+-        if(U_FAILURE(*pErrorCode)) {
+             /* incomplete character byte sequence */
+             uint8_t *bytes=cnv->toUBytes;
+             cnv->toULength=(int8_t)(source-lastSource);
+             do {
+                 *bytes++=*lastSource++;
+             } while(lastSource<source);
++            *pErrorCode=U_TRUNCATED_CHAR_FOUND;
++        } else if(U_FAILURE(*pErrorCode)) {
++            /* callback(illegal) */
++            /*
++             * Ticket 5691: consistent illegal sequences:
++             * - We include at least the first byte in the illegal sequence.
++             * - If any of the non-initial bytes could be the start of a character,
++             *   we stop the illegal sequence before the first one of those.
++             */
++            UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
++            uint8_t *bytes=cnv->toUBytes;
++            *bytes++=*lastSource++;     /* first byte */
++            if(lastSource==source) {
++                cnv->toULength=1;
++            } else /* lastSource<source: multi-byte character */ {
++                int8_t i;
++                for(i=1;
++                    lastSource<source && !isSingleOrLead(stateTable, state, isDBCSOnly, *lastSource);
++                    ++i
++                ) {
++                    *bytes++=*lastSource++;
++                }
++                cnv->toULength=i;
++                source=lastSource;
++            }
+         } else {
+             /* no output because of empty input or only state changes */
+             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+diff -ru icu.6175/source/test/cintltst/nccbtst.c icu/source/test/cintltst/nccbtst.c
+--- icu.6175/source/test/cintltst/nccbtst.c	2009-06-02 15:47:18.000000000 +0100
++++ icu/source/test/cintltst/nccbtst.c	2009-06-02 15:47:38.000000000 +0100
+@@ -2497,13 +2497,13 @@
+ 
+ 
+     static const uint8_t text943[] = {
+-        0x82, 0xa9, 0x82, 0x20, /*0xc8,*/  0x61, 0x8a, 0xbf, 0x8e, 0x9a };
+-    static const UChar toUnicode943sub[] = { 0x304b, 0xfffd, /*0xff88,*/ 0x0061, 0x6f22,  0x5b57};
+-    static const UChar toUnicode943skip[]= { 0x304b, /*0xff88,*/ 0x0061, 0x6f22,  0x5b57};
++        0x82, 0xa9, 0x82, 0x20, 0x61, 0x8a, 0xbf, 0x8e, 0x9a };
++    static const UChar toUnicode943sub[] = { 0x304b, 0x1a, 0x20, 0x0061, 0x6f22,  0x5b57 };
++    static const UChar toUnicode943skip[]= { 0x304b, 0x20, 0x0061, 0x6f22,  0x5b57 };
+     static const UChar toUnicode943stop[]= { 0x304b};
+ 
+-    static const int32_t  fromIBM943Offssub[]  = {0, 2, 4, 5, 7};
+-    static const int32_t  fromIBM943Offsskip[] = { 0, 4, 5, 7};
++    static const int32_t  fromIBM943Offssub[]  = { 0, 2, 3, 4, 5, 7 };
++    static const int32_t  fromIBM943Offsskip[] = { 0, 3, 4, 5, 7 };
+     static const int32_t  fromIBM943Offsstop[] = { 0};
+ 
+     gInBufferSize = inputsize;
+@@ -2537,9 +2537,9 @@
+ {
+     static const uint8_t sampleText[] = {
+         0x82, 0xa9, 0x61, 0x62, 0x63 , 0x82,
+-        0xff, /*0x82, 0xa9,*/ 0x32, 0x33};
+-    static const UChar toUnicode943sub[] = {0x304b, 0x0061, 0x0062, 0x0063,  0xfffd,/*0x304b,*/ 0x0032, 0x0033};
+-    static const int32_t  fromIBM943Offssub[]  = {0, 2, 3, 4, 5, 7, 8};
++        0xff, 0x32, 0x33};
++    static const UChar toUnicode943sub[] = { 0x304b, 0x0061, 0x0062, 0x0063, 0x1a, 0x1a, 0x0032, 0x0033 };
++    static const int32_t fromIBM943Offssub[] = { 0, 2, 3, 4, 5, 6, 7, 8 };
+     /*checking illegal value for ibm-943 with substitute*/ 
+     gInBufferSize = inputsize;
+     gOutBufferSize = outputsize;
+diff -ru icu.6175/source/test/cintltst/nucnvtst.c icu/source/test/cintltst/nucnvtst.c
+--- icu.6175/source/test/cintltst/nucnvtst.c	2009-06-02 15:47:18.000000000 +0100
++++ icu/source/test/cintltst/nucnvtst.c	2009-06-02 15:47:38.000000000 +0100
+@@ -2606,7 +2606,7 @@
+     TestNextUCharError(cnv, source, source, U_INDEX_OUTOFBOUNDS_ERROR, "sourceLimit <= source");
+     /*Test for the condition where there is an invalid character*/
+     {
+-        static const uint8_t source2[]={0xa1, 0x01};
++        static const uint8_t source2[]={0xa1, 0x80};
+         TestNextUCharError(cnv, (const char*)source2, (const char*)source2+sizeof(source2), U_ZERO_ERROR, "an invalid character");
+     }
+     /*Test for the condition where we have a truncated char*/
+@@ -3899,11 +3899,11 @@
+ TestISO_2022_KR() {
+     /* test input */
+     static const uint16_t in[]={
+-                    0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F66,0x9F67,0x9F6A,0x000A,0x000D
+-                   ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xAC02,0xAC04
++                    0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F67,0x9F6A,0x000A,0x000D
++                   ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xAC04
+                    ,0xAC07,0xAC08,0xAC09,0x0025,0x0026,0x0027,0x000A,0x000D,0x0028,0x0029
+                    ,0x002A,0x002B,0x002C,0x002D,0x002E,0x53C3,0x53C8,0x53C9,0x53CA,0x53CB
+-                   ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53DF,0x53E1,0x53E2
++                   ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53E1,0x53E2
+                    ,0x53E3,0x53E4,0x000A,0x000D};
+     const UChar* uSource;
+     const UChar* uSourceLimit;
+diff -ru icu.6175/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt
+--- icu.6175/source/test/testdata/conversion.txt	2009-06-02 15:47:18.000000000 +0100
++++ icu/source/test/testdata/conversion.txt	2009-06-02 15:57:41.000000000 +0100
+@@ -48,12 +48,144 @@
+     toUnicode {
+       Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" }
+       Cases {
++        // Test ticket 5691: consistent illegal sequences
++        // The following test cases are for illegal character byte sequences.
++        //
++        // Unfortunately, we cannot use the Shift-JIS examples from the ticket
++        // comments because our Shift-JIS table is Windows-compatible and
++        // therefore has no illegal single bytes. Same for GBK.
++        // Instead, we use the stricter GB 18030 also for 2-byte examples.
++        // The byte sequences are generally slightly different from the ticket
++        // comment, simply using assigned characters rather than just
++        // theoretically valid sequences.
++        {
++          "gb18030",
++          :bin{ 618140813c81ff7a },
++          "a\u4e02\\x81<\\x81\\xFFz",
++          :intvector{ 0,1,3,3,3,3,4,5,5,5,5,5,5,5,5,7 },
++          :int{1}, :int{0}, "", "&C", :bin{""}
++        }
++        {
++          "EUC-JP",
++          :bin{ 618fb0a98fb03c8f3cb0a97a },
++          "a\u4e28\\x8F\\xB0<\\x8F<\u9022z",
++          :intvector{ 0,1,4,4,4,4,5,5,5,5,6,7,7,7,7,8,9,11 },
++          :int{1}, :int{0}, "", "&C", :bin{""}
++        }
++        {
++          "gb18030",
++          :bin{ 618130fc318130fc8181303c3e813cfc817a },
++          "a\u05ed\\x810\u9f07\\x810<>\\x81<\u9f07z",
++          :intvector{ 0,1,5,5,5,5,6,7,9,9,9,9,10,11,12,13,13,13,13,14,15,17 },
++          :int{1}, :int{0}, "", "&C", :bin{""}
++        }
++        {
++          "UTF-8",
++          :bin{ 61f1808182f180813cf18081fff180ff3cf1ff3c3e7a },
++          "a\U00040042\\xF1\\x80\\x81<\\xF1\\x80\\x81\\xFF\\xF1\\x80\\xFF<\\xF1\\xFF<>z",
++          :intvector{ 0,1,1,5,5,5,5,5,5,5,5,5,5,5,5,8,9,9,9,9,9,9,9,9,9,9,9,9,12,12,12,12,13,13,13,13,13,13,13,13,15,15,15,15,16,17,17,17,17,18,18,18,18,19,20,21 },
++          :int{1}, :int{0}, "", "&C", :bin{""}
++        }
++        {
++          "ISO-2022-JP",
++          :bin{ 1b24424141af4142affe41431b2842 },
++          "\u758f\\xAF\u758e\\xAF\\xFE\u790e",
++          :intvector{ 3,5,5,5,5,6,8,8,8,8,8,8,8,8,10 },
++          :int{1}, :int{0}, "", "&C", :bin{""}
++        }
++        {
++          "ibm-25546",
++          :bin{ 411b242943420e4141af4142affe41430f5a },
++          "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ",
++          :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },
++          :int{1}, :int{0}, "", "&C", :bin{""}
++        }
++        {
++          "ISO-2022-KR",
++          :bin{ 411b242943420e4141af4142affe41430f5a },
++          "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ",
++          :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },
++          :int{1}, :int{0}, "", "&C", :bin{""}
++        }
++        {
++          "ISO-2022-CN",
++          :bin{ 411b242941420e4141af4142affe41430f5a },
++          "AB\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z",
++          :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },
++          :int{1}, :int{0}, "", "&C", :bin{""}
++        }
++        {
++          "HZ",
++          :bin{ 417e7b4141af4142affe41437e7d5a },
++          "A\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z",
++          :intvector{ 0,3,5,5,5,5,6,8,8,8,8,8,8,8,8,10,14 },
++          :int{1}, :int{0}, "", "&C", :bin{""}
++        }
++        // Test ticket 5691: consistent illegal sequences
++        // The following test cases are for illegal escape/designator/shift sequences.
++        //
++        // ISO-2022-JP and -CN with illegal escape sequences.
++        {
++          "ISO-2022-JP",
++          :bin{ 611b24201b244241411b283f1b28427a },
++          "a\\x1B$ \u758f\\x1B\u2538z",
++          :intvector{ 0,1,1,1,1,2,3,7,9,9,9,9,10,15 },
++          :int{1}, :int{0}, "", "&C", :bin{""}
++        }
++        {
++          "ISO-2022-CN",
++          :bin{ 611b2429201b2429410e41410f7a },
++          "a\\x1B$) \u4eaez",
++          :intvector{ 0,1,1,1,1,2,3,4,10,13 },
++          :int{1}, :int{0}, "", "&C", :bin{""}
++        }
++        // Test ticket 5691: ISO-2022-JP-2 with illegal single-shift SS2 and SS3 sequences.
++        // The first ESC N comes before its designator sequence, the last sequence is ESC+space.
++        {
++          "ISO-2022-JP-2",
++          :bin{ 4e1b4e4e1b2e414e1b4e4e4e1b204e },
++          "N\\x1BNNN\xceN\\x1B N",
++          :intvector{ 0,1,1,1,1,2,3,7,10,11,12,12,12,12,13,14 },
++          :int{1}, :int{0}, "", "&C", :bin{""}
++        }
++        {
++          "ISO-2022-CN-EXT",
++          :bin{ 4e1b4e4e1b242a484e1b4e4e4e4e1b204e },
++          "N\\x1BNNN\u8f0eN\\x1B N",
++          :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 },
++          :int{1}, :int{0}, "", "&C", :bin{""}
++        }
++        {
++          "ISO-2022-CN-EXT",
++          :bin{ 4f1b4f4f1b242b494f1b4f4f4f4f1b204f },
++          "O\\x1BOOO\u492bO\\x1B O",
++          :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 },
++          :int{1}, :int{0}, "", "&C", :bin{""}
++        }
++        // Test ticket 5691: HZ with illegal tilde sequences.
++        {
++          "HZ",
++          :bin{ 417e20427e21437e80447e7b41417e207e41427e7f41437e7d5a },
++          "A\\x7E B\\x7E!C\\x7E\\x80D\u4eae\\x7E\\x20\\x7E\u8c05\\x7E\\x7F\u64a9Z",
++          :intvector{ 0,1,1,1,1,2,3,4,4,4,4,5,6,7,7,7,7,7,7,7,7,9,                          // SBCS
++                      12,14,14,14,14,14,14,14,14,16,16,16,16,17,19,19,19,19,19,19,19,19,21, // DBCS
++                      25 },                                                                 // SBCS
++          :int{1}, :int{0}, "", "&C", :bin{""}
++        }
++        // Test ticket 5691: Example from Peter Edberg.
++        {
++          "ISO-2022-JP",
++          :bin{ 1b244230212f7e742630801b284a621b2458631b2842648061 },
++          "\u4e9c\ufffd\u7199\ufffdb\ufffd$Xcd\ufffda",
++          :intvector{ 3,5,7,9,14,15,16,17,18,22,23,24 },
++          :int{1}, :int{0}, "", "?", :bin{""}
++        }
+         // test that HZ limits its byte values to lead bytes 21..7d and trail bytes 21..7e
+         {
+           "HZ",
+-          :bin{ 7e7b21212120217e217f772100007e217e7d207e7e807e0a2b },
+-          "\u3000\ufffd\u3013\ufffd\u9ccc\ufffd\ufffd ~\ufffd+",
+-          :intvector{ 2,4,6,8,10,12,14,18,19,21,24 },
++          :bin{ 7e7b21212120217e217f772100007e217e7e7d207e7e807e0a2b },
++          "\u3000\ufffd\u3013\ufffd\u9ccc\ufffd\ufffd\u3013 ~\ufffd+",
++          :intvector{ 2,4,6,8,10,12,14,15,19,20,22,25 },
+           :int{1}, :int{1}, "", "?", :bin{""}
+         }
+         // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and
+@@ -61,8 +193,8 @@
+         {
+           "ISO-2022-JP",
+           :bin{ 1b284a7d7e801b2442306c20217f7e21202160217f22202225227f5f211b2842 },
+-          "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e",
+-          :intvector{ 3,4,5,9,11,13,15,17,19,21,23,25,27 },
++          "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e",
++          :intvector{ 3,4,5,9,11,12,14,16,17,19,21,23,25,27 },
+           :int{1}, :int{1}, "", "?", :bin{""}
+         }
+         // improve coverage of unrolled loops in ucnvmbcs.c/ucnv_MBCSSingleToBMPWithOffsets()
+@@ -341,7 +473,7 @@
+         {
+           "ISO-2022-CN-EXT",
+           :bin{ 411b4e2121 }, "\x41", :intvector{ 0 },
+-          :int{1}, :int{1}, "illesc", ".", :bin{ 1b4e }
++          :int{1}, :int{1}, "illesc", ".", :bin{ 1b }
+         }
+         // G3 designator: recognized, but not supported for -CN (only for -CN-EXT)
+         {
author	Remi Collet <fedora@famillecollet.com>	2013-03-20 10:29:29 +0100
committer	Remi Collet <fedora@famillecollet.com>	2013-03-20 10:29:29 +0100
commit	6deac027c98f5d99e1805f9ddc21ff2dbebe0fb7 (patch)
tree	008990c48199f2d517fc9b1a4b47c6b162ec30ef /icu.icu5691.backport.patch