/*
 * Table of the number of UTF-8 trail bytes, indexed by the lead byte.
 * Used by the deprecated macro UTF8_COUNT_TRAIL_BYTES, defined in utf_old.h.
 *
 * The current macro, U8_COUNT_TRAIL_BYTES, does _not_ use this table.
 *
 * Note that this table cannot be removed, even if UTF8_COUNT_TRAIL_BYTES were
 * changed to no longer use it. References to the table from expansions of UTF8_COUNT_TRAIL_BYTES
 * may exist in old client code that must continue to run with newer ICU library versions.
 *
 * This table could be replaced on many machines by
 * a few lines of assembler code using an
 * "index of first 0-bit from msb" instruction and
 * one or two more integer instructions.
 *
 * For example, on an i386, do something like
 * - MOV AL, leadByte
 * - NOT AL         (8-bit, leave b15..b8==0..0, reverse only b7..b0)
 * - MOV AH, 0
 * - BSR BX, AX     (16-bit)
 * - MOV AX, 6      (result)
 * - JZ finish      (ZF==1 if leadByte==0xff)
 * - SUB AX, BX     (result)
 * -finish:
 * (BSR: Bit Scan Reverse, scans for a 1-bit, starting from the MSB)
 */
U_CAPI const uint8_t
utf8_countTrailBytes[256]={
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
staticconst UChar32
utf8_errorValue[6]={ // Same values as UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE, // but without relying on the obsolete unicode/utf_old.h.
0x15, 0x9f, 0xffff,
0x10ffff
};
/*
 * Handle the non-inline part of the U8_NEXT() and U8_NEXT_FFFD() macros
 * and their obsolete sibling UTF8_NEXT_CHAR_SAFE().
 *
 * U8_NEXT() supports NUL-terminated strings indicated via length<0.
 *
 * The "strict" parameter controls the error behavior:
 * <0  "Safe" behavior of U8_NEXT():
 *     -1: All illegal byte sequences yield U_SENTINEL=-1.
 *     -2: Same as -1, except for lenient treatment of surrogate code points as legal.
 *         Some implementations use this for roundtripping of
 *         Unicode 16-bit strings that are not well-formed UTF-16, that is, they
 *         contain unpaired surrogates.
 *     -3: All illegal byte sequences yield U+FFFD.
 *  0  Obsolete "safe" behavior of UTF8_NEXT_CHAR_SAFE(..., false):
 *     All illegal byte sequences yield a positive code point such that this
 *     result code point would be encoded with the same number of bytes as
 *     the illegal sequence.
 * >0  Obsolete "strict" behavior of UTF8_NEXT_CHAR_SAFE(..., true):
 *     Same as the obsolete "safe" behavior, but non-characters are also treated
 *     like illegal sequences.
 *
 * Note that a UBool is the same as an int8_t.
 */
U_CAPI UChar32 U_EXPORT2
utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) { // *pi is one after byte c.
int32_t i=*pi; // length can be negative for NUL-terminated strings: Read and validate one byte at a time. if(i==length || c>0xf4) { // end of string, or not a lead byte
} elseif(c>=0xf0) { // Test for 4-byte sequences first because // U8_NEXT() handles shorter valid sequences inline.
uint8_t t1=s[i], t2, t3;
c&=7; if(U8_IS_VALID_LEAD4_AND_T1(c, t1) &&
++i!=length && (t2=s[i]-0x80)<=0x3f &&
++i!=length && (t3=s[i]-0x80)<=0x3f) {
++i;
c=(c<<18)|((t1&0x3f)<<12)|(t2<<6)|t3; // strict: forbid non-characters like U+fffe if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
*pi=i; return c;
}
}
} elseif(c>=0xe0) {
c&=0xf; if(strict!=-2) {
uint8_t t1=s[i], t2; if(U8_IS_VALID_LEAD3_AND_T1(c, t1) &&
++i!=length && (t2=s[i]-0x80)<=0x3f) {
++i;
c=(c<<12)|((t1&0x3f)<<6)|t2; // strict: forbid non-characters like U+fffe if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
*pi=i; return c;
}
}
} else { // strict=-2 -> lenient: allow surrogates
uint8_t t1=s[i]-0x80, t2; if(t1<=0x3f && (c>0 || t1>=0x20) &&
++i!=length && (t2=s[i]-0x80)<=0x3f) {
*pi=i+1; return (c<<12)|(t1<<6)|t2;
}
}
} elseif(c>=0xc2) {
uint8_t t1=s[i]-0x80; if(t1<=0x3f) {
*pi=i+1; return ((c-0xc0)<<6)|t1;
}
} // else 0x80<=c<0xc2 is not a lead byte
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.