/**
 * UTF-8 lead byte for minNoMaybeCP.
 * Can be lower than the actual lead byte for c.
 * Typically U+0300 for NFC/NFD, U+00A0 for NFKC/NFKD, U+0041 for NFKC_Casefold.
 *
 * @param c the code point whose UTF-8 lead byte is wanted
 * @return c itself for c<=0x7f, the two-byte-form lead byte for c<=0x7ff,
 *         and the constant 0xe0 for anything larger
 */
inline uint8_t leadByteForCP(UChar32 c) {
    if (c <= 0x7f) {
        return static_cast<uint8_t>(c);
    } else if (c <= 0x7ff) {
        return static_cast<uint8_t>(0xc0 + (c >> 6));
    } else {
        // Should not occur because ccc(U+0300)!=0.
        return 0xe0;
    }
}
/**
 * Returns the code point from one single well-formed UTF-8 byte sequence
 * between cpStart and cpLimit.
 *
 * Trie UTF-8 macros do not assemble whole code points (for efficiency).
 * When we do need the code point, we call this function.
 * We should not need it for normalization-inert data (norm16==0).
 * Illegal sequences yield the error value norm16==0 just like real normalization-inert code points.
 *
 * @param cpStart pointer to the first byte of the sequence
 * @param cpLimit pointer just past the last byte; the length (cpLimit-cpStart) must be 1..4
 * @return the decoded code point
 */
UChar32 codePointFromValidUTF8(const uint8_t *cpStart, const uint8_t *cpLimit) {
    // Similar to U8_NEXT_UNSAFE(s, i, c).
    U_ASSERT(cpStart < cpLimit);
    uint8_t c = *cpStart;
    switch(cpLimit-cpStart) {
    case 1:
        return c;
    case 2:
        return ((c&0x1f)<<6) | (cpStart[1]&0x3f);
    case 3:
        // no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (char16_t)
        return static_cast<char16_t>((c << 12) | ((cpStart[1] & 0x3f) << 6) | (cpStart[2] & 0x3f));
    case 4:
        return ((c&7)<<18) | ((cpStart[1]&0x3f)<<12) | ((cpStart[2]&0x3f)<<6) | (cpStart[3]&0x3f);
    default:
        UPRV_UNREACHABLE_EXIT;  // Should not occur.
    }
}
/**
 * Returns the last code point in [start, p[ if it is valid and in U+1000..U+D7FF.
 * Otherwise returns a negative value.
 *
 * Only a three-byte sequence ending exactly at p is considered.
 */
UChar32 previousHangulOrJamo(const uint8_t *start, const uint8_t *p) {
    if ((p - start) < 3) {
        return U_SENTINEL;
    }
    const uint8_t *seq = p - 3;
    const uint8_t lead = seq[0];
    if (lead < 0xe1 || 0xed < lead) {
        return U_SENTINEL;
    }
    // Subtracting 0x80 maps valid trail bytes 80..BF to 0..3F;
    // anything else wraps around to a value above 0x3f.
    const uint8_t trail1 = static_cast<uint8_t>(seq[1] - 0x80);
    if (trail1 > 0x3f) {
        return U_SENTINEL;
    }
    const uint8_t trail2 = static_cast<uint8_t>(seq[2] - 0x80);
    if (trail2 > 0x3f) {
        return U_SENTINEL;
    }
    // For lead byte ED only the first trail bytes 80..9F are accepted (up to U+D7FF).
    if (lead == 0xed && trail1 > 0x1f) {
        return U_SENTINEL;
    }
    return ((lead & 0xf) << 12) | (trail1 << 6) | trail2;
}
/**
 * Returns the offset from the Jamo T base if [src, limit[ starts with a single Jamo T code point.
 * Otherwise returns a negative value.
 *
 * @param src   start of the UTF-8 text
 * @param limit end of the UTF-8 text; at least three bytes must be available for a match
 * @return the Jamo T code point minus 0x11A7 (always >0 for a match), or -1
 */
int32_t getJamoTMinusBase(const uint8_t *src, const uint8_t *limit) {
    // Jamo T: E1 86 A8..E1 87 82
    if ((limit - src) >= 3 && *src == 0xe1) {
        if (src[1] == 0x86) {
            uint8_t t = src[2];
            // The first Jamo T is U+11A8 but JAMO_T_BASE is 11A7.
            // Offset 0 does not correspond to any conjoining Jamo.
            if (0xa8 <= t && t <= 0xbf) {
                return t - 0xa7;
            }
        } else if (src[1] == 0x87) {
            uint8_t t = src[2];
            // Signed comparison: bytes 80..82 become -128..-126 and pass,
            // while 00..7F (non-negative) and 83..FF (greater than -126) fail.
            if (static_cast<int8_t>(t) <= static_cast<int8_t>(0x82u)) {
                // The second trail byte range starts 0x40 code points after the first one.
                return t - (0xa7 - 0x40);
            }
        }
    }
    return -1;
}
/**
 * Adds delta to the code point encoded in [cpStart, cpLimit[ and appends
 * the UTF-8 form of the result to the sink.
 * If edits is not nullptr, records a replacement of the original byte length
 * with the new byte length.
 */
void
appendCodePointDelta(const uint8_t *cpStart, const uint8_t *cpLimit, int32_t delta,
                     ByteSink &sink, Edits *edits) {
    char buffer[U8_MAX_LENGTH];
    int32_t length = 0;
    const int32_t cpLength = static_cast<int32_t>(cpLimit - cpStart);
    if (cpLength == 1) {
        // The builder makes ASCII map to ASCII.
        buffer[length++] = static_cast<uint8_t>(*cpStart + delta);
    } else {
        const int32_t trail = cpLimit[-1] + delta;
        if (trail < 0x80 || 0xbf < trail) {
            // Decode the code point, add the delta, re-encode.
            UChar32 c = codePointFromValidUTF8(cpStart, cpLimit) + delta;
            U8_APPEND_UNSAFE(buffer, length, c);
        } else {
            // The delta only changes the last trail byte:
            // copy every byte before it, then write the adjusted trail byte.
            const int32_t prefixLength = cpLength - 1;
            int32_t i = 0;
            do {
                buffer[length++] = cpStart[i++];
            } while (i < prefixLength);
            buffer[length++] = static_cast<uint8_t>(trail);
        }
    }
    if (edits != nullptr) {
        edits->addReplace(cpLength, length);
    }
    sink.Append(buffer, length);
}
/**
 * Compares the UTF-16 text in this buffer ([start, limit[) code point by code point
 * with the UTF-8 text in [otherStart, otherLimit[.
 *
 * @param otherStart start of the UTF-8 text
 * @param otherLimit end of the UTF-8 text; the length must fit into int32_t
 * @return true if both texts contain the same sequence of code points
 */
UBool ReorderingBuffer::equals(const uint8_t *otherStart, const uint8_t *otherLimit) const {
    U_ASSERT((otherLimit - otherStart) <= INT32_MAX);  // ensured by caller
    int32_t length = static_cast<int32_t>(limit - start);
    int32_t otherLength = static_cast<int32_t>(otherLimit - otherStart);
    // For equal strings, UTF-8 is at least as long as UTF-16, and at most three times as long.
    if (otherLength < length || (otherLength / 3) > length) {
        return false;
    }
    // Compare valid strings from between normalization boundaries.
    // (Invalid sequences are normalization-inert.)
    for (int32_t i = 0, j = 0;;) {
        if (i >= length) {
            return j >= otherLength;
        } else if (j >= otherLength) {
            return false;
        }
        // Not at the end of either string yet.
        UChar32 c, other;
        U16_NEXT_UNSAFE(start, i, c);
        U8_NEXT_UNSAFE(otherStart, j, other);
        if (c != other) {
            return false;
        }
    }
}
// Inserts c somewhere before the last character. // Requires 0<cc<lastCC which implies reorderStart<limit. void ReorderingBuffer::insert(UChar32 c, uint8_t cc) { for(setIterator(), skipPrevious(); previousCC()>cc;) {} // insert c at codePointLimit, after the character with prevCC<=cc
char16_t *q=limit;
char16_t *r=limit+=U16_LENGTH(c); do {
*--r=*--q;
} while(codePointLimit!=q);
writeCodePoint(q, c); if(cc<=1) {
reorderStart=r;
}
}
void
Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const { // Add the start code point of each same-value range of the trie.
UChar32 start = 0, end;
uint32_t value; while ((end = ucptrie_getRange(normTrie, start, UCPMAP_RANGE_FIXED_LEAD_SURROGATES, INERT,
nullptr, nullptr, &value)) >= 0) {
sa->add(sa->set, start); if (start != end && isAlgorithmicNoNo(static_cast<uint16_t>(value)) &&
(value & Normalizer2Impl::DELTA_TCCC_MASK) > Normalizer2Impl::DELTA_TCCC_1) { // Range of code points with same-norm16-value algorithmic decompositions. // They might have different non-zero FCD16 values.
uint16_t prevFCD16 = getFCD16(start); while (++start <= end) {
uint16_t fcd16 = getFCD16(start); if (fcd16 != prevFCD16) {
sa->add(sa->set, start);
prevFCD16 = fcd16;
}
}
}
start = end + 1;
}
/* add Hangul LV syllables and LV+1 because of skippables */ for(char16_t c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) {
sa->add(sa->set, c);
sa->add(sa->set, c+1);
}
sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
}
void
Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const { // Add the start code point of each same-value range of the canonical iterator data trie. if (!ensureCanonIterData(errorCode)) { return; } // Currently only used for the SEGMENT_STARTER property.
UChar32 start = 0, end;
uint32_t value; while ((end = ucptrie_getRange(fCanonIterData->trie, start, UCPMAP_RANGE_NORMAL, 0,
segmentStarterMapper, nullptr, &value)) >= 0) {
sa->add(sa->set, start);
start = end + 1;
}
}
/**
 * Scans a NUL-terminated string over the leading code units that are
 * below minNeedDataCP (and not NUL), and optionally copies that prefix.
 *
 * @param src           start of the NUL-terminated string
 * @param minNeedDataCP code units below this value are skipped without a data lookup
 * @param buffer        if not nullptr, receives the skipped prefix via appendZeroCC()
 * @param errorCode     passed to appendZeroCC()
 * @return pointer to the first code unit that is NUL or >=minNeedDataCP
 */
const char16_t *
Normalizer2Impl::copyLowPrefixFromNulTerminated(const char16_t *src,
                                                UChar32 minNeedDataCP,
                                                ReorderingBuffer *buffer,
                                                UErrorCode &errorCode) const {
    // Make some effort to support NUL-terminated strings reasonably.
    // Take the part of the fast quick check loop that does not look up
    // data and check the first part of the string.
    // After this prefix, determine the string length to simplify the rest
    // of the code.
    const char16_t *prevSrc=src;
    char16_t c;
    while((c=*src++)<minNeedDataCP && c!=0) {}
    // Back out the last character for full processing.
    // Copy this prefix.
    if(--src!=prevSrc) {
        if(buffer!=nullptr) {
            buffer->appendZeroCC(prevSrc, src, errorCode);
        }
    }
    return src;
}
const uint8_t *prevBoundary = src; // only for quick check
uint8_t prevCC = 0;
for (;;) { // Fast path: Scan over a sequence of characters below the minimum "no" code point, // or with (decompYes && ccc==0) properties. const uint8_t *fastStart = src; const uint8_t *prevSrc;
uint16_t norm16 = 0;
for (;;) { if (src == limit) { if (prevBoundary != limit && sink != nullptr) {
ByteSinkUtil::appendUnchanged(prevBoundary, limit,
*sink, options, edits, errorCode);
} return src;
} if (*src < minNoLead) {
++src;
} else {
prevSrc = src;
UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16); if (!isMostDecompYesAndZeroCC(norm16)) { break;
}
}
} // isMostDecompYesAndZeroCC(norm16) is false, that is, norm16>=minYesNo, // and the current character at [prevSrc..src[ is not a common case with cc=0 // (MIN_NORMAL_MAYBE_YES or JAMO_VT). // It could still be a maybeYes with cc=0. if (prevSrc != fastStart) { // The fast path looped over yes/0 characters before the current one. if (sink != nullptr &&
!ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
*sink, options, edits, errorCode)) { break;
}
prevBoundary = prevSrc;
prevCC = 0;
}
// Medium-fast path: Quick check. if (isMaybeYesOrNonZeroCC(norm16)) { // Does not decompose.
uint8_t cc = getCCFromYesOrMaybeYes(norm16); if (prevCC <= cc || cc == 0) {
prevCC = cc; if (cc <= 1) { if (sink != nullptr &&
!ByteSinkUtil::appendUnchanged(prevBoundary, src,
*sink, options, edits, errorCode)) { break;
}
prevBoundary = src;
} continue;
}
} if (sink == nullptr) { return prevBoundary; // quick check: "no" or cc out of order
}
// Slow path // Decompose up to and including the current character. if (prevBoundary != prevSrc && norm16HasDecompBoundaryBefore(norm16)) { if (!ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
*sink, options, edits, errorCode)) { break;
}
prevBoundary = prevSrc;
}
ReorderingBuffer buffer(*this, s16, errorCode); if (U_FAILURE(errorCode)) { break;
}
decomposeShort(prevBoundary, src, STOP_AT_LIMIT, false/* onlyContiguous */,
buffer, errorCode); // Decompose until the next boundary. if (buffer.getLastCC() > 1) {
src = decomposeShort(src, limit, STOP_AT_DECOMP_BOUNDARY, false/* onlyContiguous */,
buffer, errorCode);
} if (U_FAILURE(errorCode)) { break;
} if ((src - prevSrc) > INT32_MAX) { // guard before buffer.equals()
errorCode = U_INDEX_OUTOFBOUNDS_ERROR; break;
} // We already know there was a change if the original character decomposed; // otherwise compare. if (isMaybeYesOrNonZeroCC(norm16) && buffer.equals(prevBoundary, src)) { if (!ByteSinkUtil::appendUnchanged(prevBoundary, src,
*sink, options, edits, errorCode)) { break;
}
} else { if (!ByteSinkUtil::appendChange(prevBoundary, src, buffer.getStart(), buffer.length(),
*sink, edits, errorCode)) { break;
}
}
prevBoundary = src;
prevCC = 0;
} return src;
}
const uint8_t *
Normalizer2Impl::decomposeShort(const uint8_t *src, const uint8_t *limit,
StopAt stopAt, UBool onlyContiguous,
ReorderingBuffer &buffer, UErrorCode &errorCode) const { if (U_FAILURE(errorCode)) { return nullptr;
} while (src < limit) { const uint8_t *prevSrc = src;
uint16_t norm16;
UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16); // Get the decomposition and the lead and trail cc's.
UChar32 c = U_SENTINEL; if (norm16 >= limitNoNo) { if (isMaybeYesOrNonZeroCC(norm16)) { // No comp boundaries around this character.
uint8_t cc = getCCFromYesOrMaybeYes(norm16); if (cc == 0 && stopAt == STOP_AT_DECOMP_BOUNDARY) { return prevSrc;
}
c = codePointFromValidUTF8(prevSrc, src); if (!buffer.append(c, cc, errorCode)) { return nullptr;
} if (stopAt == STOP_AT_DECOMP_BOUNDARY && buffer.getLastCC() <= 1) { return src;
} continue;
} elseif (norm16 < minMaybeNo) { // Maps to an isCompYesAndZeroCC. if (stopAt != STOP_AT_LIMIT) { return prevSrc;
}
c = codePointFromValidUTF8(prevSrc, src);
c = mapAlgorithmic(c, norm16);
norm16 = getRawNorm16(c);
}
} elseif (stopAt != STOP_AT_LIMIT && norm16 < minNoNoCompNoMaybeCC) { return prevSrc;
} // norm16!=INERT guarantees that [prevSrc, src[ is valid UTF-8. // We do not see invalid UTF-8 here because // its norm16==INERT is normalization-inert, // so it gets copied unchanged in the fast path, // and we stop the slow path where invalid UTF-8 begins. // c >= 0 is the result of an algorithmic mapping.
U_ASSERT(c >= 0 || norm16 != INERT); if (norm16 < minYesNo) { if (c < 0) {
c = codePointFromValidUTF8(prevSrc, src);
} // does not decompose if (!buffer.append(c, 0, errorCode)) { return nullptr;
}
} elseif (isHangulLV(norm16) || isHangulLVT(norm16)) { // Hangul syllable: decompose algorithmically if (c < 0) {
c = codePointFromValidUTF8(prevSrc, src);
}
char16_t jamos[3]; if (!buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode)) { return nullptr;
}
} else { // The character decomposes, get everything from the variable-length extra data. const uint16_t *mapping = getData(norm16);
uint16_t firstUnit = *mapping;
int32_t length = firstUnit & MAPPING_LENGTH_MASK;
uint8_t trailCC = static_cast<uint8_t>(firstUnit >> 8);
uint8_t leadCC; if (firstUnit & MAPPING_HAS_CCC_LCCC_WORD) {
leadCC = static_cast<uint8_t>(*(mapping - 1) >> 8);
} else {
leadCC = 0;
} if (leadCC == 0 && stopAt == STOP_AT_DECOMP_BOUNDARY) { return prevSrc;
} if (!buffer.append(reinterpret_cast<const char16_t*>(mapping) + 1, length, true, leadCC, trailCC, errorCode)) { return nullptr;
}
} if ((stopAt == STOP_AT_COMP_BOUNDARY && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) ||
(stopAt == STOP_AT_DECOMP_BOUNDARY && buffer.getLastCC() <= 1)) { return src;
}
} return src;
}
/**
 * Gets the decomposition mapping of c, if it has one.
 *
 * @param c      the code point
 * @param buffer scratch space used for algorithmic (delta and Hangul) mappings
 * @param length set to the mapping length whenever a non-null pointer is returned
 * @return nullptr if c does not decompose; otherwise a pointer to the mapping,
 *         which is either buffer or a pointer into the extra data
 */
const char16_t *
Normalizer2Impl::getDecomposition(UChar32 c, char16_t buffer[4], int32_t &length) const {
    uint16_t norm16;
    if(c<minDecompNoCP || isMaybeYesOrNonZeroCC(norm16=getNorm16(c))) {
        // c does not decompose
        return nullptr;
    }
    const char16_t *decomp = nullptr;
    if(isDecompNoAlgorithmic(norm16)) {
        // Maps to an isCompYesAndZeroCC.
        c=mapAlgorithmic(c, norm16);
        decomp=buffer;
        length=0;
        U16_APPEND_UNSAFE(buffer, length, c);
        // The mapping might decompose further.
        norm16 = getRawNorm16(c);
    }
    if (norm16 < minYesNo) {
        // Either no mapping at all (decomp==nullptr) or only the algorithmic one in buffer.
        return decomp;
    } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
        // Hangul syllable: decompose algorithmically
        length=Hangul::decompose(c, buffer);
        return buffer;
    }
    // c decomposes, get everything from the variable-length extra data
    const uint16_t *mapping=getData(norm16);
    length=*mapping&MAPPING_LENGTH_MASK;
    return reinterpret_cast<const char16_t*>(mapping) + 1;
}
// The capacity of the buffer must be 30=MAPPING_LENGTH_MASK-1 // so that a raw mapping fits that consists of one unit ("rm0") // plus all but the first two code units of the normal mapping. // The maximum length of a normal mapping is 31=MAPPING_LENGTH_MASK. const char16_t *
Normalizer2Impl::getRawDecomposition(UChar32 c, char16_t buffer[30], int32_t &length) const {
uint16_t norm16; if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) { // c does not decompose return nullptr;
} elseif(isHangulLV(norm16) || isHangulLVT(norm16)) { // Hangul syllable: decompose algorithmically
Hangul::getRawDecomposition(c, buffer);
length=2; return buffer;
} elseif(isDecompNoAlgorithmic(norm16)) {
c=mapAlgorithmic(c, norm16);
length=0;
U16_APPEND_UNSAFE(buffer, length, c); return buffer;
} // c decomposes, get everything from the variable-length extra data const uint16_t *mapping=getData(norm16);
uint16_t firstUnit=*mapping;
int32_t mLength=firstUnit&MAPPING_LENGTH_MASK; // length of normal mapping if(firstUnit&MAPPING_HAS_RAW_MAPPING) { // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word. // Bit 7=MAPPING_HAS_CCC_LCCC_WORD const uint16_t *rawMapping=mapping-((firstUnit>>7)&1)-1;
uint16_t rm0=*rawMapping; if(rm0<=MAPPING_LENGTH_MASK) {
length=rm0; returnreinterpret_cast<const char16_t*>(rawMapping) - rm0;
} else { // Copy the normal mapping and replace its first two code units with rm0.
buffer[0] = static_cast<char16_t>(rm0);
u_memcpy(buffer + 1, reinterpret_cast<const char16_t*>(mapping) + 1 + 2, mLength - 2);
length=mLength-1; return buffer;
}
} else {
length=mLength; returnreinterpret_cast<const char16_t*>(mapping) + 1;
}
}
/**
 * Tests whether there is a decomposition boundary before c.
 * Cheap range and lead-unit checks are tried before the norm16 lookup.
 */
UBool Normalizer2Impl::hasDecompBoundaryBefore(UChar32 c) const {
    if (c < minLcccCP) {
        return true;
    }
    if (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) {
        return true;
    }
    return norm16HasDecompBoundaryBefore(getNorm16(c));
}
/**
 * Tests whether a character with this norm16 value has a decomposition boundary before it.
 *
 * @param norm16 the normalization trie value of the character
 * @return true if there is a boundary before the character
 */
UBool Normalizer2Impl::norm16HasDecompBoundaryBefore(uint16_t norm16) const {
    if (norm16 < minNoNoCompNoMaybeCC) {
        return true;
    }
    if (norm16 >= limitNoNo) {
        return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT;
    }
    // c decomposes, get everything from the variable-length extra data
    const uint16_t *mapping=getDataForYesOrNo(norm16);
    uint16_t firstUnit=*mapping;
    // true if leadCC==0 (hasFCDBoundaryBefore())
    return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0;
}
/**
 * Tests whether there is a decomposition boundary after c.
 * Cheap range and lead-unit checks are tried before the norm16 lookup.
 *
 * @param c the code point
 * @return true if there is a boundary after c
 */
UBool Normalizer2Impl::hasDecompBoundaryAfter(UChar32 c) const {
    if (c < minDecompNoCP) {
        return true;
    }
    if (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) {
        return true;
    }
    return norm16HasDecompBoundaryAfter(getNorm16(c));
}
/**
 * Tests whether a character with this norm16 value has a decomposition boundary after it.
 *
 * @param norm16 the normalization trie value of the character
 * @return true if there is a boundary after the character
 */
UBool Normalizer2Impl::norm16HasDecompBoundaryAfter(uint16_t norm16) const {
    if(norm16 <= minYesNo || isHangulLVT(norm16)) {
        return true;
    }
    if (norm16 >= limitNoNo) {
        if (isMaybeYesOrNonZeroCC(norm16)) {
            return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT;
        } else if (norm16 < minMaybeNo) {
            // Maps to an isCompYesAndZeroCC.
            return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1;
        }
    }
    // c decomposes, get everything from the variable-length extra data
    const uint16_t *mapping=getData(norm16);
    uint16_t firstUnit=*mapping;
    // decomp after-boundary: same as hasFCDBoundaryAfter(),
    // fcd16<=1 || trailCC==0
    if(firstUnit>0x1ff) {
        return false;  // trailCC>1
    }
    if(firstUnit<=0xff) {
        return true;  // trailCC==0
    }
    // if(trailCC==1) test leadCC==0, same as checking for before-boundary
    // true if leadCC==0 (hasFCDBoundaryBefore())
    return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0;
}
/* * Finds the recomposition result for * a forward-combining "lead" character, * specified with a pointer to its compositions list, * and a backward-combining "trail" character. * * If the lead and trail characters combine, then this function returns * the following "compositeAndFwd" value: * Bits 21..1 composite character * Bit 0 set if the composite is a forward-combining starter * otherwise it returns -1. * * The compositions list has (trail, compositeAndFwd) pair entries, * encoded as either pairs or triples of 16-bit units. * The last entry has the high bit of its first unit set. * * The list is sorted by ascending trail characters (there are no duplicates). * A linear search is used. * * See normalizer2impl.h for a more detailed description * of the compositions list format.
*/
int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) {
uint16_t key1, firstUnit; if(trail<COMP_1_TRAIL_LIMIT) { // trail character is 0..33FF // result entry may have 2 or 3 units
key1 = static_cast<uint16_t>(trail << 1); while(key1>(firstUnit=*list)) {
list+=2+(firstUnit&COMP_1_TRIPLE);
} if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { if(firstUnit&COMP_1_TRIPLE) { return (static_cast<int32_t>(list[1]) << 16) | list[2];
} else { return list[1];
}
}
} else { // trail character is 3400..10FFFF // result entry has 3 units
key1 = static_cast<uint16_t>(COMP_1_TRAIL_LIMIT +
(((trail>>COMP_1_TRAIL_SHIFT))&
~COMP_1_TRIPLE));
uint16_t key2 = static_cast<uint16_t>(trail << COMP_2_TRAIL_SHIFT);
uint16_t secondUnit; for(;;) { if(key1>(firstUnit=*list)) {
list+=2+(firstUnit&COMP_1_TRIPLE);
} elseif(key1==(firstUnit&COMP_1_TRAIL_MASK)) { if(key2>(secondUnit=list[1])) { if(firstUnit&COMP_1_LAST_TUPLE) { break;
} else {
list+=3;
}
} elseif(key2==(secondUnit&COMP_2_TRAIL_MASK)) { return (static_cast<int32_t>(secondUnit & ~COMP_2_TRAIL_MASK) << 16) | list[2];
} else { break;
}
} else { break;
}
}
} return -1;
}
/** * @param list some character's compositions list * @param set recursively receives the composites from these compositions
*/ void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const {
uint16_t firstUnit;
int32_t compositeAndFwd; do {
firstUnit=*list; if((firstUnit&COMP_1_TRIPLE)==0) {
compositeAndFwd=list[1];
list+=2;
} else {
compositeAndFwd = ((static_cast<int32_t>(list[1]) & ~COMP_2_TRAIL_MASK) << 16) | list[2];
list+=3;
}
UChar32 composite=compositeAndFwd>>1; if((compositeAndFwd&1)!=0) {
addComposites(getCompositionsListForComposite(getRawNorm16(composite)), set);
}
set.add(composite);
} while((firstUnit&COMP_1_LAST_TUPLE)==0);
}
/*
 * Recomposes the buffer text starting at recomposeStartIndex
 * (which is in NFD - decomposed and canonically ordered),
 * and truncates the buffer contents.
 *
 * Note that recomposition never lengthens the text:
 * Any character consists of either one or two code units;
 * a composition may contain at most one more code unit than the original starter,
 * while the combining mark that is removed has at least one code unit.
 *
 * buffer:              the text to recompose in place; its limit is updated at the end
 * recomposeStartIndex: index of the first code unit to recompose
 * onlyContiguous:      if true (FCC), any non-combining intervening mark blocks composition
 */
void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
                                UBool onlyContiguous) const {
    char16_t *p=buffer.getStart()+recomposeStartIndex;
    char16_t *limit=buffer.getLimit();
    if(p==limit) {
        return;
    }

    // NOTE(review): these declarations were missing from the block; the types are
    // inferred from how each variable is used below. Confirm against the upstream file.
    char16_t *starter, *pRemove, *q, *r;
    const uint16_t *compositionsList;
    UChar32 c;
    int32_t compositeAndFwd;
    uint16_t norm16;
    uint8_t cc, prevCC;
    UBool starterIsSupplementary;

    // Some of the following variables are not used until we have a forward-combining starter
    // and are only initialized now to avoid compiler warnings.
    compositionsList=nullptr;  // used as indicator for whether we have a forward-combining starter
    starter=nullptr;
    starterIsSupplementary=false;
    prevCC=0;

    for(;;) {
        UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16);
        cc=getCCFromYesOrMaybeYes(norm16);
        if( // this character combines backward and
            isMaybe(norm16) &&
            // we have seen a starter that combines forward and
            compositionsList!=nullptr &&
            // the backward-combining character is not blocked
            (prevCC<cc || prevCC==0)
        ) {
            if(isJamoVT(norm16)) {
                // c is a Jamo V/T, see if we can compose it with the previous character.
                if(c<Hangul::JAMO_T_BASE) {
                    // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
                    char16_t prev = static_cast<char16_t>(*starter - Hangul::JAMO_L_BASE);
                    if(prev<Hangul::JAMO_L_COUNT) {
                        pRemove=p-1;
                        char16_t syllable = static_cast<char16_t>(
                            Hangul::HANGUL_BASE +
                            (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
                            Hangul::JAMO_T_COUNT);
                        char16_t t;
                        if (p != limit &&
                                (t = static_cast<char16_t>(*p - Hangul::JAMO_T_BASE)) <
                                    Hangul::JAMO_T_COUNT) {
                            ++p;
                            syllable+=t;  // The next character was a Jamo T.
                        }
                        *starter=syllable;
                        // remove the Jamo V/T
                        q=pRemove;
                        r=p;
                        while(r<limit) {
                            *q++=*r++;
                        }
                        limit=q;
                        p=pRemove;
                    }
                }
                /*
                 * No "else" for Jamo T:
                 * Since the input is in NFD, there are no Hangul LV syllables that
                 * a Jamo T could combine with.
                 * All Jamo Ts are combined above when handling Jamo Vs.
                 */
                if(p==limit) {
                    break;
                }
                compositionsList=nullptr;
                continue;
            } else if((compositeAndFwd=combine(compositionsList, c))>=0) {
                // The starter and the combining mark (c) do combine.
                UChar32 composite=compositeAndFwd>>1;

                // Replace the starter with the composite, remove the combining mark.
                pRemove=p-U16_LENGTH(c);  // pRemove & p: start & limit of the combining mark
                if(starterIsSupplementary) {
                    if(U_IS_SUPPLEMENTARY(composite)) {
                        // both are supplementary
                        starter[0]=U16_LEAD(composite);
                        starter[1]=U16_TRAIL(composite);
                    } else {
                        *starter = static_cast<char16_t>(composite);
                        // The composite is shorter than the starter,
                        // move the intermediate characters forward one.
                        starterIsSupplementary=false;
                        q=starter+1;
                        r=q+1;
                        while(r<pRemove) {
                            *q++=*r++;
                        }
                        --pRemove;
                    }
                } else if(U_IS_SUPPLEMENTARY(composite)) {
                    // The composite is longer than the starter,
                    // move the intermediate characters back one.
                    starterIsSupplementary=true;
                    ++starter;  // temporarily increment for the loop boundary
                    q=pRemove;
                    r=++pRemove;
                    while(starter<q) {
                        *--r=*--q;
                    }
                    *starter=U16_TRAIL(composite);
                    *--starter=U16_LEAD(composite);  // undo the temporary increment
                } else {
                    // both are on the BMP
                    *starter = static_cast<char16_t>(composite);
                }

                /* remove the combining mark by moving the following text over it */
                if(pRemove<p) {
                    q=pRemove;
                    r=p;
                    while(r<limit) {
                        *q++=*r++;
                    }
                    limit=q;
                    p=pRemove;
                }
                // Keep prevCC because we removed the combining mark.

                if(p==limit) {
                    break;
                }
                // Is the composite a starter that combines forward?
                if(compositeAndFwd&1) {
                    compositionsList=
                        getCompositionsListForComposite(getRawNorm16(composite));
                } else {
                    compositionsList=nullptr;
                }

                // We combined; continue with looking for compositions.
                continue;
            }
        }

        // no combination this time
        prevCC=cc;
        if(p==limit) {
            break;
        }

        // If c did not combine, then check if it is a starter.
        if(cc==0) {
            // Found a new starter.
            if((compositionsList=getCompositionsListForDecompYes(norm16))!=nullptr) {
                // It may combine with something, prepare for it.
                if(U_IS_BMP(c)) {
                    starterIsSupplementary=false;
                    starter=p-1;
                } else {
                    starterIsSupplementary=true;
                    starter=p-2;
                }
            }
        } else if(onlyContiguous) {
            // FCC: no discontiguous compositions; any intervening character blocks.
            compositionsList=nullptr;
        }
    }
    buffer.setReorderingLimit(limit);
}
/**
 * Composes the pair (a, b) into a single code point if the two combine.
 *
 * @param a the first (lead) code point
 * @param b the second (trail) code point
 * @return the composite, or U_SENTINEL if a and b do not combine
 */
UChar32
Normalizer2Impl::composePair(UChar32 a, UChar32 b) const {
    uint16_t norm16=getNorm16(a);  // maps an out-of-range 'a' to inert norm16
    const uint16_t *list;
    if(isInert(norm16)) {
        return U_SENTINEL;
    } else if(norm16<minYesNoMappingsOnly) {
        // a combines forward.
        if(isJamoL(norm16)) {
            if (b < Hangul::JAMO_V_BASE) {
                return U_SENTINEL;
            }
            b-=Hangul::JAMO_V_BASE;
            if(b<Hangul::JAMO_V_COUNT) {
                return
                    (Hangul::HANGUL_BASE+
                     ((a-Hangul::JAMO_L_BASE)*Hangul::JAMO_V_COUNT+b)*
                     Hangul::JAMO_T_COUNT);
            } else {
                return U_SENTINEL;
            }
        } else if(isHangulLV(norm16)) {
            if (b <= Hangul::JAMO_T_BASE) {
                return U_SENTINEL;
            }
            b-=Hangul::JAMO_T_BASE;
            if(b<Hangul::JAMO_T_COUNT) {  // not b==0!
                return a+b;
            } else {
                return U_SENTINEL;
            }
        } else {
            // 'a' has a compositions list in extraData
            list=getDataForYesOrNo(norm16);
            if(norm16>minYesNo) {  // composite 'a' has both mapping & compositions list
                list+=  // mapping pointer
                    1+  // +1 to skip the first unit with the mapping length
                    (*list&MAPPING_LENGTH_MASK);  // + mapping length
            }
        }
    } else if(norm16<minMaybeNoCombinesFwd || MIN_NORMAL_MAYBE_YES<=norm16) {
        return U_SENTINEL;
    } else {
        list=getDataForMaybe(norm16);
        if(norm16<minMaybeYes) {  // composite 'a' has both mapping & compositions list
            list+=  // mapping pointer
                1+  // +1 to skip the first unit with the mapping length
                (*list&MAPPING_LENGTH_MASK);  // + mapping length
        }
    }
    if(b<0 || 0x10ffff<b) {  // combine(list, b) requires a valid code point b
        return U_SENTINEL;
    }
    // combine() returns -1 for "no composition"; with an arithmetic right shift
    // -1>>1 is still -1, so the explicit check is only needed otherwise.
#if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC
    return combine(list, b)>>1;
#else
    int32_t compositeAndFwd=combine(list, b);
    return compositeAndFwd>=0 ? compositeAndFwd>>1 : U_SENTINEL;
#endif
}
// Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
// doCompose: normalize
// !doCompose: isNormalized (buffer must be empty and initialized)
//
// src, limit:     the UTF-16 input; limit==nullptr means src is NUL-terminated
// onlyContiguous: FCC mode, passed on to the boundary tests and to recompose()
// buffer:         receives the output when doCompose; used as scratch space otherwise
// errorCode:      in/out error code, set by the buffer operations and by the length guard
//
// Returns false if errorCode indicates failure right after the NUL-terminated prefix
// was handled, or if !doCompose and the text is found not to be normalized.
// Returns true otherwise, including when the loop stops early because an append
// failed, so callers must also check errorCode.
UBool
Normalizer2Impl::compose(const char16_t *src, const char16_t *limit,
                         UBool onlyContiguous,
                         UBool doCompose,
                         ReorderingBuffer &buffer,
                         UErrorCode &errorCode) const {
    const char16_t *prevBoundary=src;
    UChar32 minNoMaybeCP=minCompNoMaybeCP;
    if(limit==nullptr) {
        src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP,
                                           doCompose ? &buffer : nullptr,
                                           errorCode);
        if(U_FAILURE(errorCode)) {
            return false;
        }
        limit=u_strchr(src, 0);
        if (prevBoundary != src) {
            if (hasCompBoundaryAfter(*(src-1), onlyContiguous)) {
                prevBoundary = src;
            } else {
                // The last prefix character may interact with what follows: take it back.
                buffer.removeSuffix(1);
                prevBoundary = --src;
            }
        }
    }

    for (;;) {
        // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
        // or with (compYes && ccc==0) properties.
        const char16_t *prevSrc;
        UChar32 c = 0;
        uint16_t norm16 = 0;
        for (;;) {
            if (src == limit) {
                if (prevBoundary != limit && doCompose) {
                    buffer.appendZeroCC(prevBoundary, limit, errorCode);
                }
                return true;
            }
            if( (c=*src)<minNoMaybeCP ||
                isCompYesAndZeroCC(norm16=UCPTRIE_FAST_BMP_GET(normTrie, UCPTRIE_16, c))
            ) {
                ++src;
            } else {
                prevSrc = src++;
                if(!U16_IS_LEAD(c)) {
                    break;
                } else {
                    char16_t c2;
                    if(src!=limit && U16_IS_TRAIL(c2=*src)) {
                        ++src;
                        c=U16_GET_SUPPLEMENTARY(c, c2);
                        norm16=UCPTRIE_FAST_SUPP_GET(normTrie, UCPTRIE_16, c);
                        if(!isCompYesAndZeroCC(norm16)) {
                            break;
                        }
                    }
                }
            }
        }
        // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
        // The current character is either a "noNo" (has a mapping)
        // or a "maybeYes" / "maybeNo" (combines backward)
        // or a "yesYes" with ccc!=0.
        // It is not a Hangul syllable or Jamo L because those have "yes" properties.

        // Medium-fast path: Handle cases that do not require full decomposition and recomposition.
        if (norm16 < minMaybeNo) {  // minNoNo <= norm16 < minMaybeNo
            if (!doCompose) {
                return false;
            }
            // Fast path for mapping a character that is immediately surrounded by boundaries.
            // In this case, we need not decompose around the current character.
            if (isDecompNoAlgorithmic(norm16)) {
                // Maps to a single isCompYesAndZeroCC character
                // which also implies hasCompBoundaryBefore.
                if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
                        hasCompBoundaryBefore(src, limit)) {
                    if (prevBoundary != prevSrc &&
                            !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
                        break;
                    }
                    if(!buffer.append(mapAlgorithmic(c, norm16), 0, errorCode)) {
                        break;
                    }
                    prevBoundary = src;
                    continue;
                }
            } else if (norm16 < minNoNoCompBoundaryBefore) {
                // The mapping is comp-normalized which also implies hasCompBoundaryBefore.
                if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
                        hasCompBoundaryBefore(src, limit)) {
                    if (prevBoundary != prevSrc &&
                            !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
                        break;
                    }
                    const char16_t *mapping =
                        reinterpret_cast<const char16_t *>(getDataForYesOrNo(norm16));
                    int32_t length = *mapping++ & MAPPING_LENGTH_MASK;
                    if(!buffer.appendZeroCC(mapping, mapping + length, errorCode)) {
                        break;
                    }
                    prevBoundary = src;
                    continue;
                }
            } else if (norm16 >= minNoNoEmpty) {
                // The current character maps to nothing.
                // Simply omit it from the output if there is a boundary before _or_ after it.
                // The character itself implies no boundaries.
                if (hasCompBoundaryBefore(src, limit) ||
                        hasCompBoundaryAfter(prevBoundary, prevSrc, onlyContiguous)) {
                    if (prevBoundary != prevSrc &&
                            !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
                        break;
                    }
                    prevBoundary = src;
                    continue;
                }
            }
            // Other "noNo" type, or need to examine more text around this character:
            // Fall through to the slow path.
        } else if (isJamoVT(norm16) && prevBoundary != prevSrc) {
            char16_t prev=*(prevSrc-1);
            if(c<Hangul::JAMO_T_BASE) {
                // The current character is a Jamo Vowel,
                // compose with previous Jamo L and following Jamo T.
                char16_t l = static_cast<char16_t>(prev - Hangul::JAMO_L_BASE);
                if(l<Hangul::JAMO_L_COUNT) {
                    if (!doCompose) {
                        return false;
                    }
                    int32_t t;
                    if (src != limit &&
                            0 < (t = (static_cast<int32_t>(*src) - Hangul::JAMO_T_BASE)) &&
                            t < Hangul::JAMO_T_COUNT) {
                        // The next character is a Jamo T.
                        ++src;
                    } else if (hasCompBoundaryBefore(src, limit)) {
                        // No Jamo T follows, not even via decomposition.
                        t = 0;
                    } else {
                        t = -1;
                    }
                    if (t >= 0) {
                        UChar32 syllable = Hangul::HANGUL_BASE +
                            (l*Hangul::JAMO_V_COUNT + (c-Hangul::JAMO_V_BASE)) *
                            Hangul::JAMO_T_COUNT + t;
                        --prevSrc;  // Replace the Jamo L as well.
                        if (prevBoundary != prevSrc &&
                                !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
                            break;
                        }
                        if (!buffer.appendBMP(static_cast<char16_t>(syllable), 0, errorCode)) {
                            break;
                        }
                        prevBoundary = src;
                        continue;
                    }
                    // If we see L+V+x where x!=T then we drop to the slow path,
                    // decompose and recompose.
                    // This is to deal with NFKC finding normal L and V but a
                    // compatibility variant of a T.
                    // We need to either fully compose that combination here
                    // (which would complicate the code and may not work with strange custom data)
                    // or use the slow path.
                }
            } else if (Hangul::isHangulLV(prev)) {
                // The current character is a Jamo Trailing consonant,
                // compose with previous Hangul LV that does not contain a Jamo T.
                if (!doCompose) {
                    return false;
                }
                UChar32 syllable = prev + c - Hangul::JAMO_T_BASE;
                --prevSrc;  // Replace the Hangul LV as well.
                if (prevBoundary != prevSrc &&
                        !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
                    break;
                }
                if (!buffer.appendBMP(static_cast<char16_t>(syllable), 0, errorCode)) {
                    break;
                }
                prevBoundary = src;
                continue;
            }
            // No matching context, or may need to decompose surrounding text first:
            // Fall through to the slow path.
        } else if (norm16 > JAMO_VT) {  // norm16 >= MIN_YES_YES_WITH_CC
            // One or more combining marks that do not combine-back:
            // Check for canonical order, copy unchanged if ok and
            // if followed by a character with a boundary-before.
            uint8_t cc = getCCFromNormalYesOrMaybe(norm16);  // cc!=0
            if (onlyContiguous /* FCC */ && getPreviousTrailCC(prevBoundary, prevSrc) > cc) {
                // Fails FCD test, need to decompose and contiguously recompose.
                if (!doCompose) {
                    return false;
                }
            } else {
                // If !onlyContiguous (not FCC), then we ignore the tccc of
                // the previous character which passed the quick check "yes && ccc==0" test.
                const char16_t *nextSrc;
                uint16_t n16;
                for (;;) {
                    if (src == limit) {
                        if (doCompose) {
                            buffer.appendZeroCC(prevBoundary, limit, errorCode);
                        }
                        return true;
                    }
                    uint8_t prevCC = cc;
                    nextSrc = src;
                    UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, nextSrc, limit, c, n16);
                    if (n16 >= MIN_YES_YES_WITH_CC) {
                        cc = getCCFromNormalYesOrMaybe(n16);
                        if (prevCC > cc) {
                            if (!doCompose) {
                                return false;
                            }
                            break;
                        }
                    } else {
                        break;
                    }
                    src = nextSrc;
                }
                // src is after the last in-order combining mark.
                // If there is a boundary here, then we continue with no change.
                if (norm16HasCompBoundaryBefore(n16)) {
                    if (isCompYesAndZeroCC(n16)) {
                        src = nextSrc;
                    }
                    continue;
                }
                // Use the slow path. There is no boundary in [prevSrc, src[.
            }
        }

        // Slow path: Find the nearest boundaries around the current character,
        // decompose and recompose.
        if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) {
            const char16_t *p = prevSrc;
            UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, prevBoundary, p, c, norm16);
            if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
                prevSrc = p;
            }
        }
        if (doCompose && prevBoundary != prevSrc &&
                !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
            break;
        }
        int32_t recomposeStartIndex=buffer.length();
        // We know there is not a boundary here.
        decomposeShort(prevSrc, src, false /* !stopAtCompBoundary */, onlyContiguous,
                       buffer, errorCode);
        // Decompose until the next boundary.
        src = decomposeShort(src, limit, true /* stopAtCompBoundary */, onlyContiguous,
                             buffer, errorCode);
        if (U_FAILURE(errorCode)) {
            break;
        }
        if ((src - prevSrc) > INT32_MAX) {  // guard before buffer.equals()
            errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
            return true;
        }
        recompose(buffer, recomposeStartIndex, onlyContiguous);
        if(!doCompose) {
            if(!buffer.equals(prevSrc, src)) {
                return false;
            }
            buffer.remove();
        }
        prevBoundary=src;
    }
    return true;
}
// Very similar to compose(): Make the same changes in both places if relevant. // pQCResult==nullptr: spanQuickCheckYes // pQCResult!=nullptr: quickCheck (*pQCResult must be UNORM_YES) const char16_t *
Normalizer2Impl::composeQuickCheck(const char16_t *src, const char16_t *limit,
UBool onlyContiguous,
UNormalizationCheckResult *pQCResult) const { const char16_t *prevBoundary=src;
UChar32 minNoMaybeCP=minCompNoMaybeCP; if(limit==nullptr) {
UErrorCode errorCode=U_ZERO_ERROR;
src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, nullptr, errorCode);
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5
¤ Dauer der Verarbeitung: 0.31 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.