/** * Input: c is a letter I with or without acute accent. * start is the index in src after c, and is less than segmentLimit. * If a plain i/I is followed by a plain j/J, * or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute, * then we output accordingly. * * @return the src index after the titlecased sequence, or the start index if no Dutch IJ
*/
int32_t maybeTitleDutchIJ(const char16_t *src, UChar32 c, int32_t start, int32_t segmentLimit,
char16_t *dest, int32_t &destIndex, int32_t destCapacity, uint32_t options,
icu::Edits *edits) {
U_ASSERT(start < segmentLimit);
int32_t index = start; bool withAcute = false;
// If the conditions are met, then the following variables tell us what to output.
int32_t unchanged1 = 0; // code units before the j, or the whole sequence (0..3) bool doTitleJ = false; // true if the j needs to be titlecased
int32_t unchanged2 = 0; // after the j (0 or 1)
// next character after the first letter
char16_t c2 = src[index++];
// Is the first letter an i/I with accent? if (c == u'I') { if (c2 == ACUTE) {
withAcute = true;
unchanged1 = 1; if (index == segmentLimit) { return start; }
c2 = src[index++];
}
} else { // Í
withAcute = true;
}
// Is the next character a j/J? if (c2 == u'j') {
doTitleJ = true;
} elseif (c2 == u'J') {
++unchanged1;
} else { return start;
}
// A plain i/I must be followed by a plain j/J. // An i/I with acute must be followed by a j/J with acute. if (withAcute) { if (index == segmentLimit || src[index++] != ACUTE) { return start; } if (doTitleJ) {
unchanged2 = 1;
} else {
++unchanged1;
}
}
// There must not be another combining mark. if (index < segmentLimit) {
int32_t cp;
int32_t i = index;
U16_NEXT(src, i, segmentLimit, cp);
uint32_t typeMask = U_GET_GC_MASK(cp); if ((typeMask & U_GC_M_MASK) != 0) { return start;
}
}
/* set up local variables */
UCaseContext csc=UCASECONTEXT_INITIALIZER;
csc.p=(void *)src;
csc.limit=srcLength;
int32_t destIndex=0;
int32_t prev=0; bool isFirstIndex=true;
/* titlecasing loop */ while(prev<srcLength) { /* find next index where to titlecase */
int32_t index; if(isFirstIndex) {
isFirstIndex=false;
index=iter->first();
} else {
index=iter->next();
} if(index==UBRK_DONE || index>srcLength) {
index=srcLength;
}
/* * Segment [prev..index[ into 3 parts: * a) skipped characters (copy as-is) [prev..titleStart[ * b) first letter (titlecase) [titleStart..titleLimit[ * c) subsequent characters (lowercase) [titleLimit..index[
*/ if(prev<index) { // Find and copy skipped characters [prev..titleStart[
int32_t titleStart=prev;
int32_t titleLimit=prev;
UChar32 c;
U16_NEXT(src, titleLimit, index, c); if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0) { // Adjust the titlecasing index to the next cased character, // or to the next letter/number/symbol/private use. // Stop with titleStart<titleLimit<=index // if there is a character to be titlecased, // or else stop with titleStart==titleLimit==index. bool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0; while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
titleStart=titleLimit; if(titleLimit==index) { break;
}
U16_NEXT(src, titleLimit, index, c);
} if (prev < titleStart) {
destIndex=appendUnchanged(dest, destIndex, destCapacity,
src+prev, titleStart-prev, options, edits); if(destIndex<0) {
errorCode=U_INDEX_OUTOFBOUNDS_ERROR; return 0;
}
}
}
if(titleStart<titleLimit) { /* titlecase c which is from [titleStart..titleLimit[ */
csc.cpStart=titleStart;
csc.cpLimit=titleLimit; const char16_t *s;
c=ucase_toFullTitle(c, utf16_caseContextIterator, &csc, &s, caseLocale);
destIndex=appendResult(dest, destIndex, destCapacity, c, s,
titleLimit-titleStart, options, edits); if(destIndex<0) {
errorCode=U_INDEX_OUTOFBOUNDS_ERROR; return 0;
}
/* Special case Dutch IJ titlecasing */ if (titleStart+1 < index &&
caseLocale == UCASE_LOC_DUTCH) { if (c < 0) {
c = ~c;
}
/**
 * Maps a combining diacritic code point to the HAS_* flags that describe it.
 *
 * @param c the code point to classify
 * @return HAS_ACCENT, HAS_COMBINING_DIALYTIKA, HAS_YPOGEGRAMMENI,
 *         HAS_OTHER_GREEK_DIACRITIC, or a combination of these;
 *         0 if c is not one of the diacritics listed below
 */
uint32_t getDiacriticData(UChar32 c) {
    switch (c) {
    case 0x0300:  // varia
    case 0x0301:  // tonos = oxia
    case 0x0342:  // perispomeni
    case 0x0302:  // circumflex can look like perispomeni
    case 0x0303:  // tilde can look like perispomeni
    case 0x0311:  // inverted breve can look like perispomeni
        return HAS_ACCENT;
    case 0x0308:  // dialytika = diaeresis
        return HAS_COMBINING_DIALYTIKA;
    case 0x0344:  // dialytika tonos
        return HAS_COMBINING_DIALYTIKA | HAS_ACCENT;
    case 0x0345:  // ypogegrammeni = iota subscript
        return HAS_YPOGEGRAMMENI;
    case 0x0304:  // macron
    case 0x0306:  // breve
    case 0x0313:  // comma above
    case 0x0314:  // reversed comma above
    case 0x0343:  // koronis
        return HAS_OTHER_GREEK_DIACRITIC;
    default:
        return 0;
    }
}
/**
 * Checks whether the text starting at index i continues with a cased letter,
 * skipping over any case-ignorable characters in between.
 *
 * @param s      the UTF-16 text
 * @param i      index at which to start looking
 * @param length limit index; code points are read from [i, length)
 * @return true if the first character that is not case-ignorable is cased;
 *         false if that character is uncased, or if the text ends first
 */
UBool isFollowedByCasedLetter(const char16_t *s, int32_t i, int32_t length) {
    while (i < length) {
        UChar32 c;
        U16_NEXT(s, i, length, c);  // reads one code point and advances i
        const int32_t type = ucase_getTypeOrIgnorable(c);
        if ((type & UCASE_IGNORABLE) != 0) {
            continue;  // Case-ignorable, continue with the loop.
        }
        // The first character that is not case-ignorable decides the result:
        // cased -> followed by a cased letter; uncased -> not.
        return type != UCASE_NONE;
    }
    return false;  // Reached the end: not followed by a cased letter.
}
/** * Greek string uppercasing with a state machine. * Probably simpler than a stateless function that has to figure out complex context-before * for each character. * TODO: Try to re-consolidate one way or another with the non-Greek function.
*/
int32_t toUpper(uint32_t options,
char16_t *dest, int32_t destCapacity, const char16_t *src, int32_t srcLength,
Edits *edits,
UErrorCode &errorCode) {
int32_t destIndex=0;
uint32_t state = 0; for (int32_t i = 0; i < srcLength;) {
int32_t nextIndex = i;
UChar32 c;
U16_NEXT(src, nextIndex, srcLength, c);
uint32_t nextState = 0;
int32_t type = ucase_getTypeOrIgnorable(c); if ((type & UCASE_IGNORABLE) != 0) { // c is case-ignorable
nextState |= (state & AFTER_CASED);
} elseif (type != UCASE_NONE) { // c is cased
nextState |= AFTER_CASED;
}
uint32_t data = getLetterData(c); if (data > 0) {
uint32_t upper = data & UPPER_MASK; // Add a dialytika to this iota or ypsilon vowel // if we removed a tonos from the previous vowel, // and that previous vowel did not also have (or gain) a dialytika. // Adding one only to the final vowel in a longer sequence // (which does not occur in normal writing) would require lookahead. // Set the same flag as for preserving an existing dialytika. if ((data & HAS_VOWEL) != 0 &&
(state & (AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT | AFTER_VOWEL_WITH_COMBINING_ACCENT)) !=
0 &&
(upper == 0x399 || upper == 0x3A5)) {
data |= (state & AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT) ? HAS_DIALYTIKA
: HAS_COMBINING_DIALYTIKA;
}
int32_t numYpogegrammeni = 0; // Map each one to a trailing, spacing, capital iota. if ((data & HAS_YPOGEGRAMMENI) != 0) {
numYpogegrammeni = 1;
} const UBool hasPrecomposedAccent = (data & HAS_ACCENT) != 0; // Skip combining diacritics after this Greek letter. while (nextIndex < srcLength) {
uint32_t diacriticData = getDiacriticData(src[nextIndex]); if (diacriticData != 0) {
data |= diacriticData; if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
++numYpogegrammeni;
}
++nextIndex;
} else { break; // not a Greek diacritic
}
} if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
nextState |= hasPrecomposedAccent ? AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT
: AFTER_VOWEL_WITH_COMBINING_ACCENT;
} // Map according to Greek rules.
UBool addTonos = false; if (upper == 0x397 &&
(data & HAS_ACCENT) != 0 &&
numYpogegrammeni == 0 &&
(state & AFTER_CASED) == 0 &&
!isFollowedByCasedLetter(src, nextIndex, srcLength)) { // Keep disjunctive "or" with (only) a tonos. // We use the same "word boundary" conditions as for the Final_Sigma test. if (hasPrecomposedAccent) {
upper = 0x389; // Preserve the precomposed form.
} else {
addTonos = true;
}
} elseif ((data & HAS_DIALYTIKA) != 0) { // Preserve a vowel with dialytika in precomposed form if it exists. if (upper == 0x399) {
upper = 0x3AA;
data &= ~HAS_EITHER_DIALYTIKA;
} elseif (upper == 0x3A5) {
upper = 0x3AB;
data &= ~HAS_EITHER_DIALYTIKA;
}
}
/* * This function is a copy of unorm_cmpEquivFold() minus the parts for * canonical equivalence. * Keep the functions in sync, and see there for how this works. * The duplication is for modularization: * It makes caseless (but not canonical caseless) matches independent of * the normalization code.
*/
/* stack element for previous-level source/decomposition pointers */
struct CmpEquivLevel {
    const char16_t *start;  /* start of the saved level's text */
    const char16_t *s;      /* saved current read position */
    const char16_t *limit;  /* saved limit pointer of that level */
};
/* Kept so that code spelling the type C-style continues to compile. */
typedef struct CmpEquivLevel CmpEquivLevel;
/** * Internal implementation code comparing string with case fold. * This function is called from u_strcmpFold() and u_caseInsensitivePrefixMatch(). * * @param s1 input string 1 * @param length1 length of string 1, or -1 (NUL terminated) * @param s2 input string 2 * @param length2 length of string 2, or -1 (NUL terminated) * @param options compare options * @param matchLen1 (output) length of partial prefix match in s1 * @param matchLen2 (output) length of partial prefix match in s2 * @param pErrorCode receives error status * @return The result of comparison
*/ static int32_t _cmpFold( const char16_t *s1, int32_t length1, const char16_t *s2, int32_t length2,
uint32_t options,
int32_t *matchLen1, int32_t *matchLen2,
UErrorCode *pErrorCode) {
int32_t cmpRes = 0;
/* current-level start/limit - s1/s2 as current */ const char16_t *start1, *start2, *limit1, *limit2;
/* points to the original start address */ const char16_t *org1, *org2;
/* points to the end of match + 1 */ const char16_t *m1, *m2;
/* case folding variables */ const char16_t *p;
int32_t length;
/* stacks of previous-level start/current/limit */
CmpEquivLevel stack1[2], stack2[2];
/* case folding buffers, only use current-level start/limit */
char16_t fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1];
/* track which is the current level per string */
int32_t level1, level2;
/* current code units, and code points for lookups */
UChar32 c1, c2, cp1, cp2;
/* no argument error checking because this itself is not an API */
/* * assume that at least the option U_COMPARE_IGNORE_CASE is set * otherwise this function would have to behave exactly as uprv_strCompare()
*/ if(U_FAILURE(*pErrorCode)) { return 0;
}
/* comparison loop */ for(;;) { /* * here a code unit value of -1 means "get another code unit" * below it will mean "this source is finished"
*/
if(c1<0) { /* get next code unit from string 1, post-increment */ for(;;) { if(s1==limit1 || ((c1=*s1)==0 && (limit1==nullptr || (options&_STRNCMP_STYLE)))) { if(level1==0) {
c1=-1; break;
}
} else {
++s1; break;
}
/* reached end of level buffer, pop one level */ do {
--level1;
start1=stack1[level1].start; /*Not uninitialized*/
} while(start1==nullptr);
s1=stack1[level1].s; /*Not uninitialized*/
limit1=stack1[level1].limit; /*Not uninitialized*/
}
}
if(c2<0) { /* get next code unit from string 2, post-increment */ for(;;) { if(s2==limit2 || ((c2=*s2)==0 && (limit2==nullptr || (options&_STRNCMP_STYLE)))) { if(level2==0) {
c2=-1; break;
}
} else {
++s2; break;
}
/* reached end of level buffer, pop one level */ do {
--level2;
start2=stack2[level2].start; /*Not uninitialized*/
} while(start2==nullptr);
s2=stack2[level2].s; /*Not uninitialized*/
limit2=stack2[level2].limit; /*Not uninitialized*/
}
}
/* * compare c1 and c2 * either variable c1, c2 is -1 only if the corresponding string is finished
*/ if(c1==c2) { const char16_t *next1, *next2;
if(c1<0) {
cmpRes=0; /* c1==c2==-1 indicating end of strings */ break;
}
/* * Note: Move the match positions in both strings at the same time * only when corresponding code point(s) in the original strings * are fully consumed. For example, when comparing s1="Fust" and * s2="Fu\u00dfball", s2[2] is folded into "ss", and s1[2] matches * the first code point in the case-folded data. But the second "s" * has no matching code point in s1, so this implementation returns * 2 as the prefix match length ("Fu").
*/
next1=next2=nullptr; if(level1==0) {
next1=s1;
} elseif(s1==limit1) { /* Note: This implementation only use a single level of stack. * If this code needs to be changed to use multiple levels * of stacks, the code above should check if the current * code is at the end of all stacks.
*/
U_ASSERT(level1==1);
/* is s1 at the end of the current stack? */
next1=stack1[0].s;
}
if (next1!=nullptr) { if(level2==0) {
next2=s2;
} elseif(s2==limit2) {
U_ASSERT(level2==1);
/* is s2 at the end of the current stack? */
next2=stack2[0].s;
} if(next2!=nullptr) {
m1=next1;
m2=next2;
}
}
c1=c2=-1; /* make us fetch new code units */ continue;
} elseif(c1<0) {
cmpRes=-1; /* string 1 ends before string 2 */ break;
} elseif(c2<0) {
cmpRes=1; /* string 2 ends before string 1 */ break;
} /* c1!=c2 && c1>=0 && c2>=0 */
/* get complete code points for c1, c2 for lookups if either is a surrogate */
cp1=c1; if(U_IS_SURROGATE(c1)) {
char16_t c;
/* * go down one level for each string * continue with the main loop as soon as there is a real change
*/
if( level1==0 &&
(length = ucase_toFullFolding(cp1, &p, options)) >= 0
) { /* cp1 case-folds to the code point "length" or to p[length] */ if(U_IS_SURROGATE(c1)) { if(U_IS_SURROGATE_LEAD(c1)) { /* advance beyond source surrogate pair if it case-folds */
++s1;
} else/* isTrail(c1) */ { /* * we got a supplementary code point when hitting its trail surrogate, * therefore the lead surrogate must have been the same as in the other string; * compare this decomposition with the lead surrogate in the other string * remember that this simulates bulk text replacement: * the decomposition would replace the entire code point
*/
--s2;
--m2;
c2=*(s2-1);
}
}
/* push current level pointers */
stack1[0].start=start1;
stack1[0].s=s1;
stack1[0].limit=limit1;
++level1;
/* copy the folding result to fold1[] */ if(length<=UCASE_MAX_STRING_LENGTH) {
u_memcpy(fold1, p, length);
} else {
int32_t i=0;
U16_APPEND_UNSAFE(fold1, i, length);
length=i;
}
/* set next level pointers to case folding */
start1=s1=fold1;
limit1=fold1+length;
/* get ready to read from decomposition, continue with loop */
c1=-1; continue;
}
if( level2==0 &&
(length = ucase_toFullFolding(cp2, &p, options)) >= 0
) { /* cp2 case-folds to the code point "length" or to p[length] */ if(U_IS_SURROGATE(c2)) { if(U_IS_SURROGATE_LEAD(c2)) { /* advance beyond source surrogate pair if it case-folds */
++s2;
} else/* isTrail(c2) */ { /* * we got a supplementary code point when hitting its trail surrogate, * therefore the lead surrogate must have been the same as in the other string; * compare this decomposition with the lead surrogate in the other string * remember that this simulates bulk text replacement: * the decomposition would replace the entire code point
*/
--s1;
--m2;
c1=*(s1-1);
}
}
/* push current level pointers */
stack2[0].start=start2;
stack2[0].s=s2;
stack2[0].limit=limit2;
++level2;
/* copy the folding result to fold2[] */ if(length<=UCASE_MAX_STRING_LENGTH) {
u_memcpy(fold2, p, length);
} else {
int32_t i=0;
U16_APPEND_UNSAFE(fold2, i, length);
length=i;
}
/* set next level pointers to case folding */
start2=s2=fold2;
limit2=fold2+length;
/* get ready to read from decomposition, continue with loop */
c2=-1; continue;
}
/* * no decomposition/case folding, max level for both sides: * return difference result * * code point order comparison must not just return cp1-cp2 * because when single surrogates are present then the surrogate pairs * that formed cp1 and cp2 may be from different string indexes * * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units * c1=d800 cp1=10001 c2=dc00 cp2=10000 * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 } * * therefore, use same fix-up as in ustring.c/uprv_strCompare() * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++ * so we have slightly different pointer/start/limit comparisons here
*/
if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) { /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ if(
(c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) ||
(U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2)))
) { /* part of a surrogate pair, leave >=d800 */
} else { /* BMP code point - may be surrogate code point - make <d800 */
c1-=0x2800;
}
if(
(c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) ||
(U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2)))
) { /* part of a surrogate pair, leave >=d800 */
} else { /* BMP code point - may be surrogate code point - make <d800 */
c2-=0x2800;
}
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.