/** * Input: c is a letter I with or without acute accent. * start is the index in src after c, and is less than segmentLimit. * If a plain i/I is followed by a plain j/J, * or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute, * then we output accordingly. * * @return the src index after the titlecased sequence, or the start index if no Dutch IJ
*/
int32_t maybeTitleDutchIJ(const uint8_t *src, UChar32 c, int32_t start, int32_t segmentLimit,
ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) {
U_ASSERT(start < segmentLimit);
int32_t index = start; bool withAcute = false;
// If the conditions are met, then the following variables tell us what to output.
int32_t unchanged1 = 0; // code units before the j, or the whole sequence (0..3) bool doTitleJ = false; // true if the j needs to be titlecased
int32_t unchanged2 = 0; // after the j (0 or 1)
// next character after the first letter
UChar32 c2;
c2 = src[index++];
// Is the first letter an i/I with accent? if (c == u'I') { if (c2 == ACUTE_BYTE0 && index < segmentLimit && src[index++] == ACUTE_BYTE1) {
withAcute = true;
unchanged1 = 2; // ACUTE is 2 code units in UTF-8 if (index == segmentLimit) { return start; }
c2 = src[index++];
}
} else { // Í
withAcute = true;
}
// Is the next character a j/J? if (c2 == u'j') {
doTitleJ = true;
} elseif (c2 == u'J') {
++unchanged1;
} else { return start;
}
// A plain i/I must be followed by a plain j/J. // An i/I with acute must be followed by a j/J with acute. if (withAcute) { if ((index + 1) >= segmentLimit || src[index++] != ACUTE_BYTE0 || src[index++] != ACUTE_BYTE1) { return start;
} if (doTitleJ) {
unchanged2 = 2; // ACUTE is 2 code units in UTF-8
} else {
unchanged1 = unchanged1 + 2; // ACUTE is 2 code units in UTF-8
}
}
// There must not be another combining mark. if (index < segmentLimit) {
int32_t cp;
int32_t i = index;
U8_NEXT(src, i, segmentLimit, cp);
uint32_t typeMask = U_GET_GC_MASK(cp); if ((typeMask & U_GC_M_MASK) != 0) { return start;
}
}
// Output the rest of the Dutch IJ.
ByteSinkUtil::appendUnchanged(src + start, unchanged1, sink, options, edits, errorCode);
start += unchanged1; if (doTitleJ) {
ByteSinkUtil::appendCodePoint(1, u'J', sink, edits);
++start;
}
ByteSinkUtil::appendUnchanged(src + start, unchanged2, sink, options, edits, errorCode);
/* set up local variables */
UCaseContext csc=UCASECONTEXT_INITIALIZER;
csc.p=(void *)src;
csc.limit=srcLength;
int32_t prev=0;
UBool isFirstIndex=true;
/* titlecasing loop */ while(prev<srcLength) { /* find next index where to titlecase */
int32_t index; if(isFirstIndex) {
isFirstIndex=false;
index=iter->first();
} else {
index=iter->next();
} if(index==UBRK_DONE || index>srcLength) {
index=srcLength;
}
/* * Segment [prev..index[ into 3 parts: * a) skipped characters (copy as-is) [prev..titleStart[ * b) first letter (titlecase) [titleStart..titleLimit[ * c) subsequent characters (lowercase) [titleLimit..index[
*/ if(prev<index) { /* find and copy skipped characters [prev..titleStart[ */
int32_t titleStart=prev;
int32_t titleLimit=prev;
UChar32 c;
U8_NEXT(src, titleLimit, index, c); if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0) { // Adjust the titlecasing index to the next cased character, // or to the next letter/number/symbol/private use. // Stop with titleStart<titleLimit<=index // if there is a character to be titlecased, // or else stop with titleStart==titleLimit==index.
UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0; while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
titleStart=titleLimit; if(titleLimit==index) { break;
}
U8_NEXT(src, titleLimit, index, c);
} if (prev < titleStart) { if (!ByteSinkUtil::appendUnchanged(src+prev, titleStart-prev,
sink, options, edits, errorCode)) { return;
}
}
}
if(titleStart<titleLimit) { /* titlecase c which is from [titleStart..titleLimit[ */ if(c>=0) {
csc.cpStart=titleStart;
csc.cpLimit=titleLimit; const char16_t *s;
c=ucase_toFullTitle(c, utf8_caseContextIterator, &csc, &s, caseLocale); if (!appendResult(titleLimit-titleStart, c, s, sink, options, edits, errorCode)) { return;
}
} else { // Malformed UTF-8. if (!ByteSinkUtil::appendUnchanged(src+titleStart, titleLimit-titleStart,
sink, options, edits, errorCode)) { return;
}
}
/* Special case Dutch IJ titlecasing */ if (titleLimit < index &&
caseLocale == UCASE_LOC_DUTCH) { if (c < 0) {
c = ~c;
}
/* lowercase [titleLimit..index[ */ if(titleLimit<index) { if((options&U_TITLECASE_NO_LOWERCASE)==0) { /* Normal operation: Lowercase the rest of the word. */
toLower(caseLocale, options,
src, &csc, titleLimit, index,
sink, edits, errorCode); if(U_FAILURE(errorCode)) { return;
}
} else { /* Optionally just copy the rest of the word unchanged. */ if (!ByteSinkUtil::appendUnchanged(src+titleLimit, index-titleLimit,
sink, options, edits, errorCode)) { return;
}
}
}
}
}
prev=index;
}
}
#endif
U_NAMESPACE_BEGIN namespace GreekUpper {
/**
 * Reports whether the text starting at byte index i is followed by a cased letter.
 *
 * Code points are read one at a time with U8_NEXT. Code points whose type has
 * the UCASE_IGNORABLE bit set are skipped; the first code point without that
 * bit decides the result.
 *
 * @param s      the UTF-8 text
 * @param i      index in s at which to start looking
 * @param length exclusive end index in s
 * @return true if the first non-case-ignorable code point has a type other
 *         than UCASE_NONE; false if it is uncased or if the end of the text
 *         is reached first
 */
UBool isFollowedByCasedLetter(const uint8_t *s, int32_t i, int32_t length) {
    while (i < length) {
        UChar32 c;
        U8_NEXT(s, i, length, c);
        int32_t type = ucase_getTypeOrIgnorable(c);
        if ((type & UCASE_IGNORABLE) != 0) {
            // Case-ignorable, continue with the loop.
        } else if (type != UCASE_NONE) {
            return true;  // Followed by cased letter.
        } else {
            return false;  // Uncased and not case-ignorable.
        }
    }
    return false;  // Not followed by cased letter.
}
// Keep this consistent with the UTF-16 version in ustrcase.cpp and the Java version in CaseMap.java. void toUpper(uint32_t options, const uint8_t *src, int32_t srcLength,
ByteSink &sink, Edits *edits,
UErrorCode &errorCode) {
uint32_t state = 0; for (int32_t i = 0; i < srcLength;) {
int32_t nextIndex = i;
UChar32 c;
U8_NEXT(src, nextIndex, srcLength, c);
uint32_t nextState = 0;
int32_t type = ucase_getTypeOrIgnorable(c); if ((type & UCASE_IGNORABLE) != 0) { // c is case-ignorable
nextState |= (state & AFTER_CASED);
} elseif (type != UCASE_NONE) { // c is cased
nextState |= AFTER_CASED;
}
uint32_t data = getLetterData(c); if (data > 0) {
uint32_t upper = data & UPPER_MASK; // Add a dialytika to this iota or ypsilon vowel // if we removed a tonos from the previous vowel, // and that previous vowel did not also have (or gain) a dialytika. // Adding one only to the final vowel in a longer sequence // (which does not occur in normal writing) would require lookahead. // Set the same flag as for preserving an existing dialytika. if ((data & HAS_VOWEL) != 0 &&
(state & (AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT | AFTER_VOWEL_WITH_COMBINING_ACCENT)) !=
0 &&
(upper == 0x399 || upper == 0x3A5)) {
data |= (state & AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT) != 0 ? HAS_DIALYTIKA
: HAS_COMBINING_DIALYTIKA;
}
int32_t numYpogegrammeni = 0; // Map each one to a trailing, spacing, capital iota. if ((data & HAS_YPOGEGRAMMENI) != 0) {
numYpogegrammeni = 1;
} const UBool hasPrecomposedAccent = (data & HAS_ACCENT) != 0; // Skip combining diacritics after this Greek letter.
int32_t nextNextIndex = nextIndex; while (nextIndex < srcLength) {
UChar32 c2;
U8_NEXT(src, nextNextIndex, srcLength, c2);
uint32_t diacriticData = getDiacriticData(c2); if (diacriticData != 0) {
data |= diacriticData; if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
++numYpogegrammeni;
}
nextIndex = nextNextIndex;
} else { break; // not a Greek diacritic
}
} if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
nextState |= hasPrecomposedAccent ? AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT
: AFTER_VOWEL_WITH_COMBINING_ACCENT;
} // Map according to Greek rules.
UBool addTonos = false; if (upper == 0x397 &&
(data & HAS_ACCENT) != 0 &&
numYpogegrammeni == 0 &&
(state & AFTER_CASED) == 0 &&
!isFollowedByCasedLetter(src, nextIndex, srcLength)) { // Keep disjunctive "or" with (only) a tonos. // We use the same "word boundary" conditions as for the Final_Sigma test. if (hasPrecomposedAccent) {
upper = 0x389; // Preserve the precomposed form.
} else {
addTonos = true;
}
} elseif ((data & HAS_DIALYTIKA) != 0) { // Preserve a vowel with dialytika in precomposed form if it exists. if (upper == 0x399) {
upper = 0x3AA;
data &= ~HAS_EITHER_DIALYTIKA;
} elseif (upper == 0x3A5) {
upper = 0x3AB;
data &= ~HAS_EITHER_DIALYTIKA;
}
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.