/* * Test if a substring match inside a string is at code point boundaries. * All pointers refer to the same buffer. * The limit pointer may be nullptr, all others must be real pointers.
*/ staticinline UBool
isMatchAtCPBoundary(const char16_t *start, const char16_t *match, const char16_t *matchLimit, const char16_t *limit) { if(U16_IS_TRAIL(*match) && start!=match && U16_IS_LEAD(*(match-1))) { /* the leading edge of the match is in the middle of a surrogate pair */ returnfalse;
} if(U16_IS_LEAD(*(matchLimit-1)) && matchLimit!=limit && U16_IS_TRAIL(*matchLimit)) { /* the trailing edge of the match is in the middle of a surrogate pair */ returnfalse;
} returntrue;
}
/*
 * Find the first occurrence of the code unit c in the NUL-terminated string s.
 * Returns a pointer to that unit, or nullptr if it does not occur.
 * A surrogate c is handed to u_strFindFirst() so that half of a surrogate
 * pair is never reported as a match.
 */
U_CAPI char16_t * U_EXPORT2
u_strchr(const char16_t *s, char16_t c) {
    if(U16_IS_SURROGATE(c)) {
        /* make sure to not find half of a surrogate pair */
        return u_strFindFirst(s, -1, &c, 1);
    }

    /*
     * Plain scan for a BMP code point. The equality test comes before the
     * NUL test, so searching for c==0 yields the terminator itself.
     */
    for(const char16_t *p=s;; ++p) {
        const char16_t unit=*p;
        if(unit==c) {
            return (char16_t *)p;
        }
        if(unit==0) {
            return nullptr;
        }
    }
}
/*
 * Find the first occurrence of the code point c in the NUL-terminated string s.
 * BMP code points are delegated to u_strchr(); supplementary code points are
 * searched for as a lead/trail surrogate pair. Values above UCHAR_MAX_VALUE
 * are not code points and are never found.
 * Returns a pointer to the first code unit of the match, or nullptr.
 */
U_CAPI char16_t * U_EXPORT2
u_strchr32(const char16_t *s, UChar32 c) {
    if((uint32_t)c<=U_BMP_MAX) {
        /* find BMP code point */
        return u_strchr(s, (char16_t)c);
    } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
        /* find supplementary code point as surrogate pair */
        char16_t cs, lead=U16_LEAD(c), trail=U16_TRAIL(c);

        /* s already points at the following unit when cs is tested; at worst that is the NUL */
        while((cs=*s++)!=0) {
            if(cs==lead && *s==trail) {
                return (char16_t *)(s-1);
            }
        }
        return nullptr;
    } else {
        /* not a Unicode code point, not findable */
        return nullptr;
    }
}
/*
 * Find the first occurrence of the code unit c within the first count units of s.
 * Returns a pointer to that unit, or nullptr if count<=0 or c does not occur.
 * A surrogate c is handed to u_strFindFirst() so that half of a surrogate
 * pair is never reported as a match.
 */
U_CAPI char16_t * U_EXPORT2
u_memchr(const char16_t *s, char16_t c, int32_t count) {
    if(count<=0) {
        return nullptr; /* no string */
    } else if(U16_IS_SURROGATE(c)) {
        /* make sure to not find half of a surrogate pair */
        return u_strFindFirst(s, count, &c, 1);
    } else {
        /* trivial search for a BMP code point */
        const char16_t *limit=s+count;
        do {
            if(*s==c) {
                return (char16_t *)s;
            }
        } while(++s!=limit);
        return nullptr;
    }
}
/*
 * Find the first occurrence of the code point c within the first count units of s.
 * BMP code points are delegated to u_memchr(); supplementary code points are
 * searched for as a lead/trail surrogate pair. Values above UCHAR_MAX_VALUE
 * are not code points and are never found.
 * Returns a pointer to the first code unit of the match, or nullptr.
 */
U_CAPI char16_t * U_EXPORT2
u_memchr32(const char16_t *s, UChar32 c, int32_t count) {
    if((uint32_t)c<=U_BMP_MAX) {
        /* find BMP code point */
        return u_memchr(s, (char16_t)c, count);
    } else if(count<2) {
        /* too short for a surrogate pair */
        return nullptr;
    } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
        /* find supplementary code point as surrogate pair */
        const char16_t *limit=s+count-1; /* -1 so that we do not need a separate check for the trail unit */
        char16_t lead=U16_LEAD(c), trail=U16_TRAIL(c);

        do {
            if(*s==lead && *(s+1)==trail) {
                return (char16_t *)s;
            }
        } while(++s!=limit);
        return nullptr;
    } else {
        /* not a Unicode code point, not findable */
        return nullptr;
    }
}
/* * This implementation is more lazy than the one for u_strFindFirst(): * There is no special search code for NUL-terminated strings. * It does not seem to be worth it for searching substrings to * search forward and find all matches like in u_strrchr() and similar. * Therefore, we simply get both string lengths and search backward. * * markus 2002oct23
*/
/* * Match each code point in a string against each code point in the matchSet. * Return the index of the first string code point that * is (polarity==true) or is not (false) contained in the matchSet. * Return -(string length)-1 if there is no such code point.
*/ static int32_t
_matchFromSet(const char16_t *string, const char16_t *matchSet, UBool polarity) {
int32_t matchLen, matchBMPLen, strItr, matchItr;
UChar32 stringCh, matchCh;
char16_t c, c2;
/* first part of matchSet contains only BMP code points */
matchBMPLen = 0; while((c = matchSet[matchBMPLen]) != 0 && U16_IS_SINGLE(c)) {
++matchBMPLen;
}
/* second part of matchSet contains BMP and supplementary code points */
matchLen = matchBMPLen; while(matchSet[matchLen] != 0) {
++matchLen;
}
/*
 * Return a pointer to the first code point in string that is also contained
 * in matchSet, or nullptr if there is none.
 */
U_CAPI char16_t * U_EXPORT2
u_strpbrk(const char16_t *string, const char16_t *matchSet)
{
    /* a negative result from _matchFromSet() means "no match" */
    const int32_t pos = _matchFromSet(string, matchSet, true);
    if(pos < 0) {
        return nullptr;
    }
    return (char16_t *)string + pos;
}
/*
 * Return the index of the first code point in string that is contained in
 * matchSet, i.e. the length of the leading run that avoids matchSet.
 * If nothing matches, the result is the length of string.
 */
U_CAPI int32_t U_EXPORT2
u_strcspn(const char16_t *string, const char16_t *matchSet)
{
    const int32_t pos = _matchFromSet(string, matchSet, true);
    /* "no match" is encoded as -(string length)-1; decode it back to the length */
    return pos < 0 ? -pos - 1 : pos;
}
/*
 * Return the index of the first code point in string that is NOT contained in
 * matchSet, i.e. the length of the leading run drawn entirely from matchSet.
 * If every code point is in matchSet, the result is the length of string.
 */
U_CAPI int32_t U_EXPORT2
u_strspn(const char16_t *string, const char16_t *matchSet)
{
    const int32_t pos = _matchFromSet(string, matchSet, false);
    /* "no such code point" is encoded as -(string length)-1; decode it back to the length */
    return pos < 0 ? -pos - 1 : pos;
}
/* If saveState is nullptr, the user messed up. */ if (src != nullptr) {
tokSource = src;
*saveState = src; /* Set to "src" in case there are no delimiters */
} elseif (*saveState) {
tokSource = *saveState;
} else { /* src == nullptr && *saveState == nullptr */ /* This shouldn't happen. We already finished tokenizing. */ return nullptr;
}
/* compare identical prefixes - they do not need to be fixed up */ if(length1<0 && length2<0) { /* strcmp style, both NUL-terminated */ if(s1==s2) { return 0;
}
/* setup for fix-up */
limit1=limit2=nullptr;
} elseif(strncmpStyle) { /* special handling for strncmp, assume length1==length2>=0 but also check for NUL */ if(s1==s2) { return 0;
}
limit1=start1+length1;
for(;;) { /* both lengths are same, check only one limit */ if(s1==limit1) { return 0;
}
/* setup for fix-up */
limit1=start1+length1;
limit2=start2+length2;
}
/* if both values are in or above the surrogate range, fix them up */ if(c1>=0xd800 && c2>=0xd800 && codePointOrder) { /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ if(
(c1<=0xdbff && (s1+1)!=limit1 && U16_IS_TRAIL(*(s1+1))) ||
(U16_IS_TRAIL(c1) && start1!=s1 && U16_IS_LEAD(*(s1-1)))
) { /* part of a surrogate pair, leave >=d800 */
} else { /* BMP code point - may be surrogate code point - make <d800 */
c1-=0x2800;
}
if(
(c2<=0xdbff && (s2+1)!=limit2 && U16_IS_TRAIL(*(s2+1))) ||
(U16_IS_TRAIL(c2) && start2!=s2 && U16_IS_LEAD(*(s2-1)))
) { /* part of a surrogate pair, leave >=d800 */
} else { /* BMP code point - may be surrogate code point - make <d800 */
c2-=0x2800;
}
}
/* now c1 and c2 are in the requested (code unit or code point) order */ return (int32_t)c1-(int32_t)c2;
}
/*
 * Compare two strings as presented by UCharIterators.
 * Use code unit or code point order.
 * When the function returns, it is undefined where the iterators
 * have stopped.
 *
 * Returns 0 if both iterators yield the same units up to their ends,
 * otherwise the difference of the first differing units (after the
 * code point order fix-up, if requested).
 */
U_CAPI int32_t U_EXPORT2
u_strCompareIter(UCharIterator *iter1, UCharIterator *iter2, UBool codePointOrder) {
    UChar32 unit1, unit2;

    /* walk the common prefix - identical units never need a fix-up */
    do {
        unit1=iter1->next(iter1);
        unit2=iter2->next(iter2);
        if(unit1==unit2 && unit1==-1) {
            /* both iterators reported the end (-1) together */
            return 0;
        }
    } while(unit1==unit2);

    /* if both values are in or above the surrogate range, fix them up */
    if(codePointOrder && unit1>=0xd800 && unit2>=0xd800) {
        /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
        UBool inPair;

        /* first string: is unit1 half of a well-formed surrogate pair? */
        if(unit1<=0xdbff && U16_IS_TRAIL(iter1->current(iter1))) {
            inPair=true;        /* lead followed by trail */
        } else if(U16_IS_TRAIL(unit1)) {
            iter1->previous(iter1);     /* step back over unit1 itself */
            inPair=U16_IS_LEAD(iter1->previous(iter1));
        } else {
            inPair=false;
        }
        if(!inPair) {
            /* BMP code point - may be surrogate code point - make <d800 */
            unit1-=0x2800;
        }

        /* second string: same test for unit2 */
        if(unit2<=0xdbff && U16_IS_TRAIL(iter2->current(iter2))) {
            inPair=true;
        } else if(U16_IS_TRAIL(unit2)) {
            iter2->previous(iter2);
            inPair=U16_IS_LEAD(iter2->previous(iter2));
        } else {
            inPair=false;
        }
        if(!inPair) {
            unit2-=0x2800;
        }
    }

    /* now both units are in the requested (code unit or code point) order */
    return (int32_t)unit1-(int32_t)unit2;
}
#if 0
/*
 * u_strCompareIter() does not leave the iterators _on_ the different units.
 * This is possible but would cost a few extra indirect function calls to back
 * up if the last unit (c1 or c2 respectively) was >=0.
 *
 * Consistently leaving them _behind_ the different units is not an option
 * because the current "unit" is the end of the string if that is reached,
 * and in such a case the iterator does not move.
 * For example, when comparing "ab" with "abc", both iterators rest _on_ the end
 * of their strings. Calling previous() on each does not move them to where
 * the comparison fails.
 *
 * So the simplest semantics is to not define where the iterators end up.
 *
 * The following fragment is part of what would need to be done for backing up.
 */
void fragment {
    /* iff a surrogate is part of a surrogate pair, leave >=d800 */
    if(c1<=0xdbff) {
        if(!U16_IS_TRAIL(iter1->current(iter1))) {
            /* lead surrogate code point - make <d800 */
            c1-=0x2800;
        }
    } else if(c1<=0xdfff) {
        int32_t idx=iter1->getIndex(iter1, UITER_CURRENT);
        iter1->previous(iter1); /* ==c1 */
        if(!U16_IS_LEAD(iter1->previous(iter1))) {
            /* trail surrogate code point - make <d800 */
            c1-=0x2800;
        }
        /* go back to behind where the difference is */
        iter1->move(iter1, idx, UITER_ZERO);
    } else /* 0xe000<=c1<=0xffff */ {
        /* BMP code point - make <d800 */
        c1-=0x2800;
    }
}
#endif
/* * sufficient to look ahead one because of UTF-16; * safe to look ahead one because at worst that would be the terminating NUL
*/ if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) {
++s;
}
}
} return count;
}
/* s contains at least (length+1)/2 code points: <=2 UChars per cp */ if(((length+1)/2)>number) { returntrue;
}
/* check if s does not even contain enough UChars */
maxSupplementary=length-number; if(maxSupplementary<=0) { returnfalse;
} /* there are maxSupplementary=length-number more UChars than asked-for code points */
/* * count code points until they exceed and also check that there are * no more than maxSupplementary supplementary code points (char16_t pairs)
*/
limit=s+length; for(;;) { if(s==limit) { returnfalse;
} if(number==0) { returntrue;
} if(U16_IS_LEAD(*s++) && s!=limit && U16_IS_TRAIL(*s)) {
++s; if(--maxSupplementary<=0) { /* too many pairs - too few code points */ returnfalse;
}
}
--number;
}
}
}
/* u_unescape & support fns ------------------------------------------------- */
/*
 * Table of single-letter C-style escapes, stored as consecutive
 * (escape letter, resulting code unit) pairs.
 * This map must be in ASCENDING ORDER OF THE ESCAPE CODE:
 * the lookup in u_unescapeAt() stops as soon as it passes the searched letter.
 * The commented-out entries map a character to itself, which the generic
 * "backslash escapes the next character" rule already covers.
 */
static const char16_t UNESCAPE_MAP[] = {
    /*"   0x22, 0x22 */
    /*'   0x27, 0x27 */
    /*?   0x3F, 0x3F */
    /*\   0x5C, 0x5C */
    /*a*/ 0x61, 0x07,
    /*b*/ 0x62, 0x08,
    /*e*/ 0x65, 0x1b,
    /*f*/ 0x66, 0x0c,
    /*n*/ 0x6E, 0x0a,
    /*r*/ 0x72, 0x0d,
    /*t*/ 0x74, 0x09,
    /*v*/ 0x76, 0x0b
};
/* number of char16_t entries in the table (two per escape) */
enum { UNESCAPE_MAP_LENGTH = UPRV_LENGTHOF(UNESCAPE_MAP) };
/*
 * Map an octal digit character (u'0'..u'7') to its value 0..7.
 * Any other code unit yields -1.
 */
static int32_t _digit8(char16_t c) {
    const int isOctalDigit = (u'0' <= c && c <= u'7');
    return isOctalDigit ? (int32_t)(c - u'0') : -1;
}
/*
 * Map a hexadecimal digit character (0-9, A-F, a-f) to its value 0..15.
 * Any other code unit yields -1.
 */
static int32_t _digit16(char16_t c) {
    int32_t value = -1;

    if (u'0' <= c && c <= u'9') {
        value = c - u'0';
    } else if (u'a' <= c && c <= u'f') {
        value = 10 + (c - u'a');
    } else if (u'A' <= c && c <= u'F') {
        value = 10 + (c - u'A');
    }
    return value;
}
/*
 * Parse a single escape sequence.  Although this method deals in
 * UChars, it does not use C++ or UnicodeString.  This allows it to
 * be used from C contexts.
 *
 * charAt  - callback that returns the code unit at a given index of the text
 * offset  - in: index of the first unit AFTER the backslash;
 *           out: index just past the parsed escape, or unchanged on error
 * length  - number of units available through charAt
 * context - opaque pointer handed through to charAt
 *
 * Recognized forms, as implemented below: \uhhhh, \Uhhhhhhhh, \xh[h],
 * \x{h...} (1-8 digits), \o[o[o]] (octal), the letters in UNESCAPE_MAP,
 * \cX (control-X), and otherwise "backslash + any character" meaning that
 * character itself. An escaped lead surrogate followed by a trail surrogate
 * (escaped or literal) is joined into one supplementary code point.
 *
 * Returns the code point, or (UChar32)0xFFFFFFFF for an invalid sequence.
 */
U_CAPI UChar32 U_EXPORT2
u_unescapeAt(UNESCAPE_CHAR_AT charAt,
             int32_t *offset,
             int32_t length,
             void *context) {

    /*
     * All locals are declared up front so that "goto err" never jumps
     * over an initialization.
     */
    int32_t start = *offset;    /* restored into *offset on error */
    UChar32 c;
    UChar32 result = 0;
    int8_t n = 0;               /* number of digits consumed so far */
    int8_t minDig = 0;          /* 0 means "not a numeric escape" */
    int8_t maxDig = 0;
    int8_t bitsPerDigit = 4;    /* hex by default, 3 for octal */
    int32_t dig;
    UBool braces = false;

    /* Check that offset is in range */
    if (*offset < 0 || *offset >= length) {
        goto err;
    }

    /* Fetch first char16_t after '\\' */
    c = charAt((*offset)++, context);

    /* Convert hexadecimal and octal escapes */
    switch (c) {
    case u'u':
        minDig = maxDig = 4;
        break;
    case u'U':
        minDig = maxDig = 8;
        break;
    case u'x':
        minDig = 1;
        if (*offset < length && charAt(*offset, context) == u'{') {
            ++(*offset);
            braces = true;
            maxDig = 8;
        } else {
            maxDig = 2;
        }
        break;
    default:
        dig = _digit8(c);
        if (dig >= 0) {
            minDig = 1;
            maxDig = 3;
            n = 1; /* Already have first octal digit */
            bitsPerDigit = 3;
            result = dig;
        }
        break;
    }
    if (minDig != 0) {
        while (*offset < length && n < maxDig) {
            c = charAt(*offset, context);
            dig = (bitsPerDigit == 3) ? _digit8(c) : _digit16(c);
            if (dig < 0) {
                break;
            }
            result = (result << bitsPerDigit) | dig;
            ++(*offset);
            ++n;
        }
        if (n < minDig) {
            goto err;
        }
        if (braces) {
            /*
             * Look at the unit that actually follows the digits. After the
             * maximum of 8 digits the loop stops without fetching it, so
             * testing the last fetched unit would reject "x{0000DFFF}".
             */
            if (*offset >= length || charAt(*offset, context) != u'}') {
                goto err;
            }
            ++(*offset);
        }
        if (result < 0 || result >= 0x110000) {
            goto err;
        }
        /* If an escape sequence specifies a lead surrogate, see if
         * there is a trail surrogate after it, either as an escape or
         * as a literal.  If so, join them up into a supplementary.
         */
        if (*offset < length && U16_IS_LEAD(result)) {
            int32_t ahead = *offset + 1;
            c = charAt(*offset, context);
            if (c == u'\\' && ahead < length) {
                // Calling ourselves recursively may cause a stack overflow if
                // we have repeated escaped lead surrogates.
                // Limit the length to 11 ("x{0000DFFF}") after ahead.
                int32_t tailLimit = ahead + 11;
                if (tailLimit > length) {
                    tailLimit = length;
                }
                c = u_unescapeAt(charAt, &ahead, tailLimit, context);
            }
            if (U16_IS_TRAIL(c)) {
                *offset = ahead;
                result = U16_GET_SUPPLEMENTARY(result, c);
            }
        }
        return result;
    }

    /* Convert C-style escapes in table */
    for (int32_t i=0; i<UNESCAPE_MAP_LENGTH; i+=2) {
        if (c == UNESCAPE_MAP[i]) {
            return UNESCAPE_MAP[i+1];
        } else if (c < UNESCAPE_MAP[i]) {
            /* the table is sorted, so c cannot appear further on */
            break;
        }
    }

    /* Map \cX to control-X: X & 0x1F */
    if (c == u'c' && *offset < length) {
        c = charAt((*offset)++, context);
        if (U16_IS_LEAD(c) && *offset < length) {
            char16_t c2 = charAt(*offset, context);
            if (U16_IS_TRAIL(c2)) {
                ++(*offset);
                c = U16_GET_SUPPLEMENTARY(c, c2);
            }
        }
        return 0x1F & c;
    }

    /* If no special forms are recognized, then consider
     * the backslash to generically escape the next character.
     * Deal with surrogate pairs. */
    if (U16_IS_LEAD(c) && *offset < length) {
        char16_t c2 = charAt(*offset, context);
        if (U16_IS_TRAIL(c2)) {
            ++(*offset);
            return U16_GET_SUPPLEMENTARY(c, c2);
        }
    }
    return c;

 err:
    /* Invalid escape sequence */
    *offset = start; /* Reset to initial value */
    return (UChar32)0xFFFFFFFF;
}
/*
 * u_unescapeAt() callback: context is the char* text being parsed;
 * return the char at the given offset converted to a char16_t.
 */
static char16_t U_CALLCONV
_charPtr_charAt(int32_t offset, void *context) {
    const char *text = static_cast<char*>(context);
    char16_t unit;

    /* It would be more efficient to access the invariant tables
     * directly but there is no API for that. */
    u_charsToUChars(text + offset, &unit, 1);
    return unit;
}
/* Append an escape-free segment of the text; used by u_unescape() */ staticvoid _appendUChars(char16_t *dest, int32_t destCapacity, constchar *src, int32_t srcLen) { if (destCapacity < 0) {
destCapacity = 0;
} if (srcLen > destCapacity) {
srcLen = destCapacity;
}
u_charsToUChars(src, dest, srcLen);
}
/*
 * Do an invariant conversion of char* -> char16_t*, with escape parsing.
 *
 * src          - NUL-terminated char string that may contain backslash escapes
 * dest         - output buffer, or nullptr to only compute the length
 * destCapacity - capacity of dest in char16_t units
 *
 * Returns the number of char16_t units the full result needs (not counting
 * a terminating NUL), which may exceed destCapacity. The output is
 * NUL-terminated only if there is room for the NUL.
 * On an invalid escape sequence dest is set to the empty string (if it has
 * any capacity) and 0 is returned.
 */
U_CAPI int32_t U_EXPORT2
u_unescape(const char *src, char16_t *dest, int32_t destCapacity) {
    const char *segment = src;  /* start of the pending escape-free run */
    int32_t i = 0;              /* units produced (or needed) so far */
    char c;

    while ((c=*src) != 0) {
        /* '\\' intentionally written as compiler-specific
         * character constant to correspond to compiler-specific
         * char* constants. */
        if (c == '\\') {
            int32_t lenParsed = 0;
            UChar32 c32;
            /* flush the literal text collected before this escape */
            if (src != segment) {
                if (dest != nullptr) {
                    _appendUChars(dest + i, destCapacity - i,
                                  segment, (int32_t)(src - segment));
                }
                i += (int32_t)(src - segment);
            }
            ++src; /* advance past '\\' */
            c32 = u_unescapeAt(_charPtr_charAt, &lenParsed, (int32_t)uprv_strlen(src), const_cast<char*>(src));
            /* u_unescapeAt() leaves the offset untouched when the escape is invalid */
            if (lenParsed == 0) {
                goto err;
            }
            src += lenParsed; /* advance past escape seq. */
            if (dest != nullptr && U16_LENGTH(c32) <= (destCapacity - i)) {
                U16_APPEND_UNSAFE(dest, i, c32);
            } else {
                /* does not fit (or preflighting): only count the units */
                i += U16_LENGTH(c32);
            }
            segment = src;
        } else {
            ++src;
        }
    }
    /* flush the trailing literal text */
    if (src != segment) {
        if (dest != nullptr) {
            _appendUChars(dest + i, destCapacity - i,
                          segment, (int32_t)(src - segment));
        }
        i += (int32_t)(src - segment);
    }
    if (dest != nullptr && i < destCapacity) {
        dest[i] = 0;
    }
    return i;

 err:
    /* invalid escape: hand back an empty string */
    if (dest != nullptr && destCapacity > 0) {
        *dest = 0;
    }
    return 0;
}
/* NUL-termination of strings ----------------------------------------------- */
/**
 * NUL-terminate a string no matter what its type.
 * Set warning and error codes accordingly.
 *
 * Does nothing if pErrorCode is nullptr or already holds a failure, or if
 * length is negative. Otherwise:
 *   length <  destCapacity: writes the NUL and clears a pending
 *                           U_STRING_NOT_TERMINATED_WARNING
 *   length == destCapacity: sets U_STRING_NOT_TERMINATED_WARNING
 *   length >  destCapacity: sets U_BUFFER_OVERFLOW_ERROR
 *
 * Every continuation backslash must be the last character on its line.
 */
#define __TERMINATE_STRING(dest, destCapacity, length, pErrorCode) UPRV_BLOCK_MACRO_BEGIN { \
    if(pErrorCode!=nullptr && U_SUCCESS(*pErrorCode)) { \
        /* not a public function, so no complete argument checking */ \
        \
        if(length<0) { \
            /* assume that the caller handles this */ \
        } else if(length<destCapacity) { \
            /* NUL-terminate the string, the NUL fits */ \
            dest[length]=0; \
            /* unset the not-terminated warning but leave all others */ \
            if(*pErrorCode==U_STRING_NOT_TERMINATED_WARNING) { \
                *pErrorCode=U_ZERO_ERROR; \
            } \
        } else if(length==destCapacity) { \
            /* unable to NUL-terminate, but the string itself fit - set a warning code */ \
            *pErrorCode=U_STRING_NOT_TERMINATED_WARNING; \
        } else /* length>destCapacity */ { \
            /* even the string itself did not fit - set an error code */ \
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR; \
        } \
    } \
} UPRV_BLOCK_MACRO_END
/*
 * Uppercase a single code unit using ASCII rules only:
 * u'a'..u'z' map to u'A'..u'Z', every other value is returned unchanged.
 */
U_CAPI char16_t U_EXPORT2
u_asciiToUpper(char16_t c) {
    if (c < u'a' || c > u'z') {
        return c;   /* not an ASCII lowercase letter */
    }
    return (char16_t)(c - (u'a' - u'A'));
}
// Compute the hash code for a string -------------------------------------- ***
// Moved here from uhash.c so that UnicodeString::hashCode() does not depend // on UHashtable code.
/* Compute the hash by iterating sparsely over about 32 (up to 63) characters spaced evenly through the string. For each character, multiply the previous hash value by a prime number and add the new character in, like a linear congruential random number generator, producing a pseudorandom deterministic value well distributed over the output range. [LIU]
*/
/*
 * Used by UnicodeString to compute its hashcode - Not public API.
 *
 * Hashes the char16_t array str of the given length.
 * The whole body comes from the STRING_HASH macro, which is defined outside
 * this block; the last argument (*p) is the expression the macro uses to
 * read one unit.
 * NOTE(review): this function has no return statement of its own, so
 * STRING_HASH presumably expands to one - confirm against the macro.
 */
U_CAPI int32_t U_EXPORT2
ustr_hashUCharsN(const char16_t *str, int32_t length) {
STRING_HASH(char16_t, str, length, *p);
}
/*
 * Die Informationen auf dieser Webseite wurden
 * nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
 * noch Qualität der bereitgestellten Informationen zugesichert.
 * Bemerkung:
 * Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.
 */