/* UCharIterator implementation for simple strings -------------------------- */
/*
 * This is an implementation of a code unit (char16_t) iterator
 * for char16_t * strings.
 *
 * The UCharIterator.context field holds a pointer to the string.
 */
/**
 * Index query for the char16_t string iterator.
 *
 * @param iter   iterator whose start/index/limit/length fields are read
 * @param origin selects which of the stored indexes is reported
 * @return 0 for UITER_ZERO, the corresponding iterator field for
 *         UITER_START, UITER_CURRENT, UITER_LIMIT and UITER_LENGTH,
 *         or -1 when origin is none of these values
 */
static int32_t U_CALLCONV
stringIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) UPRV_NO_SANITIZE_UNDEFINED {
    int32_t result;

    switch(origin) {
    case UITER_ZERO:
        result=0;
        break;
    case UITER_START:
        result=iter->start;
        break;
    case UITER_CURRENT:
        result=iter->index;
        break;
    case UITER_LIMIT:
        result=iter->limit;
        break;
    case UITER_LENGTH:
        result=iter->length;
        break;
    default:
        /* not a valid origin; should never get here */
        result=-1;
        break;
    }
    return result;
}
/* UCharIterator implementation for UTF-16BE strings ------------------------ */
/*
 * This is an implementation of a code unit (char16_t) iterator
 * for UTF-16BE strings, i.e., strings in byte-vectors where
 * each char16_t is stored as a big-endian pair of bytes.
 *
 * The UCharIterator.context field holds a pointer to the string.
 * Everything works just like with a normal char16_t iterator (uiter_setString),
 * except that UChars are assembled from byte pairs.
 */
/* * Count the number of UChars in a UTF-16BE string before a terminating char16_t NUL, * i.e., before a pair of 0 bytes where the first 0 byte is at an even * offset from s.
*/ static int32_t
utf16BE_strlen(constchar *s) { if(IS_POINTER_EVEN(s)) { /* * even-aligned, call u_strlen(s) * we are probably on a little-endian machine, but searching for char16_t NUL * does not care about endianness
*/ return u_strlen((const char16_t *)s);
} else { /* odd-aligned, search for pair of 0 bytes */ constchar *p=s;
/* UCharIterator wrapper around CharacterIterator --------------------------- */
/*
 * This is wrapper code around a C++ CharacterIterator to
 * look like a C UCharIterator.
 *
 * The UCharIterator.context field holds a pointer to the CharacterIterator.
 */
/**
 * Index query for the CharacterIterator wrapper.
 *
 * The iterator's context pointer is treated as the wrapped
 * CharacterIterator, and the request is forwarded to it.
 *
 * @param iter   iterator whose context holds the CharacterIterator
 * @param origin selects which index is reported
 * @return 0 for UITER_ZERO; startIndex(), getIndex(), endIndex() or
 *         getLength() of the wrapped iterator for UITER_START,
 *         UITER_CURRENT, UITER_LIMIT and UITER_LENGTH respectively;
 *         -1 when origin is none of these values
 */
static int32_t U_CALLCONV
characterIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) UPRV_NO_SANITIZE_UNDEFINED {
    /* only the pointer is fetched here; it is dereferenced in the cases that need it */
    CharacterIterator *wrapped=(CharacterIterator *)(iter->context);
    int32_t result;

    switch(origin) {
    case UITER_ZERO:
        result=0;
        break;
    case UITER_START:
        result=wrapped->startIndex();
        break;
    case UITER_CURRENT:
        result=wrapped->getIndex();
        break;
    case UITER_LIMIT:
        result=wrapped->endIndex();
        break;
    case UITER_LENGTH:
        result=wrapped->getLength();
        break;
    default:
        /* not a valid origin; should never get here */
        result=-1;
        break;
    }
    return result;
}
/**
 * Repositions the wrapped CharacterIterator.
 *
 * UITER_START, UITER_CURRENT and UITER_LIMIT are handed to
 * CharacterIterator::move() with origin cast to CharacterIterator::EOrigin.
 * UITER_ZERO and UITER_LENGTH are handled with setIndex() instead,
 * using delta resp. getLength()+delta as the target index.
 *
 * @param iter   iterator whose context holds the CharacterIterator
 * @param delta  offset relative to origin
 * @param origin reference point for delta
 * @return the value returned by move(), or getIndex() after setIndex();
 *         -1 when origin is none of the handled values
 */
static int32_t U_CALLCONV
characterIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) UPRV_NO_SANITIZE_UNDEFINED {
    CharacterIterator *wrapped=(CharacterIterator *)(iter->context);

    switch(origin) {
    case UITER_START:
    case UITER_CURRENT:
    case UITER_LIMIT:
        return wrapped->move(delta, (CharacterIterator::EOrigin)origin);
    case UITER_ZERO:
        wrapped->setIndex(delta);
        break;
    case UITER_LENGTH:
        wrapped->setIndex(wrapped->getLength()+delta);
        break;
    default:
        /* not a valid origin; should never get here */
        return -1;
    }

    /* setIndex() was used: report where the wrapped iterator ended up */
    return wrapped->getIndex();
}
/* UCharIterator wrapper around Replaceable --------------------------------- */
/*
 * This is an implementation of a code unit (char16_t) iterator
 * based on a Replaceable object.
 *
 * The UCharIterator.context field holds a pointer to the Replaceable.
 * UCharIterator.length and UCharIterator.index hold Replaceable.length()
 * and the iteration index.
 */
/* UCharIterator implementation for UTF-8 strings --------------------------- */
/*
 * Possible, probably necessary only for an implementation for arbitrary
 * converters:
 * Maintain a buffer (ring buffer?) for a piece of converted 16-bit text.
 * This would require turning reservedFn into a close function and
 * introducing a uiter_close(iter).
 */
#define UITER_CNV_CAPACITY 16
/*
 * Minimal implementation:
 * Maintain a single-char16_t buffer for an additional surrogate.
 * The caller must not modify start and limit because they are used internally.
 *
 * Use UCharIterator fields as follows:
 *   context        pointer to UTF-8 string
 *   length         UTF-16 length of the string; -1 until lazy evaluation
 *   start          current UTF-8 index
 *   index          current UTF-16 index; may be -1="unknown" after setState()
 *   limit          UTF-8 length of the string
 *   reservedField  supplementary code point
 *
 * Since UCharIterator delivers 16-bit code units, the iteration can be
 * currently in the middle of the byte sequence for a supplementary code point.
 * In this case, reservedField will contain that code point and start will
 * point to after the corresponding byte sequence. The UTF-16 index will be
 * one less than what it would otherwise be corresponding to the UTF-8 index.
 * Otherwise, reservedField will be 0.
 */
/*
 * Possible optimization for NUL-terminated UTF-8 and UTF-16 strings:
 * Add implementations that do not call strlen() for iteration but check for NUL.
 */
/**
 * Index query for the UTF-8 iterator; all results are UTF-16 indexes.
 *
 * iter->index and iter->length are computed lazily: a negative value means
 * "unknown", in which case the UTF-8 text is decoded to count UTF-16 code
 * units and the result is stored back into the iterator.
 *
 * @param iter   iterator; context is read as the UTF-8 bytes, start/limit as
 *               UTF-8 offsets, index/length as UTF-16 values, and
 *               reservedField!=0 as "in the middle of a supplementary
 *               code point"
 * @param origin selects which index is reported
 * @return 0 for UITER_ZERO and UITER_START, the (possibly freshly computed)
 *         UTF-16 index for UITER_CURRENT, the (possibly freshly computed)
 *         UTF-16 length for UITER_LIMIT and UITER_LENGTH, and -1 when origin
 *         is none of these values
 */
static int32_t U_CALLCONV
utf8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) UPRV_NO_SANITIZE_UNDEFINED {
    switch(origin) {
    case UITER_ZERO:
    case UITER_START:
        return 0;

    case UITER_CURRENT: {
        if(iter->index>=0) {
            /* UTF-16 index already known */
            return iter->index;
        }

        /* the current UTF-16 index is unknown after setState(): count from the beginning */
        const uint8_t *text=(const uint8_t *)iter->context;
        int32_t byteLimit=iter->start; /* count up to the UTF-8 index */
        int32_t bytePos=0;
        int32_t units=0;
        UChar32 cp;

        while(bytePos<byteLimit) {
            U8_NEXT_OR_FFFD(text, bytePos, byteLimit, cp);
            units+=U16_LENGTH(cp);
        }

        /* just in case setState() did not get us to a code point boundary */
        iter->start=bytePos;
        if(bytePos==iter->limit) {
            /* we counted the whole string: fix up the length if it was <0 or wrong */
            iter->length=units;
        }
        /* in the middle of a supplementary code point the index is one less */
        iter->index= iter->reservedField!=0 ? units-1 : units;
        return iter->index;
    }

    case UITER_LIMIT:
    case UITER_LENGTH: {
        if(iter->length>=0) {
            /* UTF-16 length already known */
            return iter->length;
        }

        const uint8_t *text=(const uint8_t *)iter->context;
        int32_t bytePos, byteLimit, units;
        UChar32 cp;

        if(iter->index>=0) {
            /* resume counting from the known current position */
            bytePos=iter->start;
            units=iter->index;
            if(iter->reservedField!=0) {
                /* account for the pending second half of the supplementary code point */
                ++units;
            }
        } else {
            /*
             * the current UTF-16 index is unknown after setState():
             * first count from the beginning to the current position
             */
            bytePos=0;
            units=0;
            byteLimit=iter->start;
            while(bytePos<byteLimit) {
                U8_NEXT_OR_FFFD(text, bytePos, byteLimit, cp);
                units+=U16_LENGTH(cp);
            }

            /* just in case setState() did not get us to a code point boundary */
            iter->start=bytePos;
            /* now the UTF-16 index is known as well */
            if(iter->reservedField!=0) {
                iter->index=units-1;
            } else {
                iter->index=units;
            }
        }

        /* count from the current position to the end of the text */
        byteLimit=iter->limit;
        while(bytePos<byteLimit) {
            U8_NEXT_OR_FFFD(text, bytePos, byteLimit, cp);
            units+=U16_LENGTH(cp);
        }
        iter->length=units;
        return iter->length;
    }

    default:
        /* not a valid origin; should never get here */
        return -1;
    }
}
/* calculate the requested UTF-16 index */ switch(origin) { case UITER_ZERO: case UITER_START:
pos=delta;
havePos=true; /* iter->index<0 (unknown) is possible */ break; case UITER_CURRENT: if(iter->index>=0) {
pos=iter->index+delta;
havePos=true;
} else { /* the current UTF-16 index is unknown after setState(), use only delta */
pos=0;
havePos=false;
} break; case UITER_LIMIT: case UITER_LENGTH: if(iter->length>=0) {
pos=iter->length+delta;
havePos=true;
} else { /* pin to the end, avoid counting the length */
iter->index=-1;
iter->start=iter->limit;
iter->reservedField=0; if(delta>=0) { return UITER_UNKNOWN_INDEX;
} else { /* the current UTF-16 index is unknown, use only delta */
pos=0;
havePos=false;
}
} break; default: return -1; /* Error */
}
if(havePos) { /* shortcuts: pinning to the edges of the string */ if(pos<=0) {
iter->index=iter->start=iter->reservedField=0; return 0;
} elseif(iter->length>=0 && pos>=iter->length) {
iter->index=iter->length;
iter->start=iter->limit;
iter->reservedField=0; return iter->index;
}
/* minimize the number of U8_NEXT/PREV operations */ if(iter->index<0 || pos<iter->index/2) { /* go forward from the start instead of backward from the current index */
iter->index=iter->start=iter->reservedField=0;
} elseif(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) { /* * if we have the UTF-16 index and length and the new position is * closer to the end than the current index, * then go backward from the end instead of forward from the current index
*/
iter->index=iter->length;
iter->start=iter->limit;
iter->reservedField=0;
}
delta=pos-iter->index; if(delta==0) { return iter->index; /* nothing to do */
}
} else { /* move relative to unknown UTF-16 index */ if(delta==0) { return UITER_UNKNOWN_INDEX; /* nothing to do */
} elseif(-delta>=iter->start) { /* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */
iter->index=iter->start=iter->reservedField=0; return 0;
} elseif(delta>=(iter->limit-iter->start)) { /* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */
iter->index=iter->length; /* may or may not be <0 (unknown) */
iter->start=iter->limit;
iter->reservedField=0; return iter->index>=0 ? iter->index : (int32_t)UITER_UNKNOWN_INDEX;
}
}
/* delta!=0 */
/* move towards the requested position, pin to the edges of the string */
s=(const uint8_t *)iter->context;
pos=iter->index; /* could be <0 (unknown) */
i=iter->start; if(delta>0) { /* go forward */
int32_t limit=iter->limit; if(iter->reservedField!=0) {
iter->reservedField=0;
++pos;
--delta;
} while(delta>0 && i<limit) {
U8_NEXT_OR_FFFD(s, i, limit, c); if(c<=0xffff) {
++pos;
--delta;
} elseif(delta>=2) {
pos+=2;
delta-=2;
} else/* delta==1 */ { /* stop in the middle of a supplementary code point */
iter->reservedField=c;
++pos; break; /* delta=0; */
}
} if(i==limit) { if(iter->length<0 && iter->index>=0) {
iter->length= iter->reservedField==0 ? pos : pos+1;
} elseif(iter->index<0 && iter->length>=0) {
iter->index= iter->reservedField==0 ? iter->length : iter->length-1;
}
}
} else/* delta<0 */ { /* go backward */ if(iter->reservedField!=0) {
iter->reservedField=0;
i-=4; /* we stayed behind the supplementary code point; go before it now */
--pos;
++delta;
} while(delta<0 && i>0) {
U8_PREV_OR_FFFD(s, 0, i, c); if(c<=0xffff) {
--pos;
++delta;
} elseif(delta<=-2) {
pos-=2;
delta+=2;
} else/* delta==-1 */ { /* stop in the middle of a supplementary code point */
i+=4; /* back to behind this supplementary code point for consistent state */
iter->reservedField=c;
--pos; break; /* delta=0; */
}
}
}
iter->start=i; if(iter->index>=0) { return iter->index=pos;
} else { /* we started with index<0 (unknown) so pos is bogus */ if(i<=1) { return iter->index=i; /* reached the beginning */
} else { /* we still don't know the UTF-16 index */ return UITER_UNKNOWN_INDEX;
}
}
}
c=iter->current(iter); if(U16_IS_SURROGATE(c)) { if(U16_IS_SURROGATE_LEAD(c)) { /* * go to the next code unit * we know that we are not at the limit because c!=U_SENTINEL
*/
iter->move(iter, 1, UITER_CURRENT); if(U16_IS_TRAIL(c2=iter->current(iter))) {
c=U16_GET_SUPPLEMENTARY(c, c2);
}
/* undo index movement */
iter->move(iter, -1, UITER_CURRENT);
} else { if(U16_IS_LEAD(c2=iter->previous(iter))) {
c=U16_GET_SUPPLEMENTARY(c2, c);
} if(c2>=0) { /* undo index movement */
iter->move(iter, 1, UITER_CURRENT);
}
}
} return c;
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.