// Bit layouts of the units stored in the edits array, and the limits/masks
// used to encode and decode them.

// 0000uuuuuuuuuuuu records u+1 unchanged text units.
constexpr int32_t MAX_UNCHANGED_LENGTH = 0x1000;
constexpr int32_t MAX_UNCHANGED = MAX_UNCHANGED_LENGTH - 1;

// 0mmmnnnccccccccc with m=1..6 records ccc+1 replacements of m:n text units.
constexpr int32_t MAX_SHORT_CHANGE_OLD_LENGTH = 6;
constexpr int32_t MAX_SHORT_CHANGE_NEW_LENGTH = 7;
constexpr int32_t SHORT_CHANGE_NUM_MASK = 0x1ff;
constexpr int32_t MAX_SHORT_CHANGE = 0x6fff;

// 0111mmmmmmnnnnnn records a replacement of m text units with n.
// m or n = 61: actual length follows in the next edits array unit.
// m or n = 62..63: actual length follows in the next two edits array units.
// Bit 30 of the actual length is in the head unit.
// Trailing units have bit 15 set.
constexpr int32_t LENGTH_IN_1TRAIL = 61;
constexpr int32_t LENGTH_IN_2TRAIL = 62;
/**
 * Moves the edits array into a larger heap allocation, preserving the
 * first `length` units.
 *
 * Capacity policy: 2000 units when the array is still the built-in
 * stackArray; otherwise double the current capacity, saturating at INT32_MAX.
 *
 * @return true if the array was grown. false if it could not be grown; in
 *         that case errorCode_ is set to U_INDEX_OUTOFBOUNDS_ERROR (capacity
 *         limit reached) or U_MEMORY_ALLOCATION_ERROR (allocation failed),
 *         and array/capacity are left untouched.
 */
UBool Edits::growArray() {
    int32_t newCapacity;
    if (array == stackArray) {
        newCapacity = 2000;
    } else if (capacity == INT32_MAX) {
        // Not U_BUFFER_OVERFLOW_ERROR because that could be confused on a string transform API
        // with a result-string-buffer overflow.
        errorCode_ = U_INDEX_OUTOFBOUNDS_ERROR;
        return false;
    } else if (capacity >= (INT32_MAX / 2)) {
        newCapacity = INT32_MAX;
    } else {
        newCapacity = 2 * capacity;
    }
    // Grow by at least 5 units so that a maximal change record will fit.
    if ((newCapacity - capacity) < 5) {
        errorCode_ = U_INDEX_OUTOFBOUNDS_ERROR;
        return false;
    }
    uint16_t *newArray = static_cast<uint16_t *>(
        uprv_malloc(static_cast<size_t>(newCapacity) * sizeof(uint16_t)));
    if (newArray == nullptr) {
        errorCode_ = U_MEMORY_ALLOCATION_ERROR;
        return false;
    }
    // Copy only the units in use, then let go of the old storage.
    uprv_memcpy(newArray, array, static_cast<size_t>(length) * sizeof(uint16_t));
    releaseArray();
    array = newArray;
    capacity = newCapacity;
    return true;
}
/**
 * Propagates this object's stored error code to the caller's error code.
 *
 * @param outErrorCode In/out error code. If it already indicates a failure it
 *        is left as is; otherwise it receives errorCode_ when errorCode_ is
 *        a failure.
 * @return true if outErrorCode indicates a failure on return (either it did
 *         on entry, or errorCode_ was copied into it); false if neither
 *         outErrorCode nor errorCode_ is a failure.
 */
UBool Edits::copyErrorTo(UErrorCode &outErrorCode) const {
    if (U_FAILURE(outErrorCode)) {
        // Do not overwrite an earlier failure.
        return true;
    }
    if (U_SUCCESS(errorCode_)) {
        return false;
    }
    outErrorCode = errorCode_;
    return true;
}
/**
 * Merges two sequences of edits (a->b followed by b->c) into a->c edits and
 * appends the result to this object via addUnchanged()/addReplace().
 *
 * @param ab Edits from string a to intermediate string b.
 * @param bc Edits from intermediate string b to string c.
 * @param errorCode In/out error code. Nothing is done if it, or this object's
 *        stored error, already indicates a failure. Set to
 *        U_ILLEGAL_ARGUMENT_ERROR when the output length of ab and the input
 *        length of bc do not match (unless this object's own stored error is
 *        copied out instead).
 * @return *this
 */
Edits &Edits::mergeAndAppend(const Edits &ab, const Edits &bc, UErrorCode &errorCode) {
    if (copyErrorTo(errorCode)) { return *this; }
    // Picture string a --(Edits ab)--> string b --(Edits bc)--> string c.
    // Parallel iteration over both Edits.
    Iterator abIter = ab.getFineIterator();
    Iterator bcIter = bc.getFineIterator();
    UBool abHasNext = true, bcHasNext = true;
    // Copy iterator state into local variables, so that we can modify and subdivide spans.
    // ab old & new length, bc old & new length
    int32_t aLength = 0, ab_bLength = 0, bc_bLength = 0, cLength = 0;
    // When we have different-intermediate-length changes, we accumulate a larger change.
    int32_t pending_aLength = 0, pending_cLength = 0;
    for (;;) {
        // At this point, for each of the two iterators:
        // Either we are done with the locally cached current edit,
        // and its intermediate-string length has been reset,
        // or we will continue to work with a truncated remainder of this edit.
        //
        // If the current edit is done, and the iterator has not yet reached the end,
        // then we fetch the next edit. This is true for at least one of the iterators.
        //
        // Normally it does not matter whether we fetch from ab and then bc or vice versa.
        // However, the result is observably different when
        // ab deletions meet bc insertions at the same intermediate-string index.
        // Some users expect the bc insertions to come first, so we fetch from bc first.
        if (bc_bLength == 0) {
            if (bcHasNext && (bcHasNext = bcIter.next(errorCode)) != 0) {
                bc_bLength = bcIter.oldLength();
                cLength = bcIter.newLength();
                if (bc_bLength == 0) {
                    // insertion
                    if (ab_bLength == 0 || !abIter.hasChange()) {
                        addReplace(pending_aLength, pending_cLength + cLength);
                        pending_aLength = pending_cLength = 0;
                    } else {
                        pending_cLength += cLength;
                    }
                    continue;
                }
            }
            // else see if the other iterator is done, too.
        }
        if (ab_bLength == 0) {
            if (abHasNext && (abHasNext = abIter.next(errorCode)) != 0) {
                aLength = abIter.oldLength();
                ab_bLength = abIter.newLength();
                if (ab_bLength == 0) {
                    // deletion
                    if (bc_bLength == bcIter.oldLength() || !bcIter.hasChange()) {
                        addReplace(pending_aLength + aLength, pending_cLength);
                        pending_aLength = pending_cLength = 0;
                    } else {
                        pending_aLength += aLength;
                    }
                    continue;
                }
            } else if (bc_bLength == 0) {
                // Both iterators are done at the same time:
                // The intermediate-string lengths match.
                break;
            } else {
                // The ab output string is shorter than the bc input string.
                if (!copyErrorTo(errorCode)) {
                    errorCode = U_ILLEGAL_ARGUMENT_ERROR;
                }
                return *this;
            }
        }
        if (bc_bLength == 0) {
            // The bc input string is shorter than the ab output string.
            if (!copyErrorTo(errorCode)) {
                errorCode = U_ILLEGAL_ARGUMENT_ERROR;
            }
            return *this;
        }
        // Done fetching: ab_bLength > 0 && bc_bLength > 0

        // The current state has two parts:
        // - Past: We accumulate a longer ac edit in the "pending" variables.
        // - Current: We have copies of the current ab/bc edits in local variables.
        //   At least one side is newly fetched.
        //   One side might be a truncated remainder of an edit we fetched earlier.

        if (!abIter.hasChange() && !bcIter.hasChange()) {
            // An unchanged span all the way from string a to string c.
            if (pending_aLength != 0 || pending_cLength != 0) {
                addReplace(pending_aLength, pending_cLength);
                pending_aLength = pending_cLength = 0;
            }
            int32_t unchangedLength = aLength <= cLength ? aLength : cLength;
            addUnchanged(unchangedLength);
            ab_bLength = aLength -= unchangedLength;
            bc_bLength = cLength -= unchangedLength;
            // At least one of the unchanged spans is now empty.
            continue;
        }
        if (!abIter.hasChange() && bcIter.hasChange()) {
            // Unchanged a->b but changed b->c.
            if (ab_bLength >= bc_bLength) {
                // Split the longer unchanged span into change + remainder.
                addReplace(pending_aLength + bc_bLength, pending_cLength + cLength);
                pending_aLength = pending_cLength = 0;
                aLength = ab_bLength -= bc_bLength;
                bc_bLength = 0;
                continue;
            }
            // Handle the shorter unchanged span below like a change.
        } else if (abIter.hasChange() && !bcIter.hasChange()) {
            // Changed a->b and then unchanged b->c.
            if (ab_bLength <= bc_bLength) {
                // Split the longer unchanged span into change + remainder.
                addReplace(pending_aLength + aLength, pending_cLength + ab_bLength);
                pending_aLength = pending_cLength = 0;
                cLength = bc_bLength -= ab_bLength;
                ab_bLength = 0;
                continue;
            }
            // Handle the shorter unchanged span below like a change.
        } else {  // both abIter.hasChange() && bcIter.hasChange()
            if (ab_bLength == bc_bLength) {
                // Changes on both sides up to the same position. Emit & reset.
                addReplace(pending_aLength + aLength, pending_cLength + cLength);
                pending_aLength = pending_cLength = 0;
                ab_bLength = bc_bLength = 0;
                continue;
            }
        }
        // Accumulate the a->c change, reset the shorter side,
        // keep a remainder of the longer one.
        pending_aLength += aLength;
        pending_cLength += cLength;
        if (ab_bLength < bc_bLength) {
            bc_bLength -= ab_bLength;
            cLength = ab_bLength = 0;
        } else {  // ab_bLength > bc_bLength
            ab_bLength -= bc_bLength;
            aLength = bc_bLength = 0;
        }
    }
    // Flush whatever change is still being accumulated.
    if (pending_aLength != 0 || pending_cLength != 0) {
        addReplace(pending_aLength, pending_cLength);
    }
    copyErrorTo(errorCode);
    return *this;
}
/**
 * Puts the iterator into its "no current span" state: direction cleared,
 * not a change, both span lengths zero.
 *
 * @return always false, so that callers can write `return noNext();`.
 */
UBool Edits::Iterator::noNext() {
    // No change before or beyond the string.
    dir = 0;
    changed = false;
    oldLength_ = newLength_ = 0;
    return false;
}
/**
 * Advances the iterator to the next span.
 *
 * @param onlyChanges If true, a run of unchanged units is skipped over (its
 *        lengths are still applied to the string indexes) and the following
 *        change becomes the current span.
 * @param errorCode In/out error code; nothing is done if it already
 *        indicates a failure.
 * @return true if there is a current span after the call; false on a
 *         pre-existing error or when the end of the edits array is reached.
 */
UBool Edits::Iterator::next(UBool onlyChanges, UErrorCode &errorCode) {
    // Forward iteration: Update the string indexes to the limit of the current span,
    // and post-increment-read array units to assemble a new span.
    // Leaves the array index one after the last unit of that span.
    if (U_FAILURE(errorCode)) { return false; }
    // We have an errorCode in case we need to start guarding against integer overflows.
    // It is also convenient for caller loops if we bail out when an error was set elsewhere.
    if (dir > 0) {
        updateNextIndexes();
    } else {
        if (dir < 0) {
            // Turn around from previous() to next().
            // Post-increment-read the same span again.
            if (remaining > 0) {
                // Fine-grained iterator:
                // Stay on the current one of a sequence of compressed changes.
                ++index;  // next() rests on the index after the sequence unit.
                dir = 1;
                return true;
            }
        }
        dir = 1;
    }
    if (remaining >= 1) {
        // Fine-grained iterator: Continue a sequence of compressed changes.
        if (remaining > 1) {
            --remaining;
            return true;
        }
        remaining = 0;
    }
    if (index >= length) {
        return noNext();
    }
    int32_t u = array[index++];
    if (u <= MAX_UNCHANGED) {
        // Combine adjacent unchanged ranges.
        changed = false;
        oldLength_ = u + 1;
        while (index < length && (u = array[index]) <= MAX_UNCHANGED) {
            ++index;
            oldLength_ += u + 1;
        }
        newLength_ = oldLength_;
        if (onlyChanges) {
            updateNextIndexes();
            if (index >= length) {
                return noNext();
            }
            // already fetched u > MAX_UNCHANGED at index
            ++index;
        } else {
            return true;
        }
    }
    changed = true;
    if (u <= MAX_SHORT_CHANGE) {
        int32_t oldLen = u >> 12;
        int32_t newLen = (u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH;
        int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1;
        if (coarse) {
            oldLength_ = num * oldLen;
            newLength_ = num * newLen;
        } else {
            // Split a sequence of changes that was compressed into one unit.
            oldLength_ = oldLen;
            newLength_ = newLen;
            if (num > 1) {
                remaining = num;  // This is the first of two or more changes.
            }
            return true;
        }
    } else {
        U_ASSERT(u <= 0x7fff);
        oldLength_ = readLength((u >> 6) & 0x3f);
        newLength_ = readLength(u & 0x3f);
        if (!coarse) {
            return true;
        }
    }
    // Combine adjacent changes.
    while (index < length && (u = array[index]) > MAX_UNCHANGED) {
        ++index;
        if (u <= MAX_SHORT_CHANGE) {
            int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1;
            oldLength_ += (u >> 12) * num;
            newLength_ += ((u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH) * num;
        } else {
            U_ASSERT(u <= 0x7fff);
            oldLength_ += readLength((u >> 6) & 0x3f);
            newLength_ += readLength(u & 0x3f);
        }
    }
    return true;
}
/**
 * Moves the iterator back to the previous span.
 *
 * Unlike next(), this does not take an onlyChanges flag: a run of unchanged
 * units is always returned as a span.
 *
 * @param errorCode In/out error code; nothing is done if it already
 *        indicates a failure.
 * @return true if there is a current span after the call; false on a
 *         pre-existing error or when the start of the edits array is reached.
 */
UBool Edits::Iterator::previous(UErrorCode &errorCode) {
    // Backward iteration: Pre-decrement-read array units to assemble a new span,
    // then update the string indexes to the start of that span.
    // Leaves the array index on the head unit of that span.
    if (U_FAILURE(errorCode)) { return false; }
    // We have an errorCode in case we need to start guarding against integer overflows.
    // It is also convenient for caller loops if we bail out when an error was set elsewhere.
    if (dir >= 0) {
        if (dir > 0) {
            // Turn around from next() to previous().
            // Set the string indexes to the span limit and
            // pre-decrement-read the same span again.
            if (remaining > 0) {
                // Fine-grained iterator:
                // Stay on the current one of a sequence of compressed changes.
                --index;  // previous() rests on the sequence unit.
                dir = -1;
                return true;
            }
            updateNextIndexes();
        }
        dir = -1;
    }
    if (remaining > 0) {
        // Fine-grained iterator: Continue a sequence of compressed changes.
        int32_t u = array[index];
        U_ASSERT(MAX_UNCHANGED < u && u <= MAX_SHORT_CHANGE);
        if (remaining <= (u & SHORT_CHANGE_NUM_MASK)) {
            ++remaining;
            updatePreviousIndexes();
            return true;
        }
        remaining = 0;
    }
    if (index <= 0) {
        return noNext();
    }
    int32_t u = array[--index];
    if (u <= MAX_UNCHANGED) {
        // Combine adjacent unchanged ranges.
        changed = false;
        oldLength_ = u + 1;
        while (index > 0 && (u = array[index - 1]) <= MAX_UNCHANGED) {
            --index;
            oldLength_ += u + 1;
        }
        newLength_ = oldLength_;
        // No need to handle onlyChanges as long as previous() is called only from findIndex().
        updatePreviousIndexes();
        return true;
    }
    changed = true;
    if (u <= MAX_SHORT_CHANGE) {
        int32_t oldLen = u >> 12;
        int32_t newLen = (u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH;
        int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1;
        if (coarse) {
            oldLength_ = num * oldLen;
            newLength_ = num * newLen;
        } else {
            // Split a sequence of changes that was compressed into one unit.
            oldLength_ = oldLen;
            newLength_ = newLen;
            if (num > 1) {
                remaining = 1;  // This is the last of two or more changes.
            }
            updatePreviousIndexes();
            return true;
        }
    } else {
        if (u <= 0x7fff) {
            // The change is encoded in u alone.
            oldLength_ = readLength((u >> 6) & 0x3f);
            newLength_ = readLength(u & 0x3f);
        } else {
            // Back up to the head of the change, read the lengths,
            // and reset the index to the head again.
            U_ASSERT(index > 0);
            while ((u = array[--index]) > 0x7fff) {}
            U_ASSERT(u > MAX_SHORT_CHANGE);
            int32_t headIndex = index++;
            oldLength_ = readLength((u >> 6) & 0x3f);
            newLength_ = readLength(u & 0x3f);
            index = headIndex;
        }
        if (!coarse) {
            updatePreviousIndexes();
            return true;
        }
    }
    // Combine adjacent changes.
    while (index > 0 && (u = array[index - 1]) > MAX_UNCHANGED) {
        --index;
        if (u <= MAX_SHORT_CHANGE) {
            int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1;
            oldLength_ += (u >> 12) * num;
            newLength_ += ((u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH) * num;
        } else if (u <= 0x7fff) {
            // Read the lengths, and reset the index to the head again.
            int32_t headIndex = index++;
            oldLength_ += readLength((u >> 6) & 0x3f);
            newLength_ += readLength(u & 0x3f);
            index = headIndex;
        }
        // Units above 0x7fff are passed over here without adding lengths;
        // only units <= 0x7fff contribute.
    }
    updatePreviousIndexes();
    return true;
}
int32_t Edits::Iterator::findIndex(int32_t i, UBool findSource, UErrorCode &errorCode) { if (U_FAILURE(errorCode) || i < 0) { return -1; }
int32_t spanStart, spanLength; if (findSource) { // find source index
spanStart = srcIndex;
spanLength = oldLength_;
} else { // find destination index
spanStart = destIndex;
spanLength = newLength_;
} if (i < spanStart) { if (i >= (spanStart / 2)) { // Search backwards. for (;;) {
UBool hasPrevious = previous(errorCode);
U_ASSERT(hasPrevious); // because i>=0 and the first span starts at 0
(void)hasPrevious; // avoid unused-variable warning
spanStart = findSource ? srcIndex : destIndex; if (i >= spanStart) { // The index is in the current span. return 0;
} if (remaining > 0) { // Is the index in one of the remaining compressed edits? // spanStart is the start of the current span, first of the remaining ones.
spanLength = findSource ? oldLength_ : newLength_;
int32_t u = array[index];
U_ASSERT(MAX_UNCHANGED < u && u <= MAX_SHORT_CHANGE);
int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1 - remaining;
int32_t len = num * spanLength; if (i >= (spanStart - len)) {
int32_t n = ((spanStart - i - 1) / spanLength) + 1; // 1 <= n <= num
srcIndex -= n * oldLength_;
replIndex -= n * newLength_;
destIndex -= n * newLength_;
remaining += n; return 0;
} // Skip all of these edits at once.
srcIndex -= num * oldLength_;
replIndex -= num * newLength_;
destIndex -= num * newLength_;
remaining = 0;
}
}
} // Reset the iterator to the start.
dir = 0;
index = remaining = oldLength_ = newLength_ = srcIndex = replIndex = destIndex = 0;
} elseif (i < (spanStart + spanLength)) { // The index is in the current span. return 0;
} while (next(false, errorCode)) { if (findSource) {
spanStart = srcIndex;
spanLength = oldLength_;
} else {
spanStart = destIndex;
spanLength = newLength_;
} if (i < (spanStart + spanLength)) { // The index is in the current span. return 0;
} if (remaining > 1) { // Is the index in one of the remaining compressed edits? // spanStart is the start of the current span, first of the remaining ones.
int32_t len = remaining * spanLength; if (i < (spanStart + len)) {
int32_t n = (i - spanStart) / spanLength; // 1 <= n <= remaining - 1
srcIndex += n * oldLength_;
replIndex += n * newLength_;
destIndex += n * newLength_;
remaining -= n; return 0;
} // Make next() skip all of these edits at once.
oldLength_ *= remaining;
newLength_ *= remaining;
remaining = 0;
}
} return 1;
}
int32_t Edits::Iterator::destinationIndexFromSourceIndex(int32_t i, UErrorCode &errorCode) {
int32_t where = findIndex(i, true, errorCode); if (where < 0) { // Error or before the string. return 0;
} if (where > 0 || i == srcIndex) { // At or after string length, or at start of the found span. return destIndex;
} if (changed) { // In a change span, map to its end. return destIndex + newLength_;
} else { // In an unchanged span, offset 1:1 within it. return destIndex + (i - srcIndex);
}
}
int32_t Edits::Iterator::sourceIndexFromDestinationIndex(int32_t i, UErrorCode &errorCode) {
int32_t where = findIndex(i, false, errorCode); if (where < 0) { // Error or before the string. return 0;
} if (where > 0 || i == destIndex) { // At or after string length, or at start of the found span. return srcIndex;
} if (changed) { // In a change span, map to its end. return srcIndex + oldLength_;
} else { // In an unchanged span, offset within it. return srcIndex + (i - destIndex);
}
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.