// Implementation object behind the opaque URegularExpression handle: the C API
// functions in this file cast the handle they receive to RegularExpression*.
struct RegularExpression: public UMemory { public:
RegularExpression();
~RegularExpression();
// Sanity marker. validateRE() rejects any object whose fMagic != REXP_MAGIC.
int32_t fMagic;
// The pattern object; its fNamedCaptureMap is consulted to resolve ${name}
// references in replacement text.
RegexPattern *fPat;
// NOTE(review): presumably a reference count shared by every handle that uses
// the same fPat -- confirm against the open/clone/close functions.
u_atomic_int32_t *fPatRefCount;
// Private copy of the pattern string, kept so it can be returned later if asked.
char16_t *fPatString;
// Length value recorded alongside fPatString when the pattern is copied.
int32_t fPatStringLen;
// fMatcher: the matcher that does the actual work; it also holds the input as a UText.
// fText:    the input as UTF-16. May be nullptr until it is filled in on demand
//           from the matcher's UText.
RegexMatcher *fMatcher; const char16_t *fText; // Text from setText()
// -1 means the length is not known yet; the tail-copy loop stores the real
// length here once it reaches the terminating NUL.
int32_t fTextLength; // Length provided by user with setText(), which may be -1.
// Set to false when fText points into storage owned by the matcher's UText,
// and to true when fText is a separately allocated copy of the input.
UBool fOwnsText;
};
// Value stored in RegularExpression::fMagic. validateRE() compares against it
// to detect pointers that do not refer to a RegularExpression object.
static const int32_t REXP_MAGIC = 0x72657870; // "rexp" in ASCII
//---------------------------------------------------------------------------------------- // // validateRE Do boilerplate style checks on API function parameters. // Return true if they look OK. //---------------------------------------------------------------------------------------- static UBool validateRE(const RegularExpression *re, UBool requiresText, UErrorCode *status) { if (U_FAILURE(*status)) { returnfalse;
} if (re == nullptr || re->fMagic != REXP_MAGIC) {
*status = U_ILLEGAL_ARGUMENT_ERROR; returnfalse;
} // !!! Not sure how to update this with the new UText backing, which is stored in re->fMatcher anyway if (requiresText && re->fText == nullptr && !re->fOwnsText) {
*status = U_REGEX_INVALID_STATE; returnfalse;
} returntrue;
}
// // Make a copy of the pattern string, so we can return it later if asked. // For compiling the pattern, we will use a UText wrapper around // this local copy, to avoid making even more copies. //
re->fPatString = patBuf;
re->fPatStringLen = patternLength;
u_memcpy(patBuf, pattern, actualPatLen);
patBuf[actualPatLen] = 0;
// // Make a copy of the pattern string, so we can return it later if asked. // For compiling the pattern, we will use a read-only UText wrapper // around this local copy, to avoid making even more copies. //
re->fPatString = patBuf;
re->fPatStringLen = pattern16Length;
utext_extract(pattern, 0, patternNativeLength, patBuf, pattern16Length+1, status);
UText input = UTEXT_INITIALIZER;
utext_openUChars(&input, text, textLength, status);
regexp->fMatcher->reset(&input);
utext_close(&input); // reset() made a shallow clone, so we don't need this copy
}
if (regexp->fText == nullptr) { // need to fill in the text
UText *inputText = regexp->fMatcher->inputText();
int64_t inputNativeLength = utext_nativeLength(inputText); if (UTEXT_FULL_TEXT_IN_CHUNK(inputText, inputNativeLength)) {
regexp->fText = inputText->chunkContents;
regexp->fTextLength = (int32_t)inputNativeLength;
regexp->fOwnsText = false; // because the UText owns it
} else {
UErrorCode lengthStatus = U_ZERO_ERROR;
regexp->fTextLength = utext_extract(inputText, 0, inputNativeLength, nullptr, 0, &lengthStatus); // buffer overflow error
char16_t *inputChars = (char16_t *)uprv_malloc(sizeof(char16_t)*(regexp->fTextLength+1));
utext_extract(inputText, 0, inputNativeLength, inputChars, regexp->fTextLength+1, status);
regexp->fText = inputChars;
regexp->fOwnsText = true; // should already be set but just in case
}
}
if (destCapacity == 0 || regexp->fText != nullptr) { // If preflighting or if we already have the text as UChars, // this is a little cheaper than extracting from the UText
// // Pick up the range of characters from the matcher //
int32_t startIx = regexp->fMatcher->start(groupNum, *status);
int32_t endIx = regexp->fMatcher->end (groupNum, *status); if (U_FAILURE(*status)) { return 0;
}
// Note: Separate error code variables for findNext() and appendReplacement() // are used so that destination buffer overflow errors // in appendReplacement won't stop findNext() from working. // appendReplacement() and appendTail() special case incoming buffer // overflow errors, continuing to return the correct length.
UErrorCode findStatus = *status; while (uregex_findNext(regexp2, &findStatus)) {
len += uregex_appendReplacement(regexp2, replacementText, replacementLength,
&destBuf, &destCapacity, status);
}
len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status);
if (U_FAILURE(findStatus)) { // If anything went wrong with the findNext(), make that error trump // whatever may have happened with the append() operations. // Errors in findNext() are not expected.
*status = findStatus;
}
U_NAMESPACE_BEGIN // // Dummy class, because these functions need to be friends of class RegexMatcher, // and stand-alone C functions don't work as friends // class RegexCImpl { public: inlinestatic int32_t appendReplacement(RegularExpression *regexp, const char16_t *replacementText,
int32_t replacementLength,
char16_t **destBuf,
int32_t *destCapacity,
UErrorCode *status);
//
//  Move a character to an output buffer, with bounds checking on the index.
//      Index advances even if capacity is exceeded, for preflight size computations.
//      This little sequence is used a LOT.
//
//  c            The code unit to store.
//  idx          In/out write position; always incremented by one.
//  buf          Destination buffer. Only written when *idx < bufCapacity, so it
//               may be nullptr when bufCapacity is 0 (pure preflight).
//  bufCapacity  Number of code units that fit in buf.
//
static inline void appendToBuf(char16_t c, int32_t *idx, char16_t *buf, int32_t bufCapacity) {
    if (*idx < bufCapacity) {
        buf[*idx] = c;
    }
    (*idx)++;
}
// If we come in with a buffer overflow error, don't suppress the operation. // A series of appendReplacements, appendTail need to correctly preflight // the buffer size when an overflow happens somewhere in the middle.
UBool pendingBufferOverflow = false; if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != nullptr && *destCapacity == 0) {
pendingBufferOverflow = true;
*status = U_ZERO_ERROR;
}
// If it wasn't supplied by the caller, get the length of the replacement text. // TODO: slightly smarter logic in the copy loop could watch for the NUL on // the fly and avoid this step. if (replacementLength == -1) {
replacementLength = u_strlen(replacementText);
}
// Copy input string from the end of previous match to start of current match if (regexp->fText != nullptr) {
int32_t matchStart;
int32_t lastMatchEnd; if (UTEXT_USES_U16(m->fInputText)) {
lastMatchEnd = static_cast<int32_t>(m->fLastMatchEnd);
matchStart = static_cast<int32_t>(m->fMatchStart);
} else { // !!!: Would like a better way to do this!
UErrorCode tempStatus = U_ZERO_ERROR;
lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, nullptr, 0, &tempStatus);
tempStatus = U_ZERO_ERROR;
matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, nullptr, 0, &tempStatus);
} for (i=lastMatchEnd; i<matchStart; i++) {
appendToBuf(regexp->fText[i], &destIdx, dest, capacity);
}
} else {
UErrorCode possibleOverflowError = U_ZERO_ERROR; // ignore
destIdx += utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart,
dest==nullptr?nullptr:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity),
&possibleOverflowError);
}
U_ASSERT(destIdx >= 0);
// scan the replacement text, looking for substitutions ($n) and \escapes.
int32_t replIdx = 0; while (replIdx < replacementLength && U_SUCCESS(*status)) {
char16_t c = replacementText[replIdx];
replIdx++; if (c != DOLLARSIGN && c != BACKSLASH) { // Common case, no substitution, no escaping, // just copy the char to the dest buf.
appendToBuf(c, &destIdx, dest, capacity); continue;
}
if (c == BACKSLASH) { // Backslash Escape. Copy the following char out without further checks. // Note: Surrogate pairs don't need any special handling // The second half wont be a '$' or a '\', and // will move to the dest normally on the next // loop iteration. if (replIdx >= replacementLength) { break;
}
c = replacementText[replIdx];
if (c==0x55/*U*/ || c==0x75/*u*/) { // We have a \udddd or \Udddddddd escape sequence.
UChar32 escapedChar =
u_unescapeAt(uregex_ucstr_unescape_charAt,
&replIdx, // Index is updated by unescapeAt
replacementLength, // Length of replacement text
(void *)replacementText);
if (escapedChar != static_cast<UChar32>(0xFFFFFFFF)) { if (escapedChar <= 0xffff) {
appendToBuf(static_cast<char16_t>(escapedChar), &destIdx, dest, capacity);
} else {
appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity);
appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity);
} continue;
} // Note: if the \u escape was invalid, just fall through and // treat it as a plain \<anything> escape.
}
// Plain backslash escape. Just put out the escaped character.
appendToBuf(c, &destIdx, dest, capacity);
replIdx++; continue;
}
// We've got a $. Pick up the following capture group name or number. // For numbers, consume only digits that produce a valid capture group for the pattern.
int32_t digitVal = u_charDigitValue(c32); if (groupNum * 10 + digitVal <= numCaptureGroups) {
groupNum = groupNum * 10 + digitVal;
U16_FWD_1(replacementText, replIdx, replacementLength);
numDigits++;
} else { if (numDigits == 0) {
*status = U_INDEX_OUTOFBOUNDS_ERROR;
} break;
}
}
} elseif (c32 == LEFTBRACKET) { // Scan for Named Capture Group, ${name}.
UnicodeString groupName;
U16_FWD_1(replacementText, replIdx, replacementLength); while (U_SUCCESS(*status) && c32 != RIGHTBRACKET) { if (replIdx >= replacementLength) {
*status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; break;
}
U16_NEXT(replacementText, replIdx, replacementLength, c32); if ((c32 >= 0x41 && c32 <= 0x5a) || // A..Z
(c32 >= 0x61 && c32 <= 0x7a) || // a..z
(c32 >= 0x31 && c32 <= 0x39)) { // 0..9
groupName.append(c32);
} elseif (c32 == RIGHTBRACKET) {
groupNum = regexp->fPat->fNamedCaptureMap ?
uhash_geti(regexp->fPat->fNamedCaptureMap, &groupName) : 0; if (groupNum == 0) { // Name not defined by pattern.
*status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
}
} else { // Character was something other than a name char or a closing '}'
*status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
}
}
} else { // $ not followed by {name} or digits.
*status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
}
// Finally, append the capture group data to the destination. if (U_SUCCESS(*status)) {
destIdx += uregex_group(reinterpret_cast<URegularExpression*>(regexp), groupNum,
dest==nullptr?nullptr:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status); if (*status == U_BUFFER_OVERFLOW_ERROR) { // Ignore buffer overflow when extracting the group. We need to // continue on to get full size of the untruncated result. We will // raise our own buffer overflow error at the end.
*status = U_ZERO_ERROR;
}
}
if (U_FAILURE(*status)) { // bad group number or name. break;
}
}
// // Nul Terminate the dest buffer if possible. // Set the appropriate buffer overflow or not terminated error, if needed. // if (destIdx < capacity) {
dest[destIdx] = 0;
} elseif (U_SUCCESS(*status)) { if (destIdx == *destCapacity) {
*status = U_STRING_NOT_TERMINATED_WARNING;
} else {
*status = U_BUFFER_OVERFLOW_ERROR;
}
}
// // Return an updated dest buffer and capacity to the caller. // if (destIdx > 0 && *destCapacity > 0) { if (destIdx < capacity) {
*destBuf += destIdx;
*destCapacity -= destIdx;
} else {
*destBuf += capacity;
*destCapacity = 0;
}
}
// If we came in with a buffer overflow, make sure we go out with one also. // (A zero length match right at the end of the previous match could // make this function succeed even though a previous call had overflowed the buf) if (pendingBufferOverflow && U_SUCCESS(*status)) {
*status = U_BUFFER_OVERFLOW_ERROR;
}
return destIdx;
}
// // appendReplacement the actual API function, //
U_CAPI int32_t U_EXPORT2
uregex_appendReplacement(URegularExpression *regexp2, const char16_t *replacementText,
int32_t replacementLength,
char16_t **destBuf,
int32_t *destCapacity,
UErrorCode *status) {
// If we come in with a buffer overflow error, don't suppress the operation. // A series of appendReplacements, appendTail need to correctly preflight // the buffer size when an overflow happens somewhere in the middle.
UBool pendingBufferOverflow = false; if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != nullptr && *destCapacity == 0) {
pendingBufferOverflow = true;
*status = U_ZERO_ERROR;
}
if (validateRE(regexp, true, status) == false) { return 0;
}
if (srcIdx == regexp->fTextLength) { break;
}
char16_t c = regexp->fText[srcIdx]; if (c == 0 && regexp->fTextLength == -1) {
regexp->fTextLength = srcIdx; break;
}
if (destIdx < destCap) {
dest[destIdx] = c;
} else { // We've overflowed the dest buffer. // If the total input string length is known, we can // compute the total buffer size needed without scanning through the string. if (regexp->fTextLength > 0) {
destIdx += (regexp->fTextLength - srcIdx); break;
}
}
srcIdx++;
destIdx++;
}
} else {
int64_t srcIdx; if (m->fMatch) { // The most recent call to find() succeeded.
srcIdx = m->fMatchEnd;
} else { // The last call to find() on this matcher failed(). // Look back to the end of the last find() that succeeded for src index.
srcIdx = m->fLastMatchEnd; if (srcIdx == -1) { // There has been no successful match with this matcher. // We want to copy the whole string.
srcIdx = 0;
}
}
// // NUL terminate the output string, if possible, otherwise issue the // appropriate error or warning. // if (destIdx < destCap) {
dest[destIdx] = 0;
} elseif (destIdx == destCap) {
*status = U_STRING_NOT_TERMINATED_WARNING;
} else {
*status = U_BUFFER_OVERFLOW_ERROR;
}
// // Update the user's buffer ptr and capacity vars to reflect the // amount used. // if (destIdx < destCap) {
*destBuf += destIdx;
*destCapacity -= destIdx;
} elseif (*destBuf != nullptr) {
*destBuf += destCap;
*destCapacity = 0;
}
if (pendingBufferOverflow && U_SUCCESS(*status)) {
*status = U_BUFFER_OVERFLOW_ERROR;
}
return destIdx;
}
//
//   uregex_appendTail    The public C API entry point. All of the work is done
//                        by RegexCImpl::appendTail(); this wrapper only converts
//                        the opaque API handle to the implementation type.
//
U_CAPI int32_t U_EXPORT2
uregex_appendTail(URegularExpression *regexp2,
                  char16_t **destBuf,
                  int32_t *destCapacity,
                  UErrorCode *status) {
    RegularExpression *impl = reinterpret_cast<RegularExpression*>(regexp2);
    const int32_t tailLength = RegexCImpl::appendTail(impl, destBuf, destCapacity, status);
    return tailLength;
}
//
//   uregex_appendTailUText    Can just use the normal C++ method, once the
//                             handle has been checked.
//
//   regexp2   The opaque regular expression handle.
//   dest      The UText that receives the tail of the input.
//   status    In/out error code.
//
//   Returns whatever RegexMatcher::appendTail() returns. If the incoming status
//   is already a failure, or regexp2 is null or not a RegularExpression (status
//   is then set to U_ILLEGAL_ARGUMENT_ERROR), dest is returned unchanged and the
//   matcher is not touched.
//   NOTE(review): this assumes RegexMatcher::appendTail() also hands back dest
//   when it is called with a failing status -- confirm against rematch.cpp.
//
U_CAPI UText * U_EXPORT2
uregex_appendTailUText(URegularExpression *regexp2,
                       UText *dest,
                       UErrorCode *status) {
    RegularExpression *regexp = reinterpret_cast<RegularExpression*>(regexp2);
    // requiresText is false: only the status and the handle itself are checked,
    // so every call that worked before still reaches the matcher.
    if (validateRE(regexp, false, status) == false) {
        return dest;
    }
    return regexp->fMatcher->appendTail(dest, *status);
}
//------------------------------------------------------------------------------
//
//     copyString     Internal utility to copy a string to an output buffer,
//                    while managing buffer overflow and preflight size
//                    computation.  NUL termination is added to destination,
//                    and the NUL is counted in the output size.
//
//     Currently compiled out by the #if 0 guard below.
//
//------------------------------------------------------------------------------
#if 0
static void copyString(char16_t        *destBuffer,    //  Destination buffer.
                       int32_t          destCapacity,  //  Total capacity of dest buffer
                       int32_t         *destIndex,     //  Index into dest buffer.  Updated on return.
                                                       //    Update not clipped to destCapacity.
                       const char16_t  *srcPtr,        //  Pointer to source string
                       int32_t          srcLen)        //  Source string len.
{
    int32_t  si;
    int32_t  di = *destIndex;
    char16_t c;

    for (si=0; si<srcLen; si++) {
        c = srcPtr[si];
        if (di < destCapacity) {
            destBuffer[di] = c;
            di++;
        } else {
            // Out of room: account for the rest of the source in one step
            // instead of scanning it, then stop copying.
            di += srcLen - si;
            break;
        }
    }
    if (di<destCapacity) {
        destBuffer[di] = 0;
    }
    // The terminating NUL is counted whether or not it fit in the buffer.
    di++;
    *destIndex = di;
}
#endif
// // Loop through the input text, searching for the delimiter pattern //
int32_t i; // Index of the field being processed.
int32_t destIdx = 0; // Next available position in destBuf;
int32_t numCaptureGroups = regexp->fMatcher->groupCount();
UErrorCode tStatus = U_ZERO_ERROR; // Want to ignore any buffer overflow errors so that the strings are still counted for (i=0; ; i++) { if (i>=destFieldsCapacity-1) { // There are one or zero output strings left. // Fill the last output string with whatever is left from the input, then exit the loop. // ( i will be == destFieldsCapacity if we filled the output array while processing // capture groups of the delimiter expression, in which case we will discard the // last capture group saved in favor of the unprocessed remainder of the // input string.) if (inputLen > nextOutputStringStart) { if (i != destFieldsCapacity-1) { // No fields are left. Recycle the last one for holding the trailing part of // the input string.
i = destFieldsCapacity-1;
destIdx = static_cast<int32_t>(destFields[i] - destFields[0]);
}
if (regexp->fMatcher->find()) { // We found another delimiter. Move everything from where we started looking // up until the start of the delimiter into the next output string.
destFields[i] = (destBuf == nullptr) ? nullptr : &destBuf[destIdx];
// If the delimiter pattern has capturing parentheses, the captured // text goes out into the next n destination strings.
int32_t groupNum; for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) { // If we've run out of output string slots, bail out. if (i==destFieldsCapacity-1) { break;
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5
¤ Dauer der Verarbeitung: 0.26 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.