// #include <malloc.h> // Needed for heapcheck testing
U_NAMESPACE_BEGIN
// Default limit for the size of the back track stack, to avoid system
//    failures caused by heap exhaustion.  Units are in 32 bit words, not bytes.
// This value puts ICU's limits higher than most other regexp implementations,
//    which use recursion rather than the heap, and take more storage per
//    backtrack point.
//
static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
// Time limit counter constant.
//   Time limits for expression evaluation are in terms of quanta of work by
//   the engine, each of which is 10,000 state saves.
//   This constant is the number of state saves that make up one such tick.
static const int32_t TIMER_INITIAL_VALUE = 10000;
// Test for any of the Unicode line terminating characters. staticinline UBool isLineTerminator(UChar32 c) { if (c & ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) { returnfalse;
} return (c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029;
}
// // init2() Common initialization for use by RegexMatcher constructors, part 2. // This handles the common setup to be done after the Pattern is available. // void RegexMatcher::init2(UText *input, UErrorCode &status) { if (U_FAILURE(status)) {
fDeferredStatus = status; return;
}
if (fPattern->fDataSize > UPRV_LENGTHOF(fSmallData)) {
fData = static_cast<int64_t*>(uprv_malloc(fPattern->fDataSize * sizeof(int64_t))); if (fData == nullptr) {
status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; return;
}
}
fStack = new UVector64(status); if (fStack == nullptr) {
status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; return;
}
if (U_SUCCESS(status)) {
appendReplacement(&resultText, &replacementText, status);
utext_close(&resultText);
}
utext_close(&replacementText);
}
return *this;
}
// // appendReplacement, UText mode //
// Append to dest the input text lying between the previous append position
// and the start of the current match, followed by the replacement text with
// its substitutions expanded:
//    \x          the character x is copied literally
//    \uhhhh, \Uhhhhhhhh   the escaped code point is copied
//    $n          replaced by the text of capture group number n
//    ${name}     replaced by the text of the named capture group
//
// On return fAppendPosition has been advanced to the end of the current match.
//
// Errors reported through status:
//    U_REGEX_INVALID_STATE                no current match (fMatch is false)
//    U_MEMORY_ALLOCATION_ERROR            temporary buffer could not be allocated
//    U_REGEX_INVALID_CAPTURE_GROUP_NAME   bad or unknown ${name}, or a '$' not
//                                         followed by a group name or number
//    U_INDEX_OUTOFBOUNDS_ERROR            $n whose first digit already exceeds
//                                         the number of capture groups
// A failure already stored in fDeferredStatus is copied to status.
//
// Always returns *this.
RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
                                              UText *replacement,
                                              UErrorCode &status) {
    if (U_FAILURE(status)) {
        return *this;
    }
    if (U_FAILURE(fDeferredStatus)) {
        status = fDeferredStatus;
        return *this;
    }
    if (fMatch == false) {
        status = U_REGEX_INVALID_STATE;
        return *this;
    }

    // Copy input string from the end of previous match to start of current match
    int64_t destLen = utext_nativeLength(dest);
    if (fMatchStart > fAppendPosition) {
        if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
            destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
                                     static_cast<int32_t>(fMatchStart - fAppendPosition), &status);
        } else {
            int32_t len16;
            if (UTEXT_USES_U16(fInputText)) {
                len16 = static_cast<int32_t>(fMatchStart - fAppendPosition);
            } else {
                // Preflight with a null buffer to get the length; its status
                // is kept local and not reported to the caller.
                UErrorCode lengthStatus = U_ZERO_ERROR;
                len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, nullptr, 0, &lengthStatus);
            }
            char16_t* inputChars = static_cast<char16_t*>(uprv_malloc(sizeof(char16_t) * (len16 + 1)));
            if (inputChars == nullptr) {
                status = U_MEMORY_ALLOCATION_ERROR;
                return *this;
            }
            utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
            destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
            uprv_free(inputChars);
        }
    }
    fAppendPosition = fMatchEnd;

    // scan the replacement text, looking for substitutions ($n) and \escapes.
    //  TODO:  optimize this loop by efficiently scanning for '$' or '\',
    //         move entire ranges not containing substitutions.
    UTEXT_SETNATIVEINDEX(replacement, 0);
    for (UChar32 c = UTEXT_NEXT32(replacement); U_SUCCESS(status) && c != U_SENTINEL; c = UTEXT_NEXT32(replacement)) {
        if (c == BACKSLASH) {
            // Backslash Escape.  Copy the following char out without further checks.
            //                    Note:  Surrogate pairs don't need any special handling
            //                           The second half won't be a '$' or a '\', and
            //                           will move to the dest normally on the next
            //                           loop iteration.
            c = UTEXT_CURRENT32(replacement);
            if (c == U_SENTINEL) {
                break;
            }

            if (c==0x55/*U*/ || c==0x75/*u*/) {
                // We have a \udddd or \Udddddddd escape sequence.
                int32_t offset = 0;
                struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
                UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
                if (escapedChar != static_cast<UChar32>(0xFFFFFFFF)) {
                    if (U_IS_BMP(escapedChar)) {
                        char16_t c16 = static_cast<char16_t>(escapedChar);
                        destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
                    } else {
                        char16_t surrogate[2];
                        surrogate[0] = U16_LEAD(escapedChar);
                        surrogate[1] = U16_TRAIL(escapedChar);
                        if (U_SUCCESS(status)) {
                            destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
                        }
                    }
                    // TODO:  Report errors for mal-formed \u escapes?
                    //        As this is, the original sequence is output, which may be OK.
                    // Re-position the replacement iterator relative to the
                    // last offset the unescape callback recorded.
                    if (context.lastOffset == offset) {
                        (void)UTEXT_PREVIOUS32(replacement);
                    } else if (context.lastOffset != offset-1) {
                        utext_moveIndex32(replacement, offset - context.lastOffset - 1);
                    }
                }
            } else {
                (void)UTEXT_NEXT32(replacement);
                // Plain backslash escape.  Just put out the escaped character.
                if (U_IS_BMP(c)) {
                    char16_t c16 = static_cast<char16_t>(c);
                    destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
                } else {
                    char16_t surrogate[2];
                    surrogate[0] = U16_LEAD(c);
                    surrogate[1] = U16_TRAIL(c);
                    if (U_SUCCESS(status)) {
                        destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
                    }
                }
            }
        } else if (c != DOLLARSIGN) {
            // Normal char, not a $.  Copy it out without further checks.
            if (U_IS_BMP(c)) {
                char16_t c16 = static_cast<char16_t>(c);
                destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
            } else {
                char16_t surrogate[2];
                surrogate[0] = U16_LEAD(c);
                surrogate[1] = U16_TRAIL(c);
                if (U_SUCCESS(status)) {
                    destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
                }
            }
        } else {
            // We've got a $.  Pick up a capture group name or number if one follows.
            // Consume digits so long as the resulting group number <= the
            // number of capture groups in the pattern.

            int32_t groupNum  = 0;
            int32_t numDigits = 0;
            UChar32 nextChar = utext_current32(replacement);
            if (nextChar == LEFTBRACKET) {
                // Scan for a Named Capture Group, ${name}.
                UnicodeString groupName;
                utext_next32(replacement);
                while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) {
                    nextChar = utext_next32(replacement);
                    if (nextChar == U_SENTINEL) {
                        status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
                    } else if ((nextChar >= 0x41 && nextChar <= 0x5a) ||       // A..Z
                               (nextChar >= 0x61 && nextChar <= 0x7a) ||       // a..z
                               (nextChar >= 0x30 && nextChar <= 0x39)) {       // 0..9
                        // The digit range starts at 0x30 ('0'); starting at
                        // 0x31 would reject names that contain a zero.
                        groupName.append(nextChar);
                    } else if (nextChar == RIGHTBRACKET) {
                        groupNum = fPattern->fNamedCaptureMap ? uhash_geti(fPattern->fNamedCaptureMap, &groupName) : 0;
                        if (groupNum == 0) {
                            status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
                        }
                    } else {
                        // Character was something other than a name char or a closing '}'
                        status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
                    }
                }
            } else if (u_isdigit(nextChar)) {
                // $n    Scan for a capture group number
                int32_t numCaptureGroups = fPattern->fGroupMap->size();
                for (;;) {
                    nextChar = UTEXT_CURRENT32(replacement);
                    if (nextChar == U_SENTINEL) {
                        break;
                    }
                    if (u_isdigit(nextChar) == false) {
                        break;
                    }
                    int32_t nextDigitVal = u_charDigitValue(nextChar);
                    if (groupNum*10 + nextDigitVal > numCaptureGroups) {
                        // Don't consume the next digit if it makes the capture group number too big.
                        if (numDigits == 0) {
                            status = U_INDEX_OUTOFBOUNDS_ERROR;
                        }
                        break;
                    }
                    (void)UTEXT_NEXT32(replacement);
                    groupNum=groupNum*10 + nextDigitVal;
                    ++numDigits;
                }
            } else {
                // $ not followed by capture group name or number.
                status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
            }

            if (U_SUCCESS(status)) {
                destLen += appendGroup(groupNum, dest, status);
            }
        }  // End of $ capture group handling
    }  // End of per-character loop through the replacement string.

    return *this;
}
//-------------------------------------------------------------------------------- // // appendTail Intended to be used in conjunction with appendReplacement() // To the destination string, append everything following // the last match position from the input string. // // Note: Match ranges do not affect appendTail or appendReplacement // //--------------------------------------------------------------------------------
// UnicodeString flavor of appendTail().
// Opens a UText over dest and forwards to the UText overload, then closes
// the wrapper.  The error code is local to this function, so a failure is
// not reported to the caller; dest is returned in every case.
UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
    UErrorCode localStatus = U_ZERO_ERROR;
    UText destText = UTEXT_INITIALIZER;
    utext_openUnicodeString(&destText, &dest, &localStatus);

    // Nothing to append to (and nothing to close) if the wrapper did not open.
    if (U_FAILURE(localStatus)) {
        return dest;
    }

    appendTail(&destText, localStatus);
    utext_close(&destText);
    return dest;
}
// // appendTail, UText mode //
UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) { if (U_FAILURE(status)) { return dest;
} if (U_FAILURE(fDeferredStatus)) {
status = fDeferredStatus; return dest;
}
//-------------------------------------------------------------------------------- // // findProgressInterrupt This function is called once for each advance in the target // string from the find() function, and calls the user progress callback // function if there is one installed. // // Return: true if the find operation is to be terminated. // false if the find operation is to continue running. // //--------------------------------------------------------------------------------
// Invoke the find-progress callback, if one is installed, passing it the
// callback context and the position pos.
//   Returns true, with status set to U_REGEX_STOPPED_BY_CALLER, when the
//   callback returns a false value; returns false otherwise, including when
//   no callback is installed.
UBool RegexMatcher::findProgressInterrupt(int64_t pos, UErrorCode &status) {
    if (fFindProgressCallbackFn && !(*fFindProgressCallbackFn)(fFindProgressCallbackContext, pos)) {
        status = U_REGEX_STOPPED_BY_CALLER;
        return true;
    }
    return false;
}
//-------------------------------------------------------------------------------- // // find() // //--------------------------------------------------------------------------------
// find() without an error-code parameter.
//   Returns false immediately when a deferred error is pending; otherwise
//   forwards to find(UErrorCode &) with a local status, which is discarded.
UBool RegexMatcher::find() {
    if (U_FAILURE(fDeferredStatus)) {
        return false;
    }
    UErrorCode status = U_ZERO_ERROR;
    return find(status);
}
//-------------------------------------------------------------------------------- // // find() // //--------------------------------------------------------------------------------
UBool RegexMatcher::find(UErrorCode &status) { // Start at the position of the last match end. (Will be zero if the // matcher has been reset.) // if (U_FAILURE(status)) { returnfalse;
} if (U_FAILURE(fDeferredStatus)) {
status = fDeferredStatus; returnfalse;
}
if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { return findUsingChunk(status);
}
if (fMatch) { // Save the position of any previous successful match.
fLastMatchEnd = fMatchEnd;
if (fMatchStart == fMatchEnd) { // Previous match had zero length. Move start position up one position // to avoid sending find() into a loop on zero-length matches. if (startPos >= fActiveLimit) {
fMatch = false;
fHitEnd = true; returnfalse;
}
UTEXT_SETNATIVEINDEX(fInputText, startPos);
(void)UTEXT_NEXT32(fInputText);
startPos = UTEXT_GETNATIVEINDEX(fInputText);
}
} else { if (fLastMatchEnd >= 0) { // A previous find() failed to match. Don't try again. // (without this test, a pattern with a zero-length match // could match again at the end of an input string.)
fHitEnd = true; returnfalse;
}
}
// Compute the position in the input string beyond which a match can not begin, because // the minimum length match would extend past the end of the input. // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int. // Be aware of possible overflows if making changes here.
int64_t testStartLimit; if (UTEXT_USES_U16(fInputText)) {
testStartLimit = fActiveLimit - fPattern->fMinMatchLen; if (startPos > testStartLimit) {
fMatch = false;
fHitEnd = true; returnfalse;
}
} else { // We don't know exactly how long the minimum match length is in native characters. // Treat anything > 0 as 1.
testStartLimit = fActiveLimit - (fPattern->fMinMatchLen > 0 ? 1 : 0);
}
UChar32 c;
U_ASSERT(startPos >= 0);
switch (fPattern->fStartType) { case START_NO_INFO: // No optimization was found. // Try a match at each input position. for (;;) {
MatchAt(startPos, false, status); if (U_FAILURE(status)) { returnfalse;
} if (fMatch) { returntrue;
} if (startPos >= testStartLimit) {
fHitEnd = true; returnfalse;
}
UTEXT_SETNATIVEINDEX(fInputText, startPos);
(void)UTEXT_NEXT32(fInputText);
startPos = UTEXT_GETNATIVEINDEX(fInputText); // Note that it's perfectly OK for a pattern to have a zero-length // match at the end of a string, so we must make sure that the loop // runs with startPos == testStartLimit the last time through. if (findProgressInterrupt(startPos, status)) returnfalse;
}
UPRV_UNREACHABLE_EXIT;
case START_START: // Matches are only possible at the start of the input string // (pattern begins with ^ or \A) if (startPos > fActiveStart) {
fMatch = false; returnfalse;
}
MatchAt(startPos, false, status); if (U_FAILURE(status)) { returnfalse;
} return fMatch;
case START_SET:
{ // Match may start on any char from a pre-computed set.
U_ASSERT(fPattern->fMinMatchLen > 0);
UTEXT_SETNATIVEINDEX(fInputText, startPos); for (;;) {
int64_t pos = startPos;
c = UTEXT_NEXT32(fInputText);
startPos = UTEXT_GETNATIVEINDEX(fInputText); // c will be -1 (U_SENTINEL) at end of text, in which case we // skip this next block (so we don't have a negative array index) // and handle end of text in the following block. if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
(c>=256 && fPattern->fInitialChars->contains(c)))) {
MatchAt(pos, false, status); if (U_FAILURE(status)) { returnfalse;
} if (fMatch) { returntrue;
}
UTEXT_SETNATIVEINDEX(fInputText, pos);
} if (startPos > testStartLimit) {
fMatch = false;
fHitEnd = true; returnfalse;
} if (findProgressInterrupt(startPos, status)) returnfalse;
}
}
UPRV_UNREACHABLE_EXIT;
case START_STRING: case START_CHAR:
{ // Match starts on exactly one char.
U_ASSERT(fPattern->fMinMatchLen > 0);
UChar32 theChar = fPattern->fInitialChar;
UTEXT_SETNATIVEINDEX(fInputText, startPos); for (;;) {
int64_t pos = startPos;
c = UTEXT_NEXT32(fInputText);
startPos = UTEXT_GETNATIVEINDEX(fInputText); if (c == theChar) {
MatchAt(pos, false, status); if (U_FAILURE(status)) { returnfalse;
} if (fMatch) { returntrue;
}
UTEXT_SETNATIVEINDEX(fInputText, startPos);
} if (startPos > testStartLimit) {
fMatch = false;
fHitEnd = true; returnfalse;
} if (findProgressInterrupt(startPos, status)) returnfalse;
}
}
UPRV_UNREACHABLE_EXIT;
if (fPattern->fFlags & UREGEX_UNIX_LINES) { for (;;) { if (ch == 0x0a) {
MatchAt(startPos, false, status); if (U_FAILURE(status)) { returnfalse;
} if (fMatch) { returntrue;
}
UTEXT_SETNATIVEINDEX(fInputText, startPos);
} if (startPos >= testStartLimit) {
fMatch = false;
fHitEnd = true; returnfalse;
}
ch = UTEXT_NEXT32(fInputText);
startPos = UTEXT_GETNATIVEINDEX(fInputText); // Note that it's perfectly OK for a pattern to have a zero-length // match at the end of a string, so we must make sure that the loop // runs with startPos == testStartLimit the last time through. if (findProgressInterrupt(startPos, status)) returnfalse;
}
} else { for (;;) { if (isLineTerminator(ch)) { if (ch == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
(void)UTEXT_NEXT32(fInputText);
startPos = UTEXT_GETNATIVEINDEX(fInputText);
}
MatchAt(startPos, false, status); if (U_FAILURE(status)) { returnfalse;
} if (fMatch) { returntrue;
}
UTEXT_SETNATIVEINDEX(fInputText, startPos);
} if (startPos >= testStartLimit) {
fMatch = false;
fHitEnd = true; returnfalse;
}
ch = UTEXT_NEXT32(fInputText);
startPos = UTEXT_GETNATIVEINDEX(fInputText); // Note that it's perfectly OK for a pattern to have a zero-length // match at the end of a string, so we must make sure that the loop // runs with startPos == testStartLimit the last time through. if (findProgressInterrupt(startPos, status)) returnfalse;
}
}
}
default:
UPRV_UNREACHABLE_ASSERT; // Unknown value in fPattern->fStartType, should be from StartOfMatch enum. But // we have reports of this in production code, don't use UPRV_UNREACHABLE_EXIT. // See ICU-21669.
status = U_INTERNAL_PROGRAM_ERROR; returnfalse;
}
UPRV_UNREACHABLE_EXIT;
}
UBool RegexMatcher::find(int64_t start, UErrorCode &status) { if (U_FAILURE(status)) { returnfalse;
} if (U_FAILURE(fDeferredStatus)) {
status = fDeferredStatus; returnfalse;
}
this->reset(); // Note: Reset() is specified by Java Matcher documentation. // This will reset the region to be the full input length. if (start < 0) {
status = U_INDEX_OUTOFBOUNDS_ERROR; returnfalse;
}
//-------------------------------------------------------------------------------- // // findUsingChunk() -- like find(), but with the advance knowledge that the // entire string is available in the UText's chunk buffer. // //--------------------------------------------------------------------------------
UBool RegexMatcher::findUsingChunk(UErrorCode &status) { // Start at the position of the last match end. (Will be zero if the // matcher has been reset. //
if (fMatch) { // Save the position of any previous successful match.
fLastMatchEnd = fMatchEnd;
if (fMatchStart == fMatchEnd) { // Previous match had zero length. Move start position up one position // to avoid sending find() into a loop on zero-length matches. if (startPos >= fActiveLimit) {
fMatch = false;
fHitEnd = true; returnfalse;
}
U16_FWD_1(inputBuf, startPos, fInputLength);
}
} else { if (fLastMatchEnd >= 0) { // A previous find() failed to match. Don't try again. // (without this test, a pattern with a zero-length match // could match again at the end of an input string.)
fHitEnd = true; returnfalse;
}
}
// Compute the position in the input string beyond which a match can not begin, because // the minimum length match would extend past the end of the input. // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int. // Be aware of possible overflows if making changes here. // Note: a match can begin at inputBuf + testLen; it is an inclusive limit.
int32_t testLen = static_cast<int32_t>(fActiveLimit - fPattern->fMinMatchLen); if (startPos > testLen) {
fMatch = false;
fHitEnd = true; returnfalse;
}
UChar32 c;
U_ASSERT(startPos >= 0);
switch (fPattern->fStartType) { case START_NO_INFO: // No optimization was found. // Try a match at each input position. for (;;) {
MatchChunkAt(startPos, false, status); if (U_FAILURE(status)) { returnfalse;
} if (fMatch) { returntrue;
} if (startPos >= testLen) {
fHitEnd = true; returnfalse;
}
U16_FWD_1(inputBuf, startPos, fActiveLimit); // Note that it's perfectly OK for a pattern to have a zero-length // match at the end of a string, so we must make sure that the loop // runs with startPos == testLen the last time through. if (findProgressInterrupt(startPos, status)) returnfalse;
}
UPRV_UNREACHABLE_EXIT;
case START_START: // Matches are only possible at the start of the input string // (pattern begins with ^ or \A) if (startPos > fActiveStart) {
fMatch = false; returnfalse;
}
MatchChunkAt(startPos, false, status); if (U_FAILURE(status)) { returnfalse;
} return fMatch;
case START_SET:
{ // Match may start on any char from a pre-computed set.
U_ASSERT(fPattern->fMinMatchLen > 0); for (;;) {
int32_t pos = startPos;
U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
(c>=256 && fPattern->fInitialChars->contains(c))) {
MatchChunkAt(pos, false, status); if (U_FAILURE(status)) { returnfalse;
} if (fMatch) { returntrue;
}
} if (startPos > testLen) {
fMatch = false;
fHitEnd = true; returnfalse;
} if (findProgressInterrupt(startPos, status)) returnfalse;
}
}
UPRV_UNREACHABLE_EXIT;
case START_STRING: case START_CHAR:
{ // Match starts on exactly one char.
U_ASSERT(fPattern->fMinMatchLen > 0);
UChar32 theChar = fPattern->fInitialChar; for (;;) {
int32_t pos = startPos;
U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; if (c == theChar) {
MatchChunkAt(pos, false, status); if (U_FAILURE(status)) { returnfalse;
} if (fMatch) { returntrue;
}
} if (startPos > testLen) {
fMatch = false;
fHitEnd = true; returnfalse;
} if (findProgressInterrupt(startPos, status)) returnfalse;
}
}
UPRV_UNREACHABLE_EXIT;
case START_LINE:
{
UChar32 ch; if (startPos == fAnchorStart) {
MatchChunkAt(startPos, false, status); if (U_FAILURE(status)) { returnfalse;
} if (fMatch) { returntrue;
}
U16_FWD_1(inputBuf, startPos, fActiveLimit);
}
if (fPattern->fFlags & UREGEX_UNIX_LINES) { for (;;) {
ch = inputBuf[startPos-1]; if (ch == 0x0a) {
MatchChunkAt(startPos, false, status); if (U_FAILURE(status)) { returnfalse;
} if (fMatch) { returntrue;
}
} if (startPos >= testLen) {
fMatch = false;
fHitEnd = true; returnfalse;
}
U16_FWD_1(inputBuf, startPos, fActiveLimit); // Note that it's perfectly OK for a pattern to have a zero-length // match at the end of a string, so we must make sure that the loop // runs with startPos == testLen the last time through. if (findProgressInterrupt(startPos, status)) returnfalse;
}
} else { for (;;) {
ch = inputBuf[startPos-1]; if (isLineTerminator(ch)) { if (ch == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
startPos++;
}
MatchChunkAt(startPos, false, status); if (U_FAILURE(status)) { returnfalse;
} if (fMatch) { returntrue;
}
} if (startPos >= testLen) {
fMatch = false;
fHitEnd = true; returnfalse;
}
U16_FWD_1(inputBuf, startPos, fActiveLimit); // Note that it's perfectly OK for a pattern to have a zero-length // match at the end of a string, so we must make sure that the loop // runs with startPos == testLen the last time through. if (findProgressInterrupt(startPos, status)) returnfalse;
}
}
}
default:
UPRV_UNREACHABLE_ASSERT; // Unknown value in fPattern->fStartType, should be from StartOfMatch enum. But // we have reports of this in production code, don't use UPRV_UNREACHABLE_EXIT. // See ICU-21669.
status = U_INTERNAL_PROGRAM_ERROR; returnfalse;
}
int64_t s, e; if (groupNum == 0) {
s = fMatchStart;
e = fMatchEnd;
} else {
int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
U_ASSERT(groupOffset < fPattern->fFrameSize);
U_ASSERT(groupOffset >= 0);
s = fFrame->fExtra[groupOffset];
e = fFrame->fExtra[groupOffset+1];
}
if (s < 0) { // A capture group wasn't part of the match return utext_clone(dest, fInputText, false, true, &status);
}
U_ASSERT(s <= e);
group_len = e - s;
dest = utext_clone(dest, fInputText, false, true, &status); if (dest)
UTEXT_SETNATIVEINDEX(dest, s); return dest;
}
// Get the group length using a utext_extract preflight. // UText is actually pretty efficient at this when underlying encoding is UTF-16.
int32_t length = utext_extract(fInputText, groupStart, groupEnd, nullptr, 0, &status); if (status != U_BUFFER_OVERFLOW_ERROR) { return result;
}
//-------------------------------------------------------------------------------- // // appendGroup() -- currently internal only, appends a group to a UText rather // than replacing its contents // //--------------------------------------------------------------------------------
int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const{ if (U_FAILURE(status)) { return 0;
} if (U_FAILURE(fDeferredStatus)) {
status = fDeferredStatus; return 0;
}
int64_t destLen = utext_nativeLength(dest);
if (fMatch == false) {
status = U_REGEX_INVALID_STATE; return utext_replace(dest, destLen, destLen, nullptr, 0, &status);
} if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
status = U_INDEX_OUTOFBOUNDS_ERROR; return utext_replace(dest, destLen, destLen, nullptr, 0, &status);
}
int64_t s, e; if (groupNum == 0) {
s = fMatchStart;
e = fMatchEnd;
} else {
int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
U_ASSERT(groupOffset < fPattern->fFrameSize);
U_ASSERT(groupOffset >= 0);
s = fFrame->fExtra[groupOffset];
e = fFrame->fExtra[groupOffset+1];
}
if (s < 0) { // A capture group wasn't part of the match return utext_replace(dest, destLen, destLen, nullptr, 0, &status);
}
U_ASSERT(s <= e);
//-------------------------------------------------------------------------------- // // getInput() -- like inputText(), but makes a clone or copies into another UText // //--------------------------------------------------------------------------------
UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const { if (U_FAILURE(status)) { return dest;
} if (U_FAILURE(fDeferredStatus)) {
status = fDeferredStatus; return dest;
}
// In the following test, we're really only interested in whether the UText should switch // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents // will still point to the correct data. if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
UnicodeString *us=(UnicodeString *)ut->context;
// Update to the latest length. // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
int32_t newLength = us->length();
// Update the chunk description. // The buffer may have switched between stack- and heap-based.
ut->chunkContents = us->getBuffer();
ut->chunkLength = newLength;
ut->chunkNativeLimit = newLength;
ut->nativeIndexingLimit = newLength;
retVal = true;
}
return retVal;
}
//-------------------------------------------------------------------------------- // // lookingAt() // //--------------------------------------------------------------------------------
UBool RegexMatcher::lookingAt(UErrorCode &status) { if (U_FAILURE(status)) { returnfalse;
} if (U_FAILURE(fDeferredStatus)) {
status = fDeferredStatus; returnfalse;
}
// Do the following for any UnicodeString. // This is for compatibility for those clients who modify the input string "live" during regex operations.
fInputUniStrMaybeMutable = true;
#if UCONFIG_NO_BREAK_ITERATION==0 if (fWordBreakItr) {
fWordBreakItr->setText(fInputText, fDeferredStatus);
} if (fGCBreakItr) {
fGCBreakItr->setText(fInputText, fDeferredStatus);
} #endif
// Reset the matcher, then set the match end to the given position.
//   Does nothing if status already indicates a failure on entry.
//   Sets status to U_INDEX_OUTOFBOUNDS_ERROR, leaving the matcher in its
//   freshly reset state, when position is negative or greater than fActiveLimit.
//   Always returns *this.
RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
    if (U_SUCCESS(status)) {
        reset();       // Reset also resets the region to be the entire string.

        const bool positionInRange = (position >= 0 && position <= fActiveLimit);
        if (positionInRange) {
            fMatchEnd = position;
        } else {
            status = U_INDEX_OUTOFBOUNDS_ERROR;
        }
    }
    return *this;
}
//-------------------------------------------------------------------------------- // // refresh // //--------------------------------------------------------------------------------
RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) { if (U_FAILURE(status)) { return *this;
} if (input == nullptr) {
status = U_ILLEGAL_ARGUMENT_ERROR; return *this;
} if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
status = U_ILLEGAL_ARGUMENT_ERROR; return *this;
}
int64_t pos = utext_getNativeIndex(fInputText); // Shallow read-only clone of the new UText into the existing input UText
fInputText = utext_clone(fInputText, input, false, true, &status); if (U_FAILURE(status)) { return *this;
}
utext_setNativeIndex(fInputText, pos);
/** * UText, replace entire contents of the destination UText with a substring of the source UText. * * @param src The source UText * @param dest The destination UText. Must be writable.
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5
¤ Dauer der Verarbeitung: 0.27 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.