// HIGH_VALUE > all valid values. 110000 for codepoints #define UNICODESET_HIGH 0x0110000
// LOW <= all valid values. ZERO for codepoints #define UNICODESET_LOW 0x000000
/** Max list [0, 1, 2, ..., max code point, HIGH] */
constexpr int32_t MAX_LENGTH = UNICODESET_HIGH + 1;
U_NAMESPACE_BEGIN
SymbolTable::~SymbolTable() {}
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)
/** * Modify the given UChar32 variable so that it is in range, by * pinning values < UNICODESET_LOW to UNICODESET_LOW, and * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1. * It modifies its argument in-place and also returns it.
*/ staticinline UChar32 pinCodePoint(UChar32& c) { if (c < UNICODESET_LOW) {
c = UNICODESET_LOW;
} elseif (c > (UNICODESET_HIGH-1)) {
c = (UNICODESET_HIGH-1);
} return c;
}
// DO NOT DELETE THIS CODE. This code is used to debug memory leaks. // To enable the debugging, define the symbol DEBUG_MEM in the line // below. This will result in text being sent to stdout that looks // like this: // DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85- // DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85- // Each line lists a construction (ct) or destruction (dt) event, the // object address, the number of outstanding objects after the event, // and the pattern of the object in question.
//---------------------------------------------------------------- // UnicodeString in UVector support //----------------------------------------------------------------
/** * Constructs a set containing the given range. If <code>end > * start</code> then an empty set is created. * * @param start first character, inclusive, of range * @param end last character, inclusive, of range
*/
UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) {
list[0] = UNICODESET_HIGH;
add(start, end);
_dbgct(this);
}
/** * Constructs a set that is identical to the given UnicodeSet.
*/
UnicodeSet::UnicodeSet(const UnicodeSet& o) : UnicodeFilter(o) {
*this = o;
_dbgct(this);
}
// Copy-construct as thawed.
UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : UnicodeFilter(o) { if (ensureCapacity(o.len)) { // *this = o except for bmpSet and stringSpan
len = o.len;
uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32)); if (o.hasStrings()) {
UErrorCode status = U_ZERO_ERROR; if (!allocateStrings(status) ||
(strings_->assign(*o.strings_, cloneUnicodeString, status), U_FAILURE(status))) {
setToBogus(); return;
}
} if (o.pat) {
setPattern(o.pat, o.patLen);
}
_dbgct(this);
}
}
/** * Assigns this object to be a copy of another.
*/
UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) { return copyFrom(o, false);
}
UnicodeSet& UnicodeSet::copyFrom(const UnicodeSet& o, UBool asThawed) { if (this == &o) { return *this;
} if (isFrozen()) { return *this;
} if (o.isBogus()) {
setToBogus(); return *this;
} if (!ensureCapacity(o.len)) { // ensureCapacity will mark the UnicodeSet as Bogus if OOM failure happens. return *this;
}
len = o.len;
uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32)); if (o.bmpSet != nullptr && !asThawed) {
bmpSet = new BMPSet(*o.bmpSet, list, len); if (bmpSet == nullptr) { // Check for memory allocation error.
setToBogus(); return *this;
}
} if (o.hasStrings()) {
UErrorCode status = U_ZERO_ERROR; if ((strings_ == nullptr && !allocateStrings(status)) ||
(strings_->assign(*o.strings_, cloneUnicodeString, status), U_FAILURE(status))) {
setToBogus(); return *this;
}
} elseif (hasStrings()) {
strings_->removeAllElements();
} if (o.stringSpan != nullptr && !asThawed) {
stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings_); if (stringSpan == nullptr) { // Check for memory allocation error.
setToBogus(); return *this;
}
}
releasePattern(); if (o.pat) {
setPattern(o.pat, o.patLen);
} return *this;
}
/** * Returns a copy of this object. All UnicodeMatcher objects have * to support cloning in order to allow classes using * UnicodeMatchers, such as Transliterator, to implement cloning.
*/
UnicodeSet* UnicodeSet::clone() const { returnnew UnicodeSet(*this);
}
/** * Compares the specified object with this set for equality. Returns * <tt>true</tt> if the two sets * have the same size, and every member of the specified set is * contained in this set (or equivalently, every member of this set is * contained in the specified set). * * @param o set to be compared for equality with this set. * @return <tt>true</tt> if the specified set is equal to this set.
*/ bool UnicodeSet::operator==(const UnicodeSet& o) const { if (len != o.len) returnfalse; for (int32_t i = 0; i < len; ++i) { if (list[i] != o.list[i]) returnfalse;
} if (hasStrings() != o.hasStrings()) { returnfalse; } if (hasStrings() && *strings_ != *o.strings_) returnfalse; returntrue;
}
/** * Returns the hash code value for this set. * * @return the hash code value for this set. * @see Object#hashCode()
*/
int32_t UnicodeSet::hashCode() const {
uint32_t result = static_cast<uint32_t>(len); for (int32_t i = 0; i < len; ++i) {
result *= 1000003u;
result += list[i];
} returnstatic_cast<int32_t>(result);
}
//---------------------------------------------------------------- // Public API //----------------------------------------------------------------
/** * Returns the number of elements in this set (its cardinality), * Note than the elements of a set may include both individual * codepoints and strings. * * @return the number of elements in this set (its cardinality).
*/
int32_t UnicodeSet::size() const {
int32_t n = 0;
int32_t count = getRangeCount(); for (int32_t i = 0; i < count; ++i) {
n += getRangeEnd(i) - getRangeStart(i) + 1;
} return n + stringsSize();
}
/** * Returns <tt>true</tt> if this set contains no elements. * * @return <tt>true</tt> if this set contains no elements.
*/
UBool UnicodeSet::isEmpty() const { return len == 1 && !hasStrings();
}
/** * Returns true if this set contains the given character. * @param c character to be checked for containment * @return true if the test condition is met
*/
UBool UnicodeSet::contains(UChar32 c) const { // Set i to the index of the start item greater than ch // We know we will terminate without length test! // LATER: for large sets, add binary search //int32_t i = -1; //for (;;) { // if (c < list[++i]) break; //} if (bmpSet != nullptr) { return bmpSet->contains(c);
} if (stringSpan != nullptr) { return stringSpan->contains(c);
} if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound returnfalse;
}
int32_t i = findCodePoint(c); return i & 1; // return true if odd
}
/** * Returns the smallest value i such that c < list[i]. Caller * must ensure that c is a legal value or this method will enter * an infinite loop. This method performs a binary search. * @param c a character in the range MIN_VALUE..MAX_VALUE * inclusive * @return the smallest integer i in the range 0..len-1, * inclusive, such that c < list[i]
*/
int32_t UnicodeSet::findCodePoint(UChar32 c) const { /* Examples: findCodePoint(c) set list[] c=0 1 3 4 7 8 === ============== =========== [] [110000] 0 0 0 0 0 0 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2 [:Any:] [0, 110000] 1 1 1 1 1 1
*/
// Return the smallest i such that c < list[i]. Assume // list[len - 1] == HIGH and that c is legal (0..HIGH-1). if (c < list[0]) return 0; // High runner test. c is often after the last range, so an // initial check for this condition pays off.
int32_t lo = 0;
int32_t hi = len - 1; if (lo >= hi || c >= list[hi-1]) return hi; // invariant: c >= list[lo] // invariant: c < list[hi] for (;;) {
int32_t i = (lo + hi) >> 1; if (i == lo) { break; // Found!
} elseif (c < list[i]) {
hi = i;
} else {
lo = i;
}
} return hi;
}
/** * Returns true if this set contains every character * of the given range. * @param start first character, inclusive, of the range * @param end last character, inclusive, of the range * @return true if the test condition is met
*/
UBool UnicodeSet::contains(UChar32 start, UChar32 end) const { //int32_t i = -1; //for (;;) { // if (start < list[++i]) break; //}
int32_t i = findCodePoint(start); return ((i & 1) != 0 && end < list[i]);
}
/** * Returns <tt>true</tt> if this set contains the given * multicharacter string. * @param s string to be checked for containment * @return <tt>true</tt> if this set contains the specified string
*/
UBool UnicodeSet::contains(const UnicodeString& s) const {
int32_t cp = getSingleCP(s); if (cp < 0) { return stringsContains(s);
} else { return contains(static_cast<UChar32>(cp));
}
}
/** * Returns true if this set contains all the characters and strings * of the given set. * @param c set to be checked for containment * @return true if the test condition is met
*/
UBool UnicodeSet::containsAll(const UnicodeSet& c) const { // The specified set is a subset if all of its pairs are contained in // this set. It's possible to code this more efficiently in terms of // direct manipulation of the inversion lists if the need arises.
int32_t n = c.getRangeCount(); for (int i=0; i<n; ++i) { if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) { returnfalse;
}
} return !c.hasStrings() || (strings_ != nullptr && strings_->containsAll(*c.strings_));
}
/** * Returns true if this set contains all the characters * of the given string. * @param s string containing characters to be checked for containment * @return true if the test condition is met
*/
UBool UnicodeSet::containsAll(const UnicodeString& s) const { return span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) == s.length();
}
/** * Returns true if this set contains none of the characters * of the given range. * @param start first character, inclusive, of the range * @param end last character, inclusive, of the range * @return true if the test condition is met
*/
UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const { //int32_t i = -1; //for (;;) { // if (start < list[++i]) break; //}
int32_t i = findCodePoint(start); return ((i & 1) == 0 && end < list[i]);
}
/** * Returns true if this set contains none of the characters and strings * of the given set. * @param c set to be checked for containment * @return true if the test condition is met
*/
UBool UnicodeSet::containsNone(const UnicodeSet& c) const { // The specified set is a subset if all of its pairs are contained in // this set. It's possible to code this more efficiently in terms of // direct manipulation of the inversion lists if the need arises.
int32_t n = c.getRangeCount(); for (int32_t i=0; i<n; ++i) { if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) { returnfalse;
}
} return strings_ == nullptr || !c.hasStrings() || strings_->containsNone(*c.strings_);
}
/** * Returns true if this set contains none of the characters * of the given string. * @param s string containing characters to be checked for containment * @return true if the test condition is met
*/
UBool UnicodeSet::containsNone(const UnicodeString& s) const { return span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) == s.length();
}
/** * Returns <tt>true</tt> if this set contains any character whose low byte * is the given value. This is used by <tt>RuleBasedTransliterator</tt> for * indexing.
*/
UBool UnicodeSet::matchesIndexValue(uint8_t v) const { /* The index value v, in the range [0,255], is contained in this set if * it is contained in any pair of this set. Pairs either have the high * bytes equal, or unequal. If the high bytes are equal, then we have * aaxx..aayy, where aa is the high byte. Then v is contained if xx <= * v <= yy. If the high bytes are unequal we have aaxx..bbyy, bb>aa. * Then v is contained if xx <= v || v <= yy. (This is identical to the * time zone month containment logic.)
*/
int32_t i;
int32_t rangeCount=getRangeCount(); for (i=0; i<rangeCount; ++i) {
UChar32 low = getRangeStart(i);
UChar32 high = getRangeEnd(i); if ((low & ~0xFF) == (high & ~0xFF)) { if ((low & 0xFF) <= v && v <= (high & 0xFF)) { returntrue;
}
} elseif ((low & 0xFF) <= v || v <= (high & 0xFF)) { returntrue;
}
} if (hasStrings()) { for (i=0; i<strings_->size(); ++i) { const UnicodeString& s = *static_cast<const UnicodeString*>(strings_->elementAt(i)); if (s.isEmpty()) { continue; // skip the empty string
}
UChar32 c = s.char32At(0); if ((c & 0xFF) == v) { returntrue;
}
}
} returnfalse;
}
/** * Implementation of UnicodeMatcher::matches(). Always matches the * longest possible multichar string.
*/
UMatchDegree UnicodeSet::matches(const Replaceable& text,
int32_t& offset,
int32_t limit,
UBool incremental) { if (offset == limit) { if (contains(U_ETHER)) { return incremental ? U_PARTIAL_MATCH : U_MATCH;
} else { return U_MISMATCH;
}
} else { if (hasStrings()) { // try strings first
// might separate forward and backward loops later // for now they are combined
// TODO Improve efficiency of this, at least in the forward // direction, if not in both. In the forward direction we // can assume the strings are sorted.
int32_t i;
UBool forward = offset < limit;
// firstChar is the leftmost char to match in the // forward direction or the rightmost char to match in // the reverse direction.
char16_t firstChar = text.charAt(offset);
// If there are multiple strings that can match we // return the longest match.
int32_t highWaterLength = 0;
for (i=0; i<strings_->size(); ++i) { const UnicodeString& trial = *static_cast<const UnicodeString*>(strings_->elementAt(i)); if (trial.isEmpty()) { continue; // skip the empty string
}
char16_t c = trial.charAt(forward ? 0 : trial.length() - 1);
// Strings are sorted, so we can optimize in the // forward direction. if (forward && c > firstChar) break; if (c != firstChar) continue;
if (incremental) {
int32_t maxLen = forward ? limit-offset : offset-limit; if (matchLen == maxLen) { // We have successfully matched but only up to limit. return U_PARTIAL_MATCH;
}
}
if (matchLen == trial.length()) { // We have successfully matched the whole string. if (matchLen > highWaterLength) {
highWaterLength = matchLen;
} // In the forward direction we know strings // are sorted so we can bail early. if (forward && matchLen < highWaterLength) { break;
} continue;
}
}
// We've checked all strings without a partial match. // If we have full matches, return the longest one. if (highWaterLength != 0) {
offset += forward ? highWaterLength : -highWaterLength; return U_MATCH;
}
} return UnicodeFilter::matches(text, offset, limit, incremental);
}
}
/** * Returns the longest match for s in text at the given position. * If limit > start then match forward from start+1 to limit * matching all characters except s.charAt(0). If limit < start, * go backward starting from start-1 matching all characters * except s.charAt(s.length()-1). This method assumes that the * first character, text.charAt(start), matches s, so it does not * check it. * @param text the text to match * @param start the first character to match. In the forward * direction, text.charAt(start) is matched against s.charAt(0). * In the reverse direction, it is matched against * s.charAt(s.length()-1). * @param limit the limit offset for matching, either last+1 in * the forward direction, or last-1 in the reverse direction, * where last is the index of the last character to match. * @return If part of s matches up to the limit, return |limit - * start|. If all of s matches before reaching the limit, return * s.length(). If there is a mismatch between s and text, return * 0
*/
int32_t UnicodeSet::matchRest(const Replaceable& text,
int32_t start, int32_t limit, const UnicodeString& s) {
int32_t i;
int32_t maxLen;
int32_t slen = s.length(); if (start < limit) {
maxLen = limit - start; if (maxLen > slen) maxLen = slen; for (i = 1; i < maxLen; ++i) { if (text.charAt(start + i) != s.charAt(i)) return 0;
}
} else {
maxLen = start - limit; if (maxLen > slen) maxLen = slen;
--slen; // <=> slen = s.length() - 1; for (i = 1; i < maxLen; ++i) { if (text.charAt(start - i) != s.charAt(slen - i)) return 0;
}
} return maxLen;
}
/** * Returns the index of the given character within this set, where * the set is ordered by ascending code point. If the character * is not in this set, return -1. The inverse of this method is * <code>charAt()</code>. * @return an index from 0..size()-1, or -1
*/
int32_t UnicodeSet::indexOf(UChar32 c) const { if (c < MIN_VALUE || c > MAX_VALUE) { return -1;
}
int32_t i = 0;
int32_t n = 0; for (;;) {
UChar32 start = list[i++]; if (c < start) { return -1;
}
UChar32 limit = list[i++]; if (c < limit) { return n + c - start;
}
n += limit - start;
}
}
/** * Returns the character at the given index within this set, where * the set is ordered by ascending code point. If the index is * out of range, return (UChar32)-1. The inverse of this method is * <code>indexOf()</code>. * @param index an index from 0..size()-1 * @return the character at the given index, or (UChar32)-1.
*/
UChar32 UnicodeSet::charAt(int32_t index) const { if (index >= 0) { // len2 is the largest even integer <= len, that is, it is len // for even values and len-1 for odd values. With odd values // the last entry is UNICODESET_HIGH.
int32_t len2 = len & ~1; for (int32_t i=0; i < len2;) {
UChar32 start = list[i++];
int32_t count = list[i++] - start; if (index < count) { returnstatic_cast<UChar32>(start + index);
}
index -= count;
}
} returnstatic_cast<UChar32>(-1);
}
/** * Make this object represent the range <code>start - end</code>. * If <code>end > start</code> then this object is set to an * an empty range. * * @param start first character in the set, inclusive * @rparam end last character in the set, inclusive
*/
UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
clear();
complement(start, end); return *this;
}
/** * Adds the specified range to this set if it is not already * present. If this set already contains the specified range, * the call leaves this set unchanged. If <code>end > start</code> * then an empty range is added, leaving the set unchanged. * * @param start first character, inclusive, of range to be added * to this set. * @param end last character, inclusive, of range to be added * to this set.
*/
UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) { if (pinCodePoint(start) < pinCodePoint(end)) {
UChar32 limit = end + 1; // Fast path for adding a new range after the last one. // Odd list length: [..., lastStart, lastLimit, HIGH] if ((len & 1) != 0) { // If the list is empty, set lastLimit low enough to not be adjacent to 0.
UChar32 lastLimit = len == 1 ? -2 : list[len - 2]; if (lastLimit <= start && !isFrozen() && !isBogus()) { if (lastLimit == start) { // Extend the last range.
list[len - 2] = limit; if (limit == UNICODESET_HIGH) {
--len;
}
} else {
list[len - 1] = start; if (limit < UNICODESET_HIGH) { if (ensureCapacity(len + 2)) {
list[len++] = limit;
list[len++] = UNICODESET_HIGH;
}
} else { // limit == UNICODESET_HIGH if (ensureCapacity(len + 1)) {
list[len++] = UNICODESET_HIGH;
}
}
}
releasePattern(); return *this;
}
} // This is slow. Could be much faster using findCodePoint(start) // and modifying the list, dealing with adjacent & overlapping ranges.
UChar32 range[3] = { start, limit, UNICODESET_HIGH };
add(range, 2, 0);
} elseif (start == end) {
add(start);
} return *this;
}
// #define DEBUG_US_ADD
#ifdef DEBUG_US_ADD #include <stdio.h> void dump(UChar32 c) { if (c <= 0xFF) {
printf("%c", (char)c);
} else {
printf("U+%04X", c);
}
} void dump(const UChar32* list, int32_t len) {
printf("["); for (int32_t i=0; i<len; ++i) { if (i != 0) printf(", ");
dump(list[i]);
}
printf("]");
} #endif
/** * Adds the specified character to this set if it is not already * present. If this set already contains the specified character, * the call leaves this set unchanged.
*/
UnicodeSet& UnicodeSet::add(UChar32 c) { // find smallest i such that c < list[i] // if odd, then it is IN the set // if even, then it is OUT of the set
int32_t i = findCodePoint(pinCodePoint(c));
// already in set? if ((i & 1) != 0 || isFrozen() || isBogus()) return *this;
// HIGH is 0x110000 // assert(list[len-1] == HIGH);
#ifdef DEBUG_US_ADD
printf("Add of ");
dump(c);
printf(" found at %d", i);
printf(": ");
dump(list, len);
printf(" => "); #endif
if (c == list[i]-1) { // c is before start of next range
list[i] = c; // if we touched the HIGH mark, then add a new one if (c == (UNICODESET_HIGH - 1)) { if (!ensureCapacity(len+1)) { // ensureCapacity will mark the object as Bogus if OOM failure happens. return *this;
}
list[len++] = UNICODESET_HIGH;
} if (i > 0 && c == list[i-1]) { // collapse adjacent ranges
for (i=1; i<len; ++i) { if (list[i] <= list[i-1]) { // Corrupt array!
printf("ERROR: list has been corrupted\n"); exit(1);
}
} #endif
releasePattern(); return *this;
}
/** * Adds the specified multicharacter to this set if it is not already * present. If this set already contains the multicharacter, * the call leaves this set unchanged. * Thus "ch" => {"ch"} * * @param s the source string * @return the modified set, for chaining
*/
UnicodeSet& UnicodeSet::add(const UnicodeString& s) { if (isFrozen() || isBogus()) return *this;
int32_t cp = getSingleCP(s); if (cp < 0) { if (!stringsContains(s)) {
_add(s);
releasePattern();
}
} else {
add(static_cast<UChar32>(cp));
} return *this;
}
/** * Adds the given string, in order, to 'strings_'. The given string * must have been checked by the caller to not already be in 'strings_'.
*/ void UnicodeSet::_add(const UnicodeString& s) { if (isFrozen() || isBogus()) { return;
}
UErrorCode ec = U_ZERO_ERROR; if (strings_ == nullptr && !allocateStrings(ec)) {
setToBogus(); return;
}
UnicodeString* t = new UnicodeString(s); if (t == nullptr) { // Check for memory allocation error.
setToBogus(); return;
}
strings_->sortedInsert(t, compareUnicodeString, ec); if (U_FAILURE(ec)) {
setToBogus();
}
}
/** * @return a code point IF the string consists of a single one. * otherwise returns -1. * @param string to test
*/
int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
int32_t sLength = s.length(); if (sLength == 1) return s.charAt(0); if (sLength == 2) {
UChar32 cp = s.char32At(0); if (cp > 0xFFFF) { // is surrogate pair return cp;
}
} return -1;
}
/** * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"} * If this set already any particular character, it has no effect on that character. * @param the source string * @return the modified set, for chaining
*/
UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) {
UChar32 cp; for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
cp = s.char32At(i);
add(cp);
} return *this;
}
/** * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"} * If this set already any particular character, it has no effect on that character. * @param the source string * @return the modified set, for chaining
*/
UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) {
UnicodeSet set;
set.addAll(s);
retainAll(set); return *this;
}
/** * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"} * If this set already any particular character, it has no effect on that character. * @param the source string * @return the modified set, for chaining
*/
UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) {
UnicodeSet set;
set.addAll(s);
complementAll(set); return *this;
}
/** * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"} * If this set already any particular character, it has no effect on that character. * @param the source string * @return the modified set, for chaining
*/
UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
UnicodeSet set;
set.addAll(s);
removeAll(set); return *this;
}
/** * Makes a set from a multicharacter string. Thus "ch" => {"ch"} * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b> * @param the source string * @return a newly created set containing the given string
*/
UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) {
UnicodeSet *set = new UnicodeSet(); if (set != nullptr) { // Check for memory allocation error.
set->add(s);
} return set;
}
/** * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"} * @param the source string * @return a newly created set containing the given characters
*/
UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) {
UnicodeSet *set = new UnicodeSet(); if (set != nullptr) { // Check for memory allocation error.
set->addAll(s);
} return set;
}
/** * Retain only the elements in this set that are contained in the * specified range. If <code>end > start</code> then an empty range is * retained, leaving the set empty. * * @param start first character, inclusive, of range to be retained * to this set. * @param end last character, inclusive, of range to be retained * to this set.
*/
UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) { if (pinCodePoint(start) <= pinCodePoint(end)) {
UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
retain(range, 2, 0);
} else {
clear();
} return *this;
}
UnicodeSet& UnicodeSet::retain(UChar32 c) { return retain(c, c);
}
UnicodeSet& UnicodeSet::retain(const UnicodeString &s) { if (isFrozen() || isBogus()) { return *this; }
UChar32 cp = getSingleCP(s); if (cp < 0) { bool isIn = stringsContains(s); // Check for getRangeCount() first to avoid somewhat-expensive size() // when there are single code points. if (isIn && getRangeCount() == 0 && size() == 1) { return *this;
}
clear(); if (isIn) {
_add(s);
}
} else {
retain(cp, cp);
} return *this;
}
/** * Removes the specified range from this set if it is present. * The set will not contain the specified range once the call * returns. If <code>end > start</code> then an empty range is * removed, leaving the set unchanged. * * @param start first character, inclusive, of range to be removed * from this set. * @param end last character, inclusive, of range to be removed * from this set.
*/
UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) { if (pinCodePoint(start) <= pinCodePoint(end)) {
UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
retain(range, 2, 2);
} return *this;
}
/** * Removes the specified character from this set if it is present. * The set will not contain the specified range once the call * returns.
*/
UnicodeSet& UnicodeSet::remove(UChar32 c) { return remove(c, c);
}
/** * Removes the specified string from this set if it is present. * The set will not contain the specified character once the call * returns. * @param the source string * @return the modified set, for chaining
*/
UnicodeSet& UnicodeSet::remove(const UnicodeString& s) { if (isFrozen() || isBogus()) return *this;
int32_t cp = getSingleCP(s); if (cp < 0) { if (strings_ != nullptr && strings_->removeElement((void*) &s)) {
releasePattern();
}
} else {
remove(static_cast<UChar32>(cp), static_cast<UChar32>(cp));
} return *this;
}
/** * Complements the specified range in this set. Any character in * the range will be removed if it is in this set, or will be * added if it is not in this set. If <code>end > start</code> * then an empty range is xor'ed, leaving the set unchanged. * * @param start first character, inclusive, of range to be removed * from this set. * @param end last character, inclusive, of range to be removed * from this set.
*/
UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) { if (isFrozen() || isBogus()) { return *this;
} if (pinCodePoint(start) <= pinCodePoint(end)) {
UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
exclusiveOr(range, 2, 0);
}
releasePattern(); return *this;
}
UnicodeSet& UnicodeSet::complement(UChar32 c) { return complement(c, c);
}
/** * This is equivalent to * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
*/
UnicodeSet& UnicodeSet::complement() { if (isFrozen() || isBogus()) { return *this;
} if (list[0] == UNICODESET_LOW) {
uprv_memmove(list, list + 1, (size_t)(len-1)*sizeof(UChar32));
--len;
} else { if (!ensureCapacity(len+1)) { return *this;
}
uprv_memmove(list + 1, list, (size_t)len*sizeof(UChar32));
list[0] = UNICODESET_LOW;
++len;
}
releasePattern(); return *this;
}
/** * Complement the specified string in this set. * The set will not contain the specified string once the call * returns. * * @param s the string to complement * @return this object, for chaining
*/
UnicodeSet& UnicodeSet::complement(const UnicodeString& s) { if (isFrozen() || isBogus()) return *this;
int32_t cp = getSingleCP(s); if (cp < 0) { if (stringsContains(s)) {
strings_->removeElement((void*) &s);
} else {
_add(s);
}
releasePattern();
} else {
complement(static_cast<UChar32>(cp), static_cast<UChar32>(cp));
} return *this;
}
/** * Adds all of the elements in the specified set to this set if * they're not already present. This operation effectively * modifies this set so that its value is the <i>union</i> of the two * sets. The behavior of this operation is unspecified if the specified * collection is modified while the operation is in progress. * * @param c set whose elements are to be added to this set. * @see #add(char, char)
*/
UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) { if ( c.len>0 && c.list!=nullptr ) {
add(c.list, c.len, 0);
}
// Add strings in order if ( c.strings_!=nullptr ) { for (int32_t i=0; i<c.strings_->size(); ++i) { const UnicodeString* s = static_cast<const UnicodeString*>(c.strings_->elementAt(i)); if (!stringsContains(*s)) {
_add(*s);
}
}
} return *this;
}
/** * Retains only the elements in this set that are contained in the * specified set. In other words, removes from this set all of * its elements that are not contained in the specified set. This * operation effectively modifies this set so that its value is * the <i>intersection</i> of the two sets. * * @param c set that defines which elements this set will retain.
*/
UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) { if (isFrozen() || isBogus()) { return *this;
}
retain(c.list, c.len, 0); if (hasStrings()) { if (!c.hasStrings()) {
strings_->removeAllElements();
} else {
strings_->retainAll(*c.strings_);
}
} return *this;
}
/** * Removes from this set all of its elements that are contained in the * specified set. This operation effectively modifies this * set so that its value is the <i>asymmetric set difference</i> of * the two sets. * * @param c set that defines which elements will be removed from * this set.
*/
UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) { if (isFrozen() || isBogus()) { return *this;
}
retain(c.list, c.len, 2); if (hasStrings() && c.hasStrings()) {
strings_->removeAll(*c.strings_);
} return *this;
}
/** * Complements in this set all elements contained in the specified * set. Any character in the other set will be removed if it is * in this set, or will be added if it is not in this set. * * @param c set that defines which elements will be xor'ed from * this set.
*/
UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) { if (isFrozen() || isBogus()) { return *this;
}
exclusiveOr(c.list, c.len, 0);
if (c.strings_ != nullptr) { for (int32_t i=0; i<c.strings_->size(); ++i) { void* e = c.strings_->elementAt(i); if (strings_ == nullptr || !strings_->removeElement(e)) {
_add(*static_cast<const UnicodeString*>(e));
}
}
} return *this;
}
/** * Removes all of the elements from this set. This set will be * empty after this call returns.
*/
UnicodeSet& UnicodeSet::clear() { if (isFrozen()) { return *this;
}
list[0] = UNICODESET_HIGH;
len = 1;
releasePattern(); if (strings_ != nullptr) {
strings_->removeAllElements();
} // Remove bogus
fFlags = 0; return *this;
}
/** * Iteration method that returns the number of ranges contained in * this set. * @see #getRangeStart * @see #getRangeEnd
*/
int32_t UnicodeSet::getRangeCount() const { return len/2;
}
/** * Iteration method that returns the first character in the * specified range of this set. * @see #getRangeCount * @see #getRangeEnd
*/
UChar32 UnicodeSet::getRangeStart(int32_t index) const { return list[index*2];
}
/** * Iteration method that returns the last character in the * specified range of this set. * @see #getRangeStart * @see #getRangeEnd
*/
UChar32 UnicodeSet::getRangeEnd(int32_t index) const { return list[index*2 + 1] - 1;
}
/** * Reallocate this objects internal structures to take up the least * possible space, without changing this object's value.
*/
UnicodeSet& UnicodeSet::compact() { if (isFrozen() || isBogus()) { return *this;
} // Delete buffer first to defragment memory less. if (buffer != stackList) {
uprv_free(buffer);
buffer = nullptr;
bufferCapacity = 0;
} if (list == stackList) { // pass
} elseif (len <= INITIAL_CAPACITY) {
uprv_memcpy(stackList, list, len * sizeof(UChar32));
uprv_free(list);
list = stackList;
capacity = INITIAL_CAPACITY;
} elseif ((len + 7) < capacity) { // If we have more than a little unused capacity, shrink it to len.
UChar32* temp = static_cast<UChar32*>(uprv_realloc(list, sizeof(UChar32) * len)); if (temp) {
list = temp;
capacity = len;
} // else what the heck happened?! We allocated less memory! // Oh well. We'll keep our original array.
} if (strings_ != nullptr && strings_->isEmpty()) { delete strings_;
strings_ = nullptr;
} return *this;
}
/* count necessary 16-bit units */
length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH // assert(length>=0); if (length==0) { /* empty set */ if (destCapacity>0) {
*dest=0;
} else {
ec=U_BUFFER_OVERFLOW_ERROR;
} return 1;
} /* now length>0 */
if (this->list[length-1]<=0xffff) { /* all BMP */
bmpLength=length;
} elseif (this->list[0]>=0x10000) { /* all supplementary */
bmpLength=0;
length*=2;
} else { /* some BMP, some supplementary */ for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++bmpLength) {}
length=bmpLength+2*(length-bmpLength);
} #ifdef DEBUG_SERIALIZE
printf(">> bmpLength%d length%d len%d\n", bmpLength, length, len); #endif /* length: number of 16-bit array units */ if (length>0x7fff) { /* there are only 15 bits for the length in the first serialized word */
ec=U_INDEX_OUTOFBOUNDS_ERROR; return 0;
}
/* * total serialized length: * number of 16-bit array units (length) + * 1 length unit (always) + * 1 bmpLength unit (if there are supplementary values)
*/
destLength=length+((length>bmpLength)?2:1); if (destLength<=destCapacity) { const UChar32 *p;
int32_t i;
/* write the BMP part of the array */
p=this->list; for (i=0; i<bmpLength; ++i) { #ifdef DEBUG_SERIALIZE
printf("writebmp: %x\n", (int)*p); #endif
*dest++ = static_cast<uint16_t>(*p++);
}
/* write the supplementary part of the array */ for (; i<length; i+=2) { #ifdef DEBUG_SERIALIZE
printf("write32: %x\n", (int)*p); #endif
*dest++ = static_cast<uint16_t>(*p >> 16);
*dest++ = static_cast<uint16_t>(*p++);
}
} else {
ec=U_BUFFER_OVERFLOW_ERROR;
} return destLength;
}
int32_t i = 0, j = 0, k = 0;
UChar32 a = list[i++];
UChar32 b; if (polarity == 1 || polarity == 2) {
b = UNICODESET_LOW; if (other[j] == UNICODESET_LOW) { // skip base if already LOW
++j;
b = other[j];
}
} else {
b = other[j++];
} // simplest of all the routines // sort the values, discarding identicals! for (;;) { if (a < b) {
buffer[k++] = a;
a = list[i++];
} elseif (b < a) {
buffer[k++] = b;
b = other[j++];
} elseif (a != UNICODESET_HIGH) { // at this point, a == b // discard both values!
a = list[i++];
b = other[j++];
} else { // DONE!
buffer[k++] = UNICODESET_HIGH;
len = k; break;
}
}
swapBuffers();
releasePattern();
}
// polarity = 0 is normal: x union y // polarity = 2: x union ~y // polarity = 1: ~x union y // polarity = 3: ~x union ~y
int32_t i = 0, j = 0, k = 0;
UChar32 a = list[i++];
UChar32 b = other[j++]; // change from xor is that we have to check overlapping pairs // polarity bit 1 means a is second, bit 2 means b is. for (;;) { switch (polarity) { case 0: // both first; take lower if unequal if (a < b) { // take a // Back up over overlapping ranges in buffer[] if (k > 0 && a <= buffer[k-1]) { // Pick latter end value in buffer[] vs. list[]
a = max(list[i], buffer[--k]);
} else { // No overlap
buffer[k++] = a;
a = list[i];
}
i++; // Common if/else code factored out
polarity ^= 1;
} elseif (b < a) { // take b if (k > 0 && b <= buffer[k-1]) {
b = max(other[j], buffer[--k]);
} else {
buffer[k++] = b;
b = other[j];
}
j++;
polarity ^= 2;
} else { // a == b, take a, drop b if (a == UNICODESET_HIGH) goto loop_end; // This is symmetrical; it doesn't matter if // we backtrack with a or b. - liu if (k > 0 && a <= buffer[k-1]) {
a = max(list[i], buffer[--k]);
} else { // No overlap
buffer[k++] = a;
a = list[i];
}
i++;
polarity ^= 1;
b = other[j++];
polarity ^= 2;
} break; case 3: // both second; take higher if unequal, and drop other if (b <= a) { // take a if (a == UNICODESET_HIGH) goto loop_end;
buffer[k++] = a;
} else { // take b if (b == UNICODESET_HIGH) goto loop_end;
buffer[k++] = b;
}
a = list[i++];
polarity ^= 1; // factored common code
b = other[j++];
polarity ^= 2; break; case 1: // a second, b first; if b < a, overlap if (a < b) { // no overlap, take a
buffer[k++] = a; a = list[i++]; polarity ^= 1;
} elseif (b < a) { // OVERLAP, drop b
b = other[j++];
polarity ^= 2;
} else { // a == b, drop both! if (a == UNICODESET_HIGH) goto loop_end;
a = list[i++];
polarity ^= 1;
b = other[j++];
polarity ^= 2;
} break; case 2: // a first, b second; if a < b, overlap if (b < a) { // no overlap, take b
buffer[k++] = b;
b = other[j++];
polarity ^= 2;
} elseif (a < b) { // OVERLAP, drop a
a = list[i++];
polarity ^= 1;
} else { // a == b, drop both! if (a == UNICODESET_HIGH) goto loop_end;
a = list[i++];
polarity ^= 1;
b = other[j++];
polarity ^= 2;
} break;
}
}
loop_end:
buffer[k++] = UNICODESET_HIGH; // terminate
len = k;
swapBuffers();
releasePattern();
}
// polarity = 0 is normal: x intersect y // polarity = 2: x intersect ~y == set-minus // polarity = 1: ~x intersect y // polarity = 3: ~x intersect ~y
int32_t i = 0, j = 0, k = 0;
UChar32 a = list[i++];
UChar32 b = other[j++]; // change from xor is that we have to check overlapping pairs // polarity bit 1 means a is second, bit 2 means b is. for (;;) { switch (polarity) { case 0: // both first; drop the smaller if (a < b) { // drop a
a = list[i++];
polarity ^= 1;
} elseif (b < a) { // drop b
b = other[j++];
polarity ^= 2;
} else { // a == b, take one, drop other if (a == UNICODESET_HIGH) goto loop_end;
buffer[k++] = a;
a = list[i++];
polarity ^= 1;
b = other[j++];
polarity ^= 2;
} break; case 3: // both second; take lower if unequal if (a < b) { // take a
buffer[k++] = a;
a = list[i++];
polarity ^= 1;
} elseif (b < a) { // take b
buffer[k++] = b;
b = other[j++];
polarity ^= 2;
} else { // a == b, take one, drop other if (a == UNICODESET_HIGH) goto loop_end;
buffer[k++] = a;
a = list[i++];
polarity ^= 1;
b = other[j++];
polarity ^= 2;
} break; case 1: // a second, b first; if (a < b) { // NO OVERLAP, drop a
a = list[i++];
polarity ^= 1;
} elseif (b < a) { // OVERLAP, take b
buffer[k++] = b;
b = other[j++];
polarity ^= 2;
} else { // a == b, drop both! if (a == UNICODESET_HIGH) goto loop_end;
a = list[i++];
polarity ^= 1;
b = other[j++];
polarity ^= 2;
} break; case 2: // a first, b second; if a < b, overlap if (b < a) { // no overlap, drop b
b = other[j++];
polarity ^= 2;
} elseif (a < b) { // OVERLAP, take a
buffer[k++] = a;
a = list[i++];
polarity ^= 1;
} else { // a == b, drop both! if (a == UNICODESET_HIGH) goto loop_end;
a = list[i++];
polarity ^= 1;
b = other[j++];
polarity ^= 2;
} break;
}
}
loop_end:
buffer[k++] = UNICODESET_HIGH; // terminate
len = k;
swapBuffers();
releasePattern();
}
/** * Append the <code>toPattern()</code> representation of a * string to the given <code>StringBuffer</code>.
*/ void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable) {
UChar32 cp; for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
_appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
}
}
/** * Append the <code>toPattern()</code> representation of a * character to the given <code>StringBuffer</code>.
*/ void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable) { if (escapeUnprintable ? ICU_Utility::isUnprintable(c) : ICU_Utility::shouldAlwaysBeEscaped(c)) { // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything // unprintable
ICU_Utility::escape(buf, c); return;
} // Okay to let ':' pass through switch (c) { case u'[': case u']': case u'-': case u'^': case u'&': case u'\\': case u'{': case u'}': case u':': case SymbolTable::SYMBOL_REF:
buf.append(u'\\'); break; default: // Escape whitespace if (PatternProps::isWhiteSpace(c)) {
buf.append(u'\\');
} break;
}
buf.append(c);
}
void UnicodeSet::_appendToPat(UnicodeString &result, UChar32 start, UChar32 end,
UBool escapeUnprintable) {
_appendToPat(result, start, escapeUnprintable); if (start != end) { if ((start+1) != end || // Avoid writing what looks like a lead+trail surrogate pair.
start == 0xdbff) {
result.append(u'-');
}
_appendToPat(result, end, escapeUnprintable);
}
}
/** * Append a string representation of this set to result. This will be * a cleaned version of the string passed to applyPattern(), if there * is one. Otherwise it will be generated.
*/
UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
UBool escapeUnprintable) const
{ if (pat != nullptr) {
int32_t i;
int32_t backslashCount = 0; for (i=0; i<patLen; ) {
UChar32 c;
U16_NEXT(pat, i, patLen, c); if (escapeUnprintable ?
ICU_Utility::isUnprintable(c) : ICU_Utility::shouldAlwaysBeEscaped(c)) { // If the unprintable character is preceded by an odd // number of backslashes, then it has been escaped. // Before unescaping it, we delete the final // backslash. if ((backslashCount % 2) == 1) {
result.truncate(result.length() - 1);
}
ICU_Utility::escape(result, c);
backslashCount = 0;
} else {
result.append(c); if (c == u'\\') {
++backslashCount;
} else {
backslashCount = 0;
}
}
} return result;
}
/** * Returns a string representation of this set. If the result of * calling this function is passed to a UnicodeSet constructor, it * will produce another set that is equal to this one.
*/
UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
UBool escapeUnprintable) const
{
result.truncate(0); return _toPattern(result, escapeUnprintable);
}
/** * Generate and append a string representation of this set to result. * This does not use this.pat, the cleaned up copy of the string * passed to applyPattern().
*/
UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
UBool escapeUnprintable) const
{
result.append(u'[');
int32_t i = 0;
int32_t limit = len & ~1; // = 2 * getRangeCount()
// If the set contains at least 2 intervals and includes both // MIN_VALUE and MAX_VALUE, then the inverse representation will // be more economical. // if (getRangeCount() >= 2 && // getRangeStart(0) == MIN_VALUE && // getRangeEnd(last) == MAX_VALUE) // Invariant: list[len-1] == HIGH == MAX_VALUE + 1 // If limit == len then len is even and the last range ends with MAX_VALUE. // // *But* do not write the inverse (complement) if there are strings. // Since ICU 70, the '^' performs a code point complement which removes all strings. if (len >= 4 && list[0] == 0 && limit == len && !hasStrings()) { // Emit the inverse
result.append(u'^'); // Offsetting the inversion list index by one lets us // iterate over the ranges of the set complement.
i = 1;
--limit;
}
// Emit the ranges as pairs. while (i < limit) {
UChar32 start = list[i]; // getRangeStart()
UChar32 end = list[i + 1] - 1; // getRangeEnd() = range limit minus one if (!(0xd800 <= end && end <= 0xdbff)) {
_appendToPat(result, start, end, escapeUnprintable);
i += 2;
} else { // The range ends with a lead surrogate. // Avoid writing what looks like a lead+trail surrogate pair. // 1. Postpone ranges that start with a lead surrogate code point.
int32_t firstLead = i; while ((i += 2) < limit && list[i] <= 0xdbff) {}
int32_t firstAfterLead = i; // 2. Write following ranges that start with a trail surrogate code point. while (i < limit && (start = list[i]) <= 0xdfff) {
_appendToPat(result, start, list[i + 1] - 1, escapeUnprintable);
i += 2;
} // 3. Now write the postponed ranges. for (int j = firstLead; j < firstAfterLead; j += 2) {
_appendToPat(result, list[j], list[j + 1] - 1, escapeUnprintable);
}
}
}
if (strings_ != nullptr) { for (int32_t i = 0; i<strings_->size(); ++i) {
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5
¤ Diese beiden folgenden Angebotsgruppen bietet das Unternehmen0.71Angebot
Wie Sie bei der Firma Beratungs- und Dienstleistungen beauftragen können
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.