Quelle nsUnicharUtils.cpp

Sprache: C++

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "nsUnicharUtils.h"
#include "nsUnicodeProperties.h"
#include "nsUTF8Utils.h"
#include "mozilla/Likely.h"
#include "mozilla/HashFunctions.h"
#include "mozilla/intl/UnicodeProperties.h"
#include "mozilla/StaticPrefs_layout.h"

// ASCII lower-casing lookup table, indexed by code point (0x00..0x7f).
// Every entry equals its own index, except for the capital letters
// 'A'..'Z' (0x41..0x5a), which hold 'a'..'z' (0x61..0x7a).
static const uint8_t gASCIIToLower[128] = {
    // 0x00 - 0x1f
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    // 0x20 - 0x3f
    0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
    0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
    0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
    // 0x40 - 0x5f: '@', then 'A'..'Z' mapped to 'a'..'z', then "[\]^_"
    0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
    // 0x60 - 0x7f
    0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
};

// Force-inlined lower-casing of one code point, kept local to this file so
// the case-insensitive comparators stay fast. ASCII input is resolved via
// the gASCIIToLower table; everything else is delegated to
// mozilla::intl::UnicodeProperties::ToLower.
static MOZ_ALWAYS_INLINE uint32_t ToLowerCase_inline(uint32_t aChar) {
  if (!IS_ASCII(aChar)) {
    return mozilla::intl::UnicodeProperties::ToLower(aChar);
  }
  return gASCIIToLower[aChar];
}

// Force-inlined ASCII-only lower-casing of one code point: ASCII values go
// through the gASCIIToLower table, every non-ASCII value is returned as is.
static MOZ_ALWAYS_INLINE uint32_t
ToLowerCaseASCII_inline(const uint32_t aChar) {
  return IS_ASCII(aChar) ? gASCIIToLower[aChar] : aChar;
}

// Lower-cases aString in place: the string's writable buffer is passed to the
// buffer-based ToLowerCase as both source and destination.
void ToLowerCase(nsAString& aString) {
  char16_t* const chars = aString.BeginWriting();
  const size_t count = aString.Length();
  ToLowerCase(chars, chars, count);
}

// ASCII-only in-place lower-casing of aString: the string's writable buffer is
// passed to the buffer-based ToLowerCaseASCII as both source and destination.
void ToLowerCaseASCII(nsAString& aString) {
  char16_t* const chars = aString.BeginWriting();
  const size_t count = aString.Length();
  ToLowerCaseASCII(chars, chars, count);
}

// Returns the lower-case form of aChar when it is an ASCII capital letter
// ('A'..'Z'); any other value is returned unchanged.
char ToLowerCaseASCII(char aChar) {
  const bool isCapital = 'A' <= aChar && aChar <= 'Z';
  return isCapital ? static_cast<char>(aChar + 0x20) : aChar;
}

// Returns the lower-case form of the UTF-16 code unit aChar when it is an
// ASCII capital letter ('A'..'Z'); any other value is returned unchanged.
char16_t ToLowerCaseASCII(char16_t aChar) {
  const bool isCapital = 'A' <= aChar && aChar <= 'Z';
  return isCapital ? static_cast<char16_t>(aChar + 0x20) : aChar;
}

// Returns the lower-case form of the code point aChar when it is an ASCII
// capital letter ('A'..'Z'); any other value is returned unchanged.
char32_t ToLowerCaseASCII(char32_t aChar) {
  const bool isCapital = 'A' <= aChar && aChar <= 'Z';
  return isCapital ? static_cast<char32_t>(aChar + 0x20) : aChar;
}

// Returns the upper-case form of aChar when it is an ASCII small letter
// ('a'..'z'); any other value is returned unchanged.
char ToUpperCaseASCII(char aChar) {
  const bool isSmall = 'a' <= aChar && aChar <= 'z';
  return isSmall ? static_cast<char>(aChar - 0x20) : aChar;
}

// Returns the upper-case form of the UTF-16 code unit aChar when it is an
// ASCII small letter ('a'..'z'); any other value is returned unchanged.
char16_t ToUpperCaseASCII(char16_t aChar) {
  const bool isSmall = 'a' <= aChar && aChar <= 'z';
  return isSmall ? static_cast<char16_t>(aChar - 0x20) : aChar;
}

// Returns the upper-case form of the code point aChar when it is an ASCII
// small letter ('a'..'z'); any other value is returned unchanged.
char32_t ToUpperCaseASCII(char32_t aChar) {
  const bool isSmall = 'a' <= aChar && aChar <= 'z';
  return isSmall ? static_cast<char32_t>(aChar - 0x20) : aChar;
}

// Writes the lower-cased form of aSource into aDest. aDest is resized to the
// length of aSource before the buffer-based ToLowerCase fills it.
void ToLowerCase(const nsAString& aSource, nsAString& aDest) {
  // The source pointer and length are captured before aDest is touched.
  const size_t count = aSource.Length();
  const char16_t* src = aSource.BeginReading();

  aDest.SetLength(count);
  ToLowerCase(src, aDest.BeginWriting(), count);
}

// Writes the ASCII-only lower-cased form of aSource into aDest. aDest is
// resized to the length of aSource before the buffer-based ToLowerCaseASCII
// fills it.
void ToLowerCaseASCII(const nsAString& aSource, nsAString& aDest) {
  // The source pointer and length are captured before aDest is touched.
  const size_t count = aSource.Length();
  const char16_t* src = aSource.BeginReading();

  aDest.SetLength(count);
  ToLowerCaseASCII(src, aDest.BeginWriting(), count);
}

// Out-of-line entry point for ASCII-only lower-casing of a single code point.
// Forwards to ToLowerCaseASCII_inline, so ASCII capitals are lower-cased and
// every other value is returned unchanged.
uint32_t ToLowerCaseASCII(const uint32_t aChar) {
  return ToLowerCaseASCII_inline(aChar);
}

// Upper-cases aString in place: the string's writable buffer is passed to the
// buffer-based ToUpperCase as both source and destination.
void ToUpperCase(nsAString& aString) {
  char16_t* const chars = aString.BeginWriting();
  const size_t count = aString.Length();
  ToUpperCase(chars, chars, count);
}

// Writes the upper-cased form of aSource into aDest. aDest is resized to the
// length of aSource before the buffer-based ToUpperCase fills it.
void ToUpperCase(const nsAString& aSource, nsAString& aDest) {
  // The source pointer and length are captured before aDest is touched.
  const size_t count = aSource.Length();
  const char16_t* src = aSource.BeginReading();

  aDest.SetLength(count);
  ToUpperCase(src, aDest.BeginWriting(), count);
}

#ifdef MOZILLA_INTERNAL_API

// Case-folds a single code point. ASCII values are resolved through the
// gASCIIToLower table; everything else goes to
// mozilla::unicode::GetFoldedcase.
uint32_t ToFoldedCase(uint32_t aChar) {
  if (!IS_ASCII(aChar)) {
    return mozilla::unicode::GetFoldedcase(aChar);
  }
  return gASCIIToLower[aChar];
}

// Case-folds aString in place: the string's writable buffer is passed to the
// buffer-based ToFoldedCase as both source and destination.
void ToFoldedCase(nsAString& aString) {
  char16_t* const chars = aString.BeginWriting();
  const size_t count = aString.Length();
  ToFoldedCase(chars, chars, count);
}

// Case-folds aLen UTF-16 code units from aIn into aOut.
//
// A high surrogate followed by a low surrogate is combined into one code
// point, folded with mozilla::unicode::GetFoldedcase, and written back as a
// surrogate pair; every other code unit is folded individually with
// ToFoldedCase(uint32_t). aIn and aOut may be the same buffer (the nsAString
// overload calls this with both pointing at the string's own storage).
void ToFoldedCase(const char16_t* aIn, char16_t* aOut, size_t aLen) {
  // The index has the same type as aLen so it cannot wrap before reaching
  // the end of a very long buffer (and matches ToLowerCase / ToUpperCase).
  for (size_t i = 0; i < aLen; i++) {
    uint32_t ch = aIn[i];
    // aLen >= 1 inside the loop, so aLen - 1 cannot underflow.
    if (i < aLen - 1 && NS_IS_SURROGATE_PAIR(ch, aIn[i + 1])) {
      ch = mozilla::unicode::GetFoldedcase(SURROGATE_TO_UCS4(ch, aIn[i + 1]));
      NS_ASSERTION(!IS_IN_BMP(ch), "case mapping crossed BMP/SMP boundary!");
      // Both input units were consumed above, so writing two output units is
      // safe even when aOut aliases aIn.
      aOut[i++] = H_SURROGATE(ch);
      aOut[i] = L_SURROGATE(ch);
      continue;
    }
    aOut[i] = ToFoldedCase(ch);
  }
}

// Maps a single code point through mozilla::unicode::GetNaked. ASCII values
// are returned unchanged without consulting it.
uint32_t ToNaked(uint32_t aChar) {
  return IS_ASCII(aChar) ? aChar : mozilla::unicode::GetNaked(aChar);
}

// Rewrites aString in place: code points for which
// mozilla::unicode::IsCombiningDiacritic is true are removed, and every other
// code point is replaced by its GetNaked / ToNaked mapping. Surrogate pairs
// are handled as a single code point.
void ToNaked(nsAString& aString) {
  uint32_t i = 0;
  // Length() is re-read on every iteration because Cut() shrinks the string.
  while (i < aString.Length()) {
    uint32_t ch = aString[i];
    if (i < aString.Length() - 1 && NS_IS_SURROGATE_PAIR(ch, aString[i + 1])) {
      ch = SURROGATE_TO_UCS4(ch, aString[i + 1]);
      if (mozilla::unicode::IsCombiningDiacritic(ch)) {
        // Drop both code units; i is left alone so it now indexes whatever
        // followed the removed pair.
        aString.Cut(i, 2);
      } else {
        ch = mozilla::unicode::GetNaked(ch);
        NS_ASSERTION(!IS_IN_BMP(ch), "stripping crossed BMP/SMP boundary!");
        // Overwrite the pair unit by unit, advancing past each one.
        aString.Replace(i++, 1, H_SURROGATE(ch));
        aString.Replace(i++, 1, L_SURROGATE(ch));
      }
      continue;
    }
    if (mozilla::unicode::IsCombiningDiacritic(ch)) {
      // Drop the single code unit without advancing i.
      aString.Cut(i, 1);
    } else {
      aString.Replace(i++, 1, ToNaked(ch));
    }
  }
}

// Case-insensitive comparator for UTF-16 buffers. Buffers of different length
// are ordered by length alone (longer compares greater); equal-length buffers
// are compared with CaseInsensitiveCompare.
int32_t nsCaseInsensitiveStringComparator(const char16_t* lhs,
                                          const char16_t* rhs, size_t lLength,
                                          size_t rLength) {
  if (lLength != rLength) {
    return lLength > rLength ? 1 : -1;
  }
  return CaseInsensitiveCompare(lhs, rhs, lLength);
}

// Case-insensitive comparator for UTF-8 buffers. lLength and rLength are
// passed through as the byte counts of lhs and rhs; the result is whatever the
// UTF-8 overload of CaseInsensitiveCompare returns.
int32_t nsCaseInsensitiveUTF8StringComparator(const char* lhs, const char* rhs,
                                              size_t lLength, size_t rLength) {
  return CaseInsensitiveCompare(lhs, rhs, lLength, rLength);
}

// ASCII-case-insensitive comparator for UTF-16 buffers. Buffers of different
// length are ordered by length alone (longer compares greater). Equal-length
// buffers are compared code unit by code unit after ASCII lower-casing, and
// the first difference decides the result.
int32_t nsASCIICaseInsensitiveStringComparator(const char16_t* lhs,
                                               const char16_t* rhs,
                                               size_t lLength, size_t rLength) {
  if (lLength != rLength) {
    return lLength > rLength ? 1 : -1;
  }

  // Surrogates need no special handling: only the ASCII range is lower-cased,
  // so each code unit can be treated independently.
  for (size_t idx = 0; idx < rLength; ++idx) {
    if (lhs[idx] == rhs[idx]) {
      continue;
    }

    const char16_t left = ToLowerCaseASCII_inline(lhs[idx]);
    const char16_t right = ToLowerCaseASCII_inline(rhs[idx]);
    if (left != right) {
      return left > right ? 1 : -1;
    }
  }

  return 0;
}

#endif  // MOZILLA_INTERNAL_API

// Out-of-line entry point for lower-casing a single code point; forwards to
// ToLowerCase_inline.
uint32_t ToLowerCase(uint32_t aChar) { return ToLowerCase_inline(aChar); }

// Lower-cases aLen UTF-16 code units from aIn into aOut. A high surrogate
// followed by a low surrogate is lower-cased as one code point and written
// back as a pair; every other code unit is lower-cased on its own.
void ToLowerCase(const char16_t* aIn, char16_t* aOut, size_t aLen) {
  size_t pos = 0;
  while (pos < aLen) {
    const uint32_t unit = aIn[pos];
    const bool pairStartsHere =
        pos < aLen - 1 && NS_IS_SURROGATE_PAIR(unit, aIn[pos + 1]);

    if (!pairStartsHere) {
      aOut[pos] = ToLowerCase(unit);
      ++pos;
      continue;
    }

    const uint32_t lowered = mozilla::intl::UnicodeProperties::ToLower(
        SURROGATE_TO_UCS4(unit, aIn[pos + 1]));
    NS_ASSERTION(!IS_IN_BMP(lowered), "case mapping crossed BMP/SMP boundary!");
    aOut[pos] = H_SURROGATE(lowered);
    aOut[pos + 1] = L_SURROGATE(lowered);
    pos += 2;
  }
}

// Copies aLen UTF-16 code units from aIn to aOut, adding 0x20 to each unit
// for which IS_ASCII_UPPER holds and leaving every other unit unchanged.
void ToLowerCaseASCII(const char16_t* aIn, char16_t* aOut, size_t aLen) {
  const char16_t* const stop = aIn + aLen;
  while (aIn != stop) {
    const char16_t unit = *aIn++;
    if (IS_ASCII_UPPER(unit)) {
      *aOut++ = unit + 0x20;
    } else {
      *aOut++ = unit;
    }
  }
}

// Upper-cases a single code point. Non-ASCII input is delegated to
// mozilla::intl::UnicodeProperties::ToUpper; ASCII small letters are shifted
// down by 0x20; all other ASCII values are returned unchanged.
uint32_t ToUpperCase(uint32_t aChar) {
  if (!IS_ASCII(aChar)) {
    return mozilla::intl::UnicodeProperties::ToUpper(aChar);
  }
  return IS_ASCII_LOWER(aChar) ? aChar - 0x20 : aChar;
}

// Upper-cases aLen UTF-16 code units from aIn into aOut. A high surrogate
// followed by a low surrogate is upper-cased as one code point and written
// back as a pair; every other code unit is upper-cased on its own.
void ToUpperCase(const char16_t* aIn, char16_t* aOut, size_t aLen) {
  size_t pos = 0;
  while (pos < aLen) {
    const uint32_t unit = aIn[pos];
    const bool pairStartsHere =
        pos < aLen - 1 && NS_IS_SURROGATE_PAIR(unit, aIn[pos + 1]);

    if (!pairStartsHere) {
      aOut[pos] = ToUpperCase(unit);
      ++pos;
      continue;
    }

    const uint32_t raised = mozilla::intl::UnicodeProperties::ToUpper(
        SURROGATE_TO_UCS4(unit, aIn[pos + 1]));
    NS_ASSERTION(!IS_IN_BMP(raised), "case mapping crossed BMP/SMP boundary!");
    aOut[pos] = H_SURROGATE(raised);
    aOut[pos + 1] = L_SURROGATE(raised);
    pos += 2;
  }
}

// Title-cases a single code point. ASCII input is simply upper-cased via
// ToUpperCase; anything else is looked up with
// mozilla::unicode::GetTitlecaseForLower.
uint32_t ToTitleCase(uint32_t aChar) {
  return IS_ASCII(aChar) ? ToUpperCase(aChar)
                         : mozilla::unicode::GetTitlecaseForLower(aChar);
}

// Case-insensitively compares the first len UTF-16 code units of a and b.
// Returns 0 when every compared character matches after lower-casing,
// otherwise -1 or 1 according to how the first differing pair of lower-cased
// characters orders. Both pointers must be non-null.
int32_t CaseInsensitiveCompare(const char16_t* a, const char16_t* b,
                               size_t len) {
  NS_ASSERTION(a && b, "Do not pass in invalid pointers!");

  if (len) {
    do {
      uint32_t c1 = *a++;
      uint32_t c2 = *b++;

      // Unfortunately, we need to check for surrogates BEFORE we check
      // for equality, because we could have identical high surrogates
      // but non-identical characters, so we can't just skip them

      // If c1 isn't a surrogate, we don't bother to check c2;
      // in the case where it _is_ a surrogate, we're definitely going to get
      // a mismatch, and don't need to interpret and lowercase it

      // len > 1 guarantees that a second code unit is available to read.
      if (len > 1 && NS_IS_SURROGATE_PAIR(c1, *a)) {
        c1 = SURROGATE_TO_UCS4(c1, *a++);
        if (NS_IS_SURROGATE_PAIR(c2, *b)) {
          c2 = SURROGATE_TO_UCS4(c2, *b++);
        }
        // If c2 wasn't a surrogate, decrementing len means we'd stop
        // short of the end of string b, but that doesn't actually matter
        // because we're going to find a mismatch and return early
        --len;
      }

      // Only pay for lower-casing when the raw values differ.
      if (c1 != c2) {
        c1 = ToLowerCase_inline(c1);
        c2 = ToLowerCase_inline(c2);
        if (c1 != c2) {
          if (c1 < c2) {
            return -1;
          }
          return 1;
        }
      }
    } while (--len != 0);
  }
  return 0;
}

// Force-inlined body of GetLowerUTF8Codepoint, kept local so the
// case-insensitive comparators stay fast.
//
// Decodes the UTF-8 sequence starting at aStr and returns its lower-cased
// code point, storing the address of the following byte in *aNext. When the
// lead byte is not recognised, or the sequence would not fit before aEnd,
// uint32_t(-1) is returned and *aNext is left untouched.
static MOZ_ALWAYS_INLINE uint32_t GetLowerUTF8Codepoint_inline(
    const char* aStr, const char* aEnd, const char** aNext) {
  // Work on unsigned bytes so that widening them never sign-extends.
  const unsigned char* bytes = reinterpret_cast<const unsigned char*>(aStr);
  const unsigned char lead = bytes[0];

  if (UTF8traits::isASCII(lead)) {
    // Single byte: a table lookup does the lower-casing.
    *aNext = aStr + 1;
    return gASCIIToLower[lead];
  }

  if (UTF8traits::is2byte(lead) && MOZ_LIKELY(aStr + 1 < aEnd)) {
    // 110XXXXX 10XXXXXX -- always inside the BMP, so 16 bits suffice.
    const uint16_t decoded =
        static_cast<uint16_t>(((lead & 0x1F) << 6) | (bytes[1] & 0x3F));

    // Known to be non-ASCII, so the ASCII fast path of ToLowerCase would be
    // wasted; call the Unicode mapping directly.
    const uint16_t lowered =
        mozilla::intl::UnicodeProperties::ToLower(decoded);

    *aNext = aStr + 2;
    return lowered;
  }

  if (UTF8traits::is3byte(lead) && MOZ_LIKELY(aStr + 2 < aEnd)) {
    // 1110XXXX 10XXXXXX 10XXXXXX -- exactly fills 16 bits.
    const uint16_t decoded = static_cast<uint16_t>(
        ((lead & 0x0F) << 12) | ((bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F));

    const uint16_t lowered =
        mozilla::intl::UnicodeProperties::ToLower(decoded);

    *aNext = aStr + 3;
    return lowered;
  }

  if (UTF8traits::is4byte(lead) && MOZ_LIKELY(aStr + 3 < aEnd)) {
    // 11110XXX 10XXXXXX 10XXXXXX 10XXXXXX
    const uint32_t decoded = ((lead & 0x07) << 18) |
                             ((bytes[1] & 0x3F) << 12) |
                             ((bytes[2] & 0x3F) << 6) | (bytes[3] & 0x3F);

    const uint32_t lowered =
        mozilla::intl::UnicodeProperties::ToLower(decoded);

    *aNext = aStr + 4;
    return lowered;
  }

  // Unrecognised lead byte or truncated sequence.
  return uint32_t(-1);
}

// Out-of-line entry point for decoding and lower-casing one UTF-8 sequence;
// forwards to GetLowerUTF8Codepoint_inline. Returns uint32_t(-1) when that
// helper does not recognise the sequence.
uint32_t GetLowerUTF8Codepoint(const char* aStr, const char* aEnd,
                               const char** aNext) {
  return GetLowerUTF8Codepoint_inline(aStr, aEnd, aNext);
}

// Case-insensitively compares two UTF-8 buffers of aLeftBytes and aRightBytes
// bytes. Code points are decoded and lower-cased one at a time; the first
// difference decides the result (1 if the left one is greater, -1 otherwise).
// If one buffer is a prefix of the other, the longer one compares greater.
// A sequence that cannot be decoded on either side yields -1.
int32_t CaseInsensitiveCompare(const char* aLeft, const char* aRight,
                               size_t aLeftBytes, size_t aRightBytes) {
  const char* const leftStop = aLeft + aLeftBytes;
  const char* const rightStop = aRight + aRightBytes;

  while (aLeft < leftStop && aRight < rightStop) {
    const uint32_t lowerLeft =
        GetLowerUTF8Codepoint_inline(aLeft, leftStop, &aLeft);
    if (MOZ_UNLIKELY(lowerLeft == uint32_t(-1))) {
      return -1;
    }

    const uint32_t lowerRight =
        GetLowerUTF8Codepoint_inline(aRight, rightStop, &aRight);
    if (MOZ_UNLIKELY(lowerRight == uint32_t(-1))) {
      return -1;
    }

    // Both values are already lower-cased, so compare them directly.
    if (lowerLeft != lowerRight) {
      return lowerLeft > lowerRight ? 1 : -1;
    }
  }

  // One side ran out; whichever still has bytes left is the greater one.
  if (aLeft < leftStop) {
    return 1;
  }
  return aRight < rightStop ? -1 : 0;
}

// Variant of GetLowerUTF8Codepoint_inline that can skip combining diacritics.
//
// With aMatchDiacritics == true this is exactly the three-argument decoder.
// With aMatchDiacritics == false, code points for which
// mozilla::unicode::IsCombiningDiacritic is true are skipped and the first
// code point that is not one is returned, with *aNext pointing just past it.
//
// Decoding never continues at or beyond aEnd: if the input runs out while
// skipping, the last diacritic decoded is returned and *aNext is left
// pointing at the end of the input. A decoding failure (uint32_t(-1)) is
// returned immediately; *aNext is not meaningful in that case.
static MOZ_ALWAYS_INLINE uint32_t
GetLowerUTF8Codepoint_inline(const char* aStr, const char* aEnd,
                             const char** aNext, bool aMatchDiacritics) {
  uint32_t c;
  for (;;) {
    c = GetLowerUTF8Codepoint_inline(aStr, aEnd, aNext);
    if (aMatchDiacritics || MOZ_UNLIKELY(c == uint32_t(-1))) {
      // Either no skipping was requested, or decoding failed and *aNext was
      // not written by the decoder.
      break;
    }
    if (!mozilla::unicode::IsCombiningDiacritic(c)) {
      break;
    }
    // c is a diacritic, so decoding succeeded and *aNext is valid. Stop here
    // if it was the last code point: the decoder reads its first byte
    // unconditionally, so calling it again would read past aEnd.
    if (*aNext >= aEnd) {
      break;
    }
    aStr = *aNext;
  }
  return c;
}

// Decodes one lower-cased code point from each of the two UTF-8 ranges and
// reports whether they are equal. *aLeftNext / *aRightNext receive the
// positions following the decoded code points. When aMatchDiacritics is
// false, both code points are additionally mapped through ToNaked before
// being compared.
//
// If either side fails to decode, *aErr is set to true and false is returned
// (the right side is not decoded when the left side already failed);
// otherwise *aErr is set to false.
bool CaseInsensitiveUTF8CharsEqual(const char* aLeft, const char* aRight,
                                   const char* aLeftEnd, const char* aRightEnd,
                                   const char** aLeftNext,
                                   const char** aRightNext, bool* aErr,
                                   bool aMatchDiacritics) {
  NS_ASSERTION(aLeftNext, "Out pointer shouldn't be null.");
  NS_ASSERTION(aRightNext, "Out pointer shouldn't be null.");
  NS_ASSERTION(aErr, "Out pointer shouldn't be null.");
  NS_ASSERTION(aLeft < aLeftEnd, "aLeft must be less than aLeftEnd.");
  NS_ASSERTION(aRight < aRightEnd, "aRight must be less than aRightEnd.");

  const uint32_t lowerLeft = GetLowerUTF8Codepoint_inline(
      aLeft, aLeftEnd, aLeftNext, aMatchDiacritics);
  if (MOZ_UNLIKELY(lowerLeft == uint32_t(-1))) {
    *aErr = true;
    return false;
  }

  const uint32_t lowerRight = GetLowerUTF8Codepoint_inline(
      aRight, aRightEnd, aRightNext, aMatchDiacritics);
  if (MOZ_UNLIKELY(lowerRight == uint32_t(-1))) {
    *aErr = true;
    return false;
  }

  // Both sides decoded; nothing below can fail.
  *aErr = false;

  if (aMatchDiacritics) {
    return lowerLeft == lowerRight;
  }

  const uint32_t nakedLeft = ToNaked(lowerLeft);
  const uint32_t nakedRight = ToNaked(lowerRight);
  return nakedLeft == nakedRight;
}

namespace mozilla {

// Hashes aLength bytes of UTF-8 at aUTF8 by feeding UTF-16 code units into
// AddToHash: a code point below PLANE1_BASE contributes one unit, any other
// contributes its high and low surrogate. *aErr is cleared on entry; if
// UTF8CharEnumerator::NextChar reports an error, 0 is returned with *aErr
// still set.
uint32_t HashUTF8AsUTF16(const char* aUTF8, size_t aLength, bool* aErr) {
  *aErr = false;

  uint32_t result = 0;
  const char* const stop = aUTF8 + aLength;

  for (const char* cursor = aUTF8; cursor < stop;) {
    // NextChar advances cursor past the sequence it decodes.
    const uint32_t codePoint =
        UTF8CharEnumerator::NextChar(&cursor, stop, aErr);
    if (*aErr) {
      return 0;
    }

    if (codePoint >= PLANE1_BASE) {
      result = AddToHash(result, H_SURROGATE(codePoint),
                         L_SURROGATE(codePoint));
    } else {
      result = AddToHash(result, codePoint);
    }
  }

  return result;
}

// U+20A9, the Korean Won currency sign. It has East Asian Width = HALFWIDTH
// and Script = COMMON (rather than HANGUL), but we don't want to treat it
// like Chinese/Japanese half-width characters for segment break
// transformation, so IsSegmentBreakSkipChar and IsEastAsianPunctuation each
// exclude it explicitly.
static constexpr uint32_t kWonCurrencySign = 0x20A9;

// True when u passes IsEastAsianWidthFHWexcludingEmoji, its script is not
// HANGUL, and it is not the Won currency sign.
bool IsSegmentBreakSkipChar(uint32_t u) {
  if (!intl::UnicodeProperties::IsEastAsianWidthFHWexcludingEmoji(u)) {
    return false;
  }
  if (intl::UnicodeProperties::GetScriptCode(u) == intl::Script::HANGUL) {
    return false;
  }
  return u != kWonCurrencySign;
}

// True when u passes IsEastAsianWidthFHW and is either punctuation other than
// the Won currency sign, or U+FF5E FULLWIDTH TILDE.
bool IsEastAsianPunctuation(uint32_t u) {
  // U+FF5E FULLWIDTH TILDE has General Category = Symbol (not Punctuation),
  // but is used similarly to U+301C WAVE DASH (which does have category
  // Punctuation). Treating it as punctuation here keeps the behavior of the
  // two characters consistent.
  constexpr uint32_t kFullwidthTilde = 0xFF5E;

  if (!intl::UnicodeProperties::IsEastAsianWidthFHW(u)) {
    return false;
  }
  if (intl::UnicodeProperties::IsPunctuation(u) && u != kWonCurrencySign) {
    return true;
  }
  return u == kFullwidthTilde;
}

// Classifies aCh by its Unicode general category. Returns true for the
// punctuation categories (Pc, Pd, Pe, Pf, Pi, Po, Ps) and for the currency,
// math and other symbol categories (Sc, Sm, So). The one exception is '_',
// which only counts when the layout.word_select.stop_at_underscore pref
// accessor returns true. Everything else returns false.
bool IsPunctuationForWordSelect(char16_t aCh) {
  const uint8_t category = unicode::GetGeneralCategory(aCh);
  switch (category) {
    case HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION: /* Pc */
      // The pref is consulted for the underscore only.
      return aCh != '_' || StaticPrefs::layout_word_select_stop_at_underscore();

    case HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION:    /* Pd */
    case HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION:   /* Pe */
    case HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION:   /* Pf */
    case HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION: /* Pi */
    case HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION:   /* Po */
    case HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION:    /* Ps */
    case HB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL:     /* Sc */
    // Deliberately omitted:
    // case HB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL:     /* Sk */
    case HB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL:  /* Sm */
    case HB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL: /* So */
      return true;

    default:
      return false;
  }
}

}  // namespace mozilla

Messung V0.5 in Prozent

¤ Dauer der Verarbeitung: 0.13 Sekunden (vorverarbeitet am 2026-06-05) ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereitgestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.