/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "nsUnicharUtils.h"
#include "nsUnicodeProperties.h"
#include "nsUTF8Utils.h"
#include "mozilla/Likely.h"
#include "mozilla/HashFunctions.h"
#include "mozilla/intl/UnicodeProperties.h"
#include "mozilla/StaticPrefs_layout.h"
// ASCII lower-casing lookup table, indexed by a code unit in [0, 127].
// Every entry is the identity mapping except the capital letters
// 'A'..'Z' (0x41..0x5a), which map to 'a'..'z' (0x61..0x7a).
static const uint8_t gASCIIToLower[128] = {
    // 0x00 - 0x1f
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    // 0x20 - 0x3f
    0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
    0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
    0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
    // 0x40 '@', then 'A'..'Z' folded to 'a'..'z', then 0x5b - 0x5f
    0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
    // 0x60 - 0x7f (already lower case or not a letter)
    0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
};
// Always-inlined lower-casing helper, kept local to this file so that the
// case-insensitive comparators can call it per character cheaply.
// ASCII input is answered from gASCIIToLower; everything else is delegated
// to mozilla::intl::UnicodeProperties::ToLower.
static MOZ_ALWAYS_INLINE uint32_t ToLowerCase_inline(uint32_t aChar) {
  if (!IS_ASCII(aChar)) {
    return mozilla::intl::UnicodeProperties::ToLower(aChar);
  }
  return gASCIIToLower[aChar];
}
// Always-inlined ASCII-only lower-casing helper: ASCII values are mapped
// through gASCIIToLower, all other values are returned untouched.
static MOZ_ALWAYS_INLINE uint32_t ToLowerCaseASCII_inline(
    const uint32_t aChar) {
  return IS_ASCII(aChar) ? gASCIIToLower[aChar] : aChar;
}
// Lower-cases aString in place by running the buffer-based ToLowerCase with
// the string's writable buffer as both input and output.
void ToLowerCase(nsAString& aString) {
  char16_t* const chars = aString.BeginWriting();
  const auto count = aString.Length();
  ToLowerCase(chars, chars, count);
}
// Lower-cases only the ASCII letters of aString, in place, by running the
// buffer-based ToLowerCaseASCII with the same buffer as input and output.
void ToLowerCaseASCII(nsAString& aString) {
  char16_t* const chars = aString.BeginWriting();
  const auto count = aString.Length();
  ToLowerCaseASCII(chars, chars, count);
}
// Returns the lower-case form of aChar when it is an ASCII capital letter
// ('A'..'Z'); any other value is returned unchanged.
char ToLowerCaseASCII(char aChar) {
  const bool isUpper = 'A' <= aChar && aChar <= 'Z';
  // 'a' - 'A' == 0x20 in ASCII.
  return isUpper ? static_cast<char>(aChar + 0x20) : aChar;
}
// Returns the lower-case form of aChar when it is an ASCII capital letter
// ('A'..'Z'); any other UTF-16 code unit is returned unchanged.
char16_t ToLowerCaseASCII(char16_t aChar) {
  const bool isUpper = 'A' <= aChar && aChar <= 'Z';
  // 'a' - 'A' == 0x20 in ASCII.
  return isUpper ? static_cast<char16_t>(aChar + 0x20) : aChar;
}
// Returns the lower-case form of aChar when it is an ASCII capital letter
// ('A'..'Z'); any other code point is returned unchanged.
char32_t ToLowerCaseASCII(char32_t aChar) {
  const bool isUpper = 'A' <= aChar && aChar <= 'Z';
  // 'a' - 'A' == 0x20 in ASCII.
  return isUpper ? static_cast<char32_t>(aChar + 0x20) : aChar;
}
// Returns the upper-case form of aChar when it is an ASCII small letter
// ('a'..'z'); any other value is returned unchanged.
char ToUpperCaseASCII(char aChar) {
  const bool isLower = 'a' <= aChar && aChar <= 'z';
  // 'a' - 'A' == 0x20 in ASCII.
  return isLower ? static_cast<char>(aChar - 0x20) : aChar;
}
// Returns the upper-case form of aChar when it is an ASCII small letter
// ('a'..'z'); any other UTF-16 code unit is returned unchanged.
char16_t ToUpperCaseASCII(char16_t aChar) {
  const bool isLower = 'a' <= aChar && aChar <= 'z';
  // 'a' - 'A' == 0x20 in ASCII.
  return isLower ? static_cast<char16_t>(aChar - 0x20) : aChar;
}
// Returns the upper-case form of aChar when it is an ASCII small letter
// ('a'..'z'); any other code point is returned unchanged.
char32_t ToUpperCaseASCII(char32_t aChar) {
  const bool isLower = 'a' <= aChar && aChar <= 'z';
  // 'a' - 'A' == 0x20 in ASCII.
  return isLower ? static_cast<char32_t>(aChar - 0x20) : aChar;
}
void ToLowerCase(
const nsAString& aSource, nsAString& aDest) {
const char16_t* in = aSource.BeginReading();
size_t len = aSource.Length();
aDest.SetLength(len);
char16_t* out = aDest.BeginWriting();
ToLowerCase(in, out, len);
}
void ToLowerCaseASCII(
const nsAString& aSource, nsAString& aDest) {
const char16_t* in = aSource.BeginReading();
size_t len = aSource.Length();
aDest.SetLength(len);
char16_t* out = aDest.BeginWriting();
ToLowerCaseASCII(in, out, len);
}
// Out-of-line entry point for the ASCII-only lower-casing of one value;
// simply forwards to the file-local inlined helper.
uint32_t ToLowerCaseASCII(const uint32_t aChar) {
  const uint32_t lowered = ToLowerCaseASCII_inline(aChar);
  return lowered;
}
// Upper-cases aString in place by running the buffer-based ToUpperCase with
// the string's writable buffer as both input and output.
void ToUpperCase(nsAString& aString) {
  char16_t* const chars = aString.BeginWriting();
  const auto count = aString.Length();
  ToUpperCase(chars, chars, count);
}
void ToUpperCase(
const nsAString& aSource, nsAString& aDest) {
const char16_t* in = aSource.BeginReading();
size_t len = aSource.Length();
aDest.SetLength(len);
char16_t* out = aDest.BeginWriting();
ToUpperCase(in, out, len);
}
#ifdef MOZILLA_INTERNAL_API
// Returns the case-folded form of aChar. ASCII input is answered from the
// gASCIIToLower table; everything else goes through
// mozilla::unicode::GetFoldedcase.
uint32_t ToFoldedCase(uint32_t aChar) {
  if (!IS_ASCII(aChar)) {
    return mozilla::unicode::GetFoldedcase(aChar);
  }
  return gASCIIToLower[aChar];
}
// Case-folds aString in place by running the buffer-based ToFoldedCase with
// the string's writable buffer as both input and output.
void ToFoldedCase(nsAString& aString) {
  char16_t* const chars = aString.BeginWriting();
  const auto count = aString.Length();
  ToFoldedCase(chars, chars, count);
}
// Case-folds aLen UTF-16 code units from aIn into aOut.
//
// A valid surrogate pair is combined into one code point, folded with
// mozilla::unicode::GetFoldedcase and written back as two code units; any
// other code unit is folded on its own via ToFoldedCase(uint32_t).
// Both input units of a pair are read before either output unit is written.
void ToFoldedCase(const char16_t* aIn, char16_t* aOut, size_t aLen) {
  // The index is size_t, matching aLen and the sibling ToLowerCase /
  // ToUpperCase loops; a 32-bit index would wrap for lengths above
  // UINT32_MAX and the loop would never terminate.
  for (size_t i = 0; i < aLen; i++) {
    uint32_t ch = aIn[i];
    // i < aLen holds here, so aLen - 1 cannot underflow.
    if (i < aLen - 1 && NS_IS_SURROGATE_PAIR(ch, aIn[i + 1])) {
      ch = mozilla::unicode::GetFoldedcase(SURROGATE_TO_UCS4(ch, aIn[i + 1]));
      NS_ASSERTION(!IS_IN_BMP(ch), "case mapping crossed BMP/SMP boundary!");
      aOut[i++] = H_SURROGATE(ch);
      aOut[i] = L_SURROGATE(ch);
      continue;
    }
    aOut[i] = ToFoldedCase(ch);
  }
}
// Returns aChar mapped through mozilla::unicode::GetNaked. ASCII input is
// returned as-is without consulting that mapping.
uint32_t ToNaked(uint32_t aChar) {
  return IS_ASCII(aChar) ? aChar : mozilla::unicode::GetNaked(aChar);
}
// Rewrites aString in place: every code point for which
// mozilla::unicode::IsCombiningDiacritic is true is cut out of the string,
// and every other code point is replaced by its GetNaked / ToNaked mapping.
// Surrogate pairs are handled as a single code point.
void ToNaked(nsAString& aString) {
  // The position only advances when a code point is kept; after a Cut the
  // following text has shifted down to the same position.
  for (uint32_t pos = 0; pos < aString.Length();) {
    uint32_t cp = aString[pos];

    if (pos < aString.Length() - 1 &&
        NS_IS_SURROGATE_PAIR(cp, aString[pos + 1])) {
      cp = SURROGATE_TO_UCS4(cp, aString[pos + 1]);
      if (mozilla::unicode::IsCombiningDiacritic(cp)) {
        aString.Cut(pos, 2);
        continue;
      }
      cp = mozilla::unicode::GetNaked(cp);
      NS_ASSERTION(!IS_IN_BMP(cp), "stripping crossed BMP/SMP boundary!");
      aString.Replace(pos, 1, H_SURROGATE(cp));
      aString.Replace(pos + 1, 1, L_SURROGATE(cp));
      pos += 2;
      continue;
    }

    if (mozilla::unicode::IsCombiningDiacritic(cp)) {
      aString.Cut(pos, 1);
      continue;
    }
    aString.Replace(pos, 1, ToNaked(cp));
    ++pos;
  }
}
// Case-insensitive comparator for UTF-16 buffers. Buffers of different
// length are ordered by length alone (longer compares greater); buffers of
// equal length are compared with CaseInsensitiveCompare.
int32_t nsCaseInsensitiveStringComparator(const char16_t* lhs,
                                          const char16_t* rhs, size_t lLength,
                                          size_t rLength) {
  if (lLength == rLength) {
    return CaseInsensitiveCompare(lhs, rhs, lLength);
  }
  if (lLength > rLength) {
    return 1;
  }
  return -1;
}
// Case-insensitive comparator for UTF-8 buffers; forwards both buffers and
// their byte lengths to the UTF-8 overload of CaseInsensitiveCompare.
int32_t nsCaseInsensitiveUTF8StringComparator(const char* lhs, const char* rhs,
                                              size_t lLength, size_t rLength) {
  const int32_t order = CaseInsensitiveCompare(lhs, rhs, lLength, rLength);
  return order;
}
// Comparator for UTF-16 buffers that ignores case for ASCII letters only.
// Buffers of different length are ordered by length alone; otherwise the
// code units are compared pairwise after ASCII lower-casing.
int32_t nsASCIICaseInsensitiveStringComparator(const char16_t* lhs,
                                               const char16_t* rhs,
                                               size_t lLength, size_t rLength) {
  if (lLength != rLength) {
    return (lLength > rLength) ? 1 : -1;
  }

  // Surrogates need no special treatment: only the ASCII range is
  // lower-cased, so comparing code unit by code unit is sufficient.
  for (size_t remaining = rLength; remaining != 0; --remaining) {
    char16_t left = *lhs++;
    char16_t right = *rhs++;
    if (left == right) {
      continue;
    }
    left = ToLowerCaseASCII_inline(left);
    right = ToLowerCaseASCII_inline(right);
    if (left > right) {
      return 1;
    }
    if (right > left) {
      return -1;
    }
  }
  return 0;
}
#endif // MOZILLA_INTERNAL_API
// Out-of-line entry point for lower-casing a single value; simply forwards
// to the file-local inlined helper.
uint32_t ToLowerCase(uint32_t aChar) {
  const uint32_t lowered = ToLowerCase_inline(aChar);
  return lowered;
}
void ToLowerCase(
const char16_t* aIn, char16_t* aOut, size_t aLen) {
for (size_t i =
0; i < aLen; i++) {
uint32_t ch = aIn[i];
if (i < aLen -
1 && NS_IS_SURROGATE_PAIR(ch, aIn[i +
1])) {
ch = mozilla::intl::UnicodeProperties::ToLower(
SURROGATE_TO_UCS4(ch, aIn[i +
1]));
NS_ASSERTION(!IS_IN_BMP(ch),
"case mapping crossed BMP/SMP boundary!");
aOut[i++] = H_SURROGATE(ch);
aOut[i] = L_SURROGATE(ch);
continue;
}
aOut[i] = ToLowerCase(ch);
}
}
void ToLowerCaseASCII(
const char16_t* aIn, char16_t* aOut, size_t aLen) {
for (size_t i =
0; i < aLen; i++) {
char16_t ch = aIn[i];
aOut[i] = IS_ASCII_UPPER(ch) ? (ch +
0x20) : ch;
}
}
// Returns the upper-case form of aChar. ASCII input is handled inline
// (small letters shift down by 0x20, everything else is unchanged);
// non-ASCII input goes through mozilla::intl::UnicodeProperties::ToUpper.
uint32_t ToUpperCase(uint32_t aChar) {
  if (!IS_ASCII(aChar)) {
    return mozilla::intl::UnicodeProperties::ToUpper(aChar);
  }
  return IS_ASCII_LOWER(aChar) ? aChar - 0x20 : aChar;
}
void ToUpperCase(
const char16_t* aIn, char16_t* aOut, size_t aLen) {
for (size_t i =
0; i < aLen; i++) {
uint32_t ch = aIn[i];
if (i < aLen -
1 && NS_IS_SURROGATE_PAIR(ch, aIn[i +
1])) {
ch = mozilla::intl::UnicodeProperties::ToUpper(
SURROGATE_TO_UCS4(ch, aIn[i +
1]));
NS_ASSERTION(!IS_IN_BMP(ch),
"case mapping crossed BMP/SMP boundary!");
aOut[i++] = H_SURROGATE(ch);
aOut[i] = L_SURROGATE(ch);
continue;
}
aOut[i] = ToUpperCase(ch);
}
}
// Returns the title-case form of aChar. For ASCII input this is the same as
// ToUpperCase; other input goes through
// mozilla::unicode::GetTitlecaseForLower.
uint32_t ToTitleCase(uint32_t aChar) {
  if (!IS_ASCII(aChar)) {
    return mozilla::unicode::GetTitlecaseForLower(aChar);
  }
  return ToUpperCase(aChar);
}
// Case-insensitively compares the first len UTF-16 code units of a and b.
// Returns -1 or 1 at the first position whose lower-cased code points
// differ (ordered by code point value), or 0 when no difference is found.
int32_t CaseInsensitiveCompare(const char16_t* a, const char16_t* b,
                               size_t len) {
  NS_ASSERTION(a && b, "Do not pass in invalid pointers!");

  for (; len != 0; --len) {
    uint32_t left = *a++;
    uint32_t right = *b++;

    // Surrogates have to be resolved before testing for equality: two
    // strings can share a high surrogate yet encode different characters,
    // so equal lead units prove nothing.
    // Only `left` decides whether we look for a pair. If `right` turns out
    // not to be one, the values are guaranteed to mismatch below, so there
    // is no need to interpret it further.
    if (len > 1 && NS_IS_SURROGATE_PAIR(left, *a)) {
      left = SURROGATE_TO_UCS4(left, *a++);
      if (NS_IS_SURROGATE_PAIR(right, *b)) {
        right = SURROGATE_TO_UCS4(right, *b++);
      }
      // Account for the extra unit consumed from `a`. When `right` was not
      // a pair this under-counts `b`, which is harmless because the
      // mismatch makes us return before reading any further.
      --len;
    }

    if (left == right) {
      continue;
    }
    left = ToLowerCase_inline(left);
    right = ToLowerCase_inline(right);
    if (left != right) {
      return (left < right) ? -1 : 1;
    }
  }
  return 0;
}
// Always-inlined body of GetLowerUTF8Codepoint, kept local so the
// case-insensitive comparators can decode characters cheaply.
//
// Decodes the UTF-8 sequence that starts at aStr, returns its lower-cased
// code point and stores the address just past the sequence in *aNext.
// aEnd bounds the reads of continuation bytes; the lead byte at aStr is
// read unconditionally. If the lead byte is not recognised, or the sequence
// would run past aEnd, uint32_t(-1) is returned and *aNext is not written.
static MOZ_ALWAYS_INLINE uint32_t GetLowerUTF8Codepoint_inline(
    const char* aStr, const char* aEnd, const char** aNext) {
  // View the input as unsigned bytes so that widening them into larger
  // integers never sign-extends.
  const unsigned char* bytes = reinterpret_cast<const unsigned char*>(aStr);
  const unsigned char lead = bytes[0];

  // One byte: plain ASCII, lower-cased with the lookup table.
  if (UTF8traits::isASCII(lead)) {
    *aNext = aStr + 1;
    return gASCIIToLower[lead];
  }

  // Two bytes: 110XXXXX 10XXXXXX. Always inside the BMP, so 16 bits suffice.
  if (UTF8traits::is2byte(lead) && MOZ_LIKELY(aStr + 1 < aEnd)) {
    uint16_t cp =
        static_cast<uint16_t>(((lead & 0x1F) << 6) | (bytes[1] & 0x3F));
    // The value is known not to be ASCII, so skip ToLowerCase and its ASCII
    // fast path and ask for the Unicode mapping directly.
    cp = mozilla::intl::UnicodeProperties::ToLower(cp);
    *aNext = aStr + 2;
    return cp;
  }

  // Three bytes: 1110XXXX 10XXXXXX 10XXXXXX. Exactly fills 16 bits.
  if (UTF8traits::is3byte(lead) && MOZ_LIKELY(aStr + 2 < aEnd)) {
    uint16_t cp = static_cast<uint16_t>(
        ((lead & 0x0F) << 12) | ((bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F));
    cp = mozilla::intl::UnicodeProperties::ToLower(cp);
    *aNext = aStr + 3;
    return cp;
  }

  // Four bytes: 11110XXX 10XXXXXX 10XXXXXX 10XXXXXX.
  if (UTF8traits::is4byte(lead) && MOZ_LIKELY(aStr + 3 < aEnd)) {
    uint32_t cp = ((lead & 0x07) << 18) | ((bytes[1] & 0x3F) << 12) |
                  ((bytes[2] & 0x3F) << 6) | (bytes[3] & 0x3F);
    cp = mozilla::intl::UnicodeProperties::ToLower(cp);
    *aNext = aStr + 4;
    return cp;
  }

  // Unrecognised lead byte or truncated sequence.
  return uint32_t(-1);
}
// Out-of-line entry point for decoding and lower-casing one UTF-8 encoded
// character; forwards its arguments to the file-local inlined helper.
uint32_t GetLowerUTF8Codepoint(const char* aStr, const char* aEnd,
                               const char** aNext) {
  const uint32_t lowered = GetLowerUTF8Codepoint_inline(aStr, aEnd, aNext);
  return lowered;
}
// Case-insensitively compares two UTF-8 buffers of the given byte lengths.
// Characters are decoded and lower-cased one at a time; the first differing
// pair decides the result (1 if the left one is larger, otherwise -1). If a
// character on either side cannot be decoded, -1 is returned. When one
// buffer is a prefix of the other, the longer buffer compares greater.
int32_t CaseInsensitiveCompare(const char* aLeft, const char* aRight,
                               size_t aLeftBytes, size_t aRightBytes) {
  const char* const leftEnd = aLeft + aLeftBytes;
  const char* const rightEnd = aRight + aRightBytes;

  while (aLeft < leftEnd && aRight < rightEnd) {
    const uint32_t lowerLeft =
        GetLowerUTF8Codepoint_inline(aLeft, leftEnd, &aLeft);
    if (MOZ_UNLIKELY(lowerLeft == uint32_t(-1))) {
      return -1;
    }

    const uint32_t lowerRight =
        GetLowerUTF8Codepoint_inline(aRight, rightEnd, &aRight);
    if (MOZ_UNLIKELY(lowerRight == uint32_t(-1))) {
      return -1;
    }

    // Both values are already lower case, so they can be compared directly.
    if (lowerLeft != lowerRight) {
      return (lowerLeft > lowerRight) ? 1 : -1;
    }
  }

  // At least one side is exhausted; whichever still has bytes left is the
  // greater one.
  if (aLeft < leftEnd) {
    return 1;
  }
  return (aRight < rightEnd) ? -1 : 0;
}
// Decodes and lower-cases the UTF-8 character at aStr like the three-argument
// overload, but when aMatchDiacritics is false it keeps decoding until it
// finds a character for which mozilla::unicode::IsCombiningDiacritic is
// false, so that leading combining diacritics are skipped.
//
// Returns the lower-cased code point, or uint32_t(-1) if decoding failed.
// On success *aNext points just past the character that was returned.
//
// If the input consists only of combining diacritics up to aEnd, the last
// one decoded is returned (with *aNext == aEnd) rather than decoding again
// at aEnd: the three-argument overload reads its first byte unconditionally,
// so re-entering it there would read past the end of the buffer.
static MOZ_ALWAYS_INLINE uint32_t GetLowerUTF8Codepoint_inline(
    const char* aStr, const char* aEnd, const char** aNext,
    bool aMatchDiacritics) {
  uint32_t c;
  for (;;) {
    c = GetLowerUTF8Codepoint_inline(aStr, aEnd, aNext);
    if (aMatchDiacritics) {
      break;
    }
    // A failed decode does not write *aNext, so it must not be consulted.
    if (c == uint32_t(-1)) {
      break;
    }
    if (!mozilla::unicode::IsCombiningDiacritic(c)) {
      break;
    }
    // Nothing left to decode after this diacritic; stop before reading
    // beyond aEnd.
    if (*aNext >= aEnd) {
      break;
    }
    aStr = *aNext;
  }
  return c;
}
bool CaseInsensitiveUTF8CharsEqual(
const char* aLeft,
const char* aRight,
const char* aLeftEnd,
const char* aRightEnd,
const char** aLeftNext,
const char** aRightNext,
bool* aErr,
bool aMatchDiacritics) {
NS_ASSERTION(aLeftNext,
"Out pointer shouldn't be null.");
NS_ASSERTION(aRightNext,
"Out pointer shouldn't be null.");
NS_ASSERTION(aErr,
"Out pointer shouldn't be null.");
NS_ASSERTION(aLeft < aLeftEnd,
"aLeft must be less than aLeftEnd.");
NS_ASSERTION(aRight < aRightEnd,
"aRight must be less than aRightEnd.");
uint32_t leftChar = GetLowerUTF8Codepoint_inline(aLeft, aLeftEnd, aLeftNext,
aMatchDiacritics);
if (MOZ_UNLIKELY(leftChar == uint32_t(-
1))) {
*aErr =
true;
return false;
}
uint32_t rightChar = GetLowerUTF8Codepoint_inline(
aRight, aRightEnd, aRightNext, aMatchDiacritics);
if (MOZ_UNLIKELY(rightChar == uint32_t(-
1))) {
*aErr =
true;
return false;
}
// Can't have an error past this point.
*aErr =
false;
if (!aMatchDiacritics) {
leftChar = ToNaked(leftChar);
rightChar = ToNaked(rightChar);
}
return leftChar == rightChar;
}
namespace mozilla {
uint32_t HashUTF8AsUTF16(
const char* aUTF8, size_t aLength,
bool* aErr) {
uint32_t hash =
0;
const char* s = aUTF8;
const char* end = aUTF8 + aLength;
*aErr =
false;
while (s < end) {
uint32_t ucs4 = UTF8CharEnumerator::NextChar(&s, end, aErr);
if (*aErr) {
return 0;
}
if (ucs4 < PLANE1_BASE) {
hash = AddToHash(hash, ucs4);
}
else {
hash = AddToHash(hash, H_SURROGATE(ucs4), L_SURROGATE(ucs4));
}
}
return hash;
}
// The Korean Won currency sign has East Asian Width = HALFWIDTH, and
// Script = COMMON (rather than HANGUL), but we don't want to treat it like
// Chinese/Japanese half-width characters for segment break transformation,
// so we exclude it individually in the two functions here.
static constexpr uint32_t kWonCurrencySign =
0x20A9;
bool IsSegmentBreakSkipChar(uint32_t u) {
return intl::UnicodeProperties::IsEastAsianWidthFHWexcludingEmoji(u) &&
intl::UnicodeProperties::GetScriptCode(u) != intl::Script::HANGUL &&
u != kWonCurrencySign;
}
bool IsEastAsianPunctuation(uint32_t u) {
// U+FF5E FULLWIDTH TILDE has General Category = Symbol (not Punctuation),
// but is used similarly to U+301C WAVE DASH (which does have category
// Punctuation). So we treat FULLWIDTH TILDE as punctuation here to give the
// two characters consistent behavior.
constexpr uint32_t kFullwidthTilde =
0xFF5E;
return intl::UnicodeProperties::IsEastAsianWidthFHW(u) &&
((intl::UnicodeProperties::IsPunctuation(u) &&
u != kWonCurrencySign) ||
u == kFullwidthTilde);
}
bool IsPunctuationForWordSelect(char16_t aCh) {
const uint8_t cat = unicode::GetGeneralCategory(aCh);
switch (cat) {
case HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION:
/* Pc */
if (aCh ==
'_' && !StaticPrefs::layout_word_select_stop_at_underscore()) {
return false;
}
[[fallthrough]];
case HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION:
/* Pd */
case HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION:
/* Pe */
case HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION:
/* Pf */
case HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION:
/* Pi */
case HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION:
/* Po */
case HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION:
/* Ps */
case HB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL:
/* Sc */
// Deliberately omitted:
// case HB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL: /* Sk */
case HB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL:
/* Sm */
case HB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL:
/* So */
return true;
default:
return false;
}
}
}
// namespace mozilla