/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include <stdio.h>
#include "gtest/gtest.h"
#include "mozilla/intl/LineBreaker.h"
#include "mozilla/intl/WordBreaker.h"
#include "mozilla/Preferences.h"
#include "mozilla/Span.h"
#include "nsISupports.h"
#include "nsServiceManagerUtils.h"
#include "nsString.h"
#include "nsTArray.h"
#include "nsXPCOM.h"
using mozilla::intl::LineBreaker;
using mozilla::intl::WordBreaker;
// Turn off clang-format to align the ruler comments to the test strings.
// clang-format off
static char teng0[] =
// 1 2 3 4 5 6 7
// 01234567890123456789012345678901234567890123456789012345678901234567890123456789
"hello world";
// clang-format on
static uint32_t lexp0[] = {
5,
11};
static uint32_t wexp0[] = {
5,
6,
11};
// clang-format off
static char teng1[] =
// 1 2 3 4 5 6 7
// 01234567890123456789012345678901234567890123456789012345678901234567890123456789
"This is a test to test(reasonable) line break. This 0.01123 = 45 x 48.";
// clang-format on
static uint32_t lexp1[] = {
4,
7,
9,
14,
17,
34,
39,
40,
41,
42,
49,
54,
62,
64,
67,
69,
73};
static uint32_t wexp1[] = {
4,
5,
7,
8,
9,
10,
14,
15,
17,
18,
22,
23,
33,
34,
35,
39,
43,
48,
49,
50,
54,
55,
56,
57,
62,
63,
64,
65,
67,
68,
69,
70,
72,
73};
// clang-format off
static char teng2[] =
// 1 2 3 4 5 6 7
// 01234567890123456789012345678901234567890123456789012345678901234567890123456789
"()((reasonab(l)e) line break. .01123=45x48.";
// clang-format on
static uint32_t lexp2[] = {
17,
22,
23,
30,
44};
static uint32_t wexp2[] = {
4,
12,
13,
14,
15,
16,
17,
18,
22,
24,
29,
30,
31,
32,
37,
38,
43,
44};
// clang-format off
static char teng3[] =
// 1 2 3 4 5 6 7
// 01234567890123456789012345678901234567890123456789012345678901234567890123456789
"It's a test to test(ronae ) line break....";
// clang-format on
static uint32_t lexp3[] = {
4,
6,
11,
14,
25,
27,
32,
42};
static uint32_t wexp3[] = {
2,
3,
4,
5,
6,
7,
11,
12,
14,
15,
19,
20,
25,
26,
27,
28,
32,
33,
38,
42};
static char ruler1[] =
" 1 2 3 4 5 6 7 ";
static char ruler2[] =
"0123456789012345678901234567890123456789012345678901234567890123456789012";
bool Check(
const char* in, mozilla::Span<
const uint32_t> out,
mozilla::Span<
const uint32_t> res) {
const uint32_t outlen = out.Length();
const uint32_t i = res.Length();
bool ok =
true;
if (i != outlen) {
ok =
false;
printf(
"WARNING!!! return size wrong, expect %d but got %d \n", outlen, i);
}
for (uint32_t j =
0; j < i; j++) {
if (j < outlen) {
if (res[j] != out[j]) {
ok =
false;
printf(
"[%d] expect %d but got %d\n", j, out[j], res[j]);
}
}
else {
ok =
false;
printf(
"[%d] additional %d\n", j, res[j]);
}
}
if (!ok) {
printf(
"string = \n%s\n", in);
printf(
"%s\n", ruler1);
printf(
"%s\n", ruler2);
printf(
"Expect = \n");
for (uint32_t j =
0; j < outlen; j++) {
printf(
"%d,", out[j]);
}
printf(
"\nResult = \n");
for (uint32_t j =
0; j < i; j++) {
printf(
"%d,", res[j]);
}
printf(
"\n");
}
return ok;
}
bool TestASCIILB(
const char* in, mozilla::Span<
const uint32_t> out) {
NS_ConvertASCIItoUTF16 input(in);
EXPECT_GT(input.Length(),
0u) <<
"Expect a non-empty input!";
nsTArray<uint32_t> result;
int32_t curr =
0;
while (
true) {
curr = LineBreaker::Next(input.get(), input.Length(), curr);
if (curr == NS_LINEBREAKER_NEED_MORE_TEXT) {
break;
}
result.AppendElement(curr);
}
return Check(in, out, result);
}
bool TestASCIIWB(
const char* in, mozilla::Span<
const uint32_t> out) {
NS_ConvertASCIItoUTF16 input(in);
EXPECT_GT(input.Length(),
0u) <<
"Expect a non-empty input!";
nsTArray<uint32_t> result;
int32_t curr =
0;
while (
true) {
curr = WordBreaker::Next(input.get(), input.Length(), curr);
if (curr == NS_WORDBREAKER_NEED_MORE_TEXT) {
break;
}
result.AppendElement(curr);
}
return Check(in, out, result);
}
TEST(LineBreak, LineBreaker)
{
ASSERT_TRUE(TestASCIILB(teng0, lexp0));
ASSERT_TRUE(TestASCIILB(teng1, lexp1));
ASSERT_TRUE(TestASCIILB(teng2, lexp2));
ASSERT_TRUE(TestASCIILB(teng3, lexp3));
}
TEST(WordBreak, WordBreaker)
{
ASSERT_TRUE(TestASCIIWB(teng0, wexp0));
ASSERT_TRUE(TestASCIIWB(teng1, wexp1));
ASSERT_TRUE(TestASCIIWB(teng2, wexp2));
ASSERT_TRUE(TestASCIIWB(teng3, wexp3));
}
// 012345678901234
static const char wb0[] =
"T";
static const char wb1[] =
"h";
static const char wb2[] =
"";
static const char wb3[] =
"is is a int";
static const char wb4[] =
"";
static const char wb5[] =
"";
static const char wb6[] =
"ernationali";
static const char wb7[] =
"zation work.";
static const char* wb[] = {wb0, wb1, wb2, wb3, wb4, wb5, wb6, wb7};
TEST(WordBreak, TestPrintWordWithBreak)
{
uint32_t numOfFragment =
sizeof(wb) /
sizeof(
char*);
// This test generate the result string by appending '^' at every word break
// opportunity except the one at end of the text.
nsAutoString result;
for (uint32_t i =
0; i < numOfFragment; i++) {
NS_ConvertASCIItoUTF16 fragText(wb[i]);
int32_t cur =
0;
cur = WordBreaker::Next(fragText.get(), fragText.Length(), cur);
uint32_t start =
0;
while (cur != NS_WORDBREAKER_NEED_MORE_TEXT) {
result.Append(Substring(fragText, start, cur - start));
// Append '^' only if cur is within the fragText. We'll check the word
// break opportunity between fragText and nextFragText using
// BreakInBetween() below.
if (cur <
static_cast<int32_t>(fragText.Length())) {
result.Append(
'^');
}
start = (cur >=
0 ? cur : cur - start);
cur = WordBreaker::Next(fragText.get(), fragText.Length(), cur);
}
if (i != numOfFragment -
1) {
NS_ConvertASCIItoUTF16 nextFragText(wb[i +
1]);
if (nextFragText.IsEmpty()) {
// If nextFragText is empty, there's no new possible word break
// opportunity.
continue;
}
const auto origFragLen =
static_cast<int32_t>(fragText.Length());
fragText.Append(nextFragText);
bool canBreak =
origFragLen ==
WordBreaker::Next(fragText.get(), fragText.Length(), origFragLen -
1);
if (canBreak) {
result.Append(
'^');
}
}
}
ASSERT_STREQ(
"This^ ^is^ ^a^ ^internationalization^ ^work^.",
NS_ConvertUTF16toUTF8(result).get());
}
// This function searches a complete word starting from |offset| in wb[fragN].
// If it reaches the end of wb[fragN], and there is no word break opportunity
// between wb[fragN] and wb[fragN+1], it will continue the search in wb[fragN+1]
// until a word break.
void TestFindWordBreakFromPosition(uint32_t fragN, uint32_t offset,
const char* expected) {
uint32_t numOfFragment =
sizeof(wb) /
sizeof(
char*);
NS_ConvertASCIItoUTF16 fragText(wb[fragN]);
mozilla::intl::WordRange res = WordBreaker::FindWord(fragText, offset);
nsAutoString result(Substring(fragText, res.mBegin, res.mEnd - res.mBegin));
if ((uint32_t)fragText.Length() <= res.mEnd) {
// if we hit the end of the fragment
nsAutoString curFragText = fragText;
for (uint32_t p = fragN +
1; p < numOfFragment; p++) {
NS_ConvertASCIItoUTF16 nextFragText(wb[p]);
if (nextFragText.IsEmpty()) {
// If nextFragText is empty, there's no new possible word break
// opportunity between curFragText and nextFragText.
continue;
}
const auto origFragLen =
static_cast<int32_t>(curFragText.Length());
curFragText.Append(nextFragText);
bool canBreak = origFragLen == WordBreaker::Next(curFragText.get(),
curFragText.Length(),
origFragLen -
1);
if (canBreak) {
break;
}
mozilla::intl::WordRange r = WordBreaker::FindWord(nextFragText,
0);
result.Append(Substring(nextFragText, r.mBegin, r.mEnd - r.mBegin));
if ((uint32_t)nextFragText.Length() != r.mEnd) {
break;
}
}
}
ASSERT_STREQ(expected, NS_ConvertUTF16toUTF8(result).get())
<<
"FindWordBreakFromPosition(" << fragN <<
", " << offset <<
")";
}
TEST(WordBreak, TestNextWordBreakWithComplexLanguage)
{
nsString fragText(u
"\u0e40\u0e1b\u0e47\u0e19\u0e19\u0e31\u0e01");
int32_t offset =
0;
while (offset != NS_WORDBREAKER_NEED_MORE_TEXT) {
int32_t newOffset =
WordBreaker::Next(fragText.get(), fragText.Length(), offset);
ASSERT_NE(offset, newOffset);
offset = newOffset;
}
ASSERT_TRUE(
true);
}
TEST(WordBreak, TestFindWordWithEmptyString)
{
mozilla::intl::WordRange expect{
0,
0};
mozilla::intl::WordRange result = WordBreaker::FindWord(EmptyString(),
0);
ASSERT_EQ(expect.mBegin, result.mBegin);
ASSERT_EQ(expect.mEnd, result.mEnd);
}
TEST(WordBreak, TestNextWordBreakWithEmptyString)
{
char16_t empty[] = {};
ASSERT_EQ(NS_WORDBREAKER_NEED_MORE_TEXT, WordBreaker::Next(empty,
0,
0));
ASSERT_EQ(NS_WORDBREAKER_NEED_MORE_TEXT, WordBreaker::Next(empty,
0,
1));
}
TEST(WordBreak, TestFindWordBreakFromPosition)
{
TestFindWordBreakFromPosition(
0,
0,
"This");
TestFindWordBreakFromPosition(
1,
0,
"his");
TestFindWordBreakFromPosition(
2,
0,
"is");
TestFindWordBreakFromPosition(
3,
0,
"is");
TestFindWordBreakFromPosition(
3,
1,
"is");
TestFindWordBreakFromPosition(
3,
9,
" ");
TestFindWordBreakFromPosition(
3,
10,
"internationalization");
TestFindWordBreakFromPosition(
4,
0,
"ernationalization");
TestFindWordBreakFromPosition(
5,
0,
"ernationalization");
TestFindWordBreakFromPosition(
6,
4,
"ernationalization");
TestFindWordBreakFromPosition(
6,
8,
"ernationalization");
TestFindWordBreakFromPosition(
7,
6,
" ");
TestFindWordBreakFromPosition(
7,
7,
"work");
}
// Test for StopAtPunctuation option.
TEST(WordBreak, TestFindBreakWithStopAtPunctuation)
{
bool original =
mozilla::Preferences::GetBool(
"intl.icu4x.segmenter.enabled",
true);
// Not UAX#29 rule
mozilla::Preferences::SetBool(
"intl.icu4x.segmenter.enabled",
false);
nsString fragText(u
"one.two");
mozilla::intl::WordRange result1 = WordBreaker::FindWord(fragText,
0);
ASSERT_EQ(
0u, result1.mBegin);
ASSERT_EQ(
3u, result1.mEnd);
mozilla::intl::WordRange result2 = WordBreaker::FindWord(fragText,
3);
ASSERT_EQ(
3u, result2.mBegin);
ASSERT_EQ(
4u, result2.mEnd);
mozilla::intl::WordRange result3 = WordBreaker::FindWord(fragText,
4);
ASSERT_EQ(
4u, result3.mBegin);
ASSERT_EQ(
7u, result3.mEnd);
// UAX#29 rule
mozilla::Preferences::SetBool(
"intl.icu4x.segmenter.enabled",
true);
mozilla::intl::WordRange result4 = WordBreaker::FindWord(
fragText,
0, WordBreaker::FindWordOptions::StopAtPunctuation);
ASSERT_EQ(
0u, result4.mBegin);
ASSERT_EQ(
3u, result4.mEnd);
mozilla::intl::WordRange result5 = WordBreaker::FindWord(
fragText,
3, WordBreaker::FindWordOptions::StopAtPunctuation);
ASSERT_EQ(
3u, result5.mBegin);
ASSERT_EQ(
4u, result5.mEnd);
mozilla::intl::WordRange result6 = WordBreaker::FindWord(
fragText,
4, WordBreaker::FindWordOptions::StopAtPunctuation);
ASSERT_EQ(
4u, result6.mBegin);
ASSERT_EQ(
7u, result6.mEnd);
// Default (without StopAtPunctuation)
mozilla::intl::WordRange result7 = WordBreaker::FindWord(fragText,
0);
ASSERT_EQ(
0u, result7.mBegin);
ASSERT_EQ(
7u, result7.mEnd);
mozilla::intl::WordRange result8 = WordBreaker::FindWord(fragText,
3);
ASSERT_EQ(
0u, result8.mBegin);
ASSERT_EQ(
7u, result8.mEnd);
mozilla::intl::WordRange result9 = WordBreaker::FindWord(fragText,
4);
ASSERT_EQ(
0u, result9.mBegin);
ASSERT_EQ(
7u, result9.mEnd);
mozilla::Preferences::SetBool(
"intl.icu4x.segmenter.enabled", original);
}