/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ /* * This file is part of the LibreOffice project. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/
{ //Here we want the line break to leave text here) on the next line
i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("(some tex"), aLocale, 0, aHyphOptions, aUserOptions);
CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", static_cast<sal_Int32>(6), aResult.breakIndex);
}
{ //Here we want the line break to leave "here)" on the next line
i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("(some text here"), aLocale, 0, aHyphOptions, aUserOptions);
CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", static_cast<sal_Int32>(11), aResult.breakIndex);
}
}
{ //Here we want the line break to happen at the whitespace
i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-1, aLocale, 0, aHyphOptions, aUserOptions);
CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", aWord.getLength()+1, aResult.breakIndex);
}
}
{ //Here we want the line break to leave /bar/ba clumped together on the next line
i18n::LineBreakResults aResult = m_xBreak->getLineBreak(u"foo /bar/baz"_ustr, strlen("foo /bar/ba"), aLocale, 0,
aHyphOptions, aUserOptions);
CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the first slash", static_cast<sal_Int32>(4), aResult.breakIndex);
}
}
// i#22602: writer breaks word after dot immediately followed by a letter
{
aLocale.Language = "en";
aLocale.Country = "US";
{ //Here we want the line break to leave ./bar/baz clumped together on the next line
i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
u"foo ./bar/baz"_ustr, strlen("foo ./bar/ba"), aLocale, 0, aHyphOptions, aUserOptions);
CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the first period", static_cast<sal_Int32>(4), aResult.breakIndex);
}
}
// i#81448: slash and backslash make non-breaking spaces of preceding spaces
{
aLocale.Language = "en";
aLocale.Country = "US";
{ // Per the bug, the line break should leave ...BE clumped together on the next line. // However, the current behavior does not wrap the string at all. This test asserts the // current behavior as a point of reference.
i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
u"THIS... ...BE"_ustr, strlen("THIS... ...B"), aLocale, 0, aHyphOptions, aUserOptions);
CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(0), aResult.breakIndex);
}
}
// i#81448: slash and backslash make non-breaking spaces of preceding spaces
{
aLocale.Language = "en";
aLocale.Country = "US";
{ // The line break should leave /BE clumped together on the next line.
i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
u"THIS... /BE"_ustr, strlen("THIS... /B"), aLocale, 0, aHyphOptions, aUserOptions);
CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(8), aResult.breakIndex);
}
}
// i#80548: Bad word wrap between dash and word
{
aLocale.Language = "fi";
aLocale.Country = "FI";
{ // Per the bug, the line break should leave -bar clumped together on the next line.
i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
u"foo -bar"_ustr, strlen("foo -ba"), aLocale, 0, aHyphOptions, aUserOptions);
CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the first dash", static_cast<sal_Int32>(4), aResult.breakIndex);
}
}
// i#80645: Line erroneously breaks at backslash
{
aLocale.Language = "en";
aLocale.Country = "US";
{ // Note that the current behavior deviates from the original fix for this bug. // // The original report was filed due to wrapping all of "\Program Files\aaaa" to the // next line, even though only "aaaa" overflowed. The original fix was to simply make // U+005C reverse solidus (backslash) a breaking character. // // However, the root cause for this bug was not the behavior of '\', but rather some // other bug making all of "\Program Files\" behave like a single token, despite it // even containing whitespace. // // Reverting to the ICU line rules fixes this root issue. Now, in the following, // "C:\Program" and "Files\LibreOffice" are treated as separate tokens. This is also // consistent with the behavior of other office programs.
i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
u"C:\\Program Files\\LibreOffice"_ustr, strlen("C:\\Program Files\\Libre"), aLocale, 0,
aHyphOptions, aUserOptions);
CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(11), aResult.breakIndex);
// An identical result should be generated for solidus.
aResult = m_xBreak->getLineBreak(
u"C:/Program Files/LibreOffice"_ustr, strlen("C:/Program Files/Libre"), aLocale, 0,
aHyphOptions, aUserOptions);
CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(11), aResult.breakIndex);
}
}
// i#80841: Words separated by hyphens will always break to next line
{
aLocale.Language = "en";
aLocale.Country = "US";
{ // Here we want the line break to leave toll- on the first line
i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
u"toll-free"_ustr, strlen("toll-fr"), aLocale, 0, aHyphOptions, aUserOptions);
CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(5), aResult.breakIndex);
}
}
// i#83464: Line break between letter and $
{
aLocale.Language = "en";
aLocale.Country = "US";
{ // Here we want the line break to leave US$ clumped on the next line.
i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
u"word US$ 123"_ustr, strlen("word U"), aLocale, 0, aHyphOptions, aUserOptions);
CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(5), aResult.breakIndex);
}
}
// Unknown bug number: "fix line break problem of dot after letter and before number"
{
aLocale.Language = "en";
aLocale.Country = "US";
{ // Here we want the line break to leave L.5 clumped on the next line.
i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
u"word L.5 word"_ustr, strlen("word L"), aLocale, 0, aHyphOptions, aUserOptions);
CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(5), aResult.breakIndex);
}
}
// i#83229: Wrong line break when word contains a hyphen
{
aLocale.Language = "en";
aLocale.Country = "US";
{ // The root cause for this bug was the Unicode standard introducing special treatment // for '-' in a number range context. This change makes number ranges (e.g. "100-199") // behave as if they are single tokens for the purposes of line breaking. Unfortunately, // this caused a significant appearance change to existing documents. // // Despite being a user-visible layout change, this isn't exactly a bug. Wrapping // number ranges as a single token is consistent with other applications, including web // browsers, and other office suites as mentioned in the bug discussion. Removing this // customization seems like it would be a major change, however. // // Here we want the line break to leave 100- clumped on the first line.
// i#83649: "Line break should be between typographical quote and left bracket" // - Actually: Spaces between quotation mark and opening punctuation not treated as a break. // - Note that per the Unicode standard, prohibiting breaks in this context is intentional // because it may cause issues in certain languages due to the various ways quotation // characters are used. // - We do it anyway by customizing the ICU line breaking rules.
{
{ // This uses the sample text provided in the bug report. Based on usage, it is assumed // they were in the de_DE locale.
aLocale.Language = "de";
aLocale.Country = "DE";
// Per the bug report, it is expected that »angetan werden« remains on the first line. const OUString str = u"»angetan werden« [Passiv]"_ustr;
i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
str, str.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions);
CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex);
// The same result should be returned for this and the first case. const OUString str2 = u"»angetan werden« Passiv"_ustr;
aResult = m_xBreak->getLineBreak(
str2, str2.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions);
CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex);
// Under ICU rules, no amount of spaces would cause this to wrap. const OUString str3 = u"»angetan werden« [Passiv]"_ustr;
aResult = m_xBreak->getLineBreak(
str3, str3.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions);
CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(20), aResult.breakIndex);
// i#72868: Writer/Impress line does not break after Chinese punctuation and Latin letters // tdf#130592: Fixed the regression. If this case fails, UI text will be laid out incorrectly.
{
aLocale.Language = "zh";
aLocale.Country = "HK";
// i#80891: Character in the forbidden list sometimes appears at the start of line
{
    aLocale.Language = "zh";
    aLocale.Country = "HK";

    {
        // Per the bug, the ideographic two-dot leader should be a forbidden character. However,
        // this change seems to have been reverted or broken at some point.
        const OUString str = u"電話︰電話"_ustr;

        // The assertion pins the current behavior: a break is reported at index 2,
        // i.e. directly in front of the two-dot leader.
        i18n::LineBreakResults aResult
            = m_xBreak->getLineBreak(str, 2, aLocale, 0, aHyphOptions, aUserOptions);
        CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(2), aResult.breakIndex);
    }
}
{
OUString aTest(u"aaa]aaa"_ustr); //Here we want the line break to move the whole lot to the next line
i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-2, aLocale, 0,
aHyphOptions, aUserOptions);
CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the line, not at ]", static_cast<sal_Int32>(0), aResult.breakIndex);
}
}
//this is an example sequence from tdf92993-1.docx caught by the load crashtesting
{ static constexpr OUStringLiteral aTest = u"\U0001f356\U0001f357\U0001f346" "\U0001f364\u2668\ufe0f\U0001f3c6";
aLocale.Language = "en";
aLocale.Country = "US";
{ //This must not assert/crash
(void)m_xBreak->getLineBreak(aTest, 0, aLocale, 0, aHyphOptions, aUserOptions);
}
}
{
i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-2, aLocale, 0,
aHyphOptions, aUserOptions);
CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break don't split the Korean word!", static_cast<sal_Int32>(5), aResult.breakIndex);
}
}
// i#65267: Comma is badly broken at end of line // - The word should be wrapped along with the comma
{
aLocale.Language = "de";
aLocale.Country = "DE";
{ auto res = m_xBreak->getLineBreak(u"Wort -prinzessinnen, wort"_ustr,
strlen("Wort -prinzessinnen,"), aLocale, 0,
aHyphOptions, aUserOptions);
CPPUNIT_ASSERT_EQUAL(sal_Int32{ 5 }, res.breakIndex);
}
}
// tdf#114160: ZWJ shouldn't be treated as a breaking character
{
aLocale.Language = "mn";
aLocale.Country = "MN";
{ auto res = m_xBreak->getLineBreak(u"\u1828\u1820\u200d\u00a0\u200d\u1873\u1873"_ustr, 6,
aLocale, 0, aHyphOptions, aUserOptions);
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), res.breakIndex);
}
aLocale.Language = "en";
aLocale.Country = "US";
{ auto res = m_xBreak->getLineBreak(u"AB\u200d\u00a0\u200dCD"_ustr, 6, aLocale, 0,
aHyphOptions, aUserOptions);
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), res.breakIndex);
}
}
}
// i#85411: ZWSP should be a word separator for spellchecking // - This fix was applied to both dict and edit customizations for (int j = 0; j < 3; ++j)
{ switch (j)
{ case 0:
aLocale.Language = "en";
aLocale.Country = "US"; break; case 1:
aLocale.Language = "ca";
aLocale.Country = "ES"; break; case 2:
aLocale.Language = "fi";
aLocale.Country = "FI"; break; default:
CPPUNIT_ASSERT(false); break;
}
// i#56347: "BreakIterator patch for Hungarian" // i#56348: Special chars in first pos not handled by spell checking in Writer (Hungarian) // Rules for Hungarian affixes after numbers and certain symbols
{
aLocale.Language = "hu";
aLocale.Country = "HU";
// When using the old LO custom dictionaries, this will select the entire phrase. // When using ICU, it will select only 北海道.
CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(11), aBounds.endPos);
}
// tdf#161737: narrow no-break space at the end of words resulted spelling mistakes
{
aLocale.Language = "en";
aLocale.Country = "US";
OUString aTest(u"L’espace fine insécable\u202F!"_ustr);
aBounds
= m_xBreak->getWordBoundary(aTest, 14, aLocale, i18n::WordType::DICTIONARY_WORD, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.startPos); // This was 24 (word + NNBSP)
CPPUNIT_ASSERT_EQUAL(sal_Int32(23), aBounds.endPos);
}
// tdf#161737: narrow no-break space between digits resulted spelling mistakes // as a quick fix, limit NBSP as word-part character only for editing, and not for spell checking // TODO: remove NBSP by the linguistic module or by the spell checking dictionaries to allow // to check numbers with thousand separators and with correct suffix
{
aLocale.Language = "en";
aLocale.Country = "US";
OUString aTest(u"1\u202F000\u202F000"_ustr);
aBounds
= m_xBreak->getWordBoundary(aTest, 2, aLocale, i18n::WordType::DICTIONARY_WORD, false); // This was 0 (word + NNBSP)
CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos); // This was 8 (word + NNBSP)
CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
}
// tdf#161737: narrow no-break space at the end of words resulted spelling mistakes
{
aLocale.Language = "hu";
aLocale.Country = "HU";
OUString aTest(u"L’espace fine insécable\u202F!"_ustr);
aBounds
= m_xBreak->getWordBoundary(aTest, 14, aLocale, i18n::WordType::DICTIONARY_WORD, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.startPos); // This was 24 (word + NNBSP)
CPPUNIT_ASSERT_EQUAL(sal_Int32(23), aBounds.endPos);
}
// tdf#161737: narrow no-break space between digits resulted spelling mistakes // as a quick fix, limit NBSP as word-part character only for editing, and not for spell checking // TODO: remove NBSP by the linguistic module or by the spell checking dictionaries to allow // to check numbers with thousand separators and with correct suffix
{
aLocale.Language = "hu";
aLocale.Country = "HU";
OUString aTest(u"1\u202F000\u202F000"_ustr);
aBounds
= m_xBreak->getWordBoundary(aTest, 2, aLocale, i18n::WordType::DICTIONARY_WORD, false); // This was 0 (word + NNBSP)
CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos); // This was 8 (word + NNBSP)
CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
}
}
// i#24098: i18n API beginOfSentence/endOfSentence // fix beginOfSentence, ... when cursor is on the beginning of the sentence
{
OUString aTest(u"This is a sentence. This is a different sentence."_ustr);
// i#24098: i18n API beginOfSentence/endOfSentence // "skip preceding space for beginOfSentence"
{
OUString aTest(u"This is a sentence. This is a different sentence."_ustr);
// i#55063: Sentence selection in Thai should select a space-delimited phrase. // - This customization broke at some point. It works in an English locale in a synthetic test // like this one, but does not work in the Thai locale, nor on Thai text in practice.
{ static constexpr OUString aTest = u"ว้อย โหลยโท่ยคอร์รัปชันโอเพ่นฮอตดอก โปรโมเตอร์"_ustr;
// i#55063: Thai phrases should delimit English sentence selection. // - This customization broke at some point. It works in an English locale in a synthetic test // like this one, but does not work in the Thai locale, nor on Thai text in practice.
{ static constexpr OUString aTest = u"ว้อย English usually ends with a period โปรโมเตอร์."_ustr;
// i#55063: Characteristic test for English text delimiting Thai phrases (sentences) // - English text should not delimit Thai phrases.
{ static constexpr OUString aTest = u"Englishโหลยโท่ยคอร์รัปชันโอเพ่นฮอตดอกEnglish"_ustr;
//A test to ensure that certain ranges and codepoints that are categorized as //weak remain as weak, so that existing docs that depend on this don't silently //change font for those weak chars void TestBreakIterator::testWeak()
{
lang::Locale aLocale;
aLocale.Language = "en";
aLocale.Country = "US";
{ static constexpr OUString aWeaks =
u"\u0001\u0002" " \u00A0" "\u0300\u036F"//Combining Diacritical Marks "\u1AB0\u1AFF"//Combining Diacritical Marks Extended "\u1DC0\u1DFF"//Combining Diacritical Marks Supplement "\u20D0\u20FF"//Combining Diacritical Marks for Symbols "\u2150\u215F"//Number Forms, fractions "\u2160\u2180"//Number Forms, roman numerals "\u2200\u22FF"//Mathematical Operators "\u27C0\u27EF"//Miscellaneous Mathematical Symbols-A "\u2980\u29FF"//Miscellaneous Mathematical Symbols-B "\u2A00\u2AFF"//Supplemental Mathematical Operators "\u2100\u214F"//Letterlike Symbols "\u2308\u230B"//Miscellaneous technical "\u25A0\u25FF"//Geometric Shapes "\u2B30\u2B4C"_ustr; //Miscellaneous Symbols and Arrows
for (sal_Int32 i = 0; i < aWeaks.getLength(); ++i)
{
sal_Int16 nScript = m_xBreak->getScriptType(aWeaks, i);
OString aMsg = "Char 0x" +
OString::number(static_cast<sal_Int32>(std::u16string_view(aWeaks)[i]), 16) + " should have been weak";
CPPUNIT_ASSERT_EQUAL_MESSAGE(aMsg.getStr(),
i18n::ScriptType::WEAK, nScript);
}
}
}
//A test to ensure that certain ranges and codepoints that are categorized as //asian remain as asian, so that existing docs that depend on this don't silently //change font for those asian chars. //See https://bugs.libreoffice.org/show_bug.cgi?id=38095 void TestBreakIterator::testAsian()
{
lang::Locale aLocale;
aLocale.Language = "en";
aLocale.Country = "US";
{ static constexpr OUString aAsians = //some typical CJK chars
u"\u4E00\u62FF" //The full HalfWidth and FullWidth block has historically been //designated as taking the CJK font :-( //HalfWidth and FullWidth forms of ASCII 0-9, categorized under //UAX24 as "Common" i.e. by that logic WEAK "\uFF10\uFF19" //HalfWidth and FullWidth forms of ASCII A-z, categorized under //UAX25 as "Latin", i.e. by that logic LATIN "\uFF21\uFF5A"_ustr;
for (sal_Int32 i = 0; i < aAsians.getLength(); ++i)
{
sal_Int16 nScript = m_xBreak->getScriptType(aAsians, i);
OString aMsg = "Char 0x" +
OString::number(static_cast<sal_Int32>(std::u16string_view(aAsians)[i]), 16) + " should have been asian";
CPPUNIT_ASSERT_EQUAL_MESSAGE(aMsg.getStr(),
i18n::ScriptType::ASIAN, nScript);
}
}
}
//A test to ensure that our Lao word boundary detection is useful void TestBreakIterator::testLao()
{
lang::Locale aLocale;
aLocale.Language = "lo";
aLocale.Country = "LA";
CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos); #if (U_ICU_VERSION_MAJOR_NUM < 70)
CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); #else // FIXME: // In ICU 70/71 for yet unknown reason the word boundary 9 is not detected and // instead the length 12 is returned as endpos. // Deep in // icu_70::RuleBasedBreakIterator::BreakCache::next() // icu_70::RuleBasedBreakIterator::BreakCache::following() // icu_70::RuleBasedBreakIterator::following() // i18npool::BreakIterator_Unicode::getWordBoundary()
CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.endPos); #endif
}
//A test to ensure that our thai word boundary detection is useful void TestBreakIterator::testThai()
{
lang::Locale aLocale;
aLocale.Language = "th";
aLocale.Country = "TH";
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.