// Source code library · Statistics · Home page · products/Sources/formale Sprachen/C/Firefox/third_party/rust/icu_segmenter/tests/ (browser from the Mozilla Foundation, version 136.0.1©) · file dated 10.2.2025, size 6 kB
//
// Source: css_word_break.rs   Language: unknown
//
// Language detection for: .rs — guessed language: Unknown {[0] [0] [0]} [method: centroid computation, simple weights, six dimensions]

// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use icu_segmenter::LineBreakOptions;
use icu_segmenter::LineBreakStrictness;
use icu_segmenter::LineBreakWordOption;
use icu_segmenter::LineSegmenter;

fn check_with_options(
    s: &str,
    mut expect_utf8: Vec<usize>,
    mut expect_utf16: Vec<usize>,
    options: LineBreakOptions,
) {
    let segmenter = LineSegmenter::new_dictionary_with_options(options);

    let iter = segmenter.segment_str(s);
    let result: Vec<usize> = iter.collect();
    expect_utf8.insert(00);
    assert_eq!(expect_utf8, result, "{s}");

    let s_utf16: Vec<u16> = s.encode_utf16().collect();
    let iter = segmenter.segment_utf16(&s_utf16);
    let result: Vec<usize> = iter.collect();
    expect_utf16.insert(00);
    assert_eq!(expect_utf16, result, "{s}");
}

/// Checks `s` against the expected break positions using strict line
/// breaking with the `BreakAll` word option and `ja_zh` turned off.
///
/// The expectation vectors are forwarded unchanged to `check_with_options`.
fn break_all(s: &str, expect_utf8: Vec<usize>, expect_utf16: Vec<usize>) {
    let mut opts = LineBreakOptions::default();
    opts.word_option = LineBreakWordOption::BreakAll;
    opts.strictness = LineBreakStrictness::Strict;
    opts.ja_zh = false;

    check_with_options(s, expect_utf8, expect_utf16, opts);
}

/// Checks `s` against the expected break positions using strict line
/// breaking with the `KeepAll` word option and `ja_zh` turned off.
///
/// The expectation vectors are forwarded unchanged to `check_with_options`.
fn keep_all(s: &str, expect_utf8: Vec<usize>, expect_utf16: Vec<usize>) {
    let mut opts = LineBreakOptions::default();
    opts.word_option = LineBreakWordOption::KeepAll;
    opts.strictness = LineBreakStrictness::Strict;
    opts.ja_zh = false;

    check_with_options(s, expect_utf8, expect_utf16, opts);
}

/// Checks `s` against the expected break positions using strict line
/// breaking with the `Normal` word option and `ja_zh` turned off.
///
/// The expectation vectors are forwarded unchanged to `check_with_options`.
fn normal(s: &str, expect_utf8: Vec<usize>, expect_utf16: Vec<usize>) {
    let mut opts = LineBreakOptions::default();
    opts.word_option = LineBreakWordOption::Normal;
    opts.strictness = LineBreakStrictness::Strict;
    opts.ja_zh = false;

    check_with_options(s, expect_utf8, expect_utf16, opts);
}

/// Cases for the `BreakAll` word option, mostly taken from the CSS
/// `word-break: break-all` web-platform tests named in the comments below.
///
/// Each call passes the expected break positions for the UTF-8 string first
/// and for its UTF-16 encoding second; the leading `0` is added by the
/// helper and therefore omitted here.
#[test]
fn wordbreak_breakall() {
    // from css/css-text/word-break/word-break-break-all-000.html
    let s = "\u{65e5}\u{672c}\u{8a9e}";
    break_all(s, vec![3, 6, 9], vec![1, 2, 3]);

    // from css/css-text/word-break/word-break-break-all-001.html
    let s = "latin";
    break_all(s, vec![1, 2, 3, 4, 5], vec![1, 2, 3, 4, 5]);

    // from css/css-text/word-break/word-break-break-all-002.html
    let s = "\u{d55c}\u{ae00}\u{c77e}";
    break_all(s, vec![3, 6, 9], vec![1, 2, 3]);

    // from css/css-text/word-break/word-break-break-all-003.html
    let s = "ภาษาไทยภาษาไทย";
    break_all(
        s,
        vec![3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42],
        vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
    );

    // from css/css-text/word-break/word-break-break-all-004.html
    let s = "التدويل نشاط التدويل";
    break_all(
        s,
        vec![
            2, 4, 6, 8, 10, 12, 15, 17, 19, 21, 24, 26, 28, 30, 32, 34, 36, 38,
        ],
        vec![
            1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20,
        ],
    );

    // from css/css-text/word-break/word-break-break-all-008.html
    let s = "हिन्दी हिन्दी हिन्दी";
    break_all(
        s,
        vec![6, 12, 19, 25, 31, 38, 44, 50, 56],
        vec![2, 4, 7, 9, 11, 14, 16, 18, 20],
    );

    // from css/css-text/word-break/word-break-break-all-014.html
    let s = "\u{1f496}\u{1f494}";
    break_all(s, vec![4, 8], vec![2, 4]);

    // from css/css-text/word-break/word-break-break-all-018.html
    //let s = "XXXX\u{00a0}X";
    //break_all(s, vec![1, 2, 3, 5, 6], vec![1, 2, 3, 5, 6]);

    // from css/css-text/word-break/word-break-break-all-022.html
    //let s = "XX\u{00a0}X";
    //break_all(s, vec![1, 2, 4, 5], vec![1, 2, 3, 4]);

    // from css/css-text/word-break/word-break-break-all-023.html
    let s = "XX XX\u{005C}\u{005C}\u{005C}";
    break_all(s, vec![1, 3, 4, 5, 6, 7, 8], vec![1, 3, 4, 5, 6, 7, 8]);

    // from css/css-text/word-break/word-break-break-all-026.html
    let s = "XX XXX///";
    break_all(s, vec![1, 3, 4, 5, 9], vec![1, 3, 4, 5, 9]);

    // css/css-text/word-break/word-break-break-all-inline-008.html
    let s = "X.";
    break_all(s, vec![2], vec![2]);

    // ID and CJ
    let s = "フォ";
    break_all(s, vec![3, 6], vec![1, 2]);
}

/// Cases for the `KeepAll` word option, mostly taken from the CSS
/// `word-break: keep-all` web-platform tests named in the comments below.
///
/// Each call passes the expected break positions for the UTF-8 string first
/// and for its UTF-16 encoding second; the leading `0` is added by the
/// helper and therefore omitted here.
#[test]
fn wordbreak_keepall() {
    // from css/css-text/word-break/word-break-keep-all-000.html
    let s = "latin";
    keep_all(s, vec![5], vec![5]);

    // from css/css-text/word-break/word-break-keep-all-001.html
    let s = "\u{65e5}\u{672c}\u{8a9e}";
    keep_all(s, vec![9], vec![3]);

    // from css/css-text/word-break/word-break-keep-all-002.html
    let s = "한글이";
    keep_all(s, vec![9], vec![3]);

    // from css/css-text/word-break/word-break-keep-all-005.html
    let s = "字\u{3000}字";
    keep_all(s, vec![6, 9], vec![2, 3]);

    // from css/css-text/word-break/word-break-keep-all-006.html
    let s = "字\u{3001}字";
    keep_all(s, vec![6, 9], vec![2, 3]);

    // from css/css-text/word-boundary/word-boundary-107.html
    let s = "しょう。";
    keep_all(s, vec![12], vec![4]);

    // failed test. JL, JV and JT
    let s = "\u{110B}\u{1162}\u{1100}\u{1175}\u{1111}\u{1161}\u{11AB}\u{1103}\u{1161}";
    keep_all(s, vec![27], vec![9]);
}

/// `KeepAll` case for Thai text; ignored unless the `lstm` feature is
/// enabled (see the `cfg_attr` below).
///
/// Expected positions: UTF-8 first, UTF-16 second, leading `0` omitted.
#[test]
#[cfg_attr(not(feature = "lstm"), ignore)]
fn wordbreak_keepall_lstm() {
    // from css/css-text/word-break/word-break-keep-all-003.html
    let s = "และและ";
    keep_all(s, vec![9, 18], vec![3, 6]);
}

/// `Normal` word option on Thai text.
///
/// Expected positions: UTF-8 first, UTF-16 second, leading `0` omitted.
#[test]
fn wordbreak_normal() {
    // from css/css-text/word-break/word-break-normal-th-000.html
    let s = "ภาษาไทยภาษาไทย";
    normal(s, vec![12, 21, 33, 42], vec![4, 7, 11, 14]);
}

/// `Normal` word option on Khmer text.
///
/// Expected positions: UTF-8 first, UTF-16 second, leading `0` omitted.
#[test]
fn wordbreak_normal_km() {
    // from css/css-text/word-break/word-break-normal-km-000.html
    let s = "ភាសាខ្មែរភាសាខ្មែរភាសាខ្មែរ";
    normal(s, vec![27, 54, 81], vec![9, 18, 27]);
}

/// `Normal` word option on Lao text.
///
/// Expected positions: UTF-8 first, UTF-16 second, leading `0` omitted.
#[test]
fn wordbreak_normal_lo() {
    // from css/css-text/word-break/word-break-normal-lo-000.html
    let s = "ພາສາລາວພາສາລາວພາສາລາວ";
    normal(s, vec![12, 21, 33, 42, 54, 63], vec![4, 7, 11, 14, 18, 21]);
}

// [Processing time: 0.26 seconds, preprocessed 2026-06-06]