Quelle css_word_break.rs
Sprache: unbekannt
|
|
Spracherkennung für: .rs vermutete Sprache: Unknown {[0] [0] [0]} [Methode: Schwerpunktbildung, einfache Gewichte, sechs Dimensionen]
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use icu_segmenter::LineBreakOptions;
use icu_segmenter::LineBreakStrictness;
use icu_segmenter::LineBreakWordOption;
use icu_segmenter::LineSegmenter;
/// Segments `s` with a `LineSegmenter` built by
/// `LineSegmenter::new_dictionary_with_options(options)` and asserts that the
/// reported offsets match the expectations, first for the `&str` form of the
/// text and then for its UTF-16 encoding.
///
/// * `expect_utf8` - expected offsets when segmenting `s` as a `&str`.
/// * `expect_utf16` - expected offsets when segmenting the UTF-16 code units of `s`.
///
/// A leading `0` is prepended to both lists before they are compared, so callers
/// list only the offsets that follow the start of the text.
///
/// Panics through `assert_eq!` (with `s` as the message) on the first mismatch.
fn check_with_options(
    s: &str,
    mut expect_utf8: Vec<usize>,
    mut expect_utf16: Vec<usize>,
    options: LineBreakOptions,
) {
    let line_segmenter = LineSegmenter::new_dictionary_with_options(options);

    // Pass 1: segment the string slice directly.
    expect_utf8.insert(0, 0);
    let utf8_breaks: Vec<usize> = line_segmenter.segment_str(s).collect();
    assert_eq!(expect_utf8, utf8_breaks, "{s}");

    // Pass 2: re-encode as UTF-16 and segment the code units.
    expect_utf16.insert(0, 0);
    let code_units: Vec<u16> = s.encode_utf16().collect();
    let utf16_breaks: Vec<usize> = line_segmenter.segment_utf16(&code_units).collect();
    assert_eq!(expect_utf16, utf16_breaks, "{s}");
}
/// Checks `s` against the expected offsets with strict line breaking, the
/// `BreakAll` word option and `ja_zh` switched off.
fn break_all(s: &str, expect_utf8: Vec<usize>, expect_utf16: Vec<usize>) {
    // Start from the defaults and override the fields one at a time.
    let mut opts = LineBreakOptions::default();
    opts.ja_zh = false;
    opts.word_option = LineBreakWordOption::BreakAll;
    opts.strictness = LineBreakStrictness::Strict;
    check_with_options(s, expect_utf8, expect_utf16, opts);
}
/// Checks `s` against the expected offsets with strict line breaking, the
/// `KeepAll` word option and `ja_zh` switched off.
fn keep_all(s: &str, expect_utf8: Vec<usize>, expect_utf16: Vec<usize>) {
    // Start from the defaults and override the fields one at a time.
    let mut opts = LineBreakOptions::default();
    opts.ja_zh = false;
    opts.word_option = LineBreakWordOption::KeepAll;
    opts.strictness = LineBreakStrictness::Strict;
    check_with_options(s, expect_utf8, expect_utf16, opts);
}
/// Checks `s` against the expected offsets with strict line breaking, the
/// `Normal` word option and `ja_zh` switched off.
fn normal(s: &str, expect_utf8: Vec<usize>, expect_utf16: Vec<usize>) {
    // Start from the defaults and override the fields one at a time.
    let mut opts = LineBreakOptions::default();
    opts.ja_zh = false;
    opts.word_option = LineBreakWordOption::Normal;
    opts.strictness = LineBreakStrictness::Strict;
    check_with_options(s, expect_utf8, expect_utf16, opts);
}
/// Exercises the `break_all` helper (strict, `BreakAll`, `ja_zh` off) on a set of
/// strings taken mostly from the CSS `word-break: break-all` web tests.
///
/// For every case the first vector lists the expected offsets for the `&str`
/// form and the second the expected offsets for the UTF-16 form; the leading `0`
/// is added by `check_with_options`.
#[test]
fn wordbreak_breakall() {
    // from css/css-text/word-break/word-break-break-all-000.html
    let s = "\u{65e5}\u{672c}\u{8a9e}";
    break_all(s, vec![3, 6, 9], vec![1, 2, 3]);

    // from css/css-text/word-break/word-break-break-all-001.html
    let s = "latin";
    break_all(s, vec![1, 2, 3, 4, 5], vec![1, 2, 3, 4, 5]);

    // from css/css-text/word-break/word-break-break-all-002.html
    let s = "\u{d55c}\u{ae00}\u{c77e}";
    break_all(s, vec![3, 6, 9], vec![1, 2, 3]);

    // from css/css-text/word-break/word-break-break-all-003.html
    let s = "ภาษาไทยภาษาไทย";
    break_all(
        s,
        vec![3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42],
        vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
    );

    // from css/css-text/word-break/word-break-break-all-004.html
    let s = "التدويل نشاط التدويل";
    break_all(
        s,
        vec![
            2, 4, 6, 8, 10, 12, 15, 17, 19, 21, 24, 26, 28, 30, 32, 34, 36, 38,
        ],
        vec![
            1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20,
        ],
    );

    // from css/css-text/word-break/word-break-break-all-008.html
    let s = "हिन्दी हिन्दी हिन्दी";
    break_all(
        s,
        vec![6, 12, 19, 25, 31, 38, 44, 50, 56],
        vec![2, 4, 7, 9, 11, 14, 16, 18, 20],
    );

    // from css/css-text/word-break/word-break-break-all-014.html
    let s = "\u{1f496}\u{1f494}";
    break_all(s, vec![4, 8], vec![2, 4]);

    // The next two cases are disabled in the original file and are kept here
    // for reference only.
    // from css/css-text/word-break/word-break-break-all-018.html
    //let s = "XXXX\u{00a0}X";
    //break_all(s, vec![1, 2, 3, 5, 6], vec![1, 2, 3, 5, 6]);

    // from css/css-text/word-break/word-break-break-all-022.html
    //let s = "XX\u{00a0}X";
    //break_all(s, vec![1, 2, 4, 5], vec![1, 2, 3, 4]);

    // from css/css-text/word-break/word-break-break-all-023.html
    let s = "XX XX\u{005C}\u{005C}\u{005C}";
    break_all(s, vec![1, 3, 4, 5, 6, 7, 8], vec![1, 3, 4, 5, 6, 7, 8]);

    // from css/css-text/word-break/word-break-break-all-026.html
    let s = "XX XXX///";
    break_all(s, vec![1, 3, 4, 5, 9], vec![1, 3, 4, 5, 9]);

    // css/css-text/word-break/word-break-break-all-inline-008.html
    let s = "X.";
    break_all(s, vec![2], vec![2]);

    // ID and CJ
    let s = "フォ";
    break_all(s, vec![3, 6], vec![1, 2]);
}
/// Exercises the `keep_all` helper (strict, `KeepAll`, `ja_zh` off) on strings
/// taken mostly from the CSS `word-break: keep-all` web tests.
///
/// For every case the first vector lists the expected offsets for the `&str`
/// form and the second the expected offsets for the UTF-16 form; the leading `0`
/// is added by `check_with_options`.
#[test]
fn wordbreak_keepall() {
    // from css/css-text/word-break/word-break-keep-all-000.html
    let s = "latin";
    keep_all(s, vec![5], vec![5]);

    // from css/css-text/word-break/word-break-keep-all-001.html
    let s = "\u{65e5}\u{672c}\u{8a9e}";
    keep_all(s, vec![9], vec![3]);

    // from css/css-text/word-break/word-break-keep-all-002.html
    let s = "한글이";
    keep_all(s, vec![9], vec![3]);

    // from css/css-text/word-break/word-break-keep-all-005.html
    let s = "字\u{3000}字";
    keep_all(s, vec![6, 9], vec![2, 3]);

    // from css/css-text/word-break/word-break-keep-all-006.html
    let s = "字\u{3001}字";
    keep_all(s, vec![6, 9], vec![2, 3]);

    // from css/css-text/word-boundary/word-boundary-107.html
    let s = "しょう。";
    keep_all(s, vec![12], vec![4]);

    // failed test. JL, JV and JT
    let s = "\u{110B}\u{1162}\u{1100}\u{1175}\u{1111}\u{1161}\u{11AB}\u{1103}\u{1161}";
    keep_all(s, vec![27], vec![9]);
}
/// `keep_all` case for Thai text; the test is marked `ignore` unless the `lstm`
/// feature is enabled.
#[test]
#[cfg_attr(not(feature = "lstm"), ignore)]
fn wordbreak_keepall_lstm() {
    // from css/css-text/word-break/word-break-keep-all-003.html
    let utf8_breaks = vec![9, 18];
    let utf16_breaks = vec![3, 6];
    keep_all("และและ", utf8_breaks, utf16_breaks);
}
/// `normal` case for Thai text: expected offsets for the `&str` form and for the
/// UTF-16 form (the leading `0` is added by `check_with_options`).
#[test]
fn wordbreak_normal() {
    // from css/css-text/word-break/word-break-normal-th-000.html
    let utf8_breaks = vec![12, 21, 33, 42];
    let utf16_breaks = vec![4, 7, 11, 14];
    normal("ภาษาไทยภาษาไทย", utf8_breaks, utf16_breaks);
}
/// `normal` case for Khmer text: expected offsets for the `&str` form and for the
/// UTF-16 form (the leading `0` is added by `check_with_options`).
#[test]
fn wordbreak_normal_km() {
    // from css/css-text/word-break/word-break-normal-km-000.html
    let utf8_breaks = vec![27, 54, 81];
    let utf16_breaks = vec![9, 18, 27];
    normal("ភាសាខ្មែរភាសាខ្មែរភាសាខ្មែរ", utf8_breaks, utf16_breaks);
}
/// `normal` case for Lao text: expected offsets for the `&str` form and for the
/// UTF-16 form (the leading `0` is added by `check_with_options`).
#[test]
fn wordbreak_normal_lo() {
    // from css/css-text/word-break/word-break-normal-lo-000.html
    let utf8_breaks = vec![12, 21, 33, 42, 54, 63];
    let utf16_breaks = vec![4, 7, 11, 14, 18, 21];
    normal("ພາສາລາວພາສາລາວພາສາລາວ", utf8_breaks, utf16_breaks);
}
[Dauer der Verarbeitung: 0.26 Sekunden, vorverarbeitet 2026-06-06]
|
2026-06-09
|