Quellcodebibliothek Statistik Leitseite products/Sources/formale Sprachen/C/Firefox/intl/icu_capi/src/   (Browser von der Mozilla Stiftung Version 136.0.1©)  Datei vom 10.2.2025 mit Größe 10 kB image not shown  

Quelle  segmenter_word.rs   Sprache: unbekannt

 
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

#[diplomat::bridge]
pub mod ffi {
    use crate::errors::ffi::ICU4XError;
    use crate::provider::ffi::ICU4XDataProvider;
    use alloc::boxed::Box;
    use core::convert::TryFrom;
    use icu_segmenter::{
        WordBreakIteratorLatin1, WordBreakIteratorPotentiallyIllFormedUtf8, WordBreakIteratorUtf16,
        WordSegmenter, WordType,
    };

    #[diplomat::enum_convert(WordType, needs_wildcard)]
    #[diplomat::rust_link(icu::segmenter::WordType, Enum)]
    pub enum ICU4XSegmenterWordType {
        None = 0,
        Number = 1,
        Letter = 2,
    }

    #[diplomat::opaque]
    /// An ICU4X word-break segmenter, capable of finding word breakpoints in strings.
    #[diplomat::rust_link(icu::segmenter::WordSegmenter, Struct)]
    pub struct ICU4XWordSegmenter(WordSegmenter);

    #[diplomat::opaque]
    #[diplomat::rust_link(icu::segmenter::WordBreakIterator, Struct)]
    #[diplomat::rust_link(
        icu::segmenter::WordBreakIteratorPotentiallyIllFormedUtf8,
        Typedef,
        hidden
    )]
    #[diplomat::rust_link(icu::segmenter::WordBreakIteratorUtf8, Typedef, hidden)]
    pub struct ICU4XWordBreakIteratorUtf8<'a>(WordBreakIteratorPotentiallyIllFormedUtf8<'a, 'a>);

    #[diplomat::opaque]
    #[diplomat::rust_link(icu::segmenter::WordBreakIterator, Struct)]
    #[diplomat::rust_link(icu::segmenter::WordBreakIteratorUtf16, Typedef, hidden)]
    pub struct ICU4XWordBreakIteratorUtf16<'a>(WordBreakIteratorUtf16<'a, 'a>);

    #[diplomat::opaque]
    #[diplomat::rust_link(icu::segmenter::WordBreakIterator, Struct)]
    #[diplomat::rust_link(icu::segmenter::WordBreakIteratorLatin1, Typedef, hidden)]
    pub struct ICU4XWordBreakIteratorLatin1<'a>(WordBreakIteratorLatin1<'a, 'a>);

    impl ICU4XSegmenterWordType {
        #[diplomat::rust_link(icu::segmenter::WordType::is_word_like, FnInEnum)]
        #[diplomat::attr(supports = accessors, getter)]
        pub fn is_word_like(self) -> bool {
            WordType::from(self).is_word_like()
        }
    }

    impl ICU4XWordSegmenter {
        /// Construct an [`ICU4XWordSegmenter`] with automatically selecting the best available LSTM
        /// or dictionary payload data.
        ///
        /// Note: currently, it uses dictionary for Chinese and Japanese, and LSTM for Burmese,
        /// Khmer, Lao, and Thai.
        #[diplomat::rust_link(icu::segmenter::WordSegmenter::new_auto, FnInStruct)]
        #[diplomat::attr(all(supports = constructors, supports = fallible_constructors, supports = named_constructors), named_constructor = "auto")]
        pub fn create_auto(
            provider: &ICU4XDataProvider,
        ) -> Result<Box<ICU4XWordSegmenter>, ICU4XError> {
            Ok(Box::new(ICU4XWordSegmenter(call_constructor!(
                WordSegmenter::new_auto [r => Ok(r)],
                WordSegmenter::try_new_auto_with_any_provider,
                WordSegmenter::try_new_auto_with_buffer_provider,
                provider
            )?)))
        }

        /// Construct an [`ICU4XWordSegmenter`] with LSTM payload data for Burmese, Khmer, Lao, and
        /// Thai.
        ///
        /// Warning: [`ICU4XWordSegmenter`] created by this function doesn't handle Chinese or
        /// Japanese.
        #[diplomat::rust_link(icu::segmenter::WordSegmenter::new_lstm, FnInStruct)]
        #[diplomat::attr(all(supports = constructors, supports = fallible_constructors, supports = named_constructors), named_constructor = "lstm")]
        pub fn create_lstm(
            provider: &ICU4XDataProvider,
        ) -> Result<Box<ICU4XWordSegmenter>, ICU4XError> {
            Ok(Box::new(ICU4XWordSegmenter(call_constructor!(
                WordSegmenter::new_lstm [r => Ok(r)],
                WordSegmenter::try_new_lstm_with_any_provider,
                WordSegmenter::try_new_lstm_with_buffer_provider,
                provider,
            )?)))
        }

        /// Construct an [`ICU4XWordSegmenter`] with dictionary payload data for Chinese, Japanese,
        /// Burmese, Khmer, Lao, and Thai.
        #[diplomat::rust_link(icu::segmenter::WordSegmenter::new_dictionary, FnInStruct)]
        #[diplomat::attr(all(supports = constructors, supports = fallible_constructors, supports = named_constructors), named_constructor = "dictionary")]
        pub fn create_dictionary(
            provider: &ICU4XDataProvider,
        ) -> Result<Box<ICU4XWordSegmenter>, ICU4XError> {
            Ok(Box::new(ICU4XWordSegmenter(call_constructor!(
                WordSegmenter::new_dictionary [r => Ok(r)],
                WordSegmenter::try_new_dictionary_with_any_provider,
                WordSegmenter::try_new_dictionary_with_buffer_provider,
                provider,
            )?)))
        }

        /// Segments a string.
        ///
        /// Ill-formed input is treated as if errors had been replaced with REPLACEMENT CHARACTERs according
        /// to the WHATWG Encoding Standard.
        #[diplomat::rust_link(icu::segmenter::WordSegmenter::segment_utf8, FnInStruct)]
        #[diplomat::rust_link(icu::segmenter::WordSegmenter::segment_str, FnInStruct, hidden)]
        #[diplomat::attr(dart, disable)]
        pub fn segment_utf8<'a>(
            &'a self,
            input: &'a DiplomatStr,
        ) -> Box<ICU4XWordBreakIteratorUtf8<'a>> {
            Box::new(ICU4XWordBreakIteratorUtf8(self.0.segment_utf8(input)))
        }

        /// Segments a string.
        ///
        /// Ill-formed input is treated as if errors had been replaced with REPLACEMENT CHARACTERs according
        /// to the WHATWG Encoding Standard.
        #[diplomat::rust_link(icu::segmenter::WordSegmenter::segment_utf16, FnInStruct)]
        #[diplomat::attr(dart, rename = "segment")]
        pub fn segment_utf16<'a>(
            &'a self,
            input: &'a DiplomatStr16,
        ) -> Box<ICU4XWordBreakIteratorUtf16<'a>> {
            Box::new(ICU4XWordBreakIteratorUtf16(self.0.segment_utf16(input)))
        }

        /// Segments a Latin-1 string.
        #[diplomat::rust_link(icu::segmenter::WordSegmenter::segment_latin1, FnInStruct)]
        #[diplomat::attr(dart, disable)]
        pub fn segment_latin1<'a>(
            &'a self,
            input: &'a [u8],
        ) -> Box<ICU4XWordBreakIteratorLatin1<'a>> {
            Box::new(ICU4XWordBreakIteratorLatin1(self.0.segment_latin1(input)))
        }
    }

    impl<'a> ICU4XWordBreakIteratorUtf8<'a> {
        /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is
        /// out of range of a 32-bit signed integer.
        #[diplomat::rust_link(icu::segmenter::WordBreakIterator::next, FnInStruct)]
        #[diplomat::rust_link(
            icu::segmenter::WordBreakIterator::Item,
            AssociatedTypeInStruct,
            hidden
        )]
        pub fn next(&mut self) -> i32 {
            self.0
                .next()
                .and_then(|u| i32::try_from(u).ok())
                .unwrap_or(-1)
        }

        /// Return the status value of break boundary.
        #[diplomat::rust_link(icu::segmenter::WordBreakIterator::word_type, FnInStruct)]
        #[diplomat::attr(supports = accessors, getter)]
        pub fn word_type(&self) -> ICU4XSegmenterWordType {
            self.0.word_type().into()
        }

        /// Return true when break boundary is word-like such as letter/number/CJK
        #[diplomat::rust_link(icu::segmenter::WordBreakIterator::is_word_like, FnInStruct)]
        #[diplomat::attr(supports = accessors, getter)]
        pub fn is_word_like(&self) -> bool {
            self.0.is_word_like()
        }
    }

    impl<'a> ICU4XWordBreakIteratorUtf16<'a> {
        /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is
        /// out of range of a 32-bit signed integer.
        #[diplomat::rust_link(icu::segmenter::WordBreakIterator::next, FnInStruct)]
        #[diplomat::rust_link(
            icu::segmenter::WordBreakIterator::Item,
            AssociatedTypeInStruct,
            hidden
        )]
        pub fn next(&mut self) -> i32 {
            self.0
                .next()
                .and_then(|u| i32::try_from(u).ok())
                .unwrap_or(-1)
        }

        /// Return the status value of break boundary.
        #[diplomat::rust_link(icu::segmenter::WordBreakIterator::word_type, FnInStruct)]
        #[diplomat::rust_link(
            icu::segmenter::WordBreakIterator::iter_with_word_type,
            FnInStruct,
            hidden
        )]
        #[diplomat::attr(supports = accessors, getter)]
        pub fn word_type(&self) -> ICU4XSegmenterWordType {
            self.0.word_type().into()
        }

        /// Return true when break boundary is word-like such as letter/number/CJK
        #[diplomat::rust_link(icu::segmenter::WordBreakIterator::is_word_like, FnInStruct)]
        #[diplomat::attr(supports = accessors, getter)]
        pub fn is_word_like(&self) -> bool {
            self.0.is_word_like()
        }
    }

    impl<'a> ICU4XWordBreakIteratorLatin1<'a> {
        /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is
        /// out of range of a 32-bit signed integer.
        #[diplomat::rust_link(icu::segmenter::WordBreakIterator::next, FnInStruct)]
        #[diplomat::rust_link(
            icu::segmenter::WordBreakIterator::Item,
            AssociatedTypeInStruct,
            hidden
        )]
        pub fn next(&mut self) -> i32 {
            self.0
                .next()
                .and_then(|u| i32::try_from(u).ok())
                .unwrap_or(-1)
        }

        /// Return the status value of break boundary.
        #[diplomat::rust_link(icu::segmenter::WordBreakIterator::word_type, FnInStruct)]
        #[diplomat::attr(supports = accessors, getter)]
        pub fn word_type(&self) -> ICU4XSegmenterWordType {
            self.0.word_type().into()
        }

        /// Return true when break boundary is word-like such as letter/number/CJK
        #[diplomat::rust_link(icu::segmenter::WordBreakIterator::is_word_like, FnInStruct)]
        #[diplomat::attr(supports = accessors, getter)]
        pub fn is_word_like(&self) -> bool {
            self.0.is_word_like()
        }
    }
}

[ Dauer der Verarbeitung: 0.33 Sekunden  (vorverarbeitet)  ]