/**
 * CJK-aware character counting for accurate token estimation.
 *
 * Most LLM tokenizers encode CJK (Chinese, Japanese, Korean) characters as
 * roughly 1 token per character, whereas Latin/ASCII text averages ~1 token
 * per 4 characters. When the codebase estimates tokens as `chars / 4`, CJK
 * content is underestimated by 2–4×.
 *
 * This module provides a shared helper that inflates the character count of
 * CJK text so that the standard `chars / 4` formula yields an accurate
 * token estimate for any script.
 */
/**
 * Default characters-per-token ratio used throughout the codebase.
 * Latin text ≈ 4 chars/token; CJK ≈ 1 char/token.
 *
 * In this module it serves two roles: it is the divisor applied by
 * `estimateTokensFromChars`, and it is the weight `estimateStringChars`
 * assigns to every non-Latin character.
 */
export const CHARS_PER_TOKEN_ESTIMATE = 4 ;
/**
 * Matches one non-Latin code point that is assumed to cost roughly one token
 * per character. Covered ranges:
 *
 * - U+2E80–U+9FFF: CJK radicals, CJK symbols and punctuation, Hiragana,
 *   Katakana, CJK Extension A and CJK Unified Ideographs (plus the smaller
 *   blocks in between).
 * - U+A000–U+A4FF: Yi syllables/radicals and Lisu.
 * - U+AC00–U+D7AF: Hangul Syllables.
 * - U+F900–U+FAFF: CJK Compatibility Ideographs.
 * - U+20000–U+2FA1F: CJK Extension B through the CJK Compatibility
 *   Ideographs Supplement (astral plane).
 *
 * The `u` flag is required so that the `\u{…}` escapes are legal and so that
 * an astral ideograph is matched once as a whole code point rather than as
 * two separate UTF-16 surrogates. The `g` flag makes `String.prototype.match`
 * return every occurrence, which is what callers count.
 */
const NON_LATIN_RE =
  /[\u2E80-\u9FFF\uA000-\uA4FF\uAC00-\uD7AF\uF900-\uFAFF\u{20000}-\u{2FA1F}]/gu;
/**
 * Compute a weighted character length for `text` in which every character
 * matched by {@link NON_LATIN_RE} (CJK and similar scripts) counts as
 * {@link CHARS_PER_TOKEN_ESTIMATE} characters. Dividing the result by
 * `CHARS_PER_TOKEN_ESTIMATE` then gives a token estimate that stays
 * reasonable for both Latin and non-Latin text.
 *
 * When `text` contains no non-Latin characters the result is simply
 * `text.length`.
 *
 * @param text - The string to measure.
 * @returns The weighted character count; `0` for the empty string.
 */
export function estimateStringChars(text: string): number {
  if (text.length > 0) {
    // Number of characters that should be weighted as a full token each.
    const heavyChars = text.match(NON_LATIN_RE)?.length ?? 0;

    // Base length in which astral CJK ideographs occupy one slot, not the
    // two UTF-16 units that `text.length` would report for them.
    const baseLength = countCodePoints(text, heavyChars);

    // Every heavy character is already counted once in `baseLength`; top it
    // up to the full per-token weight.
    const extraPerHeavyChar = CHARS_PER_TOKEN_ESTIMATE - 1;
    return baseLength + heavyChars * extraPerHeavyChar;
  }
  return 0;
}
/**
 * Matches every astral code point in U+20000–U+2FA1F (CJK Extension B and
 * later). In UTF-16 each of these is a surrogate pair, so it contributes 2 to
 * `string.length` while being a single match of {@link NON_LATIN_RE}.
 *
 * The range is expressed as code points with the `u` flag so that it is
 * exactly the astral range used by `NON_LATIN_RE`. A surrogate-based pattern
 * (`[\uD840-\uD87E][\uDC00-\uDFFF]`) would also match U+2FA20–U+2FBFF, which
 * `NON_LATIN_RE` does not count, and collapsing those would make the two
 * counts disagree. Other surrogate pairs (emoji, symbols) are deliberately
 * left unmatched for the same reason.
 */
const CJK_SURROGATE_HIGH_RE = /[\u{20000}-\u{2FA1F}]/gu;
/**
 * Length of `text` in which each surrogate pair matched by
 * {@link CJK_SURROGATE_HIGH_RE} is counted once instead of twice. All other
 * UTF-16 code units, including surrogate pairs outside that pattern, are
 * counted exactly as `text.length` counts them.
 *
 * @param text - The string to measure.
 * @param nonLatinCount - Number of non-Latin matches already found in `text`;
 *   when it is `0` the surrogate scan is skipped and `text.length` is
 *   returned as-is.
 * @returns The adjusted length.
 */
function countCodePoints(text: string, nonLatinCount: number): number {
  let adjustedLength = text.length;
  if (nonLatinCount !== 0) {
    // Each matched pair spans two UTF-16 units but stands for one character,
    // so drop one unit per match.
    const astralPairs = text.match(CJK_SURROGATE_HIGH_RE);
    adjustedLength -= astralPairs?.length ?? 0;
  }
  return adjustedLength;
}
/**
 * Convert a raw character count into an estimated token count.
 *
 * Negative counts are clamped to zero and the quotient is rounded up to the
 * next whole token.
 *
 * When the source text itself is available, prefer
 * `estimateStringChars(text) / CHARS_PER_TOKEN_ESTIMATE`, which also accounts
 * for non-Latin characters.
 *
 * @param chars - Character count to convert.
 * @returns The estimated number of tokens.
 */
export function estimateTokensFromChars(chars: number): number {
  const clampedChars = Math.max(0, chars);
  const fractionalTokens = clampedChars / CHARS_PER_TOKEN_ESTIMATE;
  return Math.ceil(fractionalTokens);
}
// Messung V0.5 in Prozent C=98 H=97 G=97
// Dauer der Verarbeitung: 0.9 Sekunden
// (vorverarbeitet am 2026-06-10)
// © Formatika GbR, Deutschland