import { describe, expect, it, vi } from "vitest" ;
import * as fences from "../markdown/fences.js" ;
import { hasBalancedFences } from "../test-utils/chunk-test-helpers.js" ;
import {
chunkByNewline,
chunkMarkdownText,
chunkMarkdownTextWithMode,
chunkText,
chunkTextWithMode,
resolveChunkMode,
resolveTextChunkLimit,
} from "./chunk.js" ;
/**
 * Asserts that every chunk passes the `hasBalancedFences` check.
 *
 * @param chunks - Chunker output to verify; one assertion is made per chunk.
 */
function expectFencesBalanced(chunks: string[]) {
  chunks.forEach((piece) => {
    expect(hasBalancedFences(piece)).toBe(true);
  });
}
/**
 * Asserts the number of chunks and the exact length of each one.
 *
 * @param chunks - Chunker output under test.
 * @param expectedLengths - Expected `length` of the chunk at the same index.
 */
function expectChunkLengths(chunks: string[], expectedLengths: number[]) {
  expect(chunks).toHaveLength(expectedLengths.length);
  for (let index = 0; index < expectedLengths.length; index += 1) {
    // Optional chaining turns a missing chunk into an ordinary assertion failure.
    expect(chunks[index]?.length).toBe(expectedLengths[index]);
  }
}
/**
 * Asserts that the chunks, joined with single spaces, equal the source text
 * once every whitespace run on both sides is collapsed to one space and the
 * ends are trimmed.
 *
 * @param chunks - Chunker output under test.
 * @param text - The text that was chunked.
 */
function expectNormalizedChunkJoin(chunks: string[], text: string) {
  const collapseWhitespace = (value: string) => value.replace(/\s+/g, " ").trim();
  const rejoined = collapseWhitespace(chunks.join(" "));
  const source = collapseWhitespace(text);
  expect(rejoined).toBe(source);
}
/**
 * Runs `chunkText` for one table row and hands the result to the row's own
 * assertion callback.
 *
 * @param params.text - Input passed to `chunkText`.
 * @param params.limit - Limit passed to `chunkText`.
 * @param params.assert - Callback that receives the produced chunks and the
 *   original text and performs the expectations.
 */
function expectChunkTextCase(params: {
  text: string;
  limit: number;
  assert: (chunks: string[], text: string) => void;
}) {
  const { text, limit } = params;
  const produced = chunkText(text, limit);
  params.assert(produced, text);
}
/**
 * Executes a self-contained scenario callback.
 *
 * Table rows that carry a `run` closure are routed through this helper; any
 * error thrown by the callback propagates to the caller unchanged.
 *
 * @param run - Scenario body to invoke exactly once.
 */
function expectChunkSpecialCase(run: () => void): void {
  run();
}
/** One row of a table-driven chunker test, consumed by `runChunkCases`. */
type ChunkCase = {
// Test title; `runChunkCases` interpolates it through the "$name" template.
name: string;
// Input text handed to the chunker under test.
text: string;
// Limit handed to the chunker under test.
limit: number;
// Exact chunks the chunker must return (compared with `toEqual`).
expected: string[];
};
/**
 * Registers one test per case that checks the chunker's output against the
 * case's expected chunks.
 *
 * @param chunker - Function under test; called with the case's text and limit.
 * @param cases - Rows to register; each row's `name` becomes the test title.
 */
function runChunkCases(chunker: (text: string, limit: number) => string[], cases: ChunkCase[]) {
  it.each(cases)("$name", (testCase) => {
    const actual = chunker(testCase.text, testCase.limit);
    expect(actual).toEqual(testCase.expected);
  });
}
/**
 * Calls a mode-aware chunker and asserts its output equals the expected chunks.
 *
 * @param params.chunker - Function under test.
 * @param params.text - Input text.
 * @param params.limit - Limit passed to the chunker.
 * @param params.mode - Chunk mode passed to the chunker.
 * @param params.expected - Chunks the chunker must return.
 * @param params.name - Optional label passed to `expect` as the failure message.
 */
function expectChunkModeCase(params: {
  chunker: (text: string, limit: number, mode: "length" | "newline") => string[];
  text: string;
  limit: number;
  mode: "length" | "newline";
  expected: readonly string[];
  name?: string;
}) {
  const { text, limit, mode, expected, name } = params;
  const actual = params.chunker(text, limit, mode);
  expect(actual, name).toEqual(expected);
}
/**
 * For each case, chunks the text with `chunkMarkdownText` and asserts that:
 * more than one chunk was produced, every chunk fits the limit, every chunk
 * starts with the expected fence opener and (ignoring trailing whitespace)
 * ends with the expected fence closer, and all chunks have balanced fences.
 *
 * @param cases - Scenarios to check; `name` is passed to `expect` as the
 *   failure message.
 */
function expectMarkdownFenceSplitCases(
  cases: ReadonlyArray<{
    name: string;
    text: string;
    limit: number;
    expectedPrefix: string;
    expectedSuffix: string;
  }>,
) {
  for (const fenceCase of cases) {
    const { name, limit, expectedPrefix, expectedSuffix } = fenceCase;
    const chunks = chunkMarkdownText(fenceCase.text, limit);

    // The input is expected to be long enough to force a split.
    expect(chunks.length, name).toBeGreaterThan(1);

    chunks.forEach((chunk) => {
      expect(chunk.length, name).toBeLessThanOrEqual(limit);
      const opensWithFence = chunk.startsWith(expectedPrefix);
      expect(opensWithFence, name).toBe(true);
      const closesWithFence = chunk.trimEnd().endsWith(expectedSuffix);
      expect(closesWithFence, name).toBe(true);
    });

    expectFencesBalanced(chunks);
  }
}
/**
 * Asserts that no chunk produced by `chunkMarkdownText` consists solely of
 * fence marker lines, i.e. every chunk keeps some real content once its fence
 * lines are removed.
 *
 * @param text - Markdown passed to `chunkMarkdownText`.
 * @param limit - Limit passed to `chunkMarkdownText`.
 */
function expectNoEmptyFencedChunks(text: string, limit: number) {
  // A fence marker line: up to three leading spaces, then three or more
  // backticks or tildes, then an optional info string. The quantifiers must
  // not contain spaces: `{0 ,3 }` is matched as literal text by JavaScript,
  // which would keep every fence line and make the assertion below vacuous.
  const fenceLine = /^( {0,3})(`{3,}|~{3,})(.*)$/;
  const chunks = chunkMarkdownText(text, limit);
  for (const chunk of chunks) {
    const nonFenceLines = chunk.split("\n").filter((line) => !fenceLine.test(line));
    expect(nonFenceLines.join("\n").trim()).not.toBe("");
  }
}
/**
 * Asserts that chunking the text yields more than two chunks while
 * `fences.parseFenceSpans` is invoked exactly once.
 *
 * @param text - Markdown passed to `chunkMarkdownText`.
 * @param limit - Limit passed to `chunkMarkdownText`.
 */
function expectFenceParseOccursOnce(text: string, limit: number) {
  const parseSpy = vi.spyOn(fences, "parseFenceSpans");
  try {
    const chunks = chunkMarkdownText(text, limit);
    expect(chunks.length).toBeGreaterThan(2);
    expect(parseSpy).toHaveBeenCalledTimes(1);
  } finally {
    // Restore even when an expectation throws, so the spy cannot leak into
    // tests that run afterwards.
    parseSpy.mockRestore();
  }
}
/**
 * Shared table of parenthesis-handling cases. The full table is run against
 * `chunkMarkdownText`; only the first entry is run against `chunkText`, so the
 * order of the entries matters.
 */
const parentheticalCases: ChunkCase[] = [
{
// The limit falls inside the parenthetical, yet the expected split is before "(".
name: "keeps parenthetical phrases together" ,
text: "Heads up now (Though now I'm curious)ok" ,
limit: 35 ,
expected: ["Heads up now" , "(Though now I'm curious)ok" ],
},
{
// The expected first chunk ends after the outermost ")" rather than the inner one.
name: "handles nested parentheses" ,
text: "Hello (outer (inner) end) world" ,
limit: 26 ,
expected: ["Hello (outer (inner) end)" , "world" ],
},
{
// A stray ")" with no opener; the expected split is at the following space.
name: "ignores unmatched closing parentheses" ,
text: "Hello) world (ok)" ,
limit: 12 ,
expected: ["Hello)" , "world (ok)" ],
},
];
/**
 * Cases for `chunkMarkdownTextWithMode` in "newline" mode that involve fenced
 * code blocks. Built inside an IIFE so the fixture strings stay private.
 */
const newlineModeFenceCases = (() => {
  // A fenced block that contains a blank line in its body.
  const pythonFence = "```python\ndef my_function():\n x = 1\n\n y = 2\n return x + y\n```";
  // A fenced block far longer than the 40-character limit used below.
  const oversizedFence = `\`\`\`js\n${"const a = 1;\n".repeat(20)}\`\`\``;
  // A fence followed by a paragraph after a single newline (no blank line).
  const fenceThenParagraph = "```js\nconst a = 1;\nconst b = 2;\n```\nAfter";

  return [
    {
      name: "keeps single-newline fence+paragraph together",
      text: fenceThenParagraph,
      limit: 1000,
      expected: [fenceThenParagraph],
    },
    {
      name: "keeps blank lines inside fence together",
      text: pythonFence,
      limit: 1000,
      expected: [pythonFence],
    },
    {
      name: "splits between fence and following paragraph",
      text: `${pythonFence}\n\nAfter`,
      limit: 1000,
      expected: [pythonFence, "After"],
    },
    {
      name: "defers long markdown blocks to markdown chunker",
      text: oversizedFence,
      limit: 40,
      // The expectation is whatever the markdown chunker itself produces.
      expected: chunkMarkdownText(oversizedFence, 40),
    },
  ] as const;
})();
describe("chunkText", () => {
  /**
   * Plain-text chunking scenarios. Each row supplies its own `assert` callback
   * because the expectations differ in shape (exact chunks, lengths only, or a
   * whitespace-normalized round trip).
   */
  const chunkTextCases = [
    {
      name: "keeps multi-line text in one chunk when under limit",
      text: "Line one\n\nLine two\n\nLine three",
      limit: 1600,
      assert: (chunks: string[], text: string) => {
        expect(chunks).toEqual([text]);
      },
    },
    {
      name: "splits only when text exceeds the limit",
      // 100 characters in total, so a limit of 60 leaves a 40-character remainder.
      text: "a".repeat(20).repeat(5),
      limit: 60,
      assert: (chunks: string[], text: string) => {
        expectChunkLengths(chunks, [60, 40]);
        expect(chunks.join("")).toBe(text);
      },
    },
    {
      name: "prefers breaking at a newline before the limit",
      text: "paragraph one line\n\nparagraph two starts here and continues",
      limit: 40,
      assert: (chunks: string[]) => {
        expect(chunks).toEqual(["paragraph one line", "paragraph two starts here and continues"]);
      },
    },
    {
      name: "otherwise breaks at the last whitespace under the limit",
      text: "This is a message that should break nicely near a word boundary.",
      limit: 30,
      assert: (chunks: string[], text: string) => {
        expect(chunks[0]?.length).toBeLessThanOrEqual(30);
        expect(chunks[1]?.length).toBeLessThanOrEqual(30);
        expectNormalizedChunkJoin(chunks, text);
      },
    },
    {
      name: "falls back to a hard break when no whitespace is present",
      text: "Supercalifragilisticexpialidocious",
      limit: 10,
      assert: (chunks: string[]) => {
        expect(chunks).toEqual(["Supercalif", "ragilistic", "expialidoc", "ious"]);
      },
    },
  ] as const;

  it.each(chunkTextCases)("$name", (testCase) => {
    expectChunkTextCase({
      text: testCase.text,
      limit: testCase.limit,
      assert: testCase.assert,
    });
  });

  // Only the first shared parenthetical case is run against the plain-text chunker.
  runChunkCases(chunkText, [parentheticalCases[0]]);
});
describe("resolveTextChunkLimit", () => {
  // Providers expected to resolve to 4000 when no config is supplied.
  const providersWithDefaultLimit = [
    "whatsapp",
    "telegram",
    "slack",
    "signal",
    "imessage",
    "discord",
  ] as const;

  // Config factories: every call returns a fresh object, so each table row
  // still receives its own config instance.
  const telegramLimitCfg = () => ({ channels: { telegram: { textChunkLimit: 1234 } } });
  const telegramAccountsCfg = () => ({
    channels: {
      telegram: {
        textChunkLimit: 2000,
        accounts: {
          default: { textChunkLimit: 1234 },
          primary: { textChunkLimit: 777 },
        },
      },
    },
  });
  const discordAndSlackCfg = () => ({
    channels: {
      discord: { textChunkLimit: 111 },
      slack: { textChunkLimit: 222 },
    },
  });

  const defaultLimitCases = providersWithDefaultLimit.map((provider) => ({
    name: `uses default limit for ${provider}`,
    cfg: undefined,
    provider,
    accountId: undefined,
    options: undefined,
    expected: 4000,
  }));

  it.each([
    ...defaultLimitCases,
    {
      name: "uses fallback limit override when provided",
      cfg: undefined,
      provider: "discord" as const,
      accountId: undefined,
      options: { fallbackLimit: 2000 },
      expected: 2000,
    },
    {
      name: "supports provider overrides for telegram",
      cfg: telegramLimitCfg(),
      provider: "telegram" as const,
      accountId: undefined,
      options: undefined,
      expected: 1234,
    },
    {
      name: "falls back when provider override does not match",
      cfg: telegramLimitCfg(),
      provider: "whatsapp" as const,
      accountId: undefined,
      options: undefined,
      expected: 4000,
    },
    {
      name: "prefers account overrides when provided",
      cfg: telegramAccountsCfg(),
      provider: "telegram" as const,
      accountId: "primary",
      options: undefined,
      expected: 777,
    },
    {
      name: "uses default account override when requested",
      cfg: telegramAccountsCfg(),
      provider: "telegram" as const,
      accountId: "default",
      options: undefined,
      expected: 1234,
    },
    {
      name: "uses the matching provider override for discord",
      cfg: discordAndSlackCfg(),
      provider: "discord" as const,
      accountId: undefined,
      options: undefined,
      expected: 111,
    },
    {
      name: "uses the matching provider override for slack",
      cfg: discordAndSlackCfg(),
      provider: "slack" as const,
      accountId: undefined,
      options: undefined,
      expected: 222,
    },
    {
      name: "falls back when multi-provider override does not match",
      cfg: discordAndSlackCfg(),
      provider: "telegram" as const,
      accountId: undefined,
      options: undefined,
      expected: 4000,
    },
  ] as const)("$name", (testCase) => {
    const { cfg, provider, accountId, options, expected } = testCase;
    // The fixtures are partial configs, hence the cast at the call site.
    const resolved = resolveTextChunkLimit(cfg as never, provider, accountId, options);
    expect(resolved).toBe(expected);
  });
});
describe("chunkMarkdownText", () => {
  // Scenario: a short fence sits between two long paragraphs, so the chunker
  // can break around it and one chunk should be exactly the fence.
  const keepsFenceIntactAtSafeBreak = () => {
    const fence = "```bash\nline1\nline2\n```";
    const text = ["p".repeat(60), fence, "s".repeat(60)].join("\n\n");
    const chunks = chunkMarkdownText(text, 40);
    const fenceSurvived = chunks.some((chunk) => chunk.trimEnd() === fence);
    expect(fenceSurvived).toBe(true);
    expectFencesBalanced(chunks);
  };

  // Scenario: fence bodies longer than the limit, one per fence marker style.
  const splitsInsideEveryFenceStyle = () =>
    expectMarkdownFenceSplitCases([
      {
        name: "backtick fence",
        text: `\`\`\`txt\n${"a".repeat(500)}\n\`\`\``,
        limit: 120,
        expectedPrefix: "```txt\n",
        expectedSuffix: "```",
      },
      {
        name: "tilde fence",
        text: `~~~sh\n${"x".repeat(600)}\n~~~`,
        limit: 140,
        expectedPrefix: "~~~sh\n",
        expectedSuffix: "~~~",
      },
      {
        name: "long backtick fence",
        text: `\`\`\`\`md\n${"y".repeat(600)}\n\`\`\`\``,
        limit: 140,
        expectedPrefix: "````md\n",
        expectedSuffix: "````",
      },
      {
        name: "indented fence",
        text: ` \`\`\`js\n ${"z".repeat(600)}\n \`\`\``,
        limit: 160,
        expectedPrefix: " ```js\n",
        expectedSuffix: " ```",
      },
    ]);

  it.each([
    {
      name: "keeps fenced blocks intact when a safe break exists",
      run: keepsFenceIntactAtSafeBreak,
    },
    {
      name: "handles multiple fence marker styles when splitting inside fences",
      run: splitsInsideEveryFenceStyle,
    },
  ] as const)("$name", ({ run }) => {
    expectChunkSpecialCase(run);
  });

  // The markdown chunker is held to the full shared parenthetical table.
  runChunkCases(chunkMarkdownText, parentheticalCases);

  const neverEmitsEmptyFence = () => {
    expectNoEmptyFencedChunks(`\`\`\`txt\n${"a".repeat(300)}\n\`\`\``, 60);
  };

  // Scenario: a parenthetical of 82 characters cannot fit in a 20-character chunk.
  const hardBreaksOversizedParenthetical = () => {
    const text = `(${"a".repeat(80)})`;
    const chunks = chunkMarkdownText(text, 20);
    expect(chunks[0]?.length).toBe(20);
    expect(chunks.join("")).toBe(text);
  };

  const parsesFenceSpansOnce = () => {
    expectFenceParseOccursOnce(`\`\`\`txt\n${"line\n".repeat(600)}\`\`\``, 80);
  };

  it.each([
    {
      name: "never produces an empty fenced chunk when splitting",
      run: neverEmitsEmptyFence,
    },
    {
      name: "hard-breaks when a parenthetical exceeds the limit",
      run: hardBreaksOversizedParenthetical,
    },
    {
      name: "parses fence spans once for long fenced payloads",
      run: parsesFenceSpansOnce,
    },
  ] as const)("$name", ({ run }) => {
    expectChunkSpecialCase(run);
  });
});
describe("chunkByNewline", () => {
  // Exact-output cases; `options` is only present on rows that need it.
  const newlineCases = [
    {
      name: "splits text on newlines",
      text: "Line one\nLine two\nLine three",
      limit: 1000,
      expected: ["Line one", "Line two", "Line three"],
    },
    {
      name: "preserves blank lines by folding into the next chunk",
      text: "Line one\n\n\nLine two\n\nLine three",
      limit: 1000,
      expected: ["Line one", "\n\nLine two", "\nLine three"],
    },
    {
      name: "trims whitespace from lines",
      text: " Line one \n Line two ",
      limit: 1000,
      expected: ["Line one", "Line two"],
    },
    {
      name: "preserves leading blank lines on the first chunk",
      text: "\n\nLine one\nLine two",
      limit: 1000,
      expected: ["\n\nLine one", "Line two"],
    },
    {
      name: "preserves trailing blank lines on the last chunk",
      text: "Line one\n\n",
      limit: 1000,
      expected: ["Line one\n\n"],
    },
    {
      name: "keeps whitespace when trimLines is false",
      text: " indented line \nNext",
      limit: 1000,
      options: { trimLines: false },
      expected: [" indented line ", "Next"],
    },
  ] as const;

  it.each(newlineCases)("$name", (testCase) => {
    const { text, limit, options, expected } = testCase;
    const actual = chunkByNewline(text, limit, options);
    expect(actual).toEqual(expected);
  });

  // Scenario: the 50-character middle line exceeds the limit of 20 and is
  // expected to come back as pieces of 20, 20 and 10 characters.
  const fallsBackToLengthSplit = () => {
    const text = `Short line\n${"a".repeat(50)}\nAnother short`;
    const chunks = chunkByNewline(text, 20);
    expect(chunks[0]).toBe("Short line");
    expectChunkLengths(chunks.slice(1, 4), [20, 20, 10]);
    expect(chunks[4]).toBe("Another short");
  };

  const keepsLongLineWhole = () => {
    const text = "a".repeat(50);
    expect(chunkByNewline(text, 20, { splitLongLines: false })).toEqual([text]);
  };

  it.each([
    {
      name: "falls back to length-based for long lines",
      run: fallsBackToLengthSplit,
    },
    {
      name: "does not split long lines when splitLongLines is false",
      run: keepsLongLineWhole,
    },
  ] as const)("$name", ({ run }) => {
    expectChunkSpecialCase(run);
  });

  // Empty and whitespace-only inputs are expected to yield no chunks.
  const blankInputs = ["", " \n\n "] as const;
  it.each(blankInputs)("returns empty array for input %j", (text) => {
    expect(chunkByNewline(text, 100)).toEqual([]);
  });
});
describe("chunkTextWithMode", () => {
  // Rows differ only in mode and in whether the input contains a blank line.
  const modeCases = [
    {
      name: "length mode",
      text: "Line one\nLine two",
      mode: "length" as const,
      expected: ["Line one\nLine two"],
    },
    {
      name: "newline mode (single paragraph)",
      text: "Line one\nLine two",
      mode: "newline" as const,
      expected: ["Line one\nLine two"],
    },
    {
      name: "newline mode (blank-line split)",
      text: "Para one\n\nPara two",
      mode: "newline" as const,
      expected: ["Para one", "Para two"],
    },
  ] as const;

  it.each(modeCases)("applies mode-specific chunking behavior: $name", (testCase) => {
    // Every row uses the same chunker and a limit far above the input length.
    expectChunkModeCase({ chunker: chunkTextWithMode, limit: 1000, ...testCase });
  });
});
describe("chunkMarkdownTextWithMode", () => {
  const modeCases = [
    {
      name: "length mode uses markdown-aware chunker",
      text: "Line one\nLine two",
      mode: "length" as const,
      // Length mode is expected to match the markdown chunker's own output.
      expected: chunkMarkdownText("Line one\nLine two", 1000),
    },
    {
      name: "newline mode keeps single paragraph",
      text: "Line one\nLine two",
      mode: "newline" as const,
      expected: ["Line one\nLine two"],
    },
    {
      name: "newline mode splits by blank line",
      text: "Para one\n\nPara two",
      mode: "newline" as const,
      expected: ["Para one", "Para two"],
    },
  ] as const;

  it.each(modeCases)("applies markdown/newline mode behavior: $name", (testCase) => {
    expectChunkModeCase({ chunker: chunkMarkdownTextWithMode, limit: 1000, ...testCase });
  });

  it.each(newlineModeFenceCases)("handles newline mode fence splitting rules: $name", (fenceCase) => {
    const { text, limit, expected, name } = fenceCase;
    const actual = chunkMarkdownTextWithMode(text, limit, "newline");
    expect(actual, name).toEqual(expected);
  });
});
describe("resolveChunkMode", () => {
  // Provider-level setting only: slack is configured for "newline".
  const slackNewlineCfg = { channels: { slack: { chunkMode: "newline" as const } } };

  // Provider-level "length" with a "newline" setting on the `primary` account.
  const slackAccountCfg = {
    channels: {
      slack: {
        chunkMode: "length" as const,
        accounts: {
          primary: { chunkMode: "newline" as const },
        },
      },
    },
  };

  const resolutionCases = [
    { cfg: undefined, provider: "telegram", accountId: undefined, expected: "length" },
    { cfg: {}, provider: "discord", accountId: undefined, expected: "length" },
    { cfg: undefined, provider: "bluebubbles", accountId: undefined, expected: "length" },
    { cfg: slackNewlineCfg, provider: "__internal__", accountId: undefined, expected: "length" },
    { cfg: slackNewlineCfg, provider: "slack", accountId: undefined, expected: "newline" },
    { cfg: slackNewlineCfg, provider: "discord", accountId: undefined, expected: "length" },
    { cfg: slackAccountCfg, provider: "slack", accountId: "primary", expected: "newline" },
    { cfg: slackAccountCfg, provider: "slack", accountId: "other", expected: "length" },
  ] as const;

  it.each(resolutionCases)(
    "resolves default/provider/account/internal chunk mode for $provider $accountId",
    (testCase) => {
      const { cfg, provider, accountId, expected } = testCase;
      // The fixtures are partial configs, hence the cast at the call site.
      const resolved = resolveChunkMode(cfg as never, provider, accountId);
      expect(resolved).toBe(expected);
    },
  );
});
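// Trailing footer left by an external processing tool (not part of the test suite);
// kept as a comment so the file remains valid TypeScript:
//   Measurement V0.5 in percent: C=99 H=97 G=97
//   Processing time: 0.7 seconds
//   © Formatika GbR, Germany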