import { Type } from "typebox"; import { SILENT_REPLY_TOKEN } from "../../auto-reply/tokens.js"; import { loadConfig } from "../../config/config.js"; import type { OpenClawConfig } from "../../config/types.openclaw.js"; import { textToSpeech } from "../../tts/tts.js"; import type { GatewayMessageChannel } from "../../utils/message-channel.js"; import type { AnyAgentTool } from "./common.js"; import { ToolInputError, readNumberParam, readStringParam } from "./common.js";
/**
 * Parameter schema for the `tts` tool; it is passed as the tool's `parameters`.
 *
 * Only `text` is required. `channel` and `timeoutMs` are optional.
 */
const TtsToolSchema = Type.Object({
// The text that will be synthesized.
text: Type.String({ description: "Text to convert to speech." }),
// When omitted, the tool falls back to the agent channel it was created with.
channel: Type.Optional(
Type.String({ description: "Optional channel id to pick output format." }),
),
// The schema only enforces a lower bound of 1. readTtsTimeoutMs additionally
// asks for integer parsing and rejects non-positive values at call time.
timeoutMs: Type.Optional(
Type.Number({
description: "Optional provider request timeout in milliseconds.",
minimum: 1,
}),
),
});
/**
 * Reads the optional `timeoutMs` tool argument.
 *
 * The value is fetched through `readNumberParam` with the `integer` and
 * `strict` options enabled (their exact parsing rules live in `./common.js`
 * — presumably they reject non-integer or loosely-typed input; confirm there).
 *
 * @param args - Raw tool-call arguments.
 * @returns The timeout in milliseconds, or `undefined` when the argument was
 *   not supplied.
 * @throws ToolInputError when the supplied value is zero or negative.
 */
function readTtsTimeoutMs(args: Record<string, unknown>): number | undefined {
  const timeoutMs = readNumberParam(args, "timeoutMs", {
    integer: true,
    strict: true,
  });
  if (timeoutMs === undefined) {
    return undefined;
  }
  // Guard at runtime as well: the schema's `minimum: 1` is not guaranteed to
  // have been enforced before the arguments reach this function.
  if (timeoutMs <= 0) {
    throw new ToolInputError("timeoutMs must be a positive integer in milliseconds.");
  }
  return timeoutMs;
}
export function createTtsTool(opts?: {
config?: OpenClawConfig;
agentChannel?: GatewayMessageChannel;
}): AnyAgentTool { return {
label: "TTS",
name: "tts",
displaySummary: "Convert text to speech and return audio.",
description: `Convert text to speech. Audio is delivered automatically from the tool result — reply with ${SILENT_REPLY_TOKEN} after a successful call to avoid duplicate messages.`,
parameters: TtsToolSchema,
execute: async (_toolCallId, args) => { const params = args as Record<string, unknown>; const text = readStringParam(params, "text", { required: true }); const channel = readStringParam(params, "channel"); const timeoutMs = readTtsTimeoutMs(params); const cfg = opts?.config ?? loadConfig(); const result = await textToSpeech({
text,
cfg,
channel: channel ?? opts?.agentChannel,
timeoutMs,
});
if (result.success && result.audioPath) { // Preserve the spoken text in the tool result content so the session // transcript retains what was said across turns. The audio itself is // still delivered via details.media. Sanitize first so a crafted // utterance cannot inject reply directives when the tool output is // rendered in verbose mode. return {
content: [{ type: "text", text: `(spoken) ${sanitizeTranscriptForToolContent(text)}` }],
details: {
audioPath: result.audioPath,
provider: result.provider,
...(timeoutMs !== undefined ? { timeoutMs } : {}),
media: {
mediaUrl: result.audioPath,
trustedLocalMedia: true,
...(result.voiceCompatible ? { audioAsVoice: true } : {}),
},
},
};
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.