import { loadConfig, readConfigFileSnapshot } from "../../config/config.js" ;
import { redactConfigObject } from "../../config/redact-snapshot.js" ;
import {
buildTalkConfigResponse,
normalizeTalkSection,
resolveActiveTalkProviderConfig,
} from "../../config/talk.js" ;
import type { TalkConfigResponse, TalkProviderConfig } from "../../config/types.gateway.js" ;
import type { OpenClawConfig, TtsConfig, TtsProviderConfigMap } from "../../config/types.js" ;
import {
REALTIME_VOICE_AGENT_CONSULT_TOOL,
REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME,
} from "../../realtime-voice/agent-consult-tool.js" ;
import { getRealtimeVoiceProvider } from "../../realtime-voice/provider-registry.js" ;
import { resolveConfiguredRealtimeVoiceProvider } from "../../realtime-voice/provider-resolver.js" ;
import type { RealtimeVoiceProviderConfig } from "../../realtime-voice/provider-types.js" ;
import {
normalizeLowercaseStringOrEmpty,
normalizeOptionalLowercaseString,
normalizeOptionalString,
} from "../../shared/string-coerce.js" ;
import { canonicalizeSpeechProviderId, getSpeechProvider } from "../../tts/provider-registry.js" ;
import { synthesizeSpeech, type TtsDirectiveOverrides } from "../../tts/tts.js" ;
import { ADMIN_SCOPE, TALK_SECRETS_SCOPE } from "../operator-scopes.js" ;
import {
ErrorCodes,
errorShape,
formatValidationErrors,
type TalkSpeakParams,
validateTalkConfigParams,
validateTalkModeParams,
validateTalkRealtimeSessionParams,
validateTalkSpeakParams,
} from "../protocol/index.js" ;
import { formatForLog } from "../ws-log.js" ;
import { asRecord } from "./record-shared.js" ;
import type { GatewayRequestHandlers } from "./types.js" ;
type TalkSpeakReason =
| "talk_unconfigured"
| "talk_provider_unsupported"
| "method_unavailable"
| "synthesis_failed"
| "invalid_audio_result" ;
type TalkSpeakErrorDetails = {
reason: TalkSpeakReason;
fallbackEligible: boolean ;
};
function canReadTalkSecrets(client: { connect?: { scopes?: string[] } } | null ): boolean {
const scopes = Array.isArray(client?.connect?.scopes) ? client.connect.scopes : [];
return scopes.includes(ADMIN_SCOPE) || scopes.includes(TALK_SECRETS_SCOPE);
}
function asStringRecord(value: unknown): Record<string, string> | undefined {
const record = asRecord(value);
if (!record) {
return undefined;
}
const next: Record<string, string> = {};
for (const [key, entryValue] of Object.entries(record)) {
if (typeof entryValue === "string" ) {
next[key] = entryValue;
}
}
return Object.keys(next).length > 0 ? next : undefined;
}
function normalizeAliasKey(value: string): string {
return normalizeLowercaseStringOrEmpty(value);
}
function resolveTalkVoiceId(
providerConfig: TalkProviderConfig,
requested: string | undefined,
): string | undefined {
if (!requested) {
return undefined;
}
const aliases = asStringRecord(providerConfig.voiceAliases);
if (!aliases) {
return requested;
}
const normalizedRequested = normalizeAliasKey(requested);
for (const [alias, voiceId] of Object.entries(aliases)) {
if (normalizeAliasKey(alias) === normalizedRequested) {
return voiceId;
}
}
return requested;
}
function buildTalkTtsConfig(
config: OpenClawConfig,
):
| { cfg: OpenClawConfig; provider: string; providerConfig: TalkProviderConfig }
| { error: string; reason: TalkSpeakReason } {
const resolved = resolveActiveTalkProviderConfig(config.talk);
const provider = canonicalizeSpeechProviderId(resolved?.provider, config);
if (!resolved || !provider) {
return {
error: "talk.speak unavailable: talk provider not configured" ,
reason: "talk_unconfigured" ,
};
}
const speechProvider = getSpeechProvider(provider, config);
if (!speechProvider) {
return {
error: `talk.speak unavailable: speech provider "${provider}" does not support Talk mode`,
reason: "talk_provider_unsupported" ,
};
}
const baseTts = config.messages?.tts ?? {};
const providerConfig = resolved.config;
const resolvedProviderConfig =
speechProvider.resolveTalkConfig?.({
cfg: config,
baseTtsConfig: baseTts as Record<string, unknown>,
talkProviderConfig: providerConfig,
timeoutMs: baseTts.timeoutMs ?? 30 _000 ,
}) ?? providerConfig;
const talkTts: TtsConfig = {
...baseTts,
auto: "always" ,
provider,
providers: {
...((asRecord(baseTts.providers) ?? {}) as TtsProviderConfigMap),
[provider]: resolvedProviderConfig,
},
};
return {
provider,
providerConfig,
cfg: {
...config,
messages: {
...config.messages,
tts: talkTts,
},
},
};
}
function getRecord(value: unknown): Record<string, unknown> | undefined {
return asRecord(value) ?? undefined;
}
function getVoiceCallRealtimeConfig(config: OpenClawConfig): {
provider?: string;
providers?: Record<string, RealtimeVoiceProviderConfig>;
} {
const plugins = getRecord(config.plugins);
const entries = getRecord(plugins?.entries);
const voiceCall = getRecord(entries?.["voice-call" ]);
const pluginConfig = getRecord(voiceCall?.config);
const realtime = getRecord(pluginConfig?.realtime);
const providersRaw = getRecord(realtime?.providers);
const providers: Record<string, RealtimeVoiceProviderConfig> = {};
if (providersRaw) {
for (const [providerId, providerConfig] of Object.entries(providersRaw)) {
const record = getRecord(providerConfig);
if (record) {
providers[providerId] = record;
}
}
}
return {
provider: normalizeOptionalString(realtime?.provider),
providers: Object.keys(providers).length > 0 ? providers : undefined,
};
}
function buildTalkRealtimeConfig(config: OpenClawConfig, requestedProvider?: string) {
const voiceCallRealtime = getVoiceCallRealtimeConfig(config);
const talkProviderConfigs = config.talk?.providers as
| Record<string, RealtimeVoiceProviderConfig>
| undefined;
const talkProvider = normalizeOptionalString(config.talk?.provider);
const talkProviderSupportsRealtime = talkProvider
? Boolean (getRealtimeVoiceProvider(talkProvider, config))
: false ;
const provider =
normalizeOptionalString(requestedProvider) ??
(talkProviderSupportsRealtime ? talkProvider : undefined) ??
voiceCallRealtime.provider;
return {
provider,
providers: {
...voiceCallRealtime.providers,
...talkProviderConfigs,
},
};
}
function buildRealtimeInstructions(): string {
return `You are OpenClaw's realtime voice interface. Keep spoken replies concise. If the user asks for code, repository state, tools, files, current OpenClaw context, or deeper reasoning, call ${REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME} and then summarize the result naturally.`;
}
function isFallbackEligibleTalkReason(reason: TalkSpeakReason): boolean {
return (
reason === "talk_unconfigured" ||
reason === "talk_provider_unsupported" ||
reason === "method_unavailable"
);
}
function talkSpeakError(reason: TalkSpeakReason, message: string) {
const details: TalkSpeakErrorDetails = {
reason,
fallbackEligible: isFallbackEligibleTalkReason(reason),
};
return errorShape(ErrorCodes.UNAVAILABLE, message, { details });
}
function resolveTalkSpeed(params: TalkSpeakParams): number | undefined {
if (typeof params.speed === "number" ) {
return params.speed;
}
if (typeof params.rateWpm !== "number" || params.rateWpm <= 0 ) {
return undefined;
}
const resolved = params.rateWpm / 175 ;
if (resolved <= 0 .5 || resolved >= 2 .0 ) {
return undefined;
}
return resolved;
}
function buildTalkSpeakOverrides(
provider: string,
providerConfig: TalkProviderConfig,
config: OpenClawConfig,
params: TalkSpeakParams,
): TtsDirectiveOverrides {
const speechProvider = getSpeechProvider(provider, config);
if (!speechProvider?.resolveTalkOverrides) {
return { provider };
}
const resolvedSpeed = resolveTalkSpeed(params);
const resolvedVoiceId = resolveTalkVoiceId(
providerConfig,
normalizeOptionalString(params.voiceId),
);
const providerOverrides = speechProvider.resolveTalkOverrides({
talkProviderConfig: providerConfig,
params: {
...params,
...(resolvedVoiceId == null ? {} : { voiceId: resolvedVoiceId }),
...(resolvedSpeed == null ? {} : { speed: resolvedSpeed }),
},
});
if (!providerOverrides || Object.keys(providerOverrides).length === 0 ) {
return { provider };
}
return {
provider,
providerOverrides: {
[provider]: providerOverrides,
},
};
}
function inferMimeType(
outputFormat: string | undefined,
fileExtension: string | undefined,
): string | undefined {
const normalizedOutput = normalizeOptionalLowercaseString(outputFormat);
const normalizedExtension = normalizeOptionalLowercaseString(fileExtension);
if (
normalizedOutput === "mp3" ||
normalizedOutput?.startsWith("mp3_" ) ||
normalizedOutput?.endsWith("-mp3" ) ||
normalizedExtension === ".mp3"
) {
return "audio/mpeg" ;
}
if (
normalizedOutput === "opus" ||
normalizedOutput?.startsWith("opus_" ) ||
normalizedExtension === ".opus" ||
normalizedExtension === ".ogg"
) {
return "audio/ogg" ;
}
if (normalizedOutput?.endsWith("-wav" ) || normalizedExtension === ".wav" ) {
return "audio/wav" ;
}
if (normalizedOutput?.endsWith("-webm" ) || normalizedExtension === ".webm" ) {
return "audio/webm" ;
}
return undefined;
}
function resolveTalkResponseFromConfig(params: {
includeSecrets: boolean ;
sourceConfig: OpenClawConfig;
runtimeConfig: OpenClawConfig;
}): TalkConfigResponse | undefined {
const normalizedTalk = normalizeTalkSection(params.sourceConfig.talk);
if (!normalizedTalk) {
return undefined;
}
const payload = buildTalkConfigResponse(normalizedTalk);
if (!payload) {
return undefined;
}
if (params.includeSecrets) {
return payload;
}
const sourceResolved = resolveActiveTalkProviderConfig(normalizedTalk);
const runtimeResolved = resolveActiveTalkProviderConfig(params.runtimeConfig.talk);
const activeProviderId = sourceResolved?.provider ?? runtimeResolved?.provider;
const provider = canonicalizeSpeechProviderId(activeProviderId, params.runtimeConfig);
if (!provider) {
return payload;
}
const speechProvider = getSpeechProvider(provider, params.runtimeConfig);
const sourceBaseTts = asRecord(params.sourceConfig.messages?.tts) ?? {};
const runtimeBaseTts = asRecord(params.runtimeConfig.messages?.tts) ?? {};
const talkProviderConfig = sourceResolved?.config ?? runtimeResolved?.config ?? {};
const resolvedConfig =
speechProvider?.resolveTalkConfig?.({
cfg: params.runtimeConfig,
baseTtsConfig: Object.keys(sourceBaseTts).length > 0 ? sourceBaseTts : runtimeBaseTts,
talkProviderConfig,
timeoutMs:
typeof sourceBaseTts.timeoutMs === "number"
? sourceBaseTts.timeoutMs
: typeof runtimeBaseTts.timeoutMs === "number"
? runtimeBaseTts.timeoutMs
: 30 _000 ,
}) ?? talkProviderConfig;
return {
...payload,
provider,
resolved: {
provider,
config: resolvedConfig,
},
};
}
export const talkHandlers: GatewayRequestHandlers = {
"talk.config" : async ({ params, respond, client }) => {
if (!validateTalkConfigParams(params)) {
respond(
false ,
undefined,
errorShape(
ErrorCodes.INVALID_REQUEST,
`invalid talk.config params: ${formatValidationErrors(validateTalkConfigParams.errors)}`,
),
);
return ;
}
const includeSecrets = Boolean ((params as { includeSecrets?: boolean }).includeSecrets);
if (includeSecrets && !canReadTalkSecrets(client)) {
respond(
false ,
undefined,
errorShape(ErrorCodes.INVALID_REQUEST, `missing scope: ${TALK_SECRETS_SCOPE}`),
);
return ;
}
const snapshot = await readConfigFileSnapshot();
const runtimeConfig = loadConfig();
const configPayload: Record<string, unknown> = {};
const talk = resolveTalkResponseFromConfig({
includeSecrets,
sourceConfig: snapshot.config,
runtimeConfig,
});
if (talk) {
configPayload.talk = includeSecrets ? talk : redactConfigObject(talk);
}
const sessionMainKey = snapshot.config.session?.mainKey;
if (typeof sessionMainKey === "string" ) {
configPayload.session = { mainKey: sessionMainKey };
}
const seamColor = snapshot.config.ui?.seamColor;
if (typeof seamColor === "string" ) {
configPayload.ui = { seamColor };
}
respond(true , { config: configPayload }, undefined);
},
"talk.realtime.session" : async ({ params, respond }) => {
if (!validateTalkRealtimeSessionParams(params)) {
respond(
false ,
undefined,
errorShape(
ErrorCodes.INVALID_REQUEST,
`invalid talk.realtime.session params: ${formatValidationErrors(validateTalkRealtimeSessionParams.errors)}`,
),
);
return ;
}
const typedParams = params as {
provider?: string;
model?: string;
voice?: string;
};
try {
const runtimeConfig = loadConfig();
const realtimeConfig = buildTalkRealtimeConfig(runtimeConfig, typedParams.provider);
const resolution = resolveConfiguredRealtimeVoiceProvider({
configuredProviderId: realtimeConfig.provider,
providerConfigs: realtimeConfig.providers,
cfg: runtimeConfig,
cfgForResolve: runtimeConfig,
noRegisteredProviderMessage: "No realtime voice provider registered" ,
});
if (!resolution.provider.createBrowserSession) {
respond(
false ,
undefined,
errorShape(
ErrorCodes.UNAVAILABLE,
`Realtime voice provider "${resolution.provider.id}" does not support browser WebRTC sessions`,
),
);
return ;
}
const session = await resolution.provider.createBrowserSession({
providerConfig: resolution.providerConfig,
instructions: buildRealtimeInstructions(),
tools: [REALTIME_VOICE_AGENT_CONSULT_TOOL],
model: normalizeOptionalString(typedParams.model),
voice: normalizeOptionalString(typedParams.voice),
});
respond(
true ,
{
provider: session.provider,
clientSecret: session.clientSecret,
...(session.model ? { model: session.model } : {}),
...(session.voice ? { voice: session.voice } : {}),
...(typeof session.expiresAt === "number" ? { expiresAt: session.expiresAt } : {}),
},
undefined,
);
} catch (err) {
respond(false , undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
}
},
"talk.speak" : async ({ params, respond }) => {
if (!validateTalkSpeakParams(params)) {
respond(
false ,
undefined,
errorShape(
ErrorCodes.INVALID_REQUEST,
`invalid talk.speak params: ${formatValidationErrors(validateTalkSpeakParams.errors)}`,
),
);
return ;
}
const typedParams = params;
const text = normalizeOptionalString(typedParams.text);
if (!text) {
respond(false , undefined, errorShape(ErrorCodes.INVALID_REQUEST, "talk.speak requires text" ));
return ;
}
if (
typedParams.speed == null &&
typedParams.rateWpm != null &&
resolveTalkSpeed(typedParams) == null
) {
respond(
false ,
undefined,
errorShape(
ErrorCodes.INVALID_REQUEST,
`invalid talk.speak params: rateWpm must resolve to speed between 0 .5 and 2 .0 `,
),
);
return ;
}
try {
const runtimeConfig = loadConfig();
const setup = buildTalkTtsConfig(runtimeConfig);
if ("error" in setup) {
respond(false , undefined, talkSpeakError(setup.reason, setup.error));
return ;
}
const overrides = buildTalkSpeakOverrides(
setup.provider,
setup.providerConfig,
runtimeConfig,
typedParams,
);
const result = await synthesizeSpeech({
text,
cfg: setup.cfg,
overrides,
disableFallback: true ,
});
if (!result.success || !result.audioBuffer) {
respond(
false ,
undefined,
talkSpeakError("synthesis_failed" , result.error ?? "talk synthesis failed" ),
);
return ;
}
if ((result.provider ?? setup.provider).trim().length === 0 ) {
respond(
false ,
undefined,
talkSpeakError("invalid_audio_result" , "talk synthesis returned empty provider" ),
);
return ;
}
if (result.audioBuffer.length === 0 ) {
respond(
false ,
undefined,
talkSpeakError("invalid_audio_result" , "talk synthesis returned empty audio" ),
);
return ;
}
respond(
true ,
{
audioBase64: result.audioBuffer.toString("base64" ),
provider: result.provider ?? setup.provider,
outputFormat: result.outputFormat,
voiceCompatible: result.voiceCompatible,
mimeType: inferMimeType(result.outputFormat, result.fileExtension),
fileExtension: result.fileExtension,
},
undefined,
);
} catch (err) {
respond(false , undefined, talkSpeakError("synthesis_failed" , formatForLog(err)));
}
},
"talk.mode" : ({ params, respond, context, client, isWebchatConnect }) => {
if (client && isWebchatConnect(client.connect) && !context.hasConnectedMobileNode()) {
respond(
false ,
undefined,
errorShape(ErrorCodes.UNAVAILABLE, "talk disabled: no connected iOS/Android nodes" ),
);
return ;
}
if (!validateTalkModeParams(params)) {
respond(
false ,
undefined,
errorShape(
ErrorCodes.INVALID_REQUEST,
`invalid talk.mode params: ${formatValidationErrors(validateTalkModeParams.errors)}`,
),
);
return ;
}
const payload = {
enabled: (params as { enabled: boolean }).enabled,
phase: (params as { phase?: string }).phase ?? null ,
ts: Date.now(),
};
context.broadcast("talk.mode" , payload, { dropIfSlow: true });
respond(true , payload, undefined);
},
};
Messung V0.5 in Prozent C=100 H=98 G=98
¤ Dauer der Verarbeitung: 0.10 Sekunden
(vorverarbeitet am 2026-06-06)
¤
*© Formatika GbR, Deutschland