From 98bcc543f5ff838456d68919f3420607a095680d Mon Sep 17 00:00:00 2001 From: Johannes Kresner Date: Wed, 8 Apr 2026 20:13:36 +0200 Subject: [PATCH] feat(vela): mock push-to-talk transcript updates --- apps/vela-gateway/src/index.js | 22 +++++++ .../test/websocket-session.test.js | 63 +++++++++++++++++++ apps/vela-ui/e2e/voice-session.spec.js | 10 +++ apps/vela-ui/src/lib/VoiceSessionShell.svelte | 25 +++++++- apps/vela-ui/tests/voice-session.test.js | 27 ++++++++ docs/architecture.md | 6 +- docs/backlog.md | 4 +- docs/protocol.md | 28 ++++++++- 8 files changed, 179 insertions(+), 6 deletions(-) diff --git a/apps/vela-gateway/src/index.js b/apps/vela-gateway/src/index.js index 5ee489f..6f82770 100644 --- a/apps/vela-gateway/src/index.js +++ b/apps/vela-gateway/src/index.js @@ -14,6 +14,22 @@ const WEBSOCKET_GUID = '258EAFA5-E914-47DA-95CA-C5AB0DC85B11'; const MOCKED_USER_TRANSCRIPT = '[mocked user] What is the current mocked vertical slice?'; const MOCKED_ASSISTANT_RESPONSE = '[mocked assistant] This is a deterministic mocked response from the gateway vertical slice.'; +function createPlaceholderPartialTranscript(audioChunkCount) { + return audioChunkCount === 1 + ? '[mocked partial] Placeholder push-to-talk transcript in progress.' + : `[mocked partial] Placeholder push-to-talk transcript in progress (${audioChunkCount} chunks).`; +} + +function createPlaceholderFinalTranscript(audioChunkCount) { + if (audioChunkCount === 0) { + return '[mocked final] Placeholder push-to-talk transcript completed without appended audio.'; + } + + return audioChunkCount === 1 + ? '[mocked final] Placeholder push-to-talk transcript completed from 1 appended chunk.' + : `[mocked final] Placeholder push-to-talk transcript completed from ${audioChunkCount} appended chunks.`; +} + function createSessionRecord() { return { id: crypto.randomUUID(), @@ -238,6 +254,9 @@ function handleClientMessage(socket, session, rawMessage) { session.audioChunkCount += 1; updateSessionState(socket, session, 'listening'); + sendSocketMessage(socket, 'transcript.partial', { + text: createPlaceholderPartialTranscript(session.audioChunkCount) + }); break; case 'input_audio.commit': if (session.mockedTurnInFlight) { @@ -245,6 +264,9 @@ function handleClientMessage(socket, session, rawMessage) { break; } + sendSocketMessage(socket, 'transcript.final', { + text: createPlaceholderFinalTranscript(session.audioChunkCount) + }); session.audioChunkCount = 0; updateSessionState(socket, session, 'idle'); break; diff --git a/apps/vela-gateway/test/websocket-session.test.js b/apps/vela-gateway/test/websocket-session.test.js index 47cb43b..d829146 100644 --- a/apps/vela-gateway/test/websocket-session.test.js +++ b/apps/vela-gateway/test/websocket-session.test.js @@ -293,8 +293,16 @@ test('websocket handles valid and invalid client messages safely', async () => { type: 'session.state', payload: { value: 'listening' } }); + assert.deepEqual(await client.nextMessage(), { + type: 'transcript.partial', + payload: { text: '[mocked partial] Placeholder push-to-talk transcript in progress.' } + }); client.sendJson({ type: 'input_audio.commit', payload: {} }); + assert.deepEqual(await client.nextMessage(), { + type: 'transcript.final', + payload: { text: '[mocked final] Placeholder push-to-talk transcript completed from 1 appended chunk.' } + }); assert.deepEqual(await client.nextMessage(), { type: 'session.state', payload: { value: 'idle' } @@ -340,8 +348,16 @@ test('websocket accepts a placeholder input cycle before a mocked turn on the sa type: 'session.state', payload: { value: 'listening' } }); + assert.deepEqual(await client.nextMessage(), { + type: 'transcript.partial', + payload: { text: '[mocked partial] Placeholder push-to-talk transcript in progress.' } + }); client.sendJson({ type: 'input_audio.commit', payload: {} }); + assert.deepEqual(await client.nextMessage(), { + type: 'transcript.final', + payload: { text: '[mocked final] Placeholder push-to-talk transcript completed from 1 appended chunk.' } + }); assert.deepEqual(await client.nextMessage(), { type: 'session.state', payload: { value: 'idle' } @@ -359,6 +375,53 @@ test('websocket accepts a placeholder input cycle before a mocked turn on the sa } }); +test('websocket emits deterministic partials for repeated appends and a deterministic final for commit without append', async () => { + const server = await startServer(); + + try { + const client = await connectWebSocket(server.port); + await client.nextMessage(); + await client.nextMessage(); + + client.sendJson({ type: 'input_audio.append', payload: { chunk: 'chunk-1' } }); + assert.deepEqual(await client.nextMessage(), { + type: 'session.state', + payload: { value: 'listening' } + }); + assert.deepEqual(await client.nextMessage(), { + type: 'transcript.partial', + payload: { text: '[mocked partial] Placeholder push-to-talk transcript in progress.' } + }); + + client.sendJson({ type: 'input_audio.append', payload: { chunk: 'chunk-2' } }); + assert.deepEqual(await client.nextMessage(), { + type: 'transcript.partial', + payload: { text: '[mocked partial] Placeholder push-to-talk transcript in progress (2 chunks).' } + }); + + client.sendJson({ type: 'input_audio.commit', payload: {} }); + assert.deepEqual(await client.nextMessage(), { + type: 'transcript.final', + payload: { text: '[mocked final] Placeholder push-to-talk transcript completed from 2 appended chunks.' } + }); + assert.deepEqual(await client.nextMessage(), { + type: 'session.state', + payload: { value: 'idle' } + }); + + client.sendJson({ type: 'input_audio.commit', payload: {} }); + assert.deepEqual(await client.nextMessage(), { + type: 'transcript.final', + payload: { text: '[mocked final] Placeholder push-to-talk transcript completed without appended audio.' } + }); + await assert.rejects(() => client.nextMessage(150), /timed out waiting for websocket message/); + + await client.close(); + } finally { + await server.close(); + } +}); + test('websocket mocked turn emits deterministic transcript and response events in order', async () => { const server = await startServer(); diff --git a/apps/vela-ui/e2e/voice-session.spec.js b/apps/vela-ui/e2e/voice-session.spec.js index 0cced90..b54c20d 100644 --- a/apps/vela-ui/e2e/voice-session.spec.js +++ b/apps/vela-ui/e2e/voice-session.spec.js @@ -92,10 +92,20 @@ test('voice session shell supports a placeholder mic-control cycle before anothe await page.getByTestId('mic-control-button').dispatchEvent('mousedown'); await expect(page.getByTestId('mic-control-status')).toHaveText('holding'); await expect(page.getByTestId('gateway-session-state')).toHaveText('listening'); + await expect(page.getByTestId('partial-transcript')).toHaveText( + '[mocked partial] Placeholder push-to-talk transcript in progress.' + ); + await expect(page.getByTestId('user-transcript')).toHaveText( + '[mocked partial] Placeholder push-to-talk transcript in progress.' + ); await expect(page.getByTestId('mocked-turn-button')).toBeDisabled(); await page.getByTestId('mic-control-button').dispatchEvent('mouseup'); await expect(page.getByTestId('mic-control-status')).toHaveText('idle'); + await expect(page.getByTestId('user-transcript')).toHaveText( + '[mocked final] Placeholder push-to-talk transcript completed from 1 appended chunk.' + ); + await expect(page.getByTestId('partial-transcript')).toHaveText('none'); await expect(page.getByTestId('gateway-session-state')).toHaveText('idle'); await expect(page.getByTestId('mocked-turn-button')).toBeEnabled(); diff --git a/apps/vela-ui/src/lib/VoiceSessionShell.svelte b/apps/vela-ui/src/lib/VoiceSessionShell.svelte index b722239..e29d67b 100644 --- a/apps/vela-ui/src/lib/VoiceSessionShell.svelte +++ b/apps/vela-ui/src/lib/VoiceSessionShell.svelte @@ -50,6 +50,7 @@ let socket = null; let connectionAttempts = 0; let mockedUserTranscript = 'none'; + let inProgressPartialTranscript = 'none'; let mockedAssistantResponse = 'none'; let mockedTurnInFlight = false; let mockedConversationRenderOrder = []; @@ -92,6 +93,7 @@ sessionReadyReceived = false; lastServerEvent = 'none'; mockedUserTranscript = 'none'; + inProgressPartialTranscript = 'none'; mockedAssistantResponse = 'none'; mockedTurnInFlight = false; mockedConversationRenderOrder = []; @@ -145,6 +147,7 @@ } mockedUserTranscript = 'waiting for mocked transcript…'; + inProgressPartialTranscript = 'none'; mockedAssistantResponse = 'waiting for mocked response…'; mockedTurnInFlight = true; lastError = 'none'; @@ -237,9 +240,23 @@ } if (message.type === 'transcript.final') { + inProgressPartialTranscript = 'none'; mockedUserTranscript = message.payload.text; - mockedAssistantResponse = '…'; - mockedConversationRenderOrder = [...mockedConversationRenderOrder, 'transcript']; + if (mockedTurnInFlight) { + mockedAssistantResponse = '…'; + } + if (!mockedConversationRenderOrder.includes('transcript')) { + mockedConversationRenderOrder = [...mockedConversationRenderOrder, 'transcript']; + } + return; + } + + if (message.type === 'transcript.partial') { + inProgressPartialTranscript = message.payload.text; + mockedUserTranscript = message.payload.text; + if (!mockedConversationRenderOrder.includes('transcript')) { + mockedConversationRenderOrder = [...mockedConversationRenderOrder, 'transcript']; + } return; } @@ -395,6 +412,10 @@ Mocked user transcript

{mockedUserTranscript}

+
+ In-progress partial transcript +

{inProgressPartialTranscript}

+
Mocked assistant response

{mockedAssistantResponse}

diff --git a/apps/vela-ui/tests/voice-session.test.js b/apps/vela-ui/tests/voice-session.test.js index 4ea9253..de457f9 100644 --- a/apps/vela-ui/tests/voice-session.test.js +++ b/apps/vela-ui/tests/voice-session.test.js @@ -123,10 +123,21 @@ describe('voice session shell', () => { expect(getByTestId('mic-control-status').textContent).toBe('holding'); socket.message(createMessageEnvelope('session.state', { value: 'listening' })); + socket.message( + createMessageEnvelope('transcript.partial', { + text: '[mocked partial] Placeholder push-to-talk transcript in progress.' + }) + ); await waitFor(() => { expect(getByTestId('gateway-session-state').textContent).toBe('listening'); expect(getByTestId('mocked-turn-button').hasAttribute('disabled')).toBe(true); + expect(getByTestId('user-transcript').textContent).toBe( + '[mocked partial] Placeholder push-to-talk transcript in progress.' + ); + expect(getByTestId('partial-transcript').textContent).toBe( + '[mocked partial] Placeholder push-to-talk transcript in progress.' + ); }); await fireEvent.mouseUp(getByTestId('mic-control-button')); @@ -138,11 +149,21 @@ describe('voice session shell', () => { }); expect(getByTestId('mic-control-status').textContent).toBe('idle'); + socket.message( + createMessageEnvelope('transcript.final', { + text: '[mocked final] Placeholder push-to-talk transcript completed from 1 appended chunk.' + }) + ); + socket.message(createMessageEnvelope('session.state', { value: 'idle' })); await waitFor(() => { expect(getByTestId('gateway-session-state').textContent).toBe('idle'); expect(getByTestId('mocked-turn-button').hasAttribute('disabled')).toBe(false); + expect(getByTestId('user-transcript').textContent).toBe( + '[mocked final] Placeholder push-to-talk transcript completed from 1 appended chunk.' + ); + expect(getByTestId('partial-transcript').textContent).toBe('none'); }); await fireEvent.click(getByTestId('mocked-turn-button')); @@ -171,6 +192,11 @@ describe('voice session shell', () => { expect(sentMessage.type).toBe('mocked.turn.trigger'); socket.message(createMessageEnvelope('session.state', { value: 'listening' })); + socket.message( + createMessageEnvelope('transcript.partial', { + text: '[mocked partial] Placeholder push-to-talk transcript in progress.' + }) + ); socket.message(createMessageEnvelope('transcript.final', { text: 'Turn on the office lamp.' })); socket.message(createMessageEnvelope('session.state', { value: 'thinking' })); socket.message(createMessageEnvelope('session.state', { value: 'speaking' })); @@ -181,6 +207,7 @@ describe('voice session shell', () => { await waitFor(() => { expect(getByTestId('user-transcript').textContent).toBe('Turn on the office lamp.'); + expect(getByTestId('partial-transcript').textContent).toBe('none'); expect(getByTestId('assistant-response').textContent).toBe('Mocked assistant response.'); expect(getByTestId('conversation-render-order').textContent).toBe('transcript>response'); expect(getByTestId('gateway-session-state').textContent).toBe('idle'); diff --git a/docs/architecture.md b/docs/architecture.md index 515b9d2..b479d0c 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -36,7 +36,7 @@ The repository now includes separate runnable workspaces for the UI and gateway - PWA enabled - WebSocket client -The current implementation is a minimal SvelteKit app with a single voice-session shell page. The shipped UI can open and close a browser WebSocket connection to the gateway `/ws` endpoint, show explicit connection status (`not connected`, `connecting`, `connected`, `disconnected`, `error`), expose mic control shell interactions that emit placeholder `input_audio.append` / `input_audio.commit` events, trigger one deterministic mocked turn while connected, and render the mocked user transcript plus mocked assistant response for the active session. This remains a shell only: there is no real microphone capture, real provider integration, or audio playback yet. +The current implementation is a minimal SvelteKit app with a single voice-session shell page. The shipped UI can open and close a browser WebSocket connection to the gateway `/ws` endpoint, show explicit connection status (`not connected`, `connecting`, `connected`, `disconnected`, `error`), expose mic control shell interactions that emit placeholder `input_audio.append` / `input_audio.commit` events, trigger one deterministic mocked turn while connected, render deterministic placeholder partial/final transcripts for the push-to-talk shell, and render the mocked user transcript plus mocked assistant response for the existing mocked-turn path. This remains a shell only: there is no real microphone capture, real provider integration, or audio playback yet. #### Responsibilities @@ -105,6 +105,7 @@ The current implementation is a minimal Fastify service with `/`, `/health`, and - WebSocket upgrades on `/ws` create an ephemeral session immediately - the gateway sends `session.ready` followed by `session.state` (`idle`) when the socket is established - valid minimal client events, including placeholder `input_audio.append` / `input_audio.commit`, can move the session between `idle` and `listening` +- placeholder `input_audio.append` emits deterministic mocked `transcript.partial` events and `input_audio.commit` emits one deterministic mocked `transcript.final` - `mocked.turn.trigger` drives a fixed transcript/response event sequence over the existing shared protocol - only one mocked turn is allowed in flight per session at a time - invalid JSON, invalid envelopes, and malformed frames are handled defensively so the process stays up @@ -115,12 +116,13 @@ The current implementation is a minimal Fastify service with `/`, `/health`, and - exposes connect, disconnect, mic-control shell interactions, and mocked-turn controls - does not request microphone permission or capture real microphone audio - only emits placeholder `input_audio.append` / `input_audio.commit` events; it does not send real audio data or play back audio +- renders the latest placeholder partial transcript during a push-to-talk shell turn and replaces it with the final deterministic transcript on commit - reads mocked transcript and mocked response events from the shared protocol contract ## Voice Pipeline ```text -Mic control shell / mocked turn button → Placeholder `input_audio.append` / `input_audio.commit` or mocked session flow → Transcript events → Response text events → UI +Mic control shell / mocked turn button → Placeholder `input_audio.append` / `input_audio.commit` or mocked session flow → Deterministic transcript events → Mocked response text events when using mocked.turn.trigger → UI ``` This mocked vertical slice intentionally stands in for the future real pipeline: diff --git a/docs/backlog.md b/docs/backlog.md index 225ab11..1b3b3d5 100644 --- a/docs/backlog.md +++ b/docs/backlog.md @@ -38,7 +38,7 @@ Prove the end-to-end interaction model with mocked or stubbed providers. - [x] create a minimal UI with mic control - [x] create a gateway WebSocket session skeleton - [x] implement a mocked transcript/response vertical slice over the existing WebSocket session -- implement mocked STT flow for partial transcript events +- [x] implement mocked STT flow for partial transcript events - implement mocked LLM response streaming beyond the fixed deterministic slice - implement stubbed audio playback or placeholder TTS output - [x] implement interrupt handling across the mocked pipeline @@ -190,6 +190,8 @@ Polish the system after the core voice loop is reliable. - `apps/vela-gateway` now exposes a minimal `/ws` WebSocket session skeleton with ephemeral in-memory sessions and defensive message handling - `apps/vela-gateway` now accepts `mocked.turn.trigger` and emits protocol-valid mocked transcript/response events with one in-flight mocked turn per session - `apps/vela-gateway` now supports placeholder input-audio append/commit cycles before running another mocked turn on the same socket +- `apps/vela-gateway` now emits deterministic `transcript.partial` events for placeholder `input_audio.append` messages and exactly one deterministic `transcript.final` for each placeholder `input_audio.commit` +- `apps/vela-ui` now renders the latest placeholder partial transcript during the push-to-talk shell turn and replaces it with the deterministic final transcript on commit - `apps/vela-ui` now exposes a cancel control for active mocked turns and keeps already-rendered transcript/response text visible after cancellation - `apps/vela-gateway` now honors `response.cancel` during mocked turns by stopping pending mocked response events, returning the session to `idle`, and allowing a new mocked turn on the same socket - `apps/vela-protocol` now provides the shared WebSocket event contract for the UI and gateway diff --git a/docs/protocol.md b/docs/protocol.md index a53ebf5..f3dc927 100644 --- a/docs/protocol.md +++ b/docs/protocol.md @@ -62,7 +62,8 @@ type ClientEvent = - `mocked.turn.trigger` is accepted only when no other mocked turn is already in flight for that session - a mocked turn emits deterministic `transcript.final`, `response.text.delta`, `response.completed`, and `session.state` events in protocol-valid order - `input_audio.append` updates the ephemeral session record and moves the session to `listening` -- `input_audio.commit` resets the minimal buffered state and returns the session to `idle` +- each accepted `input_audio.append` emits one deterministic `transcript.partial` for the current placeholder turn +- `input_audio.commit` emits exactly one deterministic `transcript.final`, resets the minimal buffered state, and returns the session to `idle` - after a completed placeholder input cycle, the same socket can still send `mocked.turn.trigger` - `response.cancel` is safe to send even when no mocked turn is active - `response.cancel` stops any still-pending mocked turn events for the active turn and resets the minimal session state back to `idle` @@ -90,6 +91,8 @@ Notes: - the UI disables the mocked-turn control until `session.ready` arrives, while disconnected, or while a mocked turn is already in flight - the UI disables the mic control while disconnected, before `session.ready`, or while a mocked turn is already in flight - pressing the mic control sends one placeholder `input_audio.append` chunk and releasing it sends `input_audio.commit` +- while a placeholder push-to-talk turn is in progress, the UI renders the latest `transcript.partial` +- after placeholder commit, the UI renders the `transcript.final` and clears the partial-only display - the UI copy explicitly labels the mic button as a control shell and not real microphone capture - the UI shows a cancel control and enables it only while a mocked turn is active - after cancel returns the gateway to `idle`, the UI clears the active-turn indicator but keeps any transcript or response text that was already rendered @@ -144,6 +147,29 @@ Notes: - no audio, STT, LLM, TTS, or external providers participate in this flow - `response.cancel` can stop the mocked turn early, suppress any later mocked response events for that turn, and return the session to `idle` +### Deterministic placeholder push-to-talk transcript sequence + +For this increment, the existing mic-control shell still sends placeholder `input_audio.append` on press and `input_audio.commit` on release. The gateway now translates that shell flow into deterministic mocked transcript events only: + +```text +input_audio.append #1 +→ session.state(listening) when entering the turn +→ transcript.partial("[mocked partial] Placeholder push-to-talk transcript in progress.") + +input_audio.append #N (N > 1) +→ transcript.partial("[mocked partial] Placeholder push-to-talk transcript in progress (N chunks).") + +input_audio.commit after N appends +→ transcript.final("[mocked final] Placeholder push-to-talk transcript completed from N appended chunk(s).") +→ session.state(idle) +``` + +Safe deterministic edge cases for this mocked placeholder flow: + +- commit without any prior append is accepted and emits `transcript.final("[mocked final] Placeholder push-to-talk transcript completed without appended audio.")` +- repeated appends during one placeholder turn are accepted and each append replaces the latest partial transcript with a chunk-count-based deterministic value +- placeholder commit does not automatically start assistant thinking, response streaming, or audio playback + ## Contract Scope for This Increment This contract is intentionally limited to the smallest event set needed to unblock: