From 8e14eaeed0193564020d1320a2dabbead6ad6129 Mon Sep 17 00:00:00 2001 From: Johannes Kresner Date: Wed, 8 Apr 2026 21:50:18 +0200 Subject: [PATCH] feat(vela): retire legacy mocked turn trigger --- apps/vela-gateway/src/index.js | 9 +- .../test/websocket-session.test.js | 121 +++-------------- apps/vela-ui/README.md | 2 +- apps/vela-ui/e2e/voice-session.spec.js | 95 ++------------ apps/vela-ui/src/lib/VoiceSessionShell.svelte | 44 +------ apps/vela-ui/tests/voice-session.test.js | 123 +++--------------- docs/architecture.md | 10 +- docs/backlog.md | 13 +- docs/overview.md | 1 + docs/protocol.md | 38 ++---- 10 files changed, 78 insertions(+), 378 deletions(-) diff --git a/apps/vela-gateway/src/index.js b/apps/vela-gateway/src/index.js index 465506e..c162365 100644 --- a/apps/vela-gateway/src/index.js +++ b/apps/vela-gateway/src/index.js @@ -11,7 +11,6 @@ const { const WEBSOCKET_ROUTE = '/ws'; const WEBSOCKET_GUID = '258EAFA5-E914-47DA-95CA-C5AB0DC85B11'; -const MOCKED_USER_TRANSCRIPT = '[mocked user] What is the current mocked vertical slice?'; const MOCKED_ASSISTANT_RESPONSE = '[mocked assistant] This is a deterministic mocked response from the gateway vertical slice.'; function createPlaceholderPartialTranscript(audioChunkCount) { @@ -67,7 +66,7 @@ function scheduleMockedTurnStep(session, turnId, delay, callback) { session.mockedTurnTimers.push(timer); } -function startMockedTurn(socket, session, { transcript = MOCKED_USER_TRANSCRIPT, includeListeningState = true } = {}) { +function startMockedTurn(socket, session, { transcript, includeListeningState = true } = {}) { if (session.mockedTurnInFlight) { sendSocketError(socket, 'mocked_turn_in_flight', 'Only one mocked turn can run per session at a time.'); return; @@ -246,7 +245,11 @@ function handleClientMessage(socket, session, rawMessage) { sendSocketMessage(socket, 'session.state', { value: session.state }); break; case 'mocked.turn.trigger': - startMockedTurn(socket, session); + sendSocketError( + socket, + 'unsupported_mocked_turn_trigger', + 'mocked.turn.trigger is no longer supported; use input_audio.append and input_audio.commit instead.' + ); break; case 'input_audio.append': if (session.mockedTurnInFlight) { diff --git a/apps/vela-gateway/test/websocket-session.test.js b/apps/vela-gateway/test/websocket-session.test.js index 1258a22..c078f35 100644 --- a/apps/vela-gateway/test/websocket-session.test.js +++ b/apps/vela-gateway/test/websocket-session.test.js @@ -355,7 +355,7 @@ test('websocket handles valid and invalid client messages safely', async () => { } }); -test('websocket accepts a placeholder input cycle before a mocked turn on the same socket', async () => { +test('websocket accepts repeated placeholder input cycles on the same socket', async () => { const server = await startServer(); try { @@ -403,11 +403,15 @@ test('websocket accepts a placeholder input cycle before a mocked turn on the sa payload: { value: 'idle' } }); - client.sendJson({ type: 'mocked.turn.trigger', payload: {} }); + client.sendJson({ type: 'input_audio.append', payload: { chunk: 'placeholder-control-shell-chunk-2' } }); assert.deepEqual(await client.nextMessage(), { type: 'session.state', payload: { value: 'listening' } }); + assert.deepEqual(await client.nextMessage(), { + type: 'transcript.partial', + payload: { text: '[mocked partial] Placeholder push-to-talk transcript in progress.' } + }); await client.close(); } finally { @@ -516,7 +520,7 @@ test('websocket emits deterministic partials for repeated appends and a determin } }); -test('websocket mocked turn emits deterministic transcript and response events in order', async () => { +test('websocket rejects the retired mocked.turn.trigger path deterministically', async () => { const server = await startServer(); try { @@ -526,110 +530,17 @@ test('websocket mocked turn emits deterministic transcript and response events i client.sendJson({ type: 'mocked.turn.trigger', payload: {} }); - assert.deepEqual(await client.nextMessage(), { - type: 'session.state', - payload: { value: 'listening' } - }); - assert.deepEqual(await client.nextMessage(), { - type: 'transcript.final', - payload: { text: '[mocked user] What is the current mocked vertical slice?' } - }); - assert.deepEqual(await client.nextMessage(), { - type: 'session.state', - payload: { value: 'thinking' } - }); - assert.deepEqual(await client.nextMessage(), { - type: 'session.state', - payload: { value: 'speaking' } - }); - assert.deepEqual(await client.nextMessage(), { - type: 'response.text.delta', - payload: { text: '[mocked assistant] ' } - }); - assert.deepEqual(await client.nextMessage(), { - type: 'response.text.delta', - payload: { text: 'This is a deterministic mocked response from the gateway vertical slice.' } - }); - assert.deepEqual(await client.nextMessage(), { - type: 'response.completed', - payload: {} - }); - assert.deepEqual(await client.nextMessage(), { - type: 'session.state', - payload: { value: 'idle' } - }); - - await client.close(); - } finally { - await server.close(); - } -}); - -test('websocket rejects a second mocked turn while one is in flight', async () => { - const server = await startServer(); - - try { - const client = await connectWebSocket(server.port); - await client.nextMessage(); - await client.nextMessage(); - - client.sendJson({ type: 'mocked.turn.trigger', payload: {} }); - assert.deepEqual(await client.nextMessage(), { - type: 'session.state', - payload: { value: 'listening' } - }); - - client.sendJson({ type: 'mocked.turn.trigger', payload: {} }); assert.deepEqual(await client.nextMessage(), { type: 'error', payload: { - code: 'mocked_turn_in_flight', - message: 'Only one mocked turn can run per session at a time.', + code: 'unsupported_mocked_turn_trigger', + message: + 'mocked.turn.trigger is no longer supported; use input_audio.append and input_audio.commit instead.', retryable: true } }); - - await client.close(); - } finally { - await server.close(); - } -}); - -test('websocket cancel stops an active mocked turn and allows a new one without reconnecting', async () => { - const server = await startServer(); - - try { - const client = await connectWebSocket(server.port); - await client.nextMessage(); - await client.nextMessage(); - - client.sendJson({ type: 'mocked.turn.trigger', payload: {} }); - assert.deepEqual(await client.nextMessage(), { - type: 'session.state', - payload: { value: 'listening' } - }); - assert.deepEqual(await client.nextMessage(), { - type: 'transcript.final', - payload: { text: '[mocked user] What is the current mocked vertical slice?' } - }); - assert.deepEqual(await client.nextMessage(), { - type: 'session.state', - payload: { value: 'thinking' } - }); - - client.sendJson({ type: 'response.cancel', payload: {} }); - assert.deepEqual(await client.nextMessage(), { - type: 'session.state', - payload: { value: 'idle' } - }); await assert.rejects(() => client.nextMessage(150), /timed out waiting for websocket message/); - client.sendJson({ type: 'mocked.turn.trigger', payload: {} }); - assert.deepEqual(await client.nextMessage(), { - type: 'session.state', - payload: { value: 'listening' } - }); - await client.close(); } finally { await server.close(); @@ -671,11 +582,15 @@ test('websocket cancel stops a push-to-talk commit response and allows another t }); await assert.rejects(() => client.nextMessage(150), /timed out waiting for websocket message/); - client.sendJson({ type: 'mocked.turn.trigger', payload: {} }); + client.sendJson({ type: 'input_audio.append', payload: { chunk: 'chunk-2' } }); assert.deepEqual(await client.nextMessage(), { type: 'session.state', payload: { value: 'listening' } }); + assert.deepEqual(await client.nextMessage(), { + type: 'transcript.partial', + payload: { text: '[mocked partial] Placeholder push-to-talk transcript in progress.' } + }); await client.close(); } finally { @@ -694,11 +609,15 @@ test('websocket safely accepts cancel when no turn is active', async () => { client.sendJson({ type: 'response.cancel', payload: {} }); await assert.rejects(() => client.nextMessage(150), /timed out waiting for websocket message/); - client.sendJson({ type: 'mocked.turn.trigger', payload: {} }); + client.sendJson({ type: 'input_audio.append', payload: { chunk: 'chunk-1' } }); assert.deepEqual(await client.nextMessage(), { type: 'session.state', payload: { value: 'listening' } }); + assert.deepEqual(await client.nextMessage(), { + type: 'transcript.partial', + payload: { text: '[mocked partial] Placeholder push-to-talk transcript in progress.' } + }); await client.close(); } finally { diff --git a/apps/vela-ui/README.md b/apps/vela-ui/README.md index c0822ac..e14a932 100644 --- a/apps/vela-ui/README.md +++ b/apps/vela-ui/README.md @@ -7,7 +7,7 @@ Current status: - SvelteKit app boots in the Yarn workspace - root page shows a minimal voice-session shell with connect/disconnect controls - the shell can connect to the gateway `/ws` endpoint and display developer-visible session status -- the shell can trigger one deterministic mocked turn and render the mocked transcript plus assistant response +- the current mocked interaction path is push-to-talk only and renders the mocked transcript plus assistant response - Vitest covers connect/disconnect plus the deterministic mocked transcript/response UI flow without requiring a browser harness - Playwright remains optional for deeper browser-level checks - microphone capture and audio playback remain future increments diff --git a/apps/vela-ui/e2e/voice-session.spec.js b/apps/vela-ui/e2e/voice-session.spec.js index ddaf6b2..4fb0b17 100644 --- a/apps/vela-ui/e2e/voice-session.spec.js +++ b/apps/vela-ui/e2e/voice-session.spec.js @@ -1,87 +1,15 @@ import { expect, test } from '@playwright/test'; -const MOCKED_USER_TRANSCRIPT = '[mocked user] What is the current mocked vertical slice?'; const MOCKED_ASSISTANT_RESPONSE = '[mocked assistant] This is a deterministic mocked response from the gateway vertical slice.'; -test('voice session shell covers the mocked transcript/response slice', async ({ page }) => { +test('voice session shell supports the placeholder push-to-talk path', async ({ page }) => { await page.goto('/'); await expect(page.getByTestId('hydration-status')).toHaveText('ready'); await expect(page.getByTestId('connection-state')).toHaveText('not connected'); - await expect(page.getByTestId('mocked-turn-button')).toBeDisabled(); - await expect(page.getByTestId('session-id')).toHaveText('not assigned'); - await expect(page.getByTestId('gateway-session-state')).toHaveText('not received'); - - await page.getByTestId('connect-button').click(); - - await expect(page.getByTestId('connection-state')).toHaveText('connected'); - await expect(page.getByTestId('gateway-session-state')).toHaveText('idle'); - await expect(page.getByTestId('session-id')).not.toHaveText('not assigned'); - await expect(page.getByTestId('mocked-turn-button')).toBeEnabled(); - const sessionId = await page.getByTestId('session-id').textContent(); - - await page.getByTestId('mocked-turn-button').click(); - - await expect(page.getByTestId('mocked-turn-status')).toHaveText('running'); - await expect(page.getByTestId('user-transcript')).toHaveText('waiting for mocked transcript…'); - await expect(page.getByTestId('assistant-response')).toHaveText('waiting for mocked response…'); - - await expect(page.getByTestId('user-transcript')).toHaveText(MOCKED_USER_TRANSCRIPT); - await expect(page.getByTestId('assistant-response')).toHaveText(MOCKED_ASSISTANT_RESPONSE); - await expect(page.getByTestId('conversation-render-order')).toHaveText('transcript>response'); - await expect(page.getByTestId('mocked-turn-status')).toHaveText('idle'); - - await page.getByTestId('disconnect-button').click(); - - await expect(page.getByTestId('connection-state')).toHaveText('disconnected'); - await expect(page.getByTestId('connection-detail')).toHaveText('Gateway WebSocket is closed.'); - await expect(page.getByTestId('gateway-session-state')).toHaveText('idle'); - await expect(page.getByTestId('session-id')).toHaveText(sessionId ?? ''); - await expect(page.getByTestId('mocked-turn-button')).toBeDisabled(); - await expect(page.getByTestId('user-transcript')).toHaveText(MOCKED_USER_TRANSCRIPT); - await expect(page.getByTestId('assistant-response')).toHaveText(MOCKED_ASSISTANT_RESPONSE); - await expect(page.getByTestId('session-id')).toHaveText(sessionId ?? ''); - await expect(page.getByTestId('gateway-session-state')).toHaveText('idle'); - await expect(page.getByTestId('user-transcript')).toHaveText(MOCKED_USER_TRANSCRIPT); - await expect(page.getByTestId('assistant-response')).toHaveText(MOCKED_ASSISTANT_RESPONSE); -}); - -test('voice session shell can cancel an active mocked turn and start another one', async ({ page }) => { - await page.goto('/'); - await expect(page.getByTestId('hydration-status')).toHaveText('ready'); - - await expect(page.getByTestId('cancel-turn-button')).toBeDisabled(); - await page.getByTestId('connect-button').click(); - - await expect(page.getByTestId('connection-state')).toHaveText('connected'); - await expect(page.getByTestId('mocked-turn-button')).toBeEnabled(); - - await page.getByTestId('mocked-turn-button').click(); - await expect(page.getByTestId('mocked-turn-status')).toHaveText('running'); - await expect(page.getByTestId('cancel-turn-button')).toBeEnabled(); - await expect(page.getByTestId('user-transcript')).toHaveText(MOCKED_USER_TRANSCRIPT); - await expect(page.getByTestId('assistant-response')).toContainText('[mocked assistant]'); - - await page.getByTestId('cancel-turn-button').click(); - - await expect(page.getByTestId('gateway-session-state')).toHaveText('idle'); - await expect(page.getByTestId('mocked-turn-status')).toHaveText('idle'); - await expect(page.getByTestId('cancel-turn-button')).toBeDisabled(); - await expect(page.getByTestId('mocked-turn-button')).toBeEnabled(); - await expect(page.getByTestId('user-transcript')).toHaveText(MOCKED_USER_TRANSCRIPT); - await expect(page.getByTestId('assistant-response')).toContainText('[mocked assistant]'); - - await page.getByTestId('mocked-turn-button').click(); - await expect(page.getByTestId('mocked-turn-status')).toHaveText('running'); - await expect(page.getByTestId('assistant-response')).toHaveText(MOCKED_ASSISTANT_RESPONSE); - await expect(page.getByTestId('mocked-turn-status')).toHaveText('idle'); -}); - -test('voice session shell supports a placeholder mic-control cycle before another mocked turn', async ({ page }) => { - await page.goto('/'); - await expect(page.getByTestId('hydration-status')).toHaveText('ready'); await expect(page.getByTestId('mic-control-button')).toBeDisabled(); + await expect(page.getByTestId('mocked-turn-button')).toHaveCount(0); await page.getByTestId('connect-button').click(); @@ -95,10 +23,6 @@ test('voice session shell supports a placeholder mic-control cycle before anothe await expect(page.getByTestId('partial-transcript')).toHaveText( '[mocked partial] Placeholder push-to-talk transcript in progress.' ); - await expect(page.getByTestId('user-transcript')).toHaveText( - '[mocked partial] Placeholder push-to-talk transcript in progress.' - ); - await expect(page.getByTestId('mocked-turn-button')).toBeDisabled(); await page.getByTestId('mic-control-button').dispatchEvent('mouseup'); await expect(page.getByTestId('mic-control-status')).toHaveText('idle'); @@ -108,15 +32,10 @@ test('voice session shell supports a placeholder mic-control cycle before anothe await expect(page.getByTestId('assistant-response')).toHaveText(MOCKED_ASSISTANT_RESPONSE); await expect(page.getByTestId('partial-transcript')).toHaveText('none'); await expect(page.getByTestId('gateway-session-state')).toHaveText('idle'); - await expect(page.getByTestId('mocked-turn-button')).toBeEnabled(); - - await page.getByTestId('mocked-turn-button').click(); - await expect(page.getByTestId('mocked-turn-status')).toHaveText('running'); - await expect(page.getByTestId('assistant-response')).toHaveText(MOCKED_ASSISTANT_RESPONSE); - await expect(page.getByTestId('mocked-turn-status')).toHaveText('idle'); + await expect(page.getByTestId('mic-control-button')).toBeEnabled(); }); -test('voice session shell can cancel a push-to-talk mocked response and start another turn', async ({ page }) => { +test('voice session shell can cancel a push-to-talk mocked response and start another push-to-talk turn', async ({ page }) => { await page.goto('/'); await expect(page.getByTestId('hydration-status')).toHaveText('ready'); @@ -140,9 +59,11 @@ test('voice session shell can cancel a push-to-talk mocked response and start an '[mocked final] Placeholder push-to-talk transcript completed from 1 appended chunk.' ); await expect(page.getByTestId('assistant-response')).toContainText('[mocked assistant]'); - await expect(page.getByTestId('mocked-turn-button')).toBeEnabled(); + await expect(page.getByTestId('mic-control-button')).toBeEnabled(); - await page.getByTestId('mocked-turn-button').click(); + await page.getByTestId('mic-control-button').dispatchEvent('mousedown'); + await expect(page.getByTestId('gateway-session-state')).toHaveText('listening'); + await page.getByTestId('mic-control-button').dispatchEvent('mouseup'); await expect(page.getByTestId('assistant-response')).toHaveText(MOCKED_ASSISTANT_RESPONSE); await expect(page.getByTestId('mocked-turn-status')).toHaveText('idle'); }); diff --git a/apps/vela-ui/src/lib/VoiceSessionShell.svelte b/apps/vela-ui/src/lib/VoiceSessionShell.svelte index 8e8d3b2..592b917 100644 --- a/apps/vela-ui/src/lib/VoiceSessionShell.svelte +++ b/apps/vela-ui/src/lib/VoiceSessionShell.svelte @@ -58,15 +58,6 @@ let micControlActive = false; let hydrationStatus = 'mounting'; - $: canTriggerMockedTurn = - typeof WebSocket !== 'undefined' && - connectionState === 'connected' && - socket?.readyState === WebSocket.OPEN && - sessionReadyReceived && - gatewaySessionState === 'idle' && - !micControlActive && - !mockedTurnInFlight; - $: canCancelMockedTurn = typeof WebSocket !== 'undefined' && connectionState === 'connected' && @@ -132,33 +123,6 @@ socket.send(JSON.stringify(createMessageEnvelope('input_audio.commit', {}))); } - function triggerMockedTurn() { - if (!socket || socket.readyState !== WebSocket.OPEN || connectionState !== 'connected') { - connectionDetail = 'Connect to the gateway before triggering a mocked turn.'; - lastError = 'mocked turn requires an active WebSocket connection'; - return; - } - - if (!sessionReadyReceived) { - connectionDetail = 'Wait for the gateway session to be ready before triggering a mocked turn.'; - lastError = 'mocked turn requires session.ready'; - return; - } - - if (mockedTurnInFlight) { - connectionDetail = 'A mocked turn is already running for this session.'; - return; - } - - mockedUserTranscript = 'waiting for mocked transcript…'; - inProgressPartialTranscript = 'none'; - mockedAssistantResponse = 'waiting for mocked response…'; - mockedTurnInFlight = true; - assistantResponseExpected = true; - lastError = 'none'; - socket.send(JSON.stringify(createMessageEnvelope('mocked.turn.trigger', {}))); - } - function cancelActiveResponse() { if (!socket || socket.readyState !== WebSocket.OPEN || connectionState !== 'connected') { connectionDetail = 'Connect to the gateway before cancelling a mocked turn.'; @@ -363,9 +327,8 @@

Vela UI

Voice session shell

- This minimal browser shell can connect to the gateway WebSocket, trigger one deterministic - mocked turn, and expose a push-to-talk control shell that only sends placeholder protocol - events for the active session. + This minimal browser shell can connect to the gateway WebSocket and expose a push-to-talk + control shell that only sends placeholder protocol events for the active session.

@@ -408,9 +371,6 @@ > Disconnect - diff --git a/apps/vela-ui/tests/voice-session.test.js b/apps/vela-ui/tests/voice-session.test.js index d8284f6..177b98a 100644 --- a/apps/vela-ui/tests/voice-session.test.js +++ b/apps/vela-ui/tests/voice-session.test.js @@ -67,7 +67,7 @@ describe('voice session shell', () => { render(VoiceSessionShell); expect(getByTestId('connection-state').textContent).toBe('not connected'); - expect(getByTestId('mocked-turn-button').hasAttribute('disabled')).toBe(true); + expect(screen.queryByTestId('mocked-turn-button')).toBeNull(); expect(getByTestId('mic-control-button').hasAttribute('disabled')).toBe(true); await fireEvent.click(getByTestId('connect-button')); @@ -78,7 +78,7 @@ describe('voice session shell', () => { await waitFor(() => { expect(getByTestId('connection-state').textContent).toBe('connected'); - expect(getByTestId('mocked-turn-button').hasAttribute('disabled')).toBe(false); + expect(screen.queryByTestId('mocked-turn-button')).toBeNull(); expect(getByTestId('mic-control-button').hasAttribute('disabled')).toBe(false); }); @@ -87,13 +87,13 @@ describe('voice session shell', () => { await waitFor(() => { expect(getByTestId('connection-state').textContent).toBe('disconnected'); - expect(getByTestId('mocked-turn-button').hasAttribute('disabled')).toBe(true); + expect(screen.queryByTestId('mocked-turn-button')).toBeNull(); expect(getByTestId('mic-control-button').hasAttribute('disabled')).toBe(true); expect(getByTestId('session-id').textContent).toBe('session-123'); }); }); - it('runs a placeholder mic-control cycle, streams mocked assistant text, and keeps mocked turn usable on the same socket', async () => { + it('runs a placeholder mic-control cycle, streams mocked assistant text, and leaves push-to-talk usable on the same socket', async () => { render(VoiceSessionShell); await fireEvent.click(getByTestId('connect-button')); @@ -131,7 +131,6 @@ describe('voice session shell', () => { await waitFor(() => { expect(getByTestId('gateway-session-state').textContent).toBe('listening'); - expect(getByTestId('mocked-turn-button').hasAttribute('disabled')).toBe(true); expect(getByTestId('user-transcript').textContent).toBe( '[mocked partial] Placeholder push-to-talk transcript in progress.' ); @@ -168,20 +167,15 @@ describe('voice session shell', () => { await waitFor(() => { expect(getByTestId('gateway-session-state').textContent).toBe('idle'); - expect(getByTestId('mocked-turn-button').hasAttribute('disabled')).toBe(false); + expect(getByTestId('mic-control-button').hasAttribute('disabled')).toBe(false); expect(getByTestId('user-transcript').textContent).toBe( '[mocked final] Placeholder push-to-talk transcript completed from 1 appended chunk.' ); expect(getByTestId('assistant-response').textContent).toBe( '[mocked assistant] This is a deterministic mocked response from the gateway vertical slice.' ); - expect(getByTestId('partial-transcript').textContent).toBe('none'); + expect(getByTestId('partial-transcript').textContent).toBe('none'); }); - - await fireEvent.click(getByTestId('mocked-turn-button')); - - expect(socket.sent).toHaveLength(3); - expect(JSON.parse(socket.sent[2]).type).toBe('mocked.turn.trigger'); }); it('keeps rendered push-to-talk transcript and assistant text visible after cancel', async () => { @@ -239,118 +233,33 @@ describe('voice session shell', () => { }); }); - it('renders mocked transcript before assistant response for a connected session', async () => { + it('shows protocol errors returned for the retired mocked turn trigger path', async () => { render(VoiceSessionShell); await fireEvent.click(getByTestId('connect-button')); const socket = MockWebSocket.latest(); socket.open(); - socket.message(createMessageEnvelope('session.ready', { sessionId: 'session-456' })); + socket.message(createMessageEnvelope('session.ready', { sessionId: 'session-legacy' })); socket.message(createMessageEnvelope('session.state', { value: 'idle' })); await waitFor(() => { expect(getByTestId('connection-state').textContent).toBe('connected'); }); - await fireEvent.click(getByTestId('mocked-turn-button')); - - expect(socket.sent).toHaveLength(1); - const sentMessage = JSON.parse(socket.sent[0]); - expect(sentMessage.type).toBe('mocked.turn.trigger'); - - socket.message(createMessageEnvelope('session.state', { value: 'listening' })); socket.message( - createMessageEnvelope('transcript.partial', { - text: '[mocked partial] Placeholder push-to-talk transcript in progress.' + createMessageEnvelope('error', { + code: 'unsupported_mocked_turn_trigger', + message: + 'mocked.turn.trigger is no longer supported; use input_audio.append and input_audio.commit instead.', + retryable: true }) ); - socket.message(createMessageEnvelope('transcript.final', { text: 'Turn on the office lamp.' })); - socket.message(createMessageEnvelope('session.state', { value: 'thinking' })); - socket.message(createMessageEnvelope('session.state', { value: 'speaking' })); - socket.message(createMessageEnvelope('response.text.delta', { text: 'Mocked ' })); - socket.message(createMessageEnvelope('response.text.delta', { text: 'assistant response.' })); - socket.message(createMessageEnvelope('response.completed', { reason: 'mocked_turn_complete' })); - socket.message(createMessageEnvelope('session.state', { value: 'idle' })); await waitFor(() => { - expect(getByTestId('user-transcript').textContent).toBe('Turn on the office lamp.'); - expect(getByTestId('partial-transcript').textContent).toBe('none'); - expect(getByTestId('assistant-response').textContent).toBe('Mocked assistant response.'); - expect(getByTestId('conversation-render-order').textContent).toBe('transcript>response'); + expect(getByTestId('last-error').textContent).toBe( + 'unsupported_mocked_turn_trigger: mocked.turn.trigger is no longer supported; use input_audio.append and input_audio.commit instead.' + ); expect(getByTestId('gateway-session-state').textContent).toBe('idle'); }); }); - - it('blocks mocked turn trigger before session.ready and allows it after session.ready', async () => { - render(VoiceSessionShell); - - await fireEvent.click(getByTestId('connect-button')); - const socket = MockWebSocket.latest(); - socket.open(); - - await waitFor(() => { - expect(getByTestId('connection-state').textContent).toBe('connected'); - }); - - expect(getByTestId('mocked-turn-button').hasAttribute('disabled')).toBe(true); - await fireEvent.click(getByTestId('mocked-turn-button')); - expect(socket.sent).toHaveLength(0); - expect(getByTestId('last-error').textContent).toBe('mocked turn requires session.ready'); - - socket.message(createMessageEnvelope('session.ready', { sessionId: 'session-789' })); - socket.message(createMessageEnvelope('session.state', { value: 'idle' })); - - await waitFor(() => { - expect(getByTestId('mocked-turn-button').hasAttribute('disabled')).toBe(false); - }); - - await fireEvent.click(getByTestId('mocked-turn-button')); - - expect(socket.sent).toHaveLength(1); - expect(JSON.parse(socket.sent[0]).type).toBe('mocked.turn.trigger'); - }); - - it('shows cancel control during an active mocked turn and preserves rendered text after cancel', async () => { - render(VoiceSessionShell); - - await fireEvent.click(getByTestId('connect-button')); - const socket = MockWebSocket.latest(); - socket.open(); - socket.message(createMessageEnvelope('session.ready', { sessionId: 'session-cancel' })); - socket.message(createMessageEnvelope('session.state', { value: 'idle' })); - - await waitFor(() => { - expect(getByTestId('cancel-turn-button').hasAttribute('disabled')).toBe(true); - }); - - await fireEvent.click(getByTestId('mocked-turn-button')); - - expect(JSON.parse(socket.sent[0]).type).toBe('mocked.turn.trigger'); - - socket.message(createMessageEnvelope('session.state', { value: 'listening' })); - socket.message(createMessageEnvelope('transcript.final', { text: 'Keep this transcript.' })); - socket.message(createMessageEnvelope('session.state', { value: 'thinking' })); - socket.message(createMessageEnvelope('session.state', { value: 'speaking' })); - socket.message(createMessageEnvelope('response.text.delta', { text: 'Partial response' })); - - await waitFor(() => { - expect(getByTestId('cancel-turn-button').hasAttribute('disabled')).toBe(false); - expect(getByTestId('assistant-response').textContent).toBe('Partial response'); - }); - - await fireEvent.click(getByTestId('cancel-turn-button')); - - expect(JSON.parse(socket.sent[1]).type).toBe('response.cancel'); - - socket.message(createMessageEnvelope('session.state', { value: 'idle' })); - - await waitFor(() => { - expect(getByTestId('mocked-turn-status').textContent).toBe('idle'); - expect(getByTestId('gateway-session-state').textContent).toBe('idle'); - expect(getByTestId('cancel-turn-button').hasAttribute('disabled')).toBe(true); - expect(getByTestId('mocked-turn-button').hasAttribute('disabled')).toBe(false); - expect(getByTestId('user-transcript').textContent).toBe('Keep this transcript.'); - expect(getByTestId('assistant-response').textContent).toBe('Partial response'); - }); - }); }); diff --git a/docs/architecture.md b/docs/architecture.md index 1ceefa0..63c9f3c 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -36,14 +36,13 @@ The repository now includes separate runnable workspaces for the UI and gateway - PWA enabled - WebSocket client -The current implementation is a minimal SvelteKit app with a single voice-session shell page. The shipped UI can open and close a browser WebSocket connection to the gateway `/ws` endpoint, show explicit connection status (`not connected`, `connecting`, `connected`, `disconnected`, `error`), expose mic control shell interactions that emit placeholder `input_audio.append` / `input_audio.commit` events, trigger one deterministic mocked turn while connected, render deterministic placeholder partial/final transcripts for the push-to-talk shell, and stream the mocked assistant response both for `mocked.turn.trigger` and for push-to-talk commits. This remains a shell only: there is no real microphone capture, real provider integration, or audio playback yet. +The current implementation is a minimal SvelteKit app with a single voice-session shell page. The shipped UI can open and close a browser WebSocket connection to the gateway `/ws` endpoint, show explicit connection status (`not connected`, `connecting`, `connected`, `disconnected`, `error`), expose mic control shell interactions that emit placeholder `input_audio.append` / `input_audio.commit` events, render deterministic placeholder partial/final transcripts for the push-to-talk shell, and stream the mocked assistant response after push-to-talk commit. This remains a shell only: there is no real microphone capture, real provider integration, or audio playback yet. #### Responsibilities Current shell responsibilities: - connection state rendering -- mocked-turn trigger rendering with disconnected/in-flight guards - mocked transcript and mocked assistant response rendering - developer-oriented session metadata rendering - browser session connect/disconnect controls @@ -62,7 +61,6 @@ Current shell: - developer-focused voice-session panel - connect button - disconnect button -- mocked-turn button - connection status indicator - mocked transcript display - mocked assistant response display @@ -106,14 +104,14 @@ The current implementation is a minimal Fastify service with `/`, `/health`, and - the gateway sends `session.ready` followed by `session.state` (`idle`) when the socket is established - valid minimal client events, including placeholder `input_audio.append` / `input_audio.commit`, can move the session through the mocked turn states on one socket - placeholder `input_audio.append` emits deterministic mocked `transcript.partial` events and `input_audio.commit` emits one deterministic mocked `transcript.final` before starting the existing mocked assistant response stream -- `mocked.turn.trigger` drives a fixed transcript/response event sequence over the existing shared protocol - only one mocked turn is allowed in flight per session at a time - invalid JSON, invalid envelopes, and malformed frames are handled defensively so the process stays up +- retired `mocked.turn.trigger` messages are rejected with a deterministic recoverable error ### Current UI shell behavior - renders a minimal developer-focused voice-session panel -- exposes connect, disconnect, mic-control shell interactions, and mocked-turn controls +- exposes connect, disconnect, and mic-control shell interactions - does not request microphone permission or capture real microphone audio - only emits placeholder `input_audio.append` / `input_audio.commit` events; it does not send real audio data or play back audio - renders the latest placeholder partial transcript during a push-to-talk shell turn, replaces it with the final deterministic transcript on commit, and appends streamed mocked assistant text for that same push-to-talk turn @@ -122,7 +120,7 @@ The current implementation is a minimal Fastify service with `/`, `/health`, and ## Voice Pipeline ```text -Mic control shell / mocked turn button → Placeholder `input_audio.append` / `input_audio.commit` or mocked session flow → Deterministic transcript events → Shared mocked response engine → Mocked response text events → UI +Mic control shell → Placeholder `input_audio.append` / `input_audio.commit` → Deterministic transcript events → Shared mocked response engine → Mocked response text events → UI ``` This mocked vertical slice intentionally stands in for the future real pipeline: diff --git a/docs/backlog.md b/docs/backlog.md index 3dfd726..2f11a67 100644 --- a/docs/backlog.md +++ b/docs/backlog.md @@ -183,16 +183,15 @@ Polish the system after the core voice loop is reliable. - `apps/vela-ui` now boots as a minimal SvelteKit app with a starter page - `apps/vela-ui` now includes a minimal voice-session shell that can connect to the gateway `/ws` endpoint and display developer-visible session status -- `apps/vela-ui` can now trigger one deterministic mocked turn while connected and render the mocked transcript plus assistant response for the active session - `apps/vela-ui` now exposes a visible push-to-talk mic control shell that sends placeholder `input_audio.append` / `input_audio.commit` events without requesting browser mic permission or capturing real audio -- `apps/vela-ui` now includes browser-level coverage for the mocked transcript/response slice, including connect, disconnect, and disconnected-state trigger guarding +- `apps/vela-ui` now includes browser-level coverage for the placeholder push-to-talk mocked transcript/response slice, including connect, disconnect, and cancel behavior - `apps/vela-gateway` now boots as a minimal Fastify app with `/` and `/health` endpoints - `apps/vela-gateway` now exposes a minimal `/ws` WebSocket session skeleton with ephemeral in-memory sessions and defensive message handling -- `apps/vela-gateway` now accepts `mocked.turn.trigger` and emits protocol-valid mocked transcript/response events with one in-flight mocked turn per session -- `apps/vela-gateway` now supports placeholder input-audio append/commit cycles before running another mocked turn on the same socket +- `apps/vela-gateway` now rejects retired `mocked.turn.trigger` requests with a deterministic recoverable error instead of starting a mocked turn +- `apps/vela-gateway` now supports repeated placeholder input-audio append/commit cycles on the same socket - `apps/vela-gateway` now emits deterministic `transcript.partial` events for placeholder `input_audio.append` messages and, after each accepted `input_audio.commit`, reuses the mocked response engine to stream a deterministic assistant reply for that push-to-talk turn -- `apps/vela-ui` now renders the latest placeholder partial transcript during the push-to-talk shell turn, replaces it with the deterministic final transcript on commit, and shows streamed assistant text for the same push-to-talk flow -- `apps/vela-ui` now exposes a cancel control for active mocked turns and mocked push-to-talk responses, and keeps already-rendered transcript/response text visible after cancellation -- `apps/vela-gateway` now honors `response.cancel` during mocked turns and push-to-talk-triggered mocked responses by stopping pending mocked response events, returning the session to `idle`, and allowing a new turn on the same socket +- `apps/vela-ui` now renders the latest placeholder partial transcript during the push-to-talk shell turn, replaces it with the deterministic final transcript on commit, and shows streamed assistant text for that same push-to-talk flow +- `apps/vela-ui` now exposes a cancel control for active push-to-talk-triggered mocked responses, and keeps already-rendered transcript/response text visible after cancellation +- `apps/vela-gateway` now honors `response.cancel` during push-to-talk-triggered mocked responses by stopping pending mocked response events, returning the session to `idle`, and allowing a new turn on the same socket - `apps/vela-protocol` now provides the shared WebSocket event contract for the UI and gateway - backend framework choice is now concrete: Fastify diff --git a/docs/overview.md b/docs/overview.md index fe470f2..c7178e3 100644 --- a/docs/overview.md +++ b/docs/overview.md @@ -43,6 +43,7 @@ Vela is a fully local, voice-first assistant system with: - browser-based PWA - push-to-talk interaction +- current mocked vertical slice enters turns only through the placeholder push-to-talk shell - transcript and response display - playback of streamed or returned audio diff --git a/docs/protocol.md b/docs/protocol.md index 4550dba..bced8ab 100644 --- a/docs/protocol.md +++ b/docs/protocol.md @@ -15,8 +15,8 @@ Current UI baseline: - the browser opens a WebSocket directly to `/ws` - the UI tracks connection status separately from gateway session status -- the UI can send `mocked.turn.trigger` after `session.ready` while connected to request one deterministic mocked turn for the active session - the UI exposes a push-to-talk mic control shell that sends placeholder `input_audio.append` on press and `input_audio.commit` on release without capturing real audio +- the push-to-talk shell is the only supported mocked turn entry path from the shipped UI ## WebSocket Message Envelope @@ -50,7 +50,7 @@ type ClientEvent = #### Client event intent - `session.start` initializes a voice session without locking in transport or auth details yet -- `mocked.turn.trigger` asks the gateway to run one obviously mocked, deterministic transcript/response turn +- `mocked.turn.trigger` is a retired legacy event name that the gateway now rejects with a deterministic recoverable error - `input_audio.append` carries a chunk of captured input audio as an encoded string - `input_audio.commit` marks the current buffered user turn as ready for downstream processing - `response.cancel` interrupts the active listen/think/speak flow @@ -59,15 +59,13 @@ type ClientEvent = - on connect, the gateway creates an ephemeral in-memory session and emits `session.ready` plus `session.state` - `session.start` is accepted as an idempotent session acknowledgment and re-sends readiness/state -- `mocked.turn.trigger` is accepted only when no other mocked turn is already in flight for that session -- a mocked turn emits deterministic `transcript.final`, `response.text.delta`, `response.completed`, and `session.state` events in protocol-valid order +- `mocked.turn.trigger` is rejected deterministically with `error.code = unsupported_mocked_turn_trigger` - `input_audio.append` updates the ephemeral session record and moves the session to `listening` - each accepted `input_audio.append` emits one deterministic `transcript.partial` for the current placeholder turn -- `input_audio.commit` emits exactly one deterministic `transcript.final` and then starts the same deterministic mocked assistant response stream used by `mocked.turn.trigger` -- after a completed placeholder input cycle, the same socket can still send `mocked.turn.trigger` +- `input_audio.commit` emits exactly one deterministic `transcript.final` and then starts the deterministic mocked assistant response stream for that push-to-talk turn +- after a completed placeholder input cycle, the same socket can start another placeholder push-to-talk turn without reconnecting - `response.cancel` is safe to send even when no mocked turn is active - `response.cancel` stops any still-pending mocked turn events for the active turn and resets the minimal session state back to `idle` -- a second mocked-turn trigger during an active mocked turn produces `error` with code `mocked_turn_in_flight` - malformed JSON produces `error` with code `invalid_json` - invalid envelopes or unsupported client event names produce `error` with code `invalid_message` - malformed WebSocket frames are rejected without crashing the gateway process @@ -88,7 +86,6 @@ Notes: - this UI state is transport-oriented and is separate from the shared gateway `session.state` payload - `session.state` currently reflects the gateway session phase (`idle`, `listening`, `thinking`, `speaking`) -- the UI disables the mocked-turn control until `session.ready` arrives, while disconnected, or while a mocked turn is already in flight - the UI disables the mic control while disconnected, before `session.ready`, or while a mocked turn is already in flight - pressing the mic control sends one placeholder `input_audio.append` chunk and releasing it sends `input_audio.commit` - while a placeholder push-to-talk turn is in progress, the UI renders the latest `transcript.partial` @@ -126,26 +123,19 @@ type ServerEvent = - `response.completed` marks the current assistant turn as done - `error` is the minimal recoverable failure shape for both UI and gateway work -### Deterministic mocked turn sequence +### Legacy mocked turn trigger rejection -For this increment, `mocked.turn.trigger` produces one fixed interaction for the active session: +For this increment, direct `mocked.turn.trigger` requests no longer start a mocked turn: ```text -session.state(listening) -→ transcript.final("[mocked user] What is the current mocked vertical slice?") -→ session.state(thinking) -→ session.state(speaking) -→ response.text.delta("[mocked assistant] ") -→ response.text.delta("This is a deterministic mocked response from the gateway vertical slice.") -→ response.completed -→ session.state(idle) +mocked.turn.trigger +→ error(code="unsupported_mocked_turn_trigger", message="mocked.turn.trigger is no longer supported; use input_audio.append and input_audio.commit instead.") ``` Notes: -- the content is intentionally fixed and obviously mocked -- no audio, STT, LLM, TTS, or external providers participate in this flow -- `response.cancel` can stop the mocked turn early, suppress any later mocked response events for that turn, and return the session to `idle` +- this rejection is deterministic and recoverable +- the session remains available for the supported push-to-talk flow on the same socket ### Deterministic placeholder push-to-talk transcript and mocked response sequence @@ -173,8 +163,8 @@ Safe deterministic edge cases for this mocked placeholder flow: - commit without any prior append is accepted and emits `transcript.final("[mocked final] Placeholder push-to-talk transcript completed without appended audio.")` - repeated appends during one placeholder turn are accepted and each append replaces the latest partial transcript with a chunk-count-based deterministic value -- after the final transcript, placeholder commit follows the same mocked `thinking → speaking → response.text.delta* → response.completed → idle` path as `mocked.turn.trigger` -- `response.cancel` can interrupt this mocked post-commit response path the same way it interrupts `mocked.turn.trigger`; already-rendered transcript or assistant text is not retracted +- after the final transcript, placeholder commit follows the deterministic mocked `thinking → speaking → response.text.delta* → response.completed → idle` path +- `response.cancel` can interrupt this mocked post-commit response path; already-rendered transcript or assistant text is not retracted ## Contract Scope for This Increment @@ -207,7 +197,7 @@ Current mocked-pipeline behavior: - during an active mocked turn, `response.cancel` returns the session to `idle` immediately - any mocked turn timers that have not fired yet are dropped, so no later `response.text.delta` or `response.completed` events are emitted for the cancelled turn - the same cancellation behavior applies when a mocked turn was started by `input_audio.commit` -- once `idle` is restored, the same WebSocket session can start another mocked turn without reconnecting +- once `idle` is restored, the same WebSocket session can start another placeholder push-to-talk turn without reconnecting More general future-state expectations: