diff --git a/apps/vela-gateway/src/index.js b/apps/vela-gateway/src/index.js index 6f82770..465506e 100644 --- a/apps/vela-gateway/src/index.js +++ b/apps/vela-gateway/src/index.js @@ -67,21 +67,23 @@ function scheduleMockedTurnStep(session, turnId, delay, callback) { session.mockedTurnTimers.push(timer); } -function startMockedTurn(socket, session) { +function startMockedTurn(socket, session, { transcript = MOCKED_USER_TRANSCRIPT, includeListeningState = true } = {}) { if (session.mockedTurnInFlight) { sendSocketError(socket, 'mocked_turn_in_flight', 'Only one mocked turn can run per session at a time.'); return; } clearMockedTurn(session); - session.audioChunkCount = 0; session.mockedTurnInFlight = true; const turnId = crypto.randomUUID(); session.activeMockedTurnId = turnId; - updateSessionState(socket, session, 'listening'); + + if (includeListeningState) { + updateSessionState(socket, session, 'listening'); + } scheduleMockedTurnStep(session, turnId, 75, () => { - sendSocketMessage(socket, 'transcript.final', { text: MOCKED_USER_TRANSCRIPT }); + sendSocketMessage(socket, 'transcript.final', { text: transcript }); updateSessionState(socket, session, 'thinking'); }); @@ -264,11 +266,12 @@ function handleClientMessage(socket, session, rawMessage) { break; } - sendSocketMessage(socket, 'transcript.final', { - text: createPlaceholderFinalTranscript(session.audioChunkCount) - }); + const finalTranscript = createPlaceholderFinalTranscript(session.audioChunkCount); session.audioChunkCount = 0; - updateSessionState(socket, session, 'idle'); + startMockedTurn(socket, session, { + transcript: finalTranscript, + includeListeningState: false + }); break; case 'response.cancel': clearMockedTurn(session); diff --git a/apps/vela-gateway/test/websocket-session.test.js b/apps/vela-gateway/test/websocket-session.test.js index d829146..1258a22 100644 --- a/apps/vela-gateway/test/websocket-session.test.js +++ b/apps/vela-gateway/test/websocket-session.test.js @@ -303,6 +303,26 @@ test('websocket handles valid and invalid client messages safely', async () => { type: 'transcript.final', payload: { text: '[mocked final] Placeholder push-to-talk transcript completed from 1 appended chunk.' } }); + assert.deepEqual(await client.nextMessage(), { + type: 'session.state', + payload: { value: 'thinking' } + }); + assert.deepEqual(await client.nextMessage(), { + type: 'session.state', + payload: { value: 'speaking' } + }); + assert.deepEqual(await client.nextMessage(), { + type: 'response.text.delta', + payload: { text: '[mocked assistant] ' } + }); + assert.deepEqual(await client.nextMessage(), { + type: 'response.text.delta', + payload: { text: 'This is a deterministic mocked response from the gateway vertical slice.' } + }); + assert.deepEqual(await client.nextMessage(), { + type: 'response.completed', + payload: {} + }); assert.deepEqual(await client.nextMessage(), { type: 'session.state', payload: { value: 'idle' } @@ -358,6 +378,26 @@ test('websocket accepts a placeholder input cycle before a mocked turn on the sa type: 'transcript.final', payload: { text: '[mocked final] Placeholder push-to-talk transcript completed from 1 appended chunk.' } }); + assert.deepEqual(await client.nextMessage(), { + type: 'session.state', + payload: { value: 'thinking' } + }); + assert.deepEqual(await client.nextMessage(), { + type: 'session.state', + payload: { value: 'speaking' } + }); + assert.deepEqual(await client.nextMessage(), { + type: 'response.text.delta', + payload: { text: '[mocked assistant] ' } + }); + assert.deepEqual(await client.nextMessage(), { + type: 'response.text.delta', + payload: { text: 'This is a deterministic mocked response from the gateway vertical slice.' } + }); + assert.deepEqual(await client.nextMessage(), { + type: 'response.completed', + payload: {} + }); assert.deepEqual(await client.nextMessage(), { type: 'session.state', payload: { value: 'idle' } @@ -404,6 +444,37 @@ test('websocket emits deterministic partials for repeated appends and a determin type: 'transcript.final', payload: { text: '[mocked final] Placeholder push-to-talk transcript completed from 2 appended chunks.' } }); + assert.deepEqual(await client.nextMessage(), { + type: 'session.state', + payload: { value: 'thinking' } + }); + + client.sendJson({ type: 'input_audio.commit', payload: {} }); + assert.deepEqual(await client.nextMessage(), { + type: 'error', + payload: { + code: 'mocked_turn_in_flight', + message: 'Wait for the mocked turn to finish before committing input.', + retryable: true + } + }); + + assert.deepEqual(await client.nextMessage(), { + type: 'session.state', + payload: { value: 'speaking' } + }); + assert.deepEqual(await client.nextMessage(), { + type: 'response.text.delta', + payload: { text: '[mocked assistant] ' } + }); + assert.deepEqual(await client.nextMessage(), { + type: 'response.text.delta', + payload: { text: 'This is a deterministic mocked response from the gateway vertical slice.' } + }); + assert.deepEqual(await client.nextMessage(), { + type: 'response.completed', + payload: {} + }); assert.deepEqual(await client.nextMessage(), { type: 'session.state', payload: { value: 'idle' } @@ -414,7 +485,30 @@ test('websocket emits deterministic partials for repeated appends and a determin type: 'transcript.final', payload: { text: '[mocked final] Placeholder push-to-talk transcript completed without appended audio.' } }); - await assert.rejects(() => client.nextMessage(150), /timed out waiting for websocket message/); + assert.deepEqual(await client.nextMessage(), { + type: 'session.state', + payload: { value: 'thinking' } + }); + assert.deepEqual(await client.nextMessage(), { + type: 'session.state', + payload: { value: 'speaking' } + }); + assert.deepEqual(await client.nextMessage(), { + type: 'response.text.delta', + payload: { text: '[mocked assistant] ' } + }); + assert.deepEqual(await client.nextMessage(), { + type: 'response.text.delta', + payload: { text: 'This is a deterministic mocked response from the gateway vertical slice.' } + }); + assert.deepEqual(await client.nextMessage(), { + type: 'response.completed', + payload: {} + }); + assert.deepEqual(await client.nextMessage(), { + type: 'session.state', + payload: { value: 'idle' } + }); await client.close(); } finally { @@ -542,6 +636,53 @@ test('websocket cancel stops an active mocked turn and allows a new one without } }); +test('websocket cancel stops a push-to-talk commit response and allows another turn', async () => { + const server = await startServer(); + + try { + const client = await connectWebSocket(server.port); + await client.nextMessage(); + await client.nextMessage(); + + client.sendJson({ type: 'input_audio.append', payload: { chunk: 'chunk-1' } }); + assert.deepEqual(await client.nextMessage(), { + type: 'session.state', + payload: { value: 'listening' } + }); + assert.deepEqual(await client.nextMessage(), { + type: 'transcript.partial', + payload: { text: '[mocked partial] Placeholder push-to-talk transcript in progress.' } + }); + + client.sendJson({ type: 'input_audio.commit', payload: {} }); + assert.deepEqual(await client.nextMessage(), { + type: 'transcript.final', + payload: { text: '[mocked final] Placeholder push-to-talk transcript completed from 1 appended chunk.' } + }); + assert.deepEqual(await client.nextMessage(), { + type: 'session.state', + payload: { value: 'thinking' } + }); + + client.sendJson({ type: 'response.cancel', payload: {} }); + assert.deepEqual(await client.nextMessage(), { + type: 'session.state', + payload: { value: 'idle' } + }); + await assert.rejects(() => client.nextMessage(150), /timed out waiting for websocket message/); + + client.sendJson({ type: 'mocked.turn.trigger', payload: {} }); + assert.deepEqual(await client.nextMessage(), { + type: 'session.state', + payload: { value: 'listening' } + }); + + await client.close(); + } finally { + await server.close(); + } +}); + test('websocket safely accepts cancel when no turn is active', async () => { const server = await startServer(); diff --git a/apps/vela-ui/e2e/voice-session.spec.js b/apps/vela-ui/e2e/voice-session.spec.js index b54c20d..ddaf6b2 100644 --- a/apps/vela-ui/e2e/voice-session.spec.js +++ b/apps/vela-ui/e2e/voice-session.spec.js @@ -105,6 +105,7 @@ test('voice session shell supports a placeholder mic-control cycle before anothe await expect(page.getByTestId('user-transcript')).toHaveText( '[mocked final] Placeholder push-to-talk transcript completed from 1 appended chunk.' ); + await expect(page.getByTestId('assistant-response')).toHaveText(MOCKED_ASSISTANT_RESPONSE); await expect(page.getByTestId('partial-transcript')).toHaveText('none'); await expect(page.getByTestId('gateway-session-state')).toHaveText('idle'); await expect(page.getByTestId('mocked-turn-button')).toBeEnabled(); @@ -114,3 +115,34 @@ test('voice session shell supports a placeholder mic-control cycle before anothe await expect(page.getByTestId('assistant-response')).toHaveText(MOCKED_ASSISTANT_RESPONSE); await expect(page.getByTestId('mocked-turn-status')).toHaveText('idle'); }); + +test('voice session shell can cancel a push-to-talk mocked response and start another turn', async ({ page }) => { + await page.goto('/'); + await expect(page.getByTestId('hydration-status')).toHaveText('ready'); + + await page.getByTestId('connect-button').click(); + await expect(page.getByTestId('connection-state')).toHaveText('connected'); + + await page.getByTestId('mic-control-button').dispatchEvent('mousedown'); + await expect(page.getByTestId('gateway-session-state')).toHaveText('listening'); + await page.getByTestId('mic-control-button').dispatchEvent('mouseup'); + + await expect(page.getByTestId('user-transcript')).toHaveText( + '[mocked final] Placeholder push-to-talk transcript completed from 1 appended chunk.' + ); + await expect(page.getByTestId('cancel-turn-button')).toBeEnabled(); + await expect(page.getByTestId('assistant-response')).toContainText('[mocked assistant]'); + + await page.getByTestId('cancel-turn-button').click(); + await expect(page.getByTestId('gateway-session-state')).toHaveText('idle'); + await expect(page.getByTestId('mocked-turn-status')).toHaveText('idle'); + await expect(page.getByTestId('user-transcript')).toHaveText( + '[mocked final] Placeholder push-to-talk transcript completed from 1 appended chunk.' + ); + await expect(page.getByTestId('assistant-response')).toContainText('[mocked assistant]'); + await expect(page.getByTestId('mocked-turn-button')).toBeEnabled(); + + await page.getByTestId('mocked-turn-button').click(); + await expect(page.getByTestId('assistant-response')).toHaveText(MOCKED_ASSISTANT_RESPONSE); + await expect(page.getByTestId('mocked-turn-status')).toHaveText('idle'); +}); diff --git a/apps/vela-ui/src/lib/VoiceSessionShell.svelte b/apps/vela-ui/src/lib/VoiceSessionShell.svelte index e29d67b..8e8d3b2 100644 --- a/apps/vela-ui/src/lib/VoiceSessionShell.svelte +++ b/apps/vela-ui/src/lib/VoiceSessionShell.svelte @@ -53,6 +53,7 @@ let inProgressPartialTranscript = 'none'; let mockedAssistantResponse = 'none'; let mockedTurnInFlight = false; + let assistantResponseExpected = false; let mockedConversationRenderOrder = []; let micControlActive = false; let hydrationStatus = 'mounting'; @@ -78,6 +79,7 @@ connectionState === 'connected' && socket?.readyState === WebSocket.OPEN && sessionReadyReceived && + gatewaySessionState === 'idle' && !mockedTurnInFlight; function clearSocketHandlers(targetSocket) { @@ -96,6 +98,7 @@ inProgressPartialTranscript = 'none'; mockedAssistantResponse = 'none'; mockedTurnInFlight = false; + assistantResponseExpected = false; mockedConversationRenderOrder = []; micControlActive = false; } @@ -125,6 +128,7 @@ } connectionDetail = 'Mic control shell released. Sending placeholder input_audio.commit.'; + assistantResponseExpected = true; socket.send(JSON.stringify(createMessageEnvelope('input_audio.commit', {}))); } @@ -150,6 +154,7 @@ inProgressPartialTranscript = 'none'; mockedAssistantResponse = 'waiting for mocked response…'; mockedTurnInFlight = true; + assistantResponseExpected = true; lastError = 'none'; socket.send(JSON.stringify(createMessageEnvelope('mocked.turn.trigger', {}))); } @@ -235,6 +240,7 @@ gatewaySessionState = message.payload.value; if (message.payload.value === 'idle') { mockedTurnInFlight = false; + assistantResponseExpected = false; } return; } @@ -242,7 +248,8 @@ if (message.type === 'transcript.final') { inProgressPartialTranscript = 'none'; mockedUserTranscript = message.payload.text; - if (mockedTurnInFlight) { + if (assistantResponseExpected) { + mockedTurnInFlight = true; mockedAssistantResponse = '…'; } if (!mockedConversationRenderOrder.includes('transcript')) { @@ -276,12 +283,14 @@ if (message.type === 'response.completed') { mockedTurnInFlight = false; + assistantResponseExpected = false; return; } if (message.type === 'error') { if (message.payload.retryable === false) { mockedTurnInFlight = false; + assistantResponseExpected = false; connectionState = 'error'; connectionDetail = 'Gateway reported a protocol error.'; } else { diff --git a/apps/vela-ui/tests/voice-session.test.js b/apps/vela-ui/tests/voice-session.test.js index de457f9..d8284f6 100644 --- a/apps/vela-ui/tests/voice-session.test.js +++ b/apps/vela-ui/tests/voice-session.test.js @@ -93,7 +93,7 @@ describe('voice session shell', () => { }); }); - it('runs a placeholder mic-control cycle and keeps mocked turn usable on the same socket', async () => { + it('runs a placeholder mic-control cycle, streams mocked assistant text, and keeps mocked turn usable on the same socket', async () => { render(VoiceSessionShell); await fireEvent.click(getByTestId('connect-button')); @@ -154,6 +154,15 @@ describe('voice session shell', () => { text: '[mocked final] Placeholder push-to-talk transcript completed from 1 appended chunk.' }) ); + socket.message(createMessageEnvelope('session.state', { value: 'thinking' })); + socket.message(createMessageEnvelope('session.state', { value: 'speaking' })); + socket.message(createMessageEnvelope('response.text.delta', { text: '[mocked assistant] ' })); + socket.message( + createMessageEnvelope('response.text.delta', { + text: 'This is a deterministic mocked response from the gateway vertical slice.' + }) + ); + socket.message(createMessageEnvelope('response.completed', {})); socket.message(createMessageEnvelope('session.state', { value: 'idle' })); @@ -163,6 +172,9 @@ describe('voice session shell', () => { expect(getByTestId('user-transcript').textContent).toBe( '[mocked final] Placeholder push-to-talk transcript completed from 1 appended chunk.' ); + expect(getByTestId('assistant-response').textContent).toBe( + '[mocked assistant] This is a deterministic mocked response from the gateway vertical slice.' + ); expect(getByTestId('partial-transcript').textContent).toBe('none'); }); @@ -172,6 +184,61 @@ describe('voice session shell', () => { expect(JSON.parse(socket.sent[2]).type).toBe('mocked.turn.trigger'); }); + it('keeps rendered push-to-talk transcript and assistant text visible after cancel', async () => { + render(VoiceSessionShell); + + await fireEvent.click(getByTestId('connect-button')); + const socket = MockWebSocket.latest(); + socket.open(); + socket.message(createMessageEnvelope('session.ready', { sessionId: 'session-ptt-cancel' })); + socket.message(createMessageEnvelope('session.state', { value: 'idle' })); + + await waitFor(() => { + expect(getByTestId('mic-control-button').hasAttribute('disabled')).toBe(false); + }); + + await fireEvent.mouseDown(getByTestId('mic-control-button')); + socket.message(createMessageEnvelope('session.state', { value: 'listening' })); + socket.message( + createMessageEnvelope('transcript.partial', { + text: '[mocked partial] Placeholder push-to-talk transcript in progress.' + }) + ); + + await fireEvent.mouseUp(getByTestId('mic-control-button')); + expect(JSON.parse(socket.sent[1]).type).toBe('input_audio.commit'); + + socket.message( + createMessageEnvelope('transcript.final', { + text: '[mocked final] Placeholder push-to-talk transcript completed from 1 appended chunk.' + }) + ); + socket.message(createMessageEnvelope('session.state', { value: 'thinking' })); + socket.message(createMessageEnvelope('session.state', { value: 'speaking' })); + socket.message(createMessageEnvelope('response.text.delta', { text: '[mocked assistant] ' })); + + await waitFor(() => { + expect(getByTestId('cancel-turn-button').hasAttribute('disabled')).toBe(false); + expect(getByTestId('assistant-response').textContent).toBe('[mocked assistant] '); + }); + + await fireEvent.click(getByTestId('cancel-turn-button')); + expect(JSON.parse(socket.sent[2]).type).toBe('response.cancel'); + + socket.message(createMessageEnvelope('session.state', { value: 'idle' })); + + await waitFor(() => { + expect(getByTestId('gateway-session-state').textContent).toBe('idle'); + expect(getByTestId('mocked-turn-status').textContent).toBe('idle'); + expect(getByTestId('cancel-turn-button').hasAttribute('disabled')).toBe(true); + expect(getByTestId('user-transcript').textContent).toBe( + '[mocked final] Placeholder push-to-talk transcript completed from 1 appended chunk.' + ); + expect(getByTestId('assistant-response').textContent).toBe('[mocked assistant] '); + expect(getByTestId('mic-control-button').hasAttribute('disabled')).toBe(false); + }); + }); + it('renders mocked transcript before assistant response for a connected session', async () => { render(VoiceSessionShell); diff --git a/docs/architecture.md b/docs/architecture.md index b479d0c..1ceefa0 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -36,7 +36,7 @@ The repository now includes separate runnable workspaces for the UI and gateway - PWA enabled - WebSocket client -The current implementation is a minimal SvelteKit app with a single voice-session shell page. The shipped UI can open and close a browser WebSocket connection to the gateway `/ws` endpoint, show explicit connection status (`not connected`, `connecting`, `connected`, `disconnected`, `error`), expose mic control shell interactions that emit placeholder `input_audio.append` / `input_audio.commit` events, trigger one deterministic mocked turn while connected, render deterministic placeholder partial/final transcripts for the push-to-talk shell, and render the mocked user transcript plus mocked assistant response for the existing mocked-turn path. This remains a shell only: there is no real microphone capture, real provider integration, or audio playback yet. +The current implementation is a minimal SvelteKit app with a single voice-session shell page. The shipped UI can open and close a browser WebSocket connection to the gateway `/ws` endpoint, show explicit connection status (`not connected`, `connecting`, `connected`, `disconnected`, `error`), expose mic control shell interactions that emit placeholder `input_audio.append` / `input_audio.commit` events, trigger one deterministic mocked turn while connected, render deterministic placeholder partial/final transcripts for the push-to-talk shell, and stream the mocked assistant response both for `mocked.turn.trigger` and for push-to-talk commits. This remains a shell only: there is no real microphone capture, real provider integration, or audio playback yet. #### Responsibilities @@ -104,8 +104,8 @@ The current implementation is a minimal Fastify service with `/`, `/health`, and - `GET /ws` documents the route for plain HTTP clients and returns `426 Upgrade Required` - WebSocket upgrades on `/ws` create an ephemeral session immediately - the gateway sends `session.ready` followed by `session.state` (`idle`) when the socket is established -- valid minimal client events, including placeholder `input_audio.append` / `input_audio.commit`, can move the session between `idle` and `listening` -- placeholder `input_audio.append` emits deterministic mocked `transcript.partial` events and `input_audio.commit` emits one deterministic mocked `transcript.final` +- valid minimal client events, including placeholder `input_audio.append` / `input_audio.commit`, can move the session through the mocked turn states on one socket +- placeholder `input_audio.append` emits deterministic mocked `transcript.partial` events and `input_audio.commit` emits one deterministic mocked `transcript.final` before starting the existing mocked assistant response stream - `mocked.turn.trigger` drives a fixed transcript/response event sequence over the existing shared protocol - only one mocked turn is allowed in flight per session at a time - invalid JSON, invalid envelopes, and malformed frames are handled defensively so the process stays up @@ -116,13 +116,13 @@ The current implementation is a minimal Fastify service with `/`, `/health`, and - exposes connect, disconnect, mic-control shell interactions, and mocked-turn controls - does not request microphone permission or capture real microphone audio - only emits placeholder `input_audio.append` / `input_audio.commit` events; it does not send real audio data or play back audio -- renders the latest placeholder partial transcript during a push-to-talk shell turn and replaces it with the final deterministic transcript on commit +- renders the latest placeholder partial transcript during a push-to-talk shell turn, replaces it with the final deterministic transcript on commit, and appends streamed mocked assistant text for that same push-to-talk turn - reads mocked transcript and mocked response events from the shared protocol contract ## Voice Pipeline ```text -Mic control shell / mocked turn button → Placeholder `input_audio.append` / `input_audio.commit` or mocked session flow → Deterministic transcript events → Mocked response text events when using mocked.turn.trigger → UI +Mic control shell / mocked turn button → Placeholder `input_audio.append` / `input_audio.commit` or mocked session flow → Deterministic transcript events → Shared mocked response engine → Mocked response text events → UI ``` This mocked vertical slice intentionally stands in for the future real pipeline: diff --git a/docs/backlog.md b/docs/backlog.md index 1b3b3d5..3dfd726 100644 --- a/docs/backlog.md +++ b/docs/backlog.md @@ -190,9 +190,9 @@ Polish the system after the core voice loop is reliable. - `apps/vela-gateway` now exposes a minimal `/ws` WebSocket session skeleton with ephemeral in-memory sessions and defensive message handling - `apps/vela-gateway` now accepts `mocked.turn.trigger` and emits protocol-valid mocked transcript/response events with one in-flight mocked turn per session - `apps/vela-gateway` now supports placeholder input-audio append/commit cycles before running another mocked turn on the same socket -- `apps/vela-gateway` now emits deterministic `transcript.partial` events for placeholder `input_audio.append` messages and exactly one deterministic `transcript.final` for each placeholder `input_audio.commit` -- `apps/vela-ui` now renders the latest placeholder partial transcript during the push-to-talk shell turn and replaces it with the deterministic final transcript on commit -- `apps/vela-ui` now exposes a cancel control for active mocked turns and keeps already-rendered transcript/response text visible after cancellation -- `apps/vela-gateway` now honors `response.cancel` during mocked turns by stopping pending mocked response events, returning the session to `idle`, and allowing a new mocked turn on the same socket +- `apps/vela-gateway` now emits deterministic `transcript.partial` events for placeholder `input_audio.append` messages and, after each accepted `input_audio.commit`, reuses the mocked response engine to stream a deterministic assistant reply for that push-to-talk turn +- `apps/vela-ui` now renders the latest placeholder partial transcript during the push-to-talk shell turn, replaces it with the deterministic final transcript on commit, and shows streamed assistant text for the same push-to-talk flow +- `apps/vela-ui` now exposes a cancel control for active mocked turns and mocked push-to-talk responses, and keeps already-rendered transcript/response text visible after cancellation +- `apps/vela-gateway` now honors `response.cancel` during mocked turns and push-to-talk-triggered mocked responses by stopping pending mocked response events, returning the session to `idle`, and allowing a new turn on the same socket - `apps/vela-protocol` now provides the shared WebSocket event contract for the UI and gateway - backend framework choice is now concrete: Fastify diff --git a/docs/protocol.md b/docs/protocol.md index f3dc927..4550dba 100644 --- a/docs/protocol.md +++ b/docs/protocol.md @@ -63,7 +63,7 @@ type ClientEvent = - a mocked turn emits deterministic `transcript.final`, `response.text.delta`, `response.completed`, and `session.state` events in protocol-valid order - `input_audio.append` updates the ephemeral session record and moves the session to `listening` - each accepted `input_audio.append` emits one deterministic `transcript.partial` for the current placeholder turn -- `input_audio.commit` emits exactly one deterministic `transcript.final`, resets the minimal buffered state, and returns the session to `idle` +- `input_audio.commit` emits exactly one deterministic `transcript.final` and then starts the same deterministic mocked assistant response stream used by `mocked.turn.trigger` - after a completed placeholder input cycle, the same socket can still send `mocked.turn.trigger` - `response.cancel` is safe to send even when no mocked turn is active - `response.cancel` stops any still-pending mocked turn events for the active turn and resets the minimal session state back to `idle` @@ -92,7 +92,7 @@ Notes: - the UI disables the mic control while disconnected, before `session.ready`, or while a mocked turn is already in flight - pressing the mic control sends one placeholder `input_audio.append` chunk and releasing it sends `input_audio.commit` - while a placeholder push-to-talk turn is in progress, the UI renders the latest `transcript.partial` -- after placeholder commit, the UI renders the `transcript.final` and clears the partial-only display +- after placeholder commit, the UI renders the `transcript.final`, clears the partial-only display, and streams the mocked assistant text from the downstream response events - the UI copy explicitly labels the mic button as a control shell and not real microphone capture - the UI shows a cancel control and enables it only while a mocked turn is active - after cancel returns the gateway to `idle`, the UI clears the active-turn indicator but keeps any transcript or response text that was already rendered @@ -147,9 +147,9 @@ Notes: - no audio, STT, LLM, TTS, or external providers participate in this flow - `response.cancel` can stop the mocked turn early, suppress any later mocked response events for that turn, and return the session to `idle` -### Deterministic placeholder push-to-talk transcript sequence +### Deterministic placeholder push-to-talk transcript and mocked response sequence -For this increment, the existing mic-control shell still sends placeholder `input_audio.append` on press and `input_audio.commit` on release. The gateway now translates that shell flow into deterministic mocked transcript events only: +For this increment, the existing mic-control shell still sends placeholder `input_audio.append` on press and `input_audio.commit` on release. The gateway now translates that shell flow into deterministic mocked transcript events and then reuses the existing mocked response stream: ```text input_audio.append #1 @@ -161,6 +161,11 @@ input_audio.append #N (N > 1) input_audio.commit after N appends → transcript.final("[mocked final] Placeholder push-to-talk transcript completed from N appended chunk(s).") +→ session.state(thinking) +→ session.state(speaking) +→ response.text.delta("[mocked assistant] ") +→ response.text.delta("This is a deterministic mocked response from the gateway vertical slice.") +→ response.completed → session.state(idle) ``` @@ -168,7 +173,8 @@ Safe deterministic edge cases for this mocked placeholder flow: - commit without any prior append is accepted and emits `transcript.final("[mocked final] Placeholder push-to-talk transcript completed without appended audio.")` - repeated appends during one placeholder turn are accepted and each append replaces the latest partial transcript with a chunk-count-based deterministic value -- placeholder commit does not automatically start assistant thinking, response streaming, or audio playback +- after the final transcript, placeholder commit follows the same mocked `thinking → speaking → response.text.delta* → response.completed → idle` path as `mocked.turn.trigger` +- `response.cancel` can interrupt this mocked post-commit response path the same way it interrupts `mocked.turn.trigger`; already-rendered transcript or assistant text is not retracted ## Contract Scope for This Increment @@ -200,6 +206,7 @@ Current mocked-pipeline behavior: - during an active mocked turn, `response.cancel` returns the session to `idle` immediately - any mocked turn timers that have not fired yet are dropped, so no later `response.text.delta` or `response.completed` events are emitted for the cancelled turn +- the same cancellation behavior applies when a mocked turn was started by `input_audio.commit` - once `idle` is restored, the same WebSocket session can start another mocked turn without reconnecting More general future-state expectations: