From 103bb1195487dfc9fedf36360e50dfcd5b595852 Mon Sep 17 00:00:00 2001 From: Johannes Kresner Date: Wed, 8 Apr 2026 20:04:32 +0200 Subject: [PATCH] feat(vela-ui): add placeholder push-to-talk control shell --- .../test/websocket-session.test.js | 32 +++++++++ apps/vela-ui/e2e/voice-session.spec.js | 27 +++++++ apps/vela-ui/src/lib/VoiceSessionShell.svelte | 71 ++++++++++++++++++- apps/vela-ui/tests/voice-session.test.js | 61 ++++++++++++++++ docs/architecture.md | 12 ++-- docs/backlog.md | 4 +- docs/protocol.md | 5 ++ 7 files changed, 204 insertions(+), 8 deletions(-) diff --git a/apps/vela-gateway/test/websocket-session.test.js b/apps/vela-gateway/test/websocket-session.test.js index 689d542..47cb43b 100644 --- a/apps/vela-gateway/test/websocket-session.test.js +++ b/apps/vela-gateway/test/websocket-session.test.js @@ -327,6 +327,38 @@ test('websocket handles valid and invalid client messages safely', async () => { } }); +test('websocket accepts a placeholder input cycle before a mocked turn on the same socket', async () => { + const server = await startServer(); + + try { + const client = await connectWebSocket(server.port); + await client.nextMessage(); + await client.nextMessage(); + + client.sendJson({ type: 'input_audio.append', payload: { chunk: 'placeholder-control-shell-chunk' } }); + assert.deepEqual(await client.nextMessage(), { + type: 'session.state', + payload: { value: 'listening' } + }); + + client.sendJson({ type: 'input_audio.commit', payload: {} }); + assert.deepEqual(await client.nextMessage(), { + type: 'session.state', + payload: { value: 'idle' } + }); + + client.sendJson({ type: 'mocked.turn.trigger', payload: {} }); + assert.deepEqual(await client.nextMessage(), { + type: 'session.state', + payload: { value: 'listening' } + }); + + await client.close(); + } finally { + await server.close(); + } +}); + test('websocket mocked turn emits deterministic transcript and response events in order', async () => { const server = await startServer(); diff --git a/apps/vela-ui/e2e/voice-session.spec.js b/apps/vela-ui/e2e/voice-session.spec.js index bf72cd3..0cced90 100644 --- a/apps/vela-ui/e2e/voice-session.spec.js +++ b/apps/vela-ui/e2e/voice-session.spec.js @@ -77,3 +77,30 @@ test('voice session shell can cancel an active mocked turn and start another one await expect(page.getByTestId('assistant-response')).toHaveText(MOCKED_ASSISTANT_RESPONSE); await expect(page.getByTestId('mocked-turn-status')).toHaveText('idle'); }); + +test('voice session shell supports a placeholder mic-control cycle before another mocked turn', async ({ page }) => { + await page.goto('/'); + await expect(page.getByTestId('hydration-status')).toHaveText('ready'); + await expect(page.getByTestId('mic-control-button')).toBeDisabled(); + + await page.getByTestId('connect-button').click(); + + await expect(page.getByTestId('connection-state')).toHaveText('connected'); + await expect(page.getByTestId('gateway-session-state')).toHaveText('idle'); + await expect(page.getByTestId('mic-control-button')).toBeEnabled(); + + await page.getByTestId('mic-control-button').dispatchEvent('mousedown'); + await expect(page.getByTestId('mic-control-status')).toHaveText('holding'); + await expect(page.getByTestId('gateway-session-state')).toHaveText('listening'); + await expect(page.getByTestId('mocked-turn-button')).toBeDisabled(); + + await page.getByTestId('mic-control-button').dispatchEvent('mouseup'); + await expect(page.getByTestId('mic-control-status')).toHaveText('idle'); + await expect(page.getByTestId('gateway-session-state')).toHaveText('idle'); + await expect(page.getByTestId('mocked-turn-button')).toBeEnabled(); + + await page.getByTestId('mocked-turn-button').click(); + await expect(page.getByTestId('mocked-turn-status')).toHaveText('running'); + await expect(page.getByTestId('assistant-response')).toHaveText(MOCKED_ASSISTANT_RESPONSE); + await expect(page.getByTestId('mocked-turn-status')).toHaveText('idle'); +}); diff --git a/apps/vela-ui/src/lib/VoiceSessionShell.svelte b/apps/vela-ui/src/lib/VoiceSessionShell.svelte index 184fe3d..b722239 100644 --- a/apps/vela-ui/src/lib/VoiceSessionShell.svelte +++ b/apps/vela-ui/src/lib/VoiceSessionShell.svelte @@ -53,6 +53,7 @@ let mockedAssistantResponse = 'none'; let mockedTurnInFlight = false; let mockedConversationRenderOrder = []; + let micControlActive = false; let hydrationStatus = 'mounting'; $: canTriggerMockedTurn = @@ -60,6 +61,8 @@ connectionState === 'connected' && socket?.readyState === WebSocket.OPEN && sessionReadyReceived && + gatewaySessionState === 'idle' && + !micControlActive && !mockedTurnInFlight; $: canCancelMockedTurn = @@ -69,6 +72,13 @@ sessionReadyReceived && mockedTurnInFlight; + $: canUseMicControl = + typeof WebSocket !== 'undefined' && + connectionState === 'connected' && + socket?.readyState === WebSocket.OPEN && + sessionReadyReceived && + !mockedTurnInFlight; + function clearSocketHandlers(targetSocket) { targetSocket.onopen = null; targetSocket.onmessage = null; @@ -85,6 +95,35 @@ mockedAssistantResponse = 'none'; mockedTurnInFlight = false; mockedConversationRenderOrder = []; + micControlActive = false; + } + + function startMicControl() { + if (!canUseMicControl || micControlActive) { + return; + } + + micControlActive = true; + lastError = 'none'; + connectionDetail = 'Mic control shell active. Sending placeholder input_audio.append only.'; + socket.send( + JSON.stringify(createMessageEnvelope('input_audio.append', { chunk: 'placeholder-control-shell-chunk' })) + ); + } + + function stopMicControl() { + if (!micControlActive) { + return; + } + + micControlActive = false; + + if (!socket || socket.readyState !== WebSocket.OPEN || connectionState !== 'connected' || !sessionReadyReceived) { + return; + } + + connectionDetail = 'Mic control shell released. Sending placeholder input_audio.commit.'; + socket.send(JSON.stringify(createMessageEnvelope('input_audio.commit', {}))); } function triggerMockedTurn() { @@ -252,6 +291,7 @@ lastClose = formatCloseReason(event); mockedTurnInFlight = false; + micControlActive = false; connectionState = connectionState === 'error' ? 'error' : 'disconnected'; connectionDetail = connectionState === 'error' ? 'Socket closed after an error.' : 'Gateway WebSocket is closed.'; @@ -298,7 +338,13 @@

Voice session shell

This minimal browser shell can connect to the gateway WebSocket, trigger one deterministic - mocked turn, and render the mocked transcript plus assistant response for the active session. + mocked turn, and expose a push-to-talk control shell that only sends placeholder protocol + events for the active session. +

+ +

+ The mic button in this increment does not request browser microphone permission and does not + capture real audio.

@@ -308,6 +354,20 @@

+
+
+ Mic control shell + {micControlActive ? 'holding' : 'idle'} +
Mocked turn status {mockedTurnInFlight ? 'running' : 'idle'} @@ -464,6 +528,11 @@ opacity: 0.55; } + button.mic-active { + background: #7d2034; + border-color: #ff7d9a; + } + .conversation { margin-top: 1.5rem; display: grid; diff --git a/apps/vela-ui/tests/voice-session.test.js b/apps/vela-ui/tests/voice-session.test.js index 93d1cde..4ea9253 100644 --- a/apps/vela-ui/tests/voice-session.test.js +++ b/apps/vela-ui/tests/voice-session.test.js @@ -68,6 +68,7 @@ describe('voice session shell', () => { expect(getByTestId('connection-state').textContent).toBe('not connected'); expect(getByTestId('mocked-turn-button').hasAttribute('disabled')).toBe(true); + expect(getByTestId('mic-control-button').hasAttribute('disabled')).toBe(true); await fireEvent.click(getByTestId('connect-button')); const socket = MockWebSocket.latest(); @@ -78,6 +79,7 @@ describe('voice session shell', () => { await waitFor(() => { expect(getByTestId('connection-state').textContent).toBe('connected'); expect(getByTestId('mocked-turn-button').hasAttribute('disabled')).toBe(false); + expect(getByTestId('mic-control-button').hasAttribute('disabled')).toBe(false); }); await fireEvent.click(getByTestId('disconnect-button')); @@ -86,10 +88,69 @@ describe('voice session shell', () => { await waitFor(() => { expect(getByTestId('connection-state').textContent).toBe('disconnected'); expect(getByTestId('mocked-turn-button').hasAttribute('disabled')).toBe(true); + expect(getByTestId('mic-control-button').hasAttribute('disabled')).toBe(true); expect(getByTestId('session-id').textContent).toBe('session-123'); }); }); + it('runs a placeholder mic-control cycle and keeps mocked turn usable on the same socket', async () => { + render(VoiceSessionShell); + + await fireEvent.click(getByTestId('connect-button')); + const socket = MockWebSocket.latest(); + socket.open(); + + await waitFor(() => { + expect(getByTestId('connection-state').textContent).toBe('connected'); + }); + + expect(getByTestId('mic-control-button').hasAttribute('disabled')).toBe(true); + + socket.message(createMessageEnvelope('session.ready', { sessionId: 'session-mic' })); + socket.message(createMessageEnvelope('session.state', { value: 'idle' })); + + await waitFor(() => { + expect(getByTestId('mic-control-button').hasAttribute('disabled')).toBe(false); + }); + + await fireEvent.mouseDown(getByTestId('mic-control-button')); + + expect(socket.sent).toHaveLength(1); + expect(JSON.parse(socket.sent[0])).toEqual({ + type: 'input_audio.append', + payload: { chunk: 'placeholder-control-shell-chunk' } + }); + expect(getByTestId('mic-control-status').textContent).toBe('holding'); + + socket.message(createMessageEnvelope('session.state', { value: 'listening' })); + + await waitFor(() => { + expect(getByTestId('gateway-session-state').textContent).toBe('listening'); + expect(getByTestId('mocked-turn-button').hasAttribute('disabled')).toBe(true); + }); + + await fireEvent.mouseUp(getByTestId('mic-control-button')); + + expect(socket.sent).toHaveLength(2); + expect(JSON.parse(socket.sent[1])).toEqual({ + type: 'input_audio.commit', + payload: {} + }); + expect(getByTestId('mic-control-status').textContent).toBe('idle'); + + socket.message(createMessageEnvelope('session.state', { value: 'idle' })); + + await waitFor(() => { + expect(getByTestId('gateway-session-state').textContent).toBe('idle'); + expect(getByTestId('mocked-turn-button').hasAttribute('disabled')).toBe(false); + }); + + await fireEvent.click(getByTestId('mocked-turn-button')); + + expect(socket.sent).toHaveLength(3); + expect(JSON.parse(socket.sent[2]).type).toBe('mocked.turn.trigger'); + }); + it('renders mocked transcript before assistant response for a connected session', async () => { render(VoiceSessionShell); diff --git a/docs/architecture.md b/docs/architecture.md index 592cdc3..515b9d2 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -36,7 +36,7 @@ The repository now includes separate runnable workspaces for the UI and gateway - PWA enabled - WebSocket client -The current implementation is a minimal SvelteKit app with a single voice-session shell page. The shipped UI can open and close a browser WebSocket connection to the gateway `/ws` endpoint, show explicit connection status (`not connected`, `connecting`, `connected`, `disconnected`, `error`), trigger one deterministic mocked turn while connected, and render the mocked user transcript plus mocked assistant response for the active session. Microphone capture, real provider integration, and audio playback are still future work. +The current implementation is a minimal SvelteKit app with a single voice-session shell page. The shipped UI can open and close a browser WebSocket connection to the gateway `/ws` endpoint, show explicit connection status (`not connected`, `connecting`, `connected`, `disconnected`, `error`), expose mic control shell interactions that emit placeholder `input_audio.append` / `input_audio.commit` events, trigger one deterministic mocked turn while connected, and render the mocked user transcript plus mocked assistant response for the active session. This remains a shell only: there is no real microphone capture, real provider integration, or audio playback yet. #### Responsibilities @@ -104,7 +104,7 @@ The current implementation is a minimal Fastify service with `/`, `/health`, and - `GET /ws` documents the route for plain HTTP clients and returns `426 Upgrade Required` - WebSocket upgrades on `/ws` create an ephemeral session immediately - the gateway sends `session.ready` followed by `session.state` (`idle`) when the socket is established -- valid minimal client events can move the session between `idle` and `listening` +- valid minimal client events, including placeholder `input_audio.append` / `input_audio.commit`, can move the session between `idle` and `listening` - `mocked.turn.trigger` drives a fixed transcript/response event sequence over the existing shared protocol - only one mocked turn is allowed in flight per session at a time - invalid JSON, invalid envelopes, and malformed frames are handled defensively so the process stays up @@ -112,15 +112,15 @@ The current implementation is a minimal Fastify service with `/`, `/health`, and ### Current UI shell behavior - renders a minimal developer-focused voice-session panel -- exposes connect, disconnect, and mocked-turn controls -- does not request microphone permission -- does not send or process audio data +- exposes connect, disconnect, mic-control shell interactions, and mocked-turn controls +- does not request microphone permission or capture real microphone audio +- only emits placeholder `input_audio.append` / `input_audio.commit` events; it does not send real audio data or play back audio - reads mocked transcript and mocked response events from the shared protocol contract ## Voice Pipeline ```text -Mocked turn button → Gateway mocked session flow → Transcript events → Response text events → UI +Mic control shell / mocked turn button → Placeholder `input_audio.append` / `input_audio.commit` or mocked session flow → Transcript events → Response text events → UI ``` This mocked vertical slice intentionally stands in for the future real pipeline: diff --git a/docs/backlog.md b/docs/backlog.md index 3073bba..225ab11 100644 --- a/docs/backlog.md +++ b/docs/backlog.md @@ -35,7 +35,7 @@ Prove the end-to-end interaction model with mocked or stubbed providers. - [x] bootstrap `vela-gateway` as a runnable Fastify app in the Yarn workspace - [x] add the first UI voice-session shell with connect/disconnect controls and explicit WebSocket status - [x] create a minimal mocked-turn UI with transcript and response text over the shared WebSocket session -- create a minimal UI with mic control +- [x] create a minimal UI with mic control - [x] create a gateway WebSocket session skeleton - [x] implement a mocked transcript/response vertical slice over the existing WebSocket session - implement mocked STT flow for partial transcript events @@ -184,10 +184,12 @@ Polish the system after the core voice loop is reliable. - `apps/vela-ui` now boots as a minimal SvelteKit app with a starter page - `apps/vela-ui` now includes a minimal voice-session shell that can connect to the gateway `/ws` endpoint and display developer-visible session status - `apps/vela-ui` can now trigger one deterministic mocked turn while connected and render the mocked transcript plus assistant response for the active session +- `apps/vela-ui` now exposes a visible push-to-talk mic control shell that sends placeholder `input_audio.append` / `input_audio.commit` events without requesting browser mic permission or capturing real audio - `apps/vela-ui` now includes browser-level coverage for the mocked transcript/response slice, including connect, disconnect, and disconnected-state trigger guarding - `apps/vela-gateway` now boots as a minimal Fastify app with `/` and `/health` endpoints - `apps/vela-gateway` now exposes a minimal `/ws` WebSocket session skeleton with ephemeral in-memory sessions and defensive message handling - `apps/vela-gateway` now accepts `mocked.turn.trigger` and emits protocol-valid mocked transcript/response events with one in-flight mocked turn per session +- `apps/vela-gateway` now supports placeholder input-audio append/commit cycles before running another mocked turn on the same socket - `apps/vela-ui` now exposes a cancel control for active mocked turns and keeps already-rendered transcript/response text visible after cancellation - `apps/vela-gateway` now honors `response.cancel` during mocked turns by stopping pending mocked response events, returning the session to `idle`, and allowing a new mocked turn on the same socket - `apps/vela-protocol` now provides the shared WebSocket event contract for the UI and gateway diff --git a/docs/protocol.md b/docs/protocol.md index 3d3b864..a53ebf5 100644 --- a/docs/protocol.md +++ b/docs/protocol.md @@ -16,6 +16,7 @@ Current UI baseline: - the browser opens a WebSocket directly to `/ws` - the UI tracks connection status separately from gateway session status - the UI can send `mocked.turn.trigger` after `session.ready` while connected to request one deterministic mocked turn for the active session +- the UI exposes a push-to-talk mic control shell that sends placeholder `input_audio.append` on press and `input_audio.commit` on release without capturing real audio ## WebSocket Message Envelope @@ -62,6 +63,7 @@ type ClientEvent = - a mocked turn emits deterministic `transcript.final`, `response.text.delta`, `response.completed`, and `session.state` events in protocol-valid order - `input_audio.append` updates the ephemeral session record and moves the session to `listening` - `input_audio.commit` resets the minimal buffered state and returns the session to `idle` +- after a completed placeholder input cycle, the same socket can still send `mocked.turn.trigger` - `response.cancel` is safe to send even when no mocked turn is active - `response.cancel` stops any still-pending mocked turn events for the active turn and resets the minimal session state back to `idle` - a second mocked-turn trigger during an active mocked turn produces `error` with code `mocked_turn_in_flight` @@ -86,6 +88,9 @@ Notes: - this UI state is transport-oriented and is separate from the shared gateway `session.state` payload - `session.state` currently reflects the gateway session phase (`idle`, `listening`, `thinking`, `speaking`) - the UI disables the mocked-turn control until `session.ready` arrives, while disconnected, or while a mocked turn is already in flight +- the UI disables the mic control while disconnected, before `session.ready`, or while a mocked turn is already in flight +- pressing the mic control sends one placeholder `input_audio.append` chunk and releasing it sends `input_audio.commit` +- the UI copy explicitly labels the mic button as a control shell and not real microphone capture - the UI shows a cancel control and enables it only while a mocked turn is active - after cancel returns the gateway to `idle`, the UI clears the active-turn indicator but keeps any transcript or response text that was already rendered - the UI treats malformed server messages, browser WebSocket errors, and gateway `error` events as safe error states instead of throwing