diff --git a/apps/vela-gateway/package.json b/apps/vela-gateway/package.json index 25cd092..1cd96b2 100644 --- a/apps/vela-gateway/package.json +++ b/apps/vela-gateway/package.json @@ -8,7 +8,8 @@ "start": "node src/index.js", "build": "node -e \"console.log('vela-gateway: no build step required')\"" }, - "dependencies": { - "fastify": "^5.2.1" - } + "dependencies": { + "@vela/protocol": "0.0.0", + "fastify": "^5.2.1" + } } diff --git a/apps/vela-gateway/src/index.js b/apps/vela-gateway/src/index.js index 57b8a5b..c5b8d2d 100644 --- a/apps/vela-gateway/src/index.js +++ b/apps/vela-gateway/src/index.js @@ -1,4 +1,9 @@ const Fastify = require('fastify'); +const { + CLIENT_EVENT_TYPES, + PROTOCOL_PACKAGE_NAME, + SERVER_EVENT_TYPES +} = require('@vela/protocol'); function buildServer() { const app = Fastify({ logger: true }); @@ -7,7 +12,12 @@ function buildServer() { service: 'vela-gateway', status: 'ok', transport: 'http', - next: 'websocket session skeleton' + next: 'websocket session skeleton', + protocol: { + package: PROTOCOL_PACKAGE_NAME, + clientEventCount: CLIENT_EVENT_TYPES.length, + serverEventCount: SERVER_EVENT_TYPES.length + } })); app.get('/health', async () => ({ status: 'ok' })); diff --git a/apps/vela-protocol/package.json b/apps/vela-protocol/package.json new file mode 100644 index 0000000..d61c8aa --- /dev/null +++ b/apps/vela-protocol/package.json @@ -0,0 +1,13 @@ +{ + "name": "@vela/protocol", + "private": true, + "version": "0.0.0", + "type": "module", + "exports": { + ".": { + "types": "./src/index.d.ts", + "import": "./src/index.js", + "require": "./src/index.cjs" + } + } +} diff --git a/apps/vela-protocol/src/index.cjs b/apps/vela-protocol/src/index.cjs new file mode 100644 index 0000000..3391470 --- /dev/null +++ b/apps/vela-protocol/src/index.cjs @@ -0,0 +1,52 @@ +const PROTOCOL_PACKAGE_NAME = '@vela/protocol'; + +const SESSION_STATES = Object.freeze(['idle', 'listening', 'thinking', 'speaking']); + +const CLIENT_EVENT_TYPES = 
Object.freeze([ + 'session.start', + 'input_audio.append', + 'input_audio.commit', + 'response.cancel' +]); + +const SERVER_EVENT_TYPES = Object.freeze([ + 'session.ready', + 'session.state', + 'transcript.partial', + 'transcript.final', + 'response.text.delta', + 'response.completed', + 'error' +]); + +function createMessageEnvelope(type, payload) { + return { type, payload }; +} + +function isMessageEnvelope(value) { + return Boolean( + value && + typeof value === 'object' && + typeof value.type === 'string' && + 'payload' in value + ); +} + +function isClientEventType(type) { + return CLIENT_EVENT_TYPES.includes(type); +} + +function isServerEventType(type) { + return SERVER_EVENT_TYPES.includes(type); +} + +module.exports = { + PROTOCOL_PACKAGE_NAME, + SESSION_STATES, + CLIENT_EVENT_TYPES, + SERVER_EVENT_TYPES, + createMessageEnvelope, + isMessageEnvelope, + isClientEventType, + isServerEventType +}; diff --git a/apps/vela-protocol/src/index.d.ts b/apps/vela-protocol/src/index.d.ts new file mode 100644 index 0000000..bfdcfdc --- /dev/null +++ b/apps/vela-protocol/src/index.d.ts @@ -0,0 +1,68 @@ +export type SessionState = 'idle' | 'listening' | 'thinking' | 'speaking'; + +export type MessageEnvelope<TType extends string = string, TPayload = unknown> = { + type: TType; + payload: TPayload; +}; + +export type ClientEventPayloads = { + 'session.start': Record<string, never>; + 'input_audio.append': { + chunk: string; + }; + 'input_audio.commit': Record<string, never>; + 'response.cancel': Record<string, never>; +}; + +export type ServerEventPayloads = { + 'session.ready': { + sessionId: string; + }; + 'session.state': { + value: SessionState; + }; + 'transcript.partial': { + text: string; + }; + 'transcript.final': { + text: string; + }; + 'response.text.delta': { + text: string; + }; + 'response.completed': Record<string, never>; + 'error': { + code: string; + message: string; + retryable?: boolean; + }; +}; + +export type ClientEventType = keyof ClientEventPayloads; +export type ServerEventType = keyof ServerEventPayloads; + +export type ClientEvent = { + [Type in 
ClientEventType]: MessageEnvelope<Type, ClientEventPayloads[Type]>; +}[ClientEventType]; + +export type ServerEvent = { + [Type in ServerEventType]: MessageEnvelope<Type, ServerEventPayloads[Type]>; +}[ServerEventType]; + +export const PROTOCOL_PACKAGE_NAME: '@vela/protocol'; +export const SESSION_STATES: readonly SessionState[]; +export const CLIENT_EVENT_TYPES: readonly ClientEventType[]; +export const SERVER_EVENT_TYPES: readonly ServerEventType[]; + +export function createMessageEnvelope<TType extends ClientEventType>( + type: TType, + payload: ClientEventPayloads[TType] +): MessageEnvelope<TType, ClientEventPayloads[TType]>; +export function createMessageEnvelope<TType extends ServerEventType>( + type: TType, + payload: ServerEventPayloads[TType] +): MessageEnvelope<TType, ServerEventPayloads[TType]>; + +export function isMessageEnvelope(value: unknown): value is MessageEnvelope; +export function isClientEventType(type: string): type is ClientEventType; +export function isServerEventType(type: string): type is ServerEventType; diff --git a/apps/vela-protocol/src/index.js b/apps/vela-protocol/src/index.js new file mode 100644 index 0000000..b41f78e --- /dev/null +++ b/apps/vela-protocol/src/index.js @@ -0,0 +1,41 @@ +export const PROTOCOL_PACKAGE_NAME = '@vela/protocol'; + +export const SESSION_STATES = Object.freeze(['idle', 'listening', 'thinking', 'speaking']); + +export const CLIENT_EVENT_TYPES = Object.freeze([ + 'session.start', + 'input_audio.append', + 'input_audio.commit', + 'response.cancel' +]); + +export const SERVER_EVENT_TYPES = Object.freeze([ + 'session.ready', + 'session.state', + 'transcript.partial', + 'transcript.final', + 'response.text.delta', + 'response.completed', + 'error' +]); + +export function createMessageEnvelope(type, payload) { + return { type, payload }; +} + +export function isMessageEnvelope(value) { + return Boolean( + value && + typeof value === 'object' && + typeof value.type === 'string' && + 'payload' in value + ); +} + +export function isClientEventType(type) { + return CLIENT_EVENT_TYPES.includes(type); +} + +export function isServerEventType(type) { + return SERVER_EVENT_TYPES.includes(type); +} diff --git 
a/apps/vela-ui/package.json b/apps/vela-ui/package.json index d45b694..87e6561 100644 --- a/apps/vela-ui/package.json +++ b/apps/vela-ui/package.json @@ -10,10 +10,11 @@ "preview": "vite preview", "check": "svelte-kit sync && svelte-check --tsconfig ./jsconfig.json" }, - "dependencies": { - "@sveltejs/adapter-auto": "^3.3.1", - "@sveltejs/kit": "^2.17.1", - "svelte": "^5.19.5" + "dependencies": { + "@vela/protocol": "0.0.0", + "@sveltejs/adapter-auto": "^3.3.1", + "@sveltejs/kit": "^2.17.1", + "svelte": "^5.19.5" }, "devDependencies": { "@sveltejs/vite-plugin-svelte": "^5.0.3", diff --git a/apps/vela-ui/src/routes/+page.svelte b/apps/vela-ui/src/routes/+page.svelte index 5cfb3bf..7a6f7af 100644 --- a/apps/vela-ui/src/routes/+page.svelte +++ b/apps/vela-ui/src/routes/+page.svelte @@ -7,8 +7,14 @@
@@ -20,6 +26,11 @@ streaming session UI will be added in later increments.

+

+ Shared protocol package loaded with {CLIENT_EVENT_TYPES.length} client event types and + {SERVER_EVENT_TYPES.length} server event types. +

+
Status @@ -76,6 +87,10 @@ color: #c7d6e8; } + .contract-note { + margin-top: 1rem; + } + .meta { margin-top: 1.5rem; display: grid; diff --git a/docs/backlog.md b/docs/backlog.md index ce968c4..2881d77 100644 --- a/docs/backlog.md +++ b/docs/backlog.md @@ -11,7 +11,7 @@ Establish the boundaries, protocol, and state model for the system before integr ### Backlog Items - [x] define repository structure for `vela-ui` and `vela-gateway` -- define the WebSocket event contract used by the UI and gateway +- [x] define the WebSocket event contract used by the UI and gateway via shared package - define the session state machine and interrupt semantics - define provider adapter interfaces for STT, LLM, TTS, and tools - document error handling and cancellation behavior @@ -180,4 +180,5 @@ Polish the system after the core voice loop is reliable. - `apps/vela-ui` now boots as a minimal SvelteKit app with a starter page - `apps/vela-gateway` now boots as a minimal Fastify app with `/` and `/health` endpoints +- `apps/vela-protocol` now provides the shared WebSocket event contract for the UI and gateway - backend framework choice is now concrete: Fastify diff --git a/docs/protocol.md b/docs/protocol.md index 47ab4f4..d6fce51 100644 --- a/docs/protocol.md +++ b/docs/protocol.md @@ -2,31 +2,87 @@ ## Event Protocol +The shared code-level contract lives in the Yarn workspace package `@vela/protocol` so both the +gateway and UI import the same event names and envelope shape. 
+ +## WebSocket Message Envelope + +Every WebSocket message uses one envelope format: + +```ts +type MessageEnvelope<TType extends string = string, TPayload = unknown> = { + type: TType; + payload: TPayload; +}; +``` + +This increment intentionally keeps the envelope minimal: + +- `type` identifies the event +- `payload` carries the event body +- no sequence numbers, timestamps, or protocol version fields yet +- future changes should be additive when possible + ### Client → Server ```ts type ClientEvent = - | { type: "start_listening" } - | { type: "stop_listening" } - | { type: "audio_chunk"; data: string } // PCM16 base64 - | { type: "interrupt" }; + | { type: "session.start"; payload: {} } + | { type: "input_audio.append"; payload: { chunk: string } } + | { type: "input_audio.commit"; payload: {} } + | { type: "response.cancel"; payload: {} }; ``` +#### Client event intent + +- `session.start` initializes a voice session without locking in transport or auth details yet +- `input_audio.append` carries a chunk of captured input audio as an encoded string +- `input_audio.commit` marks the current buffered user turn as ready for downstream processing +- `response.cancel` interrupts the active listen/think/speak flow + ### Server → Client ```ts type ServerEvent = - | { type: "state"; value: "idle" | "listening" | "thinking" | "speaking" } - | { type: "partial_transcript"; text: string } - | { type: "final_transcript"; text: string } - | { type: "assistant_text_delta"; text: string } - | { type: "tool_call_started"; tool: string } - | { type: "tool_call_finished"; tool: string; result: unknown } - | { type: "tts_audio_chunk"; data: string } - | { type: "assistant_done" } - | { type: "error"; message: string }; + | { type: "session.ready"; payload: { sessionId: string } } + | { + type: "session.state"; + payload: { value: "idle" | "listening" | "thinking" | "speaking" }; + } + | { type: "transcript.partial"; payload: { text: string } } + | { type: "transcript.final"; payload: { text: string } } + | { type: 
"response.text.delta"; payload: { text: string } } + | { type: "response.completed"; payload: {} } + | { + type: "error"; + payload: { code: string; message: string; retryable?: boolean }; + }; ``` +#### Server event intent + +- `session.ready` confirms that the gateway created a session identity +- `session.state` exposes the coarse session phase needed by the later UI shell +- `transcript.partial` and `transcript.final` support incremental and completed user text display +- `response.text.delta` supports streamed assistant text without committing to audio output details yet +- `response.completed` marks the current assistant turn as done +- `error` is the minimal recoverable failure shape for both UI and gateway work + +## Contract Scope for This Increment + +This contract is intentionally limited to the smallest event set needed to unblock: + +- the later gateway WebSocket session skeleton +- the later UI voice-session shell + +Explicitly deferred for later increments: + +- tool-calling events +- streamed TTS/output-audio events +- reconnect/resume semantics +- protocol version negotiation +- provider-specific metadata fields + ## State Machine ```text @@ -37,13 +93,13 @@ idle → idle ``` -Interrupt can occur at: +`response.cancel` can occur at: - listening → restart - thinking → cancel - speaking → stop immediately -## Interrupt Handling Requirements +## `response.cancel` Handling Requirements - immediate stop of TTS playback - immediate stop of LLM streaming @@ -51,12 +107,14 @@ Interrupt can occur at: ### Mechanism -The `interrupt` event cancels: +The `response.cancel` event cancels: - TTS process - current LLM request - tool execution when possible +This shared contract uses `response.cancel` consistently for that cancellation signal. + ## Protocol Notes for Implementation - keep the protocol backward compatible when possible