tetherto · lauripiisang · Jun 9, 2026 · Jun 8, 2026 · Jun 8, 2026 · Jun 9, 2026
@@ -189,6 +189,11 @@ All endpoints follow the [OpenAI API](https://platform.openai.com/docs/api-refer
 | | `DELETE` | [`/v1/vector_stores/:id`](#delete-v1vector_storesid) |
 | | `POST` | [`/v1/vector_stores/:id/search`](#post-v1vector_storesidsearch) |
 | | `POST` | [`/v1/vector_stores/:id/files`](#post-v1vector_storesidfiles) |
+| [Videos](#videos) | `POST` | [`/v1/videos`](#post-v1videos) |
+| | `GET` | [`/v1/videos`](#get-v1videos) |
+| | `GET` | [`/v1/videos/:id`](#get-v1videosid) |
+| | `GET` | [`/v1/videos/:id/content`](#get-v1videosidcontent) |
+| | `DELETE` | [`/v1/videos/:id`](#delete-v1videosid) |
 
 <Callout type="info">
 All multipart endpoints (`/v1/audio/*`, `/v1/images/edits`, `/v1/files`) cap the request body at **100 MB**.
@@ -987,6 +992,50 @@ Once a vector store has been ingested with a particular embedding model, subsequ
 
 Search returns OpenAI-shaped `vector_store.search_results.page` objects. Each chunk's `attributes` include the originating `file_id` and `filename` when they were attached through the file flow.
 
+### Videos
+
+OpenAI-compatible **async** text-to-video, backed by the SDK's `video({ mode: "txt2vid" })`. Creating a job returns immediately; the generation runs in the background. Poll for status, then download the bytes.
+
+Requires an alias whose endpoint category is `video` (SDK addon `sdcpp-video`). Register it in `serve.models` and add a `serve.openai.videos.models` aliasing block so OpenAI SDK clients can use a hard-coded model name. **Text-to-video only** — `input_reference`, `/edits`, `/remix`, `/extensions`, and `/characters` are not implemented.
+
+#### `POST /v1/videos`
+
+Create a job. Returns the job resource with `status: "queued"`.
+
+#### `GET /v1/videos`
+
+List jobs (`limit` / `order` / `after`). **In-memory only** — a restart clears the list.
+
+#### `GET /v1/videos/:id`
+
+Poll job status (`queued` → `in_progress` → `completed` / `failed`).
+
+#### `GET /v1/videos/:id/content`
+
+Download the generated video. Defaults to `video/mp4` (lazy ffmpeg transcode + cache); `?format=avi` returns the native MJPG-AVI. A `?variant` other than `video` returns `501 unsupported_variant`.
+
+#### `DELETE /v1/videos/:id`
+
+Abort the running job and drop its assets.
+
+**End-to-end create + poll + download:**
+
+```bash
+curl http://localhost:11434/v1/videos \
+  -H "Content-Type: application/json" \
+  -d '{"model":"my-video","prompt":"a timelapse of clouds","size":"512x512"}'
+
+curl http://localhost:11434/v1/videos/video_abc123
+
+curl http://localhost:11434/v1/videos/video_abc123/content --output out.mp4
+```
+
+#### Deviations from the OpenAI spec
+
+- `input_reference` is rejected with `400 unsupported_param` (no img2vid in the SDK).
+- `size` accepts any `WxH` where both dimensions are multiples of 8, in addition to OpenAI's 4-value enum.
+- `Content-Type: video/mp4` is produced by a server-side ffmpeg transcode; `?format=avi` returns the native container.
+
 ### Authentication
 
 By default, the server accepts unauthenticated requests on `127.0.0.1`. To require a Bearer token, run the server with the `--api-key` flag:

@@ -29,6 +29,18 @@ This document describes the supported routes and how to configure `serve.models`
 | `GET` | `/v1/files` | List in-memory files |
 | `GET` | `/v1/files/{id}` | File metadata |
 | `GET` | `/v1/files/{id}/content` | Stream the bytes (used by image `response_format=url`) |
+| `GET` | `/v1/vector_stores` | List vector stores |
+| `POST` | `/v1/vector_stores` | Create a vector store |
+| `GET` | `/v1/vector_stores/{id}` | Retrieve a vector store |
+| `POST` | `/v1/vector_stores/{id}` | Update a vector store |
+| `DELETE` | `/v1/vector_stores/{id}` | Delete a vector store |
+| `POST` | `/v1/vector_stores/{id}/search` | Semantic search over a store (needs a loaded `embedding` model) |
+| `POST` | `/v1/vector_stores/{id}/files` | Attach + embed a previously-uploaded file |
+| `POST` | `/v1/videos` | Create a text-to-video job (async; backed by the SDK's `video({ mode: "txt2vid" })`) |
+| `GET` | `/v1/videos` | List video jobs (in-memory only) |
+| `GET` | `/v1/videos/{id}` | Poll job status |
+| `GET` | `/v1/videos/{id}/content` | Download bytes (`video/mp4` via ffmpeg transcode; `?format=avi` for native MJPG-AVI) |
+| `DELETE` | `/v1/videos/{id}` | Abort the job and drop its assets |
 
 Other OpenAI routes may be added over time; this file is updated when they ship.
 
@@ -565,3 +577,47 @@ Lists loaded (READY) text-to-speech models — the speech-capable subset of `/v1
   ]
 }
 ```
+
+## `POST /v1/videos` (and job lifecycle)
+
+OpenAI-compatible **async** video surface, backed by the SDK's
+`video({ mode: "txt2vid" })`. `POST` creates a job and returns immediately with
+`status: "queued"`; the generation runs in the background. Poll
+`GET /v1/videos/{id}` until `status` is `completed` (or `failed`), then fetch
+the bytes from `GET /v1/videos/{id}/content`.
+
+Requires an alias whose **endpoint category** is `video` (SDK addon
+`sdcpp-video`). Register it in `serve.models` and add a
+`serve.openai.videos.models` aliasing block so OpenAI SDK clients can use a
+hard-coded model name.
+
+**Scope: text-to-video only.** `input_reference` is rejected with
+`400 unsupported_param` (no img2vid in the SDK); `/edits`, `/remix`,
+`/extensions`, and `/characters` are not implemented.
+
+### Endpoints
+
+| Method | Path | Notes |
+|--------|------|-------|
+| `POST` | `/v1/videos` | Create job → `{ status: "queued" }` |
+| `GET` | `/v1/videos/{id}` | Poll status |
+| `GET` | `/v1/videos/{id}/content` | Download; defaults to `video/mp4` (lazy ffmpeg transcode + cache). `?format=avi` returns the native MJPG-AVI. `?variant` other than `video` → `501 unsupported_variant` |
+| `GET` | `/v1/videos` | Paginated list (`limit` / `order` / `after`) |
+| `DELETE` | `/v1/videos/{id}` | Abort the running job and drop its assets |
+
+### Deviations from the OpenAI spec
+
+- `input_reference` → `400 unsupported_param`.
+- `size` accepts any `WxH` where both dimensions are multiples of 8, in addition to OpenAI's 4-value enum.
+- `Content-Type: video/mp4` is produced by a server-side ffmpeg transcode; `?format=avi` returns the native container.
+- The list endpoint is **in-memory only** — a restart clears it.
+
+### Errors
+
+| HTTP | `error.code` | When |
+|------|--------------|------|
+| 400 | `unsupported_param` | `input_reference` sent (no img2vid) |
+| 400 | `invalid_model_type` | Alias is not a `video` model |
+| 404 | `video_not_found` | Unknown job id |
+| 501 | `unsupported_variant` | `GET …/content?variant=` other than `video` |
+| 503 | `model_not_ready` | Model not loaded yet |
@@ -0,0 +1,76 @@
+import type { CompletionRun, CompletionStats, ToolCall } from '@qvac/sdk'
+import { HttpError } from '../../lib/http-error.js'
+
+export type OpenAiFinishReason = 'stop' | 'length' | 'tool_calls' // the `finish_reason` values `drainCompletion` can produce
+
+export interface DrainedCompletion {
+  text: string // concatenation of every `contentDelta` text, in arrival order
+  toolCalls: ToolCall[] // one entry per `toolCall` event, in arrival order
+  stats: CompletionStats | undefined // payload of the last `completionStats` event; undefined when none arrived
+  /**
+   * Terminal reason from the SDK `completionDone` event (`eos` / `length` /
+   * `stopSequence`), or undefined if the stream ended without one. `error` is
+   * never present: `drainCompletion` throws a 502 `HttpError` for it. `cancelled` can
+   * appear only if `await result.final` resolves; presumably it rejects with `InferenceCancelledError` (TODO confirm).
+   */
+  stopReason: string | undefined
+  /** `stats.generatedTokens` when the SDK reports a finite number, else a whitespace word count. */
+  completionTokens: number
+  /** OpenAI `finish_reason`: `tool_calls` wins, then `length` on truncation, else `stop`. */
+  finishReason: OpenAiFinishReason
+}
+
+/**
+ * Single-pass consumer of an SDK completion run, shared by every chat-category route
+ * (chat / completions / responses). Draining `result.events` once yields content text,
+ * tool calls, stats and the terminal `stopReason` together, so the OpenAI `finish_reason`
+ * and token accounting are derived in one place instead of drifting per route.
+ * @param result SDK run whose `events` async iterable is consumed to the end.
+ * @param onToken Invoked with each `contentDelta` text as it arrives (SSE paths); omit it for blocking responses.
+ * @returns Accumulated text, tool calls, stats and `stopReason`, plus the derived `completionTokens` / `finishReason`.
+ * @throws HttpError 502 `inference_failed` when a `completionDone` event reports `stopReason === 'error'`.
+ */
+export async function drainCompletion (
+  result: CompletionRun,
+  onToken?: (token: string) => void
+): Promise<DrainedCompletion> {
+  let text = ''
+  const toolCalls: ToolCall[] = []
+  let stats: CompletionStats | undefined
+  let stopReason: string | undefined
+
+  for await (const event of result.events) {
+    if (event.type === 'contentDelta') {
+      text += event.text
+      onToken?.(event.text) // forwarded after accumulation, once per delta
+    } else if (event.type === 'toolCall') {
+      toolCalls.push(event.call)
+    } else if (event.type === 'completionStats') {
+      stats = event.stats // a later stats event overwrites an earlier one
+    } else if (event.type === 'completionDone') {
+      if (event.stopReason === 'error') {
+        throw new HttpError(502, 'inference_failed', 'Inference failed mid-stream.')
+      }
+      if (event.stopReason !== undefined) { // a done event without a reason keeps any earlier value
+        stopReason = event.stopReason
+      }
+    }
+  }
+
+  if (stopReason === 'cancelled') {
+    await result.final // NOTE(review): presumably rejects with InferenceCancelledError; if it resolves, the cancelled run falls through and is reported as 'stop' — confirm against the SDK
+  }
+
+  const completionTokens = completionTokensFromStats(text, stats)
+  const finishReason: OpenAiFinishReason =
+    toolCalls.length > 0 ? 'tool_calls' : stopReason === 'length' ? 'length' : 'stop' // tool calls outrank truncation
+
+  return { text, toolCalls, stats, stopReason, completionTokens, finishReason }
+}
+
+export function completionTokensFromStats (text: string, stats: CompletionStats | undefined): number { // output-token count: the SDK figure when usable, else an estimate from `text`
+  if (typeof stats?.generatedTokens === 'number' && Number.isFinite(stats.generatedTokens)) { // rejects a missing value as well as NaN / Infinity
+    return stats.generatedTokens
+  }
+  return text ? text.split(/\s+/).filter(Boolean).length : 0 // fallback: count of whitespace-separated words (0 for empty text), not a tokenizer count
+}
@@ -1,6 +1,7 @@
 import type { ServerResponse } from 'node:http'
 import type { CompletionRun, Tool } from '@qvac/sdk'
 import { sendSSE, endSSE } from '../../lib/sse.js'
+import { drainCompletion } from './completion-result.js'
 import { sdkToolCallsToOpenai } from './tool-calls.js'
 import type { GenerationParams, ResponseFormat } from '../../schemas/common.js'
 import { buildResponseObject, functionCallOutputItemId, messageId } from './responses-shape.js'
@@ -40,9 +41,7 @@ export async function writeBlockingResponse (
   p: ResponsesHandlerParams,
   result: CompletionRun
 ): Promise<Record<string, unknown>> {
-  const text = await result.text
-  const toolCalls = await result.toolCalls
-  const stats = await result.stats
+  const { text, toolCalls, stats, stopReason } = await drainCompletion(result)
 
   const responseObject = buildResponseObject({
     id: p.rid,
@@ -57,6 +56,7 @@ export async function writeBlockingResponse (
     parallelToolCalls: p.parallelToolCalls,
     previousResponseId: p.previousResponseId,
     store: p.storeEnabled,
+    ...(stopReason !== undefined ? { stopReason } : {}),
     ...(stats !== undefined ? { stats } : {})
   })
 
@@ -114,7 +114,7 @@ export async function writeStreamingResponse (
     response_id: p.rid
   })
 
-  for await (const token of result.tokenStream) {
+  const { toolCalls, stats, stopReason } = await drainCompletion(result, (token) => {
     fullText += token
     sendSSE(res, {
       type: 'response.output_text.delta',
@@ -124,9 +124,7 @@ export async function writeStreamingResponse (
       delta: token,
       response_id: p.rid
     })
-  }
-
-  const toolCalls = await result.toolCalls
+  })
   const hasToolCalls = toolCalls.length > 0
 
   sendSSE(res, {
@@ -196,8 +194,6 @@ export async function writeStreamingResponse (
     }
   }
 
-  const stats = await result.stats
-
   const responseObject = buildResponseObject({
     id: p.rid,
     modelAlias: p.modelAlias,
@@ -213,6 +209,7 @@ export async function writeStreamingResponse (
     store: p.storeEnabled,
     messageItemId: msgId,
     ...(hasToolCalls ? { functionCallItemIds: fcItemIds } : {}),
+    ...(stopReason !== undefined ? { stopReason } : {}),
     ...(stats !== undefined ? { stats } : {})
   })
 
@@ -228,7 +225,8 @@ export async function writeStreamingResponse (
     p.ctx.responsesStore.put(rec)
   }
 
-  sendSSE(res, { type: 'response.completed', response: responseObject })
+  const terminalType = responseObject['status'] === 'incomplete' ? 'response.incomplete' : 'response.completed'
+  sendSSE(res, { type: terminalType, response: responseObject })
   endSSE(res, { sentinel: false })
   p.ctx.logger.info(`  responses stream done id=${p.rid} stored=${p.storeEnabled}`)
   return responseObject

@@ -1,6 +1,7 @@
 import crypto from 'node:crypto'
 import type { ToolCall, CompletionStats } from '@qvac/sdk'
 import { sdkToolCallsToOpenai } from './tool-calls.js'
+import { completionTokensFromStats } from './completion-result.js'
 
 export function responseId (): string {
   return `resp_${randomId()}`
@@ -37,10 +38,13 @@ export interface BuildResponseObjectParams {
   functionCallItemIds?: string[]
   /** From SDK completion stats; `generatedTokens` maps to `usage.output_tokens`. */
   stats?: CompletionStats
-}
-
-function wordCountFallback (text: string): number {
-  return text ? text.split(/\s+/).filter(Boolean).length : 0
+  /**
+   * Terminal `stopReason` from the SDK. `length` maps to OpenAI's
+   * `status: 'incomplete'` + `incomplete_details.reason: 'max_output_tokens'`
+   * (the Responses-API analogue of chat's `finish_reason: 'length'`), unless
+   * tool calls take precedence with `requires_action`.
+   */
+  stopReason?: string
 }
 
 export function buildResponseObject (params: BuildResponseObjectParams): Record<string, unknown> {
@@ -74,10 +78,7 @@ export function buildResponseObject (params: BuildResponseObjectParams): Record<
     }
   }
 
-  const outputTokens =
-    typeof params.stats?.generatedTokens === 'number' && Number.isFinite(params.stats.generatedTokens)
-      ? params.stats.generatedTokens
-      : wordCountFallback(params.text || '')
+  const outputTokens = completionTokensFromStats(params.text || '', params.stats)
   // SDK does not expose prompt token count today; `cacheTokens` is KV-cache hit count, not full prompt size.
   const inputTokens = 0
   const usage = {
@@ -86,11 +87,14 @@ export function buildResponseObject (params: BuildResponseObjectParams): Record<
     total_tokens: inputTokens + outputTokens
   }
 
+  const truncated = !hasToolCalls && params.stopReason === 'length'
+  const status = hasToolCalls ? 'requires_action' : truncated ? 'incomplete' : 'completed'
+
   const base: Record<string, unknown> = {
     id: params.id,
     object: 'response',
     created_at: params.createdAtSec,
-    status: hasToolCalls ? 'requires_action' : 'completed',
+    status,
     model: params.modelAlias,
     output,
     output_text: params.text || '',
@@ -99,6 +103,10 @@ export function buildResponseObject (params: BuildResponseObjectParams): Record<
     store: params.store
   }
 
+  if (truncated) {
+    base['incomplete_details'] = { reason: 'max_output_tokens' }
+  }
+
   if (hasToolCalls) {
     base['required_action'] = {
       type: 'submit_tool_outputs',

@@ -22,7 +22,7 @@ export interface QvacContext {
   ffmpegAvailable: boolean
   transcribeOverride?: (opts: {
     modelId: string
-    audioChunk: Buffer
+    audioChunk: string | Buffer
     prompt?: string | undefined
   }) => Promise<string> & { requestId: string }
   /** Test seam — overrides `video()` from `@qvac/sdk` when set. */