diff --git a/.cursor/skills/proof/SKILL.md b/.cursor/skills/proof/SKILL.md index 36fb1962..c5dd9540 100644 --- a/.cursor/skills/proof/SKILL.md +++ b/.cursor/skills/proof/SKILL.md @@ -29,9 +29,15 @@ You (the parent agent) author the DAG inline using your understanding of the use { "title": "", "models": { - "HIGH": "gpt-5.3-codex", + "HIGH": { + "id": "gpt-5.4", + "params": [{ "id": "reasoning", "value": "high" }] + }, "MED": "composer-2", - "LOW": "auto-low" + "LOW": { + "id": "gpt-5.4-nano", + "params": [{ "id": "reasoning", "value": "low" }] + } }, "tasks": [ { @@ -49,7 +55,7 @@ Rules: - Every `depends_on` entry must reference another task's `id`. - No cycles. The runner rejects cyclic DAGs at parse time. - `complexity` controls the model the subagent uses (see table below). Pick `HIGH` for novel/complex reasoning, `MED` for typical implementation, `LOW` for mechanical/lookup tasks. -- Optional top-level `models` can override the default complexity → model map for this DAG. +- Optional top-level `models` can override the default complexity → model map for this DAG. Values can be plain SDK model id strings or model selection objects of the shape `{ "id": "...", "params": [{ "id": "...", "value": "..." }] }`, with `params` omitted when unused. - `subtask_prompt` should read like a standalone request — the runner automatically prepends a short summary of upstream task outputs, so you do not need to repeat them. - Do **not** put two tasks that write to the same file in the same rank (siblings within a rank run concurrently and would race). @@ -167,11 +173,24 @@ After the runner exits, briefly summarize what completed/failed and re-link the | MED | `composer-2` | | LOW | `gpt-5.4-nano` | -Override any subset inline with top-level DAG `models`, or pass a reusable profile with `--models-file <path>`. Precedence is defaults < DAG `models` < `--models-file`. The Cursor model catalog can vary by account. 
+Override any subset inline with top-level DAG `models`, or pass a reusable profile with `--models-file <path>`. Values can be plain SDK model id strings or SDK model selections with `params`. At run time, Proof calls `Cursor.models.list()`, validates ids and param values, and expands partial selections to the closest valid preset variant using the model's default variant for omitted params. Precedence is defaults < DAG `models` < `--models-file`. The Cursor model catalog can vary by account. + +To use a cheaper high-capability GPT model, use the base SDK id plus params, not a suffix-style id: + +```json +{ + "models": { + "HIGH": { + "id": "gpt-5.4", + "params": [{ "id": "reasoning", "value": "high" }] + } + } +} +``` ### Discovering valid model ids -Many Cursor CLI catalog models encode reasoning effort and Max Mode as **slug suffixes** (e.g. `claude-opus-4-7-thinking-max`, `gpt-5.5-extra-high`, `gpt-5.3-codex-xhigh`), but the Cursor SDK may accept only base slugs. Do not compose SDK model ids from CLI suffixes by hand. For SDK-bound code, prefer `Cursor.models.list()` or the SDK's `ConfigurationError` catalog over `cursor-agent --list-models`. +Many Cursor CLI catalog models encode reasoning effort and Max Mode as **slug suffixes** (e.g. `claude-opus-4-7-thinking-max`, `gpt-5.5-extra-high`, `gpt-5.3-codex-xhigh`), but the Cursor SDK may accept only base slugs plus `params`. Do not compose SDK model ids from CLI suffixes by hand: use `{ "id": "gpt-5.4", "params": [{ "id": "reasoning", "value": "high" }] }`, not `gpt-5.4-high`. For SDK-bound code, prefer `Cursor.models.list()` or the SDK's `ConfigurationError` catalog over `cursor-agent --list-models`. 
Ways to enumerate model ids: diff --git a/.cursor/skills/proof/examples/dag-flatbread-flow-pmf-audit.json b/.cursor/skills/proof/examples/dag-flatbread-flow-pmf-audit.json index 7e609e53..ec07e91c 100644 --- a/.cursor/skills/proof/examples/dag-flatbread-flow-pmf-audit.json +++ b/.cursor/skills/proof/examples/dag-flatbread-flow-pmf-audit.json @@ -2,9 +2,9 @@ "title": "Flatbread Flow PMF Audit (no sub-sub-agents)", "framing": "Treat Flatbread as Git-native relational content for TypeScript apps, backed by flat files. GraphQL is one interface, not the whole product identity.", "models": { - "HIGH": "claude-opus-4-7", - "MED": "gpt-5.5", - "LOW": "gpt-5.4-mini" + "HIGH": { "id": "claude-opus-4-7" }, + "MED": { "id": "gpt-5.5" }, + "LOW": { "id": "gpt-5.4-mini" } }, "tasks": [ { diff --git a/.cursor/skills/proof/examples/flatbread/dag-codegen-change.json b/.cursor/skills/proof/examples/flatbread/dag-codegen-change.json index 5448195d..e34e5091 100644 --- a/.cursor/skills/proof/examples/flatbread/dag-codegen-change.json +++ b/.cursor/skills/proof/examples/flatbread/dag-codegen-change.json @@ -2,9 +2,9 @@ "title": "Flatbread codegen-only change (no sub-sub-agents)", "framing": "Treat Flatbread as Git-native relational content for TypeScript apps, backed by flat files. 
GraphQL is one interface, not the whole product identity.", "models": { - "HIGH": "claude-opus-4-7", - "MED": "gpt-5.5", - "LOW": "gpt-5.4-mini" + "HIGH": { "id": "claude-opus-4-7" }, + "MED": { "id": "gpt-5.5" }, + "LOW": { "id": "gpt-5.4-mini" } }, "tasks": [ { diff --git a/.cursor/skills/proof/examples/flatbread/dag-docs-sync.json b/.cursor/skills/proof/examples/flatbread/dag-docs-sync.json index 0dc276c0..a564f641 100644 --- a/.cursor/skills/proof/examples/flatbread/dag-docs-sync.json +++ b/.cursor/skills/proof/examples/flatbread/dag-docs-sync.json @@ -2,9 +2,9 @@ "title": "Flatbread docs / README sync (no sub-sub-agents)", "framing": "Treat Flatbread as Git-native relational content for TypeScript apps, backed by flat files. GraphQL is one interface, not the whole product identity.", "models": { - "HIGH": "claude-opus-4-7", - "MED": "gpt-5.5", - "LOW": "gpt-5.4-mini" + "HIGH": { "id": "claude-opus-4-7" }, + "MED": { "id": "gpt-5.5" }, + "LOW": { "id": "gpt-5.4-mini" } }, "tasks": [ { diff --git a/.cursor/skills/proof/examples/flatbread/dag-schema-migration.json b/.cursor/skills/proof/examples/flatbread/dag-schema-migration.json index 97a8e0e9..7f1e374c 100644 --- a/.cursor/skills/proof/examples/flatbread/dag-schema-migration.json +++ b/.cursor/skills/proof/examples/flatbread/dag-schema-migration.json @@ -2,9 +2,9 @@ "title": "Flatbread schema-breaking migration (no sub-sub-agents; pause at human checkpoint after contract-synth)", "framing": "Treat Flatbread as Git-native relational content for TypeScript apps, backed by flat files. 
GraphQL is one interface, not the whole product identity.", "models": { - "HIGH": "claude-opus-4-7", - "MED": "gpt-5.5", - "LOW": "gpt-5.4-mini" + "HIGH": { "id": "claude-opus-4-7" }, + "MED": { "id": "gpt-5.5" }, + "LOW": { "id": "gpt-5.4-mini" } }, "tasks": [ { diff --git a/packages/proof/README.md b/packages/proof/README.md index ab670c3e..5778715a 100644 --- a/packages/proof/README.md +++ b/packages/proof/README.md @@ -68,6 +68,33 @@ Every DAG has a `title` and a `tasks` array. Each task needs: Proof computes ranks with Kahn topological sort and runs sibling tasks in the same rank concurrently. Avoid placing two sibling tasks in the same rank if they write the same files. +Optional top-level `models` can override the default complexity map with plain +SDK model id strings or SDK model selections: + +```json +{ + "models": { + "HIGH": { + "id": "gpt-5.4", + "params": [{ "id": "reasoning", "value": "high" }] + }, + "MED": "composer-2", + "LOW": { + "id": "gpt-5.4-nano", + "params": [{ "id": "reasoning", "value": "low" }] + } + } +} +``` + +Use the object shape when you need `params`; use a string when the model id is +enough. For example, use `{ "id": "gpt-5.4", "params": [{ "id": "reasoning", "value": "high" }] }`, not a suffix-style id like `gpt-5.4-high`. + +When a DAG runs, Proof calls `Cursor.models.list()`, validates model ids and +param values, and expands partial selections to the closest valid SDK preset +variant using that model's default variant for omitted params. `--init-only` +does not call the SDK, so it can still render a canvas without `CURSOR_API_KEY`. + Optional task kinds add control gates: - `kind: "oracle"` runs a shell command and records pass/fail evidence. 
@@ -118,8 +145,9 @@ Proof also exposes helpers for tooling: ```ts import { computeRanks, - createModelResolver, + createModelSelectionResolver, parseDAG, + resolveModelSelectionFromCatalog, runDryCheck, type DAG, type TaskState, diff --git a/packages/proof/src/canvas_writer.ts b/packages/proof/src/canvas_writer.ts index 76c152ba..1c3d6db6 100644 --- a/packages/proof/src/canvas_writer.ts +++ b/packages/proof/src/canvas_writer.ts @@ -10,7 +10,15 @@ import { writeFile, mkdir } from 'node:fs/promises'; import { dirname } from 'node:path'; -import type { Complexity, DAG, TaskKind } from './dag.js'; +import { + formatModelSelection, + normalizeModelSelection, + type Complexity, + type DAG, + type ModelSelection, + type ModelSpec, + type TaskKind, +} from './dag.js'; export type TaskStatus = | 'PENDING' @@ -27,6 +35,7 @@ export interface TaskState { subtask_prompt: string; status: TaskStatus; model: string; + modelSelection?: ModelSelection; /** `'task'` (default), `'pause'`, or `'oracle'`. Undefined is normalized to `'task'`. */ kind?: TaskKind; /** @@ -91,25 +100,34 @@ export interface RunState { export function initialRunState( dag: DAG, - modelFor: (c: Complexity) => string + modelFor: (c: Complexity) => ModelSpec ): RunState { return { title: dag.title, startedAt: Date.now(), - tasks: dag.tasks.map((t) => ({ - id: t.id, - depends_on: t.depends_on, - complexity: t.complexity, - subtask_prompt: t.subtask_prompt, - status: 'PENDING', - model: modelFor(t.complexity), - // Normalize undefined kind → 'task' so downstream consumers (canvas - // template, runner dispatcher) never have to ?? again. - kind: t.kind ?? 'task', - // Surface oracle-only fields so the canvas can render the gate's - // command / expectation without reading the streamed result body. - ...(t.kind === 'oracle' ? 
{ command: t.command, expect: t.expect } : {}), - })), + tasks: dag.tasks.map((t) => { + const modelSelection = normalizeModelSelection( + modelFor(t.complexity), + `model for task ${t.id}` + ); + return { + id: t.id, + depends_on: t.depends_on, + complexity: t.complexity, + subtask_prompt: t.subtask_prompt, + status: 'PENDING', + model: formatModelSelection(modelSelection), + modelSelection, + // Normalize undefined kind → 'task' so downstream consumers (canvas + // template, runner dispatcher) never have to ?? again. + kind: t.kind ?? 'task', + // Surface oracle-only fields so the canvas can render the gate's + // command / expectation without reading the streamed result body. + ...(t.kind === 'oracle' + ? { command: t.command, expect: t.expect } + : {}), + }; + }), }; } @@ -215,6 +233,17 @@ type TaskStatus = type Complexity = 'HIGH' | 'MED' | 'LOW'; type TaskKind = 'task' | 'pause' | 'oracle'; +// Keep in sync with ModelParameterValue / ModelSelection in dag.ts. +interface ModelParameterValue { + id: string; + value: string; +} + +interface ModelSelection { + id: string; + params?: ModelParameterValue[]; +} + interface TaskState { id: string; depends_on: string[]; @@ -222,6 +251,7 @@ interface TaskState { subtask_prompt: string; status: TaskStatus; model: string; + modelSelection?: ModelSelection; kind?: TaskKind; command?: string; expect?: string; @@ -684,6 +714,12 @@ function TaskList({ : ''} {(t.iteration ?? 0) > 0 ? ' · iteration ' + t.iteration : ''} + {t.modelSelection?.params && t.modelSelection.params.length > 0 ? ( + + {'Params: ' + + t.modelSelection.params.map((p) => p.id + '=' + p.value).join(', ')} + + ) : null} {effectiveKind(t) === 'pause' && t.checkpointPath ? 
( diff --git a/packages/proof/src/dag.test.ts b/packages/proof/src/dag.test.ts new file mode 100644 index 00000000..42d7e3dc --- /dev/null +++ b/packages/proof/src/dag.test.ts @@ -0,0 +1,474 @@ +import test from 'ava'; + +import { + createModelSelectionResolver, + normalizeModelSelection, + parseDAG, + resolveModelSelectionFromCatalog, + validateModelMap, + type ModelCatalogItem, + type ModelSpec, + type ModelSelection, +} from './dag.js'; + +function resolveSelection( + selection: ModelSelection, + variants: NonNullable +): ModelSelection { + const catalog: ModelCatalogItem[] = [ + { id: 'composer-2', displayName: 'Composer 2', variants }, + ]; + return resolveModelSelectionFromCatalog(selection, catalog, 'test model'); +} + +test('resolveModelSelectionFromCatalog prefers highest-scoring variant among matches', (t) => { + const resolved = resolveSelection( + { id: 'composer-2', params: [{ id: 'effort', value: 'max' }] }, + [ + { + displayName: 'Default medium concise', + isDefault: true, + params: [ + { id: 'effort', value: 'medium' }, + { id: 'style', value: 'concise' }, + ], + }, + { + displayName: 'Max concise', + params: [ + { id: 'effort', value: 'max' }, + { id: 'style', value: 'concise' }, + ], + }, + { + displayName: 'Max verbose', + params: [ + { id: 'effort', value: 'max' }, + { id: 'style', value: 'verbose' }, + ], + }, + ] + ); + + t.deepEqual(resolved, { + id: 'composer-2', + params: [ + { id: 'effort', value: 'max' }, + { id: 'style', value: 'concise' }, + ], + }); +}); + +test('resolveModelSelectionFromCatalog breaks equal-score ties to catalog default variant', (t) => { + const resolved = resolveSelection( + { id: 'composer-2', params: [{ id: 'effort', value: 'max' }] }, + [ + { + displayName: 'Max with style override', + params: [ + { id: 'effort', value: 'max' }, + { id: 'style', value: 'verbose' }, + ], + }, + { + displayName: 'Default max', + isDefault: true, + params: [{ id: 'effort', value: 'max' }], + }, + ] + ); + + t.deepEqual(resolved, { 
+ id: 'composer-2', + params: [{ id: 'effort', value: 'max' }], + }); +}); + +test('resolveModelSelectionFromCatalog throws a descriptive error when no variant matches', (t) => { + const err = t.throws(() => + resolveSelection( + { id: 'composer-2', params: [{ id: 'effort', value: 'max' }] }, + [ + { + displayName: 'Default medium', + isDefault: true, + params: [{ id: 'effort', value: 'medium' }], + }, + ] + ) + ); + + if (!err) { + t.fail('Expected no-match variant selection to throw.'); + return; + } + t.regex( + err.message, + /does not match any Cursor SDK preset variant\. Valid variants:/ + ); +}); + +test('resolveModelSelectionFromCatalog returns default variant when no params requested', (t) => { + const resolved = resolveSelection({ id: 'composer-2' }, [ + { + displayName: 'Fast', + params: [{ id: 'effort', value: 'low' }], + }, + { + displayName: 'Default', + isDefault: true, + params: [{ id: 'effort', value: 'medium' }], + }, + ]); + + t.deepEqual(resolved, { + id: 'composer-2', + params: [{ id: 'effort', value: 'medium' }], + }); +}); + +test('resolveModelSelectionFromCatalog falls back to first variant when no default is flagged', (t) => { + const resolved = resolveSelection({ id: 'composer-2' }, [ + { + displayName: 'Fast', + params: [{ id: 'effort', value: 'low' }], + }, + { + displayName: 'Careful', + params: [{ id: 'effort', value: 'high' }], + }, + ]); + + t.deepEqual(resolved, { + id: 'composer-2', + params: [{ id: 'effort', value: 'low' }], + }); +}); + +test('resolveModelSelectionFromCatalog treats empty params as no params requested', (t) => { + const resolved = resolveSelection({ id: 'composer-2', params: [] }, [ + { + displayName: 'Fast', + params: [{ id: 'effort', value: 'low' }], + }, + { + displayName: 'Default', + isDefault: true, + params: [{ id: 'effort', value: 'medium' }], + }, + ]); + + t.deepEqual(resolved, { + id: 'composer-2', + params: [{ id: 'effort', value: 'medium' }], + }); +}); + +test('resolveModelSelectionFromCatalog 
throws when no variant fully matches requested params', (t) => { + const err = t.throws(() => + resolveSelection( + { + id: 'composer-2', + params: [ + { id: 'effort', value: 'max' }, + { id: 'style', value: 'verbose' }, + ], + }, + [ + { + displayName: 'Max concise', + params: [ + { id: 'effort', value: 'max' }, + { id: 'style', value: 'concise' }, + ], + }, + { + displayName: 'Medium verbose', + params: [ + { id: 'effort', value: 'medium' }, + { id: 'style', value: 'verbose' }, + ], + }, + ] + ) + ); + + if (!err) { + t.fail('Expected partial variant match to throw.'); + return; + } + t.regex(err.message, /does not match any Cursor SDK preset variant/); +}); + +test('resolveModelSelectionFromCatalog throws on unknown model id', (t) => { + const catalog: ModelCatalogItem[] = [ + { id: 'composer-2', displayName: 'Composer 2' }, + ]; + const err = t.throws(() => + resolveModelSelectionFromCatalog({ id: 'unknown-model' }, catalog, 'test') + ); + + if (!err) { + t.fail('Expected unknown model id to throw.'); + return; + } + t.regex(err.message, /uses unknown Cursor SDK model/); +}); + +test('resolveModelSelectionFromCatalog passes through selection when model has no variants', (t) => { + const catalog: ModelCatalogItem[] = [ + { id: 'composer-2', displayName: 'Composer 2' }, + ]; + const selection: ModelSelection = { + id: 'composer-2', + }; + const resolved = resolveModelSelectionFromCatalog(selection, catalog, 'test'); + + t.deepEqual(resolved, selection); + t.not(resolved, selection); +}); + +test('resolveModelSelectionFromCatalog accepts valid params declared by catalog parameters', (t) => { + const catalog: ModelCatalogItem[] = [ + { + id: 'composer-2', + displayName: 'Composer 2', + parameters: [ + { + id: 'effort', + values: [{ value: 'low' }, { value: 'medium' }, { value: 'high' }], + }, + ], + }, + ]; + const selection: ModelSelection = { + id: 'composer-2', + params: [{ id: 'effort', value: 'high' }], + }; + + const resolved = 
resolveModelSelectionFromCatalog(selection, catalog, 'test'); + + t.deepEqual(resolved, selection); + t.not(resolved, selection); +}); + +test('resolveModelSelectionFromCatalog throws when catalog parameters reject a param id', (t) => { + const catalog: ModelCatalogItem[] = [ + { + id: 'composer-2', + displayName: 'Composer 2', + parameters: [{ id: 'effort', values: [{ value: 'medium' }] }], + }, + ]; + const err = t.throws(() => + resolveModelSelectionFromCatalog( + { id: 'composer-2', params: [{ id: 'style', value: 'concise' }] }, + catalog, + 'test' + ) + ); + + if (!err) { + t.fail('Expected unknown parameter id to throw.'); + return; + } + t.regex(err.message, /does not support param "style"/); +}); + +test('resolveModelSelectionFromCatalog throws when catalog parameters reject a param value', (t) => { + const catalog: ModelCatalogItem[] = [ + { + id: 'composer-2', + displayName: 'Composer 2', + parameters: [{ id: 'effort', values: [{ value: 'medium' }] }], + }, + ]; + const err = t.throws(() => + resolveModelSelectionFromCatalog( + { id: 'composer-2', params: [{ id: 'effort', value: 'max' }] }, + catalog, + 'test' + ) + ); + + if (!err) { + t.fail('Expected unsupported parameter value to throw.'); + return; + } + t.regex(err.message, /param "effort" does not support value "max"/); +}); + +test('resolveModelSelectionFromCatalog throws when explicit params have no catalog declaration', (t) => { + const catalog: ModelCatalogItem[] = [ + { id: 'composer-2', displayName: 'Composer 2' }, + ]; + const err = t.throws(() => + resolveModelSelectionFromCatalog( + { id: 'composer-2', params: [{ id: 'effort', value: 'medium' }] }, + catalog, + 'test' + ) + ); + + if (!err) { + t.fail('Expected undeclared parameters to throw.'); + return; + } + t.regex(err.message, /does not declare parameters or preset variants/); +}); + +test('normalizeModelSelection trims string model ids', (t) => { + t.deepEqual(normalizeModelSelection(' composer-2 '), { id: 'composer-2' }); +}); + 
+test('normalizeModelSelection normalizes valid object model specs', (t) => { + const input: ModelSelection = { + id: ' composer-2 ', + params: [{ id: ' effort ', value: ' medium ' }], + }; + const result = normalizeModelSelection(input, 'test model'); + + t.deepEqual(result, { + id: 'composer-2', + params: [{ id: 'effort', value: 'medium' }], + }); + t.not(result, input); + t.not(result.params, input.params); +}); + +test('normalizeModelSelection throws label-prefixed errors for invalid model specs', (t) => { + for (const raw of ['', ' ', 42, null]) { + const err = t.throws(() => + normalizeModelSelection(raw as unknown as ModelSpec, 'test model') + ); + + if (!err) { + t.fail(`Expected invalid model spec ${String(raw)} to throw.`); + continue; + } + t.regex(err.message, /^test model must be /); + } +}); + +test('normalizeModelSelection throws label-prefixed errors for invalid param values', (t) => { + for (const value of ['', ' ', 42]) { + const err = t.throws(() => + normalizeModelSelection( + { + id: 'composer-2', + params: [{ id: 'effort', value }], + } as unknown as ModelSpec, + 'test model' + ) + ); + + if (!err) { + t.fail(`Expected invalid param value ${String(value)} to throw.`); + continue; + } + t.is(err.message, 'test model.params[0].value must be a non-empty string.'); + } +}); + +test('normalizeModelSelection throws on duplicate param ids', (t) => { + const err = t.throws(() => + normalizeModelSelection( + { + id: 'composer-2', + params: [ + { id: 'effort', value: 'low' }, + { id: 'effort', value: 'high' }, + ], + }, + 'test model' + ) + ); + + if (!err) { + t.fail('Expected duplicate param id to throw.'); + return; + } + t.regex(err.message, /duplicate id: effort/); +}); + +test('validateModelMap accepts plain string model ids', (t) => { + t.deepEqual( + validateModelMap( + { + HIGH: ' claude-opus-4-7 ', + LOW: 'gpt-5.4-nano', + }, + 'test models' + ), + { + HIGH: { id: 'claude-opus-4-7' }, + LOW: { id: 'gpt-5.4-nano' }, + } + ); +}); + 
+test('validateModelMap accepts model selection objects with params', (t) => { + t.deepEqual( + validateModelMap( + { + MED: { + id: 'composer-2', + params: [{ id: 'effort', value: 'max' }], + }, + }, + 'test models' + ), + { + MED: { + id: 'composer-2', + params: [{ id: 'effort', value: 'max' }], + }, + } + ); +}); + +test('createModelSelectionResolver normalizes mixed override shapes', (t) => { + const modelFor = createModelSelectionResolver({ + HIGH: 'claude-opus-4-7', + MED: { + id: 'composer-2', + params: [{ id: 'effort', value: 'medium' }], + }, + }); + + t.deepEqual(modelFor('HIGH'), { id: 'claude-opus-4-7' }); + t.deepEqual(modelFor('MED'), { + id: 'composer-2', + params: [{ id: 'effort', value: 'medium' }], + }); + t.deepEqual(modelFor('LOW'), { id: 'gpt-5.4-nano' }); +}); + +test('parseDAG normalizes mixed model override shapes', (t) => { + const dag = parseDAG({ + title: 'Mixed model overrides', + models: { + HIGH: 'claude-opus-4-7', + MED: { + id: 'composer-2', + params: [{ id: 'effort', value: 'medium' }], + }, + }, + tasks: [ + { + id: 'review', + depends_on: [], + complexity: 'HIGH', + subtask_prompt: 'Review the change.', + }, + ], + }); + + t.deepEqual(dag.models, { + HIGH: { id: 'claude-opus-4-7' }, + MED: { + id: 'composer-2', + params: [{ id: 'effort', value: 'medium' }], + }, + }); +}); diff --git a/packages/proof/src/dag.ts b/packages/proof/src/dag.ts index c3325aa2..77dc5472 100644 --- a/packages/proof/src/dag.ts +++ b/packages/proof/src/dag.ts @@ -5,8 +5,36 @@ */ export type Complexity = 'HIGH' | 'MED' | 'LOW'; -export type ModelMap = Record; -export type ModelMapOverride = Partial; +export interface ModelParameterValue { + id: string; + value: string; +} + +export interface ModelSelection { + id: string; + params?: ModelParameterValue[]; +} + +export type ModelSpec = string | ModelSelection; +export type ModelMap = Record; +export type ModelMapOverride = Partial>; +export type ResolvedModelMap = Record; + +export interface ModelCatalogItem 
{ + id: string; + displayName: string; + parameters?: Array<{ + id: string; + displayName?: string; + values: Array<{ value: string; displayName?: string }>; + }>; + variants?: Array<{ + params: ModelParameterValue[]; + displayName: string; + description?: string; + isDefault?: boolean; + }>; +} /** * Discriminator separating LLM-backed work from non-LLM gate nodes. @@ -78,7 +106,11 @@ export interface DAGBudget { } const COMPLEXITY_VALUES = new Set(['HIGH', 'MED', 'LOW']); -const COMPLEXITY_KEYS: readonly Complexity[] = ['HIGH', 'MED', 'LOW'] as const; +export const COMPLEXITY_KEYS: readonly Complexity[] = [ + 'HIGH', + 'MED', + 'LOW', +] as const; const TASK_KIND_VALUES = new Set(['task', 'pause', 'oracle']); /** Synthetic placeholder so non-LLM tasks (pause, oracle) satisfy the existing structural type. The runner must branch on `kind` before consuming this. */ const NON_LLM_SYNTHETIC_COMPLEXITY: Complexity = 'LOW'; @@ -116,9 +148,9 @@ export function isOracleTask(task: RawTask): boolean { * read the SDK's error-message catalog; do NOT trust `cursor-agent --list-models`. 
*/ export const DEFAULT_MODEL_MAP: ModelMap = { - HIGH: 'claude-opus-4-7', - MED: 'composer-2', - LOW: 'gpt-5.4-nano', + HIGH: { id: 'claude-opus-4-7' }, + MED: { id: 'composer-2' }, + LOW: { id: 'gpt-5.4-nano' }, }; export function parseDAG(raw: unknown): DAG { @@ -476,22 +508,329 @@ export function validateModelMap( if (!COMPLEXITY_VALUES.has(key as Complexity)) { throw new Error(`${label} contains unknown complexity key: ${key}`); } - if (typeof value !== 'string' || value.trim() === '') { - throw new Error(`${label}.${key} must be a non-empty string.`); - } - models[key as Complexity] = value.trim(); + models[key as Complexity] = normalizeModelSelection( + value as ModelSpec, + `${label}.${key}` + ); } return models; } -export function createModelResolver( +export function createModelSelectionResolver( overrides: ModelMapOverride = {} -): (c: Complexity) => string { - const models: ModelMap = { ...DEFAULT_MODEL_MAP, ...overrides }; - return (c: Complexity): string => { - if (!COMPLEXITY_KEYS.includes(c)) { - throw new Error(`Unknown complexity: ${c}`); +): (c: Complexity) => ModelSelection { + const models = resolveModelMap(overrides); + return (c: Complexity): ModelSelection => { + assertKnownComplexity(c); + return cloneModelSelection(models[c]); + }; +} + +export function createCatalogBackedModelResolver( + modelFor: (c: Complexity) => ModelSelection, + catalog: readonly ModelCatalogItem[] +): (c: Complexity) => ModelSelection { + const cache = new Map(); + return (c: Complexity): ModelSelection => { + const cached = cache.get(c); + if (cached) return cloneModelSelection(cached); + const resolved = resolveModelSelectionFromCatalog( + modelFor(c), + catalog, + `model for ${c}` + ); + cache.set(c, resolved); + return cloneModelSelection(resolved); + }; +} + +/** Validate a JSON model selection object. 
*/ +export function validateModelSelection( + raw: unknown, + label = 'model' +): ModelSelection { + const obj = validateModelSelectionObject(raw, label); + const id = validateNonEmptyString(obj.id, `${label}.id`); + const params = validateModelParams(obj.params, label); + return createModelSelection(id, params); +} + +function validateModelSelectionObject( + raw: unknown, + label: string +): Record { + if (!raw || typeof raw !== 'object' || Array.isArray(raw)) { + throw new Error(`${label} must be a model object.`); + } + return raw as Record; +} + +function validateNonEmptyString(raw: unknown, label: string): string { + if (typeof raw !== 'string' || raw.trim() === '') { + throw new Error(`${label} must be a non-empty string.`); + } + return raw.trim(); +} + +function validateModelParams( + raw: unknown, + label: string +): ModelParameterValue[] { + if (raw === undefined) return []; + if (!Array.isArray(raw)) { + throw new Error(`${label}.params must be an array when set.`); + } + + const params: ModelParameterValue[] = []; + const seen = new Set(); + for (let i = 0; i < raw.length; i++) { + const param = validateModelParam(raw[i], label, i); + const paramId = param.id; + if (seen.has(paramId)) { + throw new Error(`${label}.params contains duplicate id: ${paramId}`); } - return models[c]; + seen.add(paramId); + params.push(param); + } + return params; +} + +function validateModelParam( + raw: unknown, + label: string, + index: number +): ModelParameterValue { + if (!raw || typeof raw !== 'object' || Array.isArray(raw)) { + throw new Error(`${label}.params[${index}] must be an object.`); + } + const param = raw as Record; + return { + id: validateNonEmptyString(param.id, `${label}.params[${index}].id`), + value: validateNonEmptyString( + param.value, + `${label}.params[${index}].value` + ), }; } + +export function normalizeModelSelection( + raw: ModelSpec, + label = 'model' +): ModelSelection { + if (typeof raw === 'string') { + return 
createModelSelection(validateNonEmptyString(raw, label)); + } + return validateModelSelection(raw, label); +} + +export function formatModelSelection(model: ModelSelection): string { + const params = model.params ?? []; + if (params.length === 0) return model.id; + return `${model.id} (${params.map((p) => `${p.id}=${p.value}`).join(', ')})`; +} + +export function resolveModelSelectionFromCatalog( + selection: ModelSelection, + catalog: readonly ModelCatalogItem[], + label = 'model' +): ModelSelection { + const catalogItem = catalog.find((model) => model.id === selection.id); + if (!catalogItem) { + const ids = catalog.map((model) => model.id).sort(); + throw new Error( + `${label} uses unknown Cursor SDK model "${ + selection.id + }". Known models:\n ${ids.join('\n ')}` + ); + } + + validateRequestedParams(selection, catalogItem, label); + + const variants = catalogItem.variants ?? []; + if (variants.length === 0) { + return cloneModelSelection(selection); + } + + const requestedParams = selection.params ?? []; + const chosenVariant = + requestedParams.length === 0 + ? defaultVariant(variants) + : chooseMatchingVariant(requestedParams, variants); + + if (!chosenVariant) { + throw new Error( + `${label} ${formatModelSelection( + selection + )} does not match any Cursor SDK preset variant. Valid variants:\n ${formatVariants( + variants + )}` + ); + } + + const params = chosenVariant.params.map((param) => ({ ...param })); + return params.length > 0 + ? { id: selection.id, params } + : { id: selection.id }; +} + +function validateRequestedParams( + selection: ModelSelection, + catalogItem: ModelCatalogItem, + label: string +): void { + const requestedParams = selection.params ?? []; + if (requestedParams.length === 0) return; + + const paramDefs = catalogItem.parameters ?? 
[]; + if (paramDefs.length > 0) { + const definitions = new Map(paramDefs.map((param) => [param.id, param])); + for (const param of requestedParams) { + const definition = definitions.get(param.id); + if (!definition) { + const supported = [...definitions.keys()].sort(); + throw new Error( + `${label} ${selection.id} does not support param "${ + param.id + }". Supported params: ${ + supported.length > 0 ? supported.join(', ') : '(none)' + }` + ); + } + const allowed = new Set(definition.values.map((value) => value.value)); + if (!allowed.has(param.value)) { + throw new Error( + `${label} ${selection.id} param "${ + param.id + }" does not support value "${param.value}". Supported values: ${[ + ...allowed, + ].join(', ')}` + ); + } + } + return; + } + + const variants = catalogItem.variants ?? []; + if (variants.length > 0) { + const chosenVariant = chooseMatchingVariant(requestedParams, variants); + if (!chosenVariant) { + throw new Error( + `${label} ${formatModelSelection( + selection + )} does not match any Cursor SDK preset variant. Valid variants:\n ${formatVariants( + variants + )}` + ); + } + return; + } + + throw new Error( + `${label} ${selection.id} does not declare parameters or preset variants in the Cursor SDK catalog; remove explicit params from this model selection.` + ); +} + +type ModelCatalogVariant = NonNullable[number]; + +function defaultVariant( + variants: ReadonlyArray +): ModelCatalogVariant { + return variants.find((variant) => variant.isDefault) ?? variants[0]; +} + +function assertKnownComplexity(c: Complexity): void { + if (!COMPLEXITY_KEYS.includes(c)) { + throw new Error(`Unknown complexity: ${c}`); + } +} + +function resolveModelMap(overrides: ModelMapOverride = {}): ModelMap { + return { + HIGH: normalizeModelSelection(overrides.HIGH ?? DEFAULT_MODEL_MAP.HIGH), + MED: normalizeModelSelection(overrides.MED ?? DEFAULT_MODEL_MAP.MED), + LOW: normalizeModelSelection(overrides.LOW ?? 
DEFAULT_MODEL_MAP.LOW), + }; +} + +function chooseMatchingVariant( + requestedParams: readonly ModelParameterValue[], + variants: ReadonlyArray +): ModelCatalogVariant | undefined { + const matches = variants.filter((variant) => + paramsContainAll(variant.params, requestedParams) + ); + if (matches.length === 0) return undefined; + + const defaultVar = defaultVariant(variants); + const defaultParams = new Map( + defaultVar.params.map((param) => [param.id, param.value]) + ); + const requestedIds = new Set(requestedParams.map((param) => param.id)); + let best = matches[0]; + let bestScore = scoreVariant(best.params, defaultParams, requestedIds); + // Ties break to the catalog-declared default variant; otherwise first match wins. + for (const match of matches.slice(1)) { + const score = scoreVariant(match.params, defaultParams, requestedIds); + if (score > bestScore) { + best = match; + bestScore = score; + } else if ( + score === bestScore && + match === defaultVar && + best !== defaultVar + ) { + best = match; + } + } + return best; +} + +function paramsContainAll( + candidateParams: readonly ModelParameterValue[], + requestedParams: readonly ModelParameterValue[] +): boolean { + const candidate = new Map( + candidateParams.map((param) => [param.id, param.value]) + ); + return requestedParams.every( + (param) => candidate.get(param.id) === param.value + ); +} + +function scoreVariant( + params: readonly ModelParameterValue[], + defaultParams: ReadonlyMap, + requestedIds: ReadonlySet +): number { + let score = 0; + for (const param of params) { + if (requestedIds.has(param.id)) continue; + if (defaultParams.get(param.id) === param.value) score++; + } + return score; +} + +function formatVariants(variants: ReadonlyArray): string { + return variants + .map((variant) => { + const params = variant.params + .map((param) => `${param.id}=${param.value}`) + .join(', '); + const suffix = variant.isDefault ? 
' [default]' : ''; + return `${variant.displayName}${suffix}: ${params || '(no params)'}`; + }) + .join('\n '); +} + +function createModelSelection( + id: string, + params: readonly ModelParameterValue[] = [] +): ModelSelection { + return params.length > 0 + ? { id, params: params.map((param) => ({ ...param })) } + : { id }; +} + +function cloneModelSelection(selection: ModelSelection): ModelSelection { + return createModelSelection(selection.id, selection.params ?? []); +} diff --git a/packages/proof/src/index.ts b/packages/proof/src/index.ts index 99485b65..3458d7f5 100644 --- a/packages/proof/src/index.ts +++ b/packages/proof/src/index.ts @@ -8,21 +8,32 @@ */ export { + COMPLEXITY_KEYS, DEFAULT_MODEL_MAP, computeRanks, - createModelResolver, + createCatalogBackedModelResolver, + createModelSelectionResolver, + formatModelSelection, isOracleTask, isPauseTask, + normalizeModelSelection, parseDAG, + resolveModelSelectionFromCatalog, + validateModelSelection, validateModelMap, } from './dag.js'; export type { Complexity, DAG, DAGBudget, + ModelCatalogItem, ModelMap, ModelMapOverride, + ModelParameterValue, + ModelSelection, + ModelSpec, RawTask, + ResolvedModelMap, TaskKind, } from './dag.js'; diff --git a/packages/proof/src/run_dag.ts b/packages/proof/src/run_dag.ts index b4c4c59b..4820276a 100644 --- a/packages/proof/src/run_dag.ts +++ b/packages/proof/src/run_dag.ts @@ -70,7 +70,7 @@ * newly edited source. 
*/ -import { Agent } from '@cursor/sdk'; +import { Agent, Cursor } from '@cursor/sdk'; import { existsSync } from 'node:fs'; import { setMaxListeners } from 'node:events'; import { mkdir, readFile, writeFile } from 'node:fs/promises'; @@ -82,13 +82,19 @@ import { fileURLToPath } from 'node:url'; import { parseDAG, + COMPLEXITY_KEYS, computeRanks, - createModelResolver, + createCatalogBackedModelResolver, + createModelSelectionResolver, + formatModelSelection, + normalizeModelSelection, validateModelMap, } from './dag.js'; import type { DAG, DAGBudget, + ModelSelection, + ModelSpec, ModelMapOverride, RawTask, TaskKind, @@ -406,7 +412,7 @@ function defaultCanvasesDir(cwd: string): string { async function loadResumedRunState( statePath: string, dag: DAG, - modelForComplexity: (c: RawTask['complexity']) => string + modelForComplexity: (c: RawTask['complexity']) => ModelSpec ): Promise { const persisted = await readPersistedRunState(statePath); const state = persisted.state; @@ -430,7 +436,12 @@ async function loadResumedRunState( ts.depends_on = task.depends_on; ts.complexity = task.complexity; ts.subtask_prompt = task.subtask_prompt; - ts.model = modelForComplexity(task.complexity); + const modelSelection = normalizeModelSelection( + modelForComplexity(task.complexity), + `model for task ${task.id}` + ); + ts.model = formatModelSelection(modelSelection); + ts.modelSelection = modelSelection; ts.kind = task.kind ?? 'task'; ts.command = task.kind === 'oracle' ? task.command : undefined; ts.expect = task.kind === 'oracle' ? task.expect : undefined; @@ -461,6 +472,28 @@ function isResumeTerminalStatus(status: TaskState['status']): boolean { ); } +function taskModelSelection(ts: TaskState): ModelSelection { + return ( + ts.modelSelection ?? + // Fallback path is for legacy persisted run-state (before modelSelection). + // In that shape ts.model is always a plain model id (not formatted output). 
+ normalizeModelSelection(ts.model, `task ${ts.id} model`) + ); +} + +async function fetchCursorModelCatalog(): Promise< + Awaited> +> { + try { + return await Cursor.models.list(); + } catch (err) { + const message = err instanceof Error ? err.message : String(err); + throw new Error( + `Could not fetch Cursor model catalog. Check CURSOR_API_KEY and network connectivity. Original error: ${message}` + ); + } +} + async function main(): Promise { const args = parseArgs(process.argv.slice(2)); @@ -504,9 +537,23 @@ async function main(): Promise { ), `--models-file ${args.modelsFile}` ); - const modelForComplexity = createModelResolver( + const unresolvedModelForComplexity = createModelSelectionResolver( mergeModelOverrides({ dagModels: dag.models, fileModels }) ); + const modelForComplexity = args.initOnly + ? unresolvedModelForComplexity + : createCatalogBackedModelResolver( + unresolvedModelForComplexity, + await fetchCursorModelCatalog() + ); + if (!args.initOnly) { + for (const complexity of COMPLEXITY_KEYS) { + modelForComplexity(complexity); + } + console.log( + '[proof] validated model selections against Cursor.models.list()' + ); + } const ranks = computeRanks(dag); if (args.convergeOn && !dag.tasks.some((t) => t.id === args.convergeOn)) { @@ -941,10 +988,11 @@ async function runTask( ? options.framing.trimEnd() + '\n\n' : ''; const stitched = framing + stitchedBody; + const modelSelection = taskModelSelection(ts); const agent = await Agent.create({ apiKey: process.env.CURSOR_API_KEY!, - model: { id: ts.model }, + model: modelSelection, local: { cwd }, });