diff --git a/.cursor/skills/dag-task-runner/scripts/canvas_writer.ts b/.cursor/skills/dag-task-runner/scripts/canvas_writer.ts index 0eda9656..76c152ba 100644 --- a/.cursor/skills/dag-task-runner/scripts/canvas_writer.ts +++ b/.cursor/skills/dag-task-runner/scripts/canvas_writer.ts @@ -188,17 +188,15 @@ function renderCanvasSource(state: RunState): string { return `${HEADER}\n\nconst STATE: RunState = ${stateLiteral};\n\n${BODY}\n`; } -const HEADER = `/* AUTO-GENERATED by sdk/dag-task-runner. Do not edit by hand — the runner overwrites this file. */ +const HEADER = `/* AUTO-GENERATED by @flatbread/proof. Do not edit by hand — the runner overwrites this file. */ import { Card, CardBody, CardHeader, Divider, - Grid, H1, H2, Pill, - Row, Stack, Stat, Text, @@ -252,12 +250,12 @@ interface RunState { tasks: TaskState[]; }`; -const BODY = String.raw`const NODE_W = 200; -const NODE_H = 64; -const SCROLL_STORAGE_KEY = 'dag-task-runner:scroll-y'; +const BODY = String.raw`const NODE_H = 64; +const SCROLL_STORAGE_KEY = '@flatbread/proof:scroll-y'; const COMPLETED_DOT_COLOR = '#22c55e'; const AWAITING_DOT_COLOR = '#f59e0b'; const BUDGET_DOT_COLOR = '#ef4444'; +const COMPACT_BREAKPOINT_PX = 720; function effectiveKind(t: TaskState): TaskKind { return t.kind ?? 'task'; @@ -320,6 +318,20 @@ function taskElementId(taskId: string): string { return 'task-card-' + taskId; } +function useViewportWidth(): number { + const [width, setWidth] = useState(1024); + + useEffect(() => { + if (typeof window === 'undefined') return; + const update = (): void => setWidth(window.innerWidth); + update(); + window.addEventListener('resize', update); + return () => window.removeEventListener('resize', update); + }, []); + + return width; +} + function getScrollY(): number { if (typeof window === 'undefined') return 0; return Math.max(window.scrollY ?? 
0, 0); @@ -378,17 +390,24 @@ function DAGGraph({ onNodeClick?: (taskId: string) => void; }): JSX.Element { const theme = useHostTheme(); + const viewportWidth = useViewportWidth(); + const isCompact = viewportWidth < COMPACT_BREAKPOINT_PX; + const nodeWidth = isCompact ? 168 : 200; + const nodeGap = isCompact ? 24 : 40; + const rankGap = isCompact ? 60 : 72; + const layoutPadding = isCompact ? 12 : 24; + const titleLimit = Math.max(12, Math.floor((nodeWidth - 44) / 7)); const layout = computeDAGLayout({ nodes: state.tasks.map((t) => ({ id: t.id })), edges: state.tasks.flatMap((t) => t.depends_on.map((d) => ({ from: d, to: t.id })), ), direction: 'vertical', - nodeWidth: NODE_W, + nodeWidth, nodeHeight: NODE_H, - rankGap: 72, - nodeGap: 40, - padding: 24, + rankGap, + nodeGap, + padding: layoutPadding, }); const byId = new Map(state.tasks.map((t) => [t.id, t])); @@ -462,10 +481,24 @@ function DAGGraph({ } return ( +
- {t.id.length > 22 ? t.id.slice(0, 21) + '…' : t.id} + {t.id.length > titleLimit ? t.id.slice(0, titleLimit - 1) + '…' : t.id} +
+ ); +} + +function SummaryStats({ + counts, +}: { + counts: { + total: number; + pending: number; + running: number; + finished: number; + error: number; + awaiting: number; + }; +}): JSX.Element { + return ( +
+ + + 0 ? 'info' : undefined} /> + 0 ? 'warning' : undefined} /> + 0 ? 'success' : undefined} /> + 0 ? 'danger' : undefined} /> +
); } @@ -580,14 +645,21 @@ function TaskList({ {state.tasks.map((t) => { const trailing = ( - +
{t.complexity} {t.status} - +
); return (
@@ -761,7 +833,7 @@ export default function DagRun(): JSX.Element { break; case 'BUDGET-EXCEEDED': // Surfaced via the per-task pill / glyph; bucketed under errored - // here so the 6-column summary grid stays stable. + // here so the summary counts stay stable. acc.error += 1; break; } @@ -795,49 +867,54 @@ export default function DagRun(): JSX.Element { : 'info'; return ( - - -

{STATE.title}

- - - {statusLabel} - - - {counts.total} tasks · elapsed {formatDuration(elapsed(STATE))} - {tokens.input + tokens.output > 0 - ? ' · ' + tokens.input + ' in / ' + tokens.output + ' out tokens' - : ''} - - - {STATE.runMessage ? ( - - {STATE.runMessage} - - ) : null} -
+
+ + +
+

{STATE.title}

+
+
+ + {statusLabel} + + + {counts.total} tasks · elapsed {formatDuration(elapsed(STATE))} + {tokens.input + tokens.output > 0 + ? ' · ' + tokens.input + ' in / ' + tokens.output + ' out tokens' + : ''} + +
+ {STATE.runMessage ? ( + + {STATE.runMessage} + + ) : null} +
- - - - 0 ? 'info' : undefined} /> - 0 ? 'warning' : undefined} /> - 0 ? 'success' : undefined} /> - 0 ? 'danger' : undefined} /> - + - + - -

Graph

- -
+ +

Graph

+ +
- + - -

Tasks

- + +

Tasks

+ +
-
+
); }`; diff --git a/.cursor/skills/dag-task-runner/scripts/oracle_task.ts b/.cursor/skills/dag-task-runner/scripts/oracle_task.ts index e2672784..f53d9a63 100644 --- a/.cursor/skills/dag-task-runner/scripts/oracle_task.ts +++ b/.cursor/skills/dag-task-runner/scripts/oracle_task.ts @@ -98,7 +98,7 @@ export async function runOracleTask( deps.writer.schedule(deps.cloneState(deps.state)); console.log( - `[dag-runner] oracle ${task.id} → exec \`${command}\` (expect /${expectSrc}/)` + `[proof] oracle ${task.id} → exec \`${command}\` (expect /${expectSrc}/)` ); const outcome = await execShell(command, options); @@ -142,7 +142,7 @@ export async function runOracleTask( deps.writer.schedule(deps.cloneState(deps.state)); console.log( - `[dag-runner] oracle ${task.id} → ${pass ? 'PASS' : 'FAIL'} (exit ${ + `[proof] oracle ${task.id} → ${pass ? 'PASS' : 'FAIL'} (exit ${ outcome.exitCode ?? 'null' }, ${ts.durationMs}ms${outcome.timedOut ? ', TIMED OUT' : ''})` ); diff --git a/.cursor/skills/dag-task-runner/scripts/pause_task.ts b/.cursor/skills/dag-task-runner/scripts/pause_task.ts index eb213d2e..66bc53a7 100644 --- a/.cursor/skills/dag-task-runner/scripts/pause_task.ts +++ b/.cursor/skills/dag-task-runner/scripts/pause_task.ts @@ -60,7 +60,7 @@ export async function runPauseTask( deps.writer.schedule(deps.cloneState(deps.state)); console.log( - `[dag-runner] pause ${task.id} → AWAITING_APPROVAL; delete ${sentinelPath} to release the gate` + `[proof] pause ${task.id} → AWAITING_APPROVAL; delete ${sentinelPath} to release the gate` ); const deadline = Date.now() + options.taskTimeoutMs; @@ -92,7 +92,7 @@ export async function runPauseTask( ts.resultText = renderApprovedResultText(sentinelPath, ts.finishedAt); deps.writer.schedule(deps.cloneState(deps.state)); console.log( - `[dag-runner] pause ${task.id} → FINISHED (sentinel removed, ${ts.durationMs}ms gated)` + `[proof] pause ${task.id} → FINISHED (sentinel removed, ${ts.durationMs}ms gated)` ); return; } diff --git 
a/.cursor/skills/dag-task-runner/scripts/run_dag.ts b/.cursor/skills/dag-task-runner/scripts/run_dag.ts index 4b1c85d5..b4c4c59b 100644 --- a/.cursor/skills/dag-task-runner/scripts/run_dag.ts +++ b/.cursor/skills/dag-task-runner/scripts/run_dag.ts @@ -48,7 +48,7 @@ * `## Stderr (tail)` headings round-trip * through the same parser as regular tasks. * --checkpoint-dir Directory for `kind: 'pause'` sentinel files - * (default `.dag-runner/` under --cwd). + * (default `.proof/` under --cwd). * --converge-on After the main DAG run, parse the named task's * `resultText` for `## Blockers` / * `## High-severity findings`. If non-empty, @@ -59,7 +59,7 @@ * --max-iterations is reached. * --max-iterations Convergence iteration ceiling (default: 3). * --state-path Persist resumable runner state after each rank. - * Defaults to `.dag-runner/run-state.json` when + * Defaults to `.proof/run-state.json` when * --restart-on-runner-change is enabled. * --resume-state Resume from a previously persisted state file. * --restart-on-runner-change @@ -125,6 +125,17 @@ import { } from './self_hosting.js'; const SCRIPTS_DIR = dirname(fileURLToPath(import.meta.url)); +/** + * Source-of-truth directory for `--restart-on-runner-change` snapshotting. + * + * When the runner ships compiled (typical install-time use), `SCRIPTS_DIR` + * resolves to `/dist`. The TS source lives in `/src`, so the + * change detector points there. When running directly from source via + * `tsx src/run_dag.ts`, both directories coincide. + */ +const RUNNER_SOURCE_DIR = SCRIPTS_DIR.endsWith('/src') + ? SCRIPTS_DIR + : resolve(SCRIPTS_DIR, '..', 'src'); interface CliArgs { dag: string; @@ -148,7 +159,7 @@ interface CliArgs { streamIdleTimeoutMs: number; initOnly: boolean; dryCheckCmds: boolean; - /** Absolute dir for `kind: 'pause'` sentinel files. Defaults to `/.dag-runner`. */ + /** Absolute dir for `kind: 'pause'` sentinel files. Defaults to `/.proof`. 
*/ checkpointDir: string; /** When set, the runner re-executes ancestors after the named task to converge on a clean review. */ convergeOn?: string; @@ -239,7 +250,7 @@ function parseArgs(argv: string[]): CliArgs { const checkpointRaw = args['checkpoint-dir']; const checkpointDir = isAbsolute(checkpointRaw ?? '') ? (checkpointRaw as string) - : resolve(cwd, checkpointRaw ?? '.dag-runner'); + : resolve(cwd, checkpointRaw ?? '.proof'); const convergeRaw = args['converge-on']; const convergeOn = convergeRaw !== undefined && convergeRaw !== '' && convergeRaw !== 'true' @@ -258,7 +269,7 @@ function parseArgs(argv: string[]): CliArgs { statePathRaw !== undefined && statePathRaw !== '' && statePathRaw !== 'true' ? statePathRaw : restartOnRunnerChange - ? resumeState ?? '.dag-runner/run-state.json' + ? resumeState ?? '.proof/run-state.json' : undefined; return { @@ -359,7 +370,7 @@ function ensureCursorRipgrepPathEnv(): void { const bundlePkg = cursorSdkRipgrepBundlePackage(); if (!bundlePkg) { console.warn( - '[dag-runner] No bundled ripgrep target for platform; set CURSOR_RIPGREP_PATH to an absolute `rg` path if local agents fail.' + '[proof] No bundled ripgrep target for platform; set CURSOR_RIPGREP_PATH to an absolute `rg` path if local agents fail.' ); return; } @@ -377,7 +388,7 @@ function ensureCursorRipgrepPathEnv(): void { // Optional dependency missing for this OS/arch — user can set CURSOR_RIPGREP_PATH. } console.warn( - `[dag-runner] Could not resolve bundled ripgrep from ${bundlePkg}. Install optional @cursor deps or export CURSOR_RIPGREP_PATH=/absolute/path/to/rg` + `[proof] Could not resolve bundled ripgrep from ${bundlePkg}. 
Install optional @cursor deps or export CURSOR_RIPGREP_PATH=/absolute/path/to/rg` ); } @@ -510,7 +521,7 @@ async function main(): Promise { : undefined; if (fullOutputAbsoluteDir && !args.initOnly) { await mkdir(fullOutputAbsoluteDir, { recursive: true }); - console.log(`[dag-runner] full-output-dir → ${fullOutputAbsoluteDir}`); + console.log(`[proof] full-output-dir → ${fullOutputAbsoluteDir}`); } const findingsAbsoluteDir = @@ -519,7 +530,7 @@ async function main(): Promise { : undefined; if (findingsAbsoluteDir && !args.initOnly) { await mkdir(findingsAbsoluteDir, { recursive: true }); - console.log(`[dag-runner] findings-dir → ${findingsAbsoluteDir}`); + console.log(`[proof] findings-dir → ${findingsAbsoluteDir}`); } const statePathAbsolute = @@ -538,7 +549,7 @@ async function main(): Promise { state.tasks.map((t) => [t.id, t]) ); const runnerSnapshot = args.restartOnRunnerChange - ? await snapshotRunnerRuntimeFiles(SCRIPTS_DIR) + ? await snapshotRunnerRuntimeFiles(RUNNER_SOURCE_DIR) : undefined; const writer = new CanvasWriter(args.canvasPath, args.debounceMs); @@ -546,14 +557,14 @@ async function main(): Promise { let interrupting = false; console.log( - `[dag-runner] DAG "${dag.title}" — ${dag.tasks.length} tasks across ${ranks.length} rank(s)` + `[proof] DAG "${dag.title}" — ${dag.tasks.length} tasks across ${ranks.length} rank(s)` ); - console.log(`[dag-runner] canvas → ${args.canvasPath}`); + console.log(`[proof] canvas → ${args.canvasPath}`); if (resumeStateAbsolute) { - console.log(`[dag-runner] resumed state ← ${resumeStateAbsolute}`); + console.log(`[proof] resumed state ← ${resumeStateAbsolute}`); } if (statePathAbsolute) { - console.log(`[dag-runner] state-path → ${statePathAbsolute}`); + console.log(`[proof] state-path → ${statePathAbsolute}`); } // Always write the initial all-PENDING canvas first. 
This is what the parent @@ -563,7 +574,7 @@ async function main(): Promise { await persistState('initial state'); if (args.initOnly) { - console.log('[dag-runner] --init-only: initial canvas written, exiting'); + console.log('[proof] --init-only: initial canvas written, exiting'); return; } @@ -574,18 +585,16 @@ async function main(): Promise { // ERROR, finalize the canvas, and exit cleanly. const onUnhandledRejection = (reason: unknown) => { const msg = reason instanceof Error ? reason.message : String(reason); - console.error(`[dag-runner] (suppressed unhandled SDK rejection) ${msg}`); + console.error(`[proof] (suppressed unhandled SDK rejection) ${msg}`); }; const onUncaughtException = (err: Error): void => { const msg = err?.stack ?? err?.message ?? String(err); - console.error(`[dag-runner] uncaught exception: ${msg}`); + console.error(`[proof] uncaught exception: ${msg}`); void failAndExit(1, 'FAILED', `Runner crashed: ${err.message}`); }; const onSignal = (signal: NodeJS.Signals): void => { const exitCode = signal === 'SIGINT' ? 130 : 143; - console.error( - `[dag-runner] received ${signal}; finalizing canvas before exit` - ); + console.error(`[proof] received ${signal}; finalizing canvas before exit`); void failAndExit( exitCode, 'INTERRUPTED', @@ -612,9 +621,9 @@ async function main(): Promise { await writer.flush(); await persistState(`runner source changed after ${boundary}`); console.log( - `[dag-runner] runner source changed after ${boundary}; persisted state and exiting ${EXIT_RUNNER_RESTART}` + `[proof] runner source changed after ${boundary}; persisted state and exiting ${EXIT_RUNNER_RESTART}` ); - console.log(`[dag-runner] changed runner files: ${changed.join(', ')}`); + console.log(`[proof] changed runner files: ${changed.join(', ')}`); process.exit(EXIT_RUNNER_RESTART); } @@ -633,7 +642,7 @@ async function main(): Promise { const flushMsg = flushErr instanceof Error ? 
flushErr.message : String(flushErr); console.error( - `[dag-runner] failed to flush canvas during shutdown: ${flushMsg}` + `[proof] failed to flush canvas during shutdown: ${flushMsg}` ); } finally { finalized = true; @@ -733,7 +742,7 @@ async function main(): Promise { } catch (err) { const msg = err instanceof Error ? err.message : String(err); console.error( - `[dag-runner] findings sidecar write failed for ${task.id}: ${msg}` + `[proof] findings sidecar write failed for ${task.id}: ${msg}` ); } } @@ -749,14 +758,14 @@ async function main(): Promise { }); if (runnableRank.length === 0) { console.log( - `[dag-runner] rank ${rankIdx + 1}/${ranks.length}: ${rank + `[proof] rank ${rankIdx + 1}/${ranks.length}: ${rank .map((t) => t.id) .join(', ')} (already complete; skipping)` ); continue; } console.log( - `[dag-runner] rank ${rankIdx + 1}/${ranks.length}: ${runnableRank + `[proof] rank ${rankIdx + 1}/${ranks.length}: ${runnableRank .map((t) => t.id) .join(', ')}` ); @@ -836,19 +845,17 @@ async function main(): Promise { const succeeded = state.tasks.length - errors.length - budgetHits.length; console.log( - `[dag-runner] done — ${succeeded}/${ + `[proof] done — ${succeeded}/${ state.tasks.length } succeeded in ${formatMs(state.finishedAt - state.startedAt)}` ); if (errors.length > 0) { - console.log(`[dag-runner] errors: ${errors.map((e) => e.id).join(', ')}`); + console.log(`[proof] errors: ${errors.map((e) => e.id).join(', ')}`); process.exitCode = 1; } if (budgetHits.length > 0) { console.log( - `[dag-runner] budget-exceeded: ${budgetHits - .map((b) => b.id) - .join(', ')}` + `[proof] budget-exceeded: ${budgetHits.map((b) => b.id).join(', ')}` ); // Distinct from the generic ERROR exit (1) so wrapper scripts can // branch on budget. 
We only upgrade `0`; a prior ERROR-driven `1` @@ -859,7 +866,7 @@ async function main(): Promise { } if (fullOutputAbsoluteDir) { console.log( - `[dag-runner] full transcripts + index (_index.md) → ${fullOutputAbsoluteDir}` + `[proof] full transcripts + index (_index.md) → ${fullOutputAbsoluteDir}` ); } } catch (err) { @@ -874,7 +881,7 @@ async function main(): Promise { writer.schedule(structuredCloneState(state)); await writer.flush(); finalized = true; - console.error(`[dag-runner] ${err.message}`); + console.error(`[proof] ${err.message}`); process.exit(EXIT_BUDGET_EXCEEDED); } const msg = err instanceof Error ? err.message : String(err); @@ -1243,9 +1250,7 @@ async function bestEffortCancel( } catch (cancelErr) { const msg = cancelErr instanceof Error ? cancelErr.message : String(cancelErr); - console.error( - `[dag-runner] failed to cancel timed-out task ${taskId}: ${msg}` - ); + console.error(`[proof] failed to cancel timed-out task ${taskId}: ${msg}`); } } @@ -1388,7 +1393,7 @@ async function runConvergenceLoop( if (!convergeTs) { // Defensive — main() already validates this, but the loop must not crash. 
console.error( - `[dag-runner] --converge-on "${convergeOn}" not found in state; skipping convergence loop` + `[proof] --converge-on "${convergeOn}" not found in state; skipping convergence loop` ); return; } @@ -1419,7 +1424,7 @@ async function runConvergenceLoop( ); if (!findings.hasIssues) { console.log( - `[dag-runner] converge-on ${convergeOn}: clean — no Blockers / High-severity findings after ${ + `[proof] converge-on ${convergeOn}: clean — no Blockers / High-severity findings after ${ iter - 1 } re-iteration(s)` ); @@ -1448,13 +1453,13 @@ async function runConvergenceLoop( convergeTs.errorMessage = `Convergence iteration ${iter} would exceed budget.maxIterations=${budget.maxIterations}`; writer.schedule(structuredCloneState(state)); console.log( - `[dag-runner] converge-on ${convergeOn}: BUDGET-EXCEEDED — iteration ${iter} would exceed budget.maxIterations=${budget.maxIterations}` + `[proof] converge-on ${convergeOn}: BUDGET-EXCEEDED — iteration ${iter} would exceed budget.maxIterations=${budget.maxIterations}` ); return; } console.log( - `[dag-runner] converge iteration ${iter}/${maxIterations}: ${findings.blockerLines.length} blocker(s), ${findings.highSeverityLines.length} high-severity finding(s) — re-running ${reExecIds.size} task(s)` + `[proof] converge iteration ${iter}/${maxIterations}: ${findings.blockerLines.length} blocker(s), ${findings.highSeverityLines.length} high-severity finding(s) — re-running ${reExecIds.size} task(s)` ); const convergenceContext = buildConvergenceContext( @@ -1528,16 +1533,16 @@ async function runConvergenceLoop( } catch (err) { const msg = err instanceof Error ? 
err.message : String(err); console.error( - `[dag-runner] findings sidecar re-write failed for ${convergeOn} after BUDGET-EXCEEDED: ${msg}` + `[proof] findings sidecar re-write failed for ${convergeOn} after BUDGET-EXCEEDED: ${msg}` ); } } console.log( - `[dag-runner] converge-on ${convergeOn}: BUDGET-EXCEEDED — exhausted --max-iterations=${maxIterations} with ${finalFindings.blockerLines.length} blocker(s), ${finalFindings.highSeverityLines.length} high-severity finding(s)` + `[proof] converge-on ${convergeOn}: BUDGET-EXCEEDED — exhausted --max-iterations=${maxIterations} with ${finalFindings.blockerLines.length} blocker(s), ${finalFindings.highSeverityLines.length} high-severity finding(s)` ); } else { console.log( - `[dag-runner] converge-on ${convergeOn}: clean after ${maxIterations} re-iteration(s)` + `[proof] converge-on ${convergeOn}: clean after ${maxIterations} re-iteration(s)` ); } } @@ -1557,9 +1562,7 @@ function skipTask( ts.durationMs = 0; ts.errorMessage = `Skipped: upstream task(s) ${failedDeps.join(', ')} failed`; console.log( - `[dag-runner] skipping ${task.id} — upstream ${failedDeps.join( - ', ' - )} failed` + `[proof] skipping ${task.id} — upstream ${failedDeps.join(', ')} failed` ); writer.schedule(structuredCloneState(state)); if (!fullOutputAbsoluteDir) return Promise.resolve(); @@ -1749,9 +1752,7 @@ function structuredCloneState(state: RunState): RunState { main().catch((err) => { console.error( - `[dag-runner] fatal: ${ - err instanceof Error ? err.stack ?? err.message : err - }` + `[proof] fatal: ${err instanceof Error ? err.stack ?? 
err.message : err}` ); process.exit(1); }); diff --git a/.cursor/skills/dag-task-runner/scripts/run_dag_supervisor.ts b/.cursor/skills/dag-task-runner/scripts/run_dag_supervisor.ts index 5313b72e..851523a3 100644 --- a/.cursor/skills/dag-task-runner/scripts/run_dag_supervisor.ts +++ b/.cursor/skills/dag-task-runner/scripts/run_dag_supervisor.ts @@ -6,9 +6,18 @@ * exits with EXIT_RUNNER_RESTART (75) after persisting state whenever runner * runtime files change. This supervisor relaunches the runner with * `--resume-state` so the next process executes the newly edited source. + * + * The supervisor automatically picks the right launcher based on its own + * file extension: + * + * - `.js` (compiled / packaged) — spawn `process.execPath` against the + * sibling `run_dag.js` so consumers do not need `tsx` on PATH. + * - `.ts` (dev / `tsx src/run_dag_supervisor.ts`) — spawn the locally + * installed `tsx` binary against the sibling `run_dag.ts`. */ import { spawn } from 'node:child_process'; +import { existsSync } from 'node:fs'; import { mkdir } from 'node:fs/promises'; import { dirname, isAbsolute, join, resolve } from 'node:path'; import process from 'node:process'; @@ -16,10 +25,28 @@ import { fileURLToPath } from 'node:url'; import { EXIT_RUNNER_RESTART } from './self_hosting.js'; -const SCRIPTS_DIR = dirname(fileURLToPath(import.meta.url)); -const TSX_BIN = join(SCRIPTS_DIR, 'node_modules', '.bin', 'tsx'); -const RUNNER = join(SCRIPTS_DIR, 'run_dag.ts'); -const DEFAULT_STATE_PATH = '.dag-runner/run-state.json'; +const SUPERVISOR_PATH = fileURLToPath(import.meta.url); +const SCRIPTS_DIR = dirname(SUPERVISOR_PATH); +const IS_TS_SOURCE = SUPERVISOR_PATH.endsWith('.ts'); +const RUNNER = join(SCRIPTS_DIR, IS_TS_SOURCE ? 'run_dag.ts' : 'run_dag.js'); + +/** + * Resolve `tsx` lazily — only required when running the supervisor directly + * from `src/*.ts` (dev mode). The compiled `.js` build path uses + * `process.execPath` instead and does not need `tsx` on disk. 
+ */ +function resolveTsxBin(): string { + const candidates = [ + join(SCRIPTS_DIR, '..', 'node_modules', '.bin', 'tsx'), + join(SCRIPTS_DIR, '..', '..', '..', 'node_modules', '.bin', 'tsx'), + ]; + for (const candidate of candidates) { + if (existsSync(candidate)) return candidate; + } + return 'tsx'; +} + +const DEFAULT_STATE_PATH = '.proof/run-state.json'; const DEFAULT_MAX_RESTARTS = 20; interface SupervisorArgs { @@ -97,7 +124,8 @@ function resolveAgainstCwd(path: string, cwd: string): string { async function runOnce(argv: readonly string[]): Promise { return new Promise((resolveCode) => { - const child = spawn(TSX_BIN, [RUNNER, ...argv], { + const command = IS_TS_SOURCE ? resolveTsxBin() : process.execPath; + const child = spawn(command, [RUNNER, ...argv], { cwd: process.cwd(), env: process.env, stdio: 'inherit', @@ -111,7 +139,7 @@ async function runOnce(argv: readonly string[]): Promise { }); child.on('error', (err) => { console.error( - `[dag-runner-supervisor] failed to launch runner: ${err.message}` + `[proof-supervisor] failed to launch runner: ${err.message}` ); resolveCode(1); }); @@ -131,7 +159,7 @@ async function main(): Promise { if (restart > 0) { argv = setFlag(argv, '--resume-state', absoluteStatePath); console.log( - `[dag-runner-supervisor] restart ${restart}/${parsed.maxRestarts} from ${absoluteStatePath}` + `[proof-supervisor] restart ${restart}/${parsed.maxRestarts} from ${absoluteStatePath}` ); } @@ -142,14 +170,14 @@ async function main(): Promise { } console.error( - `[dag-runner-supervisor] exceeded --max-runner-restarts=${parsed.maxRestarts}` + `[proof-supervisor] exceeded --max-runner-restarts=${parsed.maxRestarts}` ); process.exit(1); } main().catch((err) => { console.error( - `[dag-runner-supervisor] fatal: ${ + `[proof-supervisor] fatal: ${ err instanceof Error ? err.stack ?? 
err.message : err }` ); diff --git a/.cursor/skills/dag-task-runner/scripts/self_hosting.ts b/.cursor/skills/dag-task-runner/scripts/self_hosting.ts index d790863f..c5c5f0b2 100644 --- a/.cursor/skills/dag-task-runner/scripts/self_hosting.ts +++ b/.cursor/skills/dag-task-runner/scripts/self_hosting.ts @@ -70,8 +70,13 @@ export async function snapshotRunnerRuntimeFiles( const snapshot: RunnerFileSnapshot = new Map(); for (const rel of RUNNER_RUNTIME_FILES) { const path = join(scriptsDir, rel); - const s = await stat(path); - snapshot.set(path, { path, size: s.size, mtimeMs: s.mtimeMs }); + try { + const s = await stat(path); + snapshot.set(path, { path, size: s.size, mtimeMs: s.mtimeMs }); + } catch { + // Missing source file (e.g. installed package with `dist`-only layout). + // Skip silently — the change detector simply will not flag this file. + } } return snapshot; } diff --git a/.cursor/skills/proof/SKILL.md b/.cursor/skills/proof/SKILL.md new file mode 100644 index 00000000..36fb1962 --- /dev/null +++ b/.cursor/skills/proof/SKILL.md @@ -0,0 +1,238 @@ +--- +name: proof +description: Decompose a user's task into a DAG of subtasks and execute them with Cursor SDK local subagents in topological order, rendering live streaming status to a canvas. Each task has a complexity (HIGH/MED/LOW) that maps to a model. Use when the user asks to fan out work, decompose a task into a DAG, run subagents in parallel, or break a large task into a dependency graph. +--- + +# Proof + +Decomposes a user-described task into a JSON DAG, then runs each node as a Cursor SDK local subagent (with parents' outputs stitched into the child's prompt). Live DAG state — including each running subagent's streaming output — is rendered into a `.canvas.tsx` that the runner rewrites on every status transition; the IDE hot-recompiles so the user sees subagents move through `PENDING -> RUNNING -> FINISHED/ERROR` in real time. 
+ +The runtime ships as the workspace package `@flatbread/proof` (`packages/proof`). It exposes two CLIs — `proof` (runner) and `proof-supervisor` (self-hosting wrapper) — plus a public library API for tooling that wants to author or inspect DAGs programmatically. + +## When to use + +Trigger when the user says any of: + +- "decompose this task", "break this into a DAG", "fan out subagents" +- "run this as a graph of subtasks" +- a multi-step request where some steps clearly depend on others and others can run in parallel + +Skip when the task is a single-shot edit, a quick question, or already linear enough that one agent turn would handle it. + +## Workflow + +### Step 1 — Generate a DAG JSON + +You (the parent agent) author the DAG inline using your understanding of the user's task. Schema: + +```json +{ + "title": "", + "models": { + "HIGH": "gpt-5.3-codex", + "MED": "composer-2", + "LOW": "auto-low" + }, + "tasks": [ + { + "id": "", + "depends_on": ["", "..."], + "complexity": "HIGH | MED | LOW", + "subtask_prompt": "" + } + ] +} +``` + +Rules: + +- Every `depends_on` entry must reference another task's `id`. +- No cycles. The runner rejects cyclic DAGs at parse time. +- `complexity` controls the model the subagent uses (see table below). Pick `HIGH` for novel/complex reasoning, `MED` for typical implementation, `LOW` for mechanical/lookup tasks. +- Optional top-level `models` can override the default complexity → model map for this DAG. +- `subtask_prompt` should read like a standalone request — the runner automatically prepends a short summary of upstream task outputs, so you do not need to repeat them. +- Do **not** put two tasks that write to the same file in the same rank (siblings within a rank run concurrently and would race). + +#### Maximize parallelism — this is the whole point of the runner + +The runner executes tasks within a rank **concurrently** via `Promise.all`. A linear `A → B → C → D` DAG wastes that capability. 
Before finalizing the DAG, actively decompose the problem to surface independent work: + +1. **Default to no dependencies.** Add a `depends_on` entry **only** when the child task literally cannot start without the parent's output. "Logically follows" is not a dependency. +2. **Split read-only research and discovery into a wide first rank.** Codebase grepping, doc reading, dependency scans, schema lookups, test inventory — these almost always share rank 1 with no edges between them. +3. **Fan out post-implementation work.** Tests, docs, changelog entries, type updates, lint fixes typically all depend on the same implementation task and on nothing else — put them in one rank, not a chain. +4. **Use diamonds, not lines.** If two tasks both feed into a third, model that explicitly: rank 1 has the two parents, rank 2 is the merge. +5. **Same-rank file-write safety.** The one hard constraint: don't put two tasks in the same rank if they would write the same file. Either serialize them with a `depends_on`, or merge them into one task. + +Quality bar: when you sketch the rank structure (rank 1 → rank 2 → …), at least one rank should contain more than one task in any non-trivial problem. If your DAG is a single chain of 1-task ranks, you almost certainly missed parallelism — go back and look again. + +The example shipped with the runner (`examples/example_dag.json`) demonstrates the pattern: rank 1 fans out to two read-only research tasks, rank 2 merges them into a design, rank 3 implements, and rank 4 fans out again to tests + docs. + +Write the JSON to a temp file **and immediately generate the initial canvas** so the user can open it while subagents spin up. Run all of the following in a single shell block: + +```bash +# 0. Pick a canvas path +CANVAS_PATH="$HOME/.cursor/projects//canvases/dag-.canvas.tsx" + +# 1. Write the DAG JSON +cat > /tmp/dag-.json <<'JSON' +{ "title": "...", "tasks": [ ... ] } +JSON + +# 2. 
Build the @flatbread/proof package once per workspace install +# (skipped if dist/ is already present; safe to re-run). +[ -f "$(git rev-parse --show-toplevel)/packages/proof/dist/run_dag.js" ] || \ + pnpm -F @flatbread/proof build + +# 3. Generate the initial all-PENDING canvas (no CURSOR_API_KEY needed) +pnpm exec proof \ + --init-only \ + --dag /tmp/dag-.json \ + --canvas-path "$CANVAS_PATH" + +# 4. Best-effort auto-open of the canvas file; ignore failure in headless/non-macOS environments +open "$CANVAS_PATH" >/dev/null 2>&1 || true +``` + +The canvas path is: + +``` +~/.cursor/projects//canvases/dag-.canvas.tsx +``` + +`` is derived from the cwd's absolute path with `/` and other special chars replaced by `-`. To compute it, take `pwd`, strip the leading `/`, and replace each remaining `/` with `-`. Example: cwd `/Users/me/Code/myapp` → slug `Users-me-Code-myapp`. Use the same `` you used for the DAG JSON filename so they're easy to correlate. + +### Step 2 — Surface the canvas link in chat + +Now that the file exists on disk, post a Markdown hyperlink with the exact text `Open Canvas` and a `file://` URL, plus the absolute path for fallback: + +> I created a live canvas: [Open Canvas](file:///Users//.cursor/projects//canvases/dag-.canvas.tsx) +> Fallback path: `/Users//.cursor/projects//canvases/dag-.canvas.tsx` + +Always use the link text `Open Canvas`. Use the absolute path in both the `file://` URL and fallback path, never `~/`. Do this **before** Step 3 so the user can open the canvas while subagents are still spinning up. The Step 1 shell block already attempts to auto-open the canvas with `open`; if that fails, continue and rely on the chat link. 
+ +### Step 3 — Run the DAG + +Ensure `CURSOR_API_KEY` is set (the runner fails fast if missing), then launch: + +```bash +[ -n "$CURSOR_API_KEY" ] || { [ -f .env ] && set -a && source .env && set +a; } + +pnpm exec proof \ + --dag /tmp/dag-.json \ + --canvas-path "$CANVAS_PATH" +``` + +If the DAG is expected to edit the runner itself (`packages/proof/src/**`), launch through the supervisor instead so source edits take effect at a process boundary: + +```bash +pnpm exec proof-supervisor \ + --dag /tmp/dag-.json \ + --canvas-path "$CANVAS_PATH" \ + --state-path "$HOME/.cursor/projects//dag-state/.json" +``` + +The supervisor passes `--restart-on-runner-change` to the runner. When runner runtime files change after a rank or convergence iteration, the child runner persists state, marks the canvas `RESTARTING RUNNER`, exits `75`, and the supervisor relaunches with `--resume-state` so pending tasks continue under the new source. After editing `packages/proof/src/**`, run `pnpm -F @flatbread/proof build` so the relaunch picks up the new code. + +Same `--canvas-path` as Step 1. The runner: + +1. Validates the DAG and reuses the existing canvas file. +2. For each rank (Kahn topo-sort), launches ready tasks concurrently as local Cursor SDK agents and rewrites the canvas as each one transitions, streaming assistant text into each task card live. +3. Automatically skips tasks whose upstream dependencies failed (marks them `ERROR` with a "Skipped: upstream task(s) … failed" message). +4. Captures each subagent's final assistant text, status, token usage, and duration. +5. Writes a final canvas with summary stats. +6. On SIGINT/SIGTERM/SIGHUP, cancels all in-flight subagents before finalizing the canvas. + +#### CLI knobs + +| Flag | Default | Purpose | +| ------------------------------- | ------------------ | ------------------------------------------------------------------------- | +| `--models-file ` | — | JSON file containing a partial complexity → model override map. 
|
+| `--state-path <path>` | — | Persist resumable state after rank boundaries. |
+| `--resume-state <path>` | — | Resume from a persisted state file. |
+| `--restart-on-runner-change` | `false` | Exit `75` after runner runtime files change so a supervisor can relaunch. |
+| `--task-timeout-ms <ms>` | `1200000` (20 min) | Marks a task `ERROR` if it runs too long. |
+| `--stream-publish-ms <ms>` | `500` | Throttles live canvas streaming writes. |
+| `--stream-idle-timeout-ms <ms>` | `300000` (5 min) | Marks a task `ERROR` if no stream events arrive. |
+| `--debounce <ms>` | `200` | Canvas write debounce interval. |
+
+### Step 4 — Summarize
+
+After the runner exits, briefly summarize what completed/failed and re-link the canvas with the exact text `[Open Canvas](file:///Users/<user>/.cursor/projects/<slug>/canvases/dag-<slug>.canvas.tsx)` so the user can scroll back to it. Include the absolute fallback path only if useful.
+
+## Complexity → model
+
+| Complexity | Model             |
+| ---------- | ----------------- |
+| HIGH       | `claude-opus-4-7` |
+| MED        | `composer-2`      |
+| LOW        | `gpt-5.4-nano`    |
+
+Override any subset inline with top-level DAG `models`, or pass a reusable profile with `--models-file <path>`. Precedence is defaults < DAG `models` < `--models-file`. The Cursor model catalog can vary by account.
+
+### Discovering valid model ids
+
+Many Cursor CLI catalog models encode reasoning effort and Max Mode as **slug suffixes** (e.g. `claude-opus-4-7-thinking-max`, `gpt-5.5-extra-high`, `gpt-5.3-codex-xhigh`), but the Cursor SDK may accept only base slugs. Do not compose SDK model ids from CLI suffixes by hand. For SDK-bound code, prefer `Cursor.models.list()` or the SDK's `ConfigurationError` catalog over `cursor-agent --list-models`. 
+
+Ways to enumerate model ids:
+
+```bash
+# CLI catalog — useful for CLI runs, not authoritative for @cursor/sdk
+cursor-agent --list-models
+
+# SDK-flavored alternative — also prints any per-model `parameters` and preset `variants`
+pnpm -F @flatbread/proof models:list # all ids
+pnpm -F @flatbread/proof models:list <model-id> # detail for one model
+pnpm -F @flatbread/proof models:list --json
+```
+
+## Auth
+
+The runner reads `CURSOR_API_KEY` from the environment. Set it however you usually manage secrets:
+
+```bash
+export CURSOR_API_KEY=crsr_...
+```
+
+If the current workspace has a `.env` containing it, source that first:
+
+```bash
+set -a && source .env && set +a
+```
+
+## CLI options
+
+| Flag | Default | Notes |
+| ---------------------------- | ------------------- | ------------------------------------------------------------------------------------------------------------- |
+| `--dag` | required | Path to the DAG JSON file. |
+| `--canvas-path` | composed from below | Full absolute path to the canvas file. Preferred — used by the parent-managed flow. |
+| `--canvas` | — | Canvas filename stem (no `.canvas.tsx`). Used only if `--canvas-path` is omitted. |
+| `--canvases-dir` | derived from cwd | Override the canvases output directory. Used only with `--canvas`. |
+| `--cwd` | `process.cwd()` | Working dir each subagent operates in. |
+| `--models-file` | — | JSON file containing a partial complexity → model override map. |
+| `--debounce` | `200` (ms) | Canvas write debounce interval. |
+| `--init-only` | `false` | Write the initial all-`PENDING` canvas and exit. No `CURSOR_API_KEY` required. |
+| `--state-path` | — | Persist resumable runner state. Defaults to `.proof/run-state.json` when `--restart-on-runner-change` is set. |
+| `--resume-state` | — | Load a persisted `RunState` and skip already terminal tasks. |
+| `--restart-on-runner-change` | `false` | Detect runner runtime file changes after safe boundaries and exit `75` for supervisor restart. 
| +| `--max-runner-restarts` | `20` | Supervisor-only cap for relaunches from `proof-supervisor`. | +| `--task-timeout-ms` | `1200000` (20 min) | Marks a task `ERROR` if it exceeds this duration. | +| `--stream-publish-ms` | `500` (ms) | Throttles live canvas streaming writes to avoid excessive cloning. | +| `--stream-idle-timeout-ms` | `300000` (5 min) | Marks a task `ERROR` if no stream events arrive within this window. | + +## Caveats + +- Local runtime only — every subagent runs against `--cwd` (defaults to wherever you invoke the runner). +- Sibling tasks in the same rank run in parallel; do not let them write the same files. +- Inline MCP servers and sub-sub-agents are not configured by this runner. +- A failed task automatically skips all downstream dependents (they are marked `ERROR` with a "Skipped: upstream task(s) … failed" message). This prevents wasted API calls on tasks whose inputs are missing. +- Per-task streamed text is capped at `STREAM_CAP = 4000` chars to keep the canvas file modest. Upstream context passed to child tasks is capped at 2000 chars per parent. +- Timed-out tasks are marked `ERROR` instead of staying indefinitely in `RUNNING`. +- SIGINT/SIGTERM/SIGHUP gracefully cancel all in-flight subagents and finalize the canvas before exiting. +- Unexpected unhandled rejections from SDK internals are suppressed to prevent runner crashes; uncaught exceptions are logged and trigger a clean shutdown. + +## Reference + +- Package: `@flatbread/proof` at `packages/proof` +- DAG schema example: `examples/example_dag.json` +- Library exports: `import { parseDAG, computeRanks, ... 
} from '@flatbread/proof'` +- Cursor SDK docs: https://cursor.com/docs/api/sdk/typescript diff --git a/.cursor/skills/proof/examples/dag-flatbread-flow-pmf-audit.json b/.cursor/skills/proof/examples/dag-flatbread-flow-pmf-audit.json new file mode 100644 index 00000000..7e609e53 --- /dev/null +++ b/.cursor/skills/proof/examples/dag-flatbread-flow-pmf-audit.json @@ -0,0 +1,52 @@ +{ + "title": "Flatbread Flow PMF Audit (no sub-sub-agents)", + "framing": "Treat Flatbread as Git-native relational content for TypeScript apps, backed by flat files. GraphQL is one interface, not the whole product identity.", + "models": { + "HIGH": "claude-opus-4-7", + "MED": "gpt-5.5", + "LOW": "gpt-5.4-mini" + }, + "tasks": [ + { + "id": "map-current-flow", + "depends_on": [], + "complexity": "MED", + "subtask_prompt": "You are acting as `flatbread-architecture-planner`. Follow its responsibilities and output schema. Output must lead with these `##` headings verbatim: `## Current contract`, `## Proposed contract`, `## Migration impact`, `## Validation plan`, `## Human checkpoints`. Do not edit files (frontmatter `readonly: true` is advisory in DAG runs).\n\nMap the current end-to-end flow: how developers define content models, sources, transformers, generated APIs/types, querying, examples, and runtime usage. Distinguish where GraphQL is structurally required vs. one of several interfaces. Read repo docs and source as needed. For `## Current contract` capture: data source config shape, root query naming, ID/ref semantics, filter capabilities, generated TypeScript shape, CLI behavior, and obvious developer-path friction. Reference files as `path/to/file.ts:line`." + }, + { + "id": "relational-content-needs", + "depends_on": [], + "complexity": "MED", + "subtask_prompt": "You are acting as `flatbread-architecture-planner`. Follow its responsibilities and output schema. 
Output must lead with these `##` headings verbatim: `## Current contract`, `## Proposed contract`, `## Migration impact`, `## Validation plan`, `## Human checkpoints`. Do not edit files.\n\nIndependently audit what a developer who wants Git-native relational content would need from Flatbread. Stay in Flatbread's vocabulary (`Content`, `BaseContentNode`, `Source`, `Transformer`, `Override` per `packages/core/src/types.ts`) — do NOT import database vocabulary like tables/foreign keys/joins/constraints/indexes/import-export. Compare needs against what the repo provides today: content collections, refs between collections, query/filter ergonomics, type safety, validation, the local edit/query loop, codegen, and example integration. Map needs to the schema's headings: `## Current contract` is what exists today, `## Proposed contract` is what would close the highest-leverage gaps, `## Migration impact` is what users would have to change, `## Validation plan` is how to prove each gap closure works." + }, + { + "id": "docs-onboarding-audit", + "depends_on": [], + "complexity": "LOW", + "subtask_prompt": "You are acting as `flatbread-architecture-planner`. Follow its responsibilities and output schema. Output must lead with these `##` headings verbatim: `## Current contract`, `## Proposed contract`, `## Migration impact`, `## Validation plan`, `## Human checkpoints`. Do not edit files.\n\nAudit the repository docs, examples, tests, package scripts, and README/onboarding path for a first-time developer. Focus on whether the relational content promise is obvious, whether the first success path is short, and where the developer is forced into GraphQL-specific concepts before they get value. `## Current contract` is the documented promise + steps; `## Proposed contract` is what the docs should promise instead; `## Migration impact` is the docs/example surface area to touch." 
+ }, + { + "id": "market-positioning-audit", + "depends_on": [], + "complexity": "MED", + "subtask_prompt": "You are acting as `flatbread-architecture-planner`. Follow its responsibilities and output schema. Output must lead with these `##` headings verbatim: `## Current contract`, `## Proposed contract`, `## Migration impact`, `## Validation plan`, `## Human checkpoints`. Do not edit files.\n\nUsing the repo as the primary evidence plus general product reasoning, audit Flatbread's likely product-market fit for developers who want Git-native relational content for TypeScript apps. Consider adjacent alternatives: Contentlayer, Velite, Keystatic, MDX-based content layers, Sanity/Contentful-style headless CMSes, Astro Content Collections, and (only when honest) embedded databases like SQLite. `## Current contract` is the implicit positioning today; `## Proposed contract` is the sharpest defensible positioning; `## Migration impact` is what the README/landing copy would need to say." + }, + { + "id": "synthesize-pmf-gaps", + "depends_on": [ + "map-current-flow", + "relational-content-needs", + "docs-onboarding-audit", + "market-positioning-audit" + ], + "complexity": "HIGH", + "subtask_prompt": "You are acting as `flatbread-architecture-planner` operating as the rank-2 merge node. Follow its output schema. Output must lead with these `##` headings verbatim: `## Current contract`, `## Proposed contract`, `## Migration impact`, `## Validation plan`, `## Human checkpoints`.\n\nSynthesize upstream audit findings into a prioritized PMF gap report. Call out GraphQL coupling honestly only when upstream evidence supports it. 
For each gap include: severity (P0/P1/P2), evidence (file refs from upstream), product implication, and the contract change it implies.\n\nIMPORTANT for survival under the 2000-char downstream stitch: keep `## Current contract` to a single 1-2 line summary so the gap table at the top of `## Proposed contract` lands within the first 2000 chars for `recommend-roadmap`." + }, + { + "id": "recommend-roadmap", + "depends_on": ["synthesize-pmf-gaps"], + "complexity": "HIGH", + "subtask_prompt": "You are acting as `flatbread-architecture-planner` producing a roadmap recommendation. Follow its output schema. Output must lead with these `##` headings verbatim: `## Current contract`, `## Proposed contract`, `## Migration impact`, `## Validation plan`, `## Human checkpoints`.\n\nBased on the synthesized PMF gaps, recommend a concise product direction and roadmap. Output a sharper positioning statement, 3-5 product primitives to add or clarify (each with file/package anchor), near-term experiments, and what not to build yet. `## Migration impact` should map each recommendation to the affected packages so a follow-up `flatbread-major-migration` DAG (template at `.cursor/skills/proof/examples/flatbread/dag-schema-migration.json`) can be authored from this output without re-deriving scope." + } + ] +} diff --git a/.cursor/skills/proof/examples/example_dag.json b/.cursor/skills/proof/examples/example_dag.json new file mode 100644 index 00000000..6c0e43bd --- /dev/null +++ b/.cursor/skills/proof/examples/example_dag.json @@ -0,0 +1,41 @@ +{ + "title": "Build a tiny CLI todo app", + "tasks": [ + { + "id": "research-stack", + "depends_on": [], + "complexity": "LOW", + "subtask_prompt": "Sketch the smallest reasonable design for a single-file Node.js CLI todo app that stores items in a local JSON file. List the supported commands (add, list, done, rm), the JSON schema, and the file layout. Output as markdown bullets only — do not write any code yet." 
+ }, + { + "id": "research-cli-conventions", + "depends_on": [], + "complexity": "LOW", + "subtask_prompt": "Summarize the conventions a small Node CLI should follow: shebang line, exit codes, stdout vs stderr usage, --help output shape, and how to parse argv without a dependency. Output as markdown bullets only — do not write code." + }, + { + "id": "design", + "depends_on": ["research-stack", "research-cli-conventions"], + "complexity": "MED", + "subtask_prompt": "Combine the upstream research into a one-page implementation plan for the todo CLI. Specify file paths, function signatures, error handling, and the JSON storage shape. Output a markdown design doc — still no code." + }, + { + "id": "implement", + "depends_on": ["design"], + "complexity": "MED", + "subtask_prompt": "Implement the design as `todo.mjs` in the current working directory. It must be a single file with no dependencies, support the four commands from the design, and persist to `./todos.json`. After writing the file, run it once with `node todo.mjs --help` and include the output in your reply." + }, + { + "id": "tests", + "depends_on": ["implement"], + "complexity": "LOW", + "subtask_prompt": "Add a `test_todo.mjs` script in the cwd that exercises add → list → done → rm against `todo.mjs` using a temp JSON file. Use only the Node `node:test` and `node:assert` modules. Run it with `node --test test_todo.mjs` and include the output in your reply." + }, + { + "id": "docs", + "depends_on": ["implement"], + "complexity": "LOW", + "subtask_prompt": "Write a short `README.md` in the cwd describing what `todo.mjs` does, the supported commands with examples, and where data is stored. Do not modify `todo.mjs`." 
+ } + ] +} diff --git a/.cursor/skills/proof/examples/flatbread/dag-codegen-change.json b/.cursor/skills/proof/examples/flatbread/dag-codegen-change.json new file mode 100644 index 00000000..5448195d --- /dev/null +++ b/.cursor/skills/proof/examples/flatbread/dag-codegen-change.json @@ -0,0 +1,74 @@ +{ + "title": "Flatbread codegen-only change (no sub-sub-agents)", + "framing": "Treat Flatbread as Git-native relational content for TypeScript apps, backed by flat files. GraphQL is one interface, not the whole product identity.", + "models": { + "HIGH": "claude-opus-4-7", + "MED": "gpt-5.5", + "LOW": "gpt-5.4-mini" + }, + "tasks": [ + { + "id": "diag-core-types", + "depends_on": [], + "complexity": "MED", + "subtask_prompt": "You are acting as `flatbread-architecture-planner`. Follow its responsibilities and output schema. Output must lead with these `##` headings verbatim: `## Current contract`, `## Proposed contract`, `## Migration impact`, `## Validation plan`, `## Human checkpoints`. Do not edit files (frontmatter `readonly: true` is advisory in DAG runs).\n\nDiagnose every `@flatbread/core` type that `packages/codegen` consumes (e.g. `CodegenOptions`, `CodegenResult`, `CodegenStrategy`, `Content`, `BaseContentNode`, `Override`, `Source`, `Transformer`). Note which generated artifact in `examples/nextjs/generated/**` is downstream of each. Reference files as `path/to/file.ts:line`." + }, + { + "id": "diag-codegen-input", + "depends_on": [], + "complexity": "MED", + "subtask_prompt": "You are acting as `flatbread-architecture-planner`. Follow its responsibilities and output schema. Output must lead with these `##` headings verbatim: `## Current contract`, `## Proposed contract`, `## Migration impact`, `## Validation plan`, `## Human checkpoints`. Do not edit files.\n\nDiagnose how `packages/codegen/**` currently consumes inputs from `packages/core` for the proposed change: . Capture introspection vs. 
type sourcing, document discovery, generated artifact paths landed into `examples/nextjs/generated/**`. Reference files as `path/to/file.ts:line`." + }, + { + "id": "diag-generated-output", + "depends_on": [], + "complexity": "MED", + "subtask_prompt": "You are acting as `flatbread-architecture-planner`. Follow its responsibilities and output schema. Output must lead with these `##` headings verbatim: `## Current contract`, `## Proposed contract`, `## Migration impact`, `## Validation plan`, `## Human checkpoints`. Do not edit files.\n\nDiagnose the current generated output shape consumed by `examples/nextjs`. Capture every generated file path, every TS export name the example imports, and every GraphQL document the example references. Reference files as `path/to/file.ts:line`." + }, + { + "id": "contract-synth", + "depends_on": [ + "diag-core-types", + "diag-codegen-input", + "diag-generated-output" + ], + "complexity": "HIGH", + "subtask_prompt": "You are acting as `flatbread-architecture-planner` operating as the rank-2 merge node. Follow its output schema. Output must lead with these `##` headings verbatim: `## Current contract`, `## Proposed contract`, `## Migration impact`, `## Validation plan`, `## Human checkpoints`.\n\nProduce the codegen before/after contract for the proposed change: . `## Proposed contract` must lead with an executor-actionable diff: changed file paths grouped by directory, changed TS export names, changed GraphQL document shape. `## Human checkpoints` must call out DevEx Validation gate before release." + }, + { + "id": "wait-contract-approval", + "depends_on": ["contract-synth"], + "kind": "pause" + }, + { + "id": "impl-codegen", + "depends_on": ["wait-contract-approval"], + "complexity": "MED", + "subtask_prompt": "You are acting as `flatbread-migration-executor`. Follow its responsibilities and output schema. 
Output must lead with these `##` headings verbatim: `## Files changed`, `## Contract implemented`, `## Checks run`, `## Checks skipped`, `## Residual risk`, `## Release gate state`. Group multi-file references under brace expansion.\n\nImplement the contract from the upstream synthesis exactly. Do not expand scope. Touch only `packages/codegen/**` and the generation pipeline. Run `pnpm --filter @flatbread/codegen test` and lint edited files." + }, + { + "id": "impl-example-regen", + "depends_on": ["impl-codegen"], + "complexity": "LOW", + "subtask_prompt": "You are acting as `flatbread-migration-executor`. Follow its responsibilities and output schema. Output must lead with these `##` headings verbatim: `## Files changed`, `## Contract implemented`, `## Checks run`, `## Checks skipped`, `## Residual risk`, `## Release gate state`.\n\nRegenerate `examples/nextjs` GraphQL artifacts via `pnpm --filter nextjs exec flatbread codegen` (the `--filter` is required because `flatbread.config.js` only exists at `examples/nextjs/flatbread.config.js`; `loadConfig` does not search up). Do NOT use `pnpm codegen`, which is `--watch` per `examples/nextjs/package.json:7` and would hang the DAG node. Do not hand-edit generated files. List every generated file that changed (group under brace expansion) plus any example source file that needed an import or query update." + }, + { + "id": "verify-codegen-tests", + "depends_on": ["impl-codegen"], + "complexity": "LOW", + "subtask_prompt": "You are acting as `flatbread-adversarial-reviewer`. Follow its responsibilities and output schema. Output must lead with these `##` headings verbatim: `## Blockers`, `## High-severity findings`, `## Medium-severity findings`, `## Low-severity findings`, `## Residual risk`, `## Recommended next DAG tasks`.\n\nRun `pnpm --filter @flatbread/codegen test`. Report failures, snapshot diffs, and any contract drift between the synthesized contract and what landed. 
If anything fails, populate `## Recommended next DAG tasks` with `id` + one-line subtask_prompt sketches the parent can append directly." + }, + { + "id": "verify-example-build", + "depends_on": ["impl-example-regen"], + "complexity": "LOW", + "subtask_prompt": "You are acting as `flatbread-adversarial-reviewer`. Follow its responsibilities and output schema. Output must lead with these `##` headings verbatim: `## Blockers`, `## High-severity findings`, `## Medium-severity findings`, `## Low-severity findings`, `## Residual risk`, `## Recommended next DAG tasks`.\n\nRun `pnpm --filter nextjs build` (binds port `5057` via `flatbread start -- next build` per `examples/nextjs/package.json:8`). Stop the build before exit. Report TS errors, missing imports, or query/document mismatches caused by the regenerated artifacts. If anything fails, populate `## Recommended next DAG tasks` with `id` + one-line subtask_prompt sketches." + }, + { + "id": "browser-verify", + "depends_on": ["verify-example-build"], + "complexity": "MED", + "subtask_prompt": "You are acting as `flatbread-browser-verifier`. Follow its responsibilities and output schema. Output must lead with these `##` headings verbatim: `## Commands run`, `## Routes checked`, `## Observed behavior`, `## Mismatches`, `## Screenshots`, `## Residual risk`.\n\nRun `pnpm browser:doctor` first to fail fast if the browser CLI is unavailable. Start the example dev server in the background: `pnpm --filter nextjs dev` (binds port `5057` HTTP, `5058` HTTPS per `packages/flatbread/src/cli/index.ts:128-135`); the upstream `verify-example-build` task already finished and freed the port. Wait for the server to come up before driving `pnpm exec agent-browser`. Verify documented queries and the rendered example pages still match the README. Tear the dev server down before completing the task. If the browser CLI is unavailable, your `## Residual risk` MUST lead with `BROWSER UNAVAILABLE`." 
+ } + ] +} diff --git a/.cursor/skills/proof/examples/flatbread/dag-docs-sync.json b/.cursor/skills/proof/examples/flatbread/dag-docs-sync.json new file mode 100644 index 00000000..0dc276c0 --- /dev/null +++ b/.cursor/skills/proof/examples/flatbread/dag-docs-sync.json @@ -0,0 +1,51 @@ +{ + "title": "Flatbread docs / README sync (no sub-sub-agents)", + "framing": "Treat Flatbread as Git-native relational content for TypeScript apps, backed by flat files. GraphQL is one interface, not the whole product identity.", + "models": { + "HIGH": "claude-opus-4-7", + "MED": "gpt-5.5", + "LOW": "gpt-5.4-mini" + }, + "tasks": [ + { + "id": "diag-readme-claims", + "depends_on": [], + "complexity": "LOW", + "subtask_prompt": "You are acting as `flatbread-architecture-planner`. Follow its responsibilities and output schema. Output must lead with these `##` headings verbatim: `## Current contract`, `## Proposed contract`, `## Migration impact`, `## Validation plan`, `## Human checkpoints`. Do not edit files (frontmatter `readonly: true` is advisory in DAG runs).\n\nList every claim, command, code snippet, and example query in the root `README.md` and each `packages/*/README.md`. For each, mark whether the current implementation still matches. Reference files as `path/to/file.md:line`." + }, + { + "id": "diag-example-paths", + "depends_on": [], + "complexity": "LOW", + "subtask_prompt": "You are acting as `flatbread-architecture-planner`. Follow its responsibilities and output schema. Output must lead with these `##` headings verbatim: `## Current contract`, `## Proposed contract`, `## Migration impact`, `## Validation plan`, `## Human checkpoints`. Do not edit files.\n\nWalk `examples/nextjs` end-to-end and capture the actual first-success path a developer sees: setup commands, codegen invocation (note `package.json:7` is `--watch`; the docs should point users at the appropriate command), dev command, port (`5057` HTTP, `5058` HTTPS sibling), sample query, sample edit. 
Compare against what the docs claim." + }, + { + "id": "diag-positioning", + "depends_on": [], + "complexity": "LOW", + "subtask_prompt": "You are acting as `flatbread-architecture-planner`. Follow its responsibilities and output schema. Output must lead with these `##` headings verbatim: `## Current contract`, `## Proposed contract`, `## Migration impact`, `## Validation plan`, `## Human checkpoints`. Do not edit files.\n\nAudit positioning language in `README.md`, package READMEs, and any landing copy. Flag any phrasing that overclaims database-replacement (tables, foreign keys, joins, constraints, indexes, import/export) or treats GraphQL as the entire product identity." + }, + { + "id": "docs-plan", + "depends_on": [ + "diag-readme-claims", + "diag-example-paths", + "diag-positioning" + ], + "complexity": "MED", + "subtask_prompt": "You are acting as `flatbread-architecture-planner` operating as the rank-2 merge node. Follow its output schema. Output must lead with these `##` headings verbatim: `## Current contract`, `## Proposed contract`, `## Migration impact`, `## Validation plan`, `## Human checkpoints`.\n\nProduce a concrete docs-edit plan. `## Proposed contract` must lead with a flat list of `path/to/file.md:line — change` (group adjacent edits) so the executor can apply edits without re-reading the diagnoses." + }, + { + "id": "impl-docs", + "depends_on": ["docs-plan"], + "complexity": "MED", + "subtask_prompt": "You are acting as `flatbread-migration-executor`. Follow its responsibilities and output schema. Output must lead with these `##` headings verbatim: `## Files changed`, `## Contract implemented`, `## Checks run`, `## Checks skipped`, `## Residual risk`, `## Release gate state`. Group multi-file references under brace expansion.\n\nApply the docs plan exactly. Touch only `*.md` files. Do not modify code. Run the project's lint/format on changed files." 
+ }, + { + "id": "review-docs", + "depends_on": ["impl-docs"], + "complexity": "LOW", + "subtask_prompt": "You are acting as `flatbread-adversarial-reviewer`. Follow its responsibilities and output schema. Output must lead with these `##` headings verbatim: `## Blockers`, `## High-severity findings`, `## Medium-severity findings`, `## Low-severity findings`, `## Residual risk`, `## Recommended next DAG tasks`.\n\nReview the docs diff for: stale commands, drifting code snippets, broken cross-links, and any positioning that overclaims database semantics. If anything fails, populate `## Recommended next DAG tasks` with `id` + one-line subtask_prompt sketches the parent can append directly." + } + ] +} diff --git a/.cursor/skills/proof/examples/flatbread/dag-schema-migration.json b/.cursor/skills/proof/examples/flatbread/dag-schema-migration.json new file mode 100644 index 00000000..97a8e0e9 --- /dev/null +++ b/.cursor/skills/proof/examples/flatbread/dag-schema-migration.json @@ -0,0 +1,153 @@ +{ + "title": "Flatbread schema-breaking migration (no sub-sub-agents; pause at human checkpoint after contract-synth)", + "framing": "Treat Flatbread as Git-native relational content for TypeScript apps, backed by flat files. GraphQL is one interface, not the whole product identity.", + "models": { + "HIGH": "claude-opus-4-7", + "MED": "gpt-5.5", + "LOW": "gpt-5.4-mini" + }, + "tasks": [ + { + "id": "diag-schema", + "depends_on": [], + "complexity": "MED", + "subtask_prompt": "You are acting as `flatbread-architecture-planner`. Follow its responsibilities and output schema. Output must lead with these `##` headings verbatim: `## Current contract`, `## Proposed contract`, `## Migration impact`, `## Validation plan`, `## Human checkpoints`. Do not edit files (frontmatter `readonly: true` is advisory in DAG runs).\n\nDiagnose `packages/core/src/generators/schema.ts` for the proposed change: . 
For `## Current contract` capture root query naming, ID/ref semantics, filter capabilities, and any user-visible GraphQL surface this file generates. Reference files as `path/to/file.ts:line`." + }, + { + "id": "diag-resolvers", + "depends_on": [], + "complexity": "MED", + "subtask_prompt": "You are acting as `flatbread-architecture-planner`. Follow its responsibilities and output schema. Output must lead with these `##` headings verbatim: `## Current contract`, `## Proposed contract`, `## Migration impact`, `## Validation plan`, `## Human checkpoints`. Do not edit files.\n\nDiagnose `packages/core/src/resolvers/arguments.ts` for the proposed change: . For `## Current contract` capture filter shape, supported operators, and any internal contracts other resolvers depend on. Reference files as `path/to/file.ts:line`." + }, + { + "id": "diag-types", + "depends_on": [], + "complexity": "MED", + "subtask_prompt": "You are acting as `flatbread-architecture-planner`. Follow its responsibilities and output schema. Output must lead with these `##` headings verbatim: `## Current contract`, `## Proposed contract`, `## Migration impact`, `## Validation plan`, `## Human checkpoints`. Do not edit files.\n\nDiagnose `packages/core/src/types.ts` for the proposed change: . Capture which exported types cross package boundaries to `@flatbread/codegen`, the CLI, transformers, and examples. Reference files as `path/to/file.ts:line`." + }, + { + "id": "diag-codegen", + "depends_on": [], + "complexity": "MED", + "subtask_prompt": "You are acting as `flatbread-architecture-planner`. Follow its responsibilities and output schema. Output must lead with these `##` headings verbatim: `## Current contract`, `## Proposed contract`, `## Migration impact`, `## Validation plan`, `## Human checkpoints`. Do not edit files.\n\nDiagnose `packages/codegen/**` for the proposed change: . 
Capture how generated TypeScript and GraphQL documents are produced, which inputs from `@flatbread/core` they depend on, and how the generated artifacts land in `examples/nextjs`. Reference files as `path/to/file.ts:line`." + }, + { + "id": "diag-cli", + "depends_on": [], + "complexity": "MED", + "subtask_prompt": "You are acting as `flatbread-architecture-planner`. Follow its responsibilities and output schema. Output must lead with these `##` headings verbatim: `## Current contract`, `## Proposed contract`, `## Migration impact`, `## Validation plan`, `## Human checkpoints`. Do not edit files.\n\nDiagnose `packages/flatbread/src/cli/index.ts` and the GraphQL server wiring for the proposed change: . Capture the Flatbread start command behavior, codegen invocation, `/graphql` endpoint, port `5057` (HTTP) and `5058` (HTTPS sibling at `packages/flatbread/src/cli/index.ts:128-135`). Reference files as `path/to/file.ts:line`." + }, + { + "id": "diag-examples", + "depends_on": [], + "complexity": "MED", + "subtask_prompt": "You are acting as `flatbread-architecture-planner`. Follow its responsibilities and output schema. Output must lead with these `##` headings verbatim: `## Current contract`, `## Proposed contract`, `## Migration impact`, `## Validation plan`, `## Human checkpoints`. Do not edit files.\n\nDiagnose `examples/nextjs` for the proposed change: . Capture which generated GraphQL documents and types the example consumes, which queries would need to change, and which README/docs snippets would drift. Note: `examples/nextjs/package.json:7` defines `pnpm codegen` as `flatbread codegen --watch` (hangs in DAG runs); list non-`--watch` invocations instead." + }, + { + "id": "diag-docs", + "depends_on": [], + "complexity": "LOW", + "subtask_prompt": "You are acting as `flatbread-architecture-planner`. Follow its responsibilities and output schema. 
Output must lead with these `##` headings verbatim: `## Current contract`, `## Proposed contract`, `## Migration impact`, `## Validation plan`, `## Human checkpoints`. Do not edit files.\n\nDiagnose `README.md` and each `packages/*/README.md` for the proposed change: . List every snippet, command, or claim that would no longer be true. Reference files as `path/to/file.md:line`." + }, + { + "id": "diag-transformers", + "depends_on": [], + "complexity": "MED", + "subtask_prompt": "You are acting as `flatbread-architecture-planner`. Follow its responsibilities and output schema. Output must lead with these `##` headings verbatim: `## Current contract`, `## Proposed contract`, `## Migration impact`, `## Validation plan`, `## Human checkpoints`. Do not edit files.\n\nDiagnose `packages/transformer-markdown/**` and `packages/transformer-yaml/**` for the proposed change: . Capture how each implements `Transformer` (interface at `packages/core/src/types.ts:73-82`), especially `preknownSchemaFragments` (`packages/core/src/types.ts:79`), and which extensions they own. Reference files as `path/to/file.ts:line`." + }, + { + "id": "diag-source-plugins", + "depends_on": [], + "complexity": "MED", + "subtask_prompt": "You are acting as `flatbread-architecture-planner`. Follow its responsibilities and output schema. Output must lead with these `##` headings verbatim: `## Current contract`, `## Proposed contract`, `## Migration impact`, `## Validation plan`, `## Human checkpoints`. Do not edit files.\n\nDiagnose `packages/source-filesystem/**` for the proposed change: . Capture how it implements `Source` (interface at `packages/core/src/types.ts:95-101`), especially `fetch` and `fetchByType`, and which file-discovery assumptions would shift. Reference files as `path/to/file.ts:line`." + }, + { + "id": "diag-config", + "depends_on": [], + "complexity": "MED", + "subtask_prompt": "You are acting as `flatbread-architecture-planner`. Follow its responsibilities and output schema. 
Output must lead with these `##` headings verbatim: `## Current contract`, `## Proposed contract`, `## Migration impact`, `## Validation plan`, `## Human checkpoints`. Do not edit files.\n\nDiagnose `packages/config/**` (especially `packages/config/src/validate.ts`) for the proposed change: . Capture the validated `FlatbreadConfig` shape, every required field, and how validation diagnostics surface to the CLI. Reference files as `path/to/file.ts:line`." + }, + { + "id": "contract-synth", + "depends_on": [ + "diag-schema", + "diag-resolvers", + "diag-types", + "diag-codegen", + "diag-cli", + "diag-examples", + "diag-docs", + "diag-transformers", + "diag-source-plugins", + "diag-config" + ], + "complexity": "HIGH", + "subtask_prompt": "You are acting as `flatbread-architecture-planner` operating as the rank-2 contract synthesis node. Follow its output schema. Output must lead with these `##` headings verbatim: `## Current contract`, `## Proposed contract`, `## Migration impact`, `## Validation plan`, `## Human checkpoints`.\n\nMerge upstream diagnoses into a single before/after contract for IDs, refs, filters, root query names, generated TypeScript, config shape, transformer/source contracts, and CLI behavior. `## Proposed contract` must lead with the literal markdown table `| field | before | after | breaking? | files_to_change |` as the first content under the heading, followed by executor-actionable details (changed file paths grouped by directory, changed export/type names, changed CLI flags) before any prose. `## Human checkpoints` must explicitly call out Schema Contract approval before any executor runs.\n\nIMPORTANT for survival under the 2000-char downstream stitch: keep `## Current contract` to a single 1-2 line summary so the executor-actionable diff at the top of `## Proposed contract` lands within the first 2000 chars for every downstream `impl-*` task." 
+ }, + { + "id": "wait-contract-approval", + "depends_on": ["contract-synth"], + "kind": "pause" + }, + { + "id": "impl-core", + "depends_on": ["wait-contract-approval"], + "complexity": "HIGH", + "subtask_prompt": "You are acting as `flatbread-migration-executor`. Follow its responsibilities and output schema. Output must lead with these `##` headings verbatim: `## Files changed`, `## Contract implemented`, `## Checks run`, `## Checks skipped`, `## Residual risk`, `## Release gate state`. Group multi-file references under brace expansion (e.g. `packages/core/src/{generators/schema.ts,resolvers/arguments.ts,types.ts}`).\n\nImplement the contract from `contract-synth` exactly. Do not expand scope. Touch only `packages/core/src/**`. Serialize edits in this order inside this single task: schema.ts → arguments.ts → types.ts. Run `pnpm --filter @flatbread/core build` to confirm the package compiles (note: `packages/core/package.json` has no `test` script as of this writing) and lint edited files. Record the build outcome under `## Checks run`." + }, + { + "id": "impl-docs", + "depends_on": ["wait-contract-approval"], + "complexity": "MED", + "subtask_prompt": "You are acting as `flatbread-migration-executor`. Follow its responsibilities and output schema. Output must lead with these `##` headings verbatim: `## Files changed`, `## Contract implemented`, `## Checks run`, `## Checks skipped`, `## Residual risk`, `## Release gate state`.\n\nUpdate `README.md`, every `packages/*/README.md`, and any migration notes the contract requires. Touch only `*.md` files — no code edits. Group changes under brace expansion in `## Files changed`." + }, + { + "id": "impl-codegen", + "depends_on": ["impl-core"], + "complexity": "MED", + "subtask_prompt": "You are acting as `flatbread-migration-executor`. Follow its responsibilities and output schema. 
Output must lead with these `##` headings verbatim: `## Files changed`, `## Contract implemented`, `## Checks run`, `## Checks skipped`, `## Residual risk`, `## Release gate state`.\n\nImplement the codegen-side of the contract. Touch only `packages/codegen/**`. Run `pnpm --filter @flatbread/codegen test` and lint edited files." + }, + { + "id": "impl-cli", + "depends_on": ["impl-core"], + "complexity": "MED", + "subtask_prompt": "You are acting as `flatbread-migration-executor`. Follow its responsibilities and output schema. Output must lead with these `##` headings verbatim: `## Files changed`, `## Contract implemented`, `## Checks run`, `## Checks skipped`, `## Residual risk`, `## Release gate state`.\n\nImplement the CLI-side of the contract. Touch only `packages/flatbread/src/**`. Do not run `flatbread start` (port 5057 is reserved for the rank-7 `verify-cli` task). Lint edited files." + }, + { + "id": "impl-examples", + "depends_on": ["impl-codegen"], + "complexity": "LOW", + "subtask_prompt": "You are acting as `flatbread-migration-executor`. Follow its responsibilities and output schema. Output must lead with these `##` headings verbatim: `## Files changed`, `## Contract implemented`, `## Checks run`, `## Checks skipped`, `## Residual risk`, `## Release gate state`.\n\nRegenerate `examples/nextjs` GraphQL artifacts via `pnpm --filter nextjs exec flatbread codegen` (the `--filter` is required because `flatbread.config.js` only exists at `examples/nextjs/flatbread.config.js`; `loadConfig` does not search up). Do NOT use `pnpm codegen`, which is `--watch` per `examples/nextjs/package.json:7` and would hang the DAG node. Update any example source file whose imports or queries broke. Group generated paths under brace expansion in `## Files changed`." + }, + { + "id": "verify-schema-snap", + "depends_on": ["impl-core"], + "complexity": "LOW", + "subtask_prompt": "You are acting as `flatbread-adversarial-reviewer`. Follow its responsibilities and output schema. 
Output must lead with these `##` headings verbatim: `## Blockers`, `## High-severity findings`, `## Medium-severity findings`, `## Low-severity findings`, `## Residual risk`, `## Recommended next DAG tasks`.\n\nDiff the generated GraphQL schema against the synthesized contract. Flag any drift. If anything fails, populate `## Recommended next DAG tasks` with `id` + one-line subtask_prompt sketches the parent can append directly." + }, + { + "id": "verify-codegen", + "depends_on": ["impl-codegen"], + "complexity": "LOW", + "subtask_prompt": "You are acting as `flatbread-adversarial-reviewer`. Follow its responsibilities and output schema. Output must lead with these `##` headings verbatim: `## Blockers`, `## High-severity findings`, `## Medium-severity findings`, `## Low-severity findings`, `## Residual risk`, `## Recommended next DAG tasks`.\n\nRun `pnpm --filter @flatbread/codegen test` and report failures. If anything fails, populate `## Recommended next DAG tasks` with `id` + one-line subtask_prompt sketches." + }, + { + "id": "verify-readme", + "depends_on": ["impl-docs"], + "complexity": "LOW", + "subtask_prompt": "You are acting as `flatbread-adversarial-reviewer`. Follow its responsibilities and output schema. Output must lead with these `##` headings verbatim: `## Blockers`, `## High-severity findings`, `## Medium-severity findings`, `## Low-severity findings`, `## Residual risk`, `## Recommended next DAG tasks`.\n\nDiff every README example and command against actual runtime behavior implied by the synthesized contract. Flag stale snippets, broken links, and any positioning drift. If anything fails, populate `## Recommended next DAG tasks` with `id` + one-line subtask_prompt sketches." + }, + { + "id": "verify-cli", + "depends_on": ["impl-cli", "verify-codegen"], + "complexity": "MED", + "subtask_prompt": "You are acting as `flatbread-adversarial-reviewer`. Follow its responsibilities and output schema. 
Output must lead with these `##` headings verbatim: `## Blockers`, `## High-severity findings`, `## Medium-severity findings`, `## Low-severity findings`, `## Residual risk`, `## Recommended next DAG tasks`.\n\nSmoke-test `pnpm --filter nextjs dev` and the `/graphql` endpoint on port `5057` (HTTP) and `5058` (HTTPS). This task is the sole port-5057 occupant of its rank — no other task may bind that port concurrently. Stop the server before exit so `browser-verify` can take the port. If anything fails, populate `## Recommended next DAG tasks` with `id` + one-line subtask_prompt sketches." + }, + { + "id": "browser-verify", + "depends_on": ["impl-examples", "verify-cli"], + "complexity": "MED", + "subtask_prompt": "You are acting as `flatbread-browser-verifier`. Follow its responsibilities and output schema. Output must lead with these `##` headings verbatim: `## Commands run`, `## Routes checked`, `## Observed behavior`, `## Mismatches`, `## Screenshots`, `## Residual risk`.\n\nRun `pnpm browser:doctor` first to fail fast if the browser CLI is unavailable. Start the example dev server in the background: `pnpm --filter nextjs dev` (binds port `5057` HTTP, `5058` HTTPS per `packages/flatbread/src/cli/index.ts:128-135`); the upstream `verify-cli` task already stopped its server before exit, so the port is free. Wait for the server to come up before driving `pnpm exec agent-browser`. Verify documented queries and rendered example pages still match READMEs. Tear the dev server down before completing the task. If the browser CLI is unavailable, your `## Residual risk` MUST lead with `BROWSER UNAVAILABLE` so the parent re-queues. This is the terminal release-gate node." 
+ } + ] +} diff --git a/examples/nextjs/lib/graphql.ts b/examples/nextjs/lib/graphql.ts index b340d266..a33538ac 100644 --- a/examples/nextjs/lib/graphql.ts +++ b/examples/nextjs/lib/graphql.ts @@ -22,17 +22,21 @@ export async function graphqlFetch( variables?: Record, endpoint: string = 'http://localhost:5057/graphql' ): Promise { + const controller = new AbortController(); + const timeout = setTimeout(() => controller.abort(), 15000); + const response = await fetch(endpoint, { method: 'POST', headers: { 'Content-Type': 'application/json', Accept: 'application/json', }, + signal: controller.signal, body: JSON.stringify({ query, variables, }), - }); + }).finally(() => clearTimeout(timeout)); if (!response.ok) { throw new Error(`HTTP error! status: ${response.status}`); diff --git a/package.json b/package.json index 3ffb14c8..e8c869cb 100644 --- a/package.json +++ b/package.json @@ -42,6 +42,7 @@ "@flatbread/codegen": "workspace:*", "@flatbread/config": "workspace:*", "@flatbread/core": "workspace:*", + "@flatbread/proof": "workspace:*", "@flatbread/resolver-svimg": "workspace:*", "@flatbread/source-filesystem": "workspace:*", "@flatbread/transformer-markdown": "workspace:*", diff --git a/packages/proof/bin/proof-supervisor.js b/packages/proof/bin/proof-supervisor.js new file mode 100755 index 00000000..468147a8 --- /dev/null +++ b/packages/proof/bin/proof-supervisor.js @@ -0,0 +1,22 @@ +#!/usr/bin/env node +import { resolve } from 'path'; +import { existsSync } from 'fs'; + +if (process.env.FLATBREAD_CI) { + const cliPath = resolve( + process.cwd(), + 'node_modules', + '@flatbread', + 'proof', + 'dist', + 'run_dag_supervisor.js' + ); + + if (existsSync(cliPath)) { + import('../dist/run_dag_supervisor.js'); + } else { + console.log('@flatbread/proof supervisor CLI is not available'); + } +} else { + import('../dist/run_dag_supervisor.js'); +} diff --git a/packages/proof/bin/proof.js b/packages/proof/bin/proof.js new file mode 100755 index 00000000..b350074a --- 
/dev/null +++ b/packages/proof/bin/proof.js @@ -0,0 +1,22 @@ +#!/usr/bin/env node +import { resolve } from 'path'; +import { existsSync } from 'fs'; + +if (process.env.FLATBREAD_CI) { + const cliPath = resolve( + process.cwd(), + 'node_modules', + '@flatbread', + 'proof', + 'dist', + 'run_dag.js' + ); + + if (existsSync(cliPath)) { + import('../dist/run_dag.js'); + } else { + console.log('@flatbread/proof CLI is not available'); + } +} else { + import('../dist/run_dag.js'); +} diff --git a/packages/proof/package.json b/packages/proof/package.json new file mode 100644 index 00000000..2a8ce6d0 --- /dev/null +++ b/packages/proof/package.json @@ -0,0 +1,51 @@ +{ + "name": "@flatbread/proof", + "version": "0.1.0-alpha.0", + "description": "Decompose a task into a DAG of subagents and prove they did the work — live canvas, oracles, pause gates, and convergence loops.", + "type": "module", + "scripts": { + "build": "tsup", + "dev": "tsup --watch src", + "typecheck": "tsc -p tsconfig.json --noEmit", + "models:list": "tsx src/list_models.ts" + }, + "repository": { + "type": "git", + "url": "git+https://github.com/FlatbreadLabs/flatbread.git", + "directory": "packages/proof" + }, + "homepage": "https://github.com/FlatbreadLabs/flatbread/tree/main/packages/proof#readme", + "author": "Tony Ketcham ", + "license": "MIT", + "bugs": { + "url": "https://github.com/FlatbreadLabs/flatbread/issues" + }, + "exports": { + ".": "./dist/index.js" + }, + "main": "dist/index.js", + "module": "dist/index.js", + "types": "dist/index.d.ts", + "bin": { + "proof": "bin/proof.js", + "proof-supervisor": "bin/proof-supervisor.js" + }, + "files": [ + "bin", + "dist", + "src", + "*.d.ts" + ], + "engines": { + "node": ">=18" + }, + "dependencies": { + "@cursor/sdk": "^1.0.9" + }, + "devDependencies": { + "@types/node": "^22.10.0", + "tsup": "^8.3.0", + "tsx": "^4.19.0", + "typescript": "^5.7.0" + } +} diff --git a/packages/proof/src/canvas_writer.ts b/packages/proof/src/canvas_writer.ts new file mode 
100644 index 00000000..76c152ba --- /dev/null +++ b/packages/proof/src/canvas_writer.ts @@ -0,0 +1,920 @@ +/** + * Renders the runner's in-memory state into a self-contained `.canvas.tsx` + * file. The IDE hot-recompiles on file change, so calling write() repeatedly + * gives the user a live view of the DAG run. + * + * The canvas is fully static React + cursor/canvas — all state is inlined as + * a `const STATE = {...}` literal. Only that literal changes between writes; + * the rendered template is identical. + */ + +import { writeFile, mkdir } from 'node:fs/promises'; +import { dirname } from 'node:path'; +import type { Complexity, DAG, TaskKind } from './dag.js'; + +export type TaskStatus = + | 'PENDING' + | 'RUNNING' + | 'FINISHED' + | 'ERROR' + | 'AWAITING_APPROVAL' + | 'BUDGET-EXCEEDED'; + +export interface TaskState { + id: string; + depends_on: string[]; + complexity: Complexity; + subtask_prompt: string; + status: TaskStatus; + model: string; + /** `'task'` (default), `'pause'`, or `'oracle'`. Undefined is normalized to `'task'`. */ + kind?: TaskKind; + /** + * Shell command for `kind: 'oracle'` tasks. Surfaced in the canvas so the + * gate's pass/fail criterion is visible without reading the result body. + * Undefined for every other kind. + */ + command?: string; + /** Regex source the oracle's output is matched against (defaults to `.*`). */ + expect?: string; + startedAt?: number; + finishedAt?: number; + resultText?: string; + errorMessage?: string; + inputTokens?: number; + outputTokens?: number; + durationMs?: number; + /** + * Convergence-loop re-execution counter. 0/undefined = original run; bumped + * by 1 each time `--converge-on` re-runs this task to address upstream + * reviewer findings. + */ + iteration?: number; + /** + * Absolute path to the sentinel file the runner created for a `kind: 'pause'` + * task. Set when status === `AWAITING_APPROVAL`; persisted afterwards so the + * canvas can show "approved by removing ". 
+ */ + checkpointPath?: string; +} + +export interface RunState { + title: string; + startedAt: number; + finishedAt?: number; + /** + * Aggregate outcome of the entire run. + * + * - `SUCCESS` — every task finished cleanly. + * - `FAILED` — at least one task ended in `ERROR`. + * - `INTERRUPTED` — the runner caught a fatal signal (SIGINT/SIGTERM/SIGHUP). + * - `BUDGET_EXCEEDED` — a budget ceiling was crossed: either the + * `--converge-on` loop exhausted `--max-iterations` with the convergence + * task still reporting blockers, OR `dag.budget.maxTokensTotal` was + * exceeded. Both paths exit with `EXIT_BUDGET_EXCEEDED` (4) so + * wrappers can branch on budget overflows without parsing logs. Hyphen + * form (`BUDGET-EXCEEDED`) is reserved for the per-task `TaskStatus`; + * the run-level field uses underscores to match the rest of this enum. + * - `RESTARTING_RUNNER` — runner runtime files changed mid-run; the + * supervisor should relaunch the runner from persisted state so the next + * process executes the newly edited source. + */ + runOutcome?: + | 'SUCCESS' + | 'FAILED' + | 'INTERRUPTED' + | 'BUDGET_EXCEEDED' + | 'RESTARTING_RUNNER'; + runMessage?: string; + tasks: TaskState[]; +} + +export function initialRunState( + dag: DAG, + modelFor: (c: Complexity) => string +): RunState { + return { + title: dag.title, + startedAt: Date.now(), + tasks: dag.tasks.map((t) => ({ + id: t.id, + depends_on: t.depends_on, + complexity: t.complexity, + subtask_prompt: t.subtask_prompt, + status: 'PENDING', + model: modelFor(t.complexity), + // Normalize undefined kind → 'task' so downstream consumers (canvas + // template, runner dispatcher) never have to ?? again. + kind: t.kind ?? 'task', + // Surface oracle-only fields so the canvas can render the gate's + // command / expectation without reading the streamed result body. + ...(t.kind === 'oracle' ? { command: t.command, expect: t.expect } : {}), + })), + }; +} + +/** + * Debounced writer. 
Multiple write() calls inside the debounce window collapse + * into one filesystem write — the latest state always wins. + */ +export class CanvasWriter { + private pending: RunState | null = null; + private timer: NodeJS.Timeout | null = null; + private inFlight: Promise = Promise.resolve(); + private writeSeq = 0; + private lastFailedWriteSeq = 0; + private lastWriteError: unknown = null; + + constructor( + private readonly canvasPath: string, + private readonly debounceMs: number = 200 + ) {} + + schedule(state: RunState): void { + this.pending = state; + if (this.timer) return; + this.timer = setTimeout(() => { + this.timer = null; + const snapshot = this.pending; + this.pending = null; + if (snapshot) { + this.enqueueWrite(snapshot); + } + }, this.debounceMs); + } + + /** Force-flush any pending write and await disk completion. */ + async flush(): Promise { + if (this.timer) { + clearTimeout(this.timer); + this.timer = null; + } + const snapshot = this.pending; + this.pending = null; + const targetWriteSeq = snapshot + ? 
this.enqueueWrite(snapshot) + : this.writeSeq; + await this.inFlight; + if (targetWriteSeq > 0 && this.lastFailedWriteSeq === targetWriteSeq) { + throw this.lastWriteError; + } + } + + private enqueueWrite(state: RunState): number { + const seq = ++this.writeSeq; + this.inFlight = this.inFlight.then(async () => { + try { + await this.writeNow(state); + if (this.lastFailedWriteSeq < seq) { + this.lastWriteError = null; + } + } catch (err) { + this.lastFailedWriteSeq = seq; + this.lastWriteError = err; + } + }); + return seq; + } + + private async writeNow(state: RunState): Promise { + const source = renderCanvasSource(state); + await mkdir(dirname(this.canvasPath), { recursive: true }); + await writeFile(this.canvasPath, source, 'utf8'); + } +} + +function renderCanvasSource(state: RunState): string { + const stateLiteral = JSON.stringify(state, null, 2); + return `${HEADER}\n\nconst STATE: RunState = ${stateLiteral};\n\n${BODY}\n`; +} + +const HEADER = `/* AUTO-GENERATED by @flatbread/proof. Do not edit by hand — the runner overwrites this file. 
*/ +import { + Card, + CardBody, + CardHeader, + Divider, + H1, + H2, + Pill, + Stack, + Stat, + Text, + computeDAGLayout, + useHostTheme, +} from 'cursor/canvas'; +import { useEffect, useMemo, useState } from 'react'; + +type TaskStatus = + | 'PENDING' + | 'RUNNING' + | 'FINISHED' + | 'ERROR' + | 'AWAITING_APPROVAL' + | 'BUDGET-EXCEEDED'; +type Complexity = 'HIGH' | 'MED' | 'LOW'; +type TaskKind = 'task' | 'pause' | 'oracle'; + +interface TaskState { + id: string; + depends_on: string[]; + complexity: Complexity; + subtask_prompt: string; + status: TaskStatus; + model: string; + kind?: TaskKind; + command?: string; + expect?: string; + startedAt?: number; + finishedAt?: number; + resultText?: string; + errorMessage?: string; + inputTokens?: number; + outputTokens?: number; + durationMs?: number; + iteration?: number; + checkpointPath?: string; +} + +interface RunState { + title: string; + startedAt: number; + finishedAt?: number; + runOutcome?: + | 'SUCCESS' + | 'FAILED' + | 'INTERRUPTED' + | 'BUDGET_EXCEEDED' + | 'RESTARTING_RUNNER'; + runMessage?: string; + tasks: TaskState[]; +}`; + +const BODY = String.raw`const NODE_H = 64; +const SCROLL_STORAGE_KEY = '@flatbread/proof:scroll-y'; +const COMPLETED_DOT_COLOR = '#22c55e'; +const AWAITING_DOT_COLOR = '#f59e0b'; +const BUDGET_DOT_COLOR = '#ef4444'; +const COMPACT_BREAKPOINT_PX = 720; + +function effectiveKind(t: TaskState): TaskKind { + return t.kind ?? 
'task'; +} + +function pillToneFor(status: TaskStatus): 'neutral' | 'info' | 'success' | 'warning' { + switch (status) { + case 'PENDING': + return 'neutral'; + case 'RUNNING': + return 'info'; + case 'FINISHED': + return 'success'; + case 'ERROR': + return 'warning'; + case 'AWAITING_APPROVAL': + return 'warning'; + case 'BUDGET-EXCEEDED': + return 'warning'; + } +} + +function complexityTone(c: Complexity): 'neutral' | 'info' | 'warning' { + switch (c) { + case 'HIGH': + return 'warning'; + case 'MED': + return 'info'; + case 'LOW': + return 'neutral'; + } +} + +function formatDuration(ms?: number): string { + if (ms === undefined) return '—'; + if (ms < 1000) return ms + 'ms'; + const s = ms / 1000; + if (s < 60) return s.toFixed(1) + 's'; + const m = Math.floor(s / 60); + const rem = Math.round(s - m * 60); + return m + 'm ' + rem + 's'; +} + +function elapsed(state: RunState): number { + const end = state.finishedAt ?? Date.now(); + return end - state.startedAt; +} + +function totalTokens(state: RunState): { input: number; output: number } { + let input = 0; + let output = 0; + for (const t of state.tasks) { + input += t.inputTokens ?? 0; + output += t.outputTokens ?? 0; + } + return { input, output }; +} + +function taskElementId(taskId: string): string { + return 'task-card-' + taskId; +} + +function useViewportWidth(): number { + const [width, setWidth] = useState(1024); + + useEffect(() => { + if (typeof window === 'undefined') return; + const update = (): void => setWidth(window.innerWidth); + update(); + window.addEventListener('resize', update); + return () => window.removeEventListener('resize', update); + }, []); + + return width; +} + +function getScrollY(): number { + if (typeof window === 'undefined') return 0; + return Math.max(window.scrollY ?? 
0, 0); +} + +function saveScrollY(): void { + if (typeof window === 'undefined') return; + try { + window.sessionStorage.setItem(SCROLL_STORAGE_KEY, String(getScrollY())); + } catch { + // ignore storage failures + } +} + +function restoreScrollY(): void { + if (typeof window === 'undefined') return; + let target = 0; + try { + const raw = window.sessionStorage.getItem(SCROLL_STORAGE_KEY); + if (!raw) return; + const parsed = Number(raw); + if (!Number.isFinite(parsed) || parsed <= 0) return; + target = Math.floor(parsed); + } catch { + return; + } + + // Retry because hot-reload can run before content height has settled. + let attempts = 0; + const maxAttempts = 8; + const tick = (): void => { + attempts += 1; + const scrollHeight = Math.max( + document.documentElement?.scrollHeight ?? 0, + document.body?.scrollHeight ?? 0, + ); + const maxY = Math.max(scrollHeight - window.innerHeight, 0); + if (maxY <= 0) { + if (attempts < maxAttempts) window.requestAnimationFrame(tick); + return; + } + const desiredY = Math.min(target, maxY); + window.scrollTo({ top: desiredY, behavior: 'auto' }); + if (attempts < maxAttempts && Math.abs(getScrollY() - desiredY) > 2) { + window.requestAnimationFrame(tick); + } + }; + window.requestAnimationFrame(tick); +} + +function DAGGraph({ + state, + onNodeClick, +}: { + state: RunState; + onNodeClick?: (taskId: string) => void; +}): JSX.Element { + const theme = useHostTheme(); + const viewportWidth = useViewportWidth(); + const isCompact = viewportWidth < COMPACT_BREAKPOINT_PX; + const nodeWidth = isCompact ? 168 : 200; + const nodeGap = isCompact ? 24 : 40; + const rankGap = isCompact ? 60 : 72; + const layoutPadding = isCompact ? 
12 : 24; + const titleLimit = Math.max(12, Math.floor((nodeWidth - 44) / 7)); + const layout = computeDAGLayout({ + nodes: state.tasks.map((t) => ({ id: t.id })), + edges: state.tasks.flatMap((t) => + t.depends_on.map((d) => ({ from: d, to: t.id })), + ), + direction: 'vertical', + nodeWidth, + nodeHeight: NODE_H, + rankGap, + nodeGap, + padding: layoutPadding, + }); + + const byId = new Map(state.tasks.map((t) => [t.id, t])); + + function nodeFill(status: TaskStatus): string { + switch (status) { + case 'PENDING': + return theme.fill.tertiary; + case 'RUNNING': + return theme.fill.secondary; + case 'FINISHED': + return theme.fill.secondary; + case 'ERROR': + return theme.fill.secondary; + case 'AWAITING_APPROVAL': + return theme.fill.secondary; + case 'BUDGET-EXCEEDED': + return theme.fill.secondary; + } + } + + function nodeStroke(status: TaskStatus): string { + switch (status) { + case 'PENDING': + return theme.stroke.tertiary; + case 'RUNNING': + return theme.accent.primary; + case 'FINISHED': + return COMPLETED_DOT_COLOR; + case 'ERROR': + return theme.stroke.primary; + case 'AWAITING_APPROVAL': + return AWAITING_DOT_COLOR; + case 'BUDGET-EXCEEDED': + return BUDGET_DOT_COLOR; + } + } + + function statusGlyph(status: TaskStatus): string { + switch (status) { + case 'PENDING': + return '○'; + case 'RUNNING': + return '◐'; + case 'FINISHED': + return '●'; + case 'ERROR': + return '×'; + case 'AWAITING_APPROVAL': + return '⏸'; + case 'BUDGET-EXCEEDED': + return '⊘'; + } + } + + function statusGlyphColor(status: TaskStatus): string { + switch (status) { + case 'PENDING': + return theme.text.tertiary; + case 'RUNNING': + return theme.accent.primary; + case 'FINISHED': + return COMPLETED_DOT_COLOR; + case 'ERROR': + return theme.text.primary; + case 'AWAITING_APPROVAL': + return AWAITING_DOT_COLOR; + case 'BUDGET-EXCEEDED': + return BUDGET_DOT_COLOR; + } + } + + return ( +
+ + + + + + + {layout.edges.map((e, i) => ( + + ))} + {layout.nodes.map((n) => { + const t = byId.get(n.id); + if (!t) return null; + return ( + onNodeClick?.(n.id)} + style={{ cursor: onNodeClick ? 'pointer' : 'default' }} + > + + + {statusGlyph(t.status)} + + + {t.id.length > titleLimit ? t.id.slice(0, titleLimit - 1) + '…' : t.id} + + + {effectiveKind(t) === 'pause' + ? 'human checkpoint' + : effectiveKind(t) === 'oracle' + ? 'oracle gate' + : t.complexity + ' · ' + t.model} + + + {t.status === 'FINISHED' || t.status === 'ERROR' + ? (effectiveKind(t) === 'oracle' + ? (t.status === 'FINISHED' ? 'pass · ' : 'fail · ') + formatDuration(t.durationMs) + : formatDuration(t.durationMs)) + + ((t.iteration ?? 0) > 0 ? ' · iter ' + t.iteration : '') + : t.status === 'RUNNING' + ? 'running…' + ((t.iteration ?? 0) > 0 ? ' · iter ' + t.iteration : '') + : t.status === 'AWAITING_APPROVAL' + ? 'awaiting approval' + : t.status === 'BUDGET-EXCEEDED' + ? 'budget exceeded' + ((t.iteration ?? 0) > 0 ? ' · iter ' + t.iteration : '') + : 'pending'} + + + ); + })} + +
+ ); +} + +function SummaryStats({ + counts, +}: { + counts: { + total: number; + pending: number; + running: number; + finished: number; + error: number; + awaiting: number; + }; +}): JSX.Element { + return ( +
+ + + 0 ? 'info' : undefined} /> + 0 ? 'warning' : undefined} /> + 0 ? 'success' : undefined} /> + 0 ? 'danger' : undefined} /> +
+ ); +} + +function TaskList({ + state, + forcedOpenVersionByTaskId, +}: { + state: RunState; + forcedOpenVersionByTaskId: Record; +}): JSX.Element { + const theme = useHostTheme(); + return ( + + {state.tasks.map((t) => { + const trailing = ( +
+ + {t.complexity} + + + {t.status} + +
+ ); + return ( +
+ 0} + > + {t.id} + + + + {effectiveKind(t) === 'pause' + ? 'Human checkpoint' + : effectiveKind(t) === 'oracle' + ? 'Oracle gate (deterministic — no model)' + : 'Model ' + t.model} + {t.depends_on.length > 0 ? ' · depends on ' + t.depends_on.join(', ') : ''} + {t.durationMs !== undefined ? ' · ' + formatDuration(t.durationMs) : ''} + {t.inputTokens !== undefined || t.outputTokens !== undefined + ? ' · ' + (t.inputTokens ?? 0) + ' in / ' + (t.outputTokens ?? 0) + ' out tokens' + : ''} + {(t.iteration ?? 0) > 0 ? ' · iteration ' + t.iteration : ''} + + {effectiveKind(t) === 'pause' && t.checkpointPath ? ( + + + {t.status === 'AWAITING_APPROVAL' ? 'Pending approval — delete this file to release the gate:' : 'Approved checkpoint:'} + +
+                      rm '{t.checkpointPath}'
+                    
+
+ ) : null} + {effectiveKind(t) === 'oracle' ? ( + + + Command: + {t.command ?? '(no command)'} + + + Expect: + /{t.expect ?? '.*'}/ + + + ) : ( + + {effectiveKind(t) === 'pause' ? 'Description: ' : 'Prompt: '} + {t.subtask_prompt || (effectiveKind(t) === 'pause' ? '(no description)' : '')} + + )} + {t.resultText ? ( + + + {t.status === 'RUNNING' + ? 'Streaming output' + : t.status === 'AWAITING_APPROVAL' + ? 'Pause status' + : effectiveKind(t) === 'oracle' + ? t.status === 'FINISHED' + ? 'Oracle pass' + : 'Oracle fail' + : 'Result'} + +
+                      {t.resultText}
+                      {t.status === 'RUNNING' ? '\u2588' : ''}
+                    
+
+ ) : t.status === 'RUNNING' ? ( + + Waiting for first token… + + ) : null} + {t.errorMessage ? ( + + Error + {t.errorMessage} + + ) : null} +
+
+
+
+ ); + })} +
+ ); +} + +export default function DagRun(): JSX.Element { + const [forcedOpenVersionByTaskId, setForcedOpenVersionByTaskId] = useState>({}); + const taskIds = useMemo(() => new Set(STATE.tasks.map((t) => t.id)), []); + + useEffect(() => { + restoreScrollY(); + const onScroll = (): void => saveScrollY(); + window.addEventListener('scroll', onScroll, { passive: true }); + return () => { + saveScrollY(); + window.removeEventListener('scroll', onScroll); + }; + }, []); + + const handleNodeClick = (taskId: string): void => { + if (!taskIds.has(taskId)) return; + + setForcedOpenVersionByTaskId((prev) => ({ + ...prev, + [taskId]: (prev[taskId] ?? 0) + 1, + })); + + const targetId = taskElementId(taskId); + const scrollToTask = (): void => { + const el = document.getElementById(targetId); + if (!el) return; + el.scrollIntoView({ behavior: 'smooth', block: 'start' }); + }; + + // Wait one frame so the forced-open remount lands before we scroll. + window.requestAnimationFrame(scrollToTask); + }; + + const counts = STATE.tasks.reduce( + (acc, t) => { + acc.total += 1; + switch (t.status) { + case 'PENDING': + acc.pending += 1; + break; + case 'RUNNING': + acc.running += 1; + break; + case 'FINISHED': + acc.finished += 1; + break; + case 'ERROR': + acc.error += 1; + break; + case 'AWAITING_APPROVAL': + acc.awaiting += 1; + break; + case 'BUDGET-EXCEEDED': + // Surfaced via the per-task pill / glyph; bucketed under errored + // here so the summary counts stay stable. + acc.error += 1; + break; + } + return acc; + }, + { total: 0, pending: 0, running: 0, finished: 0, error: 0, awaiting: 0 }, + ); + const tokens = totalTokens(STATE); + const isFinal = STATE.finishedAt !== undefined; + const statusLabel = + STATE.runOutcome === 'INTERRUPTED' + ? 'INTERRUPTED' + : STATE.runOutcome === 'FAILED' + ? 'FAILED' + : STATE.runOutcome === 'BUDGET_EXCEEDED' + ? 'BUDGET-EXCEEDED' + : STATE.runOutcome === 'RESTARTING_RUNNER' + ? 'RESTARTING RUNNER' + : isFinal + ? 
'COMPLETE' + : 'RUNNING'; + const statusTone = + STATE.runOutcome === 'INTERRUPTED' || + STATE.runOutcome === 'FAILED' || + STATE.runOutcome === 'BUDGET_EXCEEDED' + ? 'danger' + : STATE.runOutcome === 'RESTARTING_RUNNER' + ? 'warning' + : isFinal + ? 'success' + : 'info'; + + return ( +
+ + +
+

{STATE.title}

+
+
+ + {statusLabel} + + + {counts.total} tasks · elapsed {formatDuration(elapsed(STATE))} + {tokens.input + tokens.output > 0 + ? ' · ' + tokens.input + ' in / ' + tokens.output + ' out tokens' + : ''} + +
+ {STATE.runMessage ? ( + + {STATE.runMessage} + + ) : null} +
+ + + + + + +

Graph

+ +
+ + + + +

Tasks

+ +
+
+
+ ); +}`; diff --git a/packages/proof/src/converge_loop.ts b/packages/proof/src/converge_loop.ts new file mode 100644 index 00000000..4fd75166 --- /dev/null +++ b/packages/proof/src/converge_loop.ts @@ -0,0 +1,156 @@ +/** + * --converge-on + --max-iterations loop helpers. + * + * The convergence task is expected to be a `flatbread-adversarial-reviewer` + * style node — its `resultText` follows the schema: + * + * ## Blockers + * … + * ## High-severity findings + * … + * ## Medium-severity findings + * … + * + * `extractConvergenceFindings` parses that result text. If `## Blockers` or + * `## High-severity findings` contain meaningful content (anything beyond + * `none`/`(none)`/`n/a` placeholders), we mark the run as having issues and + * the parent runner re-executes the ancestor subtree. + * + * `transitiveAncestors` returns the closed set of ancestor task ids in the + * DAG (the union of `depends_on` reached by repeated traversal). The runner + * filters its existing rank ordering to that set so re-execution preserves + * the same topological order as the original run. + */ + +import type { DAG } from './dag.js'; + +export interface ConvergenceFindings { + hasIssues: boolean; + blockerLines: string[]; + highSeverityLines: string[]; +} + +export function extractConvergenceFindings( + text: string | undefined +): ConvergenceFindings { + if (!text) { + return { hasIssues: false, blockerLines: [], highSeverityLines: [] }; + } + const sections = parseSections(text); + const blockerLines = filterMeaningful(sections.get('blockers') ?? []); + const highSeverityLines = filterMeaningful( + sections.get('high-severity findings') ?? [] + ); + return { + hasIssues: blockerLines.length > 0 || highSeverityLines.length > 0, + blockerLines, + highSeverityLines, + }; +} + +/** + * Splits text into sections keyed by `## ` heading text (lower-cased, + * trimmed). Sub-headings (`### …`) are kept inside their parent section. 
+ */ +function parseSections(text: string): Map { + const out = new Map(); + // Anchor on lines that start with exactly two `#` (not three+) followed by + // a space — matches `## Blockers` but skips `### Sub-section`. + const HEADING_RE = /^##(?!#)\s*(.+?)\s*$/; + const lines = text.split(/\r?\n/); + let currentHeading: string | null = null; + let currentLines: string[] = []; + for (const line of lines) { + const m = HEADING_RE.exec(line); + if (m) { + if (currentHeading !== null) out.set(currentHeading, currentLines); + currentHeading = m[1].trim().toLowerCase(); + currentLines = []; + } else if (currentHeading !== null) { + currentLines.push(line); + } + } + if (currentHeading !== null) out.set(currentHeading, currentLines); + return out; +} + +/** + * Returns lines that look like real findings — drops blanks, plain + * placeholder text like `(none)` / `none.` / `n/a`, and decorative + * separators. + */ +function filterMeaningful(lines: string[]): string[] { + const out: string[] = []; + for (const raw of lines) { + const line = raw.trim(); + if (line === '') continue; + if (isPlaceholderLine(line)) continue; + if (/^[-*_]{3,}$/.test(line)) continue; // hr separators + out.push(line); + } + return out; +} + +function isPlaceholderLine(line: string): boolean { + // Strip leading bullets, formatting punctuation, and surrounding parens — + // anything that survives gets compared against a tiny placeholder vocabulary. 
+ const stripped = line.replace(/[-_*()[\]\s.,;:!?'"`>]/g, '').toLowerCase(); + if (stripped === '') return true; + return PLACEHOLDER_WORDS.has(stripped); +} + +const PLACEHOLDER_WORDS = new Set([ + 'none', + 'na', + 'noneobserved', + 'nonefound', + 'nonenoted', + 'nothing', + 'nothingtoreport', + 'noissues', + 'noissuesfound', + 'noblockers', + 'noblockersfound', + 'nohighseverityfindings', + 'nohighseverityissues', +]); + +export function transitiveAncestors(taskId: string, dag: DAG): Set { + const byId = new Map(dag.tasks.map((t) => [t.id, t])); + const visited = new Set(); + const start = byId.get(taskId); + if (!start) return visited; + + const stack: string[] = [...start.depends_on]; + while (stack.length > 0) { + const id = stack.pop()!; + if (visited.has(id)) continue; + visited.add(id); + const t = byId.get(id); + if (!t) continue; + for (const dep of t.depends_on) stack.push(dep); + } + return visited; +} + +/** + * Renders the convergence task's `resultText` into the standard "extra + * upstream context" preamble we stitch into ancestor prompts on re-run. The + * iteration index lets re-runs distinguish their feedback from any future + * iterations. + */ +export function buildConvergenceContext( + convergeTaskId: string, + iteration: number, + resultText: string | undefined +): string { + const trimmed = (resultText ?? '').trim(); + const body = trimmed === '' ? '(empty result text)' : trimmed; + return [ + `Convergence feedback from "${convergeTaskId}" (iteration ${ + iteration - 1 + }):`, + '', + body, + ].join('\n'); +} diff --git a/packages/proof/src/dag.ts b/packages/proof/src/dag.ts new file mode 100644 index 00000000..c3325aa2 --- /dev/null +++ b/packages/proof/src/dag.ts @@ -0,0 +1,497 @@ +/** + * DAG schema parsing, validation, and topological ranking for the runner. + * + * The DAG file shape is intentionally tiny — see ../examples/example_dag.json. 
+ */ + +export type Complexity = 'HIGH' | 'MED' | 'LOW'; +export type ModelMap = Record; +export type ModelMapOverride = Partial; + +/** + * Discriminator separating LLM-backed work from non-LLM gate nodes. + * + * - `task` (default) — a normal subagent invocation; uses `complexity` to + * select a model and treats `subtask_prompt` as the LLM prompt. + * - `pause` — a no-LLM rendezvous node. The runner blocks downstream tasks + * until an out-of-band signal (sentinel file removal, timeout, etc.) is + * observed. `complexity` is irrelevant and rejected at parse time; + * `subtask_prompt` is optional and surfaced as the canvas description. + * - `oracle` — a no-LLM deterministic gate. The runner executes `command` + * and pass/fails on whether stdout/stderr matches `expect` (regex, + * defaults to `'.*'`). `complexity`, `subtask_prompt`, and any explicit + * `model` field are rejected at parse time because no model is invoked. + */ +export type TaskKind = 'task' | 'pause' | 'oracle'; + +export interface RawTask { + id: string; + depends_on: string[]; + complexity: Complexity; + subtask_prompt: string; + /** + * Optional discriminator. Absent in legacy DAG JSON, in which case the + * parser treats the task as `'task'` so every existing template keeps + * parsing untouched. Non-LLM kinds (`'pause'`, `'oracle'`) get a synthetic + * `complexity` (`'LOW'`) attached so the structural type is satisfied — + * the runner must branch on `kind` before consuming `complexity` or + * `subtask_prompt`. + */ + kind?: TaskKind; + /** + * Required for `kind: 'oracle'`. Shell command the runner executes to + * decide pass/fail. Ignored on every other kind and rejected at parse + * time if set on a non-oracle task. + */ + command?: string; + /** + * Optional for `kind: 'oracle'`. Regex applied to the command's combined + * stdout/stderr; a match is required for pass. Defaults to `'.*'` (any + * output, even empty, matches). Rejected on every other kind. 
+ * + * Note: by default the pass predicate ALSO requires `exit code === 0`. + * Set `allowNonZeroExit: true` to opt out of that requirement (only useful + * when asserting on the output of an intentionally failing command). + */ + expect?: string; + /** + * Optional for `kind: 'oracle'`. When `true`, an oracle passes on regex + * match alone, regardless of the command's exit code. Defaults to `false` + * — exit 0 is required by default because the historical regex-only + * contract silently passed `&&`-chained commands that exited non-zero. + * Rejected on every other kind. + */ + allowNonZeroExit?: boolean; +} + +export interface DAG { + title: string; + models?: ModelMapOverride; + framing?: string; + budget?: DAGBudget; + tasks: RawTask[]; +} + +export interface DAGBudget { + maxIterations?: number; + maxTokensTotal?: number; +} + +const COMPLEXITY_VALUES = new Set(['HIGH', 'MED', 'LOW']); +const COMPLEXITY_KEYS: readonly Complexity[] = ['HIGH', 'MED', 'LOW'] as const; +const TASK_KIND_VALUES = new Set(['task', 'pause', 'oracle']); +/** Synthetic placeholder so non-LLM tasks (pause, oracle) satisfy the existing structural type. The runner must branch on `kind` before consuming this. */ +const NON_LLM_SYNTHETIC_COMPLEXITY: Complexity = 'LOW'; +/** Default `expect` regex for `kind: 'oracle'` — any output (even empty) matches. */ +const DEFAULT_ORACLE_EXPECT = '.*'; + +/** Type guard — pause tasks must be detected by `kind` before any model-bound code path runs. */ +export function isPauseTask(task: RawTask): boolean { + return task.kind === 'pause'; +} + +/** Type guard — oracle tasks must be detected by `kind` before any model-bound code path runs. */ +export function isOracleTask(task: RawTask): boolean { + return task.kind === 'oracle'; +} + +/** + * Model IDs are validated at runtime by the Cursor SDK (NOT the `cursor-agent` + * CLI). 
The two catalogs differ: the CLI exposes reasoning-effort suffixes + * like `gpt-5.4-low` and `claude-opus-4-7-thinking-medium`; the SDK only + * accepts base slugs and rejects suffixed variants with + * `ConfigurationError: Cannot use this model`. + * + * The defaults below were cross-checked against the SDK's own error-message + * catalog (which `assertModelIdInList` enumerates verbatim) on 2026-05-07: + * + * default, composer-2, composer-1.5, gpt-5.3-codex, claude-sonnet-4-6, + * gpt-5.5, claude-opus-4-7, gpt-5.4, claude-opus-4-6, claude-opus-4-5, + * gpt-5.2, gemini-3.1-pro, gpt-5.4-mini, gpt-5.4-nano, claude-haiku-4-5, + * gpt-5.3-codex-spark, grok-4.3, claude-sonnet-4-5, gpt-5.2-codex, + * gpt-5.1-codex-max, gpt-5.1, gemini-3-flash, gpt-5.1-codex-mini, + * claude-sonnet-4, gpt-5-mini, gemini-2.5-flash, kimi-k2.5 + * + * To re-validate: trigger any LOW task with a deliberately-bad model id and + * read the SDK's error-message catalog; do NOT trust `cursor-agent --list-models`. + */ +export const DEFAULT_MODEL_MAP: ModelMap = { + HIGH: 'claude-opus-4-7', + MED: 'composer-2', + LOW: 'gpt-5.4-nano', +}; + +export function parseDAG(raw: unknown): DAG { + if (!raw || typeof raw !== 'object') { + throw new Error('DAG file must be a JSON object.'); + } + const obj = raw as Record; + if (typeof obj.title !== 'string' || obj.title.trim() === '') { + throw new Error('DAG.title must be a non-empty string.'); + } + if (!Array.isArray(obj.tasks) || obj.tasks.length === 0) { + throw new Error('DAG.tasks must be a non-empty array.'); + } + + const tasks: RawTask[] = obj.tasks.map((t, i) => validateTask(t, i)); + const ids = new Set(); + for (const t of tasks) { + if (ids.has(t.id)) { + throw new Error(`Duplicate task id: ${t.id}`); + } + ids.add(t.id); + } + for (const t of tasks) { + for (const dep of t.depends_on) { + if (!ids.has(dep)) { + throw new Error(`Task ${t.id} depends_on unknown id: ${dep}`); + } + if (dep === t.id) { + throw new Error(`Task ${t.id} depends on 
itself.`); + } + } + } + + detectCycle(tasks); + + const models = + obj.models === undefined + ? undefined + : validateModelMap(obj.models, 'DAG.models'); + const framing = + obj.framing === undefined ? undefined : validateFraming(obj.framing); + const budget = + obj.budget === undefined ? undefined : validateBudget(obj.budget); + + return { title: obj.title, models, framing, budget, tasks }; +} + +function validateFraming(raw: unknown): string { + if (typeof raw !== 'string') { + throw new Error('DAG.framing must be a string when set.'); + } + return raw; +} + +function validateBudget(raw: unknown): DAGBudget { + if (!raw || typeof raw !== 'object' || Array.isArray(raw)) { + throw new Error('DAG.budget must be a JSON object when set.'); + } + const obj = raw as Record; + const budget: DAGBudget = {}; + if (obj.maxIterations !== undefined) { + validateBudgetNumber(obj.maxIterations, 'DAG.budget.maxIterations'); + budget.maxIterations = obj.maxIterations; + } + if (obj.maxTokensTotal !== undefined) { + validateBudgetNumber(obj.maxTokensTotal, 'DAG.budget.maxTokensTotal'); + budget.maxTokensTotal = obj.maxTokensTotal; + } + return budget; +} + +function validateBudgetNumber( + raw: unknown, + label: string +): asserts raw is number { + if (typeof raw !== 'number' || !Number.isSafeInteger(raw) || raw < 0) { + throw new Error(`${label} must be a non-negative integer when set.`); + } +} + +function validateTask(raw: unknown, index: number): RawTask { + if (!raw || typeof raw !== 'object') { + throw new Error(`tasks[${index}] must be an object.`); + } + const t = raw as Record; + + const id = t.id; + if (typeof id !== 'string' || id.trim() === '') { + throw new Error(`tasks[${index}].id must be a non-empty string.`); + } + + const kind = resolveTaskKind(t.kind, index); + + const depends_on = t.depends_on ?? 
[]; + if ( + !Array.isArray(depends_on) || + depends_on.some((d) => typeof d !== 'string') + ) { + throw new Error(`tasks[${index}].depends_on must be an array of strings.`); + } + const dedupedDepends = [...new Set(depends_on as string[])]; + + if (kind === 'pause') { + if (t.complexity !== undefined) { + throw new Error( + `tasks[${index}] (id="${id}") is kind="pause" and must not set complexity (no LLM is invoked).` + ); + } + if (t.command !== undefined) { + throw new Error( + `tasks[${index}] (id="${id}") is kind="pause" and must not set command (only kind="oracle" runs a shell command).` + ); + } + if (t.expect !== undefined) { + throw new Error( + `tasks[${index}] (id="${id}") is kind="pause" and must not set expect (only kind="oracle" matches output).` + ); + } + if (t.allowNonZeroExit !== undefined) { + throw new Error( + `tasks[${index}] (id="${id}") is kind="pause" and must not set allowNonZeroExit (only kind="oracle" runs a command).` + ); + } + let subtask_prompt = ''; + if (t.subtask_prompt !== undefined) { + if (typeof t.subtask_prompt !== 'string') { + throw new Error( + `tasks[${index}].subtask_prompt must be a string when set on a pause task.` + ); + } + subtask_prompt = t.subtask_prompt; + } + return { + id, + depends_on: dedupedDepends, + complexity: NON_LLM_SYNTHETIC_COMPLEXITY, + subtask_prompt, + kind: 'pause', + }; + } + + if (kind === 'oracle') { + if (t.complexity !== undefined) { + throw new Error( + `tasks[${index}] (id="${id}") is kind="oracle" and must not set complexity (no LLM is invoked).` + ); + } + if (t.subtask_prompt !== undefined) { + throw new Error( + `tasks[${index}] (id="${id}") is kind="oracle" and must not set subtask_prompt (oracle tasks run a shell command, not an LLM prompt).` + ); + } + if (t.model !== undefined) { + throw new Error( + `tasks[${index}] (id="${id}") is kind="oracle" and must not set model (no model is invoked).` + ); + } + if (typeof t.command !== 'string' || t.command.trim() === '') { + throw new 
Error( + `tasks[${index}] (id="${id}") is kind="oracle" and requires a non-empty string command.` + ); + } + let expect: string = DEFAULT_ORACLE_EXPECT; + if (t.expect !== undefined) { + if (typeof t.expect !== 'string') { + throw new Error( + `tasks[${index}].expect must be a string when set on an oracle task.` + ); + } + try { + new RegExp(t.expect); + } catch (err) { + const reason = err instanceof Error ? err.message : String(err); + throw new Error( + `tasks[${index}].expect must be a valid regex (got ${JSON.stringify( + t.expect + )}: ${reason}).` + ); + } + expect = t.expect; + } + let allowNonZeroExit = false; + if (t.allowNonZeroExit !== undefined) { + if (typeof t.allowNonZeroExit !== 'boolean') { + throw new Error( + `tasks[${index}].allowNonZeroExit must be a boolean when set on an oracle task.` + ); + } + allowNonZeroExit = t.allowNonZeroExit; + } + return { + id, + depends_on: dedupedDepends, + complexity: NON_LLM_SYNTHETIC_COMPLEXITY, + subtask_prompt: '', + kind: 'oracle', + command: t.command, + expect, + allowNonZeroExit, + }; + } + + if (t.command !== undefined) { + throw new Error( + `tasks[${index}] (id="${id}") is kind="task" and must not set command (only kind="oracle" runs a shell command).` + ); + } + if (t.expect !== undefined) { + throw new Error( + `tasks[${index}] (id="${id}") is kind="task" and must not set expect (only kind="oracle" matches output).` + ); + } + if (t.allowNonZeroExit !== undefined) { + throw new Error( + `tasks[${index}] (id="${id}") is kind="task" and must not set allowNonZeroExit (only kind="oracle" runs a command).` + ); + } + const complexity = t.complexity; + if ( + typeof complexity !== 'string' || + !COMPLEXITY_VALUES.has(complexity as Complexity) + ) { + throw new Error( + `tasks[${index}].complexity must be one of HIGH | MED | LOW.` + ); + } + const subtask_prompt = t.subtask_prompt; + if (typeof subtask_prompt !== 'string' || subtask_prompt.trim() === '') { + throw new Error( + 
`tasks[${index}].subtask_prompt must be a non-empty string.` + ); + } + return { + id, + depends_on: dedupedDepends, + complexity: complexity as Complexity, + subtask_prompt, + kind: 'task', + }; +} + +function resolveTaskKind(raw: unknown, index: number): TaskKind { + if (raw === undefined) return 'task'; + if (typeof raw === 'string' && TASK_KIND_VALUES.has(raw as TaskKind)) { + return raw as TaskKind; + } + throw new Error( + `tasks[${index}].kind must be one of 'task' | 'pause' | 'oracle' when set (got ${JSON.stringify( + raw + )}).` + ); +} + +/** Throws on the first cycle found. Uses iterative DFS with a recursion stack. */ +function detectCycle(tasks: RawTask[]): void { + const adj = new Map(); + for (const t of tasks) adj.set(t.id, []); + for (const t of tasks) { + for (const dep of t.depends_on) { + adj.get(dep)!.push(t.id); + } + } + + const WHITE = 0; + const GRAY = 1; + const BLACK = 2; + const color = new Map(); + for (const t of tasks) color.set(t.id, WHITE); + + for (const start of tasks) { + if (color.get(start.id) !== WHITE) continue; + const stack: Array<{ id: string; childIdx: number; pathIdx: number }> = [ + { id: start.id, childIdx: 0, pathIdx: 0 }, + ]; + const path: string[] = []; + color.set(start.id, GRAY); + path.push(start.id); + + while (stack.length > 0) { + const top = stack[stack.length - 1]; + const children = adj.get(top.id)!; + if (top.childIdx >= children.length) { + color.set(top.id, BLACK); + path.pop(); + stack.pop(); + continue; + } + const child = children[top.childIdx++]; + const cColor = color.get(child) ?? WHITE; + if (cColor === GRAY) { + const cycleStart = path.indexOf(child); + const cycle = [...path.slice(cycleStart), child].join(' -> '); + throw new Error(`Cycle detected: ${cycle}`); + } + if (cColor === WHITE) { + color.set(child, GRAY); + path.push(child); + stack.push({ id: child, childIdx: 0, pathIdx: path.length - 1 }); + } + } + } +} + +/** + * Kahn's algorithm — return tasks grouped into ranks. 
Tasks within a rank + * have no inter-dependencies and can run in parallel. + */ +export function computeRanks(dag: DAG): RawTask[][] { + const remaining = new Map(); + const byId = new Map(); + for (const t of dag.tasks) { + remaining.set(t.id, t.depends_on.length); + byId.set(t.id, t); + } + const dependents = new Map(); + for (const t of dag.tasks) dependents.set(t.id, []); + for (const t of dag.tasks) { + for (const dep of t.depends_on) { + dependents.get(dep)!.push(t.id); + } + } + + const ranks: RawTask[][] = []; + let frontier = dag.tasks.filter((t) => remaining.get(t.id) === 0); + while (frontier.length > 0) { + ranks.push(frontier); + const next: RawTask[] = []; + for (const t of frontier) { + for (const child of dependents.get(t.id)!) { + const r = remaining.get(child)! - 1; + remaining.set(child, r); + if (r === 0) next.push(byId.get(child)!); + } + } + frontier = next; + } + + const placed = ranks.reduce((n, r) => n + r.length, 0); + if (placed !== dag.tasks.length) { + throw new Error('Topological sort failed — DAG contains a cycle.'); + } + return ranks; +} + +export function validateModelMap( + raw: unknown, + label = 'model map' +): ModelMapOverride { + if (!raw || typeof raw !== 'object' || Array.isArray(raw)) { + throw new Error(`${label} must be a JSON object.`); + } + const obj = raw as Record; + const models: ModelMapOverride = {}; + for (const [key, value] of Object.entries(obj)) { + if (!COMPLEXITY_VALUES.has(key as Complexity)) { + throw new Error(`${label} contains unknown complexity key: ${key}`); + } + if (typeof value !== 'string' || value.trim() === '') { + throw new Error(`${label}.${key} must be a non-empty string.`); + } + models[key as Complexity] = value.trim(); + } + return models; +} + +export function createModelResolver( + overrides: ModelMapOverride = {} +): (c: Complexity) => string { + const models: ModelMap = { ...DEFAULT_MODEL_MAP, ...overrides }; + return (c: Complexity): string => { + if (!COMPLEXITY_KEYS.includes(c)) { 
+ throw new Error(`Unknown complexity: ${c}`); + } + return models[c]; + }; +} diff --git a/packages/proof/src/dry_check_cmds.ts b/packages/proof/src/dry_check_cmds.ts new file mode 100644 index 00000000..49fdedbc --- /dev/null +++ b/packages/proof/src/dry_check_cmds.ts @@ -0,0 +1,512 @@ +/** + * --dry-check-cmds mode: walks every `subtask_prompt`, regex-extracts shell + * commands, validates them against the workspace, and prints a structured + * report. No `CURSOR_API_KEY` required. + * + * Validation focuses on patterns that have caused real DAG-runtime failures: + * + * - `pnpm --filter ...` where `` is not a known workspace + * package → DIRTY (the filter resolves to nothing and the command no-ops + * or errors). + * - `pnpm exec flatbread ` without `--filter ` → DIRTY + * (`loadConfig` does not search up; `flatbread.config.js` only exists in + * example dirs, so a top-level invocation never finds it). This is the + * historical regression the runner is asked to detect. + * - `pnpm codegen` (top-level) without an explicit `--filter` → DIRTY + * (`codegen` is a `--watch` script in `examples/nextjs/package.json`; + * it would hang the DAG node). + * - `pnpm --filter codegen` where `pkg` defines `codegen` as a + * `--watch` script → DIRTY (same hang risk). + * + * Backticked references that appear in a *negation context* ("Do NOT use", + * "instead of", "would hang", "avoid") are tagged `INFO` and excluded from + * the dirty count — they are documentation of anti-patterns the prompt + * already steers the agent away from. + */ + +import { readdir, readFile } from 'node:fs/promises'; +import { existsSync } from 'node:fs'; +import { dirname, join, resolve } from 'node:path'; + +import type { DAG, RawTask } from './dag.js'; + +export type Verdict = 'OK' | 'DIRTY' | 'WARN' | 'INFO'; + +export interface CommandFinding { + taskId: string; + /** Raw command extracted from the prompt (backtick contents, trimmed). */ + command: string; + /** First non-flag token, e.g. 
`pnpm`, `flatbread`. */ + verb: string; + verdict: Verdict; + reason: string; + /** Was the command preceded by a "do NOT" / "instead of" cue in the prompt? */ + negated: boolean; +} + +export interface DryCheckReport { + title: string; + totalTasks: number; + totalCommands: number; + ok: number; + warn: number; + dirty: number; + info: number; + findings: CommandFinding[]; + /** True when at least one finding is `DIRTY`. Drives exit code. */ + isDirty: boolean; +} + +interface WorkspaceFacts { + /** All workspace package `name` fields (`@flatbread/core`, `nextjs`, …). */ + packageNames: Set; + /** Map of workspace package name → package.json `scripts` table. */ + scriptsByPackage: Map>; + /** Directory names under `packages/` and `examples/` (for `--filter` shorthand). */ + packageDirs: Set; + /** Map of dir basename → package name. Used to interpret `--filter `. */ + packageNameByDir: Map; + /** Absolute path to workspace root (the `--cwd` we resolve against). */ + cwd: string; +} + +/** Verbs we consider "shell commands" worth validating. */ +const SHELL_VERBS = new Set([ + 'pnpm', + 'npm', + 'yarn', + 'npx', + 'node', + 'tsx', + 'flatbread', + 'git', + 'cd', + 'mkdir', + 'mv', + 'cp', + 'rm', + 'cat', + 'echo', + 'bash', + 'sh', + 'cursor-agent', + 'agent-browser', + 'set', + 'source', + 'export', + 'kill', + 'open', + 'curl', + 'wget', + 'ls', +]); + +/** + * Cues in the preceding 80 chars that flip a finding from DIRTY/WARN to INFO. + * + * Three families: + * + * - Negation: prompt explicitly tells the agent NOT to run the command + * (`Do NOT use \`pnpm codegen\`…`). + * - Citation: command is quoted as a reference to existing config / docs + * rather than an instruction (`binds port 5057 via \`flatbread start …\` + * per \`examples/nextjs/package.json:8\``). + * - Backgrounding: command is explicitly intended to be spawned in the + * background and torn down later (`Start the example dev server in the + * background: \`pnpm --filter nextjs dev\``). 
+ */ +const NEUTRALIZING_CUES = [ + // Negation + 'do not use', + "don't use", + 'do not run', + "don't run", + 'instead of', + 'would hang', + 'avoid', + 'never use', + "won't use", + 'rather than', + 'would block', + // Citation / documentation reference + ' via ', + ' per ', + ' from ', + ' see ', + 'defined as', + 'defined in', + 'package.json:', + 'binds port', + // Backgrounding (legitimate long-running spawn-then-teardown pattern) + 'in the background', + 'background:', + 'background.', + ' background ', + ' nohup ', +]; + +/** Single-backtick-delimited tokens. We deliberately ignore triple-backtick fences (none in current prompts) and HTML codeblocks. */ +const BACKTICK_RE = /`([^`\n]+)`/g; + +export async function loadWorkspaceFacts(cwd: string): Promise { + const workspaceRoot = resolveWorkspaceRoot(cwd); + const facts: WorkspaceFacts = { + packageNames: new Set(), + scriptsByPackage: new Map(), + packageDirs: new Set(), + packageNameByDir: new Map(), + cwd: workspaceRoot, + }; + + for (const parent of ['packages', 'examples']) { + const parentAbs = join(workspaceRoot, parent); + if (!existsSync(parentAbs)) continue; + const entries = await readdir(parentAbs, { withFileTypes: true }); + for (const entry of entries) { + if (!entry.isDirectory()) continue; + const pkgJsonPath = join(parentAbs, entry.name, 'package.json'); + if (!existsSync(pkgJsonPath)) continue; + try { + const raw = JSON.parse(await readFile(pkgJsonPath, 'utf8')) as { + name?: unknown; + scripts?: unknown; + }; + if (typeof raw.name !== 'string' || raw.name.trim() === '') continue; + const name = raw.name.trim(); + facts.packageNames.add(name); + facts.packageDirs.add(entry.name); + facts.packageNameByDir.set(entry.name, name); + const scripts: Record = {}; + if (raw.scripts && typeof raw.scripts === 'object') { + for (const [k, v] of Object.entries( + raw.scripts as Record + )) { + if (typeof v === 'string') scripts[k] = v; + } + } + facts.scriptsByPackage.set(name, scripts); + } catch { 
+ // ignore malformed package.json — not our job to lint here + } + } + } + + return facts; +} + +function resolveWorkspaceRoot(cwd: string): string { + let current = resolve(cwd); + while (true) { + if (existsSync(join(current, 'pnpm-workspace.yaml'))) return current; + const parent = dirname(current); + if (parent === current) return resolve(cwd); + current = parent; + } +} + +export function runDryCheck(dag: DAG, facts: WorkspaceFacts): DryCheckReport { + const findings: CommandFinding[] = []; + for (const task of dag.tasks) { + if (!task.subtask_prompt) continue; + for (const extracted of extractCommands(task.subtask_prompt)) { + findings.push(validateCommand(task, extracted, facts)); + } + } + + let ok = 0; + let warn = 0; + let dirty = 0; + let info = 0; + for (const f of findings) { + if (f.verdict === 'OK') ok++; + else if (f.verdict === 'WARN') warn++; + else if (f.verdict === 'DIRTY') dirty++; + else info++; + } + + return { + title: dag.title, + totalTasks: dag.tasks.length, + totalCommands: findings.length, + ok, + warn, + dirty, + info, + findings, + isDirty: dirty > 0, + }; +} + +interface ExtractedCommand { + command: string; + verb: string; + /** Up to 80 chars before the opening backtick, lowercased — used to detect negation. 
*/ + precedingContext: string; +} + +function extractCommands(prompt: string): ExtractedCommand[] { + const out: ExtractedCommand[] = []; + BACKTICK_RE.lastIndex = 0; + let m: RegExpExecArray | null; + while ((m = BACKTICK_RE.exec(prompt))) { + const inner = m[1].trim(); + if (inner === '') continue; + const verb = inner.split(/\s+/, 1)[0]; + if (!SHELL_VERBS.has(verb)) continue; + const ctxStart = Math.max(0, m.index - 80); + const precedingContext = prompt.slice(ctxStart, m.index).toLowerCase(); + out.push({ command: inner, verb, precedingContext }); + } + return out; +} + +function isNeutralized(precedingContext: string): boolean { + return NEUTRALIZING_CUES.some((cue) => precedingContext.includes(cue)); +} + +function validateCommand( + task: RawTask, + extracted: ExtractedCommand, + facts: WorkspaceFacts +): CommandFinding { + const negated = isNeutralized(extracted.precedingContext); + const base: Omit = { + taskId: task.id, + command: extracted.command, + verb: extracted.verb, + negated, + }; + + let raw: CommandFinding; + if (extracted.verb === 'pnpm') { + raw = validatePnpmCommand(task, extracted.command, facts, base); + } else if (extracted.verb === 'flatbread') { + raw = { + ...base, + verdict: 'WARN', + reason: + 'Bare `flatbread …` invocation — unless run with `pnpm --filter exec` from a dir containing `flatbread.config.js`, `loadConfig` will not find a config.', + }; + } else { + raw = { + ...base, + verdict: 'OK', + reason: + 'No workspace-specific check; verb not in pnpm/flatbread risk family.', + }; + } + + // Downgrade DIRTY / WARN to INFO when surrounding prompt text already + // contains a negation / citation / backgrounding cue that handles the risk. + // Genuinely OK findings are passed through unchanged so they stay visible. 
+ if (negated && (raw.verdict === 'DIRTY' || raw.verdict === 'WARN')) { + return { + ...raw, + verdict: 'INFO', + reason: `${raw.reason} — neutralized by surrounding prompt context (negation, citation, or background-spawn cue).`, + }; + } + return raw; +} + +function validatePnpmCommand( + task: RawTask, + command: string, + facts: WorkspaceFacts, + base: Omit +): CommandFinding { + const tokens = command.split(/\s+/); + // tokens[0] === 'pnpm' + let i = 1; + + // Short-circuit: `pnpm --silent`, `pnpm install …` etc. — strip leading flags + // before the first sub-command but preserve `--filter ` and `--dir `. + let filterPkg: string | null = null; + let filterDir: string | null = null; + let dirArg: string | null = null; + while (i < tokens.length && tokens[i].startsWith('--')) { + const flag = tokens[i]; + if (flag === '--filter' || flag === '-F') { + const arg = tokens[i + 1]; + if (arg) { + if (facts.packageDirs.has(arg)) { + filterDir = arg; + filterPkg = facts.packageNameByDir.get(arg) ?? null; + } else { + filterPkg = arg; + } + i += 2; + continue; + } + } + if (flag === '--dir' || flag === '-C') { + dirArg = tokens[i + 1] ?? null; + i += 2; + continue; + } + if (flag === '--silent' || flag === '--prefer-offline') { + i += 1; + continue; + } + // Unknown leading flag — treat the rest as opaque, still capture sub-cmd. + i += 1; + } + + const sub = tokens[i]; + const subArgs = tokens.slice(i + 1); + + // Validate filter target if provided. + if (filterPkg !== null) { + if ( + !facts.packageNames.has(filterPkg) && + !facts.packageDirs.has(filterDir ?? filterPkg) + ) { + return { + ...base, + verdict: 'DIRTY', + reason: `pnpm --filter target "${filterPkg}" is not a workspace package or dir under packages/ or examples/.`, + }; + } + } + + if (sub === 'exec' && subArgs[0] === 'flatbread') { + const flatbreadSub = subArgs[1] ?? 
''; + if (filterPkg === null && dirArg === null) { + return { + ...base, + verdict: 'DIRTY', + reason: + `\`pnpm exec flatbread ${flatbreadSub}\` runs from the workspace root, where no flatbread.config.js exists; ` + + "flatbread's loadConfig does not search up. Use `pnpm --filter exec flatbread …` from an example dir instead.", + }; + } + return { + ...base, + verdict: 'OK', + reason: `pnpm --filter ${ + filterPkg ?? dirArg + } exec flatbread ${flatbreadSub}: filter targets a workspace package containing a flatbread.config.js.`, + }; + } + + // `pnpm codegen` / `pnpm dev` / `pnpm