cryptopoly · cryptopoly · Jun 17, 2026 · May 8, 2026 · May 8, 2026 · May 22, 2026
diff --git a/.github/workflows/deploy-docs.yml b/.github/workflows/deploy-docs.yml
@@ -40,12 +40,12 @@ jobs:
     timeout-minutes: 10
     steps:
       - name: Checkout main repo
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6
         with:
           persist-credentials: false
 
       - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v6
         with:
           python-version: '3.11'
           cache: pip
@@ -59,7 +59,7 @@ jobs:
         run: mkdocs build --strict
 
       - name: Checkout marketing site repo
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6
         with:
           repository: cryptopoly/ChaosEngineAI-Site
           ssh-key: ${{ secrets.SITE_REPO_DEPLOY_KEY }}

diff --git a/.github/workflows/perf-gate.yml b/.github/workflows/perf-gate.yml
@@ -79,7 +79,7 @@ jobs:
 
       - name: Upload baseline JSON
         if: always()
-        uses: actions/upload-artifact@v5
+        uses: actions/upload-artifact@v7
         with:
           name: perf-baseline
           path: /tmp/perf-baseline.json

diff --git a/CLAUDE.md b/CLAUDE.md
diff --git a/backend_service/agent.py b/backend_service/agent.py
@@ -485,7 +485,11 @@ def run_agent_loop_streaming(
             # consumed so the assistant bubble doesn't show raw call
             # JSON next to the rendered ToolCallCard (FU-040).
             text = _strip_tool_call_xml(result.text)
-            chunk_size = 4
+            # The final answer is already fully computed (tool-calling turns
+            # are non-streaming), so the old 4-char dribble just added fake
+            # latency + yields. Emit in larger chunks; the SSE layer coalesces
+            # these further and the user sees the answer near-instantly.
+            chunk_size = 48
             for i in range(0, len(text), chunk_size):
                 yield {"token": text[i:i + chunk_size]}
 

diff --git a/backend_service/catalog/text_models.py b/backend_service/catalog/text_models.py
diff --git a/backend_service/inference/_constants.py b/backend_service/inference/_constants.py
@@ -15,4 +15,15 @@
 # especially on a first-time pull from Hugging Face. Allow a generous ceiling.
 MLX_LOAD_TIMEOUT_SECONDS = 1800.0
 DEFAULT_LLAMA_TIMEOUT_SECONDS = 120.0
-CAPABILITY_CACHE_TTL_SECONDS = 10.0
+# Native-backend capabilities (mlx/llama-server/vLLM/accelerator presence)
+# only change when the user installs something — and every install path
+# (pip / system pkg / cuda-torch / convert / the /api/setup/refresh-
+# capabilities endpoint) calls refresh_capabilities(force=True), which
+# invalidates this cache immediately. So the TTL only governs ambient
+# staleness, not correctness. The old 10 s value was shorter than a single
+# model load+generate (40-70 s), so load_model's refresh_capabilities()
+# re-probed on *every* load — a blocking 17-31 s mlx_lm+mlx+mlx_vlm import
+# subprocess each time (the creep behind the FU-068 probe-timeout bumps).
+# 300 s comfortably spans back-to-back loads in a session while staying
+# fresh enough for the capability UI; installs force-refresh regardless.
+CAPABILITY_CACHE_TTL_SECONDS = 300.0
diff --git a/backend_service/inference/binaries.py b/backend_service/inference/binaries.py
@@ -33,6 +33,17 @@ def _json_subprocess(
             check=False,
             capture_output=True,
             timeout=timeout,
+            # Own session/process group: these short-lived JSON probes
+            # (mlx_worker probe, GGUF metadata read) must NOT be collateral
+            # of ``app._watch_parent_and_exit``'s killpg(SIGTERM) when the
+            # backend's parent dies. Without this, a non-Tauri launch (e.g.
+            # a bare ``python -m backend_service.app`` whose launch shell
+            # exits) reparents the app, the watchdog fires, and the probe —
+            # sharing the group — dies with "probe exited with code -15"
+            # mid-run. The probe is a few-second transient, so escaping the
+            # parent-death cleanup leaks nothing (the cleanup exists for the
+            # long-lived llama-server children, which are spawned elsewhere).
+            start_new_session=True,
         )
     except (OSError, subprocess.TimeoutExpired) as exc:
         return (-1, None, str(exc))

diff --git a/backend_service/inference/capabilities.py b/backend_service/inference/capabilities.py
@@ -126,12 +126,17 @@ def _probe_native_backends() -> BackendCapabilities:
 
     code, payload, message = _json_subprocess(
         [python_executable, "-m", "backend_service.mlx_worker", "probe"],
-        # FU-068: cold ``mlx_lm + mlx + mlx_vlm`` import has crept to
-        # ~12.4 s on M4 Max / Python 3.11 (measured 2026-05-25 v0.9.3),
-        # blowing the original 12.0 s ceiling and causing intermittent
-        # E2E Phase 1 fails on a freshly-booted backend. Bump to 20 s
-        # for ~60% headroom over today's cold-boot envelope.
-        timeout=20.0,
+        # FU-068: cold ``mlx_lm + mlx + mlx_vlm`` import keeps creeping:
+        # 12.4 s on 2026-05-25 (v0.9.3; ceiling 12.0 s → 20 s), then 17.5 s solo
+        # on M4 Max / Python 3.11 (2026-06-02). Under a sustained E2E run
+        # (whole suite ~3x slower from concurrent model loads + thermal
+        # throttle) the probe is re-issued per MLX cell and measured
+        # ~31 s, blowing both the 20 s and 30 s ceilings (different cell
+        # each time). 45 s clears the ~31 s loaded peak with headroom and
+        # is still bounded enough to surface a genuinely wedged worker.
+        # Follow-up: cache the capability probe so a busy machine doesn't
+        # re-run it per model load (the real inefficiency behind the creep).
+        timeout=45.0,
     )
 
     if payload is None:

diff --git a/backend_service/inference/llama_cpp_engine.py b/backend_service/inference/llama_cpp_engine.py
@@ -92,6 +92,17 @@
     "frequency_penalty",
     "presence_penalty",
     "stop",
+    # Modern anti-repetition / quality samplers llama-server supports
+    # natively. Forward-only: builds that don't recognise them ignore the
+    # field, so old binaries are unaffected. DRY beats plain repeat_penalty
+    # at killing verbatim loops; XTC adds creative variety; top-n-sigma is
+    # a temperature-stable truncator.
+    "dry_multiplier",
+    "dry_base",
+    "dry_allowed_length",
+    "xtc_probability",
+    "xtc_threshold",
+    "top_n_sigma",
     # Phase 3.3: per-token confidence info. llama-server returns
     # top-k alternatives with their logprobs in each delta when
     # `logprobs: true` + `top_logprobs: N` are set.
@@ -421,6 +432,7 @@ def _build_command(
         fit_enabled: bool,
         is_fallback: bool,
         speculative_decoding: bool = False,
+        fused_attention: bool = False,
         canonical_repo: str | None = None,
         model_ref: str = "",
     ) -> tuple[list[str], str | None, bool, str | None]:
@@ -449,6 +461,19 @@ def _build_command(
             str(max(256, context_tokens)),
             "--jinja",
         ]
+        # Reuse the single slot's KV cache across chat turns: a growing
+        # conversation re-prefills only the new suffix instead of the whole
+        # history (turn-2+ TTFT drops sharply on long chats). Forward-gated
+        # on binary support so older llama-server builds are unaffected.
+        if _llama_server_supports(binary, "--cache-reuse"):
+            command.extend(["--cache-reuse", "256"])
+        # Honour the user's fused-attention toggle. It was plumbed into
+        # load_model + stored on LoadedModelInfo but never emitted as a
+        # flag. Flash attention is a large decode + KV-memory win on Metal
+        # and is required by the quantized KV cache types. Opt-in via the
+        # existing flag so a model/quant combo that dislikes it can disable.
+        if fused_attention and _llama_server_supports(binary, "--flash-attn"):
+            command.extend(["--flash-attn", "on"])
         if _llama_server_supports(binary, "--reasoning-format"):
             command.extend(["--reasoning-format", "deepseek"])
         if _llama_server_supports(binary, "--reasoning"):
@@ -660,6 +685,7 @@ def load_model(
                 fit_enabled=fit_enabled,
                 is_fallback=is_fallback,
                 speculative_decoding=speculative_decoding,
+                fused_attention=fused_attention,
                 canonical_repo=canonical_repo,
                 model_ref=model_ref,
             )
@@ -791,6 +817,9 @@ def generate(
             "temperature": temperature,
             "max_tokens": max_tokens,
             "stream": False,
+            # Reuse the slot's cached prompt prefix across turns (pairs with
+            # the server's --cache-reuse) so unchanged history isn't reprocessed.
+            "cache_prompt": True,
         }
         if tools:
             payload["tools"] = tools
@@ -884,6 +913,9 @@ def stream_generate(
             "temperature": temperature,
             "max_tokens": max_tokens,
             "stream": True,
+            # Reuse the slot's cached prompt prefix across turns (pairs with
+            # the server's --cache-reuse) so unchanged history isn't reprocessed.
+            "cache_prompt": True,
         }
         if tools:
             payload["tools"] = tools

diff --git a/backend_service/mlx_worker.py b/backend_service/mlx_worker.py
@@ -59,6 +59,7 @@
 from backend_service import mlx_worker_lifecycle as _lifecycle
 from backend_service import mlx_worker_speculative as _speculative
 from backend_service import mlx_worker_generate as _generate
+from backend_service import mlx_worker_prompt_cache as _prompt_cache
 
 # Phase 1f-4: model + runtime introspection helpers now live in
 # ``backend_service.mlx_worker_diagnostics``. Re-export so existing imports
@@ -127,6 +128,13 @@ def __init__(self) -> None:
         # delimiters via ``reasoning_delimiters_for``. Default
         # (``<think>...</think>``) still applies when ``None``.
         self._loaded_model_ref: str | None = None
+        # Tier 4: persistent single-slot prompt cache for native-strategy chat
+        # so follow-up turns prefill only the new suffix. Managed by
+        # backend_service.mlx_worker_prompt_cache; invalidated on any model
+        # load / unload / profile change.
+        self._persist_cache: Any | None = None
+        self._persist_tokens: list[int] = []
+        self._persist_cache_model_ref: str | None = None
 
     def handle(self, request: dict[str, Any]) -> dict[str, Any] | None:
         op = request.get("op")
@@ -148,12 +156,15 @@ def handle(self, request: dict[str, Any]) -> dict[str, Any] | None:
         raise ValueError(f"Unsupported worker operation: {op}")
 
     def load_model(self, request: dict[str, Any]) -> dict[str, Any]:
+        _prompt_cache.invalidate(self)
         return _lifecycle.load_model(self, request)
 
     def unload_model(self) -> dict[str, Any]:
+        _prompt_cache.invalidate(self)
         return _lifecycle.unload_model(self)
 
     def update_profile(self, request: dict[str, Any]) -> dict[str, Any]:
+        _prompt_cache.invalidate(self)
         return _lifecycle.update_profile(self, request)
 
     def _apply_cache_profile(

diff --git a/backend_service/mlx_worker_generate.py b/backend_service/mlx_worker_generate.py
@@ -34,6 +34,7 @@
 )
 from backend_service.mlx_worker_request import (
     _apply_mlx_seed,
+    _build_mlx_logits_processors,
     _build_mlx_sampler,
     _extract_top_logprobs,
     _format_tools_for_prompt,
@@ -46,6 +47,7 @@
     strip_harmony_boilerplate,
 )
 from backend_service.runaway_guard import RunawayGuard
+from backend_service import mlx_worker_prompt_cache as _prompt_cache
 
 
 if TYPE_CHECKING:
@@ -109,24 +111,32 @@ def generate_standard(state: WorkerState, request: dict[str, Any]) -> dict[str,
         system_prompt=system_prompt,
     )
     sampler = _build_mlx_sampler(request)
-    prompt_cache, runtime_note = state._make_cache()
-    runtime_note = _merge_runtime_notes(runtime_note, prompt_note)
-    runtime_fields = state._runtime_fields(prompt_cache=prompt_cache)
+    acq = _prompt_cache.acquire(state, prompt_text)
+    prompt_cache = acq.cache
+    prompt_feed = acq.prompt_feed
+    managed = acq.managed
+    runtime_note = _merge_runtime_notes(acq.note, prompt_note)
+    runtime_fields = state._runtime_fields(prompt_cache=acq.fields_cache)
     transcript_fallback = _plain_chat_fallback_active(prompt_note)
 
     runaway_guard = RunawayGuard()
     runaway_stopped = False
+    generated_ids: list[int] = []
     try:
         text_parts: list[str] = []
         last_response = None
         for response in stream_generate(
             state.model,
             state.tokenizer,
-            prompt_text,
+            prompt_feed,
                 max_tokens=int(request.get("maxTokens") or 256),
                 sampler=sampler,
+                logits_processors=_build_mlx_logits_processors(request),
                 prompt_cache=prompt_cache,
         ):
+            _tok = getattr(response, "token", None)
+            if isinstance(_tok, int):
+                generated_ids.append(_tok)
             if response.text:
                 text_parts.append(response.text)
                 try:
@@ -135,8 +145,20 @@ def generate_standard(state: WorkerState, request: dict[str, Any]) -> dict[str,
                     runaway_stopped = True
                     break
             last_response = response
+        if managed:
+            _prompt_cache.commit(
+                state,
+                cache=prompt_cache,
+                commit_tokens=acq.commit_tokens,
+                generated_ids=generated_ids,
+                model_ref=state._loaded_model_ref,
+            )
     except (ValueError, RuntimeError, TypeError, AttributeError) as exc:
-        _should_retry = (
+        was_managed = managed
+        if managed:
+            _prompt_cache.invalidate(state)
+            managed = False
+        _should_retry = was_managed or (
             prompt_cache is not None
             and _should_retry_cache_failure(exc)
         )
@@ -319,10 +341,13 @@ def stream_generate(state: WorkerState, request: dict[str, Any]) -> None:
         system_prompt=system_prompt,
     )
     sampler = _build_mlx_sampler(request)
-    prompt_cache, runtime_note = state._make_cache()
-    runtime_note = _merge_runtime_notes(runtime_note, prompt_note)
+    acq = _prompt_cache.acquire(state, prompt_text)
+    prompt_cache = acq.cache
+    prompt_feed = acq.prompt_feed
+    managed = acq.managed
+    runtime_note = _merge_runtime_notes(acq.note, prompt_note)
     runtime_note = _merge_runtime_notes(runtime_note, speculative_stream_fallback_note)
-    runtime_fields = state._runtime_fields(prompt_cache=prompt_cache)
+    runtime_fields = state._runtime_fields(prompt_cache=acq.fields_cache)
     transcript_fallback = _plain_chat_fallback_active(prompt_note)
 
     thinking_mode = request.get("thinkingMode") or "off"
@@ -336,6 +361,7 @@ def stream_generate(state: WorkerState, request: dict[str, Any]) -> None:
     transcript_trimmed = False
     runaway_guard = RunawayGuard()
     runaway_stopped = False
+    generated_ids: list[int] = []
     # Phase 3.3 follow-up: when the request opted into logprobs,
     # extract top-k per token via the helper and forward inline
     # with each text chunk.
@@ -346,11 +372,15 @@ def stream_generate(state: WorkerState, request: dict[str, Any]) -> None:
         for response in mlx_stream_generate(
             state.model,
             state.tokenizer,
-            prompt_text,
+            prompt_feed,
             max_tokens=int(request.get("maxTokens") or 256),
             sampler=sampler,
+            logits_processors=_build_mlx_logits_processors(request),
             prompt_cache=prompt_cache,
         ):
+            _tok = getattr(response, "token", None)
+            if isinstance(_tok, int):
+                generated_ids.append(_tok)
             if response.text:
                 # Check for runaway loops before emitting
                 try:
@@ -392,8 +422,20 @@ def stream_generate(state: WorkerState, request: dict[str, Any]) -> None:
             transcript_trimmed = transcript_trimmed or transcript_filter.stopped
         if visible_text:
             _emit({"ok": True, "chunk": {"text": visible_text}})
+        if managed:
+            _prompt_cache.commit(
+                state,
+                cache=prompt_cache,
+                commit_tokens=acq.commit_tokens,
+                generated_ids=generated_ids,
+                model_ref=state._loaded_model_ref,
+            )
     except (ValueError, RuntimeError, TypeError, AttributeError) as exc:
-        _should_retry = (
+        was_managed = managed
+        if managed:
+            _prompt_cache.invalidate(state)
+            managed = False
+        _should_retry = was_managed or (
             prompt_cache is not None
             and _should_retry_cache_failure(exc)
         )