Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
3d3eae3
WIP: add vision_agents.testing framework
aliev Feb 19, 2026
01d7a99
docs: add testing framework API documentation
aliev Feb 19, 2026
c143b89
refactor: TestSession → TestEval with simplified assertion API
aliev Feb 19, 2026
5551514
rename user_says → simple_response, agent_responds → judge
aliev Feb 19, 2026
f4736c1
move assertions from TestEval into TestResponse
aliev Feb 19, 2026
d51cfc2
rename agent_calls → function_called, agent_calls_output → function_o…
aliev Feb 19, 2026
52b658d
update API.md for v2 refactoring
aliev Feb 19, 2026
b7651dc
polish API.md formatting
aliev Feb 19, 2026
949039c
replace API.md with user-friendly README.md
aliev Feb 19, 2026
1ac1ba8
apply ruff formatting fixes
aliev Feb 20, 2026
69a9b0b
clean up testing framework: remove dead code, dedup, fix style
aliev Feb 20, 2026
e8e1dcd
remove verbose mechanism and remaining future annotations
aliev Feb 20, 2026
ecec3e3
deduplicate PASS/FAIL parsing in _parse_verdict
aliev Feb 20, 2026
d64962b
refactor _format_events into TestResponse static methods
aliev Feb 20, 2026
7da1605
move _VERDICTS to top of _judge.py, extract magic number into constant
aliev Feb 20, 2026
bbe2872
replace README.md with module docstring in __init__.py
aliev Feb 20, 2026
4d345e5
use public API imports in tests, remove private method call
aliev Feb 20, 2026
c36b71f
apply ruff formatting to _judge.py
aliev Feb 20, 2026
499f902
remove examples/00_example test scaffold
aliev Feb 20, 2026
05c5eb7
use @pytest.mark.integration decorator instead of pytestmark
aliev Feb 20, 2026
d206bf8
add type annotations to _FakeLLM.simple_response
aliev Feb 20, 2026
2803074
fix mock_tools leaving tools partially swapped on KeyError
aliev Feb 20, 2026
90d043a
apply ruff formatting to test_mock_tools.py
aliev Feb 20, 2026
30427c1
fix mypy error in mock_tools restore loop
aliev Feb 20, 2026
02909ca
rename TestEval to TestSession and fix _FakeLLM signature
aliev Feb 22, 2026
2dafe39
apply ruff formatting to test_simple_agent.py
aliev Feb 22, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ uv run mypy

## Python rules

- Never use `from __future__ import annotations`.
- Never write `except Exception as e`. Catch specific exceptions.
- Avoid `getattr`, `hasattr`, `delattr`, `setattr`; prefer normal attribute access.
- Docstrings: Google style, keep them short.
Expand Down
50 changes: 50 additions & 0 deletions agents-core/vision_agents/testing/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
"""Testing framework for Vision-Agents.

Provides text-only testing of agents without requiring audio/video
infrastructure or edge connections.

Usage:

Verify a greeting::

async def test_greeting():
async with TestSession(llm=llm, judge=judge_llm, instructions="Be friendly") as session:
response = await session.simple_response("Hello")
await response.judge(intent="Friendly greeting")
response.no_more_events()

Verify tool calls::

async def test_weather():
async with TestSession(llm=llm, judge=judge_llm, instructions="...") as session:
response = await session.simple_response("Weather in Tokyo?")
response.function_called("get_weather", arguments={"location": "Tokyo"})
await response.judge(intent="Reports weather for Tokyo")
response.no_more_events()

Key exports:
TestSession: async context manager that wraps an LLM for testing.
TestResponse: returned by ``simple_response()`` — carries events and assertions.
mock_tools: context manager to temporarily replace tool implementations.
RunEvent: union of ChatMessageEvent, FunctionCallEvent, FunctionCallOutputEvent.
"""

from vision_agents.testing._events import (
ChatMessageEvent,
FunctionCallEvent,
FunctionCallOutputEvent,
RunEvent,
)
from vision_agents.testing._mock_tools import mock_tools
from vision_agents.testing._run_result import TestResponse
from vision_agents.testing._session import TestSession

# Public names re-exported as the testing API; keep in sync with the
# imports above.
__all__ = [
    "TestSession",
    "TestResponse",
    "ChatMessageEvent",
    "FunctionCallEvent",
    "FunctionCallOutputEvent",
    "RunEvent",
    "mock_tools",
]
44 changes: 44 additions & 0 deletions agents-core/vision_agents/testing/_events.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
"""Event types captured during a test run.

These normalized events represent what happened during a single
conversation turn: messages, function calls, and their outputs.
"""

from dataclasses import dataclass, field
from typing import Any, Literal


@dataclass
class ChatMessageEvent:
    """A chat message captured from a single conversation turn."""

    role: str  # message role string, e.g. "assistant"
    content: str  # message text
    # Fixed discriminator used to tell RunEvent members apart; not settable
    # by callers.
    type: Literal["message"] = field(init=False, default="message")


@dataclass
class FunctionCallEvent:
"""The agent requested a tool/function call."""

name: str
arguments: dict[str, Any]
tool_call_id: str | None = None
type: Literal["function_call"] = field(default="function_call", init=False)


@dataclass
class FunctionCallOutputEvent:
"""The result of a tool/function call."""

name: str
output: Any
is_error: bool = False
tool_call_id: str | None = None
execution_time_ms: float | None = None
type: Literal["function_call_output"] = field(
default="function_call_output", init=False
)


# Discriminated union of every event a single conversation turn can produce;
# narrow on the `.type` field.
RunEvent = ChatMessageEvent | FunctionCallEvent | FunctionCallOutputEvent
96 changes: 96 additions & 0 deletions agents-core/vision_agents/testing/_judge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
"""LLM-based intent evaluation for agent message testing.

Uses a separate LLM instance with a structured PASS/FAIL prompt to
evaluate whether an agent's message fulfils a given intent.
"""

import logging

from vision_agents.core.llm.llm import LLM

logger = logging.getLogger(__name__)

# Verdict keywords mapped to success/failure for _parse_verdict.
_VERDICTS = {"PASS": True, "FAIL": False}
_ERROR_PREVIEW_MAX_LEN = 200  # max chars of LLM response shown in parse errors

# System instructions installed on the judge LLM for the duration of a single
# evaluate_intent() call; the caller's original instructions are restored
# afterwards.
_JUDGE_SYSTEM_PROMPT = (
    "You are a strict test evaluator for conversational AI agents.\n"
    "You will be shown a message produced by an agent and a target intent.\n"
    "Determine whether the message accomplishes the intent.\n\n"
    "Rules:\n"
    "- Be strict: if the message does not clearly fulfil the intent, it fails.\n"
    "- Respond with EXACTLY one line in one of these formats:\n"
    " PASS: <brief reason>\n"
    " FAIL: <brief reason>\n"
    "- Do NOT include any other text before or after the verdict line."
)


async def evaluate_intent(
    llm: LLM,
    message_content: str,
    intent: str,
) -> tuple[bool, str]:
    """Judge whether *message_content* accomplishes *intent*.

    Temporarily installs the judge system prompt on *llm*, requests a
    single ``PASS: …`` / ``FAIL: …`` line, and restores the original
    instructions afterwards.

    Important:
        Pass a judge LLM that is a **separate** instance from the
        agent's LLM, otherwise the judge exchange pollutes the agent's
        conversation history.

    Args:
        llm: LLM instance used as the judge.
        message_content: The agent's message to evaluate.
        intent: Description of what the message should accomplish.

    Returns:
        ``(success, reason)`` tuple.
    """
    # Guard clauses: nothing to judge / nothing to judge against.
    if not message_content:
        return False, "The message is empty."
    if not intent:
        return False, "Intent is required for evaluation."

    judge_prompt = (
        f"Check if the following message fulfils the given intent.\n\n"
        f"Intent:\n{intent}\n\n"
        f"Message:\n{message_content}\n\n"
        f"Respond with EXACTLY one line: PASS: <reason> or FAIL: <reason>"
    )

    # Swap in the judge prompt; the finally block guarantees the agent's
    # original instructions come back even on error.
    saved_instructions = llm._instructions
    llm.set_instructions(_JUDGE_SYSTEM_PROMPT)
    try:
        reply = await llm.simple_response(text=judge_prompt)
        if not reply or not reply.text:
            return False, "LLM returned an empty response."
        return _parse_verdict(reply.text)
    except (OSError, ValueError, RuntimeError) as err:
        logger.exception("Judge evaluation failed")
        return False, f"Judge evaluation error: {err}"
    finally:
        llm.set_instructions(saved_instructions)


def _parse_verdict(text: str) -> tuple[bool, str]:
    """Parse a ``PASS: <reason>`` / ``FAIL: <reason>`` verdict from *text*.

    Scans line by line; the first line whose word before the colon is a
    known verdict (case-insensitive) decides the result.

    Args:
        text: Raw LLM response text.

    Returns:
        ``(success, reason)``. When no verdict line is found, returns a
        failure carrying a truncated preview of *text*.
    """
    for line in text.strip().splitlines():
        # partition() keeps the reason intact even with whitespace around
        # the colon (e.g. "pass : ok"); the previous slice-by-word-length
        # approach left a stray ": " in the reason for such lines.
        head, _, tail = line.strip().partition(":")
        word = head.strip().upper()
        if word in _VERDICTS:
            # Fall back to a generic reason ("Passed."/"Failed.") when the
            # model gave a bare verdict with no explanation.
            return _VERDICTS[word], tail.strip() or f"{word.title()}ed."

    return (
        False,
        f"Could not parse verdict from LLM response: {text[:_ERROR_PREVIEW_MAX_LEN]}",
    )
58 changes: 58 additions & 0 deletions agents-core/vision_agents/testing/_mock_tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
"""Tool mocking for testing.

Temporarily replace tool implementations in a ``FunctionRegistry``
without changing the tool schema visible to the LLM.

Example::

with mock_tools(llm, {"get_weather": lambda location: "sunny, 70F"}):
result = await session.run("What's the weather?")
"""

from collections.abc import Callable, Generator
from contextlib import contextmanager
from typing import Any

from vision_agents.core.llm.llm import LLM


@contextmanager
def mock_tools(
    llm: LLM,
    mocks: dict[str, Callable[..., Any]],
) -> Generator[None, None, None]:
    """Temporarily replace tool implementations for testing.

    The tool schemas (name, description, parameters) remain unchanged —
    only the underlying callable is swapped. This lets you control return
    values or simulate errors while the LLM still "sees" the original tool
    definition.

    Args:
        llm: The LLM instance whose tools to mock.
        mocks: Mapping of tool name → mock callable. The mock receives
            the same arguments as the original tool. If the mock
            *returns* an ``Exception`` instance the framework will raise
            it during tool execution.

    Raises:
        KeyError: If a tool name is not registered on the LLM.
    """
    registry = llm.function_registry

    # Validate every name up front so the registry is never left partially
    # swapped when one of the mocks refers to an unknown tool. A plain
    # membership test (rather than `.get(name) is None`) avoids conflating
    # "absent" with "registered as None".
    for tool_name in mocks:
        if tool_name not in registry._functions:
            raise KeyError(f"Tool '{tool_name}' is not registered on this LLM")

    # Swap in the mocks, remembering each original callable for restore.
    originals: dict[str, Callable[..., Any]] = {}
    for tool_name, mock_fn in mocks.items():
        func_def = registry._functions[tool_name]
        originals[tool_name] = func_def.function
        func_def.function = mock_fn

    try:
        yield
    finally:
        # Restore even if the body raised; iterate `originals` (not `mocks`)
        # so only successfully swapped tools are touched.
        for tool_name, original_fn in originals.items():
            registry._functions[tool_name].function = original_fn
Loading
Loading