Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
3d3eae3
WIP: add vision_agents.testing framework
aliev Feb 19, 2026
01d7a99
docs: add testing framework API documentation
aliev Feb 19, 2026
c143b89
refactor: TestSession → TestEval with simplified assertion API
aliev Feb 19, 2026
5551514
rename user_says → simple_response, agent_responds → judge
aliev Feb 19, 2026
f4736c1
move assertions from TestEval into TestResponse
aliev Feb 19, 2026
d51cfc2
rename agent_calls → function_called, agent_calls_output → function_o…
aliev Feb 19, 2026
52b658d
update API.md for v2 refactoring
aliev Feb 19, 2026
b7651dc
polish API.md formatting
aliev Feb 19, 2026
949039c
replace API.md with user-friendly README.md
aliev Feb 19, 2026
1ac1ba8
apply ruff formatting fixes
aliev Feb 20, 2026
69a9b0b
clean up testing framework: remove dead code, dedup, fix style
aliev Feb 20, 2026
e8e1dcd
remove verbose mechanism and remaining future annotations
aliev Feb 20, 2026
ecec3e3
deduplicate PASS/FAIL parsing in _parse_verdict
aliev Feb 20, 2026
d64962b
refactor _format_events into TestResponse static methods
aliev Feb 20, 2026
7da1605
move _VERDICTS to top of _judge.py, extract magic number into constant
aliev Feb 20, 2026
bbe2872
replace README.md with module docstring in __init__.py
aliev Feb 20, 2026
4d345e5
use public API imports in tests, remove private method call
aliev Feb 20, 2026
c36b71f
apply ruff formatting to _judge.py
aliev Feb 20, 2026
499f902
remove examples/00_example test scaffold
aliev Feb 20, 2026
05c5eb7
use @pytest.mark.integration decorator instead of pytestmark
aliev Feb 20, 2026
d206bf8
add type annotations to _FakeLLM.simple_response
aliev Feb 20, 2026
2803074
fix mock_tools leaving tools partially swapped on KeyError
aliev Feb 20, 2026
90d043a
apply ruff formatting to test_mock_tools.py
aliev Feb 20, 2026
30427c1
fix mypy error in mock_tools restore loop
aliev Feb 20, 2026
02909ca
rename TestEval to TestSession and fix _FakeLLM signature
aliev Feb 22, 2026
2dafe39
apply ruff formatting to test_simple_agent.py
aliev Feb 22, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ uv run mypy

## Python rules

- Never use `from __future__ import annotations`.
- Never write `except Exception as e`. Catch specific exceptions.
- Avoid `getattr`, `hasattr`, `delattr`, `setattr`; prefer normal attribute access.
- Docstrings: Google style, keep them short.
Expand Down
50 changes: 50 additions & 0 deletions agents-core/vision_agents/testing/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
"""Testing framework for Vision-Agents.

Provides text-only testing of agents without requiring audio/video
infrastructure or edge connections.

Usage:

Verify a greeting::

async def test_greeting():
async with TestSession(llm=llm, judge=judge_llm, instructions="Be friendly") as session:
response = await session.simple_response("Hello")
await response.judge(intent="Friendly greeting")
response.no_more_events()

Verify tool calls::

async def test_weather():
async with TestSession(llm=llm, judge=judge_llm, instructions="...") as session:
response = await session.simple_response("Weather in Tokyo?")
response.function_called("get_weather", arguments={"location": "Tokyo"})
await response.judge(intent="Reports weather for Tokyo")
response.no_more_events()

Key exports:
TestSession: async context manager that wraps an LLM for testing.
TestResponse: returned by ``simple_response()`` — carries events and assertions.
mock_tools: context manager to temporarily replace tool implementations.
RunEvent: union of ChatMessageEvent, FunctionCallEvent, FunctionCallOutputEvent.
"""

from vision_agents.testing._events import (
ChatMessageEvent,
FunctionCallEvent,
FunctionCallOutputEvent,
RunEvent,
)
from vision_agents.testing._mock_tools import mock_tools
from vision_agents.testing._run_result import TestResponse
from vision_agents.testing._session import TestSession

# Public names re-exported as the testing API; keep in sync with the
# imports above.
__all__ = [
    "TestSession",
    "TestResponse",
    "ChatMessageEvent",
    "FunctionCallEvent",
    "FunctionCallOutputEvent",
    "RunEvent",
    "mock_tools",
]
44 changes: 44 additions & 0 deletions agents-core/vision_agents/testing/_events.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
"""Event types captured during a test run.

These normalized events represent what happened during a single
conversation turn: messages, function calls, and their outputs.
"""

from dataclasses import dataclass, field
from typing import Any, Literal


@dataclass
class ChatMessageEvent:
    """A chat message captured from a single conversation turn."""

    role: str  # message role string, e.g. "assistant"
    content: str  # message text
    # Fixed discriminator used to tell RunEvent members apart; not settable
    # by callers.
    type: Literal["message"] = field(init=False, default="message")


@dataclass
class FunctionCallEvent:
"""The agent requested a tool/function call."""

name: str
arguments: dict[str, Any]
tool_call_id: str | None = None
type: Literal["function_call"] = field(default="function_call", init=False)


@dataclass
class FunctionCallOutputEvent:
"""The result of a tool/function call."""

name: str
output: Any
is_error: bool = False
tool_call_id: str | None = None
execution_time_ms: float | None = None
type: Literal["function_call_output"] = field(
default="function_call_output", init=False
)


# Discriminated union of every event a single conversation turn can produce;
# narrow on the `.type` field.
RunEvent = ChatMessageEvent | FunctionCallEvent | FunctionCallOutputEvent
96 changes: 96 additions & 0 deletions agents-core/vision_agents/testing/_judge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
"""LLM-based intent evaluation for agent message testing.

Uses a separate LLM instance with a structured PASS/FAIL prompt to
evaluate whether an agent's message fulfils a given intent.
"""

import logging

from vision_agents.core.llm.llm import LLM

logger = logging.getLogger(__name__)

# Verdict keywords mapped to success/failure for _parse_verdict.
_VERDICTS = {"PASS": True, "FAIL": False}
_ERROR_PREVIEW_MAX_LEN = 200  # max chars of LLM response shown in parse errors

# System instructions installed on the judge LLM for the duration of a single
# evaluate_intent() call; the caller's original instructions are restored
# afterwards.
_JUDGE_SYSTEM_PROMPT = (
    "You are a strict test evaluator for conversational AI agents.\n"
    "You will be shown a message produced by an agent and a target intent.\n"
    "Determine whether the message accomplishes the intent.\n\n"
    "Rules:\n"
    "- Be strict: if the message does not clearly fulfil the intent, it fails.\n"
    "- Respond with EXACTLY one line in one of these formats:\n"
    " PASS: <brief reason>\n"
    " FAIL: <brief reason>\n"
    "- Do NOT include any other text before or after the verdict line."
)


async def evaluate_intent(
    llm: LLM,
    message_content: str,
    intent: str,
) -> tuple[bool, str]:
    """Judge whether *message_content* accomplishes *intent*.

    Temporarily installs the judge system prompt on *llm*, requests a
    single ``PASS: …`` / ``FAIL: …`` line, and restores the original
    instructions afterwards.

    Important:
        Pass a judge LLM that is a **separate** instance from the
        agent's LLM, otherwise the judge exchange pollutes the agent's
        conversation history.

    Args:
        llm: LLM instance used as the judge.
        message_content: The agent's message to evaluate.
        intent: Description of what the message should accomplish.

    Returns:
        ``(success, reason)`` tuple.
    """
    # Guard clauses: nothing to judge / nothing to judge against.
    if not message_content:
        return False, "The message is empty."
    if not intent:
        return False, "Intent is required for evaluation."

    judge_prompt = (
        f"Check if the following message fulfils the given intent.\n\n"
        f"Intent:\n{intent}\n\n"
        f"Message:\n{message_content}\n\n"
        f"Respond with EXACTLY one line: PASS: <reason> or FAIL: <reason>"
    )

    # Swap in the judge prompt; the finally block guarantees the agent's
    # original instructions come back even on error.
    saved_instructions = llm._instructions
    llm.set_instructions(_JUDGE_SYSTEM_PROMPT)
    try:
        reply = await llm.simple_response(text=judge_prompt)
        if not reply or not reply.text:
            return False, "LLM returned an empty response."
        return _parse_verdict(reply.text)
    except (OSError, ValueError, RuntimeError) as err:
        logger.exception("Judge evaluation failed")
        return False, f"Judge evaluation error: {err}"
    finally:
        llm.set_instructions(saved_instructions)


def _parse_verdict(text: str) -> tuple[bool, str]:
    """Parse a ``PASS: <reason>`` / ``FAIL: <reason>`` verdict from *text*.

    Scans line by line; the first line whose word before the colon is a
    known verdict (case-insensitive) decides the result.

    Args:
        text: Raw LLM response text.

    Returns:
        ``(success, reason)``. When no verdict line is found, returns a
        failure carrying a truncated preview of *text*.
    """
    for line in text.strip().splitlines():
        # partition() keeps the reason intact even with whitespace around
        # the colon (e.g. "pass : ok"); the previous slice-by-word-length
        # approach left a stray ": " in the reason for such lines.
        head, _, tail = line.strip().partition(":")
        word = head.strip().upper()
        if word in _VERDICTS:
            # Fall back to a generic reason ("Passed."/"Failed.") when the
            # model gave a bare verdict with no explanation.
            return _VERDICTS[word], tail.strip() or f"{word.title()}ed."

    return (
        False,
        f"Could not parse verdict from LLM response: {text[:_ERROR_PREVIEW_MAX_LEN]}",
    )
58 changes: 58 additions & 0 deletions agents-core/vision_agents/testing/_mock_tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
"""Tool mocking for testing.

Temporarily replace tool implementations in a ``FunctionRegistry``
without changing the tool schema visible to the LLM.

Example::

with mock_tools(llm, {"get_weather": lambda location: "sunny, 70F"}):
result = await session.run("What's the weather?")
"""

from collections.abc import Callable, Generator
from contextlib import contextmanager
from typing import Any

from vision_agents.core.llm.llm import LLM


@contextmanager
def mock_tools(
    llm: LLM,
    mocks: dict[str, Callable[..., Any]],
) -> Generator[None, None, None]:
    """Temporarily replace tool implementations for testing.

    The tool schemas (name, description, parameters) remain unchanged —
    only the underlying callable is swapped. This lets you control return
    values or simulate errors while the LLM still "sees" the original tool
    definition.

    Args:
        llm: The LLM instance whose tools to mock.
        mocks: Mapping of tool name → mock callable. The mock receives
            the same arguments as the original tool. If the mock
            *returns* an ``Exception`` instance the framework will raise
            it during tool execution.

    Raises:
        KeyError: If a tool name is not registered on the LLM.
    """
    registry = llm.function_registry

    # Validate every name up front so the registry is never left partially
    # swapped when one of the mocks refers to an unknown tool. A plain
    # membership test (rather than `.get(name) is None`) avoids conflating
    # "absent" with "registered as None".
    for tool_name in mocks:
        if tool_name not in registry._functions:
            raise KeyError(f"Tool '{tool_name}' is not registered on this LLM")

    # Swap in the mocks, remembering each original callable for restore.
    originals: dict[str, Callable[..., Any]] = {}
    for tool_name, mock_fn in mocks.items():
        func_def = registry._functions[tool_name]
        originals[tool_name] = func_def.function
        func_def.function = mock_fn

    try:
        yield
    finally:
        # Restore even if the body raised; iterate `originals` (not `mocks`)
        # so only successfully swapped tools are touched.
        for tool_name, original_fn in originals.items():
            registry._functions[tool_name].function = original_fn
Loading
Loading