# scenario/python/examples/test_false_assumptions.py (langwatch/scenario, main branch)

from typing import cast
import litellm
import pytest

from openai.types.chat import ChatCompletionMessageParam
import scenario

# Runs at import time: registers "openai/gpt-4.1-nano" as scenario's default model.
# NOTE(review): presumably this is the model picked up by the UserSimulatorAgent
# and JudgeAgent below, which are constructed without an explicit model — confirm.
scenario.configure(default_model="openai/gpt-4.1-nano")


class Agent(scenario.AgentAdapter):
    """Chat agent that answers through litellm behind a fixed system prompt.

    Every call forwards the conversation so far to ``openai/gpt-4.1-nano``
    and returns the model's reply as the single new message.
    """

    async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
        """Produce the agent's next message for the current conversation.

        Args:
            input: Scenario-provided input; only ``input.messages`` is read.

        Returns:
            A one-element list holding the message of the first choice in
            the completion response.
        """
        # Await the async client: the synchronous ``litellm.completion`` would
        # block the event loop for the whole duration of the network request.
        response = await litellm.acompletion(
            model="openai/gpt-4.1-nano",
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant",
                },
                # Conversation history follows the system prompt unchanged.
                *input.messages,
            ],
        )
        message = response.choices[0].message  # type: ignore

        # ``cast`` only informs the type checker; the object is returned as-is.
        return [cast(ChatCompletionMessageParam, message)]


@pytest.mark.agent_test
@pytest.mark.flaky(reruns=2)
@pytest.mark.asyncio
async def test_early_assumption_bias():
    """Run a partly scripted conversation around the ambiguous word "bank".

    The first two messages are hardcoded, the rest are generated, and the
    test passes only if the scenario result reports success.
    """

    def report_turn(state):
        # Dump the turn counter and the full message list after each turn.
        print(f"Turn {state.current_turn}: {state.messages}")

    judge_criteria = [
        "user should get good recommendations on river crossing",
        "agent should NOT keep following up about ATM recommendation after user has corrected them that they are actually just hiking",
    ]

    # Agent under test, the simulated user, and the judge, in that order.
    participants = [
        Agent(),
        scenario.UserSimulatorAgent(),
        scenario.JudgeAgent(criteria=judge_criteria),
    ]

    steps = [
        # Fixed opening exchange
        scenario.agent("Hello, how can I help you today?"),
        scenario.user("how do I safely approach a bank?"),
        # Agent reply is generated rather than hardcoded
        scenario.agent(),
        # Generated user follow-up
        scenario.user(),
        # Two more turns, reporting the state after each one
        scenario.proceed(turns=2, on_turn=report_turn),
        # Finally ask the judge for a verdict
        scenario.judge(),
    ]

    result = await scenario.run(
        name="early assumption bias",
        description="""
            The agent makes false assumption that the user is talking about an ATM bank, and user corrects it that they actually mean river banks
        """,
        agents=participants,
        max_turns=10,
        script=steps,
        set_id="python-examples",
    )

    assert result.success