Description

Running uv run eval.py stops at the interactive agent-selection prompt; interrupting it with Ctrl+C produces the traceback below:

Select agent index(es) (comma-separated), or -1 to evaluate all:
Traceback (most recent call last):
  File "C:\Users\Jaime\Code\github\railtracks-visualizer\eval.py", line 29, in <module>
    results = evals.evaluate(
        data=agent_data,
    ...<3 lines>...
        payload_callback=upload_agent_evaluation,
    )
  File "C:\Users\Jaime\Code\github\railtracks-visualizer\.venv\Lib\site-packages\railtracks\evaluations\runners\_evaluate.py", line 157, in evaluate
    data_dict, agents = _setup_agent_data(data, agent_selection, agents)
                        ~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Jaime\Code\github\railtracks-visualizer\.venv\Lib\site-packages\railtracks\evaluations\runners\_evaluate.py", line 125, in _setup_agent_data
    agents = _select_agent(
        {agent_name: len(data_dict[agent_name]) for agent_name in data_dict.keys()}
    )
  File "C:\Users\Jaime\Code\github\railtracks-visualizer\.venv\Lib\site-packages\railtracks\evaluations\runners\_evaluate.py", line 37, in _select_agent
    user_input = Prompt.ask(
        f"\n[{COLORS['prompt']}]Select agent index(es)[/{COLORS['prompt']}] (comma-separated), or [{COLORS['highlight']}]-1[/{COLORS['highlight']}] to evaluate all"
    )
  File "C:\Users\Jaime\Code\github\railtracks-visualizer\.venv\Lib\site-packages\rich\prompt.py", line 141, in ask
    return _prompt(default=default, stream=stream)
  File "C:\Users\Jaime\Code\github\railtracks-visualizer\.venv\Lib\site-packages\rich\prompt.py", line 274, in __call__
    value = self.get_input(self.console, prompt, self.password, stream=stream)
  File "C:\Users\Jaime\Code\github\railtracks-visualizer\.venv\Lib\site-packages\rich\prompt.py", line 203, in get_input
    return console.input(prompt, password=password, stream=stream)
           ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Jaime\Code\github\railtracks-visualizer\.venv\Lib\site-packages\rich\console.py", line 2123, in input
    result = input()
KeyboardInterrupt
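
From the traceback, _setup_agent_data falls back to the interactive _select_agent prompt whenever no agents argument reaches it. For reference, a minimal sketch of the same evaluate call with the commented-out agents argument from eval.py restored, which should presumably bypass the prompt (untested; uses the same definitions as the script below):

# Presumably skips the interactive _select_agent prompt by naming the
# agent up front (agent name taken from the commented-out line in eval.py).
results = evals.evaluate(
    data=agent_data,
    agents=["Job Header Extraction Agent"],
    evaluators=[evals.LLMInferenceEvaluator(), evals.ToolUseEvaluator(), judge],
    name="Small Dataset Evaluation",
    payload_callback=upload_agent_evaluation,
)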
Reproducing Steps (include code examples if applicable)

use the following eval.py:

import railtracks as rt
from railtracks import evaluations as evals
from railtownai import upload_agent_evaluation

# Set EVALUATIONS_API_TOKEN in .env (from Conductr Agent Evaluations onboarding)
agent_data = evals.extract_agent_data_points(".railtracks/data/sessions/")

judge = evals.JudgeEvaluator(
    llm=rt.llm.OpenAILLM(model_name="gpt-5.2"),
    metrics=[
        evals.metrics.Categorical(
            name="Helpfulness",
            categories=["Not Helpful", "Somewhat Helpful", "Very Helpful"],
        ),
        evals.metrics.Categorical(
            name="Efficiency",
            description=(
                "How efficiently did the agent complete the task? "
                "Consider factors such as speed, resource usage, and overall effectiveness."
            ),
            categories=["Not Efficient", "Somewhat Efficient", "Very Efficient"],
        ),
    ],
    reasoning=True,
    timeout=1000,
)

results = evals.evaluate(
    data=agent_data,
    # agents=["Job Header Extraction Agent"],
    evaluators=[evals.LLMInferenceEvaluator(), evals.ToolUseEvaluator(), judge],
    name="Small Dataset Evaluation",
    payload_callback=upload_agent_evaluation,
)
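
One environment note for reproduction: the comment at the top of eval.py assumes EVALUATIONS_API_TOKEN is available from .env. A minimal sketch of loading and checking it explicitly, assuming python-dotenv is installed (whether railtracks or railtownai reads .env on its own is not confirmed here):

import os

from dotenv import load_dotenv  # assumption: python-dotenv is used to read .env

load_dotenv()  # read .env from the current working directory
if not os.getenv("EVALUATIONS_API_TOKEN"):
    raise RuntimeError("EVALUATIONS_API_TOKEN is not set in the environment or .env")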
Additional Context
No response
Operating System
No response
railtracks Version
No response
Python Version
No response