Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ddtrace/llmobs/_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@
EXPERIMENT_NAME_KEY = "_ml_obs.experiment_name"

# experiment context keys
EXPERIMENT_CONFIG = "_ml_obs.config"
EXPERIMENT_RECORD_METADATA = "_ml_obs.meta.metadata"
EXPERIMENT_EXPECTED_OUTPUT = "_ml_obs.meta.input.expected_output"
EXPERIMENTS_INPUT = "_ml_obs.meta.input"
Expand Down
9 changes: 9 additions & 0 deletions ddtrace/llmobs/_experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from ddtrace.constants import ERROR_TYPE
from ddtrace.internal.logger import get_logger
from ddtrace.llmobs._constants import DD_SITES_NEEDING_APP_SUBDOMAIN
from ddtrace.llmobs._constants import EXPERIMENT_CONFIG
from ddtrace.llmobs._constants import EXPERIMENT_EXPECTED_OUTPUT
from ddtrace.llmobs._constants import EXPERIMENT_RECORD_METADATA
from ddtrace.llmobs._utils import convert_tags_dict_to_list
Expand Down Expand Up @@ -1097,6 +1098,12 @@ async def _process_record(
"dataset_record_id": str(record_id),
"experiment_id": str(self._id),
}
# Propagate dataset record tags to the experiment span
record_tags = record.get("tags", [])
for tag in record_tags:
if ":" in tag:
key, value = tag.split(":", 1)
tags[key] = value
if canonical_id:
tags["dataset_record_canonical_id"] = canonical_id
output_data = None
Expand All @@ -1112,6 +1119,8 @@ async def _process_record(
span._set_ctx_item(EXPERIMENT_EXPECTED_OUTPUT, record["expected_output"])
if "metadata" in record:
span._set_ctx_item(EXPERIMENT_RECORD_METADATA, record["metadata"])
if self._config:
span._set_ctx_item(EXPERIMENT_CONFIG, self._config)

return {
"idx": idx,
Expand Down
5 changes: 5 additions & 0 deletions ddtrace/llmobs/_llmobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
from ddtrace.llmobs._constants import DISPATCH_ON_OPENAI_AGENT_SPAN_FINISH
from ddtrace.llmobs._constants import DISPATCH_ON_TOOL_CALL
from ddtrace.llmobs._constants import DISPATCH_ON_TOOL_CALL_OUTPUT_USED
from ddtrace.llmobs._constants import EXPERIMENT_CONFIG
from ddtrace.llmobs._constants import EXPERIMENT_CSV_FIELD_MAX_SIZE
from ddtrace.llmobs._constants import EXPERIMENT_DATASET_NAME_KEY
from ddtrace.llmobs._constants import EXPERIMENT_EXPECTED_OUTPUT
Expand Down Expand Up @@ -566,6 +567,10 @@ def _llmobs_span_event(self, span: Span) -> Optional[LLMObsSpanEvent]:
if isinstance(span_links, list) and span_links:
llmobs_span_event["span_links"] = span_links

experiment_config = span._get_ctx_item(EXPERIMENT_CONFIG)
if experiment_config:
llmobs_span_event["config"] = experiment_config

return llmobs_span_event

@staticmethod
Expand Down
2 changes: 2 additions & 0 deletions ddtrace/llmobs/_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
from ddtrace.llmobs._constants import EXP_SUBDOMAIN_NAME
from ddtrace.llmobs._constants import SPAN_ENDPOINT
from ddtrace.llmobs._constants import SPAN_SUBDOMAIN_NAME
from ddtrace.llmobs._experiment import ConfigType
from ddtrace.llmobs._experiment import Dataset
from ddtrace.llmobs._experiment import DatasetRecord
from ddtrace.llmobs._experiment import DatasetRecordRaw
Expand All @@ -58,6 +59,7 @@ class _LLMObsSpanEventOptional(TypedDict, total=False):
status_message: str
collection_errors: list[str]
span_links: list[_SpanLink]
config: ConfigType


class LLMObsSpanEvent(_LLMObsSpanEventOptional):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
upgrade:
- |
LLM Observability: Experiment spans now include the config passed at experiment initialization, so that
relevant spans can be searched by experiment config.
- |
LLM Observability: Experiment spans now include the tags from their dataset records, so that relevant spans
can be searched by dataset record tags.
19 changes: 19 additions & 0 deletions tests/llmobs/test_experiments.py
Original file line number Diff line number Diff line change
Expand Up @@ -2189,6 +2189,7 @@ def test_experiment_span_written_to_experiment_scope(llmobs, llmobs_events, test
dummy_task,
test_dataset_one_record_w_metadata,
[dummy_evaluator],
config={"temperature": 0.7},
)
exp._experiment._id = "1234567890"
asyncio.run(exp._experiment._run_task(1, run=run_info_with_stable_id(0), raise_errors=False))
Expand Down Expand Up @@ -2216,6 +2217,7 @@ def test_experiment_span_written_to_experiment_scope(llmobs, llmobs_events, test
assert "run_iteration:1" in event["tags"]
assert f"ddtrace.version:{ddtrace.__version__}" in event["tags"]
assert event["_dd"]["scope"] == "experiments"
assert event["config"] == {"temperature": 0.7}


def test_experiment_span_multi_run_tags(llmobs, llmobs_events, test_dataset_one_record_w_metadata):
Expand All @@ -2224,6 +2226,7 @@ def test_experiment_span_multi_run_tags(llmobs, llmobs_events, test_dataset_one_
dummy_task,
test_dataset_one_record_w_metadata,
[dummy_evaluator],
config={"temperature": 0.7},
)
exp._experiment._id = "1234567890"
for i in range(2):
Expand Down Expand Up @@ -2257,6 +2260,22 @@ def test_experiment_span_multi_run_tags(llmobs, llmobs_events, test_dataset_one_
assert f"run_iteration:{i + 1}" in event["tags"]
assert f"ddtrace.version:{ddtrace.__version__}" in event["tags"]
assert event["_dd"]["scope"] == "experiments"
assert event["config"] == {"temperature": 0.7}


def test_experiment_span_no_config_omits_field(llmobs, llmobs_events, test_dataset_one_record_w_metadata):
    """Check that a span event has no ``config`` key when the experiment is created without a config."""
    # Build the experiment without passing a ``config`` keyword argument.
    experiment = llmobs.experiment(
        "test_experiment",
        dummy_task,
        test_dataset_one_record_w_metadata,
        [dummy_evaluator],
    )
    experiment._experiment._id = "1234567890"

    # Drive one task run to completion; the test expects it to yield a single span event.
    task_run = experiment._experiment._run_task(1, run=run_info_with_stable_id(0), raise_errors=False)
    asyncio.run(task_run)

    assert len(llmobs_events) == 1
    span_event = llmobs_events[0]
    assert "config" not in span_event


def test_evaluators_run_with_jobs_parameter(llmobs, test_dataset_one_record):
Expand Down