diff --git a/ddtrace/llmobs/_constants.py b/ddtrace/llmobs/_constants.py index ba3cc7a3994..9839d155062 100644 --- a/ddtrace/llmobs/_constants.py +++ b/ddtrace/llmobs/_constants.py @@ -132,6 +132,7 @@ EXPERIMENT_NAME_KEY = "_ml_obs.experiment_name" # experiment context keys +EXPERIMENT_CONFIG = "_ml_obs.config" EXPERIMENT_RECORD_METADATA = "_ml_obs.meta.metadata" EXPERIMENT_EXPECTED_OUTPUT = "_ml_obs.meta.input.expected_output" EXPERIMENTS_INPUT = "_ml_obs.meta.input" diff --git a/ddtrace/llmobs/_experiment.py b/ddtrace/llmobs/_experiment.py index 1c519d794eb..e31c76124c5 100644 --- a/ddtrace/llmobs/_experiment.py +++ b/ddtrace/llmobs/_experiment.py @@ -27,6 +27,7 @@ from ddtrace.constants import ERROR_TYPE from ddtrace.internal.logger import get_logger from ddtrace.llmobs._constants import DD_SITES_NEEDING_APP_SUBDOMAIN +from ddtrace.llmobs._constants import EXPERIMENT_CONFIG from ddtrace.llmobs._constants import EXPERIMENT_EXPECTED_OUTPUT from ddtrace.llmobs._constants import EXPERIMENT_RECORD_METADATA from ddtrace.llmobs._utils import convert_tags_dict_to_list @@ -1097,6 +1098,12 @@ async def _process_record( "dataset_record_id": str(record_id), "experiment_id": str(self._id), } + # Propagate dataset record tags to the experiment span + record_tags = record.get("tags", []) + for tag in record_tags: + if ":" in tag: + key, value = tag.split(":", 1) + tags[key] = value if canonical_id: tags["dataset_record_canonical_id"] = canonical_id output_data = None @@ -1112,6 +1119,8 @@ async def _process_record( span._set_ctx_item(EXPERIMENT_EXPECTED_OUTPUT, record["expected_output"]) if "metadata" in record: span._set_ctx_item(EXPERIMENT_RECORD_METADATA, record["metadata"]) + if self._config: + span._set_ctx_item(EXPERIMENT_CONFIG, self._config) return { "idx": idx, diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py index 2ec321bb9f1..ce98ce32b22 100644 --- a/ddtrace/llmobs/_llmobs.py +++ b/ddtrace/llmobs/_llmobs.py @@ -55,6 +55,7 @@ from 
ddtrace.llmobs._constants import DISPATCH_ON_OPENAI_AGENT_SPAN_FINISH from ddtrace.llmobs._constants import DISPATCH_ON_TOOL_CALL from ddtrace.llmobs._constants import DISPATCH_ON_TOOL_CALL_OUTPUT_USED +from ddtrace.llmobs._constants import EXPERIMENT_CONFIG from ddtrace.llmobs._constants import EXPERIMENT_CSV_FIELD_MAX_SIZE from ddtrace.llmobs._constants import EXPERIMENT_DATASET_NAME_KEY from ddtrace.llmobs._constants import EXPERIMENT_EXPECTED_OUTPUT @@ -566,6 +567,10 @@ def _llmobs_span_event(self, span: Span) -> Optional[LLMObsSpanEvent]: if isinstance(span_links, list) and span_links: llmobs_span_event["span_links"] = span_links + experiment_config = span._get_ctx_item(EXPERIMENT_CONFIG) + if experiment_config: + llmobs_span_event["config"] = experiment_config + return llmobs_span_event @staticmethod diff --git a/ddtrace/llmobs/_writer.py b/ddtrace/llmobs/_writer.py index 712cc0ae4bf..2223bcf3eda 100644 --- a/ddtrace/llmobs/_writer.py +++ b/ddtrace/llmobs/_writer.py @@ -35,6 +35,7 @@ from ddtrace.llmobs._constants import EXP_SUBDOMAIN_NAME from ddtrace.llmobs._constants import SPAN_ENDPOINT from ddtrace.llmobs._constants import SPAN_SUBDOMAIN_NAME +from ddtrace.llmobs._experiment import ConfigType from ddtrace.llmobs._experiment import Dataset from ddtrace.llmobs._experiment import DatasetRecord from ddtrace.llmobs._experiment import DatasetRecordRaw @@ -58,6 +59,7 @@ class _LLMObsSpanEventOptional(TypedDict, total=False): status_message: str collection_errors: list[str] span_links: list[_SpanLink] + config: ConfigType class LLMObsSpanEvent(_LLMObsSpanEventOptional): diff --git a/releasenotes/notes/llmobs-dne-experiments-dataset-record-tags-config-propagation-59397bb58866b19b.yaml b/releasenotes/notes/llmobs-dne-experiments-dataset-record-tags-config-propagation-59397bb58866b19b.yaml new file mode 100644 index 00000000000..c5d92f922c9 --- /dev/null +++ b/releasenotes/notes/llmobs-dne-experiments-dataset-record-tags-config-propagation-59397bb58866b19b.yaml @@ 
-0,0 +1,8 @@ +--- +upgrade: + - | + LLM Observability: Experiment spans now include the config passed at experiment initialization, so relevant spans + can be searched by experiment config. + - | + LLM Observability: Experiment spans now include the tags from their dataset records, so relevant spans can be + searched by dataset record tags. diff --git a/tests/llmobs/test_experiments.py b/tests/llmobs/test_experiments.py index 1b563d89353..1ced4fe8de1 100644 --- a/tests/llmobs/test_experiments.py +++ b/tests/llmobs/test_experiments.py @@ -2189,6 +2189,7 @@ def test_experiment_span_written_to_experiment_scope(llmobs, llmobs_events, test dummy_task, test_dataset_one_record_w_metadata, [dummy_evaluator], + config={"temperature": 0.7}, ) exp._experiment._id = "1234567890" asyncio.run(exp._experiment._run_task(1, run=run_info_with_stable_id(0), raise_errors=False)) @@ -2216,6 +2217,7 @@ def test_experiment_span_written_to_experiment_scope(llmobs, llmobs_events, test assert "run_iteration:1" in event["tags"] assert f"ddtrace.version:{ddtrace.__version__}" in event["tags"] assert event["_dd"]["scope"] == "experiments" + assert event["config"] == {"temperature": 0.7} def test_experiment_span_multi_run_tags(llmobs, llmobs_events, test_dataset_one_record_w_metadata): @@ -2224,6 +2226,7 @@ def test_experiment_span_multi_run_tags(llmobs, llmobs_events, test_dataset_one_ dummy_task, test_dataset_one_record_w_metadata, [dummy_evaluator], + config={"temperature": 0.7}, ) exp._experiment._id = "1234567890" for i in range(2): @@ -2257,6 +2260,22 @@ def test_experiment_span_multi_run_tags(llmobs, llmobs_events, test_dataset_one_ assert f"run_iteration:{i + 1}" in event["tags"] assert f"ddtrace.version:{ddtrace.__version__}" in event["tags"] assert event["_dd"]["scope"] == "experiments" + assert event["config"] == {"temperature": 0.7} + + +def test_experiment_span_no_config_omits_field(llmobs, llmobs_events, test_dataset_one_record_w_metadata): + """Assert that 
the config field is omitted from the span event when no config is provided.""" + exp = llmobs.experiment( + "test_experiment", + dummy_task, + test_dataset_one_record_w_metadata, + [dummy_evaluator], + ) + exp._experiment._id = "1234567890" + asyncio.run(exp._experiment._run_task(1, run=run_info_with_stable_id(0), raise_errors=False)) + assert len(llmobs_events) == 1 + event = llmobs_events[0] + assert "config" not in event def test_evaluators_run_with_jobs_parameter(llmobs, test_dataset_one_record):