From fb433f6e383a20d153325f30ed92adb977683214 Mon Sep 17 00:00:00 2001 From: Lotte Verheyden <48100308+Lotte-Verheyden@users.noreply.github.com> Date: Wed, 1 Apr 2026 20:11:57 -0700 Subject: [PATCH 1/5] feat: add Langfuse Academy section --- app/academy/[[...slug]]/not-found.tsx | 15 + app/academy/[[...slug]]/page.tsx | 71 ++++ app/academy/layout.tsx | 10 + components/NavLinks.tsx | 1 + content/academy/building-datasets.mdx | 310 ++++++++++++++++++ content/academy/closing-the-loop.mdx | 8 + content/academy/cost-tracking.mdx | 4 + content/academy/designing-evals.mdx | 110 +++++++ content/academy/evaluation-example-setups.mdx | 12 + content/academy/evaluation-loop.mdx | 132 ++++++++ content/academy/evaluation-overview.mdx | 16 + content/academy/good-traces.mdx | 143 ++++++++ content/academy/index.mdx | 39 +++ .../llm-observability-vs-traditional.mdx | 8 + content/academy/meta.json | 32 ++ content/academy/monitoring-overview.mdx | 14 + .../academy/observability-core-concepts.mdx | 151 +++++++++ content/academy/observability-overview.mdx | 15 + .../observability-use-case-examples.mdx | 12 + content/academy/prompt-composability.mdx | 67 ++++ .../academy/prompt-deployment-strategies.mdx | 8 + .../academy/prompt-management-overview.mdx | 13 + .../prompt-management-when-it-makes-sense.mdx | 8 + content/academy/prompt-version-control.mdx | 187 +++++++++++ content/academy/security-and-guardrails.mdx | 219 +++++++++++++ content/academy/user-feedback.mdx | 135 ++++++++ content/academy/what-to-evaluate.mdx | 171 ++++++++++ content/academy/what-to-monitor.mdx | 4 + .../all/what-does-a-good-trace-look-like.mdx | 2 + lib/source.ts | 13 + .../images/docs/academy/evaluation-loop.png | Bin 0 -> 312728 bytes source.config.ts | 5 + 32 files changed, 1935 insertions(+) create mode 100644 app/academy/[[...slug]]/not-found.tsx create mode 100644 app/academy/[[...slug]]/page.tsx create mode 100644 app/academy/layout.tsx create mode 100644 content/academy/building-datasets.mdx create mode 
100644 content/academy/closing-the-loop.mdx create mode 100644 content/academy/cost-tracking.mdx create mode 100644 content/academy/designing-evals.mdx create mode 100644 content/academy/evaluation-example-setups.mdx create mode 100644 content/academy/evaluation-loop.mdx create mode 100644 content/academy/evaluation-overview.mdx create mode 100644 content/academy/good-traces.mdx create mode 100644 content/academy/index.mdx create mode 100644 content/academy/llm-observability-vs-traditional.mdx create mode 100644 content/academy/meta.json create mode 100644 content/academy/monitoring-overview.mdx create mode 100644 content/academy/observability-core-concepts.mdx create mode 100644 content/academy/observability-overview.mdx create mode 100644 content/academy/observability-use-case-examples.mdx create mode 100644 content/academy/prompt-composability.mdx create mode 100644 content/academy/prompt-deployment-strategies.mdx create mode 100644 content/academy/prompt-management-overview.mdx create mode 100644 content/academy/prompt-management-when-it-makes-sense.mdx create mode 100644 content/academy/prompt-version-control.mdx create mode 100644 content/academy/security-and-guardrails.mdx create mode 100644 content/academy/user-feedback.mdx create mode 100644 content/academy/what-to-evaluate.mdx create mode 100644 content/academy/what-to-monitor.mdx create mode 100644 public/images/docs/academy/evaluation-loop.png diff --git a/app/academy/[[...slug]]/not-found.tsx b/app/academy/[[...slug]]/not-found.tsx new file mode 100644 index 000000000..5cd2ecec1 --- /dev/null +++ b/app/academy/[[...slug]]/not-found.tsx @@ -0,0 +1,15 @@ +import Link from "next/link"; + +export default function AcademyNotFound() { + return ( +
+

Page not found

+

+ The academy page you're looking for doesn't exist or has moved. +

+ + Back to Academy + +
+ ); +} diff --git a/app/academy/[[...slug]]/page.tsx b/app/academy/[[...slug]]/page.tsx new file mode 100644 index 000000000..0d6cb1bf8 --- /dev/null +++ b/app/academy/[[...slug]]/page.tsx @@ -0,0 +1,71 @@ +import type { Metadata } from "next"; +import { academySource } from "@/lib/source"; +import { buildOgImageUrl, buildPageUrl } from "@/lib/og-url"; +import { DocsPage } from "fumadocs-ui/page"; +import { notFound } from "next/navigation"; +import { DocsContributors } from "@/components/DocsContributors"; +import { DocBodyChrome } from "@/components/DocBodyChrome"; +import { getMDXComponents } from "@/mdx-components"; +import type { ComponentType } from "react"; + +type PageProps = { + params: Promise<{ slug?: string[] }>; +}; + +export default async function AcademyPage(props: PageProps) { + const params = await props.params; + const slug = params.slug ?? []; + const page = academySource.getPage(slug); + + if (!page) notFound(); + + const { toc } = page.data; + const MDX = page.data.body as ComponentType<{ components?: Record }>; + + return ( + }} + > + + + + + ); +} + +export async function generateMetadata(props: PageProps): Promise { + const params = await props.params; + const slug = params.slug ?? []; + const page = academySource.getPage(slug); + if (!page) + return { + title: "Not Found", + }; + const pageData = page.data as typeof page.data & { + canonical?: string | null; + seoTitle?: string | null; + ogImage?: string | null; + }; + const pagePath = `/academy${slug.length > 0 ? `/${slug.join("/")}` : ""}`; + const canonicalUrl = pageData.canonical ?? buildPageUrl(pagePath); + const seoTitle = pageData.seoTitle || page.data.title; + const ogImage = buildOgImageUrl({ + title: seoTitle, + description: page.data.description, + section: "Academy", + staticOgImage: pageData.ogImage, + }); + return { + title: seoTitle, + description: page.data.description ?? 
undefined, + alternates: { canonical: canonicalUrl }, + openGraph: { images: [{ url: ogImage }], url: canonicalUrl }, + twitter: { images: [{ url: ogImage }] }, + }; +} + +export function generateStaticParams() { + return academySource.generateParams(); +} diff --git a/app/academy/layout.tsx b/app/academy/layout.tsx new file mode 100644 index 000000000..f24fdaceb --- /dev/null +++ b/app/academy/layout.tsx @@ -0,0 +1,10 @@ +import { academySource, getPageTreeWithShortTitles } from "@/lib/source"; +import { SharedDocsLayout } from "@/app/docs/SharedDocsLayout"; + +export default function AcademyLayout({ + children, +}: { + children: React.ReactNode; +}) { + return {children}; +} diff --git a/components/NavLinks.tsx b/components/NavLinks.tsx index 5624a7eb7..fceaa0117 100644 --- a/components/NavLinks.tsx +++ b/components/NavLinks.tsx @@ -22,6 +22,7 @@ const productLinks = [ ]; const resourcesLinks = [ + { name: "Academy", href: "/academy" }, { name: "Blog", href: "/blog" }, { name: "Changelog", href: "/changelog" }, { name: "Roadmap", href: "/docs/roadmap" }, diff --git a/content/academy/building-datasets.mdx b/content/academy/building-datasets.mdx new file mode 100644 index 000000000..aca1bd940 --- /dev/null +++ b/content/academy/building-datasets.mdx @@ -0,0 +1,310 @@ +--- +title: Building Datasets +description: "Learn how to test LLM applications with automated evaluation, datasets, and experiment runners." +--- + +# Building a Good Dataset + +esting LLM applications presents unique challenges. Unlike traditional software where outputs are deterministic, LLMs produce varied responses that can't be verified with simple equality checks. Yet the need for systematic testing remains critical—how do you ensure your AI application works reliably across deployments? 
+ +This guide shows you how to implement automated tests for LLM applications using datasets and experiment runners, inspired by [Hamel Husain's testing framework](https://hamel.dev/blog/posts/evals/#level-1-unit-tests). + +## Testing vs. Evaluation in LLM Applications + +Before diving in, let's clarify some terminology that often causes confusion: + +**Testing** typically means running automated checks that produce pass/fail results. You write assertions like `assert result == expected` and your test suite tells you if something broke. + +**Evaluation** is about measuring quality. How accurate is your model? How helpful are the responses? These are scored on continuous scales rather than binary pass/fail. + +Traditional testing relies on predictable outputs. You assert that `add(2, 3)` returns `5`. But when you ask an LLM "What is the capital of France?", you might get "Paris", "The capital is Paris", "Paris, France", or a longer explanation. All are correct, but none match exactly. + +This variability doesn't mean we can't test LLM applications—it means we need different testing strategies. In LLM applications, these concepts blend together. You "test" your application by "evaluating" its outputs with scoring functions. A test passes if the evaluation score meets your threshold. + +### Unit Tests vs What We're Building + +Hamel Husain calls this approach "Level 1: Unit Tests" in his framework, and we'll use similar terminology. However, it's worth noting that these aren't traditional unit tests: + +**Traditional unit tests**: +- Test isolated code units +- Deterministic (same input = same output) +- Fast, no external dependencies + +**LLM application tests**: +- Test application behavior +- Non-deterministic (same input can produce different outputs) +- Slower execution + +Think of these as **automated regression tests** that verify your LLM application maintains acceptable quality as you make changes. 
The "unit" being tested is your application's behavior on specific inputs. + +## The Solution: Datasets + Experiment Runners + Evaluators + +The approach combines three components: + +1. **Datasets**: Collections of input/output pairs that represent your test cases +2. **Experiment Runners**: Execute your LLM application against the dataset +3. **Evaluators**: Score the outputs programmatically instead of checking exact matches + +Let's see how this works in practice. + +## Example: Testing a Geography Question Answering System + +Here's a complete example testing an LLM application that answers geography questions, using Langfuse's Experiment Runner SDK and local test data. + +First, setup the environment variables: + +```sh +# .env file +OPENAI_API_KEY=your_openai_api_key +LANGFUSE_PUBLIC_KEY=your_langfuse_public_key +LANGFUSE_SECRET_KEY=your_langfuse_secret_key +LANGFUSE_BASE_URL=https://cloud.langfuse.com +``` + +You can export the environment variables to your shell: + +``` +export $(grep -v '^#' .env) +``` + +Now create the test file: + +```python +# test_geography_experiment.py +import pytest +from langfuse import get_client, Evaluation, Langfuse +from langfuse.openai import OpenAI + +# Each test case includes both the input and expected output. The expected output +# serves as a reference for evaluation, not as an exact string match. +test_data = [ + {"input": "What is the capital of France?", "expected_output": "Paris"}, + {"input": "What is the capital of Germany?", "expected_output": "Berlin"}, + {"input": "What is the capital of Spain?", "expected_output": "Madrid"}, +] + +# The task function wraps your LLM application logic. It receives each test item and +# returns the LLM's response. This should be your full LLM application logic. 
+def geography_task(*, item, **kwargs): + """Task function that answers geography questions""" + question = item["input"] + response = OpenAI().chat.completions.create( + model="gpt-4", + messages=[{"role": "user", "content": question}] + ) + return response.choices[0].message.content + +# This evaluator checks if the expected answer appears anywhere in the output, accounting +# for LLM verbosity. You could also use more sophisticated evaluators, including LLM-as-a-judge +# for semantic similarity. +def accuracy_evaluator(*, input, output, expected_output, **kwargs): + """Evaluator that checks if the expected answer is in the output""" + if expected_output and expected_output.lower() in output.lower(): + return Evaluation(name="accuracy", value=1.0) + + return Evaluation(name="accuracy", value=0.0) + +# Run-level evaluators aggregate scores across all test items, giving you a single +# metric to assert against. +def average_accuracy_evaluator(*, item_results, **kwargs): + """Run evaluator that calculates average accuracy across all items""" + accuracies = [ + eval.value for result in item_results + for eval in result.evaluations if eval.name == "accuracy" + ] + + if not accuracies: + return Evaluation(name="avg_accuracy", value=None) + + avg = sum(accuracies) / len(accuracies) + + return Evaluation(name="avg_accuracy", value=avg, comment=f"Average accuracy: {avg:.2%}") + +@pytest.fixture +def langfuse_client() -> Langfuse: + """Initialize Langfuse client for testing""" + return get_client() + +def test_geography_accuracy_passes(langfuse_client: Langfuse): + """Test that passes when accuracy is above threshold""" + result = langfuse_client.run_experiment( + name="Geography Test - Should Pass", + data=test_data, + task=geography_task, + evaluators=[accuracy_evaluator], + run_evaluators=[average_accuracy_evaluator] + ) + + # Access the run evaluator result directly + avg_accuracy = next( + eval.value for eval in result.run_evaluations + if eval.name == "avg_accuracy" 
+ ) + + # Assert minimum accuracy threshold + assert avg_accuracy >= 0.8, f"Average accuracy {avg_accuracy:.2f} below threshold 0.8" +``` + + + **Set Appropriate Thresholds**: Not all tests need 100% accuracy. Set realistic thresholds based on your application's requirements: + + ```python + # Strict threshold for critical functionality + assert avg_accuracy >= 0.95, "Critical tests must have 95%+ accuracy" + + # Relaxed threshold for experimental features + assert avg_accuracy >= 0.70, "Experimental feature needs improvement" + ``` + + +You can run the test with: + +```sh +pip install pytest langfuse openai +pytest test_geography_experiment.py -v +``` + +## Running Tests in CI/CD + +You can integrate these tests into your continuous integration pipeline. + +### GitHub Actions Example + +First, setup the environment variables in your Github Actions secrets: + +![Github Actions secrets](/images/blog/2025-10-21-testing-llm-applications/github-actions-secrets.png) + + +```yaml +name: LLM Application Tests + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + test: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.10' + + - name: Install dependencies + run: | + pip install pytest langfuse openai + + - name: Run LLM unit tests + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + LANGFUSE_PUBLIC_KEY: ${{ secrets.LANGFUSE_PUBLIC_KEY }} + LANGFUSE_SECRET_KEY: ${{ secrets.LANGFUSE_SECRET_KEY }} + LANGFUSE_HOST: ${{ secrets.LANGFUSE_HOST }} + run: | + pytest test_geography_experiment.py -v +``` + +## Using Remote Datasets with LLM-as-a-Judge + +For more advanced testing, you can use [remote datasets](https://langfuse.com/docs/evaluation/experiments/datasets) stored in Langfuse: + +```python {11,12} +import pytest +from langfuse import get_client, Langfuse + +@pytest.fixture +def langfuse_client() -> Langfuse: + return get_client() + +def 
test_with_remote_dataset(langfuse_client: Langfuse): + """Test using a dataset stored in Langfuse with LLM-as-a-judge evaluation""" + + # Fetch dataset from Langfuse + dataset = langfuse_client.get_dataset("geography-questions") + + def task(*, item, **kwargs): + question = item.input + response = OpenAI().chat.completions.create( + model="gpt-4", + messages=[{"role": "user", "content": question}] + ) + return response.choices[0].message.content + + # Run experiment - Langfuse automatically applies configured evaluators + result = dataset.run_experiment( + name="Geography Test with Remote Dataset", + description="Testing geography QA with LLM-as-a-judge", + task=task + ) + + # The LLM-as-a-judge evaluator runs automatically in Langfuse + # Results are visible in the Langfuse UI + langfuse_client.flush() +``` + +You can then see the result of LLM-as-a-judge evaluation and the aggregated score in the Langfuse UI: + +![LLM-as-a-judge evaluation and aggregated score](/images/blog/2025-10-21-testing-llm-applications/llm-as-a-judge-results.png) + +### Benefits of Remote Datasets + +1. **Centralized test management**: Update test cases without code changes +2. **LLM-as-a-judge evaluators**: Configure semantic evaluation in the Langfuse UI (see [LLM-as-a-judge](/docs/evaluation/evaluation-methods/llm-as-a-judge)) +3. **Historical tracking**: Compare results across runs to detect regressions +4. 
**Team collaboration**: Share datasets across team members + +When using remote datasets, all experiment results are tracked in Langfuse: + +- View individual test case results with scores and reasoning +- Compare experiment runs over time +- Track which code changes improved or degraded performance +- Share results with team members through the Langfuse dashboard + +We published two guides covering how to test chatbots and conversational applications with remote datasets and LLM-as-a-judge evaluators: + +- **[Evaluating Multi-Turn Conversations](https://langfuse.com/guides/cookbook/example_evaluating_multi_turn_conversations)**: Test specific points in ongoing conversations (N+1 evaluations), useful for debugging issues where context is lost across turns +- **[Simulating Multi-Turn Conversations](https://langfuse.com/guides/cookbook/example_simulated_multi_turn_conversations)**: Generate synthetic conversations with AI agents to test various scenarios and personas + +## Next Steps + +Ready to implement automated testing for your LLM application? Start by [identifying 3-5 critical functionalities](/blog/2025-08-29-error-analysis-to-evaluate-llm-applications), create a small dataset with expected behaviors, and run your first experiment. We'd love to hear about your testing approach—join the conversation in our [GitHub Discussion](https://github.com/langfuse/langfuse/discussions), and explore our learning resources below. + +## Learn More + +import { FileCode, BookOpen, Video, Users, Joystick } from "lucide-react"; + + + } arrow /> + } arrow /> + } arrow /> + } arrow /> + + +## FAQ + +
+How do you test LLM applications? + +Testing LLM applications requires a different approach than traditional software testing. Instead of checking exact output matches, you use **evaluation functions** that score outputs on continuous scales. The typical approach combines three components: (1) **Datasets** — collections of input/output pairs representing test cases, (2) **Experiment runners** — tools that execute your LLM application against the dataset, and (3) **Evaluators** — scoring functions that assess output quality using LLM-as-a-Judge, semantic similarity, or custom logic. A test passes when evaluation scores meet your defined thresholds. + +
+ +
+What is the difference between LLM testing and LLM evaluation? + +**Testing** produces binary pass/fail results — your test suite tells you whether something broke. **Evaluation** measures quality on continuous scales — how accurate, helpful, or relevant are the responses. In practice, LLM testing and evaluation blend together: you "test" your application by "evaluating" its outputs with scoring functions, and a test passes if the evaluation score meets your threshold. Both are essential for building reliable LLM applications. + +
+ +
+How do I automate LLM testing in CI/CD? + +You can integrate LLM testing into your CI/CD pipeline using Langfuse's experiment runner SDK. Create a test script that: (1) loads your dataset (locally or from Langfuse), (2) runs your application against each test case, (3) scores the outputs with evaluator functions, and (4) asserts that scores meet minimum thresholds. Run this script as part of your CI pipeline using `pytest` or your preferred test runner. See the [experiments via SDK guide](/docs/evaluation/experiments/experiments-via-sdk) for implementation details. + +
diff --git a/content/academy/closing-the-loop.mdx b/content/academy/closing-the-loop.mdx new file mode 100644 index 000000000..b81067142 --- /dev/null +++ b/content/academy/closing-the-loop.mdx @@ -0,0 +1,8 @@ +--- +title: Closing the Loop +description: TODO +--- + +# Closing the Loop + +TODO: Closing the loop by looking at traces, selecting some for datasets, running evals on it and improving with prompt iteration diff --git a/content/academy/cost-tracking.mdx b/content/academy/cost-tracking.mdx new file mode 100644 index 000000000..6add75919 --- /dev/null +++ b/content/academy/cost-tracking.mdx @@ -0,0 +1,4 @@ +--- +title: Cost Tracking +description: Langfuse tracks usage and cost of LLM generations for various models. Learn how to ingest, infer, and manage cost data. +--- diff --git a/content/academy/designing-evals.mdx b/content/academy/designing-evals.mdx new file mode 100644 index 000000000..cb8ab57d4 --- /dev/null +++ b/content/academy/designing-evals.mdx @@ -0,0 +1,110 @@ +--- +title: Designing Evals +description: A practical guide to setting up automated evaluations for LLM applications and AI agents. +--- + +# Designing Good Evals + +In AI development, iterating quickly is important. Whether you're refining a prompt, swapping a model, or changing your application logic, you need to understand the impact of each change on performance. Manually annotating outputs after every modification is slow and expensive, especially when you want to integrate evaluations into a CI/CD pipeline. + +**Automated evaluators** solve this problem by providing a scalable way to measure and monitor your application's failure modes, enabling a fast and effective development loop. + +_The framework in this guide is adapted from Hamel Husain's [Eval FAQ](https://hamel.dev/blog/posts/evals-faq/)._ + +--- + +This guide describes a process to **build automated evaluators** for your application. 
This is a robust evaluator that you can scale for different tests and evolutions of your application: + +1. [What to Measure](#what-to-measure) +2. [How to Measure](#how-to-measure) +3. [Draft your LLM-as-a-Judge prompt](#draft-prompt) +4. [Validate your evaluator](#validate-evaluator) + +--- + +I'll demonstrate this process using an [example chatbot in the Langfuse documentation](/docs/demo) that uses the Vercel AI SDK and has access to a RAG tool to retrieve documents from the Langfuse documentation. The example chat app logs traces into the Langfuse example project and has already answered 19k user queries in the past year. + +Here's the chat interface (you can find the example chat app [here](/docs/demo)): + + + ![Chat + Interface](/images/blog/2025-08-29-error-analysis-to-evaluate-llm-applications/demo-chat.png) + + +## What to Measure [#what-to-measure] + +In the [previous blog post](/blog/2025-08-29-error-analysis-to-evaluate-llm-applications), we showed how to perform **error analysis** and identify failure modes in your application. Now we will focus on building automated evaluators to measure these failure modes. + +Before building an evaluator, it's important to differentiate between two types of failures to prioritize your efforts: + +**Missing Instructions:** The first type are errors caused by vague or incomplete instructions in your prompt. For instance if your agent uses too many bullet points or doesn't ask follow-up questions, and you never instructed it to do so, the first step is to fix the prompt. Creating an evaluator for a failure that a simple prompt tweak can solve is often unnecessary effort. + +**Model Limitations:** The second type occur when the LLM fails to perform correctly despite receiving clear and precise instructions. These are the ideal candidates for automated evaluation because they represent the model's inherent limitations, not a misunderstanding of your intent. + +Let's apply this to the [Langfuse Example App](/docs/demo). 
First, we fix some obvious issues by clarifying the prompt: "use a maximum of three bullet points" and "ask for more context when the user's query is ambiguous." With those fixed, we can focus on measuring the more complex model limitation failures we identified in the [previous blog post](/blog/2025-08-29-error-analysis-to-evaluate-llm-applications): + +* **Out of Scope**: The agent answers a question not related to Langfuse or the LLM/AI space. +* **Generic Responses**: The answer is technically correct but doesn't resolve the user's issue. The metric is to assess if the agent's final answer is helpful and directly addresses the user's question. +* **Context Retrieval / RAG Issues**: The agent uses the wrong retrieval tool. The metric needs to judge if the correct tool was chosen based on the user's query. + +For this guide, we will set up an evaluator for the "Out of Scope" failure mode. + +## How to Measure [#how-to-measure] + +In Langfuse, all evaluations are tracked as **Scores**, which [can be attached to traces, observations, sessions or dataset runs](/docs/evaluation/experiments/data-model#scores). Evaluations in Langfuse can be set up in two main ways: + +**In the Langfuse UI:** In Langfuse, you can set up **LLM-as-a-Judge Evaluators** that use another LLM to evaluate your application's output on subjective and nuanced criteria. These are easily configured directly in Langfuse. For a guide on setting them up in the UI, check the documentation on **[LLM-as-a-Judge evaluators](/docs/evaluation/evaluation-methods/llm-as-a-judge)**. + +**External Evaluators:** In your code, you can set up **Custom Evaluators** and use the Langfuse SDKs to send the scores back to the evaluated traces. This allows you to set up code-based evaluators or any other custom evaluation logic. For an example of a custom pipeline, see the guide on **[setting up an external evaluation pipelines](/guides/cookbook/example_external_evaluation_pipelines)**. 
+ +In this guide, we will set up an LLM-as-a-Judge evaluator in the Langfuse UI. + +## Drafting your LLM-as-a-Judge Prompt [#draft-prompt] + +A good LLM-as-a-Judge prompt is narrowly focused and well-structured: + +1. **Pick one Failure Mode**: Focus on one specific failure mode. Do not try to cover multiple failure modes at once. +2. **Pass/Fail Definitions**: Clearly define what constitutes a "Pass" (failure is absent) and a "Fail" (failure is present). +3. **Examples**: Include clear examples of both "Pass" and "Fail" cases. + +Here is an example prompt I use in our Example App to check if the agent's answer is within its scope: + + + ![LLM-as-a-Judge Prompt](/images/blog/2025-09-05-automated-evaluations/prompt.png) + + +Once you drafted your prompt, you can set up the LLM-as-a-Judge evaluator in the Langfuse UI. You can find a guide on how to set them up in the UI [here](/docs/evaluation/evaluation-methods/llm-as-a-judge). + + + ![Evaluator Setup](/images/blog/2025-09-05-automated-evaluations/setup-evaluator.png) + + +## Validating Your Evaluator [#validate-evaluator] + +To build an evaluator you can trust, you must validate its performance against human judgment, a process similar to testing a machine learning classifier. + +First, split a set of human-labeled traces into a **development set** and a **test set**. In Langfuse, you can use tags to manage these sets. The **development set** is used to iteratively refine your judge's prompt. Run the evaluator on this set, compare its scores to the human labels, analyze the disagreements, and adjust the prompt's definitions or examples until its judgments closely align with yours: + + + ![Evaluator Tuning](/images/blog/2025-09-05-automated-evaluations/evaluator-tuning.png) + + +Additionally, you can measure the judge's alignment with human labels. The best metrics for this are **True Positive Rate (TPR)** and **True Negative Rate (TNR)**. 
TPR measures what fraction of actual "Passes" your judge correctly identifies, while TNR measures what fraction of actual "Fails" it correctly identifies. + +You can calculate these metrics by querying the data from the trace table (via [UI export](/docs/api-and-data-platform/features/export-from-ui) or [SDKs](/docs/api-and-data-platform/features/query-via-sdk)) and calculating the metrics. + +Once your judge performs well on the dev set (e.g., TPR and TNR \>90%), run it a final time on the held-out **test set** to get an unbiased measure of its real-world performance. A validated evaluator with high TPR and TNR gives you confidence that your automated metrics are meaningful. + +You can now repeat this process for both your Evaluators in the Langfuse UI and your custom evaluators as part of an [external evaluation pipeline](/guides/cookbook/example_external_evaluation_pipelines). + +## Next Steps [#operationalize-evaluator] + +With good automated evaluators in place, the next step is to operationalize your workflow. The goal is to create a CI/CD pipeline where every code change (to a prompt, model, or tool) automatically triggers an evaluation run on a **golden [Langfuse Dataset](/docs/datasets)**. + +Your automated evaluators will score these runs, providing immediate feedback on how your changes impacted key failure modes. This continuous monitoring loop helps you develop faster while maintaining a high quality bar for your application. + +import { FileCode } from "lucide-react"; + + + } arrow /> + diff --git a/content/academy/evaluation-example-setups.mdx b/content/academy/evaluation-example-setups.mdx new file mode 100644 index 000000000..286ca76c6 --- /dev/null +++ b/content/academy/evaluation-example-setups.mdx @@ -0,0 +1,12 @@ +--- +title: Example Setups +description: Evaluation approaches for different types of LLM applications. +--- + +# Example Setups + +TODO + +Add example evaluator setups for +- customer support chatbot (and link to demo project) +- ... 
diff --git a/content/academy/evaluation-loop.mdx b/content/academy/evaluation-loop.mdx new file mode 100644 index 000000000..2ff1ec2df --- /dev/null +++ b/content/academy/evaluation-loop.mdx @@ -0,0 +1,132 @@ +--- +title: The Evaluation Loop +description: Learn the fundamental concepts behind LLM evaluation — the evaluation loop, evaluation methods, experiments, and online evaluation. +--- + +# The Evaluation Loop + +LLM applications often have a constant loop of testing and monitoring. + +**Offline evaluation** lets you test your application against a fixed dataset before you deploy. You run your new prompt or model against test cases, review the scores, iterate until the results look good, then deploy your changes. In Langfuse, you can do that by running [Experiments](/docs/evaluation/core-concepts#experiments). + +**Online evaluation** scores live traces to catch issues in real traffic. When you find edge cases your dataset didn't cover, you add them back to your dataset so future experiments will catch them. + +![The Continuous Evaluation/Iteration Loop](/images/docs/academy/evaluation-loop.png) + +> **Here's an example workflow** for building a customer support chatbot +> 1. You update your prompt to make responses less formal. +> 2. Before deploying, you run an **experiment**: test the new prompt against your dataset of customer questions **(offline evaluation)**. +> 3. You review the scores and outputs. The tone improved, but responses are longer and some miss important links. +> 4. You refine the prompt and run the experiment again. +> 5. The results look good now. You deploy the new prompt to production. +> 6. You monitor with **online evaluation** to catch any new edge cases. +> 7. You notice that a customer asked a question in French, but the bot responded in English. +> 8. You add this French query to your dataset so future experiments will catch this issue. +> 9. You update your prompt to support French responses and run another experiment. 
+> +> Over time, your dataset grows from a couple of examples to a diverse, representative set of real-world test cases. + +## Evaluation Methods [#evaluation-methods] + +Evaluation methods are the functions that score traces, observations, sessions, or dataset runs. You can use a variety of evaluation methods to add [scores](/docs/evaluation/experiments/data-model#scores). + + +| Method | What | Use when | +| --- | --- | --- | +| [LLM-as-a-Judge](/docs/evaluation/evaluation-methods/llm-as-a-judge) | Use an LLM to evaluate outputs based on custom criteria | Subjective assessments at scale (tone, accuracy, helpfulness) | +| [Scores via UI](/docs/evaluation/evaluation-methods/scores-via-ui) | Manually add scores to traces directly in the Langfuse UI | Quick quality spot checks, reviewing individual traces | +| [Annotation Queues](/docs/evaluation/evaluation-methods/annotation-queues) | Structured human review workflows with customizable queues | Building ground truth, systematic labeling, team collaboration | +| [Scores via API/SDK](/docs/evaluation/evaluation-methods/scores-via-sdk) | Programmatically add scores using the Langfuse API or SDK | Custom evaluation pipelines, deterministic checks, automated workflows | + +When setting up new evaluation methods, you can use [Score Analytics](/docs/evaluation/evaluation-methods/score-analytics) to analyze or sense-check the scores you produce. +## Experiments [#experiments] + +An experiment runs your application against a dataset and evaluates the outputs. This is how you test changes before deploying to production. + +### Definitions + +Before diving into experiments, it's helpful to understand the building blocks in Langfuse: datasets, dataset items, tasks, scores, and experiments. + +| Object | Definition | +| --- | --- | +| **Dataset** | A collection of test cases (dataset items). You can run experiments on a dataset. | +| **Dataset item** | One item in a dataset. 
Each dataset item contains an input (the scenario to test) and optionally an expected output. | +| **Task** | The application code that you want to test in an experiment. This will be performed on each dataset item, and you will score the output. +| **Evaluation Method** | A function that scores experiment results. In the context of a Langfuse experiment, this can be a [deterministic check](/docs/evaluation/evaluation-methods/custom-scores), or [LLM-as-a-Judge](/docs/evaluation/evaluation-methods/llm-as-a-judge). | +| **Score** | The output of an evaluation. This can be numeric, categorical, or boolean. See [Scores](/docs/evaluation/experiments/data-model#scores) for more details.| +| **Experiment Run** | A single execution of your task against all items in a dataset, producing outputs (and scores). | + +You can find the data model for these objects [here](/docs/evaluation/experiments/data-model). + + +### How these work together + +This is what happens conceptually: + +When you run an experiment on a given **dataset**, each of the **dataset items** will be passed to the **task function** you defined. The task function is generally an LLM call that happens in your application, that you want to test. The task function produces an output for each dataset item. This process is called an **experiment run**. The resulting collection of outputs linked to the dataset items are the **experiment results**. + +Often, you want to score these experiment results. You can use various [evaluation methods](#evaluation-methods) that take in the dataset item and the output produced by the task function, and produce a score based on criteria you define. Based on these scores, you can then get a complete picture of how your application performs across all test cases. + + + ![Experiments flow](/images/docs/evaluation/experiments-flow.jpg) + + +You can compare experiment runs to see if a new prompt version improves scores, or identify specific inputs where your application struggles. 
Based on these experiment results, you can decide whether the change is ready to be deployed to production. + +You can find more details on how these objects link together under the hood on the [data model page](/docs/evaluation/experiments/data-model). + + +### Two ways to run experiments + +You can **run experiments programmatically using the Langfuse SDK**. This gives you full control over the task, evaluation logic, and more. [Learn more about running experiments via SDK](/docs/evaluation/experiments/experiments-via-sdk). + +Another way is to **run experiments directly from the Langfuse interface** by selecting a dataset and prompt version. This is useful for quick iterations on prompts without writing code. [Learn more about running experiments via UI](/docs/evaluation/experiments/experiments-via-ui). + +
+
+ {/* Header row */} +
+
+ **Langfuse Execution** +
+
+ **Local/CI Execution** +
+ + {/* Langfuse Data row */} +
+ **Langfuse Dataset** +
+
+ [Experiments via UI](/docs/evaluation/experiments/experiments-via-ui) +
+
+ [Experiments via SDK](/docs/evaluation/experiments/experiments-via-sdk) +
+ + {/* Local Data row */} +
+ **Local Dataset** +
+
+ Not supported +
+
+ [Experiments via SDK](/docs/evaluation/experiments/experiments-via-sdk) +
+ +
+
+
+*While it's optional, we recommend managing the underlying [Datasets](/docs/evaluation/experiments/datasets) in Langfuse as it allows for [1] in-UI comparison tables of different experiments on the same data and [2] iteratively improving the dataset based on production/staging traces.*
+
+## Online Evaluation [#online-evaluation]
+
+For online evaluation, you can configure evaluation methods to automatically score production traces. This helps you catch issues immediately.
+
+Langfuse currently supports LLM-as-a-Judge and human annotation checks for online evaluation. [Deterministic checks are on the roadmap](https://github.com/orgs/langfuse/discussions/6087).
+
+
+### Monitoring with dashboards
+
+Langfuse offers dashboards to monitor your application performance in real time. You can also monitor scores in dashboards. You can find more details on how to use dashboards [here](/docs/metrics/features/custom-dashboards).
diff --git a/content/academy/evaluation-overview.mdx b/content/academy/evaluation-overview.mdx
new file mode 100644
index 000000000..3b63a878c
--- /dev/null
+++ b/content/academy/evaluation-overview.mdx
@@ -0,0 +1,16 @@
+---
+title: Evaluating LLM Applications
+description: Learn how to systematically evaluate LLM applications — from understanding what to measure, to designing evals, to building the datasets that make it all work.
+---
+
+# Evaluating LLM Applications
+
+Once your traces are set up and you've decided what the behavior of your agent should be, you can start setting up evaluations, or "Evals" for short. There are many different forms of evaluations, each with its own pros and cons. Most setups will benefit from a combination of different evaluation forms.
+ +This section will walk you through that: + +- [The evaluation loop](/academy/evaluation-loop) +- [What should you evaluate?](/academy/what-to-evaluate) +- [How to design an eval](/academy/designing-evals) +- [Building good datasets](/academy/building-datasets) +- [Example setups](/academy/evaluation-example-setups) diff --git a/content/academy/good-traces.mdx b/content/academy/good-traces.mdx new file mode 100644 index 000000000..bba8df8f8 --- /dev/null +++ b/content/academy/good-traces.mdx @@ -0,0 +1,143 @@ +--- +title: What does a good trace look like? +description: A guide to structuring your Langfuse traces for effective debugging, evaluation, and cost tracking. +--- + +import { Fan, Wrench } from "lucide-react"; + +# What does a good trace look like? + +You see traces appearing in Langfuse, but how do you know if you've done it well? Here are a couple of things you can look at and optimize. + +## What's the scope of one trace? + +Langfuse's [data model](/docs/observability/data-model) has three levels of grouping: observations (individual steps) are grouped into traces via a `trace_id`, and traces can be grouped into sessions via a `session_id`. + +A trace represents one self-contained unit of work in your application. Good examples of a typical trace: +- One chatbot turn (user sends a message, your app retrieves context, calls the LLM, returns a response) +- One agent run (the agent receives a task, reasons, calls tools, and produces a result) +- One pipeline execution (a document comes in, gets chunked, embedded, and stored) + +If multiple of these happen in sequence, e.g. a multi-turn conversation, or several agent runs that feed into a final report, that's where [sessions](/docs/observability/features/sessions) come in. Each step is its own trace, and the session ties them together. + +A trace shows up in the Langfuse UI as a trace tree and an [agent graph](/docs/observability/features/agent-graphs): + +
+ +![Trace tree](/images/docs/faq/good-trace-tree.png) + +![Agent graph](/images/docs/faq/good-trace-agent-graph.png) + +
+ +## Look at the trace tree + +When you click on a trace, you see the trace tree. There are two things you can check: + +### Are the right steps showing up? + +You should see your LLM calls, tool calls, and other important steps represented in the tree. They should have the correct [observation type](/docs/observability/features/observation-types). + +For example +- an LLM call should show up as a `generation`. This is important because a `generation` can carry [cost, token usage, and model information](#track-model-tokens-and-cost-on-generations). +- a tool call should show up as a `tool`. You can then filter on tool call observations when you create [LLM-as-a-judge](/docs/evaluation/evaluation-methods/llm-as-a-judge) evaluators. + +Framework integrations typically set these types automatically. If you're instrumenting manually, you can set them via the `as_type` parameter (Python) or `asType` (JS/TS). See the [observation types docs](/docs/observability/features/observation-types) for a full list. + +### Is there noise you don't need? + +Not every observation in the tree is useful for understanding what your application did. HTTP spans, database queries, and framework internals often add clutter without giving you meaningful insight. If you see observations like these polluting your trace tree, you can [filter them out](/faq/all/unwanted-http-database-spans). + +
+![Noisy spans in a trace tree](/images/docs/faq/good-trace-noisy-spans.png) +
+ +## Choose good names + +Observation- and trace names are used in many places: + +- When setting up [LLM-as-a-judge evaluators](/docs/evaluation/evaluation-methods/llm-as-a-judge), you target specific observations by name. +- In [dashboards](/docs/metrics/features/custom-dashboards), you can filter and aggregate metrics by observation name. +- In the tracing table, names help you quickly identify what each step does. + +Try to name the observations after what they do: `classify-intent`, `generate-response`, `summarize-results`. This makes it easier to understand what each step does when you're looking at the trace tree, and makes filtering on a specific step easier. + +> Try not to name observations after the AI model used (`gpt-4o`, `claude-sonnet`). It breaks as soon as you swap models. The model is already a separate attribute on `generation` observations, use that instead. + +## Choose meaningful input and output + +In general, it's recommended that operations have an input and/or output. If an observation has neither, ask yourself if an observation is actually useful or if you can drop it. + +For your **most viewed observations**, take some extra care to set them up. You will likely create pre-filtered views on your tracing and session screens. The observations you filter for here are the ones that will get looked at the most. For these, ask yourself: **what do I need to see to quickly evaluate a trace/session at a glance?** + +![Tracing table with input and output](/images/docs/faq/good-trace-tracing-table-io.png) + +Typical input/output for `GENERATION` observations: +- For a chatbot: the user message (input) and the assistant response (output). +- For a RAG pipeline: the user query and the generated answer. +- For a classification task: the text being classified and the predicted label. 
+ +If your input and output fields are showing up empty unintentionally, see [why are the input and output of my trace empty?](/faq/all/empty-trace-input-and-output) + +## Useful attributes + +Observations have a number of attributes that can be useful for your use cases. These will allow you to go even further with filtering, scoring, and making dashboards. + +### Add metadata for context + +[Metadata](/docs/observability/features/metadata) is a flexible key-value store on each observation. Some data that might be useful to save under metadata: + +- **Evaluation context**: When configuring [LLM-as-a-judge evaluators](/docs/evaluation/evaluation-methods/llm-as-a-judge), you can reference metadata fields. This is useful for passing ground truth, expected behavior, or other context that the evaluator needs but that isn't part of the actual input/output. +- **Filtering**: You can filter by metadata keys in the Langfuse UI, which is helpful when you need to find traces with specific characteristics. +- **Annotation context**: When doing manual review, metadata gives annotators extra information to make better judgments. + +### Track model, tokens, and cost on generations [#track-model-tokens-and-cost-on-generations] + +If you want to understand what your LLM usage costs, broken down by model, by user, by feature, you need three things on your `generation` observations: + +- **Model name**: Langfuse uses this to look up pricing in the [model pricing table](/docs/model-usage-and-cost). If the model name doesn't match, Langfuse can't calculate cost automatically. +- **Usage details**: Input tokens, output tokens, and optionally cached tokens. This is what powers the token usage views in dashboards. +- **Cost details** (optional): If you want to override Langfuse's automatic pricing — for example, if you have a custom pricing agreement — you can pass cost explicitly. + +Most [integrations](/integrations) capture all of this automatically. 
If you're instrumenting manually, see the [token and cost tracking docs](/docs/observability/features/token-and-cost-tracking). + +You can see these attributes on the `GENERATION` observation in the Langfuse UI. + +![Generation attributes in Langfuse](/images/docs/faq/good-trace-generation-attributes.png) + +### Use tags for business-level dimensions + +[Tags](/docs/observability/features/tags) enable filtering and metric breakdowns across dimensions that matter to your business. Good tags answer questions like "how does latency differ between our `web` and `api` users?". + +One property of tags is that they are **immutable and must be set at observation creation time**. This makes them great for things you know upfront (where the request came from, which feature it's part of), but not for things you learn later. + +If you need to label traces based on something determined after the fact, like an [LLM-as-a-judge](/docs/evaluation/evaluation-methods/llm-as-a-judge) evaluation result, use [scores](/docs/evaluation/overview) instead. + +### Link prompts to traces + +If you [manage your prompts in Langfuse](/docs/prompt-management), you can [link them to your generations](/docs/prompt-management/features/link-to-traces). This lets you see which prompt version was used for a given trace, and track how metrics change across prompt versions. Useful when you're iterating on prompts and want to compare performance. + +### Set the environment + +Set the [environment](/docs/observability/features/environments) attribute (`production`, `staging`, `development`) so that your test traces don't pollute production dashboards and evaluations. + +### Track users with user IDs + +Setting the [user ID](/docs/observability/features/users) connects traces to specific users, which unlocks per-user views in Langfuse. Useful if you want to answer questions like: + +- Which users are costing us the most? +- How does output quality vary across users? 
+- What does a specific user's usage pattern look like? + +### Group related traces with session IDs + +If your application involves multiple traces that logically belong together, group them into a [session](/docs/observability/features/sessions). This gives you a session replay view where you can see the full interaction in sequence. + +This makes sense when: +- You're building a chatbot (each user message creates a new trace, but the whole conversation is one session) +- You have multiple agents that each contribute to a final output (e.g., five agents that collaborate to produce a report) +- Your workflow spans multiple requests with human-in-the-loop steps in between + +If your application is single-request/single-response with no continuity between calls, you probably don't need sessions. + +![Sessions view in Langfuse](/images/docs/faq/good-trace-sessions-view.png) diff --git a/content/academy/index.mdx b/content/academy/index.mdx new file mode 100644 index 000000000..2bad58e2f --- /dev/null +++ b/content/academy/index.mdx @@ -0,0 +1,39 @@ +--- +title: Langfuse Academy +description: Learn how to build, evaluate, and improve LLM applications. Conceptual guides on observability, evaluation, prompt management, and production monitoring. +--- + +# Langfuse Academy + +The Academy is a collection of conceptual guides that help you understand how to build a good setup for your LLM application. It's not about how to integrate a specific SDK — that's what the [docs](/docs) are for. Instead, it covers the thinking behind the decisions: what does good look like, what processes should you follow, and what should you keep in mind? + +## Sections + +import { Eye, FlaskConical, MessageSquareText, Activity } from "lucide-react"; + + + } + title="LLM Observability" + description="How LLM observability differs from traditional observability, core concepts, and what good traces look like." 
+ href="/academy/observability-overview" + /> + } + title="Evaluating LLM Applications" + description="The evaluation loop, choosing what to evaluate, designing evals, building datasets, and example setups." + href="/academy/evaluation-overview" + /> + } + title="Prompt Management" + description="When prompt management makes sense, version control, deployment strategies, and composability." + href="/academy/prompt-management-overview" + /> + } + title="Monitoring & Improving in Production" + description="What to monitor, cost tracking, user feedback, closing the loop, and security guardrails." + href="/academy/monitoring-overview" + /> + diff --git a/content/academy/llm-observability-vs-traditional.mdx b/content/academy/llm-observability-vs-traditional.mdx new file mode 100644 index 000000000..e76e2d8db --- /dev/null +++ b/content/academy/llm-observability-vs-traditional.mdx @@ -0,0 +1,8 @@ +--- +title: How Is LLM Observability Different? +description: TODO +--- + +# How Is LLM Observability Different? 
+ +TODO diff --git a/content/academy/meta.json b/content/academy/meta.json new file mode 100644 index 000000000..8d22e9d41 --- /dev/null +++ b/content/academy/meta.json @@ -0,0 +1,32 @@ +{ + "title": "Academy", + "pages": [ + "index", + "---LLM Observability---", + "observability-overview", + "llm-observability-vs-traditional", + "observability-core-concepts", + "good-traces", + "observability-use-case-examples", + "---Evaluating LLM Applications---", + "evaluation-overview", + "evaluation-loop", + "what-to-evaluate", + "designing-evals", + "building-datasets", + "evaluation-example-setups", + "---Prompt Management---", + "prompt-management-overview", + "prompt-management-when-it-makes-sense", + "prompt-version-control", + "prompt-deployment-strategies", + "prompt-composability", + "---Monitoring & Improving in Production---", + "monitoring-overview", + "what-to-monitor", + "cost-tracking", + "user-feedback", + "closing-the-loop", + "security-and-guardrails" + ] +} diff --git a/content/academy/monitoring-overview.mdx b/content/academy/monitoring-overview.mdx new file mode 100644 index 000000000..bc94067ec --- /dev/null +++ b/content/academy/monitoring-overview.mdx @@ -0,0 +1,14 @@ +--- +title: Monitoring & Improving in Production +description: You shipped your LLM application — now what? Learn what to monitor, how to track costs, collect feedback, and continuously improve. +--- + +# Monitoring & Improving in Production + +You shipped your LLM application — now learn what to monitor, how to track costs, collect feedback, and continuously improve. 
+ +- [What to monitor](/academy/what-to-monitor) +- [Cost tracking and optimization](/academy/cost-tracking) +- [Collecting and using user feedback](/academy/user-feedback) +- [Closing the loop](/academy/closing-the-loop) +- [Security and guardrails](/academy/security-and-guardrails) diff --git a/content/academy/observability-core-concepts.mdx b/content/academy/observability-core-concepts.mdx new file mode 100644 index 000000000..024a1588d --- /dev/null +++ b/content/academy/observability-core-concepts.mdx @@ -0,0 +1,151 @@ +--- +title: Observability Core Concepts +description: Understand how Langfuse structures traces, observations, and sessions — the building blocks of LLM observability. +--- + +# Core Concepts + +This page digs into the underlying concepts of how Langfuse structures and captures your data. Understanding these will make setting up and working with your traces easier. + +## Observations, Traces, and Sessions + +Langfuse organizes an application's data into three core concepts: observations, traces, and sessions. + +```mermaid +flowchart LR + + subgraph SESSION [session] + direction TB + + subgraph TRACE1 [trace] + direction LR + O1[observation] + O2[observation] + O3[observation] + end + + subgraph TRACE2 [trace] + direction LR + O4[observation] + O5[observation] + O6[observation] + end + + subgraph TRACE3 [trace] + direction LR + O7[observation] + O8[observation] + O9[observation] + end + + end + classDef sessionBox fill:none,stroke:#888,stroke-width:2px; + classDef traceBox fill:none,stroke:#aaa,stroke-width:1.5px; + + %% Assign classes + class SESSION sessionBox; + class TRACE1,TRACE2,TRACE3 traceBox; +``` + +import ObservationTypesList from "@/components-mdx/observation-types-list.mdx"; + +### Observations + +`Observations` are the individual steps within a trace. 
Langfuse supports a number of LLM application specific [observation types](/docs/observability/features/observation-types), for example _generations_, _toolcalls_, _RAG retrieval steps_, etc. + +Observations can be nested. The example below shows a trace with a nested observation. + +
+ +
+ +Hierarchical structure of observations in Langfuse + +
+ +```mermaid +classDiagram + Trace "1" *-- "n" Observation + Observation o-- Observation: Nesting +``` + +
+ +
+ +
+ +Example trace in Langfuse UI + +![Trace in Langfuse UI](/images/docs/tracing-observation-tree-light.png) + +
+ +
+ +Example trace in Langfuse UI + +![Trace in Langfuse UI](/images/docs/tracing-observation-tree-dark.png) + +
+ +
+ +### Traces + +A `trace` typically represents a single request or operation. +For example, when a user asks a question to a chatbot, that interaction, from the user's question to the bot's response, is captured as one trace. + +It serves as container of observations. Trace attributes such as `user_id`, `session_id`, `tags`, `metadata`, etc. are propagated to all observations within the trace. + +### Sessions + +Optionally, traces can be grouped into [sessions](/docs/observability/features/sessions). +Sessions are used to group traces that are part of the same user interaction. +A common example is a thread in a chat interface. + +
+ +
+ +Optionally, sessions aggregate traces + +
+ +```mermaid +classDiagram + Session "1" o-- "n" Trace +``` + +
+ +
+ +
+ +Example session in Langfuse UI + +![Session view](/images/docs/session.png) + +
+ +
+ +Using sessions is recommended for applications with multi-turn conversations or workflows. Please refer to the [Sessions](/docs/observability/features/sessions) documentation to add sessions to your traces. + +## Adding Attributes + +Once you've structured your data into traces and observations, you can enrich them with additional attributes. These attributes act as labels that help you filter, segment, and analyze your traces for specific use cases. + +There are different types of attributes you can add: + +| Attribute | Description | +|-----------|-------------| +| [Environments](/docs/observability/features/environments) | Separate data from different deployment contexts like `production`, `staging`, or `development` | +| [Tags](/docs/observability/features/tags) | Flexible labels to categorize traces by feature, API endpoint, or workflow | +| [User](/docs/observability/features/users) | Track which end-user triggered each trace | +| [Metadata](/docs/observability/features/metadata) | Flexible key-value store for custom information | +| [Releases & Versions](/docs/observability/features/releases-and-versioning) | Track application versions and component changes | + +The next page goes deeper into when it makes sense to add certain attributes. + diff --git a/content/academy/observability-overview.mdx b/content/academy/observability-overview.mdx new file mode 100644 index 000000000..a7f9860fb --- /dev/null +++ b/content/academy/observability-overview.mdx @@ -0,0 +1,15 @@ +--- +title: LLM Observability +description: Understand what LLM observability is, how it differs from traditional observability, and why it matters for building reliable AI applications. +--- + +# LLM Observability + +This section dives into LLM observability. It's the first step towards a solid AI setup and one you shouldn't wait on. Even if you don't feel ready for other aspects of LLM engineering, having traces of your agents is always a good idea. 
You can already study their behavior, make small fixes based on what you see, and form a more grounded opinion on what good looks like for you agent in specific situations. + +This section covers the following topics: + +- [How is LLM observability different from traditional logging?](/academy/llm-observability-vs-traditional) +- [Core observability concepts in Langfuse](/academy/observability-core-concepts) +- [What does a good trace look like?](/academy/good-traces) +- [Use-case specific example setups](/academy/observability-use-case-examples) diff --git a/content/academy/observability-use-case-examples.mdx b/content/academy/observability-use-case-examples.mdx new file mode 100644 index 000000000..cbf5d2ca6 --- /dev/null +++ b/content/academy/observability-use-case-examples.mdx @@ -0,0 +1,12 @@ +--- +title: Use Case Examples +description: TODO +--- + +# Use Case Specific Examples + +TODO: +- refer to demo project +- discuss + - a complicated customer support chatbot trace + - ... diff --git a/content/academy/prompt-composability.mdx b/content/academy/prompt-composability.mdx new file mode 100644 index 000000000..0b4ca1600 --- /dev/null +++ b/content/academy/prompt-composability.mdx @@ -0,0 +1,67 @@ +--- +title: Prompt Composability +description: Reference other prompts in your prompts using a simple tag format to create modular, reusable prompt components. +--- + +import { FaqPreview } from "@/components/faq/FaqPreview"; + +# Prompt Composability + +As you create more prompts, you will often find yourself using the same snippets of text or instructions in multiple prompts. To avoid duplication, you can compose prompts by referencing other prompts. + +