diff --git a/app/academy/[[...slug]]/not-found.tsx b/app/academy/[[...slug]]/not-found.tsx new file mode 100644 index 0000000000..5cd2ecec16 --- /dev/null +++ b/app/academy/[[...slug]]/not-found.tsx @@ -0,0 +1,15 @@ +import Link from "next/link"; + +export default function AcademyNotFound() { + return ( +
+

Page not found

+

+ The academy page you're looking for doesn't exist or has moved. +

+ + Back to Academy + +
+ ); +} diff --git a/app/academy/[[...slug]]/page.tsx b/app/academy/[[...slug]]/page.tsx new file mode 100644 index 0000000000..0d6cb1bf80 --- /dev/null +++ b/app/academy/[[...slug]]/page.tsx @@ -0,0 +1,71 @@ +import type { Metadata } from "next"; +import { academySource } from "@/lib/source"; +import { buildOgImageUrl, buildPageUrl } from "@/lib/og-url"; +import { DocsPage } from "fumadocs-ui/page"; +import { notFound } from "next/navigation"; +import { DocsContributors } from "@/components/DocsContributors"; +import { DocBodyChrome } from "@/components/DocBodyChrome"; +import { getMDXComponents } from "@/mdx-components"; +import type { ComponentType } from "react"; + +type PageProps = { + params: Promise<{ slug?: string[] }>; +}; + +export default async function AcademyPage(props: PageProps) { + const params = await props.params; + const slug = params.slug ?? []; + const page = academySource.getPage(slug); + + if (!page) notFound(); + + const { toc } = page.data; + const MDX = page.data.body as ComponentType<{ components?: Record }>; + + return ( + }} + > + + + + + ); +} + +export async function generateMetadata(props: PageProps): Promise { + const params = await props.params; + const slug = params.slug ?? []; + const page = academySource.getPage(slug); + if (!page) + return { + title: "Not Found", + }; + const pageData = page.data as typeof page.data & { + canonical?: string | null; + seoTitle?: string | null; + ogImage?: string | null; + }; + const pagePath = `/academy${slug.length > 0 ? `/${slug.join("/")}` : ""}`; + const canonicalUrl = pageData.canonical ?? buildPageUrl(pagePath); + const seoTitle = pageData.seoTitle || page.data.title; + const ogImage = buildOgImageUrl({ + title: seoTitle, + description: page.data.description, + section: "Academy", + staticOgImage: pageData.ogImage, + }); + return { + title: seoTitle, + description: page.data.description ?? 
undefined, + alternates: { canonical: canonicalUrl }, + openGraph: { images: [{ url: ogImage }], url: canonicalUrl }, + twitter: { images: [{ url: ogImage }] }, + }; +} + +export function generateStaticParams() { + return academySource.generateParams(); +} diff --git a/app/academy/layout.tsx b/app/academy/layout.tsx new file mode 100644 index 0000000000..f24fdaceb2 --- /dev/null +++ b/app/academy/layout.tsx @@ -0,0 +1,10 @@ +import { academySource, getPageTreeWithShortTitles } from "@/lib/source"; +import { SharedDocsLayout } from "@/app/docs/SharedDocsLayout"; + +export default function AcademyLayout({ + children, +}: { + children: React.ReactNode; +}) { + return {children}; +} diff --git a/components/NavLinks.tsx b/components/NavLinks.tsx index 5624a7eb70..fceaa01176 100644 --- a/components/NavLinks.tsx +++ b/components/NavLinks.tsx @@ -22,6 +22,7 @@ const productLinks = [ ]; const resourcesLinks = [ + { name: "Academy", href: "/academy" }, { name: "Blog", href: "/blog" }, { name: "Changelog", href: "/changelog" }, { name: "Roadmap", href: "/docs/roadmap" }, diff --git a/components/academy/ImprovementLoop.tsx b/components/academy/ImprovementLoop.tsx new file mode 100644 index 0000000000..f29652a419 --- /dev/null +++ b/components/academy/ImprovementLoop.tsx @@ -0,0 +1,491 @@ +"use client"; + +import React, { useState } from "react"; +import { motion, AnimatePresence } from "framer-motion"; + +type Phase = "all" | "manual" | "automated"; + +const BOX_W = 180; +const BOX_H = 80; + +// The "observe" box changes label based on phase +const OBSERVE_LABELS: Record = { + all: { label: "Monitor & Review", sublabel: "Online evals + manual review" }, + manual: { label: "Review & Annotate", sublabel: "Debug, spot issues, label data" }, + automated: { label: "Monitor", sublabel: "Online evals, automated scoring" }, +}; + +const BOXES = [ + { + id: "tracing", + label: "Tracing", + sublabel: "Capture real user data", + x: 560, + y: 220, + manual: true, + automated: true, + }, + 
{ + id: "observe", + label: "", // dynamic + sublabel: "", // dynamic + x: 300, + y: 220, + manual: true, + automated: true, + }, + { + id: "datasets", + label: "Build Datasets", + sublabel: "Happy path, edge cases, adversarial inputs", + x: 20, + y: 220, + manual: true, + automated: true, + }, + { + id: "changes", + label: "Make Changes", + sublabel: "Prompt management, versioning", + x: 20, + y: 20, + manual: true, + automated: true, + }, + { + id: "experiments", + label: "Run Experiments", + sublabel: "Validate changes against datasets", + x: 300, + y: 20, + manual: true, + automated: true, + }, + { + id: "evaluate", + label: "Evaluate", + sublabel: "Automated quality scoring at scale", + x: 560, + y: 20, + manual: false, + automated: true, + }, +] as const; + +// Connections: [fromId, toId, manual, automated] +const CONNECTIONS: [string, string, boolean, boolean][] = [ + ["tracing", "observe", true, true], + ["observe", "datasets", true, true], + ["datasets", "changes", true, true], + ["changes", "experiments", true, true], + ["experiments", "tracing", true, false], // manual: experiments -> deploy -> tracing + ["experiments", "evaluate", false, true], + ["evaluate", "tracing", false, true], // automated: evaluate -> deploy -> tracing +]; + +function getConnectionPoints( + fromId: string, + toId: string +): { x1: number; y1: number; x2: number; y2: number } { + const from = BOXES.find((b) => b.id === fromId)!; + const to = BOXES.find((b) => b.id === toId)!; + + const fromCenter = { x: from.x + BOX_W / 2, y: from.y + BOX_H / 2 }; + const toCenter = { x: to.x + BOX_W / 2, y: to.y + BOX_H / 2 }; + + const dx = toCenter.x - fromCenter.x; + const dy = toCenter.y - fromCenter.y; + + let x1: number, y1: number, x2: number, y2: number; + + if (Math.abs(dx) > Math.abs(dy)) { + if (dx > 0) { + x1 = from.x + BOX_W; + y1 = fromCenter.y; + x2 = to.x; + y2 = toCenter.y; + } else { + x1 = from.x; + y1 = fromCenter.y; + x2 = to.x + BOX_W; + y2 = toCenter.y; + } + } else { + if 
(dy > 0) { + x1 = fromCenter.x; + y1 = from.y + BOX_H; + x2 = toCenter.x; + y2 = to.y; + } else { + x1 = fromCenter.x; + y1 = from.y; + x2 = toCenter.x; + y2 = to.y + BOX_H; + } + } + + return { x1, y1, x2, y2 }; +} + +function Arrow({ + fromId, + toId, + active, + dimmed, +}: { + fromId: string; + toId: string; + active: boolean; + dimmed: boolean; +}) { + const { x1, y1, x2, y2 } = getConnectionPoints(fromId, toId); + + const midX = (x1 + x2) / 2; + const midY = (y1 + y2) / 2; + + const dx = x2 - x1; + const dy = y2 - y1; + const dist = Math.sqrt(dx * dx + dy * dy); + const curvature = dist * 0.15; + + const nx = -dy / dist; + const ny = dx / dist; + const cx = midX + nx * curvature; + const cy = midY + ny * curvature; + + const markerId = `arrow-${fromId}-${toId}`; + + const arrowLen = 8; + const endDx = x2 - cx; + const endDy = y2 - cy; + const endDist = Math.sqrt(endDx * endDx + endDy * endDy); + const shortenedX2 = x2 - (endDx / endDist) * arrowLen; + const shortenedY2 = y2 - (endDy / endDist) * arrowLen; + + return ( + + + + + + + + + ); +} + +function DynamicBox({ + x, + y, + phase, + active, + href, +}: { + x: number; + y: number; + phase: Phase; + active: boolean; + href: string; +}) { + const { label, sublabel } = OBSERVE_LABELS[phase]; + + return ( + + + + + + {label} + + + {sublabel} + + + + + ); +} + +function Box({ + label, + sublabel, + x, + y, + active, + dimmed, + href, +}: { + label: string; + sublabel: string; + x: number; + y: number; + active: boolean; + dimmed: boolean; + href: string; +}) { + return ( + + + + + {label} + + + {sublabel} + + + + ); +} + +function DeployLabel({ phase }: { phase: Phase }) { + const fromId = phase === "manual" ? 
"experiments" : "evaluate"; + const { x1, y1, x2, y2 } = getConnectionPoints(fromId, "tracing"); + const mx = (x1 + x2) / 2 + 15; + const my = (y1 + y2) / 2; + + return ( + + Deploy + + ); +} + +export function ImprovementLoop() { + const [phase, setPhase] = useState("all"); + + const isBoxActive = (box: (typeof BOXES)[number]) => { + if (phase === "all") return false; + if (phase === "manual") return box.manual; + if (phase === "automated") return box.automated; + return false; + }; + + const isBoxDimmed = (box: (typeof BOXES)[number]) => { + if (phase === "all") return false; + if (phase === "manual") return !box.manual; + if (phase === "automated") return !box.automated; + return false; + }; + + const isConnectionActive = (conn: (typeof CONNECTIONS)[number]) => { + if (phase === "all") return false; + if (phase === "manual") return conn[2]; + if (phase === "automated") return conn[3]; + return false; + }; + + const isConnectionDimmed = (conn: (typeof CONNECTIONS)[number]) => { + if (phase === "all") return false; + if (phase === "manual") return !conn[2]; + if (phase === "automated") return !conn[3]; + return false; + }; + + const sectionHrefs: Record = { + tracing: "/academy/tracing", + observe: "/academy/review-and-annotate", + datasets: "/academy/datasets", + changes: "/academy/make-changes", + experiments: "/academy/experiments", + evaluate: "/academy/evaluate", + }; + + return ( +
+ {/* SVG Diagram */} +
+ + {/* Arrows */} + {CONNECTIONS.map(([from, to, p1, p2]) => ( + + ))} + + {/* Deploy labels */} + {(phase === "all" || phase === "manual") && ( + + )} + {phase === "automated" && } + + {/* Static boxes */} + {BOXES.filter((b) => b.id !== "observe").map((box) => ( + + ))} + + {/* Dynamic observe box */} + + +
+ + {/* Phase toggle */} +
+ {( + [ + ["all", "Full Loop"], + ["manual", "Manual"], + ["automated", "Automated"], + ] as const + ).map(([key, label]) => ( + + ))} +
+ + {/* Phase description */} + + + {phase === "all" && ( +

+ The full continuous improvement loop. Start with the manual path + to build your foundation, then add automation as your product matures. +

+ )} + {phase === "manual" && ( +

+ Start here. Manually review your traces, annotate what you see, build + datasets from real production data, and run experiments to validate + changes before deploying. +

+ )} + {phase === "automated" && ( +

+ Once you know what to measure, add monitoring with online evaluations + and automated scoring to your loop — letting you iterate faster at scale. +

+ )} +
+
+
+ ); +} diff --git a/content/academy/case-studies/overview.mdx b/content/academy/case-studies/overview.mdx new file mode 100644 index 0000000000..8a8ee62360 --- /dev/null +++ b/content/academy/case-studies/overview.mdx @@ -0,0 +1,61 @@ +--- +title: Case Studies +description: Real application examples with detailed Langfuse setups — traces, evaluators, datasets, annotation queues, and more. +--- + +# Case Studies + +Let's dissect a couple of examples. + +## Customer support chatbot + +For this one we'll refer to the [Langfuse demo project](/docs/demo). + +things to highlight: +- identify main observation (handle-chatbot-message) + clear input/output + clear naming + session ID, user ID +- hierarchy makes sense, observation types +- token usage, latency and cost are present on relevant observations +- on the actual generation call, the model, prompt are present +- toolcalls: + input and output are there. readability on this is a bit less important to us, we dont really look at it in a table to get an overview. And we can still use jsonpath easily for mappings where we need it (evaluators, putting in dataset, ...) + +## RAG pipeline + +Context: an internal knowledge base assistant. Users ask questions, the app retrieves relevant docs from a vector store, then generates an answer grounded in those docs. + +Trace structure: +- trace: `answer-question` + - span: `retrieve-documents` — input: user query, output: list of retrieved chunks + - generation: `generate-answer` — input: user query + retrieved context, output: final answer + +Things to highlight: +- metadata on `retrieve-documents`: number of chunks retrieved, similarity scores, collection name. Useful for filtering (e.g. find all traces where retrieval returned fewer than 3 chunks) +- prompt management: the system prompt that instructs the model how to use retrieved context is managed in Langfuse and linked to the `generate-answer` generation. 
Can track how answer quality changes across prompt versions +- clear input/output on the trace level: user question as input, final answer as output. Makes the tracing table scannable +- environment attribute set to `development` / `production` so test queries don't pollute prod dashboards + +## Coding agent + +Context: a coding agent that takes a task description, plans an approach, then executes steps (reading files, writing code, running tests) in a loop until done. + +Trace structure: +- trace: `execute-task` + - generation: `plan` — input: task description, output: step-by-step plan + - span: `execution-loop` (may repeat multiple times) + - tool: `read-file` — input: file path, output: file contents + - generation: `decide-next-step` — input: current state, output: next action + - tool: `write-file` — input: file path + content + - tool: `run-tests` — input: test command, output: test results + - generation: `summarize` — input: execution history, output: summary for user + +Things to highlight: +- agent graph: the trace tree is deep and branching, the agent graph view makes it much easier to understand the flow +- tags: set at trace creation to capture business-level dimensions. E.g. `source:slack`, `task-type:refactor`, `repo:frontend`. Enables filtering and dashboard breakdowns ("how does cost differ between `refactor` and `bugfix` tasks?") +- cost tracking: many LLM calls in one trace, cost rolls up to the trace level. Important to monitor since agent runs can get expensive +- deep hierarchy: the execution loop span groups related observations together, making it easy to see what happened in each iteration even when the trace has 20+ observations + +## What clicksights is doing? 
+See circleback call diff --git a/content/academy/datasets/meta.json b/content/academy/datasets/meta.json new file mode 100644 index 0000000000..dc308f6c65 --- /dev/null +++ b/content/academy/datasets/meta.json @@ -0,0 +1,6 @@ +{ + "title": "Build Datasets", + "pages": [ + "overview" + ] +} diff --git a/content/academy/datasets/overview.mdx b/content/academy/datasets/overview.mdx new file mode 100644 index 0000000000..ecf4ca3ade --- /dev/null +++ b/content/academy/datasets/overview.mdx @@ -0,0 +1,8 @@ +--- +title: Build Datasets +description: Curate production traces into test datasets for systematic offline evaluation. +--- + +# Build Datasets + +_Content coming soon._ diff --git a/content/academy/evaluate/designing-evals.mdx b/content/academy/evaluate/designing-evals.mdx new file mode 100644 index 0000000000..cb8ab57d45 --- /dev/null +++ b/content/academy/evaluate/designing-evals.mdx @@ -0,0 +1,110 @@ +--- +title: Designing Evals +description: A practical guide to setting up automated evaluations for LLM applications and AI agents. +--- + +# Designing Good Evals + +In AI development, iterating quickly is important. Whether you're refining a prompt, swapping a model, or changing your application logic, you need to understand the impact of each change on performance. Manually annotating outputs after every modification is slow and expensive, especially when you want to integrate evaluations into a CI/CD pipeline. + +**Automated evaluators** solve this problem by providing a scalable way to measure and monitor your application's failure modes, enabling a fast and effective development loop. + +_The framework in this guide is adapted from Hamel Husain's [Eval FAQ](https://hamel.dev/blog/posts/evals-faq/)._ + +--- + +This guide describes a process to **build automated evaluators** for your application. This is a robust evaluator that you can scale for different tests and evolutions of your application: + +1. [What to Measure](#what-to-measure) +2. 
[How to Measure](#how-to-measure) +3. [Draft your LLM-as-a-Judge prompt](#draft-prompt) +4. [Validate your evaluator](#validate-evaluator) + +--- + +I'll demonstrate this process using an [example chatbot in the Langfuse documentation](/docs/demo) that uses the Vercel AI SDK and has access to a RAG tool to retrieve documents from the Langfuse documentation. The example chat app logs traces into the Langfuse example project and has already answered 19k user queries in the past year. + +Here's the chat interface (you can find the example chat app [here](/docs/demo)): + + + ![Chat + Interface](/images/blog/2025-08-29-error-analysis-to-evaluate-llm-applications/demo-chat.png) + + +## What to Measure [#what-to-measure] + +In the [previous blog post](/blog/2025-08-29-error-analysis-to-evaluate-llm-applications), we showed how to perform **error analysis** and identify failure modes in your application. Now we will focus on building automated evaluators to measure these failure modes. + +Before building an evaluator, it's important to differentiate between two types of failures to prioritize your efforts: + +**Missing Instructions:** The first type consists of errors caused by vague or incomplete instructions in your prompt. For instance, if your agent uses too many bullet points or doesn't ask follow-up questions, and you never instructed it to do so, the first step is to fix the prompt. Creating an evaluator for a failure that a simple prompt tweak can solve is often unnecessary effort. + +**Model Limitations:** The second type occurs when the LLM fails to perform correctly despite receiving clear and precise instructions. These are the ideal candidates for automated evaluation because they represent the model's inherent limitations, not a misunderstanding of your intent. + +Let's apply this to the [Langfuse Example App](/docs/demo). 
First, we fix some obvious issues by clarifying the prompt: "use a maximum of three bullet points" and "ask for more context when the user's query is ambiguous." With those fixed, we can focus on measuring the more complex model limitation failures we identified in the [previous blog post](/blog/2025-08-29-error-analysis-to-evaluate-llm-applications): + +* **Out of Scope**: The agent answers a question not related to Langfuse or the LLM/AI space. +* **Generic Responses**: The answer is technically correct but doesn't resolve the user's issue. The metric is to assess if the agent's final answer is helpful and directly addresses the user's question. +* **Context Retrieval / RAG Issues**: The agent uses the wrong retrieval tool. The metric needs to judge if the correct tool was chosen based on the user's query. + +For this guide, we will set up an evaluator for the "Out of Scope" failure mode. + +## How to Measure [#how-to-measure] + +In Langfuse, all evaluations are tracked as **Scores**, which [can be attached to traces, observations, sessions or dataset runs](/docs/evaluation/experiments/data-model#scores). Evaluations in Langfuse can be set up in two main ways: + +**In the Langfuse UI:** In Langfuse, you can set up **LLM-as-a-Judge Evaluators** that use another LLM to evaluate your application's output on subjective and nuanced criteria. These are easily configured directly in Langfuse. For a guide on setting them up in the UI, check the documentation on **[LLM-as-a-Judge evaluators](/docs/evaluation/evaluation-methods/llm-as-a-judge)**. + +**External Evaluators:** In your code, you can set up **Custom Evaluators** and use the Langfuse SDKs to send the scores back to the evaluated traces. This allows you to set up code-based evaluators or any other custom evaluation logic. For an example of a custom pipeline, see the guide on **[setting up an external evaluation pipeline](/guides/cookbook/example_external_evaluation_pipelines)**. 
+ +In this guide, we will set up an LLM-as-a-Judge evaluator in the Langfuse UI. + +## Drafting your LLM-as-a-Judge Prompt [#draft-prompt] + +A good LLM-as-a-Judge prompt is narrowly focused and well-structured: + +1. **Pick one Failure Mode**: Focus on one specific failure mode. Do not try to cover multiple failure modes at once. +2. **Pass/Fail Definitions**: Clearly define what constitutes a "Pass" (failure is absent) and a "Fail" (failure is present). +3. **Examples**: Include clear examples of both "Pass" and "Fail" cases. + +Here is an example prompt I use in our Example App to check if the agent's answer is within its scope: + + + ![LLM-as-a-Judge Prompt](/images/blog/2025-09-05-automated-evaluations/prompt.png) + + +Once you drafted your prompt, you can set up the LLM-as-a-Judge evaluator in the Langfuse UI. You can find a guide on how to set them up in the UI [here](/docs/evaluation/evaluation-methods/llm-as-a-judge). + + + ![Evaluator Setup](/images/blog/2025-09-05-automated-evaluations/setup-evaluator.png) + + +## Validating Your Evaluator [#validate-evaluator] + +To build an evaluator you can trust, you must validate its performance against human judgment, a process similar to testing a machine learning classifier. + +First, split a set of human-labeled traces into a **development set** and a **test set**. In Langfuse, you can use tags to manage these sets. The **development set** is used to iteratively refine your judge's prompt. Run the evaluator on this set, compare its scores to the human labels, analyze the disagreements, and adjust the prompt's definitions or examples until its judgments closely align with yours: + + + ![Evaluator Tuning](/images/blog/2025-09-05-automated-evaluations/evaluator-tuning.png) + + +Additionally, you can measure the judge's alignment with human labels. The best metrics for this are **True Positive Rate (TPR)** and **True Negative Rate (TNR)**. 
TPR measures what fraction of actual "Passes" your judge correctly identifies, while TNR measures what fraction of actual "Fails" it correctly identifies. + +You can calculate these metrics by querying the data from the trace table (via [UI export](/docs/api-and-data-platform/features/export-from-ui) or [SDKs](/docs/api-and-data-platform/features/query-via-sdk)) and calculating the metrics. + +Once your judge performs well on the dev set (e.g., TPR and TNR \>90%), run it a final time on the held-out **test set** to get an unbiased measure of its real-world performance. A validated evaluator with high TPR and TNR gives you confidence that your automated metrics are meaningful. + +You can now repeat this process for both your Evaluators in the Langfuse UI and your custom evaluators as part of an [external evaluation pipeline](/guides/cookbook/example_external_evaluation_pipelines). + +## Next Steps [#operationalize-evaluator] + +With good automated evaluators in place, the next step is to operationalize your workflow. The goal is to create a CI/CD pipeline where every code change (to a prompt, model, or tool) automatically triggers an evaluation run on a **golden [Langfuse Dataset](/docs/datasets)**. + +Your automated evaluators will score these runs, providing immediate feedback on how your changes impacted key failure modes. This continuous monitoring loop helps you develop faster while maintaining a high quality bar for your application. 
+ +import { FileCode } from "lucide-react"; + + + } arrow /> + diff --git a/content/academy/evaluate/meta.json b/content/academy/evaluate/meta.json new file mode 100644 index 0000000000..55314d79db --- /dev/null +++ b/content/academy/evaluate/meta.json @@ -0,0 +1,8 @@ +{ + "title": "Evaluate", + "pages": [ + "overview", + "what-to-evaluate", + "designing-evals" + ] +} diff --git a/content/academy/evaluate/overview.mdx b/content/academy/evaluate/overview.mdx new file mode 100644 index 0000000000..0f259592f3 --- /dev/null +++ b/content/academy/evaluate/overview.mdx @@ -0,0 +1,8 @@ +--- +title: Evaluate +description: Build automated evaluators that score traces and experiment results at scale. +--- + +# Evaluate + +_Content coming soon._ diff --git a/content/academy/evaluate/what-to-evaluate.mdx b/content/academy/evaluate/what-to-evaluate.mdx new file mode 100644 index 0000000000..7b012443d3 --- /dev/null +++ b/content/academy/evaluate/what-to-evaluate.mdx @@ -0,0 +1,173 @@ +--- +title: What to Evaluate +description: A practical guide to identifying, categorizing, and analyzing failure modes in LLM applications using error analysis. +--- + +# What to evaluate + + + +To improve your LLM app, you must understand **how it fails**. Aggregate metrics won't tell you if your system retrieves the wrong documents or if the model's tone alienates users. **Error analysis** provides this crucial context. + +_The framework in this guide is adapted from Hamel Husain's [Eval FAQ](https://hamel.dev/blog/posts/evals-faq/why-is-error-analysis-so-important-in-llm-evals-and-how-is-it-performed.html)._ + +--- + +This guide describes a four-step process to **identify, categorize, and quantify your application's unique failure modes**. The result is a specific evaluation framework that is far more useful than generic metrics: + +1. [Gather a diverse dataset of traces](#gather-dataset) +2. [Open code to surface failure patterns](#open-coding) +3. 
[Structure failure modes](#structure-failure-modes) +4. [Label and quantify](#label-and-quantify) + +--- + +I'll demonstrate this process using an [example chatbot in the Langfuse documentation](/docs/demo) that uses the Vercel AI SDK and has access to a RAG tool to retrieve documents from the Langfuse documentation. The example chat app logs traces into the Langfuse example project and has already answered 19k user queries in the past year. + +Here's the chat interface (you can find the example chat app [here](/docs/demo)): + + + ![Chat + Interface](/images/blog/2025-08-29-error-analysis-to-evaluate-llm-applications/demo-chat.png) + + +## 1. Gather a Diverse Dataset [#gather-dataset] + +To start our error analysis, we assemble a representative dataset of 50-100 traces produced by the example chat app. The quality of your analysis depends on the diversity of this initial data. + +**Existing Production Traces:** If you already have real user traces, as in our example, create your dataset based on them. I recommend first manually clicking through your traces, focusing only on the user input, and adding a diverse set of traces to an annotation queue. + +You can also query for traces with negative user feedback, long conversations, high latency, or specific user metadata. The goal is not a random sample, but a set that covers a wide range of user intents and potential edge cases. + +In Langfuse, you can bulk add traces to an annotation queue [or a dataset](/changelog/2025-12-11-batch-add-observations-to-dataset) by clicking the "Actions" button: + + + ![Add to Annotation + Queue](/images/blog/2025-08-29-error-analysis-to-evaluate-llm-applications/add-to-annotation-queue.png) + + +**Synthetic Dataset:** If you lack production data, generate a synthetic dataset covering anticipated user behaviors and potential failure points. We have a Python cookbook that shows how to do this [here](/guides/cookbook/example_synthetic_datasets). 
Once created, add these traces to a Langfuse Annotation Queue. Note that the quality of your dataset matters a lot for the success of your error analysis; it needs to be diverse and representative of the real world. + +The [Annotation Queue](/docs/evaluation/evaluation-methods/annotation#annotation-queues) we created will serve as your workspace for the analysis. For our example chatbot, we selected 40 traces reflecting different user questions, from simple definitions to complex comparisons: + + + ![Annotation + Queue](/images/blog/2025-08-29-error-analysis-to-evaluate-llm-applications/annotation-queue.png) + + +## 2. Open Coding: Surface Failure Patterns [#open-coding] + +In the next step, we open our Annotation Queue and carefully review every trace and its associated tool use. The objective is to apply raw, descriptive labels without forcing them into predefined categories. + +For each trace, assign two annotations: + +- A binary score: Pass or Fail. This forces a clear judgment call. + +- A free-text comment: Describe the first point of failure you observe. This process is called [open coding](https://hamel.dev/blog/posts/evals-faq/#open-coding), as we are not forcing any categories on the data. + +If you have traces with multiple errors, focusing on the first failure is efficient. A single upstream error, like incorrect document retrieval, often causes multiple downstream issues. Fixing the root cause resolves them all. Your comment should be a raw observation, not a premature diagnosis. 
+ +Here are some examples from our example chat app: + +import { + Carousel, + CarouselContent, + CarouselItem, + CarouselPrevious, + CarouselNext, +} from "@/components/ui/carousel"; + + + + + + ![Open Coding + 1](/images/blog/2025-08-29-error-analysis-to-evaluate-llm-applications/open-coding-1.png) + + + + + ![Open Coding + 2](/images/blog/2025-08-29-error-analysis-to-evaluate-llm-applications/open-coding-2.png) + + + + + ![Open Coding + 3](/images/blog/2025-08-29-error-analysis-to-evaluate-llm-applications/open-coding-3.png) + + + + + + + +## 3. Structure Failure Modes [#structure-failure-modes] + +After annotating all traces, the next step is to structure your free-text comments into a coherent taxonomy. + +Export your comments from the Langfuse annotation job (you can [query your comments via the Langfuse API](https://colab.research.google.com/drive/1ErETZNWHyOjkG262bZHh-j3Vo9z83qUj)). You can use an LLM to perform an initial clustering of these notes into related themes. Review and manually refine the LLM's output to ensure the categories are distinct, comprehensive, and accurately reflect your application's specific issues. + +For our docs chatbot, we used the following prompt on our exported annotations: + +> You are given a list of open-ended annotations describing failures of an LLM-powered assistant that answers questions about Langfuse. Organize these into a small set of coherent failure categories, grouping similar mistakes together. For each category, provide a concise descriptive title and a one-line definition. Only cluster based on the issues in the annotations—do not invent new failure types. + +This produced a clear taxonomy: + +| Failure Mode | Definition | +| -------------------------------------- | ----------------------------------------------------------------------------------------------- | +| Hallucinations / Incorrect Information | The assistant gives factually wrong answers or shows lack of knowledge about the domain. 
| +| Context Retrieval / RAG Issues | Failures related to retrieving or using the right documents. | +| Irrelevant or Off-Topic Responses | The assistant produces content unrelated to the user's question. | +| Generic or Unhelpful Responses | Answers are too broad, vague, or do not directly address the user's question. | +| Formatting / Presentation Issues | Problems with response delivery, such as missing code blocks or links. | +| Interaction Style / Missing Follow-ups | The assistant fails to ask clarifying questions or misses opportunities for guided interaction. | + +## 4. Label and Quantify [#label-and-quantify] + +With our error labels in place, we can now annotate our dataset with these failure modes. + +First, create a new Score configuration in Langfuse containing each failure mode as a boolean or categorical option. Then, re-annotate your dataset using this new, structured schema. + +This labeled dataset allows you to use Langfuse analytics to pivot and aggregate the data. You can now answer critical questions like, "What is our most frequent failure mode?" For our example chatbot, the analysis revealed that Context Retrieval Issues were the most common problem. + +Here are the results after labeling our dataset: + + + ![Failure + Modes](/images/blog/2025-08-29-error-analysis-to-evaluate-llm-applications/failure-modes.png) + + +## Common Pitfalls + +- **Generic Metrics:** Avoid starting with off-the-shelf metrics like "conciseness" or "hallucinations." Let your application's actual failures define your evaluation criteria. + +- **One-and-Done Analysis:** Error analysis is not a static task. As your application and user behavior evolve, so will its failure modes. Make this process a recurring part of your development cycle. + +## Next Steps + +This error analysis produces a quantified, application-specific understanding of your primary issues. 
These insights provide a clear roadmap for targeted improvements, whether in your prompts, RAG pipeline, or model selection. + +The structured failure modes you defined serve as the foundation for building automated evaluators, which can scale this analysis across your application. However, before setting up automated evaluators, ensure you first address the obvious issues encountered during the error analysis. You can typically go through multiple rounds of this process before reaching a plateau. + +In the next blog post, we will set up automated evaluators and use them to continuously improve our example chatbot: + +import { FileCode } from "lucide-react"; + + + } arrow /> + \ No newline at end of file diff --git a/content/academy/evaluation/building-datasets.mdx b/content/academy/evaluation/building-datasets.mdx new file mode 100644 index 0000000000..e2c27b491d --- /dev/null +++ b/content/academy/evaluation/building-datasets.mdx @@ -0,0 +1,6 @@ +--- +title: Building Datasets +description: How to build and grow evaluation datasets for LLM applications — from your first pre-production set to a comprehensive collection of real-world test cases. +--- + +TODO diff --git a/content/academy/evaluation/designing-evals.mdx b/content/academy/evaluation/designing-evals.mdx new file mode 100644 index 0000000000..cb8ab57d45 --- /dev/null +++ b/content/academy/evaluation/designing-evals.mdx @@ -0,0 +1,110 @@ +--- +title: Designing Evals +description: A practical guide to setting up automated evaluations for LLM applications and AI agents. +--- + +# Designing Good Evals + +In AI development, iterating quickly is important. Whether you're refining a prompt, swapping a model, or changing your application logic, you need to understand the impact of each change on performance. Manually annotating outputs after every modification is slow and expensive, especially when you want to integrate evaluations into a CI/CD pipeline. 
+ +**Automated evaluators** solve this problem by providing a scalable way to measure and monitor your application's failure modes, enabling a fast and effective development loop. + +_The framework in this guide is adapted from Hamel Husain's [Eval FAQ](https://hamel.dev/blog/posts/evals-faq/)._ + +--- + +This guide describes a process to **build automated evaluators** for your application. This is a robust evaluator that you can scale for different tests and evolutions of your application: + +1. [What to Measure](#what-to-measure) +2. [How to Measure](#how-to-measure) +3. [Draft your LLM-as-a-Judge prompt](#draft-prompt) +4. [Validate your evaluator](#validate-evaluator) + +--- + +I'll demonstrate this process using an [example chatbot in the Langfuse documentation](/docs/demo) that uses the Vercel AI SDK and has access to a RAG tool to retrieve documents from the Langfuse documentation. The example chat app logs traces into the Langfuse example project and has already answered 19k user queries in the past year. + +Here's the chat interface (you can find the example chat app [here](/docs/demo)): + + + ![Chat + Interface](/images/blog/2025-08-29-error-analysis-to-evaluate-llm-applications/demo-chat.png) + + +## What to Measure [#what-to-measure] + +In the [previous blog post](/blog/2025-08-29-error-analysis-to-evaluate-llm-applications), we showed how to perform **error analysis** and identify failure modes in your application. Now we will focus on building automated evaluators to measure these failure modes. + +Before building an evaluator, it's important to differentiate between two types of failures to prioritize your efforts: + +**Missing Instructions:** The first type are errors caused by vague or incomplete instructions in your prompt. For instance if your agent uses too many bullet points or doesn't ask follow-up questions, and you never instructed it to do so, the first step is to fix the prompt. 
Creating an evaluator for a failure that a simple prompt tweak can solve is often unnecessary effort. + +**Model Limitations:** The second type occur when the LLM fails to perform correctly despite receiving clear and precise instructions. These are the ideal candidates for automated evaluation because they represent the model's inherent limitations, not a misunderstanding of your intent. + +Let's apply this to the [Langfuse Example App](/docs/demo). First, we fix some obvious issues by clarifying the prompt: "use a maximum of three bullet points" and "ask for more context when the user's query is ambiguous." With those fixed, we can focus on measuring the more complex model limitation failures we identified in the [previous blog post](/blog/2025-08-29-error-analysis-to-evaluate-llm-applications): + +* **Out of Scope**: The agent answers a question not related to Langfuse or the LLM/AI space. +* **Generic Responses**: The answer is technically correct but doesn't resolve the user's issue. The metric is to assess if the agent's final answer is helpful and directly addresses the user's question. +* **Context Retrieval / RAG Issues**: The agent uses the wrong retrieval tool. The metric needs to judge if the correct tool was chosen based on the user's query. + +For this guide, we will set up an evaluator for the "Out of Scope" failure mode. + +## How to Measure [#how-to-measure] + +In Langfuse, all evaluations are tracked as **Scores**, which [can be attached to traces, observations, sessions or dataset runs](/docs/evaluation/experiments/data-model#scores). Evaluations in Langfuse can be set up in two main ways: + +**In the Langfuse UI:** In Langfuse, you can set up **LLM-as-a-Judge Evaluators** that use another LLM to evaluate your application's output on subjective and nuanced criteria. These are easily configured directly in Langfuse. 
For a guide on setting them up in the UI, check the documentation on **[LLM-as-a-Judge evaluators](/docs/evaluation/evaluation-methods/llm-as-a-judge)**. + +**External Evaluators:** In your code, you can set up **Custom Evaluators** and use the Langfuse SDKs to send the scores back to the evaluated traces. This allows you to set up code-based evaluators or any other custom evaluation logic. For an example of a custom pipeline, see the guide on **[setting up an external evaluation pipelines](/guides/cookbook/example_external_evaluation_pipelines)**. + +In this guide, we will set up an LLM-as-a-Judge evaluator in the Langfuse UI. + +## Drafting your LLM-as-a-Judge Prompt [#draft-prompt] + +A good LLM-as-a-Judge prompt is narrowly focused and well-structured: + +1. **Pick one Failure Mode**: Focus on one specific failure mode. Do not try to cover multiple failure modes at once. +2. **Pass/Fail Definitions**: Clearly define what constitutes a "Pass" (failure is absent) and a "Fail" (failure is present). +3. **Examples**: Include clear examples of both "Pass" and "Fail" cases. + +Here is an example prompt I use in our Example App to check if the agent's answer is within its scope: + + + ![LLM-as-a-Judge Prompt](/images/blog/2025-09-05-automated-evaluations/prompt.png) + + +Once you drafted your prompt, you can set up the LLM-as-a-Judge evaluator in the Langfuse UI. You can find a guide on how to set them up in the UI [here](/docs/evaluation/evaluation-methods/llm-as-a-judge). + + + ![Evaluator Setup](/images/blog/2025-09-05-automated-evaluations/setup-evaluator.png) + + +## Validating Your Evaluator [#validate-evaluator] + +To build an evaluator you can trust, you must validate its performance against human judgment, a process similar to testing a machine learning classifier. + +First, split a set of human-labeled traces into a **development set** and a **test set**. In Langfuse, you can use tags to manage these sets. 
The **development set** is used to iteratively refine your judge's prompt. Run the evaluator on this set, compare its scores to the human labels, analyze the disagreements, and adjust the prompt's definitions or examples until its judgments closely align with yours: + + + ![Evaluator Tuning](/images/blog/2025-09-05-automated-evaluations/evaluator-tuning.png) + + +Additionally, you can measure the judge's alignment with human labels. The best metrics for this are **True Positive Rate (TPR)** and **True Negative Rate (TNR)**. TPR measures what fraction of actual "Passes" your judge correctly identifies, while TNR measures what fraction of actual "Fails" it correctly identifies. + +You can calculate these metrics by querying the data from the trace table (via [UI export](/docs/api-and-data-platform/features/export-from-ui) or [SDKs](/docs/api-and-data-platform/features/query-via-sdk)) and calculating the metrics. + +Once your judge performs well on the dev set (e.g., TPR and TNR \>90%), run it a final time on the held-out **test set** to get an unbiased measure of its real-world performance. A validated evaluator with high TPR and TNR gives you confidence that your automated metrics are meaningful. + +You can now repeat this process for both your Evaluators in the Langfuse UI and your custom evaluators as part of an [external evaluation pipeline](/guides/cookbook/example_external_evaluation_pipelines). + +## Next Steps [#operationalize-evaluator] + +With good automated evaluators in place, the next step is to operationalize your workflow. The goal is to create a CI/CD pipeline where every code change (to a prompt, model, or tool) automatically triggers an evaluation run on a **golden [Langfuse Dataset](/docs/datasets)**. + +Your automated evaluators will score these runs, providing immediate feedback on how your changes impacted key failure modes. This continuous monitoring loop helps you develop faster while maintaining a high quality bar for your application. 
+ +import { FileCode } from "lucide-react"; + + + } arrow /> + diff --git a/content/academy/evaluation/evaluation-example-setups.mdx b/content/academy/evaluation/evaluation-example-setups.mdx new file mode 100644 index 0000000000..286ca76c67 --- /dev/null +++ b/content/academy/evaluation/evaluation-example-setups.mdx @@ -0,0 +1,12 @@ +--- +title: Example Setups +description: Evaluation approaches for different types of LLM applications. +--- + +# Example Setups + +TODO + +Add example evaluator setups for +- customer support chatbot (and link to demo project) +- ... diff --git a/content/academy/evaluation/evaluation-loop.mdx b/content/academy/evaluation/evaluation-loop.mdx new file mode 100644 index 0000000000..4e15ed96c5 --- /dev/null +++ b/content/academy/evaluation/evaluation-loop.mdx @@ -0,0 +1,56 @@ +--- +title: The Evaluation Loop +description: Learn the fundamental concepts behind LLM evaluation — the evaluation loop, online vs offline evaluation, and why spending time in your traces matters. +--- + +# The Evaluation Loop + +LLM applications benefit from a constant loop of testing and monitoring: test changes against a fixed dataset before deploying, then monitor live traffic to catch issues your dataset didn't cover. When you find new edge cases, feed them back into your dataset so future tests will catch them. + +![The Continuous Evaluation/Iteration Loop](/images/docs/academy/evaluation-loop.png) + + +## Offline evaluation + +Offline evaluation lets you test your application against a fixed dataset before you deploy. You run your new prompt or model against test cases, score the outputs, and iterate until the results look good. In Langfuse, you do this by running [Experiments](/docs/evaluation/core-concepts#experiments). + +This is your main tool for iterating quickly — change something, run the experiment, see if the scores improved. 
The quality of your offline evaluation depends on having [good datasets](/academy/building-datasets) and [good evaluators](/academy/designing-evals). + +## Online evaluation + +Online evaluation is when quality gets evaluated on live production traces. This can take different forms: + +- **Automatic evaluators** like [LLM-as-a-judge](/docs/evaluation/evaluation-methods/llm-as-a-judge) that score every trace (or a sample) on specific quality dimensions +- **Manual annotation** where you or your team go through traces and assign scores on different quality metrics via [Annotation Queues](/docs/evaluation/evaluation-methods/annotation) +- **User feedback** collected directly from end users + +Online evaluation serves two purposes: + +1. **Monitoring quality live.** Aggregate evaluation scores into [dashboards](/docs/evaluation/core-concepts#dashboards) and set up alerts so you know when quality degrades. This gives you confidence that your agent is performing well in production. + +2. **Surfacing traces worth investigating.** Poor-performing traces — the ones that score low on your evaluators — are the ones you want to look at more closely. These are also your best candidates for adding to datasets, so future experiments can catch similar issues. Think of it like software engineering: when you find a new bug, you add a regression test. Same here — when you spot a bad trace, it becomes a dataset item. + +## An example + +**Here's an example workflow** for building a customer support chatbot +> 1. You update your prompt to make responses less formal. +> 2. Before deploying, you run an **experiment**: test the new prompt against your dataset of customer questions **(offline evaluation)**. +> 3. You review the scores and outputs. The tone improved, but responses are longer and some miss important links. +> 4. You refine the prompt and run the experiment again. +> 5. The results look good now. You deploy the new prompt to production. +> 6. 
You monitor with **online evaluation** to catch any new edge cases. +> 7. You notice that a customer asked a question in French, but the bot responded in English. +> 8. You add this French query to your dataset so future experiments will catch this issue. +> 9. You update your prompt to support French responses and run another experiment. +> +> Over time, your dataset grows from a couple of examples to a diverse, representative set of real-world test cases. + +## Spend time in your traces + +It's always a good idea to keep going through traces manually, even when your dashboards look green. + +- **Quality is subjective.** Agent performance isn't a clear-cut good/bad. What looks fine in aggregate might still feel off when you read the actual conversation. +- **You can't anticipate everything.** It's impossible to know every dimension you need to measure beforehand. New failure modes will show up that your evaluators don't cover yet. +- **Taste comes from exposure.** Your sense of what good agent behavior looks like develops from spending time with your agent's actual interactions. You'll get new ideas for how the agent should handle specific situations slightly differently, what kind of context users provide, what unexpected questions come in. + +The traces that matter most aren't always the ones that fail your evaluators. Sometimes there are ones that technically pass but leave you thinking "this could be better." That intuition is what drives the next iteration of your prompt, your evaluators, and your datasets. 
\ No newline at end of file diff --git a/content/academy/evaluation/evaluation-overview.mdx b/content/academy/evaluation/evaluation-overview.mdx new file mode 100644 index 0000000000..78dae18ca2 --- /dev/null +++ b/content/academy/evaluation/evaluation-overview.mdx @@ -0,0 +1,27 @@ +--- +title: Evaluating LLM Applications +description: Learn how to systematically evaluate LLM applications — from understanding what to measure, to designing evals, to building the datasets that make it all work. +--- + +# Structured Evaluation + +Once your traces are set up, and you decided what the behavior of your agent should be, you can start setting up evaluations, or short "Evals". There are many different forms of evaluations, each having their own pro's and cons. Most setups will benefit from a combination of different evaluation forms. + +This section will walk you through that: + +- [The evaluation loop](/academy/evaluation-loop) +- [What should you evaluate?](/academy/what-to-evaluate) +- [How to design an eval](/academy/designing-evals) +- [Building good datasets](/academy/building-datasets) +- [Example setups](/academy/evaluation-example-setups) + + diff --git a/content/academy/evaluation/what-to-evaluate.mdx b/content/academy/evaluation/what-to-evaluate.mdx new file mode 100644 index 0000000000..7b012443d3 --- /dev/null +++ b/content/academy/evaluation/what-to-evaluate.mdx @@ -0,0 +1,173 @@ +--- +title: What to Evaluate +description: A practical guide to identifying, categorizing, and analyzing failure modes in LLM applications using error analysis. +--- + +# What to evaluate + + + +To improve your LLM app, you must understand **how it fails**. Aggregate metrics won't tell you if your system retrieves the wrong documents or if the model's tone alienates users. **Error analysis** provides this crucial context. 
+ +_The framework in this guide is adapted from Hamel Husain's [Eval FAQ](https://hamel.dev/blog/posts/evals-faq/why-is-error-analysis-so-important-in-llm-evals-and-how-is-it-performed.html)._ + +--- + +This guide describes a four-step process to **identify, categorize, and quantify your application's unique failure modes**. The result is a specific evaluation framework that is far more useful than generic metrics: + +1. [Gather a diverse dataset of traces](#gather-dataset) +2. [Open code to surface failure patterns](#open-coding) +3. [Structure failure modes](#structure-failure-modes) +4. [Label and quantify](#label-and-quantify) + +--- + +I'll demonstrate this process using an [example chatbot in the Langfuse documentation](/docs/demo) that uses the Vercel AI SDK and has access to a RAG tool to retrieve documents from the Langfuse documentation. The example chat app logs traces into the Langfuse example project and has already answered 19k user queries in the past year. + +Here's the chat interface (you can find the example chat app [here](/docs/demo)): + + + ![Chat + Interface](/images/blog/2025-08-29-error-analysis-to-evaluate-llm-applications/demo-chat.png) + + +## 1. Gather a Diverse Dataset [#gather-dataset] + +To start our error analysis, we assemble a representative dataset of 50-100 traces produced by the example chat app. The quality of your analysis depends on the diversity of this initial data. + +**Existing Production Traces:** If you already have real user traces, as in our example, create your dataset based on them. I recommend first manually clicking through your traces, focusing only on the user input, and adding a diverse set of traces to an annotation queue. + +You can also query for traces with negative user feedback, long conversations, high latency, or specific user metadata. The goal is not a random sample, but a set that covers a wide range of user intents and potential edge cases. 
+ +In Langfuse, you can bulk add traces to an annotation queue [or a dataset](/changelog/2025-12-11-batch-add-observations-to-dataset) by clicking the "Actions" button: + + + ![Add to Annotation + Queue](/images/blog/2025-08-29-error-analysis-to-evaluate-llm-applications/add-to-annotation-queue.png) + + +**Synthetic Dataset:** If you lack production data, generate a synthetic dataset covering anticipated user behaviors and potential failure points. We have a Python cookbook that shows how to do this [here](/guides/cookbook/example_synthetic_datasets). Once created, add these traces to a Langfuse Annotation Queue. Note that the quality of your dataset matters a lot for the success of your error analysis; it needs to be diverse and representative of the real world. + +The [Annotation Queue](/docs/evaluation/evaluation-methods/annotation#annotation-queues) we created will serve as your workspace for the analysis. For our example chatbot, we selected 40 traces reflecting different user questions, from simple definitions to complex comparisons: + + + ![Annotation + Queue](/images/blog/2025-08-29-error-analysis-to-evaluate-llm-applications/annotation-queue.png) + + +## 2. Open Coding: Surface Failure Patterns [#open-coding] + +In the next step, we open our Annotation Queue and carefully review every trace and its associated tool use. The objective is to apply raw, descriptive labels without forcing them into predefined categories. + +For each trace, assign two annotations: + +- A binary score: Pass or Fail. This forces a clear judgment call. + +- A free-text comment: Describe the first point of failure you observe. This process is called [open coding](https://hamel.dev/blog/posts/evals-faq/#open-coding), as we are not forcing any categories on the data. + +If you have traces with multiple errors, focusing on the first failure is efficient. A single upstream error, like incorrect document retrieval, often causes multiple downstream issues. 
Fixing the root cause resolves them all. Your comment should be a raw observation, not a premature diagnosis. + +Here are some examples from our example chat app: + +import { + Carousel, + CarouselContent, + CarouselItem, + CarouselPrevious, + CarouselNext, +} from "@/components/ui/carousel"; + + + + + + ![Open Coding + 1](/images/blog/2025-08-29-error-analysis-to-evaluate-llm-applications/open-coding-1.png) + + + + + ![Open Coding + 2](/images/blog/2025-08-29-error-analysis-to-evaluate-llm-applications/open-coding-2.png) + + + + + ![Open Coding + 3](/images/blog/2025-08-29-error-analysis-to-evaluate-llm-applications/open-coding-3.png) + + + + + + + +## 3. Structure Failure Modes [#structure-failure-modes] + +After annotating all traces, the next step is to structure your free-text comments into a coherent taxonomy. + +Export your comments from the Langfuse annotation job (you can [query your comments via the Langfuse API](https://colab.research.google.com/drive/1ErETZNWHyOjkG262bZHh-j3Vo9z83qUj)). You can use an LLM to perform an initial clustering of these notes into related themes. Review and manually refine the LLM's output to ensure the categories are distinct, comprehensive, and accurately reflect your application's specific issues. + +For our docs chatbot, we used the following prompt on our exported annotations: + +> You are given a list of open-ended annotations describing failures of an LLM-powered assistant that answers questions about Langfuse. Organize these into a small set of coherent failure categories, grouping similar mistakes together. For each category, provide a concise descriptive title and a one-line definition. Only cluster based on the issues in the annotations—do not invent new failure types. 
+ +This produced a clear taxonomy: + +| Failure Mode | Definition | +| -------------------------------------- | ----------------------------------------------------------------------------------------------- | +| Hallucinations / Incorrect Information | The assistant gives factually wrong answers or shows lack of knowledge about the domain. | +| Context Retrieval / RAG Issues | Failures related to retrieving or using the right documents. | +| Irrelevant or Off-Topic Responses | The assistant produces content unrelated to the user's question. | +| Generic or Unhelpful Responses | Answers are too broad, vague, or do not directly address the user's question. | +| Formatting / Presentation Issues | Problems with response delivery, such as missing code blocks or links. | +| Interaction Style / Missing Follow-ups | The assistant fails to ask clarifying questions or misses opportunities for guided interaction. | + +## 4. Label and Quantify [#label-and-quantify] + +With our error labels in place, we can now annotate our dataset with these failure modes. + +First, create a new Score configuration in Langfuse containing each failure mode as a boolean or categorical option. Then, re-annotate your dataset using this new, structured schema. + +This labeled dataset allows you to use Langfuse analytics to pivot and aggregate the data. You can now answer critical questions like, "What is our most frequent failure mode?" For our example chatbot, the analysis revealed that Context Retrieval Issues were the most common problem. + +Here are the results after labeling our dataset: + + + ![Failure + Modes](/images/blog/2025-08-29-error-analysis-to-evaluate-llm-applications/failure-modes.png) + + +## Common Pitfalls + +- **Generic Metrics:** Avoid starting with off-the-shelf metrics like "conciseness" or "hallucinations." Let your application's actual failures define your evaluation criteria. + +- **One-and-Done Analysis:** Error analysis is not a static task. 
As your application and user behavior evolve, so will its failure modes. Make this process a recurring part of your development cycle. + +## Next Steps + +This error analysis produces a quantified, application-specific understanding of your primary issues. These insights provide a clear roadmap for targeted improvements, whether in your prompts, RAG pipeline, or model selection. + +The structured failure modes you defined serve as the foundation for building automated evaluators, which can scale this analysis across your application. However, before setting up automated evaluators, ensure you first address the obvious issues encountered during the error analysis. You can typically go through multiple rounds of this process before reaching a plateau. + +In the next blog post, we will set up automated evaluators and use them to continuously improve our example chatbot: + +import { FileCode } from "lucide-react"; + + + } arrow /> + \ No newline at end of file diff --git a/content/academy/experiments/meta.json b/content/academy/experiments/meta.json new file mode 100644 index 0000000000..447f915f76 --- /dev/null +++ b/content/academy/experiments/meta.json @@ -0,0 +1,6 @@ +{ + "title": "Run Experiments", + "pages": [ + "overview" + ] +} diff --git a/content/academy/experiments/overview.mdx b/content/academy/experiments/overview.mdx new file mode 100644 index 0000000000..e4216a397f --- /dev/null +++ b/content/academy/experiments/overview.mdx @@ -0,0 +1,8 @@ +--- +title: Run Experiments +description: Iterate on models, prompts, tools, and logic — then validate changes against your datasets. +--- + +# Run Experiments + +_Content coming soon._ diff --git a/content/academy/index.mdx b/content/academy/index.mdx new file mode 100644 index 0000000000..06f83204d1 --- /dev/null +++ b/content/academy/index.mdx @@ -0,0 +1,18 @@ +--- +title: Langfuse Academy +description: Learn the continuous evaluation and iteration loop for LLM applications. 
Hands-on processes to systematically improve your AI products. +--- + +# Welcome to the Langfuse Academy + +Our [docs](/docs) cover how to implement specific features in Langfuse — but for that, you already need to know what you want to build. The Academy is here to help you get there. It teaches you the best practices and processes to set up a flywheel of continuous improvement and iteration for your LLM application. + +## The Continuous Loop + +Building great LLM applications requires a good process for continuous iteration. Unlike deterministic software, it is impossible to anticipate everything your AI agent will do, and behavior might change over time. There are best practices on how to handle this, and they touch on many aspects of LLM engineering. We can put them into a continuous improvement loop. You don't need to do everything all at once — start with a few steps and add others as your product matures. + + + +This academy will walk you through the different steps you can take to build such a continuous improvement process. After each section, you will have learned one more sub-process that brings you closer to the full loop. Up to you if you do everything in here, or treat some things as inspiration. + +Next to the loop, we also have a [Case Studies](/academy/case-studies/overview) section with real application examples and their full Langfuse setups. 
diff --git a/content/academy/make-changes/meta.json b/content/academy/make-changes/meta.json new file mode 100644 index 0000000000..e17fe37609 --- /dev/null +++ b/content/academy/make-changes/meta.json @@ -0,0 +1,6 @@ +{ + "title": "Make Changes", + "pages": [ + "overview" + ] +} diff --git a/content/academy/make-changes/overview.mdx b/content/academy/make-changes/overview.mdx new file mode 100644 index 0000000000..2524c19459 --- /dev/null +++ b/content/academy/make-changes/overview.mdx @@ -0,0 +1,8 @@ +--- +title: Prompt Management +description: Manage prompts, versions, and environments to make confident changes to your LLM application. +--- + +# Prompt Management + +_Content coming soon._ diff --git a/content/academy/meta.json b/content/academy/meta.json new file mode 100644 index 0000000000..c4e2dc9aaa --- /dev/null +++ b/content/academy/meta.json @@ -0,0 +1,15 @@ +{ + "title": "Academy", + "pages": [ + "index", + "---The Loop---", + "tracing", + "review-and-annotate", + "datasets", + "make-changes", + "experiments", + "evaluate", + "---Case Studies---", + "case-studies/overview" + ] +} diff --git a/content/academy/monitoring/closing-the-loop.mdx b/content/academy/monitoring/closing-the-loop.mdx new file mode 100644 index 0000000000..b810671426 --- /dev/null +++ b/content/academy/monitoring/closing-the-loop.mdx @@ -0,0 +1,8 @@ +--- +title: Closing the Loop +description: TODO +--- + +# Closing the Loop + +TODO: Closing the loop by looking at traces, selecting some for datasets, running evals on it and improving with prompt iteration diff --git a/content/academy/monitoring/cost-tracking.mdx b/content/academy/monitoring/cost-tracking.mdx new file mode 100644 index 0000000000..6add75919e --- /dev/null +++ b/content/academy/monitoring/cost-tracking.mdx @@ -0,0 +1,4 @@ +--- +title: Cost Tracking +description: Langfuse tracks usage and cost of LLM generations for various models. Learn how to ingest, infer, and manage cost data. 
+--- diff --git a/content/academy/monitoring/monitoring-overview.mdx b/content/academy/monitoring/monitoring-overview.mdx new file mode 100644 index 0000000000..6d47e01191 --- /dev/null +++ b/content/academy/monitoring/monitoring-overview.mdx @@ -0,0 +1,12 @@ +--- +title: Monitoring & Improving in Production +description: You shipped your LLM application — now what? Learn what to monitor, how to track costs, collect feedback, and continuously improve. +--- + +# Monitoring & Improving in Production + +You shipped your LLM application — now learn what to monitor, how to track costs, collect feedback, and continuously improve. + +- [What to monitor](/academy/what-to-monitor) +- [Cost tracking and optimization](/academy/cost-tracking) +- [Closing the loop](/academy/closing-the-loop) diff --git a/content/academy/monitoring/what-to-monitor.mdx b/content/academy/monitoring/what-to-monitor.mdx new file mode 100644 index 0000000000..7b469f029a --- /dev/null +++ b/content/academy/monitoring/what-to-monitor.mdx @@ -0,0 +1,4 @@ +--- +title: What to Monitor +description: "Learn how to use chatbot analytics tools to monitor performance, track user interactions, and improve your AI chatbot. Covers key metrics, evaluation strategies, and analytics platforms." +--- diff --git a/content/academy/observability/good-traces.mdx b/content/academy/observability/good-traces.mdx new file mode 100644 index 0000000000..c2becd593b --- /dev/null +++ b/content/academy/observability/good-traces.mdx @@ -0,0 +1,179 @@ +--- +title: What does a good trace look like? +description: A guide to structuring your Langfuse traces for effective debugging, evaluation, and cost tracking. +--- + +import { Fan, Wrench } from "lucide-react"; + +# What does a good trace look like? + +You see traces appearing in Langfuse, but how do you know if you've done it well? Here are a couple of things you can look at and optimize. + +## What's the scope of one trace? 
+ +Langfuse's [data model](/docs/observability/data-model) has three levels of grouping: observations (individual steps) are grouped into traces via a `trace_id`, and traces can be grouped into sessions via a `session_id`. + +```mermaid +flowchart LR + + subgraph SESSION [session] + direction TB + + subgraph TRACE1 [trace] + direction LR + O1[observation] + O2[observation] + O3[observation] + end + + subgraph TRACE2 [trace] + direction LR + O4[observation] + O5[observation] + O6[observation] + end + + subgraph TRACE3 [trace] + direction LR + O7[observation] + O8[observation] + O9[observation] + end + + end + classDef sessionBox fill:none,stroke:#888,stroke-width:2px; + classDef traceBox fill:none,stroke:#aaa,stroke-width:1.5px; + + %% Assign classes + class SESSION sessionBox; + class TRACE1,TRACE2,TRACE3 traceBox; +``` + +A trace represents one self-contained unit of work in your application. Good examples of a typical trace: +- One chatbot turn (user sends a message, your app retrieves context, calls the LLM, returns a response) +- One agent run (the agent receives a task, reasons, calls tools, and produces a result) +- One pipeline execution (a document comes in, gets chunked, embedded, and stored) + +If multiple of these happen in sequence, e.g. a multi-turn conversation, or several agent runs that feed into a final report, that's where [sessions](/docs/observability/features/sessions) come in. Each step is its own trace, and the session ties them together. + +A trace shows up in the Langfuse UI as a trace tree and an [agent graph](/docs/observability/features/agent-graphs): + +
+ +![Trace tree](/images/docs/faq/good-trace-tree.png) + +![Agent graph](/images/docs/faq/good-trace-agent-graph.png) + +
+ +## Look at the trace tree + +When you click on a trace, you see the trace tree. There are two things you can check: + +### Are the right steps showing up? + +You should see your LLM calls, tool calls, and other important steps represented in the tree. They should have the correct [observation type](/docs/observability/features/observation-types). + +For example +- an LLM call should show up as a `generation`. This is important because a `generation` can carry [cost, token usage, and model information](#track-model-tokens-and-cost-on-generations). +- a tool call should show up as a `tool`. You can then filter on tool call observations when you create [LLM-as-a-judge](/docs/evaluation/evaluation-methods/llm-as-a-judge) evaluators. + +Framework integrations typically set these types automatically. If you're instrumenting manually, you can set them via the `as_type` parameter (Python) or `asType` (JS/TS). See the [observation types docs](/docs/observability/features/observation-types) for a full list. + +### Is there noise you don't need? + +Not every observation in the tree is useful for understanding what your application did. HTTP spans, database queries, and framework internals often add clutter without giving you meaningful insight. If you see observations like these polluting your trace tree, you can [filter them out](/faq/all/unwanted-http-database-spans). + +
+![Noisy spans in a trace tree](/images/docs/faq/good-trace-noisy-spans.png) +
+ +## Choose good names + +Observation and trace names are used in many places: + +- When setting up [LLM-as-a-judge evaluators](/docs/evaluation/evaluation-methods/llm-as-a-judge), you target specific observations by name. +- In [dashboards](/docs/metrics/features/custom-dashboards), you can filter and aggregate metrics by observation name. +- In the tracing table, names help you quickly identify what each step does. + +Try to name the observations after what they do: `classify-intent`, `generate-response`, `summarize-results`. This makes it easier to understand what each step does when you're looking at the trace tree, and makes filtering on a specific step easier. + +> Try not to name observations after the AI model used (`gpt-4o`, `claude-sonnet`). It breaks as soon as you swap models. The model is already a separate attribute on `generation` observations; use that instead. + +## Choose meaningful input and output + +In general, it's recommended that operations have an input and/or output. If an observation has neither, ask yourself whether it is actually useful or whether you can drop it. + +For your **most viewed observations**, take some extra care to set them up. You will likely create pre-filtered views on your tracing and session screens. The observations you filter for here are the ones that will get looked at the most. For these, ask yourself: **what do I need to see to quickly evaluate a trace/session at a glance?** + +![Tracing table with input and output](/images/docs/faq/good-trace-tracing-table-io.png) + +Typical input/output for `GENERATION` observations: +- For a chatbot: the user message (input) and the assistant response (output). +- For a RAG pipeline: the user query and the generated answer. +- For a classification task: the text being classified and the predicted label. 
+ +If your input and output fields are showing up empty unintentionally, see [why are the input and output of my trace empty?](/faq/all/empty-trace-input-and-output) + +## Useful attributes + +Observations have a number of attributes that can be useful for your use cases. These will allow you to go even further with filtering, scoring, and making dashboards. + +### Add metadata for context + +[Metadata](/docs/observability/features/metadata) is a flexible key-value store on each observation. Some data that might be useful to save under metadata: + +- **Evaluation context**: When configuring [LLM-as-a-judge evaluators](/docs/evaluation/evaluation-methods/llm-as-a-judge), you can reference metadata fields. This is useful for passing ground truth, expected behavior, or other context that the evaluator needs but that isn't part of the actual input/output. +- **Filtering**: You can filter by metadata keys in the Langfuse UI, which is helpful when you need to find traces with specific characteristics. +- **Annotation context**: When doing manual review, metadata gives annotators extra information to make better judgments. + +### Track model, tokens, and cost on generations [#track-model-tokens-and-cost-on-generations] + +If you want to understand what your LLM usage costs, broken down by model, by user, by feature, you need three things on your `generation` observations: + +- **Model name**: Langfuse uses this to look up pricing in the [model pricing table](/docs/model-usage-and-cost). If the model name doesn't match, Langfuse can't calculate cost automatically. +- **Usage details**: Input tokens, output tokens, and optionally cached tokens. This is what powers the token usage views in dashboards. +- **Cost details** (optional): If you want to override Langfuse's automatic pricing — for example, if you have a custom pricing agreement — you can pass cost explicitly. + +Most [integrations](/integrations) capture all of this automatically. 
If you're instrumenting manually, see the [token and cost tracking docs](/docs/observability/features/token-and-cost-tracking). + +You can see these attributes on the `GENERATION` observation in the Langfuse UI. + +![Generation attributes in Langfuse](/images/docs/faq/good-trace-generation-attributes.png) + +### Use tags for business-level dimensions + +[Tags](/docs/observability/features/tags) enable filtering and metric breakdowns across dimensions that matter to your business. Good tags answer questions like "how does latency differ between our `web` and `api` users?". + +One property of tags is that they are **immutable and must be set at observation creation time**. This makes them great for things you know upfront (where the request came from, which feature it's part of), but not for things you learn later. + +If you need to label traces based on something determined after the fact, like an [LLM-as-a-judge](/docs/evaluation/evaluation-methods/llm-as-a-judge) evaluation result, use [scores](/docs/evaluation/overview) instead. + +### Link prompts to traces + +If you [manage your prompts in Langfuse](/docs/prompt-management), you can [link them to your generations](/docs/prompt-management/features/link-to-traces). This lets you see which prompt version was used for a given trace, and track how metrics change across prompt versions. Useful when you're iterating on prompts and want to compare performance. + +### Set the environment + +Set the [environment](/docs/observability/features/environments) attribute (`production`, `staging`, `development`) so that your test traces don't pollute production dashboards and evaluations. + +### Track users with user IDs + +Setting the [user ID](/docs/observability/features/users) connects traces to specific users, which unlocks per-user views in Langfuse. Useful if you want to answer questions like: + +- Which users are costing us the most? +- How does output quality vary across users? 
+- What does a specific user's usage pattern look like? + +### Group related traces with session IDs + +If your application involves multiple traces that logically belong together, group them into a [session](/docs/observability/features/sessions). This gives you a session replay view where you can see the full interaction in sequence. + +This makes sense when: +- You're building a chatbot (each user message creates a new trace, but the whole conversation is one session) +- You have multiple agents that each contribute to a final output (e.g., five agents that collaborate to produce a report) +- Your workflow spans multiple requests with human-in-the-loop steps in between + +If your application is single-request/single-response with no continuity between calls, you probably don't need sessions. + +![Sessions view in Langfuse](/images/docs/faq/good-trace-sessions-view.png) diff --git a/content/academy/observability/llm-observability-vs-traditional.mdx b/content/academy/observability/llm-observability-vs-traditional.mdx new file mode 100644 index 0000000000..e76e2d8db8 --- /dev/null +++ b/content/academy/observability/llm-observability-vs-traditional.mdx @@ -0,0 +1,8 @@ +--- +title: How Is LLM Observability Different? +description: TODO +--- + +# How Is LLM Observability Different? + +TODO diff --git a/content/academy/observability/observability-overview.mdx b/content/academy/observability/observability-overview.mdx new file mode 100644 index 0000000000..196b21e616 --- /dev/null +++ b/content/academy/observability/observability-overview.mdx @@ -0,0 +1,27 @@ +--- +title: LLM Observability +description: Understand what LLM observability is, how it differs from traditional observability, and why it matters for building reliable AI applications. +--- + +# LLM Observability + +> I'd like to see what my agent is doing + +This section dives into LLM observability. It's the first step towards a solid AI setup. 
Even if you don't feel ready for other aspects of LLM engineering, having traces of your agents is always a good idea. You can study their behavior, make small fixes based on what you see, and form a more grounded opinion on what good looks like for your agent in specific situations. + +This section covers the following topics: + +- [How is LLM observability different from traditional logging?](/academy/llm-observability-vs-traditional) +- [What does a good trace look like?](/academy/good-traces) +- [Use-case specific example setups](/academy/observability-use-case-examples) + + \ No newline at end of file diff --git a/content/academy/observability/observability-use-case-examples.mdx b/content/academy/observability/observability-use-case-examples.mdx new file mode 100644 index 0000000000..cbf5d2ca6f --- /dev/null +++ b/content/academy/observability/observability-use-case-examples.mdx @@ -0,0 +1,12 @@ +--- +title: Use Case Examples +description: TODO +--- + +# Use Case Specific Examples + +TODO: +- refer to demo project +- discuss + - a complicated customer support chatbot trace + - ... diff --git a/content/academy/prompt-management/prompt-composability.mdx b/content/academy/prompt-management/prompt-composability.mdx new file mode 100644 index 0000000000..856147997f --- /dev/null +++ b/content/academy/prompt-management/prompt-composability.mdx @@ -0,0 +1,6 @@ +--- +title: Prompt Composability +description: Reference other prompts in your prompts using a simple tag format to create modular, reusable prompt components. 
+--- + +TODO \ No newline at end of file diff --git a/content/academy/prompt-management/prompt-management-as-a-practice.mdx b/content/academy/prompt-management/prompt-management-as-a-practice.mdx new file mode 100644 index 0000000000..6cac13c7d8 --- /dev/null +++ b/content/academy/prompt-management/prompt-management-as-a-practice.mdx @@ -0,0 +1,19 @@ +--- +title: Prompt Management as a Practice +description: When prompt management makes sense, how version control and deployment strategies for prompts work, and why treating prompts as a managed artifact matters. +--- + +# Prompt Management as a Practice + +## When does prompt management make sense? + +TODO +- multiple people on team +- .. + +compare to git + +## Versioning & deployment strategies + +todo: +- explain versioning, how to deploy to prod, how to manage prompts in team and multiple envs, ... diff --git a/content/academy/prompt-management/prompt-management-overview.mdx b/content/academy/prompt-management/prompt-management-overview.mdx new file mode 100644 index 0000000000..2585ab9a2f --- /dev/null +++ b/content/academy/prompt-management/prompt-management-overview.mdx @@ -0,0 +1,22 @@ +--- +title: Prompt Management +description: Learn when and how to manage prompts as a practice — version control, deployment strategies, and composability. +--- + +# Prompt Management as a Practice + +Prompt management becomes important once you start to invest heavily in iterating on your prompts, especially when you're a larger team. 
This section covers a couple of aspects: + +- [Prompt management as a practice](/academy/prompt-management-as-a-practice) +- [Composability](/academy/prompt-composability) + + diff --git a/content/academy/review-and-annotate/meta.json b/content/academy/review-and-annotate/meta.json new file mode 100644 index 0000000000..009ff78cfb --- /dev/null +++ b/content/academy/review-and-annotate/meta.json @@ -0,0 +1,6 @@ +{ + "title": "Review & Annotate", + "pages": [ + "overview" + ] +} diff --git a/content/academy/review-and-annotate/overview.mdx b/content/academy/review-and-annotate/overview.mdx new file mode 100644 index 0000000000..baa3d4272b --- /dev/null +++ b/content/academy/review-and-annotate/overview.mdx @@ -0,0 +1,10 @@ +--- +title: Review & Annotate +description: Manually inspect traces, debug issues, and annotate what you see to build understanding of your LLM application's behavior. +--- + +# Review & Annotate + +_Content coming soon._ + +See also: [Using Agent Skills to Automatically Improve your Prompts](/blog/2026-02-16-prompt-improvement-claude-skills) — a walkthrough of how annotating traces can quickly lead to concrete prompt improvements. diff --git a/content/academy/tracing/good-traces.mdx b/content/academy/tracing/good-traces.mdx new file mode 100644 index 0000000000..c2becd593b --- /dev/null +++ b/content/academy/tracing/good-traces.mdx @@ -0,0 +1,179 @@ +--- +title: What does a good trace look like? +description: A guide to structuring your Langfuse traces for effective debugging, evaluation, and cost tracking. +--- + +import { Fan, Wrench } from "lucide-react"; + +# What does a good trace look like? + +You see traces appearing in Langfuse, but how do you know if you've done it well? Here are a couple of things you can look at and optimize. + +## What's the scope of one trace? 
+ +Langfuse's [data model](/docs/observability/data-model) has three levels of grouping: observations (individual steps) are grouped into traces via a `trace_id`, and traces can be grouped into sessions via a `session_id`. + +```mermaid +flowchart LR + + subgraph SESSION [session] + direction TB + + subgraph TRACE1 [trace] + direction LR + O1[observation] + O2[observation] + O3[observation] + end + + subgraph TRACE2 [trace] + direction LR + O4[observation] + O5[observation] + O6[observation] + end + + subgraph TRACE3 [trace] + direction LR + O7[observation] + O8[observation] + O9[observation] + end + + end + classDef sessionBox fill:none,stroke:#888,stroke-width:2px; + classDef traceBox fill:none,stroke:#aaa,stroke-width:1.5px; + + %% Assign classes + class SESSION sessionBox; + class TRACE1,TRACE2,TRACE3 traceBox; +``` + +A trace represents one self-contained unit of work in your application. Good examples of a typical trace: +- One chatbot turn (user sends a message, your app retrieves context, calls the LLM, returns a response) +- One agent run (the agent receives a task, reasons, calls tools, and produces a result) +- One pipeline execution (a document comes in, gets chunked, embedded, and stored) + +If multiple of these happen in sequence, e.g. a multi-turn conversation, or several agent runs that feed into a final report, that's where [sessions](/docs/observability/features/sessions) come in. Each step is its own trace, and the session ties them together. + +A trace shows up in the Langfuse UI as a trace tree and an [agent graph](/docs/observability/features/agent-graphs): + +
+ +![Trace tree](/images/docs/faq/good-trace-tree.png) + +![Agent graph](/images/docs/faq/good-trace-agent-graph.png) + +
+ +## Look at the trace tree + +When you click on a trace, you see the trace tree. There are two things you can check: + +### Are the right steps showing up? + +You should see your LLM calls, tool calls, and other important steps represented in the tree. They should have the correct [observation type](/docs/observability/features/observation-types). + +For example +- an LLM call should show up as a `generation`. This is important because a `generation` can carry [cost, token usage, and model information](#track-model-tokens-and-cost-on-generations). +- a tool call should show up as a `tool`. You can then filter on tool call observations when you create [LLM-as-a-judge](/docs/evaluation/evaluation-methods/llm-as-a-judge) evaluators. + +Framework integrations typically set these types automatically. If you're instrumenting manually, you can set them via the `as_type` parameter (Python) or `asType` (JS/TS). See the [observation types docs](/docs/observability/features/observation-types) for a full list. + +### Is there noise you don't need? + +Not every observation in the tree is useful for understanding what your application did. HTTP spans, database queries, and framework internals often add clutter without giving you meaningful insight. If you see observations like these polluting your trace tree, you can [filter them out](/faq/all/unwanted-http-database-spans). + +
+![Noisy spans in a trace tree](/images/docs/faq/good-trace-noisy-spans.png) +
+ +## Choose good names + +Observation and trace names are used in many places: + +- When setting up [LLM-as-a-judge evaluators](/docs/evaluation/evaluation-methods/llm-as-a-judge), you target specific observations by name. +- In [dashboards](/docs/metrics/features/custom-dashboards), you can filter and aggregate metrics by observation name. +- In the tracing table, names help you quickly identify what each step does. + +Try to name the observations after what they do: `classify-intent`, `generate-response`, `summarize-results`. This makes it easier to understand what each step does when you're looking at the trace tree, and makes filtering on a specific step easier. + +> Try not to name observations after the AI model used (`gpt-4o`, `claude-sonnet`). It breaks as soon as you swap models. The model is already a separate attribute on `generation` observations; use that instead. + +## Choose meaningful input and output + +In general, it's recommended that operations have an input and/or output. If an observation has neither, ask yourself whether it is actually useful or whether you can drop it. + +For your **most viewed observations**, take some extra care to set them up. You will likely create pre-filtered views on your tracing and session screens. The observations you filter for here are the ones that will get looked at the most. For these, ask yourself: **what do I need to see to quickly evaluate a trace/session at a glance?** + +![Tracing table with input and output](/images/docs/faq/good-trace-tracing-table-io.png) + +Typical input/output for `GENERATION` observations: +- For a chatbot: the user message (input) and the assistant response (output). +- For a RAG pipeline: the user query and the generated answer. +- For a classification task: the text being classified and the predicted label. 
+ +If your input and output fields are showing up empty unintentionally, see [why are the input and output of my trace empty?](/faq/all/empty-trace-input-and-output) + +## Useful attributes + +Observations have a number of attributes that can be useful for your use cases. These will allow you to go even further with filtering, scoring, and making dashboards. + +### Add metadata for context + +[Metadata](/docs/observability/features/metadata) is a flexible key-value store on each observation. Some data that might be useful to save under metadata: + +- **Evaluation context**: When configuring [LLM-as-a-judge evaluators](/docs/evaluation/evaluation-methods/llm-as-a-judge), you can reference metadata fields. This is useful for passing ground truth, expected behavior, or other context that the evaluator needs but that isn't part of the actual input/output. +- **Filtering**: You can filter by metadata keys in the Langfuse UI, which is helpful when you need to find traces with specific characteristics. +- **Annotation context**: When doing manual review, metadata gives annotators extra information to make better judgments. + +### Track model, tokens, and cost on generations [#track-model-tokens-and-cost-on-generations] + +If you want to understand what your LLM usage costs, broken down by model, by user, by feature, you need three things on your `generation` observations: + +- **Model name**: Langfuse uses this to look up pricing in the [model pricing table](/docs/model-usage-and-cost). If the model name doesn't match, Langfuse can't calculate cost automatically. +- **Usage details**: Input tokens, output tokens, and optionally cached tokens. This is what powers the token usage views in dashboards. +- **Cost details** (optional): If you want to override Langfuse's automatic pricing — for example, if you have a custom pricing agreement — you can pass cost explicitly. + +Most [integrations](/integrations) capture all of this automatically. 
If you're instrumenting manually, see the [token and cost tracking docs](/docs/observability/features/token-and-cost-tracking). + +You can see these attributes on the `GENERATION` observation in the Langfuse UI. + +![Generation attributes in Langfuse](/images/docs/faq/good-trace-generation-attributes.png) + +### Use tags for business-level dimensions + +[Tags](/docs/observability/features/tags) enable filtering and metric breakdowns across dimensions that matter to your business. Good tags answer questions like "how does latency differ between our `web` and `api` users?". + +One property of tags is that they are **immutable and must be set at observation creation time**. This makes them great for things you know upfront (where the request came from, which feature it's part of), but not for things you learn later. + +If you need to label traces based on something determined after the fact, like an [LLM-as-a-judge](/docs/evaluation/evaluation-methods/llm-as-a-judge) evaluation result, use [scores](/docs/evaluation/overview) instead. + +### Link prompts to traces + +If you [manage your prompts in Langfuse](/docs/prompt-management), you can [link them to your generations](/docs/prompt-management/features/link-to-traces). This lets you see which prompt version was used for a given trace, and track how metrics change across prompt versions. Useful when you're iterating on prompts and want to compare performance. + +### Set the environment + +Set the [environment](/docs/observability/features/environments) attribute (`production`, `staging`, `development`) so that your test traces don't pollute production dashboards and evaluations. + +### Track users with user IDs + +Setting the [user ID](/docs/observability/features/users) connects traces to specific users, which unlocks per-user views in Langfuse. Useful if you want to answer questions like: + +- Which users are costing us the most? +- How does output quality vary across users? 
+- What does a specific user's usage pattern look like? + +### Group related traces with session IDs + +If your application involves multiple traces that logically belong together, group them into a [session](/docs/observability/features/sessions). This gives you a session replay view where you can see the full interaction in sequence. + +This makes sense when: +- You're building a chatbot (each user message creates a new trace, but the whole conversation is one session) +- You have multiple agents that each contribute to a final output (e.g., five agents that collaborate to produce a report) +- Your workflow spans multiple requests with human-in-the-loop steps in between + +If your application is single-request/single-response with no continuity between calls, you probably don't need sessions. + +![Sessions view in Langfuse](/images/docs/faq/good-trace-sessions-view.png) diff --git a/content/academy/tracing/meta.json b/content/academy/tracing/meta.json new file mode 100644 index 0000000000..b64073486e --- /dev/null +++ b/content/academy/tracing/meta.json @@ -0,0 +1,7 @@ +{ + "title": "Tracing", + "pages": [ + "overview", + "good-traces" + ] +} diff --git a/content/academy/tracing/overview.mdx b/content/academy/tracing/overview.mdx new file mode 100644 index 0000000000..8527582d38 --- /dev/null +++ b/content/academy/tracing/overview.mdx @@ -0,0 +1,30 @@ +--- +title: Tracing +description: Instrument your LLM application to capture real user data with Langfuse tracing. +--- + +# Tracing + +_Content coming soon._ + +## How Is LLM Observability Different? 
+ +TODO: +- Talk about traditional logging: more for finding errors +- AI behavior is inherently non-deterministic: Not always clear failure cases +- you'll need to spend more time manually in your agent's traces than in a logging solution (just because the response looks correct doesn't mean it is correct) + + +also useful to mention here: +- In a lot of cases, you can also learn something from your customers (what are they asking, ...) --> because AI is usually put at places with open-ended use cases/problems + +## Ending of page + +TODO: +- If you don't have tracing set up yet, install the Langfuse AI agent skill and ask it to add tracing to any project you have — so you can follow along with the rest of the academy. + + +TODO: callout with quick setup instructions via the AI agent skill + + +Once you have traces coming in, make sure to read [What Does a Good Trace Look Like?](/academy/tracing/good-traces) — it's important to get your trace structure right before moving on. Once you feel confident your traces are well-structured, head to the [Review & Annotate](/academy/review-and-annotate/overview) section. diff --git a/content/faq/all/what-does-a-good-trace-look-like.mdx b/content/faq/all/what-does-a-good-trace-look-like.mdx index ef0c306232..98297da0b0 100644 --- a/content/faq/all/what-does-a-good-trace-look-like.mdx +++ b/content/faq/all/what-does-a-good-trace-look-like.mdx @@ -49,7 +49,9 @@ Framework integrations typically set these types automatically. If you're instru Not every observation in the tree is useful for understanding what your application did. HTTP spans, database queries, and framework internals often add clutter without giving you meaningful insight. If you see observations like these polluting your trace tree, you can [filter them out](/faq/all/unwanted-http-database-spans). +<Frame border>
![Noisy spans in a trace tree](/images/docs/faq/good-trace-noisy-spans.png) +
## Choose good names diff --git a/lib/source.ts b/lib/source.ts index c3f89fa1f2..36a4b55be4 100644 --- a/lib/source.ts +++ b/lib/source.ts @@ -13,6 +13,7 @@ import { customers, handbook, marketing, + academy, } from "../.source/server"; export const source = loader({ @@ -206,6 +207,12 @@ export const handbookSource = loader({ source: handbook.toFumadocsSource(), }); +export const academySource = loader({ + baseUrl: "/academy", + source: academy.toFumadocsSource(), + pageTree: { idPrefix: "academy" }, +}); + export const marketingSource = loader({ baseUrl: "", source: marketing.toFumadocsSource(), @@ -295,6 +302,11 @@ const DOC_SECTIONS = { collection: "handbook", title: "Handbook", }, + academy: { + source: academySource, + collection: "academy", + title: "Academy", + }, } as const; const marketingEntries = Object.fromEntries( @@ -315,6 +327,7 @@ export const DOCS_STYLE_APP_SECTIONS = new Set([ "self-hosting", "guides", "library", + "academy", ]); /** Sections that are blog/changelog posts — no left sidebar */ diff --git a/mdx-components.tsx b/mdx-components.tsx index f338600d65..7ed387a9b7 100644 --- a/mdx-components.tsx +++ b/mdx-components.tsx @@ -11,6 +11,7 @@ import { Callout, Tabs, Tab, Cards, Card, Steps, FileTree, FileTreeFile, FileTre import { MdxDetails, MdxSummary } from "@/components/MdxDetails"; import { AvailabilityBanner } from "@/components/availability"; import { Link as MdxLink, type LinkProps } from "@/components/ui/link"; +import { ImprovementLoop } from "@/components/academy/ImprovementLoop"; // Lazy-load Video so @vidstack/react (~800 KB) is NOT bundled on every MDX page. // It only downloads on pages that actually render a