evals #23
GitHub Actions workflow "MCP Server Evaluations" (new file):

```yaml
name: MCP Server Evaluations
permissions:
  contents: read
  checks: write
  pull-requests: write

on:
  push:
    branches: [main, dev/**]
  pull_request:
    branches: [main]
  workflow_dispatch:

env:
  DOTNET_VERSION: "9.0.x"

jobs:
  evaluate:
    runs-on: ubuntu-latest
    timeout-minutes: 30

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Setup .NET
        uses: actions/setup-dotnet@v4
        with:
          dotnet-version: ${{ env.DOTNET_VERSION }}

      - name: Restore dependencies
        run: dotnet restore DataFactory.MCP.EvaluationTests/DataFactory.MCP.EvaluationTests.csproj --configfile nuget.public.config

      - name: Build project
        run: dotnet build DataFactory.MCP.EvaluationTests/DataFactory.MCP.EvaluationTests.csproj --no-restore --configuration Release

      - name: Install code coverage tools
        run: dotnet tool install -g dotnet-coverage --configfile nuget.public.config

      - name: Run evaluation tests
        env:
          EVAL_AZURE_OPENAI_ENDPOINT: ${{ secrets.EVAL_AZURE_OPENAI_ENDPOINT }}
          EVAL_AZURE_OPENAI_API_KEY: ${{ secrets.EVAL_AZURE_OPENAI_API_KEY }}
          EVAL_AZURE_OPENAI_MODEL: ${{ secrets.EVAL_AZURE_OPENAI_MODEL }}
        run: |
          dotnet test DataFactory.MCP.EvaluationTests/DataFactory.MCP.EvaluationTests.csproj \
            --no-build \
            --configuration Release \
            --logger "trx;LogFileName=evaluation-results.trx" \
            --logger "console;verbosity=detailed" \
            --collect:"Code Coverage" \
            --results-directory ./TestResults

      - name: Publish test results
        uses: dorny/test-reporter@v1
        if: always()
        with:
          name: Evaluation Test Results
          path: "./TestResults/*.trx"
          reporter: dotnet-trx
          fail-on-error: true

      - name: Upload test results
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: evaluation-test-results
          path: ./TestResults/
          retention-days: 30
```
.gitignore (adds Node and Visual Studio artifacts):

```diff
@@ -2,3 +2,6 @@ obj/
 bin/
 *.sln
 DataFactory.MCP.Tests/Infrastructure/.env
+*node_modules/
+*package-lock.json
+*.vs
```
|---|---|---|
| @@ -0,0 +1,32 @@ | ||
| <Project Sdk="Microsoft.NET.Sdk"> | ||
|
|
||
| <PropertyGroup> | ||
| <TargetFramework>net9.0</TargetFramework> | ||
| <LangVersion>latest</LangVersion> | ||
| <ImplicitUsings>enable</ImplicitUsings> | ||
| <Nullable>enable</Nullable> | ||
| </PropertyGroup> | ||
|
|
||
| <ItemGroup> | ||
| <PackageReference Include="Azure.AI.OpenAI" Version="2.1.0" /> | ||
| <PackageReference Include="Azure.Identity" Version="1.17.0" /> | ||
| <PackageReference Include="Microsoft.Extensions.AI" Version="9.10.0" /> | ||
| <PackageReference Include="Microsoft.Extensions.AI.Abstractions" Version="9.10.0" /> | ||
| <PackageReference Include="Microsoft.Extensions.AI.AzureAIInference" Version="9.10.0-preview.1.25513.3" /> | ||
| <PackageReference Include="Microsoft.Extensions.AI.Evaluation" Version="9.10.0" /> | ||
| <PackageReference Include="Microsoft.Extensions.AI.Evaluation.NLP" Version="9.10.0-preview.1.25513.3" /> | ||
| <PackageReference Include="Microsoft.Extensions.AI.Evaluation.Quality" Version="9.10.0" /> | ||
| <PackageReference Include="Microsoft.Extensions.AI.Evaluation.Reporting" Version="9.10.0" /> | ||
| <PackageReference Include="Microsoft.Extensions.AI.OpenAI" Version="9.10.0-preview.1.25513.3" /> | ||
| <PackageReference Include="ModelContextProtocol" Version="0.4.0-preview.2" /> | ||
| <PackageReference Include="MSTest" Version="4.0.0-preview.25465.3" /> | ||
| </ItemGroup> | ||
|
|
||
| <ItemGroup> | ||
| <Using Include="Microsoft.VisualStudio.TestTools.UnitTesting" /> | ||
| </ItemGroup> | ||
|
|
||
| <ItemGroup> | ||
| <ProjectReference Include="..\DataFactory.MCP\DataFactory.MCP.csproj" /> | ||
| </ItemGroup> | ||
| </Project> |
DataFactory.MCP.EvaluationTests/EnvironmentVariables.cs (new file):

```csharp
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Diagnostics.CodeAnalysis;

namespace DataFactory.MCP.EvaluationTests;

public class EnvironmentVariables
{
    private static readonly IDictionary<string, string> s_environmentVariableCache = new Dictionary<string, string>();

    private static string GetEnvironmentVariable(string variableName)
    {
        if (!s_environmentVariableCache.TryGetValue(variableName, out string? value))
        {
            value =
                Environment.GetEnvironmentVariable(variableName) ??
                throw new Exception($"Environment variable {variableName} not set.");

            s_environmentVariableCache[variableName] = value;
        }

        return value;
    }

    #region Azure OpenAI
    public static string AzureOpenAIEndpoint
        => GetEnvironmentVariable("EVAL_AZURE_OPENAI_ENDPOINT");

    public static string AzureOpenAIAPIKey
        => GetEnvironmentVariable("EVAL_AZURE_OPENAI_API_KEY");

    public static string AzureOpenAIModel
        => GetEnvironmentVariable("EVAL_AZURE_OPENAI_MODEL");
    #endregion
}
```
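`EvalTestBase` (next file) initializes its `ChatConfiguration` from a `TestSetup.GetChatConfiguration()` helper that is not included in this diff. A minimal sketch of what such a helper might look like, assuming the `Azure.AI.OpenAI` and `Microsoft.Extensions.AI.OpenAI` packages referenced in the project file above; the class and method names mirror how `EvalTestBase` calls it, but the body is an illustration, not the PR's actual code:

```csharp
using System.ClientModel;
using Azure.AI.OpenAI;
using Microsoft.Extensions.AI;
using Microsoft.Extensions.AI.Evaluation;

namespace DataFactory.MCP.EvaluationTests;

// Hypothetical sketch of the TestSetup helper referenced by EvalTestBase (the
// actual file is not shown in this diff). It wires the EVAL_* environment
// variables into an IChatClient for the evaluators to use.
internal static class TestSetup
{
    public static ChatConfiguration GetChatConfiguration()
    {
        var azureClient = new AzureOpenAIClient(
            new Uri(EnvironmentVariables.AzureOpenAIEndpoint),
            new ApiKeyCredential(EnvironmentVariables.AzureOpenAIAPIKey));

        // Adapt the Azure OpenAI chat client to Microsoft.Extensions.AI's IChatClient.
        IChatClient chatClient = azureClient
            .GetChatClient(EnvironmentVariables.AzureOpenAIModel)
            .AsIChatClient();

        return new ChatConfiguration(chatClient);
    }
}
```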
DataFactory.MCP.EvaluationTests/EvalTestBase.cs (new file):

```csharp
using Microsoft.Extensions.AI;
using Microsoft.Extensions.AI.Evaluation;
using ModelContextProtocol.Client;

namespace DataFactory.MCP.EvaluationTests
{
    public abstract class EvalTestBase
    {
        /// The <see cref="ChatConfiguration"/> below identifies the LLM endpoint that should be used for all
        /// evaluations performed in this project. <see cref="s_chatConfiguration"/> is initialized with the
        /// value returned from <see cref="TestSetup.GetChatConfiguration"/> inside
        /// <see cref="InitializeTestAsync"/> below.
        protected static ChatConfiguration? s_chatConfiguration;

        /// The MCP client used to connect to the Data Factory MCP server.
        protected static McpClient? s_mcpClient;

        /// The chat options containing the tools and settings for the chat client.
        protected static ChatOptions? s_chatOptions;

        /// The tools available from the MCP server.
        protected static IList<McpClientTool>? s_tools;

        /// All unit tests in this project evaluate the LLM's response to the Data Factory management query
        /// below. We invoke the LLM once inside <see cref="InitializeTestAsync"/> to get a response to this
        /// question and store it in <see cref="s_response"/>. Each unit test then performs a different
        /// evaluation on the same stored response.
        protected static readonly IList<ChatMessage> s_messages = [
            new ChatMessage(
                ChatRole.System,
                "You are a helpful Microsoft Data Factory assistant. Use the available tools to help users manage their Data Factory resources including gateways, connections, workspaces, and dataflows."),
            new ChatMessage(
                ChatRole.User,
                "Can you help me understand what Data Factory resources are available in my environment? I'd like to see an overview of my gateways and connections.")];

        protected static ChatResponse s_response = new();

        protected static async Task InitializeTestAsync()
        {
            // Set up the ChatConfiguration, which includes the IChatClient that all evaluators
            // in this project use to communicate with the LLM.
            s_chatConfiguration = TestSetup.GetChatConfiguration();

            StdioClientTransport mcpClientTransport = new StdioClientTransport(new StdioClientTransportOptions
            {
                Name = "DataFactory.MCP",
                Command = "dotnet",
                // Forward slashes keep this relative path working on the ubuntu-latest CI runner as well as Windows.
                Arguments = ["run", "--project", "../../../../DataFactory.MCP/DataFactory.MCP.csproj"],
            });

            s_mcpClient = await McpClient.CreateAsync(mcpClientTransport);
            s_tools = await s_mcpClient.ListToolsAsync();
            s_chatOptions = new ChatOptions
            {
                Tools = [.. s_tools],
                Temperature = 0.0f,
                ResponseFormat = ChatResponseFormat.Text
            };

            // Get the initial response using the shared messages.
            s_response = await s_chatConfiguration.ChatClient.GetResponseAsync(s_messages, s_chatOptions);
        }

        protected static async Task CleanupTestAsync()
        {
            if (s_mcpClient != null)
            {
                await s_mcpClient.DisposeAsync();
                s_mcpClient = null;
            }
        }
```
Collaborator comment on `EvaluateResponseMatchAsync`: It would be better to implement this as an `IEvaluator`.

Collaborator comment: I would recommend reading through and running the samples under https://github.com/dotnet/ai-samples/blob/main/src/microsoft-extensions-ai-evaluation/api/README.md. These samples are structured as a series of unit tests, where each test builds on concepts introduced in previous tests. Going through the READMEs, instructions, and code for these sample tests one test at a time is a good way to understand how the various APIs, concepts, and functionality in the Microsoft.Extensions.AI.Evaluation libraries work, and how this functionality can be used within your own tests to set up offline eval pipelines and reporting. For example,
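For orientation, here is a minimal sketch of what an `IEvaluator`-based response-match evaluator along the lines of the reviewer's suggestion could look like. The class name, metric name, grading prompt, and score parsing are illustrative assumptions; the PR's actual `ResponseMatchEvaluator` and its `ResponseMatchEvaluatorContext` are not included in this excerpt, and a fuller version would read the expected-response pattern and criteria out of the supplied `EvaluationContext`:

```csharp
using Microsoft.Extensions.AI;
using Microsoft.Extensions.AI.Evaluation;

namespace DataFactory.MCP.EvaluationTests;

// Illustrative sketch only; not the PR's actual ResponseMatchEvaluator.
public sealed class SketchResponseMatchEvaluator : IEvaluator
{
    public const string MatchScoreMetricName = "Response Match Score";

    public IReadOnlyCollection<string> EvaluationMetricNames => [MatchScoreMetricName];

    public async ValueTask<EvaluationResult> EvaluateAsync(
        IEnumerable<ChatMessage> messages,
        ChatResponse modelResponse,
        ChatConfiguration? chatConfiguration = null,
        IEnumerable<EvaluationContext>? additionalContext = null,
        CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(chatConfiguration);

        // Fold the conversation into a grading prompt for the judge LLM. (A fuller
        // version would also include the expected-response pattern from additionalContext.)
        string conversation = string.Join("\n", messages.Select(m => $"{m.Role}: {m.Text}"));
        var prompt = new ChatMessage(ChatRole.User,
            $"""
            On a scale of 1 to 5, how well does the response below answer the user's
            question? Reply with a single integer and nothing else.

            Conversation:
            {conversation}

            Response:
            {modelResponse.Text}
            """);

        ChatResponse grade = await chatConfiguration.ChatClient.GetResponseAsync(
            [prompt], cancellationToken: cancellationToken);

        // Report the grade as a NumericMetric; attach a diagnostic if parsing fails.
        var metric = new NumericMetric(MatchScoreMetricName);
        if (int.TryParse(grade.Text.Trim(), out int score))
        {
            metric.Value = score;
        }
        else
        {
            metric.AddDiagnostics(EvaluationDiagnostic.Error(
                $"Could not parse a score from: '{grade.Text}'"));
        }

        return new EvaluationResult(metric);
    }
}
```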
EvalTestBase.cs, continued:

```csharp
        /// <summary>
        /// Evaluates how well an actual response matches an expected response pattern using LLM-based
        /// evaluation. Internally uses the ResponseMatchEvaluator (an IEvaluator) for consistency while
        /// maintaining backward compatibility.
        /// </summary>
        /// <param name="originalMessages">The original conversation messages.</param>
        /// <param name="actualResponse">The actual AI response to evaluate.</param>
        /// <param name="expectedResponsePattern">Description of the expected response pattern.</param>
        /// <param name="evaluationCriteria">Specific criteria for evaluation (optional; a default is used if null).</param>
        /// <param name="scenarioName">Name of the scenario being evaluated (for logging purposes).</param>
        /// <param name="minimumAcceptableScore">Minimum score (1-5) required to pass the evaluation (default: 3).</param>
        /// <returns>The evaluation score (1-5), or null if evaluation failed.</returns>
        protected static async Task<int?> EvaluateResponseMatchAsync(
            List<ChatMessage> originalMessages,
            ChatResponse actualResponse,
            string expectedResponsePattern,
            string? evaluationCriteria = null,
            string scenarioName = "Response",
            int minimumAcceptableScore = 3)
        {
            // Use the IEvaluator internally for consistency.
            var evaluator = new ResponseMatchEvaluator();
            var context = new ResponseMatchEvaluatorContext(
                expectedResponsePattern,
                evaluationCriteria,
                scenarioName);

            var result = await evaluator.EvaluateAsync(
                originalMessages,
                actualResponse,
                s_chatConfiguration,
                [context]);

            // Extract the score and validate that it meets minimum expectations.
            // EvaluationResult.Metrics is keyed by metric name, so filter over its values.
            var matchScoreMetric = result.Metrics.Values.OfType<NumericMetric>()
                .FirstOrDefault(m => m.Name == ResponseMatchEvaluator.MatchScoreMetricName);

            if (matchScoreMetric?.Value is double score)
            {
                var intScore = (int)score;
                Assert.IsGreaterThanOrEqualTo(minimumAcceptableScore, intScore,
                    $"{scenarioName} should meet basic expectations. Got score: {intScore}. Explanation: {matchScoreMetric.Reason}");

                return intScore;
            }

            return null;
        }

        /// <summary>
        /// Enhanced method that uses the IEvaluator pattern with ResponseMatchEvaluator.
        /// Returns a proper EvaluationResult with structured metrics and diagnostics.
        /// This is the recommended method for new code that needs access to detailed evaluation results.
        /// </summary>
        /// <param name="originalMessages">The original conversation messages.</param>
        /// <param name="actualResponse">The actual AI response to evaluate.</param>
        /// <param name="expectedResponsePattern">Description of the expected response pattern.</param>
        /// <param name="evaluationCriteria">Specific criteria for evaluation (optional; a default is used if null).</param>
        /// <param name="scenarioName">Name of the scenario being evaluated (for logging purposes).</param>
        /// <param name="minimumAcceptableScore">Minimum score (1-5) required to pass the evaluation (default: 3).</param>
        /// <returns>The evaluation result containing the match score and metrics.</returns>
        protected static async Task<EvaluationResult> EvaluateWithIEvaluatorAsync(
            List<ChatMessage> originalMessages,
            ChatResponse actualResponse,
            string expectedResponsePattern,
            string? evaluationCriteria = null,
            string scenarioName = "Response",
            int minimumAcceptableScore = 3)
        {
            var evaluator = new ResponseMatchEvaluator();
            var context = new ResponseMatchEvaluatorContext(
                expectedResponsePattern,
                evaluationCriteria,
                scenarioName);

            var result = await evaluator.EvaluateAsync(
                originalMessages,
                actualResponse,
                s_chatConfiguration,
                [context]);

            // Validate that the result meets minimum expectations.
            var matchScoreMetric = result.Metrics.Values.OfType<NumericMetric>()
                .FirstOrDefault(m => m.Name == ResponseMatchEvaluator.MatchScoreMetricName);

            if (matchScoreMetric?.Value is double score)
            {
                // Compare as doubles so both arguments share a type.
                Assert.IsGreaterThanOrEqualTo((double)minimumAcceptableScore, score,
                    $"{scenarioName} should meet basic expectations. Got score: {score}. Explanation: {matchScoreMetric.Reason}");
            }

            return result;
        }

        /// <summary>
        /// Helper method to extract the numeric score from a ResponseMatchEvaluator result.
        /// </summary>
        /// <param name="evaluationResult">The evaluation result from ResponseMatchEvaluator.</param>
        /// <returns>The numeric score (1-5), or null if not found.</returns>
        protected static int? GetMatchScore(EvaluationResult evaluationResult)
        {
            var matchScoreMetric = evaluationResult.Metrics.Values.OfType<NumericMetric>()
                .FirstOrDefault(m => m.Name == ResponseMatchEvaluator.MatchScoreMetricName);

            return matchScoreMetric?.Value is double score ? (int)score : null;
        }
    }
}
```
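No concrete test class appears in this excerpt. As a usage illustration, a test that consumes `EvalTestBase` might look like the following; the class name, test name, and expected-response wording are invented for the example:

```csharp
using Microsoft.Extensions.AI;

namespace DataFactory.MCP.EvaluationTests;

// Hypothetical example of a test class built on EvalTestBase; not part of this diff.
[TestClass]
public class OverviewResponseTests : EvalTestBase
{
    // MSTest runs this once per class: start the MCP server, list its tools,
    // and capture the shared LLM response that every test evaluates.
    [ClassInitialize]
    public static Task ClassInit(TestContext _) => InitializeTestAsync();

    [ClassCleanup]
    public static Task Cleanup() => CleanupTestAsync();

    [TestMethod]
    public async Task Response_Describes_Gateways_And_Connections()
    {
        // Evaluate the shared response captured in InitializeTestAsync against a
        // described expectation; the assert inside fails if the graded score is below 3.
        int? score = await EvaluateResponseMatchAsync(
            [.. s_messages],
            s_response,
            expectedResponsePattern:
                "An overview of the available Data Factory gateways and connections, " +
                "offering to use the available tools to enumerate them.",
            scenarioName: "Gateway and connection overview");

        Assert.IsNotNull(score);
    }
}
```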