69 changes: 69 additions & 0 deletions .github/workflows/mcp-evals.yml
@@ -0,0 +1,69 @@
name: MCP Server Evaluations
permissions:
contents: read
checks: write
pull-requests: write

on:
push:
branches: [main, dev/**]
pull_request:
branches: [main]
workflow_dispatch:

env:
DOTNET_VERSION: "9.0.x"

jobs:
evaluate:
runs-on: ubuntu-latest
timeout-minutes: 30

steps:
- name: Checkout repository
uses: actions/checkout@v4

- name: Setup .NET
uses: actions/setup-dotnet@v4
with:
dotnet-version: ${{ env.DOTNET_VERSION }}

- name: Restore dependencies
run: dotnet restore DataFactory.MCP.EvaluationTests/DataFactory.MCP.EvaluationTests.csproj --configfile nuget.public.config

- name: Build project
run: dotnet build DataFactory.MCP.EvaluationTests/DataFactory.MCP.EvaluationTests.csproj --no-restore --configuration Release

- name: Install code coverage tools
run: dotnet tool install -g dotnet-coverage --configfile nuget.public.config

- name: Run evaluation tests
env:
EVAL_AZURE_OPENAI_ENDPOINT: ${{ secrets.EVAL_AZURE_OPENAI_ENDPOINT }}
EVAL_AZURE_OPENAI_API_KEY: ${{ secrets.EVAL_AZURE_OPENAI_API_KEY }}
EVAL_AZURE_OPENAI_MODEL: ${{ secrets.EVAL_AZURE_OPENAI_MODEL }}
run: |
dotnet test DataFactory.MCP.EvaluationTests/DataFactory.MCP.EvaluationTests.csproj \
--no-build \
--configuration Release \
--logger "trx;LogFileName=evaluation-results.trx" \
--logger "console;verbosity=detailed" \
--collect:"Code Coverage" \
--results-directory ./TestResults

- name: Publish test results
uses: dorny/test-reporter@v1
if: always()
with:
name: Evaluation Test Results
path: "./TestResults/*.trx"
reporter: dotnet-trx
fail-on-error: true

- name: Upload test results
uses: actions/upload-artifact@v4
if: always()
with:
name: evaluation-test-results
path: ./TestResults/
retention-days: 30
3 changes: 3 additions & 0 deletions .gitignore
@@ -2,3 +2,6 @@ obj/
bin/
*.sln
DataFactory.MCP.Tests/Infrastructure/.env
*node_modules/
*package-lock.json
*.vs
32 changes: 32 additions & 0 deletions DataFactory.MCP.EvaluationTests/DataFactory.MCP.EvaluationTests.csproj
@@ -0,0 +1,32 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFramework>net9.0</TargetFramework>
<LangVersion>latest</LangVersion>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>

<ItemGroup>
<PackageReference Include="Azure.AI.OpenAI" Version="2.1.0" />
<PackageReference Include="Azure.Identity" Version="1.17.0" />
<PackageReference Include="Microsoft.Extensions.AI" Version="9.10.0" />
<PackageReference Include="Microsoft.Extensions.AI.Abstractions" Version="9.10.0" />
<PackageReference Include="Microsoft.Extensions.AI.AzureAIInference" Version="9.10.0-preview.1.25513.3" />
<PackageReference Include="Microsoft.Extensions.AI.Evaluation" Version="9.10.0" />
<PackageReference Include="Microsoft.Extensions.AI.Evaluation.NLP" Version="9.10.0-preview.1.25513.3" />
<PackageReference Include="Microsoft.Extensions.AI.Evaluation.Quality" Version="9.10.0" />
<PackageReference Include="Microsoft.Extensions.AI.Evaluation.Reporting" Version="9.10.0" />
<PackageReference Include="Microsoft.Extensions.AI.OpenAI" Version="9.10.0-preview.1.25513.3" />
<PackageReference Include="ModelContextProtocol" Version="0.4.0-preview.2" />
<PackageReference Include="MSTest" Version="4.0.0-preview.25465.3" />
</ItemGroup>

<ItemGroup>
<Using Include="Microsoft.VisualStudio.TestTools.UnitTesting" />
</ItemGroup>

<ItemGroup>
<ProjectReference Include="..\DataFactory.MCP\DataFactory.MCP.csproj" />
</ItemGroup>
</Project>
37 changes: 37 additions & 0 deletions DataFactory.MCP.EvaluationTests/EnvironmentVariables.cs
@@ -0,0 +1,37 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Diagnostics.CodeAnalysis;

namespace DataFactory.MCP.EvaluationTests;

public class EnvironmentVariables
{
private static readonly IDictionary<string, string> s_environmentVariableCache = new Dictionary<string, string>();

private static string GetEnvironmentVariable(string variableName)
{
if (!s_environmentVariableCache.TryGetValue(variableName, out string? value))
{
value =
Environment.GetEnvironmentVariable(variableName) ??
throw new Exception($"Environment variable {variableName} not set.");

s_environmentVariableCache[variableName] = value;
}

return value;
}

#region Azure OpenAI
public static string AzureOpenAIEndpoint
=> GetEnvironmentVariable("EVAL_AZURE_OPENAI_ENDPOINT");

public static string AzureOpenAIAPIKey
=> GetEnvironmentVariable("EVAL_AZURE_OPENAI_API_KEY");

public static string AzureOpenAIModel
=> GetEnvironmentVariable("EVAL_AZURE_OPENAI_MODEL");
#endregion
}
181 changes: 181 additions & 0 deletions DataFactory.MCP.EvaluationTests/EvalTestBase.cs
@@ -0,0 +1,181 @@
using Microsoft.Extensions.AI;
using Microsoft.Extensions.AI.Evaluation;
using ModelContextProtocol.Client;

namespace DataFactory.MCP.EvaluationTests
{
public abstract class EvalTestBase
{
/// The <see cref="ChatConfiguration"/> below identifies the LLM endpoint used for all evaluations
/// performed in this test project. <see cref="s_chatConfiguration"/> is initialized with the value
/// returned from <see cref="TestSetup.GetChatConfiguration"/> inside <see cref="InitializeTestAsync"/>
/// below.
protected static ChatConfiguration? s_chatConfiguration;

/// The MCP client used to connect to the Data Factory MCP server
protected static McpClient? s_mcpClient;

/// The chat options containing the tools and settings for the chat client
protected static ChatOptions? s_chatOptions;

/// The tools available from the MCP server
protected static IList<McpClientTool>? s_tools;

/// All unit tests in this test project evaluate the LLM's response to Data Factory management queries.
///
/// We invoke the LLM once inside <see cref="InitializeTestAsync"/> below to get a response to the question in
/// <see cref="s_messages"/> and store that response in the static variable <see cref="s_response"/>. Each unit
/// test then performs a different evaluation on the same stored response.

protected static readonly IList<ChatMessage> s_messages = [
new ChatMessage(
ChatRole.System,
"You are a helpful Microsoft Data Factory assistant. Use the available tools to help users manage their Data Factory resources including gateways, connections, workspaces, and dataflows."),
new ChatMessage(
ChatRole.User,
"Can you help me understand what Data Factory resources are available in my environment? I'd like to see an overview of my gateways and connections.")];

protected static ChatResponse s_response = new();

protected static async Task InitializeTestAsync()
{
/// Set up the <see cref="ChatConfiguration"/> which includes the <see cref="IChatClient"/> that all the
/// evaluators used in this test project will use to communicate with the LLM.
s_chatConfiguration = TestSetup.GetChatConfiguration();

StdioClientTransport mcpClientTransport = new StdioClientTransport(new StdioClientTransportOptions
{
Name = "DataFactory.MCP",
Command = "dotnet",
Arguments = ["run", "--project", "..\\..\\..\\..\\DataFactory.MCP\\DataFactory.MCP.csproj"],
});

s_mcpClient = await McpClient.CreateAsync(mcpClientTransport);
s_tools = await s_mcpClient.ListToolsAsync();
s_chatOptions = new ChatOptions
{
Tools = [.. s_tools],
Temperature = 0.0f,
ResponseFormat = ChatResponseFormat.Text
};

// Get the initial response using the shared messages
s_response = await s_chatConfiguration.ChatClient.GetResponseAsync(s_messages, s_chatOptions);
}

protected static async Task CleanupTestAsync()
{
if (s_mcpClient != null)
{
await s_mcpClient.DisposeAsync();
s_mcpClient = null;
}
}

/// <summary>
/// Generic method to evaluate how well an actual response matches an expected response pattern using LLM-based evaluation.
/// Now internally uses the ResponseMatchEvaluator (IEvaluator) for consistency while maintaining backward compatibility.
/// </summary>
/// <param name="originalMessages">The original conversation messages</param>
/// <param name="actualResponse">The actual AI response to evaluate</param>
/// <param name="expectedResponsePattern">Description of the expected response pattern</param>
/// <param name="evaluationCriteria">Specific criteria for evaluation (optional, will use default if null)</param>
/// <param name="scenarioName">Name of the scenario being evaluated (for logging purposes)</param>
/// <param name="minimumAcceptableScore">Minimum score (1-5) to pass the evaluation (default: 3)</param>
/// <returns>The evaluation score (1-5) or null if evaluation failed</returns>
protected static async Task<int?> EvaluateResponseMatchAsync(
Collaborator
It would be better to implement this as an IEvaluator. This function would essentially become IEvaluator.EvaluateAsync() for your evaluator. This function could then return an EvaluationResult that includes one or more metrics (in your case it could be a single NumericMetric with Name like "Match Score").
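
For illustration only (editorial addition, not part of the diff): a minimal sketch of the shape such an evaluator could take, assuming the IEvaluator, NumericMetric, and EvaluationResult APIs from Microsoft.Extensions.AI.Evaluation and the usings already present in EvalTestBase.cs. The class name, grading prompt, and score parsing are placeholders only.

public sealed class ResponseMatchEvaluatorSketch : IEvaluator
{
    public const string MatchScoreMetricName = "Match Score";

    public IReadOnlyCollection<string> EvaluationMetricNames => [MatchScoreMetricName];

    public async ValueTask<EvaluationResult> EvaluateAsync(
        IEnumerable<ChatMessage> messages,
        ChatResponse modelResponse,
        ChatConfiguration? chatConfiguration = null,
        IEnumerable<EvaluationContext>? additionalContext = null,
        CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(chatConfiguration);

        // A real implementation would read the expected response pattern and criteria from an
        // EvaluationContext supplied via additionalContext; this sketch grades the response directly.
        string gradingPrompt =
            "Rate from 1 to 5 how well the following response answers the user's request. " +
            "Reply with a single digit only.\n\nResponse:\n" + modelResponse.Text;

        List<ChatMessage> gradingMessages = [new ChatMessage(ChatRole.User, gradingPrompt)];
        ChatResponse grade = await chatConfiguration.ChatClient.GetResponseAsync(
            gradingMessages, cancellationToken: cancellationToken);

        // Report the result as a single NumericMetric named "Match Score", as suggested above.
        var metric = new NumericMetric(MatchScoreMetricName);
        if (int.TryParse(grade.Text.Trim(), out int score))
        {
            metric.Value = score;
        }
        else
        {
            metric.Reason = "The grading model did not return a parsable 1-5 score.";
        }

        return new EvaluationResult(metric);
    }
}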

Collaborator
I would recommend reading through and running the samples under https://github.com/dotnet/ai-samples/blob/main/src/microsoft-extensions-ai-evaluation/api/README.md. These samples are structured as a series of unit tests where each test builds upon concepts introduced in previous tests.

I would recommend going through the READMEs, instructions and code for these sample tests one test at a time to understand how the various APIs, concepts and functionality in the Microsoft.Extensions.AI.Evaluation libraries work, and how this functionality can be used within your tests to set up your own offline eval pipelines and reporting.

For example,

List<ChatMessage> originalMessages,
ChatResponse actualResponse,
string expectedResponsePattern,
string? evaluationCriteria = null,
string scenarioName = "Response",
int minimumAcceptableScore = 3)
{
// Use the IEvaluator internally for consistency
var evaluator = new ResponseMatchEvaluator();
var context = new ResponseMatchEvaluatorContext(
expectedResponsePattern,
evaluationCriteria,
scenarioName);

var result = await evaluator.EvaluateAsync(
originalMessages,
actualResponse,
s_chatConfiguration,
[context]);

// Extract the score and validate it meets minimum expectations
var matchScoreMetric = result.Metrics.OfType<NumericMetric>()

Check warning on line 108 in DataFactory.MCP.EvaluationTests/EvalTestBase.cs (GitHub Actions / evaluate, reported 4 times): This call will always result in an empty sequence because type 'System.Collections.Generic.KeyValuePair<string, Microsoft.Extensions.AI.Evaluation.EvaluationMetric>' is incompatible with type 'Microsoft.Extensions.AI.Evaluation.NumericMetric' (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca2021)
.FirstOrDefault(m => m.Name == ResponseMatchEvaluator.MatchScoreMetricName);

if (matchScoreMetric?.Value is double score)
{
var intScore = (int)score;
Assert.IsGreaterThanOrEqualTo(minimumAcceptableScore, intScore,
$"{scenarioName} should meet basic expectations. Got score: {intScore}. Explanation: {matchScoreMetric.Reason}");

return intScore;
}

return null;
}
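
// Editorial sketch (not part of the diff): the CA2021 warnings above indicate that
// EvaluationResult.Metrics enumerates KeyValuePair<string, EvaluationMetric> entries, so
// OfType<NumericMetric>() on the collection itself always yields an empty sequence. Assuming
// Metrics is a string-keyed dictionary that exposes a Values collection, a lookup along these
// lines would actually find the metric (the helper name is hypothetical):
private static NumericMetric? TryFindMatchScoreMetric(EvaluationResult result) =>
    result.Metrics.Values
        .OfType<NumericMetric>()
        .FirstOrDefault(m => m.Name == ResponseMatchEvaluator.MatchScoreMetricName);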

/// <summary>
/// Enhanced method that uses the IEvaluator pattern with ResponseMatchEvaluator.
/// Returns a proper EvaluationResult with structured metrics and diagnostics.
/// This is the recommended method for new code that needs access to detailed evaluation results.
/// </summary>
/// <param name="originalMessages">The original conversation messages</param>
/// <param name="actualResponse">The actual AI response to evaluate</param>
/// <param name="expectedResponsePattern">Description of the expected response pattern</param>
/// <param name="evaluationCriteria">Specific criteria for evaluation (optional, will use default if null)</param>
/// <param name="scenarioName">Name of the scenario being evaluated (for logging purposes)</param>
/// <param name="minimumAcceptableScore">Minimum score (1-5) to pass the evaluation (default: 3)</param>
/// <returns>The evaluation result containing the match score and metrics</returns>
protected static async Task<EvaluationResult> EvaluateWithIEvaluatorAsync(
List<ChatMessage> originalMessages,
ChatResponse actualResponse,
string expectedResponsePattern,
string? evaluationCriteria = null,
string scenarioName = "Response",
int minimumAcceptableScore = 3)
{
var evaluator = new ResponseMatchEvaluator();
var context = new ResponseMatchEvaluatorContext(
expectedResponsePattern,
evaluationCriteria,
scenarioName);

var result = await evaluator.EvaluateAsync(
originalMessages,
actualResponse,
s_chatConfiguration,
[context]);

// Validate the result meets minimum expectations
var matchScoreMetric = result.Metrics.OfType<NumericMetric>()

Check warning on line 156 in DataFactory.MCP.EvaluationTests/EvalTestBase.cs (GitHub Actions / evaluate, reported 4 times): This call will always result in an empty sequence because type 'System.Collections.Generic.KeyValuePair<string, Microsoft.Extensions.AI.Evaluation.EvaluationMetric>' is incompatible with type 'Microsoft.Extensions.AI.Evaluation.NumericMetric' (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca2021)
.FirstOrDefault(m => m.Name == ResponseMatchEvaluator.MatchScoreMetricName);

if (matchScoreMetric?.Value is double score)
{
Assert.IsGreaterThanOrEqualTo(minimumAcceptableScore, score,
$"{scenarioName} should meet basic expectations. Got score: {score}. Explanation: {matchScoreMetric.Reason}");
}

return result;
}

/// <summary>
/// Helper method to extract the numeric score from a ResponseMatchEvaluator result.
/// </summary>
/// <param name="evaluationResult">The evaluation result from ResponseMatchEvaluator</param>
/// <returns>The numeric score (1-5) or null if not found</returns>
protected static int? GetMatchScore(EvaluationResult evaluationResult)
{
var matchScoreMetric = evaluationResult.Metrics.OfType<NumericMetric>()

Check warning on line 175 in DataFactory.MCP.EvaluationTests/EvalTestBase.cs (GitHub Actions / evaluate, reported 4 times): This call will always result in an empty sequence because type 'System.Collections.Generic.KeyValuePair<string, Microsoft.Extensions.AI.Evaluation.EvaluationMetric>' is incompatible with type 'Microsoft.Extensions.AI.Evaluation.NumericMetric' (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca2021)
.FirstOrDefault(m => m.Name == ResponseMatchEvaluator.MatchScoreMetricName);

return matchScoreMetric?.Value is double score ? (int)score : null;
}
}
}
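
For context (editorial addition, not part of the diff): the comments in EvalTestBase describe a pattern where the LLM is invoked once per test class and each test evaluates the same stored response. A hypothetical derived test class using that pattern could look like the sketch below, assuming the same namespace and usings as EvalTestBase.cs; the class, test, and scenario names are illustrative only.

[TestClass]
public class DataFactoryOverviewEvalTests : EvalTestBase
{
    // Run the shared setup (MCP client, tools, initial LLM response) once for the whole class.
    [ClassInitialize]
    public static Task InitAsync(TestContext context) => InitializeTestAsync();

    [ClassCleanup]
    public static Task CleanupAsync() => CleanupTestAsync();

    [TestMethod]
    public async Task Response_CoversGatewaysAndConnections()
    {
        // Evaluate the shared response against an expected pattern; the base method asserts
        // that the score meets the default minimum of 3.
        int? score = await EvaluateResponseMatchAsync(
            [.. s_messages],
            s_response,
            expectedResponsePattern: "An overview of the user's Data Factory gateways and connections",
            scenarioName: "Data Factory overview");

        Assert.IsNotNull(score, "Expected the evaluator to produce a numeric match score.");
    }
}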