microsoft · saibulusu · May 15, 2026 · Mar 3, 2026 · Mar 20, 2026 · May 7, 2026
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-3.2.4
+3.2.5
diff --git a/...rtualClient/VirtualClient.Actions.UnitTests/SuperBenchmark/SuperBenchmarkExecutorTests.cs b/...rtualClient/VirtualClient.Actions.UnitTests/SuperBenchmark/SuperBenchmarkExecutorTests.cs
@@ -149,7 +149,7 @@ public async Task SuperBenchmarkExecutorUsesTheExpectedScriptFilesOnExecution()
         public async Task SuperBenchmarkExecutorDeploySuperBenchContainer()
         {
             ProcessStartInfo expectedInfo = new ProcessStartInfo();
-            string expectedCommand = $"sb deploy --host-list localhost -i testContainer";
+            string expectedCommand = $"bash -c \"source ./venv/bin/activate && sb deploy --host-list localhost -i testContainer\"";
 
             bool commandExecuted = false;
             this.mockFixture.ProcessManager.OnCreateProcess = (exe, arguments, workingDir) =>
@@ -184,7 +184,7 @@ public async Task SuperBenchmarkExecutorDeploySuperBenchContainer()
         public async Task SuperBenchmarkExecutorRunsTheExpectedWorkloadCommand()
         {
             ProcessStartInfo expectedInfo = new ProcessStartInfo();
-            string expectedCommand = $"sb run --host-list localhost -c Test.yaml";
+            string expectedCommand = $"bash -c \"source ./venv/bin/activate && sb run --host-list localhost -c Test.yaml\"";
 
             bool commandExecuted = false;
             this.mockFixture.ProcessManager.OnCreateProcess = (exe, arguments, workingDir) =>
@@ -224,8 +224,8 @@ public async Task SuperBenchmarkExecutorExecutesTheCorrectCommandsWithInstallati
                 $"sudo chmod -R 2777 \"{this.mockFixture.PlatformSpecifics.CurrentDirectory}\"",
                 $"sudo git clone -b v0.0.1 https://github.com/microsoft/superbenchmark",
                 $"sudo bash initialize.sh testuser",
-                $"sb deploy --host-list localhost -i testContainer",
-                $"sb run --host-list localhost -c Test.yaml"
+                $"bash -c \"source ./venv/bin/activate && sb deploy --host-list localhost -i testContainer\"",
+                $"bash -c \"source ./venv/bin/activate && sb run --host-list localhost -c Test.yaml\""
             };
 
             int processCount = 0;
@@ -278,8 +278,8 @@ public async Task SuperBenchmarkExecutorExecutesTheCorrectCommandsWithInstallati
                 $"sudo chmod -R 2777 \"{this.mockFixture.PlatformSpecifics.CurrentDirectory}\"",
                 $"sudo git clone -b v0.0.1 https://github.com/microsoft/superbenchmark",
                 $"sudo bash initialize.sh testuser /docker/path",
-                $"sb deploy --host-list localhost -i testContainer",
-                $"sb run --host-list localhost -c Test.yaml"
+                $"bash -c \"source ./venv/bin/activate && sb deploy --host-list localhost -i testContainer\"",
+                $"bash -c \"source ./venv/bin/activate && sb run --host-list localhost -c Test.yaml\""
             };
 
             int processCount = 0;
@@ -320,7 +320,7 @@ public async Task SuperBenchmarkExecutorSkipsInitializationOfTheWorkloadForExecu
             ProcessStartInfo expectedInfo = new ProcessStartInfo();
             List<string> expectedCommands = new List<string>
             {
-                $"sb run --host-list localhost -c Test.yaml"
+                $"bash -c \"source ./venv/bin/activate && sb run --host-list localhost -c Test.yaml\""
             };
 
             int processCount = 0;

diff --git a/src/VirtualClient/VirtualClient.Actions/SuperBenchmark/SuperBenchmarkExecutor.cs b/src/VirtualClient/VirtualClient.Actions/SuperBenchmark/SuperBenchmarkExecutor.cs
@@ -20,7 +20,7 @@ namespace VirtualClient.Actions
     /// <summary>
     /// The SuperBenchmark workload executor.
     /// </summary>
-    [SupportedPlatforms("linux-x64", true)]
+    [SupportedPlatforms("linux-x64,linux-arm64", true)]
     public class SuperBenchmarkExecutor : VirtualClientComponent
     {
         private const string SuperBenchmarkRunShell = "RunSuperBenchmark.sh";
@@ -139,8 +139,9 @@ protected override async Task ExecuteAsync(EventContext telemetryContext, Cancel
             using (BackgroundOperations profiling = BackgroundOperations.BeginProfiling(this, cancellationToken))
             {
                 string commandArguments = this.GetCommandLineArguments();
+                string commandWithVenv = $"-c \"source ./venv/bin/activate && sb {commandArguments}\"";
 
-                using (IProcessProxy process = await this.ExecuteCommandAsync("sb", commandArguments, this.SuperBenchmarkDirectory, telemetryContext, cancellationToken, runElevated: false))
+                using (IProcessProxy process = await this.ExecuteCommandAsync("bash", commandWithVenv, this.SuperBenchmarkDirectory, telemetryContext, cancellationToken, runElevated: false))
                 {
                     if (!cancellationToken.IsCancellationRequested)
                     {
@@ -166,7 +167,6 @@ protected override async Task InitializeAsync(EventContext telemetryContext, Can
 
             if (!state.SuperBenchmarkInitialized)
             {
-                // This is to grant directory folders for 
                 await this.systemManager.MakeFilesExecutableAsync(this.PlatformSpecifics.CurrentDirectory, this.Platform, cancellationToken);
 
                 string cloneDir = this.PlatformSpecifics.Combine(this.PlatformSpecifics.PackagesDirectory, "superbenchmark");
@@ -191,7 +191,8 @@ protected override async Task InitializeAsync(EventContext telemetryContext, Can
                 }
 
                 await this.ExecuteSbCommandAsync("bash", initializeArgs, this.SuperBenchmarkDirectory, telemetryContext, cancellationToken, true);
-                await this.ExecuteSbCommandAsync("sb", $"deploy --host-list localhost -i {this.ContainerVersion}", this.SuperBenchmarkDirectory, telemetryContext, cancellationToken, false);
+                string commandWithVenv = $"-c \"source ./venv/bin/activate && sb deploy --host-list localhost -i {this.ContainerVersion}\"";
+                await this.ExecuteSbCommandAsync("bash", commandWithVenv, this.SuperBenchmarkDirectory, telemetryContext, cancellationToken, false);
 
                 state.SuperBenchmarkInitialized = true;
             }

diff --git a/src/VirtualClient/VirtualClient.Actions/SuperBenchmark/default.yaml b/src/VirtualClient/VirtualClient.Actions/SuperBenchmark/default.yaml
@@ -1,54 +1,265 @@
 # SuperBench Config
-version: v0.8
+version: v0.12
 superbench:
   enable: null
+  monitor:
+    enable: true
+    sample_duration: 1
+    sample_interval: 10
   var:
     default_local_mode: &default_local_mode
       enable: true
       modes:
         - name: local
-          proc_num: 1
+          proc_num: 8
           prefix: CUDA_VISIBLE_DEVICES={proc_rank}
           parallel: yes
     default_pytorch_mode: &default_pytorch_mode
       enable: true
       modes:
         - name: torch.distributed
-          proc_num: 1
+          proc_num: 8
           node_num: 1
       frameworks:
         - pytorch
     common_model_config: &common_model_config
       duration: 0
       num_warmup: 16
       num_steps: 128
+      batch_size: 1
       precision:
         - float32
         - float16
       model_action:
         - train
   benchmarks:
+    gpu-burn:
+      enable: true
+      modes:
+        - name: local
+          proc_num: 1
+          parallel: no
+      parameters:
+        time: 300
+        doubles: true
+        tensor_core: true
+    nccl-bw:default:
+      enable: true
+      modes:
+        - name: local
+          proc_num: 1
+          parallel: no
+      parameters:
+        ngpus: 8
+    nccl-bw:gdr-only:
+      enable: true
+      modes:
+        - name: local
+          proc_num: 1
+          parallel: no
+          env:
+            NCCL_IB_PCI_RELAXED_ORDERING: '1'
+            NCCL_NET_GDR_LEVEL: '5'
+            NCCL_P2P_DISABLE: '1'
+            NCCL_SHM_DISABLE: '1'
+            NCCL_MIN_NCHANNELS: '16'
+            NCCL_IB_DISABLE: '0'
+      parameters:
+        ngpus: 8
+    ib-loopback:
+      enable: true
+      modes:
+        - name: local
+          proc_num: 4
+          prefix: PROC_RANK={proc_rank} IB_DEVICES=0,2,4,6 NUMA_NODES=1,0,3,2
+          parallel: yes
+        - name: local
+          proc_num: 4
+          prefix: PROC_RANK={proc_rank} IB_DEVICES=1,3,5,7 NUMA_NODES=1,0,3,2
+          parallel: yes
+    disk-benchmark:
+      enable: false
+      modes:
+        - name: local
+          proc_num: 1
+          parallel: no
+      parameters:
+        block_devices:
+          - /dev/nvme0n1
+    cpu-memory-bw-latency:
+      enable: false
+      modes:
+        - name: local
+          proc_num: 1
+          parallel: no
+      parameters:
+        tests:
+          - bandwidth_matrix
+          - latency_matrix
+          - max_bandwidth
+    mem-bw:
+      enable: true
+      modes:
+        - name: local
+          proc_num: 8
+          prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
+          parallel: no
+    gpu-copy-bw:correctness:
+      enable: true
+      modes:
+        - name: local
+          parallel: no
+      parameters:
+        mem_type:
+          - htod
+          - dtoh
+          - dtod
+        copy_type:
+          - sm
+          - dma
+        size: 4096
+        num_warm_up: 0
+        num_loops: 1
+        check_data: true
+    gpu-copy-bw:perf:
+      enable: true
+      modes:
+        - name: local
+          parallel: no
+      parameters:
+        mem_type:
+          - htod
+          - dtoh
+          - dtod
+        copy_type:
+          - sm
+          - dma
+    nvbandwidth:
+      enable: true
+      modes:
+        - name: local
+          parallel: no
+      parameters:
+        buffer_size: 128
+        test_cases:
+          - host_to_device_memcpy_ce
+          - device_to_host_memcpy_ce
+          - host_to_device_memcpy_sm
+          - device_to_host_memcpy_sm
+        num_loops: 18
+        skip_verification: false
+        disable_affinity: false
+        use_mean: false
+    kernel-launch:
+      <<: *default_local_mode
+    gemm-flops:
+      <<: *default_local_mode
+    cudnn-function:
+      <<: *default_local_mode
+    cublas-function:
+      <<: *default_local_mode
+    matmul:
+      <<: *default_local_mode
+      frameworks:
+        - pytorch
+    sharding-matmul:
+      <<: *default_pytorch_mode
+    computation-communication-overlap:
+      <<: *default_pytorch_mode
+    ib-traffic:
+      enable: false
+      modes:
+        - name: mpi
+          proc_num: 8
+      parameters:
+        msg_size: 8388608
+        ib_dev: mlx5_$LOCAL_RANK
+        gpu_dev: $LOCAL_RANK
+        numa_dev: $((LOCAL_RANK/2))
+    gpcnet-network-test:
+      enable: false
+      modes:
+        - name: mpi
+          proc_num: 1
+          mca:
+            pml: ucx
+            btl: ^uct
+            btl_tcp_if_include: eth0
+          env:
+            UCX_NET_DEVICES: mlx5_0:1
+    gpcnet-network-load-test:
+      enable: false
+      modes:
+        - name: mpi
+          proc_num: 1
+          mca:
+            pml: ucx
+            btl: ^uct
+            btl_tcp_if_include: eth0
+          env:
+            UCX_NET_DEVICES: mlx5_0:1
+    tcp-connectivity:
+      enable: false
+      modes:
+        - name: local
+          parallel: no
+      parameters:
+        port: 22
+    ort-inference:
+      <<: *default_local_mode
+      parameters:
+        batch_size: 1
+    tensorrt-inference:
+      <<: *default_local_mode
+      parameters:
+        pytorch_models:
+          - resnet50
+          - resnet101
+          - resnet152
+          - densenet169
+          - densenet201
+          - bert-base
+          - bert-large
+        seq_length: 224
+        batch_size: 1
+        precision: int8
+    megatron-gpt:
+      modes:
+      - name: mpi
+        proc_num: 1
+        node_num: all
+      parameters:
+        code_base: /opt/superbench/third_party/Megatron/Megatron-DeepSpeed/
+        dataset_url: https://huggingface.co/datasets/suolyer/pile_bookcorpus2/raw/main/test.json
+        batch_size: 2048
+        num_warmup: 0
+        num_steps: 10
+        precision:
+          - float16
+          - bfloat16
+        deepspeed: yes
+        sequence_parallel: yes
+        use_rotary_position_embeddings: yes
     gpt_models:
       <<: *default_pytorch_mode
       models:
         - gpt2-small
+        - gpt2-large
       parameters:
         <<: *common_model_config
-        batch_size: 1
     bert_models:
       <<: *default_pytorch_mode
       models:
         - bert-base
+        - bert-large
       parameters:
         <<: *common_model_config
-        batch_size: 2
     lstm_models:
       <<: *default_pytorch_mode
       models:
         - lstm
       parameters:
         <<: *common_model_config
-        batch_size: 32
     resnet_models:
       <<: *default_pytorch_mode
       models:
@@ -57,15 +268,13 @@ superbench:
         - resnet152
       parameters:
         <<: *common_model_config
-        batch_size: 32
     densenet_models:
       <<: *default_pytorch_mode
       models:
         - densenet169
         - densenet201
       parameters:
         <<: *common_model_config
-        batch_size: 32
     vgg_models:
       <<: *default_pytorch_mode
       models:
@@ -74,5 +283,4 @@ superbench:
         - vgg16
         - vgg19
       parameters:
-        <<: *common_model_config
-        batch_size: 32
+        <<: *common_model_config