diff --git a/VERSION b/VERSION index 9b7a431d9f..448ada3bda 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.2.4 \ No newline at end of file +3.2.5 \ No newline at end of file diff --git a/src/VirtualClient/VirtualClient.Actions.UnitTests/SuperBenchmark/SuperBenchmarkExecutorTests.cs b/src/VirtualClient/VirtualClient.Actions.UnitTests/SuperBenchmark/SuperBenchmarkExecutorTests.cs index 92523d692c..265505b5d6 100644 --- a/src/VirtualClient/VirtualClient.Actions.UnitTests/SuperBenchmark/SuperBenchmarkExecutorTests.cs +++ b/src/VirtualClient/VirtualClient.Actions.UnitTests/SuperBenchmark/SuperBenchmarkExecutorTests.cs @@ -149,7 +149,7 @@ public async Task SuperBenchmarkExecutorUsesTheExpectedScriptFilesOnExecution() public async Task SuperBenchmarkExecutorDeploySuperBenchContainer() { ProcessStartInfo expectedInfo = new ProcessStartInfo(); - string expectedCommand = $"sb deploy --host-list localhost -i testContainer"; + string expectedCommand = $"bash -c \"source ./venv/bin/activate && sb deploy --host-list localhost -i testContainer\""; bool commandExecuted = false; this.mockFixture.ProcessManager.OnCreateProcess = (exe, arguments, workingDir) => @@ -184,7 +184,7 @@ public async Task SuperBenchmarkExecutorDeploySuperBenchContainer() public async Task SuperBenchmarkExecutorRunsTheExpectedWorkloadCommand() { ProcessStartInfo expectedInfo = new ProcessStartInfo(); - string expectedCommand = $"sb run --host-list localhost -c Test.yaml"; + string expectedCommand = $"bash -c \"source ./venv/bin/activate && sb run --host-list localhost -c Test.yaml\""; bool commandExecuted = false; this.mockFixture.ProcessManager.OnCreateProcess = (exe, arguments, workingDir) => @@ -224,8 +224,8 @@ public async Task SuperBenchmarkExecutorExecutesTheCorrectCommandsWithInstallati $"sudo chmod -R 2777 \"{this.mockFixture.PlatformSpecifics.CurrentDirectory}\"", $"sudo git clone -b v0.0.1 https://github.com/microsoft/superbenchmark", $"sudo bash initialize.sh testuser", - $"sb deploy --host-list localhost -i testContainer", - $"sb run --host-list localhost -c Test.yaml" + $"bash -c \"source ./venv/bin/activate && sb deploy --host-list localhost -i testContainer\"", + $"bash -c \"source ./venv/bin/activate && sb run --host-list localhost -c Test.yaml\"" }; int processCount = 0; @@ -278,8 +278,8 @@ public async Task SuperBenchmarkExecutorExecutesTheCorrectCommandsWithInstallati $"sudo chmod -R 2777 \"{this.mockFixture.PlatformSpecifics.CurrentDirectory}\"", $"sudo git clone -b v0.0.1 https://github.com/microsoft/superbenchmark", $"sudo bash initialize.sh testuser /docker/path", - $"sb deploy --host-list localhost -i testContainer", - $"sb run --host-list localhost -c Test.yaml" + $"bash -c \"source ./venv/bin/activate && sb deploy --host-list localhost -i testContainer\"", + $"bash -c \"source ./venv/bin/activate && sb run --host-list localhost -c Test.yaml\"" }; int processCount = 0; @@ -320,7 +320,7 @@ public async Task SuperBenchmarkExecutorSkipsInitializationOfTheWorkloadForExecu ProcessStartInfo expectedInfo = new ProcessStartInfo(); List expectedCommands = new List { - $"sb run --host-list localhost -c Test.yaml" + $"bash -c \"source ./venv/bin/activate && sb run --host-list localhost -c Test.yaml\"" }; int processCount = 0; diff --git a/src/VirtualClient/VirtualClient.Actions/SuperBenchmark/SuperBenchmarkExecutor.cs b/src/VirtualClient/VirtualClient.Actions/SuperBenchmark/SuperBenchmarkExecutor.cs index 1010966216..ca827c261c 100644 --- a/src/VirtualClient/VirtualClient.Actions/SuperBenchmark/SuperBenchmarkExecutor.cs +++ b/src/VirtualClient/VirtualClient.Actions/SuperBenchmark/SuperBenchmarkExecutor.cs @@ -20,7 +20,7 @@ namespace VirtualClient.Actions /// /// The SuperBenchmark workload executor. /// - [SupportedPlatforms("linux-x64", true)] + [SupportedPlatforms("linux-x64,linux-arm64", true)] public class SuperBenchmarkExecutor : VirtualClientComponent { private const string SuperBenchmarkRunShell = "RunSuperBenchmark.sh"; @@ -139,8 +139,9 @@ protected override async Task ExecuteAsync(EventContext telemetryContext, Cancel using (BackgroundOperations profiling = BackgroundOperations.BeginProfiling(this, cancellationToken)) { string commandArguments = this.GetCommandLineArguments(); + string commandWithVenv = $"-c \"source ./venv/bin/activate && sb {commandArguments}\""; - using (IProcessProxy process = await this.ExecuteCommandAsync("sb", commandArguments, this.SuperBenchmarkDirectory, telemetryContext, cancellationToken, runElevated: false)) + using (IProcessProxy process = await this.ExecuteCommandAsync("bash", commandWithVenv, this.SuperBenchmarkDirectory, telemetryContext, cancellationToken, runElevated: false)) { if (!cancellationToken.IsCancellationRequested) { @@ -166,7 +167,6 @@ protected override async Task InitializeAsync(EventContext telemetryContext, Can if (!state.SuperBenchmarkInitialized) { - // This is to grant directory folders for await this.systemManager.MakeFilesExecutableAsync(this.PlatformSpecifics.CurrentDirectory, this.Platform, cancellationToken); string cloneDir = this.PlatformSpecifics.Combine(this.PlatformSpecifics.PackagesDirectory, "superbenchmark"); @@ -191,7 +191,8 @@ protected override async Task InitializeAsync(EventContext telemetryContext, Can } await this.ExecuteSbCommandAsync("bash", initializeArgs, this.SuperBenchmarkDirectory, telemetryContext, cancellationToken, true); - await this.ExecuteSbCommandAsync("sb", $"deploy --host-list localhost -i {this.ContainerVersion}", this.SuperBenchmarkDirectory, telemetryContext, cancellationToken, false); + string commandWithVenv = $"-c \"source ./venv/bin/activate && sb deploy --host-list localhost -i {this.ContainerVersion}\""; + await this.ExecuteSbCommandAsync("bash", commandWithVenv, this.SuperBenchmarkDirectory, telemetryContext, cancellationToken, false); state.SuperBenchmarkInitialized = true; } diff --git a/src/VirtualClient/VirtualClient.Actions/SuperBenchmark/default.yaml b/src/VirtualClient/VirtualClient.Actions/SuperBenchmark/default.yaml index b2f75f655c..b36cbb5b97 100644 --- a/src/VirtualClient/VirtualClient.Actions/SuperBenchmark/default.yaml +++ b/src/VirtualClient/VirtualClient.Actions/SuperBenchmark/default.yaml @@ -1,20 +1,24 @@ # SuperBench Config -version: v0.8 +version: v0.12 superbench: enable: null + monitor: + enable: true + sample_duration: 1 + sample_interval: 10 var: default_local_mode: &default_local_mode enable: true modes: - name: local - proc_num: 1 + proc_num: 8 prefix: CUDA_VISIBLE_DEVICES={proc_rank} parallel: yes default_pytorch_mode: &default_pytorch_mode enable: true modes: - name: torch.distributed - proc_num: 1 + proc_num: 8 node_num: 1 frameworks: - pytorch @@ -22,33 +26,240 @@ superbench: duration: 0 num_warmup: 16 num_steps: 128 + batch_size: 1 precision: - float32 - float16 model_action: - train benchmarks: + gpu-burn: + enable: true + modes: + - name: local + proc_num: 1 + parallel: no + parameters: + time: 300 + doubles: true + tensor_core: true + nccl-bw:default: + enable: true + modes: + - name: local + proc_num: 1 + parallel: no + parameters: + ngpus: 8 + nccl-bw:gdr-only: + enable: true + modes: + - name: local + proc_num: 1 + parallel: no + env: + NCCL_IB_PCI_RELAXED_ORDERING: '1' + NCCL_NET_GDR_LEVEL: '5' + NCCL_P2P_DISABLE: '1' + NCCL_SHM_DISABLE: '1' + NCCL_MIN_NCHANNELS: '16' + NCCL_IB_DISABLE: '0' + parameters: + ngpus: 8 + ib-loopback: + enable: true + modes: + - name: local + proc_num: 4 + prefix: PROC_RANK={proc_rank} IB_DEVICES=0,2,4,6 NUMA_NODES=1,0,3,2 + parallel: yes + - name: local + proc_num: 4 + prefix: PROC_RANK={proc_rank} IB_DEVICES=1,3,5,7 NUMA_NODES=1,0,3,2 + parallel: yes + disk-benchmark: + enable: false + modes: + - name: local + proc_num: 1 + parallel: no + parameters: + block_devices: + - /dev/nvme0n1 + cpu-memory-bw-latency: + enable: false + modes: + - name: local + proc_num: 1 + parallel: no + parameters: + tests: + - bandwidth_matrix + - latency_matrix + - max_bandwidth + mem-bw: + enable: true + modes: + - name: local + proc_num: 8 + prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2)) + parallel: no + gpu-copy-bw:correctness: + enable: true + modes: + - name: local + parallel: no + parameters: + mem_type: + - htod + - dtoh + - dtod + copy_type: + - sm + - dma + size: 4096 + num_warm_up: 0 + num_loops: 1 + check_data: true + gpu-copy-bw:perf: + enable: true + modes: + - name: local + parallel: no + parameters: + mem_type: + - htod + - dtoh + - dtod + copy_type: + - sm + - dma + nvbandwidth: + enable: true + modes: + - name: local + parallel: no + parameters: + buffer_size: 128 + test_cases: + - host_to_device_memcpy_ce + - device_to_host_memcpy_ce + - host_to_device_memcpy_sm + - device_to_host_memcpy_sm + num_loops: 18 + skip_verification: false + disable_affinity: false + use_mean: false + kernel-launch: + <<: *default_local_mode + gemm-flops: + <<: *default_local_mode + cudnn-function: + <<: *default_local_mode + cublas-function: + <<: *default_local_mode + matmul: + <<: *default_local_mode + frameworks: + - pytorch + sharding-matmul: + <<: *default_pytorch_mode + computation-communication-overlap: + <<: *default_pytorch_mode + ib-traffic: + enable: false + modes: + - name: mpi + proc_num: 8 + parameters: + msg_size: 8388608 + ib_dev: mlx5_$LOCAL_RANK + gpu_dev: $LOCAL_RANK + numa_dev: $((LOCAL_RANK/2)) + gpcnet-network-test: + enable: false + modes: + - name: mpi + proc_num: 1 + mca: + pml: ucx + btl: ^uct + btl_tcp_if_include: eth0 + env: + UCX_NET_DEVICES: mlx5_0:1 + gpcnet-network-load-test: + enable: false + modes: + - name: mpi + proc_num: 1 + mca: + pml: ucx + btl: ^uct + btl_tcp_if_include: eth0 + env: + UCX_NET_DEVICES: mlx5_0:1 + tcp-connectivity: + enable: false + modes: + - name: local + parallel: no + parameters: + port: 22 + ort-inference: + <<: *default_local_mode + parameters: + batch_size: 1 + tensorrt-inference: + <<: *default_local_mode + parameters: + pytorch_models: + - resnet50 + - resnet101 + - resnet152 + - densenet169 + - densenet201 + - bert-base + - bert-large + seq_length: 224 + batch_size: 1 + precision: int8 + megatron-gpt: + modes: + - name: mpi + proc_num: 1 + node_num: all + parameters: + code_base: /opt/superbench/third_party/Megatron/Megatron-DeepSpeed/ + dataset_url: https://huggingface.co/datasets/suolyer/pile_bookcorpus2/raw/main/test.json + batch_size: 2048 + num_warmup: 0 + num_steps: 10 + precision: + - float16 + - bfloat16 + deepspeed: yes + sequence_parallel: yes + use_rotary_position_embeddings: yes gpt_models: <<: *default_pytorch_mode models: - gpt2-small + - gpt2-large parameters: <<: *common_model_config - batch_size: 1 bert_models: <<: *default_pytorch_mode models: - bert-base + - bert-large parameters: <<: *common_model_config - batch_size: 2 lstm_models: <<: *default_pytorch_mode models: - lstm parameters: <<: *common_model_config - batch_size: 32 resnet_models: <<: *default_pytorch_mode models: @@ -57,7 +268,6 @@ superbench: - resnet152 parameters: <<: *common_model_config - batch_size: 32 densenet_models: <<: *default_pytorch_mode models: @@ -65,7 +275,6 @@ superbench: - densenet201 parameters: <<: *common_model_config - batch_size: 32 vgg_models: <<: *default_pytorch_mode models: @@ -74,5 +283,4 @@ superbench: - vgg16 - vgg19 parameters: - <<: *common_model_config - batch_size: 32 + <<: *common_model_config \ No newline at end of file diff --git a/src/VirtualClient/VirtualClient.Actions/SuperBenchmark/initialize.sh b/src/VirtualClient/VirtualClient.Actions/SuperBenchmark/initialize.sh index 4e2d9262b1..18a7c4454c 100644 --- a/src/VirtualClient/VirtualClient.Actions/SuperBenchmark/initialize.sh +++ b/src/VirtualClient/VirtualClient.Actions/SuperBenchmark/initialize.sh @@ -1,13 +1,13 @@ +sudo apt update +sudo apt install -y python3-venv python3-full + # Ansible will use sudo which needs explicit password input. This command removes that step. -echo '$1 ALL=(ALL) NOPASSWD:ALL' | (sudo EDITOR='tee -a' visudo) +echo '$1 ALL=(ALL) NOPASSWD:ALL' | (sudo EDITOR='tee -a' visudo) # Remove any existing system-installed Ansible to avoid version conflicts sudo apt remove -y ansible || true sudo pip3 uninstall -y ansible ansible-base ansible-core || true -# Install ansible-core compatible with Python 3.8 (Ubuntu 20.04) -python3 -m pip install --user "ansible-core>=2.12,<2.14" - # Ensure the pip user-installed ansible is in PATH and takes precedence export PATH=/home/$1/.local/bin:$PATH @@ -33,17 +33,17 @@ if [[ -n "${2:-}" ]]; then # Start Docker back up sudo systemctl start docker - # (Optional) Warm-up/check NVIDIA devices as you had in the commented section - # sudo docker run --rm --gpus all nvidia/cuda:11.0.3-base nvidia-smi else echo "No second argument provided; skipping Docker data-root configuration." fi -# Command to install sb dependencies. -python3 -m pip install . +# Clean up any broken previous attempts +rm -rf ./venv +# create a new virtual environment +python3 -m venv ./venv +# activate the virtual environment +source ./venv/bin/activate -# Command to build sb. -make postinstall - -# This command initiates /dev/nvidiactl and /dev/nvidia-uvm directories, which sb checks before running. -sudo docker run --rm --gpus all nvidia/cuda:11.0.3-base nvidia-smi \ No newline at end of file +# Commands to build sb. +python3 -m pip install . +make postinstall \ No newline at end of file diff --git a/src/VirtualClient/VirtualClient.Main/profiles/PERF-GPU-SUPERBENCH.json b/src/VirtualClient/VirtualClient.Main/profiles/PERF-GPU-SUPERBENCH.json index 9d43ae6774..05ef279102 100644 --- a/src/VirtualClient/VirtualClient.Main/profiles/PERF-GPU-SUPERBENCH.json +++ b/src/VirtualClient/VirtualClient.Main/profiles/PERF-GPU-SUPERBENCH.json @@ -3,7 +3,7 @@ "MinimumExecutionInterval": "00:01:00", "Metadata": { "RecommendedMinimumExecutionTime": "08:00:00", - "SupportedPlatforms": "linux-x64", + "SupportedPlatforms": "linux-x64,linux-arm64", "SupportedOperatingSystems": "Ubuntu", "PreliminaryRequirements": "Please note that the GPU hardware driver and related toolsets (e.g. Nvidia CUDA) MUST be already installed on the system for the workloads to function correctly." }, @@ -18,10 +18,10 @@ "Parameters": { "Scenario": "Models", "Username": "$.Parameters.Username", - "Version": "0.9.0", + "Version": "0.12.0", "DockerContainerPath": "$.Parameters.DockerContainerPath", "ConfigurationFile": "$.Parameters.ConfigurationFile", - "ContainerVersion": "superbench/superbench:v0.9.0-cuda12.1" + "ContainerVersion": "superbench/superbench:v0.12.0-cuda12.9" } } ] diff --git a/src/VirtualClient/VirtualClient.Main/profiles/SETUP-GPU-NVIDIA-GB200.json b/src/VirtualClient/VirtualClient.Main/profiles/SETUP-GPU-NVIDIA-GB200.json new file mode 100644 index 0000000000..164eb54ef8 --- /dev/null +++ b/src/VirtualClient/VirtualClient.Main/profiles/SETUP-GPU-NVIDIA-GB200.json @@ -0,0 +1,53 @@ +{ + "Description": "NVIDIA GB200 GPU Driver Installation Dependency", + "Metadata": { + "RecommendedMinimumExecutionTime": "00:10:00", + "SupportedPlatforms": "linux-arm64", + "SupportedOperatingSystems": "Linux", + "SupportedLinuxGpuModel": "NVIDIA GB200", + "SupportedLinuxDistros": "Ubuntu24", + "SpecialRequirements": "This is an NVIDIA GPU Driver dependency. It can only be installed on the system having an NVIDIA GB200 GPU card/chip." + }, + "Parameters": { + "ConfigurationFile": "default.yaml", + "Username": "", + "LinuxCudaVersion": "13.1", + "LinuxDriverVersion": "590", + "LinuxLocalRunFile": "https://developer.download.nvidia.com/compute/cuda/13.1.1/local_installers/cuda_13.1.1_590.48.01_linux_sbsa.run" + }, + "Dependencies": [ + { + "Type": "DockerInstallation", + "Parameters": { + "Scenario": "InstallDocker" + } + }, + { + "Type": "NvidiaCudaInstallation", + "Parameters": { + "Scenario": "InstallNvidiaCuda", + "LinuxCudaVersion": "$.Parameters.LinuxCudaVersion", + "LinuxDriverVersion": "$.Parameters.LinuxDriverVersion", + "Username": "$.Parameters.Username", + "LinuxLocalRunFile": "$.Parameters.LinuxLocalRunFile" + } + }, + { + "Type": "NvidiaContainerToolkitInstallation", + "Parameters": { + "Scenario": "InstallNvidiaContainerToolkit" + } + }, + { + "Type": "LinuxPackageInstallation", + "Parameters": { + "Scenario": "InstallLinuxPackages", + "Packages": "sshpass,python3-pip", + "Packages-Apt": "nvidia-common", + "Packages-Dnf": "nvidia-driver", + "Packages-Yum": "nvidia-driver", + "Packages-Zypper": "" + } + } + ] +} \ No newline at end of file