Skip to content

Commit 5ecd4b0

Browse files
Merge branch 'main' into kmorabia/security-coding-guide
2 parents ff70cc7 + ba29ad7 commit 5ecd4b0

File tree

17 files changed, with 53 additions (+53) and 74 deletions (-74)

.github/workflows/example_tests.yml

Lines changed: 14 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ jobs:
6060
torch-pr:
6161
needs: [check-file-changes, wait-checks]
6262
if: startsWith(github.ref, 'refs/heads/pull-request/') && needs.check-file-changes.outputs.any_changed == 'true'
63-
strategy:
63+
strategy: &torch_strategy
6464
fail-fast: false
6565
matrix:
6666
example: [llm_distill, llm_qat, llm_sparsity]
@@ -72,25 +72,21 @@ jobs:
7272
with:
7373
docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.01' }}-py3"
7474
example: ${{ matrix.example }}
75+
timeout_minutes: 30
7576
pip_install_extras: "[hf,dev-test]"
76-
runner: linux-amd64-gpu-l4-latest-1
77+
runner: linux-amd64-gpu-h100-latest-1
7778

7879
torch-non-pr:
7980
if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
80-
strategy:
81-
fail-fast: false
82-
matrix:
83-
example: [llm_distill, llm_qat, llm_sparsity]
84-
include:
85-
- example: speculative_decoding
86-
docker_image: "26.01"
81+
strategy: *torch_strategy
8782
uses: ./.github/workflows/_example_tests_runner.yml
8883
secrets: inherit
8984
with:
9085
docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.01' }}-py3"
9186
example: ${{ matrix.example }}
87+
timeout_minutes: 30
9288
pip_install_extras: "[hf,dev-test]"
93-
runner: linux-amd64-gpu-h100-latest-2
89+
runner: linux-amd64-gpu-rtxpro6000-latest-2
9490

9591
##### TensorRT-LLM Example Tests #####
9692
trtllm-pr:
@@ -99,14 +95,14 @@ jobs:
9995
strategy:
10096
fail-fast: false
10197
matrix:
102-
example: [llm_ptq] # vlm_ptq temporarily disabled due to pipeline error
98+
example: [llm_ptq, vlm_ptq]
10399
uses: ./.github/workflows/_example_tests_runner.yml
104100
secrets: inherit
105101
with:
106-
docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6.post3"
102+
docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc5"
107103
example: ${{ matrix.example }}
108104
pip_install_extras: "[hf,dev-test]"
109-
runner: linux-amd64-gpu-h100-latest-1
105+
runner: linux-amd64-gpu-rtxpro6000-latest-1
110106

111107
trtllm-non-pr:
112108
if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
@@ -117,16 +113,16 @@ jobs:
117113
uses: ./.github/workflows/_example_tests_runner.yml
118114
secrets: inherit
119115
with:
120-
docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6.post3"
116+
docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc5"
121117
example: ${{ matrix.example }}
122118
pip_install_extras: "[hf,dev-test]"
123-
runner: linux-amd64-gpu-h100-latest-2
119+
runner: linux-amd64-gpu-rtxpro6000-latest-2
124120

125121
##### ONNX/TensorRT Example Tests #####
126122
onnx-pr:
127123
needs: [check-file-changes, wait-checks]
128124
if: startsWith(github.ref, 'refs/heads/pull-request/') && needs.check-file-changes.outputs.any_changed == 'true'
129-
strategy:
125+
strategy: &onnx_strategy
130126
fail-fast: false
131127
matrix:
132128
example: [diffusers, torch_onnx]
@@ -140,17 +136,14 @@ jobs:
140136

141137
onnx-non-pr:
142138
if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
143-
strategy:
144-
fail-fast: false
145-
matrix:
146-
example: [diffusers, torch_onnx]
139+
strategy: *onnx_strategy
147140
uses: ./.github/workflows/_example_tests_runner.yml
148141
secrets: inherit
149142
with:
150143
docker_image: "nvcr.io/nvidia/tensorrt:26.01-py3"
151144
example: ${{ matrix.example }}
152145
pip_install_extras: "[all,dev-test]"
153-
runner: linux-amd64-gpu-l4-latest-1
146+
runner: linux-amd64-gpu-rtxpro6000-latest-2
154147

155148
##### Required Check for PR #####
156149
example-pr-required-check:

.github/workflows/gpu_tests.yml

Lines changed: 14 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -59,18 +59,23 @@ jobs:
5959
gpu-tests-pr:
6060
needs: [check-file-changes, wait-checks]
6161
if: needs.check-file-changes.outputs.any_changed == 'true'
62-
strategy:
62+
strategy: &gpu_strategy
6363
fail-fast: false
6464
matrix:
6565
include:
66-
- example: cuda13-gpu
66+
- example: gpu
67+
timeout: 60
68+
container_image: pytorch:26.01-py3
69+
- example: gpu-megatron
6770
timeout: 90
68-
- example: cuda13-gpu-megatron
69-
timeout: 120
70-
runs-on: linux-amd64-gpu-l4-latest-1
71+
container_image: pytorch:26.01-py3
72+
- example: gpu-trtllm
73+
timeout: 30
74+
container_image: tensorrt-llm/release:1.3.0rc5
75+
runs-on: linux-amd64-gpu-rtxpro6000-latest-1
7176
timeout-minutes: ${{ matrix.timeout }}
7277
container: &gpu_container
73-
image: nvcr.io/nvidia/pytorch:26.01-py3
78+
image: nvcr.io/nvidia/${{ matrix.container_image }}
7479
env:
7580
GIT_DEPTH: 1000 # For correct version for tests/gpu/torch/quantization/plugins/test_megatron.py
7681
PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages
@@ -82,18 +87,11 @@ jobs:
8287
run: |
8388
echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu" >> $GITHUB_ENV
8489
- name: Run gpu tests
85-
run: pip install tox-current-env && tox -e ${{ matrix.example }} --current-env
90+
run: pip install tox-current-env && tox -e cuda13-${{ matrix.example }} --current-env
8691
gpu-tests-non-pr:
8792
if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
88-
strategy:
89-
fail-fast: false
90-
matrix:
91-
include:
92-
- example: cuda13-gpu
93-
timeout: 90
94-
- example: cuda13-gpu-megatron
95-
timeout: 120
96-
runs-on: linux-amd64-gpu-h100-latest-2
93+
strategy: *gpu_strategy
94+
runs-on: linux-amd64-gpu-rtxpro6000-latest-2
9795
timeout-minutes: ${{ matrix.timeout }}
9896
container: *gpu_container
9997
steps: *gpu_steps

tests/_test_utils/import_helper.py

Lines changed: 1 addition & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -61,30 +61,10 @@ def skip_if_no_libcudnn():
6161
pytest.skip(f"{e}!", allow_module_level=True)
6262

6363

64-
def skip_if_no_megatron(*, te_required: bool = True, mamba_required: bool = False):
65-
try:
66-
import megatron # noqa: F401
67-
except ImportError:
68-
pytest.skip("megatron not available", allow_module_level=True)
69-
70-
try:
71-
import transformer_engine # noqa: F401
72-
73-
has_te = True
74-
except ImportError:
75-
has_te = False
76-
64+
def skip_if_no_mamba():
7765
try:
7866
import mamba_ssm # noqa: F401
79-
80-
has_mamba = True
8167
except ImportError:
82-
has_mamba = False
83-
84-
if te_required and not has_te:
85-
pytest.skip("TE required for Megatron test", allow_module_level=True)
86-
87-
if mamba_required and not has_mamba:
8868
pytest.skip("Mamba required for Megatron test", allow_module_level=True)
8969

9070

tests/_test_utils/torch/megatron/models.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,8 @@
1818
import torch
1919
import torch.nn as nn
2020
import torch.nn.functional as F
21-
from _test_utils.import_helper import skip_if_no_megatron
22-
from huggingface_hub import constants as hf_constants
23-
24-
skip_if_no_megatron()
25-
2621
from _test_utils.torch.megatron.utils import initialize_for_megatron
22+
from huggingface_hub import constants as hf_constants
2723
from megatron.core.models.gpt import GPTModel
2824
from megatron.core.models.gpt.gpt_layer_specs import (
2925
get_gpt_layer_local_spec,

tests/_test_utils/torch/megatron/utils.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,6 @@
1818
from warnings import warn
1919

2020
import torch
21-
from _test_utils.import_helper import skip_if_no_megatron
22-
23-
skip_if_no_megatron()
24-
2521
from megatron.core import dist_checkpointing
2622
from megatron.core.inference.communication_utils import broadcast_from_last_pipeline_stage
2723
from megatron.core.inference.contexts import StaticInferenceContext

tests/gpu/onnx/quantization/autotune/test_workflow.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ def simple_conv_model():
3535
return _test_models._create_simple_conv_onnx_model()
3636

3737

38+
@pytest.mark.skip(reason="TODO: Fix test and enable")
3839
@pytest.mark.parametrize("use_trtexec", [True, False])
3940
def test_export_quantized_model(use_trtexec, simple_conv_model):
4041
"""Test exporting quantized model with Q/DQ."""

tests/gpu/torch/quantization/test_hadamard.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,14 @@
1818
import torch
1919
import torch.nn as nn
2020

21-
pytest.importorskip("fast_hadamard_transform")
21+
fast_hadamard_transform = pytest.importorskip("fast_hadamard_transform")
22+
23+
try:
24+
fast_hadamard_transform.hadamard_transform(torch.randn(1, 2, device="cuda"))
25+
except Exception:
26+
pytest.skip(
27+
"fast_hadamard_transform CUDA kernels not available for this GPU", allow_module_level=True
28+
)
2229

2330
from _test_utils.torch.quantization.models import SDPAAttention
2431

tests/gpu_megatron/_extensions

Lines changed: 0 additions & 1 deletion
This file was deleted.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../../gpu/_extensions/test_torch_extensions.py

tests/gpu_megatron/torch/nas/plugins/test_megatron_mamba_dynamic_modules.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,9 @@
1515

1616

1717
import torch
18-
from _test_utils.import_helper import skip_if_no_megatron
18+
from _test_utils.import_helper import skip_if_no_mamba
1919

20-
skip_if_no_megatron(mamba_required=True)
20+
skip_if_no_mamba()
2121

2222
from _test_utils.torch.distributed.utils import spawn_multiprocess_job
2323
from _test_utils.torch.megatron.models import get_mcore_mamba_hybrid_model

0 commit comments