Skip to content

Commit 5b8b2ea

Browse files
apboselanluo-nvidia
authored and committed
fix use of inputShapeTensorValues after freeing, which led to garbage memory being read in the C++ runtime; add test cases
1 parent 1950445 commit 5b8b2ea

File tree

3 files changed

+231
-12
lines changed

3 files changed

+231
-12
lines changed

core/runtime/execute_engine.cpp

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -96,9 +96,8 @@ void setup_input_tensors(
9696
std::vector<at::Tensor> inputs,
9797
c10::intrusive_ptr<TRTEngine> compiled_engine,
9898
bool cudagraphs_enabled,
99-
bool need_cudagraphs_record) {
100-
// this is a buffer to store shape tensor input addresses throughout the runtime scope
101-
std::list<std::vector<int64_t>> inputShapeTensorValues;
99+
bool need_cudagraphs_record,
100+
std::list<std::vector<int64_t>>& inputShapeTensorValues) {
102101
std::list<at::Tensor> formatted_inputs(compiled_engine->num_io.first);
103102

104103
for (size_t i = 0; i < inputs.size(); i++) {
@@ -115,12 +114,10 @@ void setup_input_tensors(
115114

116115
auto dims = core::util::toDims(inputs[i].sizes());
117116
auto shape = core::util::toVec(dims);
118-
LOG_DEBUG("Input Name: " << name << " Shape: " << dims);
117+
bool is_shape_tensor = compiled_engine->cuda_engine->isShapeInferenceIO(name.c_str());
118+
LOG_DEBUG("Input Name: " << name << " Shape: " << dims << " isShapeInferenceIO: " << is_shape_tensor);
119119

120-
if (compiled_engine->cuda_engine->isShapeInferenceIO(name.c_str())) {
121-
// Shape tensor inputs are casted to int64 explicitly.
122-
// Refer to
123-
// https://github.com/NVIDIA/TensorRT/blob/d2f4ef789a9a6ffdf37b55c3f81b486225f6b380/samples/common/sampleInference.cpp#L435
120+
if (is_shape_tensor) {
124121
auto input_cpu = inputs[i].clone().contiguous().cpu().to(torch::kInt64);
125122
std::vector<int64_t> inputs_cpu_vec(
126123
input_cpu.data_ptr<int64_t>(), input_cpu.data_ptr<int64_t>() + input_cpu.numel());
@@ -233,6 +230,9 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
233230

234231
std::vector<at::Tensor> outputs(compiled_engine->num_io.second);
235232

233+
// Shape tensor CPU buffers must outlive inferShapes() and enqueueV3()
234+
std::list<std::vector<int64_t>> inputShapeTensorValues;
235+
236236
// Intialize inputs and outputs to be available throughout the succeeding scopes
237237
{ // Input Setup
238238
std::unique_ptr<torch::autograd::profiler::RecordProfile> input_profiler_guard;
@@ -241,7 +241,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
241241
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path);
242242
}
243243

244-
setup_input_tensors(inputs, compiled_engine, cudagraphs_enabled, need_cudagraphs_record);
244+
setup_input_tensors(inputs, compiled_engine, cudagraphs_enabled, need_cudagraphs_record, inputShapeTensorValues);
245245
// Check if input shapes can be inferred.
246246
int32_t const io_size{compiled_engine->cuda_engine->getNbIOTensors()};
247247
std::vector<char const*> names(io_size);
@@ -364,14 +364,17 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
364364
};
365365

366366
auto run_output_allocator = [&]() {
367+
// Shape tensor CPU buffers must outlive inferShapes() and enqueueV3()
368+
std::list<std::vector<int64_t>> inputShapeTensorValues;
369+
367370
{ // Input Setup
368371
std::unique_ptr<torch::autograd::profiler::RecordProfile> input_profiler_guard;
369372
if (compiled_engine->profile_execution) {
370373
input_profiler_guard =
371374
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path);
372375
}
373376

374-
setup_input_tensors(inputs, compiled_engine, false, false);
377+
setup_input_tensors(inputs, compiled_engine, false, false, inputShapeTensorValues);
375378
// Check if input shapes can be inferred.
376379
int32_t const io_size{compiled_engine->cuda_engine->getNbIOTensors()};
377380
std::vector<char const*> names(io_size);

py/torch_tensorrt/dynamo/conversion/_symbolic_shape_capture.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,10 +69,16 @@ def extract_symbolic_shape_expressions(
6969
}
7070
)
7171
elif isinstance(input_val, (torch.SymInt, torch.SymFloat, int, float, bool)):
72+
if isinstance(input_val, (torch.SymInt, int)):
73+
scalar_dtype = torch.int64
74+
elif isinstance(input_val, (torch.SymFloat, float)):
75+
scalar_dtype = torch.float64
76+
else:
77+
scalar_dtype = torch.bool
7278
input_info.append(
7379
{
7480
"shape_exprs": [],
75-
"dtype": None,
81+
"dtype": scalar_dtype,
7682
"name": input_node.name,
7783
"is_scalar": True,
7884
}
@@ -113,10 +119,16 @@ def extract_symbolic_shape_expressions(
113119
}
114120
)
115121
elif isinstance(out_val, (torch.SymInt, torch.SymFloat, int, float, bool)):
122+
if isinstance(out_val, (torch.SymInt, int)):
123+
scalar_dtype = torch.int64
124+
elif isinstance(out_val, (torch.SymFloat, float)):
125+
scalar_dtype = torch.float64
126+
else:
127+
scalar_dtype = torch.bool
116128
output_info.append(
117129
{
118130
"shape_exprs": [],
119-
"dtype": None,
131+
"dtype": scalar_dtype,
120132
"is_scalar": True,
121133
}
122134
)
Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
"""
2+
Tests for SymInt scalar input handling in symbolic shape capture and TRT compilation.
3+
4+
These tests verify that when Dynamo partitions an FX graph such that a SymInt
5+
(e.g., from targets.size(0)) becomes a bare scalar placeholder input to the TRT
6+
subgraph, the symbolic shape extraction and compilation succeed.
7+
8+
This covers the fix in _symbolic_shape_capture.py where non-tensor inputs
9+
(SymInt, int, float, bool) are handled gracefully instead of aborting extraction.
10+
"""
11+
12+
import unittest
13+
14+
import pytest
15+
import torch
16+
import torch_tensorrt as torchtrt
17+
from torch_tensorrt.dynamo.utils import COSINE_THRESHOLD, cosine_similarity
18+
19+
assertions = unittest.TestCase()
20+
21+
22+
@pytest.mark.unit
@pytest.mark.parametrize("use_python_runtime", [True, False])
def test_symint_from_size_used_in_reshape(use_python_runtime):
    """
    Verify that a SymInt obtained via tensor.size(0) works when it becomes a
    bare scalar placeholder input to the TRT subgraph and feeds a reshape.

    This exercises the core pattern from issue #4107: targets.size(0) yields
    a SymInt that Dynamo forwards as a scalar input into the TRT partition,
    where it parameterizes a reshape operation.
    """

    class Model(torch.nn.Module):
        def forward(self, x, targets):
            batch = targets.size(0)
            return x.reshape(batch, -1)

    model = Model().eval().cuda()

    feats = torch.randn(16, 64).cuda()
    labels = torch.randint(0, 10, (16, 1), dtype=torch.int64).cuda()

    # Mark the batch dimension dynamic on both inputs so size(0) is symbolic.
    for tensor in (feats, labels):
        torch._dynamo.mark_dynamic(tensor, 0, min=1, max=2048)

    trt_model = torch.compile(
        model,
        backend="tensorrt",
        options={
            "enabled_precisions": {torch.float},
            "min_block_size": 1,
            "pass_through_build_failures": True,
            "use_python_runtime": use_python_runtime,
        },
    )

    output_ref = model(feats, labels)
    output_trt = trt_model(feats, labels)

    cos_sim = cosine_similarity(output_ref, output_trt)
    assertions.assertTrue(
        cos_sim > COSINE_THRESHOLD,
        msg=f"SymInt reshape test (python_runtime={use_python_runtime}) failed. Cosine sim: {cos_sim}",
    )

    torch._dynamo.reset()
67+
68+
69+
@pytest.mark.unit
@pytest.mark.parametrize("use_python_runtime", [True, False])
def test_scalar_tensor_input(use_python_runtime):
    """
    Check that a 0-dim scalar tensor input (e.g. a cache_length value) is
    handled correctly by symbolic shape extraction and TRT compilation.
    """

    class Model(torch.nn.Module):
        def forward(self, x, offset):
            return x + offset

    model = Model().eval().cuda()

    data = torch.randn(16, 64).cuda()
    # 0-dim (scalar) tensor input — the case under test.
    shift = torch.tensor(5.0).cuda()

    trt_model = torch.compile(
        model,
        backend="tensorrt",
        options={
            "enabled_precisions": {torch.float},
            "min_block_size": 1,
            "pass_through_build_failures": True,
            "use_python_runtime": use_python_runtime,
        },
    )

    output_ref = model(data, shift)
    output_trt = trt_model(data, shift)

    cos_sim = cosine_similarity(output_ref, output_trt)
    assertions.assertTrue(
        cos_sim > COSINE_THRESHOLD,
        msg=f"Scalar tensor input test (python_runtime={use_python_runtime}) failed. Cosine sim: {cos_sim}",
    )

    torch._dynamo.reset()
105+
106+
107+
@pytest.mark.unit
@pytest.mark.parametrize("use_python_runtime", [True, False])
def test_symint_with_index_and_reshape(use_python_runtime):
    """
    Full reproduction of the issue #4107 pattern: a symbolic size taken from
    an int64 tensor, combined with an index operation and a reshape.

    The module performs:
      1. B = targets.size(0)              -> SymInt
      2. idx = cache_length + arange(1)   -> int64 index tensor
      3. y = x[:, idx, :]                 -> gather with int64 index
      4. z = y.reshape(B, 1, -1, 2)       -> reshape using the SymInt
    """

    class Repro(torch.nn.Module):
        def forward(self, x, targets, cache_length):
            batch = targets.size(0)
            gather_idx = cache_length + torch.arange(1, device=x.device)
            gathered = x[:, gather_idx, :]
            return gathered.reshape(batch, 1, -1, 2)

    model = Repro().eval().cuda()

    batch, seq, dim = 16, 128, 1024
    x = torch.randn(batch, seq, dim).cuda()
    targets = torch.randint(0, 10, (batch, 1), dtype=torch.int64).cuda()
    cache_length = torch.tensor(0, dtype=torch.int64).cuda()

    # Batch dimensions are dynamic so targets.size(0) becomes symbolic.
    torch._dynamo.mark_dynamic(targets, 0, min=1, max=2048)
    torch._dynamo.mark_dynamic(x, 0, min=1, max=2048)

    trt_model = torch.compile(
        model,
        backend="tensorrt",
        options={
            "enabled_precisions": {torch.float, torch.half},
            "min_block_size": 1,
            "truncate_double": True,
            "pass_through_build_failures": True,
            "use_python_runtime": use_python_runtime,
        },
    )

    output_ref = model(x, targets, cache_length)
    output_trt = trt_model(x, targets, cache_length)

    cos_sim = cosine_similarity(output_ref, output_trt)
    assertions.assertTrue(
        cos_sim > COSINE_THRESHOLD,
        msg=f"Issue 4107 repro test (python_runtime={use_python_runtime}) failed. Cosine sim: {cos_sim}",
    )

    torch._dynamo.reset()
159+
160+
161+
@pytest.mark.unit
@pytest.mark.parametrize("use_python_runtime", [True, False])
def test_symint_with_different_batch_sizes(use_python_runtime):
    """
    After compiling with a SymInt scalar input, the compiled module must
    keep producing correct results across several batch sizes.
    """

    class Model(torch.nn.Module):
        def forward(self, x, targets):
            batch = targets.size(0)
            return x.reshape(batch, 2, -1)

    model = Model().eval().cuda()

    x = torch.randn(8, 64).cuda()
    targets = torch.randint(0, 10, (8, 1), dtype=torch.int64).cuda()

    torch._dynamo.mark_dynamic(x, 0, min=1, max=2048)
    torch._dynamo.mark_dynamic(targets, 0, min=1, max=2048)

    trt_model = torch.compile(
        model,
        backend="tensorrt",
        options={
            "enabled_precisions": {torch.float},
            "min_block_size": 1,
            "pass_through_build_failures": True,
            "use_python_runtime": use_python_runtime,
        },
    )

    # Re-run the compiled module at several batch sizes to exercise the
    # dynamic dimension end to end.
    for batch_size in (4, 8, 16):
        sample = torch.randn(batch_size, 64).cuda()
        sample_targets = torch.randint(0, 10, (batch_size, 1), dtype=torch.int64).cuda()

        output_ref = model(sample, sample_targets)
        output_trt = trt_model(sample, sample_targets)

        cos_sim = cosine_similarity(output_ref, output_trt)
        assertions.assertTrue(
            cos_sim > COSINE_THRESHOLD,
            msg=f"Varying batch size test (python_runtime={use_python_runtime}) failed at B={batch_size}. Cosine sim: {cos_sim}",
        )

    torch._dynamo.reset()

0 commit comments

Comments
 (0)