NVIDIA · yaoyaoding · Apr 4, 2026 · Apr 4, 2026 · Apr 4, 2026 · Apr 4, 2026
diff --git a/.claude/skills/ncu-report/SKILL.md b/.claude/skills/ncu-report/SKILL.md
@@ -14,7 +14,7 @@ user-invocable: true
 
 This skill handles two modes:
 
-1. **Analyze an existing report**: The user provides a path to an `.ncu-rep` file (or one exists under `examples/`). Use the `ncu` CLI to extract and present metrics. All `ncu` commands MUST use `TMPDIR=/tmp/ncu_tmp` prefix to avoid temp file errors.
+1. **Analyze an existing report**: The user provides a path to an `.ncu-rep` file (or one exists under `examples/`). Use the `ncu` CLI to extract and present metrics.
 
 2. **Generate a new report**: The user specifies a script or kernel to profile but does NOT provide a `.ncu-rep` file. In this case, set up profiling using `tilus.utils.ncu_utils.ncu_run()`, run it, then analyze the resulting report.
 
@@ -84,10 +84,10 @@ Run these in parallel:
 
 ```bash
 # List all kernels with timing
-TMPDIR=/tmp/ncu_tmp ncu -i <REPORT> --page raw --csv --metrics gpu__time_duration.sum 2>&1
+ncu -i <REPORT> --page raw --csv --metrics gpu__time_duration.sum 2>&1
 
 # Session/device info
-TMPDIR=/tmp/ncu_tmp ncu -i <REPORT> --page session --csv 2>&1
+ncu -i <REPORT> --page session --csv 2>&1
 ```
 
 Present a summary table:
@@ -97,7 +97,7 @@ Present a summary table:
 ### Step 2: Speed of Light — Top-level throughput
 
 ```bash
-TMPDIR=/tmp/ncu_tmp ncu -i <REPORT> --page details --csv --section SpeedOfLight 2>&1
+ncu -i <REPORT> --page details --csv --section SpeedOfLight 2>&1
 ```
 
 Key metrics to highlight per kernel:
@@ -113,10 +113,10 @@ Key metrics to highlight per kernel:
 
 ```bash
 # Compute workload
-TMPDIR=/tmp/ncu_tmp ncu -i <REPORT> --page details --csv --section ComputeWorkloadAnalysis 2>&1
+ncu -i <REPORT> --page details --csv --section ComputeWorkloadAnalysis 2>&1
 
 # Memory workload
-TMPDIR=/tmp/ncu_tmp ncu -i <REPORT> --page details --csv --section MemoryWorkloadAnalysis 2>&1
+ncu -i <REPORT> --page details --csv --section MemoryWorkloadAnalysis 2>&1
 ```
 
 Key compute metrics: Executed IPC Active, SM Busy %, Issue Slots Busy %
@@ -125,7 +125,7 @@ Key memory metrics: Mem Busy %, Max Bandwidth %, L1/L2 hit rates
 ### Step 4: Occupancy
 
 ```bash
-TMPDIR=/tmp/ncu_tmp ncu -i <REPORT> --page details --csv --section Occupancy 2>&1
+ncu -i <REPORT> --page details --csv --section Occupancy 2>&1
 ```
 
 Report: Theoretical Occupancy, Achieved Occupancy, and limiters (registers, shared memory, block size).
@@ -134,24 +134,24 @@ Report: Theoretical Occupancy, Achieved Occupancy, and limiters (registers, shar
 
 To extract specific raw metrics:
 ```bash
-TMPDIR=/tmp/ncu_tmp ncu -i <REPORT> --page raw --csv --metrics <metric1>,<metric2>,... 2>&1
+ncu -i <REPORT> --page raw --csv --metrics <metric1>,<metric2>,... 2>&1
 ```
 
 To filter by kernel:
 ```bash
-TMPDIR=/tmp/ncu_tmp ncu -i <REPORT> --page raw --csv --metrics <metrics> --kernel-name regex:<pattern> 2>&1
+ncu -i <REPORT> --page raw --csv --metrics <metrics> --kernel-name regex:<pattern> 2>&1
 ```
 
 ### Step 6: Source-level analysis (on demand)
 
 SASS-only (default, always available):
 ```bash
-TMPDIR=/tmp/ncu_tmp ncu -i <REPORT> --page source --csv --kernel-name regex:<pattern> 2>&1
+ncu -i <REPORT> --page source --csv --kernel-name regex:<pattern> 2>&1
 ```
 
 CUDA source correlated with SASS (requires `--import-source yes` during profiling):
 ```bash
-TMPDIR=/tmp/ncu_tmp ncu -i <REPORT> --page source --csv --print-source cuda,sass --kernel-name regex:<pattern> 2>&1
+ncu -i <REPORT> --page source --csv --print-source cuda,sass --kernel-name regex:<pattern> 2>&1
 ```
 
 Source output columns include per-instruction: Warp Stall Sampling, Instructions Executed, Thread Instructions Executed, stall reasons (stall_barrier, stall_math, stall_wait, etc.), shared memory conflicts, and more.
@@ -160,12 +160,12 @@ Source output columns include per-instruction: Warp Stall Sampling, Instructions
 
 Rules are included in the details page output. Look for non-empty "Rule Name" column entries.
 ```bash
-TMPDIR=/tmp/ncu_tmp ncu -i <REPORT> --page details --csv --print-rule-details 2>&1 | grep -v '^"[0-9]' | head -5  # header
+ncu -i <REPORT> --page details --csv --print-rule-details 2>&1 | grep -v '^"[0-9]' | head -5  # header
 ```
 
 To see all rule results with descriptions:
 ```bash
-TMPDIR=/tmp/ncu_tmp ncu -i <REPORT> --page details --csv --print-rule-details 2>&1
+ncu -i <REPORT> --page details --csv --print-rule-details 2>&1
 ```
 Filter for rows where column 17 (Rule Name) is non-empty.
 

diff --git a/CLAUDE.md b/CLAUDE.md
@@ -70,7 +70,7 @@ Call `tilus.option.debug.dump_ir()` before running the kernel. The IR after each
 ```python
     ...
     self.store_shared(s_c, ...)
-    self.fence.async_view(space="shared")   # fence.proxy.async.shared::cta
+    self.fence.proxy_async(space="shared")   # fence.proxy.async.shared::cta
     self.sync()
     with self.single_thread():
         self.tma.shared_to_global(s_c, g_c, ...)

diff --git a/examples/blackwell_matmul/matmul_v5.py b/examples/blackwell_matmul/matmul_v5.py
@@ -233,7 +233,7 @@ def __call__(
             r_acc = self.tcgen05.load(t_acc)
             self.tcgen05.wait_load()
             self.store_shared(s_c, r_acc.to(float16))
-            self.fence.async_view()
+            self.fence.proxy_async()
             self.sync()
             with self.single_thread():
                 self.tma.shared_to_global(

diff --git a/examples/blackwell_matmul/matmul_v6.py b/examples/blackwell_matmul/matmul_v6.py
@@ -278,7 +278,7 @@ def __call__(
             r_acc = self.tcgen05.load(t_acc)
             self.tcgen05.wait_load()
             self.store_shared(s_c, r_acc.to(float16))
-            self.fence.async_view()
+            self.fence.proxy_async()
             self.sync()
             with self.single_thread():
                 self.tma.shared_to_global(

diff --git a/examples/blackwell_matmul/matmul_v7.py b/examples/blackwell_matmul/matmul_v7.py
@@ -95,7 +95,7 @@ def query_clc_response(self, s_clc_response: SharedTensor, pipe: Pipeline):
         pipe.consumer_acquire(scope="cluster")
         response = s_clc_response[pipe.consumer_stage]
         is_valid, new_blockIdx = self.clc.query_response(response)
-        self.fence.async_view(space="shared")
+        self.fence.proxy_async(space="shared")
         self.mbarrier.arrive_and_expect_tx_remote(
             pipe.consumer_barrier(), transaction_bytes=0, target_rank=0
         )
@@ -278,7 +278,7 @@ def __call__(
                     r_acc = self.tcgen05.load(t_acc_slice)
                     self.tcgen05.wait_load()
                     self.store_shared(s_c, r_acc.to(float16))
-                    self.fence.async_view(space="shared")
+                    self.fence.proxy_async(space="shared")
                     self.sync()
                     with self.single_thread():
                         self.tma.shared_to_global(

diff --git a/examples/blackwell_matmul/matmul_v8.py b/examples/blackwell_matmul/matmul_v8.py
@@ -31,11 +31,12 @@ def __init__(
         self.producer_phase: uint32 = self.mbarrier.producer_initial_phase
         self.consumer_phase: uint32 = self.mbarrier.consumer_initial_phase
 
-    def producer_acquire(self, scope: str = "cta"):
+    def producer_acquire(self):
         self.mbarrier.wait(
             barrier=self.empty_barriers[self.producer_stage],
             phase=self.producer_phase,
-            scope=scope,
+            sem="relaxed",
+            scope="cta",
         )
 
     def producer_barrier(self) -> RegisterTensor:
@@ -45,11 +46,12 @@ def producer_advance(self):
         self.producer_stage = (self.producer_stage + 1) % self.num_stages
         self.producer_phase = self.producer_phase ^ (self.producer_stage == 0)
 
-    def consumer_acquire(self, scope: str = "cta"):
+    def consumer_acquire(self):
         self.mbarrier.wait(
             barrier=self.full_barriers[self.consumer_stage],
             phase=self.consumer_phase,
-            scope=scope,
+            sem="relaxed",
+            scope="cta",
         )
 
     def consumer_barrier(self) -> RegisterTensor:
@@ -67,15 +69,15 @@ def consumer_advance(self):
 @tilus.autotune("mma_stages", [2])
 @tilus.autotune("swizzle_size", [4, 8, 16])
 class BlackwellMatmulV8(tilus.Script):
-    debug_schedule = dict(
-        block_m=256,
-        block_n=256,
-        block_k=64,
-        tma_stages=6,
-        mma_stages=2,
-        e_block_n=32,
-        swizzle_size=8,
-    )
+    # debug_schedule = dict(
+    #     block_m=256,
+    #     block_n=256,
+    #     block_k=64,
+    #     tma_stages=6,
+    #     mma_stages=2,
+    #     e_block_n=32,
+    #     swizzle_size=8,
+    # )
 
     def __init__(
         self,
@@ -113,10 +115,10 @@ def compute_block_coord(
         return m_block, n_block
 
     def query_clc_response(self, s_clc_response: SharedTensor, pipe: Pipeline):
-        pipe.consumer_acquire(scope="cluster")
+        pipe.consumer_acquire()
         response = s_clc_response[pipe.consumer_stage]
         is_valid, new_blockIdx = self.clc.query_response(response)
-        self.fence.async_view(space="shared")
+        self.fence.proxy_async(space="shared")
         self.mbarrier.arrive_and_expect_tx_remote(
             pipe.consumer_barrier(), transaction_bytes=0, target_rank=0
         )
@@ -270,9 +272,7 @@ def __call__(
         with self.single_warp(2):  # scheduler
             while True:
                 if cta_rank == 0:
-                    clc_pipe.producer_acquire(
-                        scope="cluster"
-                    )  # peer cta will arrive this barrier, need 'cluster'scoped acquire
+                    clc_pipe.producer_acquire()
                     self.mbarrier.arrive_and_expect_tx_multicast(
                         clc_pipe.producer_barrier(),
                         transaction_bytes=16,
@@ -310,7 +310,7 @@ def __call__(
                     r_acc = self.tcgen05.load(t_acc_slice)
                     self.tcgen05.wait_load()
                     self.store_shared(s_c, r_acc.to(float16))
-                    self.fence.async_view(space="shared")
+                    self.fence.proxy_async(space="shared")
                     self.sync()
                     with self.single_thread():
                         self.tma.shared_to_global(
@@ -320,7 +320,7 @@ def __call__(
                             dims=[0, 1],
                         )
                         self.tma.commit_group()
-                        self.tma.wait_group(n=0)
+                        self.tma.wait_group(n=0, read=True)
                     self.sync()
 
                 self.mbarrier.arrive(mma_pipe.consumer_barrier())
@@ -372,7 +372,7 @@ def main(bench=True):
                 ("torch", lambda: torch.matmul(a, b.T, out=c_expected)),
                 ("tilus", lambda: matmul(m_size, n_size, k_size, a, b, c_actual)),
             ]:
-                latency = benchmark_func(func, warmup=5, repeat=20)
+                latency = benchmark_func(func, warmup=5, repeat=200)
                 tflops = 2 * m_size * n_size * k_size / latency * 1e-9
                 rows.append([m_size, n_size, k_size, name, latency, tflops])
 
@@ -384,4 +384,4 @@ def main(bench=True):
 if __name__ == "__main__":
     main(bench=True)
     # main(bench=False)
-    # ncu_run(main, bench=False, kernel_regex="hidet|nvjet")
+    # ncu_run(main, bench=False, kernel_regex="tilus|nvjet")