Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 13 additions & 13 deletions .claude/skills/ncu-report/SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ user-invocable: true

This skill handles two modes:

1. **Analyze an existing report**: The user provides a path to an `.ncu-rep` file (or one exists under `examples/`). Use the `ncu` CLI to extract and present metrics. All `ncu` commands MUST use `TMPDIR=/tmp/ncu_tmp` prefix to avoid temp file errors.
1. **Analyze an existing report**: The user provides a path to an `.ncu-rep` file (or one exists under `examples/`). Use the `ncu` CLI to extract and present metrics.

2. **Generate a new report**: The user specifies a script or kernel to profile but does NOT provide a `.ncu-rep` file. In this case, set up profiling using `tilus.utils.ncu_utils.ncu_run()`, run it, then analyze the resulting report.

Expand Down Expand Up @@ -84,10 +84,10 @@ Run these in parallel:

```bash
# List all kernels with timing
TMPDIR=/tmp/ncu_tmp ncu -i <REPORT> --page raw --csv --metrics gpu__time_duration.sum 2>&1
ncu -i <REPORT> --page raw --csv --metrics gpu__time_duration.sum 2>&1

# Session/device info
TMPDIR=/tmp/ncu_tmp ncu -i <REPORT> --page session --csv 2>&1
ncu -i <REPORT> --page session --csv 2>&1
```

Present a summary table:
Expand All @@ -97,7 +97,7 @@ Present a summary table:
### Step 2: Speed of Light — Top-level throughput

```bash
TMPDIR=/tmp/ncu_tmp ncu -i <REPORT> --page details --csv --section SpeedOfLight 2>&1
ncu -i <REPORT> --page details --csv --section SpeedOfLight 2>&1
```

Key metrics to highlight per kernel:
Expand All @@ -113,10 +113,10 @@ Key metrics to highlight per kernel:

```bash
# Compute workload
TMPDIR=/tmp/ncu_tmp ncu -i <REPORT> --page details --csv --section ComputeWorkloadAnalysis 2>&1
ncu -i <REPORT> --page details --csv --section ComputeWorkloadAnalysis 2>&1

# Memory workload
TMPDIR=/tmp/ncu_tmp ncu -i <REPORT> --page details --csv --section MemoryWorkloadAnalysis 2>&1
ncu -i <REPORT> --page details --csv --section MemoryWorkloadAnalysis 2>&1
```

Key compute metrics: Executed IPC Active, SM Busy %, Issue Slots Busy %
Expand All @@ -125,7 +125,7 @@ Key memory metrics: Mem Busy %, Max Bandwidth %, L1/L2 hit rates
### Step 4: Occupancy

```bash
TMPDIR=/tmp/ncu_tmp ncu -i <REPORT> --page details --csv --section Occupancy 2>&1
ncu -i <REPORT> --page details --csv --section Occupancy 2>&1
```

Report: Theoretical Occupancy, Achieved Occupancy, and limiters (registers, shared memory, block size).
Expand All @@ -134,24 +134,24 @@ Report: Theoretical Occupancy, Achieved Occupancy, and limiters (registers, shar

To extract specific raw metrics:
```bash
TMPDIR=/tmp/ncu_tmp ncu -i <REPORT> --page raw --csv --metrics <metric1>,<metric2>,... 2>&1
ncu -i <REPORT> --page raw --csv --metrics <metric1>,<metric2>,... 2>&1
```

To filter by kernel:
```bash
TMPDIR=/tmp/ncu_tmp ncu -i <REPORT> --page raw --csv --metrics <metrics> --kernel-name regex:<pattern> 2>&1
ncu -i <REPORT> --page raw --csv --metrics <metrics> --kernel-name regex:<pattern> 2>&1
```

### Step 6: Source-level analysis (on demand)

SASS-only (default, always available):
```bash
TMPDIR=/tmp/ncu_tmp ncu -i <REPORT> --page source --csv --kernel-name regex:<pattern> 2>&1
ncu -i <REPORT> --page source --csv --kernel-name regex:<pattern> 2>&1
```

CUDA source correlated with SASS (requires `--import-source yes` during profiling):
```bash
TMPDIR=/tmp/ncu_tmp ncu -i <REPORT> --page source --csv --print-source cuda,sass --kernel-name regex:<pattern> 2>&1
ncu -i <REPORT> --page source --csv --print-source cuda,sass --kernel-name regex:<pattern> 2>&1
```

Source output columns include per-instruction: Warp Stall Sampling, Instructions Executed, Thread Instructions Executed, stall reasons (stall_barrier, stall_math, stall_wait, etc.), shared memory conflicts, and more.
Expand All @@ -160,12 +160,12 @@ Source output columns include per-instruction: Warp Stall Sampling, Instructions

Rules are included in the details page output. Look for non-empty "Rule Name" column entries.
```bash
TMPDIR=/tmp/ncu_tmp ncu -i <REPORT> --page details --csv --print-rule-details 2>&1 | grep -v '^"[0-9]' | head -5 # header
ncu -i <REPORT> --page details --csv --print-rule-details 2>&1 | grep -v '^"[0-9]' | head -5 # header
```

To see all rule results with descriptions:
```bash
TMPDIR=/tmp/ncu_tmp ncu -i <REPORT> --page details --csv --print-rule-details 2>&1
ncu -i <REPORT> --page details --csv --print-rule-details 2>&1
```
Filter for rows where column 17 (Rule Name) is non-empty.

Expand Down
2 changes: 1 addition & 1 deletion CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ Call `tilus.option.debug.dump_ir()` before running the kernel. The IR after each
```python
...
self.store_shared(s_c, ...)
self.fence.async_view(space="shared") # fence.proxy.async.shared::cta
self.fence.proxy_async(space="shared") # fence.proxy.async.shared::cta
self.sync()
with self.single_thread():
self.tma.shared_to_global(s_c, g_c, ...)
Expand Down
2 changes: 1 addition & 1 deletion examples/blackwell_matmul/matmul_v5.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ def __call__(
r_acc = self.tcgen05.load(t_acc)
self.tcgen05.wait_load()
self.store_shared(s_c, r_acc.to(float16))
self.fence.async_view()
self.fence.proxy_async()
self.sync()
with self.single_thread():
self.tma.shared_to_global(
Expand Down
2 changes: 1 addition & 1 deletion examples/blackwell_matmul/matmul_v6.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,7 @@ def __call__(
r_acc = self.tcgen05.load(t_acc)
self.tcgen05.wait_load()
self.store_shared(s_c, r_acc.to(float16))
self.fence.async_view()
self.fence.proxy_async()
self.sync()
with self.single_thread():
self.tma.shared_to_global(
Expand Down
4 changes: 2 additions & 2 deletions examples/blackwell_matmul/matmul_v7.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def query_clc_response(self, s_clc_response: SharedTensor, pipe: Pipeline):
pipe.consumer_acquire(scope="cluster")
response = s_clc_response[pipe.consumer_stage]
is_valid, new_blockIdx = self.clc.query_response(response)
self.fence.async_view(space="shared")
self.fence.proxy_async(space="shared")
self.mbarrier.arrive_and_expect_tx_remote(
pipe.consumer_barrier(), transaction_bytes=0, target_rank=0
)
Expand Down Expand Up @@ -278,7 +278,7 @@ def __call__(
r_acc = self.tcgen05.load(t_acc_slice)
self.tcgen05.wait_load()
self.store_shared(s_c, r_acc.to(float16))
self.fence.async_view(space="shared")
self.fence.proxy_async(space="shared")
self.sync()
with self.single_thread():
self.tma.shared_to_global(
Expand Down
44 changes: 22 additions & 22 deletions examples/blackwell_matmul/matmul_v8.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,12 @@ def __init__(
self.producer_phase: uint32 = self.mbarrier.producer_initial_phase
self.consumer_phase: uint32 = self.mbarrier.consumer_initial_phase

def producer_acquire(self, scope: str = "cta"):
def producer_acquire(self):
self.mbarrier.wait(
barrier=self.empty_barriers[self.producer_stage],
phase=self.producer_phase,
scope=scope,
sem="relaxed",
scope="cta",
)

def producer_barrier(self) -> RegisterTensor:
Expand All @@ -45,11 +46,12 @@ def producer_advance(self):
self.producer_stage = (self.producer_stage + 1) % self.num_stages
self.producer_phase = self.producer_phase ^ (self.producer_stage == 0)

def consumer_acquire(self, scope: str = "cta"):
def consumer_acquire(self):
self.mbarrier.wait(
barrier=self.full_barriers[self.consumer_stage],
phase=self.consumer_phase,
scope=scope,
sem="relaxed",
scope="cta",
)

def consumer_barrier(self) -> RegisterTensor:
Expand All @@ -67,15 +69,15 @@ def consumer_advance(self):
@tilus.autotune("mma_stages", [2])
@tilus.autotune("swizzle_size", [4, 8, 16])
class BlackwellMatmulV8(tilus.Script):
debug_schedule = dict(
block_m=256,
block_n=256,
block_k=64,
tma_stages=6,
mma_stages=2,
e_block_n=32,
swizzle_size=8,
)
# debug_schedule = dict(
# block_m=256,
# block_n=256,
# block_k=64,
# tma_stages=6,
# mma_stages=2,
# e_block_n=32,
# swizzle_size=8,
# )

def __init__(
self,
Expand Down Expand Up @@ -113,10 +115,10 @@ def compute_block_coord(
return m_block, n_block

def query_clc_response(self, s_clc_response: SharedTensor, pipe: Pipeline):
pipe.consumer_acquire(scope="cluster")
pipe.consumer_acquire()
response = s_clc_response[pipe.consumer_stage]
is_valid, new_blockIdx = self.clc.query_response(response)
self.fence.async_view(space="shared")
self.fence.proxy_async(space="shared")
self.mbarrier.arrive_and_expect_tx_remote(
pipe.consumer_barrier(), transaction_bytes=0, target_rank=0
)
Expand Down Expand Up @@ -270,9 +272,7 @@ def __call__(
with self.single_warp(2): # scheduler
while True:
if cta_rank == 0:
clc_pipe.producer_acquire(
scope="cluster"
) # peer cta will arrive this barrier, need 'cluster'scoped acquire
clc_pipe.producer_acquire()
self.mbarrier.arrive_and_expect_tx_multicast(
clc_pipe.producer_barrier(),
transaction_bytes=16,
Expand Down Expand Up @@ -310,7 +310,7 @@ def __call__(
r_acc = self.tcgen05.load(t_acc_slice)
self.tcgen05.wait_load()
self.store_shared(s_c, r_acc.to(float16))
self.fence.async_view(space="shared")
self.fence.proxy_async(space="shared")
self.sync()
with self.single_thread():
self.tma.shared_to_global(
Expand All @@ -320,7 +320,7 @@ def __call__(
dims=[0, 1],
)
self.tma.commit_group()
self.tma.wait_group(n=0)
self.tma.wait_group(n=0, read=True)
self.sync()

self.mbarrier.arrive(mma_pipe.consumer_barrier())
Expand Down Expand Up @@ -372,7 +372,7 @@ def main(bench=True):
("torch", lambda: torch.matmul(a, b.T, out=c_expected)),
("tilus", lambda: matmul(m_size, n_size, k_size, a, b, c_actual)),
]:
latency = benchmark_func(func, warmup=5, repeat=20)
latency = benchmark_func(func, warmup=5, repeat=200)
tflops = 2 * m_size * n_size * k_size / latency * 1e-9
rows.append([m_size, n_size, k_size, name, latency, tflops])

Expand All @@ -384,4 +384,4 @@ def main(bench=True):
if __name__ == "__main__":
main(bench=True)
# main(bench=False)
# ncu_run(main, bench=False, kernel_regex="hidet|nvjet")
# ncu_run(main, bench=False, kernel_regex="tilus|nvjet")
Loading
Loading