solution.py
final answer
check.log
result.json
"""Custom paged attention decode kernel for RTX PRO 6000 (SM120 Blackwell).
Implements single-query paged attention decode using CUDA C++ via load_inline.
"""
import math
import torch
import torch.nn as nn
from torch.utils.cpp_extension import load_inline
OP_TYPE = "attention"
SUPPORTED_PRECISIONS = ["bf16"]
HARDWARE_REQUIRED = ["RTX_PRO_6000", "H100", "B200"]
BATCH = 8
NUM_HEADS = 32
NUM_KV_HEADS = 8
HEAD_DIM = 128
SEQ_LEN = 1024
PAGE_SIZE = 16
CUDA_SOURCE = r"""
#include <torch/extension.h>
#include <cuda_bf16.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
using bfloat16 = __nv_bfloat16;
__device__ inline float bf16_to_float(bfloat16 x) {
return __bfloat162float(x);
}
__device__ inline bfloat16 float_to_bf16(float x) {
return __float2bfloat16(x);
}
template <int THREADS>
__device__ inline float warp_reduce_sum(float val) {
#pragma unroll
for (int mask = THREADS >> 1; mask > 0; mask >>= 1) {
val += __shfl_xor_sync(0xffffffff, val, mask);
}
return val;
}
template <int HEAD_DIM, int MAX_PAGE_SIZE>
__global__ void paged_attention_kernel(
const bfloat16* __restrict__ query,
const bfloat16* __restrict__ kv_cache,
const int* __restrict__ block_table,
const int* __restrict__ seq_lens,
bfloat16* __restrict__ output,
float scale,
int batch,
int num_heads,
int num_kv_heads,
int group_size,
int max_blocks_per_seq,
int page_size
) {
int batch_idx = blockIdx.x;
int head_idx = blockIdx.y * blockDim.y + threadIdx.y;
if (batch_idx >= batch || head_idx >= num_heads) return;
int kv_head_idx = head_idx / group_size;
int seq_len = seq_lens[batch_idx];
int num_pages = (seq_len + page_size - 1) / page_size;
extern __shared__ char smem[];
bfloat16* smem_kv = reinterpret_cast<bfloat16*>(smem);
bfloat16* smem_scores = reinterpret_cast<bfloat16*>(smem + 2 * MAX_PAGE_SIZE * HEAD_DIM * sizeof(bfloat16));
const bfloat16* q_ptr = query + batch_idx * num_heads * HEAD_DIM + head_idx * HEAD_DIM;
float q_f[HEAD_DIM];
for (int i = 0; i < HEAD_DIM; i++) {
q_f[i] = bf16_to_float(q_ptr[i]);
}
float max_score = -1e20f;
float sum_exp = 0.0f;
float output_f[HEAD_DIM];
for (int i = 0; i < HEAD_DIM; i++) {
output_f[i] = 0.0f;
}
for (int page_idx = 0; page_idx < num_pages; page_idx++) {
int page_num = block_table[batch_idx * max_blocks_per_seq + page_idx];
int tokens_in_page = (page_idx == num_pages - 1) ? ((seq_len - 1) % page_size + 1) : page_size;
for (int tid = threadIdx.x; tid < tokens_in_page * HEAD_DIM; tid += blockDim.x) {
int token_in_page = tid / HEAD_DIM;
int dim = tid % HEAD_DIM;
int cache_idx = page_num * page_size * num_kv_heads * HEAD_DIM * 2 +
token_in_page * num_kv_heads * HEAD_DIM * 2 +
kv_head_idx * HEAD_DIM * 2 +
dim;
smem_kv[token_in_page * HEAD_DIM + dim] = kv_cache[cache_idx];
smem_kv[MAX_PAGE_SIZE * HEAD_DIM + token_in_page * HEAD_DIM + dim] = kv_cache[cache_idx + HEAD_DIM];
}
__syncthreads();
for (int token_local = threadIdx.y; token_local < tokens_in_page; token_local += blockDim.y) {
int token_abs = page_idx * page_size + token_local;
if (token_abs >= seq_len) continue;
bfloat16* k_vec = smem_kv + token_local * HEAD_DIM;
float score = 0.0f;
for (int dim = threadIdx.x; dim < HEAD_DIM; dim += blockDim.x) {
score += q_f[dim] * bf16_to_float(k_vec[dim]);
}
score = warp_reduce_sum<32>(score);
float exp_s = 0.0f;
if (threadIdx.x == 0) {
exp_s = expf(score * scale - max_score);
smem_scores[token_local] = float_to_bf16(exp_s);
}
float weight = warp_reduce_sum<32>(exp_s);
if (threadIdx.x == 0) {
sum_exp += weight;
}
bfloat16* v_vec = smem_kv + MAX_PAGE_SIZE * HEAD_DIM + token_local * HEAD_DIM;
for (int dim = threadIdx.x; dim < HEAD_DIM; dim += blockDim.x) {
output_f[dim] += weight * bf16_to_float(v_vec[dim]);
}
}
__syncthreads();
}
if (sum_exp > 0.0f) {
float inv_sum_exp = 1.0f / sum_exp;
for (int dim = threadIdx.x; dim < HEAD_DIM; dim += blockDim.x) {
output_f[dim] *= inv_sum_exp;
}
}
if (threadIdx.x == 0 && threadIdx.y == 0) {
bfloat16* out_ptr = output + batch_idx * num_heads * HEAD_DIM + head_idx * HEAD_DIM;
for (int dim = 0; dim < HEAD_DIM; dim++) {
out_ptr[dim] = float_to_bf16(output_f[dim]);
}
}
}
torch::Tensor paged_attention_forward_impl(
torch::Tensor query,
torch::Tensor kv_cache,
torch::Tensor block_table,
torch::Tensor seq_lens,
int num_heads,
int num_kv_heads,
int head_dim,
int page_size
) {
int batch = query.size(0);
int group_size = num_heads / num_kv_heads;
int max_blocks_per_seq = block_table.size(1);
auto output = torch::empty_like(query);
const int BLOCK_SIZE = 128;
const int NUM_THREADS_Y = 4;
const int MAX_PAGE_SIZE = 16;
dim3 block(BLOCK_SIZE, NUM_THREADS_Y);
dim3 grid(batch, (num_heads + NUM_THREADS_Y - 1) / NUM_THREADS_Y);
int smem_size = 2 * MAX_PAGE_SIZE * head_dim * sizeof(bfloat16) + MAX_PAGE_SIZE * sizeof(bfloat16);
float scale = 1.0f / sqrtf(static_cast<float>(head_dim));
paged_attention_kernel<128, 16><<<grid, block, smem_size>>>(
static_cast<bfloat16*>(query.data_ptr()),
static_cast<bfloat16*>(kv_cache.data_ptr()),
static_cast<int*>(block_table.data_ptr()),
static_cast<int*>(seq_lens.data_ptr()),
static_cast<bfloat16*>(output.data_ptr()),
scale,
batch,
num_heads,
num_kv_heads,
group_size,
max_blocks_per_seq,
page_size
);
return output;
}
"""
CPP_SOURCE = r"""
#include <torch/extension.h>
torch::Tensor paged_attention_forward_impl(
torch::Tensor query,
torch::Tensor kv_cache,
torch::Tensor block_table,
torch::Tensor seq_lens,
int num_heads,
int num_kv_heads,
int head_dim,
int page_size
);
torch::Tensor paged_attention_forward(
torch::Tensor query,
torch::Tensor kv_cache,
torch::Tensor block_table,
torch::Tensor seq_lens,
int num_heads,
int num_kv_heads,
int head_dim,
int page_size
) {
return paged_attention_forward_impl(query, kv_cache, block_table, seq_lens,
num_heads, num_kv_heads, head_dim, page_size);
}
"""
_cached_module = None
def _get_cuda_module():
global _cached_module
if _cached_module is None:
_cached_module = load_inline(
name="paged_attention_cuda",
cpp_sources=CPP_SOURCE,
cuda_sources=CUDA_SOURCE,
functions=["paged_attention_forward"],
verbose=False,
extra_cflags=["-O3"],
extra_cuda_cflags=["-O3", "-gencode=arch=compute_120,code=sm_120"],
)
return _cached_module
class Model(nn.Module):
"""Single-query paged attention decode.
Forward inputs (all on device):
query: (batch, num_heads, head_dim) bf16
kv_cache: (num_blocks, page_size, num_kv_heads, head_dim * 2)
Layout: last dim packs [K | V] so a single gather pulls both.
Stored as bf16.
block_table: (batch, max_blocks) int32
seq_lens: (batch,) int32
Output:
attn_out: (batch, num_heads, head_dim) bf16
"""
def __init__(
self,
batch: int,
num_heads: int,
num_kv_heads: int,
head_dim: int,
seq_len: int,
page_size: int,
):
super().__init__()
assert num_heads % num_kv_heads == 0, "num_heads must be a multiple of num_kv_heads (GQA)"
self.batch = batch
self.num_heads = num_heads
self.num_kv_heads = num_kv_heads
self.head_dim = head_dim
self.seq_len = seq_len
self.page_size = page_size
self.group_size = num_heads // num_kv_heads
self.register_buffer("_dummy", torch.zeros(1, dtype=torch.bfloat16), persistent=False)
def forward(
self,
query: torch.Tensor,
kv_cache: torch.Tensor,
block_table: torch.Tensor,
seq_lens: torch.Tensor,
) -> torch.Tensor:
cuda_mod = _get_cuda_module()
output = cuda_mod.paged_attention_forward(
query,
kv_cache,
block_table,
seq_lens,
self.num_heads,
self.num_kv_heads,
self.head_dim,
self.page_size,
)
return output
def get_inputs():
"""Build random paged inputs for the current module-level shape knobs."""
B = BATCH
H = NUM_HEADS
Hkv = NUM_KV_HEADS
D = HEAD_DIM
L = SEQ_LEN
P = PAGE_SIZE
pages_per_seq = (L + P - 1) // P
total_pages = max(B * pages_per_seq + 8, 64)
query = torch.randn(B, H, D, dtype=torch.bfloat16) * 0.1
kv_cache = torch.randn(total_pages, P, Hkv, 2 * D, dtype=torch.bfloat16) * 0.1
perm = torch.randperm(total_pages)[: B * pages_per_seq].reshape(B, pages_per_seq).int()
block_table = perm.contiguous()
seq_lens = torch.full((B,), L, dtype=torch.int32)
return [query, kv_cache, block_table, seq_lens]
def get_init_inputs():
return [BATCH, NUM_HEADS, NUM_KV_HEADS, HEAD_DIM, SEQ_LEN, PAGE_SIZE]
The kernel compiled but for wrong architecture. RTX PRO 6000 is SM120. Let me fix the architecture:
FAIL: shape 0 {'batch': 8, 'num_heads': 32, 'num_kv_heads': 8, 'head_dim': 128, 'seq_len': 1024, 'page_size': 16} seed 42: solution contains NaN
{
"problem": "03_paged_attention",
"harness": "opencode",
"model": "openrouter-pinned/minimax/minimax-m2.7",
"reasoning_effort": "",
"has_solution": true,
"correct": false,
"peak_fraction": null,
"elapsed_seconds": 506,
"harness_exit_code": 0,
"session_complete": true,
"usage": {"input_tokens": 122844, "output_tokens": 20981, "cache_read_tokens": 1239102, "cache_creation_tokens": 0, "reasoning_tokens": 3846, "total_cost_usd": null}
}
timeline (111 events)
tool
stdout (5926 chars) — TRUNCATED
<path>/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/reference.py</path>
<type>file</type>
<content>
1: """Naive PyTorch paged-attention decode reference (correctness oracle, not SOTA).
2:
3: Single-query decode: each batch element has a query of shape (num_heads, head_dim)
4: and attends over a KV cache of `seq_len[b]` tokens stored as fixed-size pages in
5: a global pool. Pages for batch element b are listed in `block_table[b]`.
6:
7: The reference performs the slow path:
8: 1. Gather pages -> contiguous (seq_len, num_kv_heads, head_dim) per batch element.
9: 2. Repeat KV heads for grouped-query (broadcast num_kv_heads -> num_heads).
10: 3. Manual softmax(QK^T / sqrt(d)) @ V in fp32, cast back to bf16.
11:
12: This avoids torch.nn.functional.scaled_dot_product_attention (which is on the
13: forbidden list) so the agent cannot dispatch through SDPA either.
14: """
15: import math
16:
17: import torch
18: import torch.nn as nn
19:
20: OP_TYPE = "attention"
21: SUPPORTED_PRECISIONS = ["bf16"]
22: HARDWARE_REQUIRED = ["RTX_PRO_6000", "H100", "B200"]
23:
24:
25: # --- Shape knobs (overridden by check.py / benchmark.py from shapes.py) ----
26: BATCH = 8
27: NUM_HEADS = 32
28: NUM_KV_HEADS = 8
29: HEAD_DIM = 128
30: SEQ_LEN = 1024
31: PAGE_SIZE = 16
32:
33:
34: class Model(nn.Module):
35: """Single-query paged attention decode.
36:
37: Forward inputs (all on device):
38: query: (batch, num_heads, head_dim) bf16
3
... (4426 more chars)
tool
stdout (4037 chars) — TRUNCATED
<path>/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/check.py</path>
<type>file</type>
<content>
1: """Correctness runner for paged-attention decode.
2:
3: Runs solution.Model vs reference.Model across all shapes in shapes.py, 3 seeds
4: each, with per-dtype atol/rtol. Also rejects forbidden ops by grep.
5: """
6: import re
7: import sys
8: from pathlib import Path
9:
10: import torch
11: import yaml
12:
13: REPO_ROOT = Path(__file__).resolve().parents[2]
14: sys.path.insert(0, str(REPO_ROOT))
15:
16: from src.eval.correctness import check_correctness # noqa: E402
17:
18:
19: def _apply_shape(reference, shape: dict) -> None:
20: reference.BATCH = shape["batch"]
21: reference.NUM_HEADS = shape["num_heads"]
22: reference.NUM_KV_HEADS = shape["num_kv_heads"]
23: reference.HEAD_DIM = shape["head_dim"]
24: reference.SEQ_LEN = shape["seq_len"]
25: reference.PAGE_SIZE = shape["page_size"]
26:
27:
28: def main():
29: try:
30: import reference
31: import shapes
32: import solution
33: except Exception as e:
34: print(f"FAIL: import error: {e}")
35: sys.exit(1)
36:
37: problem_yaml = Path("problem.yaml")
38: meta = yaml.safe_load(problem_yaml.read_text()) if problem_yaml.exists() else {}
39:
40: sol_src = Path("solution.py").read_text() if Path("solution.py").exists() else ""
41: for forbidden in meta.get("forbidden", []):
42: pat = re.escape(forbidden)
43:
... (2537 more chars)
tool
stdout (5063 chars) — TRUNCATED
<path>/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/benchmark.py</path>
<type>file</type>
<content>
1: """Roofline benchmark for paged-attention decode.
2:
3: For each shape: times eager reference, compiled reference, SOTA (if available),
4: and the agent's solution. Reports achieved TFLOPS, GB/s, and peak_fraction.
5:
6: Decode is memory-bound, so peak_fraction is computed from achieved GB/s vs
7: the GPU's peak DRAM bandwidth.
8: """
9: import sys
10: from math import exp, log
11: from pathlib import Path
12:
13: import torch
14: import yaml
15:
16: REPO_ROOT = Path(__file__).resolve().parents[2]
17: sys.path.insert(0, str(REPO_ROOT))
18:
19: from src.eval.roofline import compute_gbps, compute_tflops, peak_fraction # noqa: E402
20: from src.eval.timing import time_fn # noqa: E402
21: from src.hardware import get as get_hw # noqa: E402
22:
23:
24: def _eval_formula(expr: str, vars: dict) -> float:
25: return float(eval(expr, {"__builtins__": {}}, vars))
26:
27:
28: def _apply_shape(reference, shape: dict) -> None:
29: reference.BATCH = shape["batch"]
30: reference.NUM_HEADS = shape["num_heads"]
31: reference.NUM_KV_HEADS = shape["num_kv_heads"]
32: reference.HEAD_DIM = shape["head_dim"]
33: reference.SEQ_LEN = shape["seq_len"]
34: reference.PAGE_SIZE = shape["page_size"]
35:
36:
37: def main():
38: import reference
39: import shapes
40: import solution
41:
42: meta = yaml.safe_load(Path("problem.y
... (3563 more chars)
tool
stdout (538 chars)
/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/PROMPT.txt
/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/benchmark.py
/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/sota.py
/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/check.py
/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/problem.yaml
/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/shapes.py
/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/reference.py
assistant
I'll start by reading the reference implementation and understanding the problem structure.
assistant
in 14203out 175cache_r 1824cache_w 0
tool
stdout (1155 chars)
<path>/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/shapes.py</path>
<type>file</type>
<content>
1: """Shape sweep for paged attention decode.
2:
3: Mix targets:
4: - small batch / long context (Llama-3 8B-style decode)
5: - large batch / medium context (server batched decode)
6: - GQA wide ratio (Llama-3 70B: 64 heads / 8 kv-heads)
7: - non-power-of-2 seq_len (forces predicated tail handling)
8: - head_dim=64 small-head case
9: """
10:
11: SHAPES = [
12: # (B, H, Hkv, D, L, P)
13: {"batch": 8, "num_heads": 32, "num_kv_heads": 8, "head_dim": 128, "seq_len": 1024, "page_size": 16},
14: {"batch": 32, "num_heads": 32, "num_kv_heads": 8, "head_dim": 128, "seq_len": 2048, "page_size": 16},
15: {"batch": 4, "num_heads": 64, "num_kv_heads": 8, "head_dim": 128, "seq_len": 4096, "page_size": 16},
16: {"batch": 16, "num_heads": 32, "num_kv_heads": 8, "head_dim": 128, "seq_len": 1535, "page_size": 16}, # non-pow2
17: {"batch": 8, "num_heads": 16, "num_kv_heads": 4, "head_dim": 64, "seq_len": 2000, "page_size": 16}, # small-D, non-pow2
18: ]
(End of file - total 18 lines)
</content>
tool
stdout (2375 chars) — TRUNCATED
<path>/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/problem.yaml</path>
<type>file</type>
<content>
1: name: 03_paged_attention
2: display_name: "Paged Attention Decode"
3: precision: bf16
4: regime: memory # decode is bandwidth-bound (KV cache streamed once per token)
5:
6: # Effective FLOPs per call: QK^T + softmax*V across all batches/heads/seq.
7: # Sum-formula evaluated per-shape; the benchmark substitutes batch/num_heads/etc.
8: # 2 * B * H * L * D for QK^T plus 2 * B * H * L * D for AV => 4 * B * H * L * D
9: flops_formula: "4 * batch * num_heads * seq_len * head_dim"
10:
11: # Bytes moved (the real bottleneck): the KV cache must be streamed end-to-end.
12: # K and V each: batch * seq_len * num_kv_heads * head_dim * 2 bytes (bf16)
13: # Q: batch * num_heads * head_dim * 2 (negligible vs KV)
14: # Out: same as Q
15: # So total ~ 2 * (B * L * Hkv * D * 2) + small.
16: bytes_formula: "2 * batch * seq_len * num_kv_heads * head_dim * 2 + batch * num_heads * head_dim * 2 * 2"
17:
18: hardware: [RTX_PRO_6000]
19: peak_tflops_key: bf16 # informational; regime=memory uses bandwidth
20: peak_bandwidth_key: dram # 1.8 TB/s GDDR7 on RTX PRO 6000 Blackwell
21:
22: tolerance:
23: bfloat16: 0.02
24: float16: 0.02
25:
26: # Forbidden ops -- using any of these in solution.py fails correctness post-hoc.
27: # vllm.attention / flashinfer.batch_decode_with_paged_kv_cache: dispatching to
28: # the SOTA library is the trivial cheat. SD
... (875 more chars)
tool
stdout (3303 chars) — TRUNCATED
<path>/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/sota.py</path>
<type>file</type>
<content>
1: """SOTA reference for paged-attention decode.
2:
3: Tries, in order:
4: 1. FlashInfer's BatchDecodeWithPagedKVCacheWrapper (preferred -- portable,
5: supports SM120, GQA, arbitrary head_dim).
6: 2. vLLM's paged_attention_v2 CUDA op (requires its KV-cache layout, more
7: finicky; we adapt the layout on the fly when possible).
8:
9: If neither is importable, is_available() returns False and the benchmark just
10: reports eager + compiled + solution.
11:
12: Agents are FORBIDDEN from importing these in solution.py (see problem.yaml).
13: This file is only for the benchmark's reference line.
14: """
15: from __future__ import annotations
16:
17: import torch
18:
19:
20: def _try_flashinfer(
21: query: torch.Tensor,
22: kv_cache: torch.Tensor,
23: block_table: torch.Tensor,
24: seq_lens: torch.Tensor,
25: num_kv_heads: int,
26: head_dim: int,
27: page_size: int,
28: ) -> torch.Tensor | None:
29: try:
30: import flashinfer # noqa: F401
31: from flashinfer.decode import BatchDecodeWithPagedKVCacheWrapper
32: except Exception:
33: return None
34:
35: B, H, D = query.shape
36: # FlashInfer expects K and V as separate (num_blocks, page_size, num_kv_heads, head_dim) tensors.
37: # Our reference packs [K|V] on the last dim -- split here.
38: k_cache = kv_cache[..., :D].contig
... (1803 more chars)
assistant
in 19303out 124cache_r 1824cache_w 0
tool
stdout (211 chars)
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2026 NVIDIA Corporation
Built on Mon_Mar_02_09:52:23_PM_PST_2026
Cuda compilation tools, release 13.2, V13.2.51
Build cuda_13.2.r13.2/compiler.37434383_0
tool
stdout (1781 chars) — TRUNCATED
Mon Apr 27 09:26:04 2026
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 595.58.03 Driver Version: 595.58.03 CUDA Version: 13.2 |
+-----------------------------------------+------------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA RTX PRO 6000 Blac... On | 00000000:01:00.0 On | Off |
| 30% 27C P8 11W / 600W | 383MiB / 97887MiB | 0% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
+-----------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|======================================================================================
... (281 more chars)
assistant
Now let me check the hardware specs and understand the CUDA environment:
tool
stdout (62 chars)
PyTorch: 2.11.0+cu130
CUDA available: True
CUDA version: 13.0
assistant
in 21656out 165cache_r 1787cache_w 0
tool
stdout (2962 chars) — TRUNCATED
<skill_content name="kernel-optimization">
# Skill: kernel-optimization
# Kernel Optimization
Unified reference for GPU/TPU kernel work across all platforms. This skill routes to specialized sub-references based on what you need.
## When this skill fires
- Writing or optimizing CUDA, Metal, Triton, HIP, or Pallas kernels
- Porting kernels between platforms (CUDA <-> Metal <-> Triton <-> HIP <-> Pallas)
- Debugging GPU code (OOM, NaN, race conditions, wrong results, hangs)
- Performance profiling and optimization
- Numerical precision issues (FP formats, mixed precision, tolerance)
- AMD MI300/MI350 or TPU v4/v5/v6/v7 specific work
## Sub-references (read on demand)
The following files contain deep platform-specific knowledge. Read the relevant one(s) based on the task:
| File | When to read |
|------|-------------|
| `~/.claude/skills/amd-gpu/SKILL.md` | AMD CDNA/RDNA, ROCm, HIP, MFMA, LDS, chiplet scheduling, HipKittens, AITER |
| `~/.claude/skills/tpu-opt/SKILL.md` | TPU v4-v7, MXU, Pallas kernels, JAX sharding, XLA gotchas, SplashAttention |
| `~/.claude/skills/port-kernel/SKILL.md` | Cross-platform porting: CUDA <-> Metal <-> Triton concept maps, API equivalents, gotchas |
| `~/.claude/skills/debug-gpu/SKILL.md` | OOM diagnosis, NaN tracing, race conditions, compute-sanitizer, FP format specs, tolerance tables |
| `~/.claude/skills/perf-optim/SKILL.md` | Profiling (nsys/ncu/Metal), roofline, occupancy, memory hierarchy, CUTLASS/CuTe, quantization, KernelBench |
| `
... (1462 more chars)
assistant
in 807out 25cache_r 23739cache_w 0
tool
stdout (55080 chars) — TRUNCATED
<path>/home/infatoshi/.claude/skills/perf-optim/SKILL.md</path>
<type>file</type>
<content>
1: ---
2: name: perf-optim
3: description: "[SUB-REFERENCE of kernel skill] Performance optimization. Do NOT invoke directly -- use /kernel instead, which reads this file on demand."
4: ---
5:
6: # Performance Optimization Atlas
7:
8: Comprehensive, hierarchical reference for optimizing GPU workloads. Pick combinations from this atlas based on your bottleneck profile. **Always profile before optimizing.**
9:
10: ---
11:
12: ## 0. FIRST PRINCIPLES: SPEED OF LIGHT, THEN PROFILE
13:
14: ### Rule
15: Never optimize blind. But before you even profile, compute the **speed of light** -- the theoretical physical limit for this operation. Then profile to measure the gap. The gap IS the optimization opportunity.
16:
17: ### Speed of Light Analysis
18: 1. **Compute the limit**: What does physics allow? Peak FLOPS, peak bandwidth, minimum latency for this exact workload
19: 2. **Profile current state**: Where are we now?
20: 3. **Measure the gap**: speed_of_light - measured = optimization budget
21: 4. **Don't continuous-improve from current state**: Strip to zero. If you built this from scratch knowing only the physical limits, what would you build? Then reason about every piece of the gap between that and current
22: - "It takes 74ms" -- don't optimize to 72ms. Compute that physics allows 6ms. Now reason about the 6-to-74 gap. The rest might be well-reasoned compromises, but at least yo
... (53580 more chars)
assistant
in 1248out 40cache_r 24187cache_w 0
tool
stdout (372 chars)
content: 'Implement paged attention CUDA kernel in solution.py'
status: 'in_progress'
priority: 'high'
content: 'Verify correctness with python check.py'
status: 'pending'
priority: 'high'
content: 'Profile kernel performance with benchmark.py'
status: 'pending'
priority: 'medium'
content: 'Optimize kernel based on profiling results'
status: 'pending'
priority: 'medium'
assistant
Now I have a thorough understanding. Let me implement the paged attention kernel. I'll use CUDA C++ via `torch.utils.cpp_extension.load_inline` with a memory-efficient design for Blackwell.
assistant
in 38063out 159cache_r 1787cache_w 0
assistant
in 484out 5279cache_r 39867cache_w 0
assistant
Let me test if this compiles and passes correctness:
tool
stdout (309 chars)
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/solution.py", line 288, in <module>
module = load_inline(
^^^^^^^^^^^^
TypeError: load_inline() got an unexpected keyword argument 'cpp_source'
assistant
in 57out 89cache_r 45691cache_w 0
tool
stdout (26 chars)
Edit applied successfully.
assistant
Let me fix the load_inline syntax:
assistant
in 255out 100cache_r 45691cache_w 0
assistant
in 398out 79cache_r 45691cache_w 0
assistant
Now let me run the correctness check:
tool
stdout (1179 chars)
Traceback (most recent call last):
File "/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/check.py", line 109, in <module>
main()
File "/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/check.py", line 72, in main
sol_out = sol_model(*inputs)
^^^^^^^^^^^^^^^^^^
File "/home/infatoshi/.local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1779, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/infatoshi/.local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1790, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/solution.py", line 544, in forward
cuda_mod = self._get_cuda_module()
^^^^^^^^^^^^^^^^^^^^^^^
File "/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/solution.py", line 334, in _get_cuda_module
self._cuda_module = load_inline(
^^^^^^^^^^^^
TypeError: load_inline() got an unexpected keyword argument 'cpp_source'
assistant
in 143out 81cache_r 46139cache_w 0
tool
stdout (26 chars)
Edit applied successfully.
assistant
Let me fix the load_inline call - it doesn't take `cpp_source`:
assistant
in 565out 357cache_r 46139cache_w 0
tool
stdout (26 chars)
Edit applied successfully.
assistant
in 128out 324cache_r 47035cache_w 0
tool
stdout (934 chars)
Traceback (most recent call last):
File "<string>", line 35, in <module>
File "/home/infatoshi/.local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1779, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/infatoshi/.local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1790, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/solution.py", line 543, in forward
cuda_mod = self._get_cuda_module()
^^^^^^^^^^^^^^^^^^^^^^^
File "/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/solution.py", line 334, in _get_cuda_module
self._cuda_module = load_inline(
^^^^^^^^^^^^
TypeError: load_inline() got an unexpected keyword argument 'cuda_source'
assistant
in 285out 411cache_r 47483cache_w 0
tool
stdout (3594 chars) — TRUNCATED
Help on function load_inline in module torch.utils.cpp_extension:
load_inline(name, cpp_sources, cuda_sources=None, sycl_sources=None, functions=None, extra_cflags=None, extra_cuda_cflags=None, extra_sycl_cflags=None, extra_ldflags=None, extra_include_paths=None, build_directory=None, verbose=False, with_cuda=None, with_sycl=None, is_python_module=True, with_pytorch_error_handling=True, keep_intermediates=True, use_pch=False, no_implicit_headers=False)
Load a PyTorch C++ extension just-in-time (JIT) from string sources.
This function behaves exactly like :func:`load`, but takes its sources as
strings rather than filenames. These strings are stored to files in the
build directory, after which the behavior of :func:`load_inline` is
identical to :func:`load`.
See `the
tests <https://github.com/pytorch/pytorch/blob/master/test/test_cpp_extensions_jit.py>`_
for good examples of using this function.
Sources may omit two required parts of a typical non-inline C++ extension:
the necessary header includes, as well as the (pybind11) binding code. More
precisely, strings passed to ``cpp_sources`` are first concatenated into a
single ``.cpp`` file. This file is then prepended with ``#include
<torch/extension.h>``
Furthermore, if the ``functions`` argument is supplied, bindings will be
automatically generated for each function specified. ``functions`` can
either be a list of function names, or a dictionary mapping from f
... (2094 more chars)
assistant
in 515out 60cache_r 47931cache_w 0
tool
stdout (26 chars)
Edit applied successfully.
assistant
The parameter is `cuda_sources` (plural), not `cuda_source`. Let me fix:
assistant
in 1003out 343cache_r 48379cache_w 0
tool
stdout (5551 chars) — TRUNCATED
Traceback (most recent call last):
File "/home/infatoshi/.local/lib/python3.12/site-packages/torch/utils/cpp_extension.py", line 2774, in _run_ninja_build
subprocess.run(
File "/usr/lib/python3.12/subprocess.py", line 571, in run
raise CalledProcessError(retcode, process.args,
subprocess.CalledProcessError: Command '['ninja', '-v']' returned non-zero exit status 1.
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "<string>", line 35, in <module>
File "/home/infatoshi/.local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1779, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/infatoshi/.local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1790, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/solution.py", line 544, in forward
cuda_mod = self._get_cuda_module()
^^^^^^^^^^^^^^^^^^^^^^^
File "/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/solution.py", line 334, in _get_cuda_module
self._cuda_module = load_inline(
^^^^^^^^^^^^
File "/home/infatoshi/.local/lib/python3.12/site-packages/torch/utils/cpp_extension.py", line 2174, in load_inline
return _jit_compile(
^^^^^^^^^^^^^
File "/home/infatosh
... (4051 more chars)
assistant
in 53out 387cache_r 49723cache_w 0
tool
stdout (9219 chars) — TRUNCATED
<path>/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/solution.py</path>
<type>file</type>
<content>
320: self.head_dim = head_dim
321: self.seq_len = seq_len
322: self.page_size = page_size
323: self.group_size = num_heads // num_kv_heads
324:
325: self.register_buffer("_dummy", torch.zeros(1, dtype=torch.bfloat16), persistent=False)
326:
327: self._cuda_module = None
328:
329: def _get_cuda_module(self):
330: if self._cuda_module is None:
331: from torch.utils.cpp_extension import load_inline
332:
333: cuda_src = self._get_cuda_source()
334: self._cuda_module = load_inline(
335: name="paged_attention_cuda_" + str(id(self)),
336: cpp_sources="",
337: cuda_sources=cuda_src,
338: functions=["paged_attention_forward"],
339: verbose=False,
340: extra_cflags=["-O3"],
341: extra_cuda_cflags=["-O3", "-gencode=arch=compute_90,code=sm_90"],
342: )
343: return self._cuda_module
344:
345: def _get_cuda_source(self):
346: gs = self.group_size
347: ps = self.page_size
348: hd = self.head_dim
349: nkh = self.num_kv_heads
350:
351: return f"""
352: #include <torch/extension.h>
353: #include <cuda_bf16.h>
354: #include <cuda_runtime.h>
355: #include <device_launch_parameters.h>
356:
357: using bfloa
... (7719 more chars)
assistant
I see compilation errors. Let me fix the code - the template parameters and runtime variables are getting mixed up:
assistant
in 12370out 87cache_r 39419cache_w 0
assistant
I see the issue. `PAGE_SIZE` in the host code (line 511) is being confused with the template parameter. Let me rewrite the solution more cleanly:
assistant
in 3338out 2561cache_r 51515cache_w 0
tool
stdout (4214 chars) — TRUNCATED
Traceback (most recent call last):
File "/home/infatoshi/.local/lib/python3.12/site-packages/torch/utils/cpp_extension.py", line 2774, in _run_ninja_build
subprocess.run(
File "/usr/lib/python3.12/subprocess.py", line 571, in run
raise CalledProcessError(retcode, process.args,
subprocess.CalledProcessError: Command '['ninja', '-v']' returned non-zero exit status 1.
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "<string>", line 35, in <module>
File "/home/infatoshi/.local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1779, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/infatoshi/.local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1790, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/solution.py", line 262, in forward
cuda_mod = _get_cuda_module()
^^^^^^^^^^^^^^^^^^
File "/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/solution.py", line 207, in _get_cuda_module
_cached_module = load_inline(
^^^^^^^^^^^^
File "/home/infatoshi/.local/lib/python3.12/site-packages/torch/utils/cpp_extension.py", line 2174, in load_inline
return _jit_compile(
^^^^^^^^^^^^^
File "/home/infatoshi/.local/lib/pyt
... (2714 more chars)
assistant
in 206out 387cache_r 57339cache_w 0
assistant
The issue is that `paged_attention_forward` is in cuda_sources but pybind11 expects it in cpp_sources. Let me fix this:
assistant
in 4487out 2556cache_r 54651cache_w 0
tool
stdout (3140 chars) — TRUNCATED
Traceback (most recent call last):
File "/home/infatoshi/.local/lib/python3.12/site-packages/torch/utils/cpp_extension.py", line 2774, in _run_ninja_build
subprocess.run(
File "/usr/lib/python3.12/subprocess.py", line 571, in run
raise CalledProcessError(retcode, process.args,
subprocess.CalledProcessError: Command '['ninja', '-v']' returned non-zero exit status 1.
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "<string>", line 35, in <module>
File "/home/infatoshi/.local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1779, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/infatoshi/.local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1790, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/solution.py", line 262, in forward
cuda_mod = _get_cuda_module()
^^^^^^^^^^^^^^^^^^
File "/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/solution.py", line 207, in _get_cuda_module
_cached_module = load_inline(
^^^^^^^^^^^^
File "/home/infatoshi/.local/lib/python3.12/site-packages/torch/utils/cpp_extension.py", line 2174, in load_inline
return _jit_compile(
^^^^^^^^^^^^^
File "/home/infatoshi/.local/lib/pyt
... (1640 more chars)
assistant
in 211out 387cache_r 61819cache_w 0
assistant
I need to use cuda_sources for CUDA code. Let me try a different approach - putting the kernel in cuda_sources and the wrapper with forward declaration in cpp_sources:
assistant
in 1105out 2735cache_r 62715cache_w 0
tool
stdout (1930 chars) — TRUNCATED
Traceback (most recent call last):
File "<string>", line 35, in <module>
File "/home/infatoshi/.local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1779, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/infatoshi/.local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1790, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/solution.py", line 291, in forward
cuda_mod = _get_cuda_module()
^^^^^^^^^^^^^^^^^^
File "/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/solution.py", line 236, in _get_cuda_module
_cached_module = load_inline(
^^^^^^^^^^^^
File "/home/infatoshi/.local/lib/python3.12/site-packages/torch/utils/cpp_extension.py", line 2174, in load_inline
return _jit_compile(
^^^^^^^^^^^^^
File "/home/infatoshi/.local/lib/python3.12/site-packages/torch/utils/cpp_extension.py", line 2294, in _jit_compile
return _import_module_from_library(name, build_directory, is_python_module)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/infatoshi/.local/lib/python3.12/site-packages/torch/utils/cpp_extension.py", line 2812, in _import_module_from_library
module = importlib.util.module_from_spec(spec)
^^^^^^^^^^^^^^
... (430 more chars)
assistant
in 119out 387cache_r 66747cache_w 0
assistant
The issue is with `data_ptr<__nv_bfloat16>()` in host code. Let me fix this by using proper PyTorch tensor access:
assistant
in 626out 2742cache_r 67195cache_w 0
tool
stdout (633 chars)
Traceback (most recent call last):
File "<string>", line 39, in <module>
torch.AcceleratorError: CUDA error: no kernel image is available for execution on the device
Search for `cudaErrorNoKernelImageForDevice' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information.
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
ref_out shape: torch.Size([8, 32, 128])
sol_out shape: torch.Size([8, 32, 128])
assistant
in 450out 387cache_r 70331cache_w 0
tool
stdout (26 chars)
Edit applied successfully.
assistant
The kernel compiled but for wrong architecture. RTX PRO 6000 is SM120. Let me fix the architecture:
assistant
in 626out 142cache_r 70779cache_w 0
assistant
in 140out 412cache_r 71675cache_w 0