"""Paged attention decode kernel for RTX PRO 6000 (SM120 Blackwell).
Single-query decode with grouped-query attention (GQA).
One 32-thread warp-block per query head for maximum parallelism.
Each lane handles D/32 dimensions; all reduction via __shfl_xor_sync.
Online softmax processes KV cache in a single streaming pass.
"""
from __future__ import annotations
import math
import torch
import torch.nn as nn
from torch.utils.cpp_extension import load_inline
OP_TYPE = "attention"
SUPPORTED_PRECISIONS = ["bf16"]
HARDWARE_REQUIRED = ["RTX_PRO_6000", "H100", "B200"]
BATCH = 8
NUM_HEADS = 32
NUM_KV_HEADS = 8
HEAD_DIM = 128
SEQ_LEN = 1024
PAGE_SIZE = 16
# ---------------------------------------------------------------------------
# CUDA kernel + wrapper
# ---------------------------------------------------------------------------
_cpp_src = r"""
torch::Tensor launch_paged_attention_decode(
torch::Tensor query,
torch::Tensor kv_cache,
torch::Tensor block_table,
torch::Tensor seq_lens,
int B, int H, int Hkv, int D, int P, int max_blocks, float scale);
"""
_cuda_src = r"""
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cuda_bf16.h>
#define WARP_SIZE 32
#define MAX_STRIDE 8
extern "C" __global__ void paged_attention_decode_kernel(
const __nv_bfloat16* __restrict__ query,
const __nv_bfloat16* __restrict__ kv_cache,
const int* __restrict__ block_table,
const int* __restrict__ seq_lens,
__nv_bfloat16* __restrict__ output,
int B, int H, int Hkv, int D, int P, int max_blocks, float scale,
int G)
{
// One block per query head
int linear_bid = blockIdx.x;
int b = linear_bid / H;
int q_head = linear_bid % H;
if (b >= B) return;
int L = seq_lens[b];
if (L <= 0) return;
int kv_h = q_head / G; // which KV head this query head maps to
int num_pages = (L + P - 1) / P;
int lane_id = threadIdx.x;
int stride = D / WARP_SIZE;
unsigned mask = __activemask();
// Load this query head (fp32)
float q_val[MAX_STRIDE];
int q_base = b * H * D + q_head * D;
#pragma unroll
for (int s = 0; s < stride; s++) {
q_val[s] = __bfloat162float(query[q_base + lane_id + s * WARP_SIZE]);
}
// Online softmax state for this single query head
float m = -1e10f;
float l = 0.0f;
float acc[MAX_STRIDE];
#pragma unroll
for (int s = 0; s < stride; s++) {
acc[s] = 0.0f;
}
// Offsets into kv_cache for this KV head
const int* bt_base = block_table + b * max_blocks;
int page_stride = P * Hkv * 2 * D;
int token_stride = Hkv * 2 * D;
int k_off = kv_h * 2 * D;
int v_off = k_off + D;
// Main loop over pages and tokens
for (int pg = 0; pg < num_pages; pg++) {
int block_idx = bt_base[pg];
int page_base = block_idx * page_stride;
int tokens = (pg == num_pages - 1) ? (L - pg * P) : P;
for (int t = 0; t < tokens; t++) {
int base = page_base + t * token_stride;
// Load K and V values for this token (strided across lanes)
float k_vals[MAX_STRIDE];
float v_vals[MAX_STRIDE];
#pragma unroll
for (int s = 0; s < stride; s++) {
int idx = lane_id + s * WARP_SIZE;
k_vals[s] = __bfloat162float(kv_cache[base + k_off + idx]);
v_vals[s] = __bfloat162float(kv_cache[base + v_off + idx]);
}
// Dot product: Q @ K
float dot = 0.0f;
#pragma unroll
for (int s = 0; s < stride; s++) {
dot += q_val[s] * k_vals[s];
}
// Warp-level butterfly reduction
#pragma unroll
for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1) {
dot += __shfl_xor_sync(mask, dot, offset);
}
dot *= scale;
// Online softmax — lane 0 computes, then broadcast
float weight_val = 0.0f;
float rescale_val = 0.0f;
if (lane_id == 0) {
float m_new = fmaxf(m, dot);
float old_scale = __expf(m - m_new);
float exp_val = __expf(dot - m_new);
weight_val = exp_val;
rescale_val = old_scale;
l = old_scale * l + exp_val;
m = m_new;
}
weight_val = __shfl_sync(mask, weight_val, 0);
rescale_val = __shfl_sync(mask, rescale_val, 0);
// Update output accumulator
#pragma unroll
for (int s = 0; s < stride; s++) {
acc[s] = rescale_val * acc[s] + weight_val * v_vals[s];
}
}
}
// Normalise and write output
float l_final = __shfl_sync(mask, l, 0);
int out_base = b * H * D + q_head * D;
#pragma unroll
for (int s = 0; s < stride; s++) {
output[out_base + lane_id + s * WARP_SIZE] = __float2bfloat16(acc[s] / l_final);
}
}
// -----------------------------------------------------------------------
// C++ wrapper
// -----------------------------------------------------------------------
torch::Tensor launch_paged_attention_decode(
torch::Tensor query,
torch::Tensor kv_cache,
torch::Tensor block_table,
torch::Tensor seq_lens,
int B, int H, int Hkv, int D, int P, int max_blocks, float scale)
{
auto output = torch::empty({B, H, D}, query.options());
constexpr int THREADS = 32;
int G = H / Hkv;
int num_blocks = B * H;
paged_attention_decode_kernel<<<num_blocks, THREADS>>>(
reinterpret_cast<const __nv_bfloat16*>(query.data_ptr<torch::BFloat16>()),
reinterpret_cast<const __nv_bfloat16*>(kv_cache.data_ptr<torch::BFloat16>()),
block_table.data_ptr<int>(),
seq_lens.data_ptr<int>(),
reinterpret_cast<__nv_bfloat16*>(output.data_ptr<torch::BFloat16>()),
B, H, Hkv, D, P, max_blocks, scale, G);
return output;
}
"""
_module = None
def _get_module():
global _module
if _module is None:
_module = load_inline(
name="paged_attention_decode_ext",
cpp_sources=[_cpp_src],
cuda_sources=[_cuda_src],
functions=["launch_paged_attention_decode"],
extra_cuda_cflags=[
"-std=c++17",
"-O3",
"--use_fast_math",
"-gencode", "arch=compute_120,code=sm_120",
],
extra_cflags=["-O3"],
verbose=False,
)
return _module
# ---------------------------------------------------------------------------
# Model
# ---------------------------------------------------------------------------
class Model(nn.Module):
"""Single-query paged attention decode -- custom CUDA kernel."""
def __init__(
self,
batch: int,
num_heads: int,
num_kv_heads: int,
head_dim: int,
seq_len: int,
page_size: int,
):
super().__init__()
assert num_heads % num_kv_heads == 0
self.batch = batch
self.num_heads = num_heads
self.num_kv_heads = num_kv_heads
self.head_dim = head_dim
self.seq_len = seq_len
self.page_size = page_size
self.scale = 1.0 / math.sqrt(head_dim)
self.register_buffer("_dummy", torch.zeros(1, dtype=torch.bfloat16), persistent=False)
def forward(
self,
query: torch.Tensor,
kv_cache: torch.Tensor,
block_table: torch.Tensor,
seq_lens: torch.Tensor,
) -> torch.Tensor:
B, H, D = query.shape
max_blocks = block_table.shape[1]
mod = _get_module()
return mod.launch_paged_attention_decode(
query.contiguous(),
kv_cache.contiguous(),
block_table.contiguous(),
seq_lens.contiguous(),
B, H, self.num_kv_heads, D, self.page_size, max_blocks, self.scale,
)
# ---------------------------------------------------------------------------
# Input factories
# ---------------------------------------------------------------------------
def get_inputs():
B = BATCH
H = NUM_HEADS
Hkv = NUM_KV_HEADS
D = HEAD_DIM
L = SEQ_LEN
P = PAGE_SIZE
pages_per_seq = (L + P - 1) // P
total_pages = max(B * pages_per_seq + 8, 64)
query = torch.randn(B, H, D, dtype=torch.bfloat16) * 0.1
kv_cache = torch.randn(total_pages, P, Hkv, 2 * D, dtype=torch.bfloat16) * 0.1
perm = torch.randperm(total_pages)[: B * pages_per_seq].reshape(B, pages_per_seq).int()
block_table = perm.contiguous()
seq_lens = torch.full((B,), L, dtype=torch.int32)
return [query, kv_cache, block_table, seq_lens]
def get_init_inputs():
return [BATCH, NUM_HEADS, NUM_KV_HEADS, HEAD_DIM, SEQ_LEN, PAGE_SIZE]
W0427 09:25:00.605000 1131347 .venv/lib/python3.11/site-packages/torch/_dynamo/variables/tensor.py:1379] [0/0] Graph break from `Tensor.item()`, consider setting:
W0427 09:25:00.605000 1131347 .venv/lib/python3.11/site-packages/torch/_dynamo/variables/tensor.py:1379] [0/0] torch._dynamo.config.capture_scalar_outputs = True
W0427 09:25:00.605000 1131347 .venv/lib/python3.11/site-packages/torch/_dynamo/variables/tensor.py:1379] [0/0] or:
W0427 09:25:00.605000 1131347 .venv/lib/python3.11/site-packages/torch/_dynamo/variables/tensor.py:1379] [0/0] env TORCHDYNAMO_CAPTURE_SCALAR_OUTPUTS=1
W0427 09:25:00.605000 1131347 .venv/lib/python3.11/site-packages/torch/_dynamo/variables/tensor.py:1379] [0/0] to include these operations in the captured graph.
W0427 09:25:00.605000 1131347 .venv/lib/python3.11/site-packages/torch/_dynamo/variables/tensor.py:1379] [0/0]
W0427 09:25:00.605000 1131347 .venv/lib/python3.11/site-packages/torch/_dynamo/variables/tensor.py:1379] [0/0] Graph break: from user code at:
W0427 09:25:00.605000 1131347 .venv/lib/python3.11/site-packages/torch/_dynamo/variables/tensor.py:1379] [0/0] File "/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/reference.py", line 89, in forward
W0427 09:25:00.605000 1131347 .venv/lib/python3.11/site-packages/torch/_dynamo/variables/tensor.py:1379] [0/0] L = int(seq_lens[b].item())
W0427 09:25:00.605000 1131347 .venv/lib/python3.11/site-packages/torch/_dynamo/variables/tensor.py:1379] [0/0]
W0427 09:25:00.605000 1131347 .venv/lib/python3.11/site-packages/torch/_dynamo/variables/tensor.py:1379] [0/0]
[sota unavailable] ValueError: The dtype of q torch.bfloat16 does not match the q_data_type torch.float16 specified in plan function.
shape=0 variant=eager tflops=0.230 gbps=57.830 ms=0.582
shape=0 variant=compiled tflops=0.225 gbps=56.407 ms=0.597
shape=0 variant=solution tflops=0.191 gbps=47.896 ms=0.703
shape=0 solution_peak_fraction=0.0266
[sota unavailable] ValueError: The dtype of q torch.bfloat16 does not match the q_data_type torch.float16 specified in plan function.
shape=1 variant=eager tflops=0.348 gbps=87.279 ms=3.082
shape=1 variant=compiled tflops=0.347 gbps=86.902 ms=3.095
shape=1 variant=solution tflops=0.768 gbps=192.396 ms=1.398
shape=1 solution_peak_fraction=0.1069
[sota unavailable] ValueError: The dtype of q torch.bfloat16 does not match the q_data_type torch.float16 specified in plan function.
shape=2 variant=eager tflops=0.243 gbps=30.494 ms=2.205
shape=2 variant=compiled tflops=0.243 gbps=30.403 ms=2.212
shape=2 variant=solution tflops=0.195 gbps=24.409 ms=2.755
shape=2 solution_peak_fraction=0.0136
[sota unavailable] ValueError: The dtype of q torch.bfloat16 does not match the q_data_type torch.float16 specified in plan function.
shape=3 variant=eager tflops=0.303 gbps=75.901 ms=1.329
shape=3 variant=compiled tflops=0.301 gbps=75.331 ms=1.339
shape=3 variant=solution tflops=0.381 gbps=95.498 ms=1.056
shape=3 solution_peak_fraction=0.0531
[sota unavailable] ValueError: The dtype of q torch.bfloat16 does not match the q_data_type torch.float16 specified in plan function.
shape=4 variant=eager tflops=0.113 gbps=28.235 ms=0.581
shape=4 variant=compiled tflops=0.111 gbps=27.705 ms=0.593
shape=4 variant=solution tflops=0.049 gbps=12.180 ms=1.348
shape=4 solution_peak_fraction=0.0068
peak_fraction: 0.0268
RESULT: LOW
PASS
{
"problem": "03_paged_attention",
"harness": "opencode",
"model": "deepseek/deepseek-v4-pro",
"reasoning_effort": "",
"has_solution": true,
"correct": true,
"peak_fraction": 0.0268,
"elapsed_seconds": 1933,
"harness_exit_code": 0,
"session_complete": true,
"usage": {"input_tokens": 65358, "output_tokens": 13136, "cache_read_tokens": 1116544, "cache_creation_tokens": 0, "reasoning_tokens": 45848, "total_cost_usd": null}
}
{
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention"
}<path>/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention</path>
<type>directory</type>
<entries>
benchmark.py
check.py
problem.yaml
PROMPT.txt
reference.py
shapes.py
sota.py
(7 entries)
</entries>{
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/reference.py"
}<path>/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/reference.py</path>
<type>file</type>
<content>
1: """Naive PyTorch paged-attention decode reference (correctness oracle, not SOTA).
2:
3: Single-query decode: each batch element has a query of shape (num_heads, head_dim)
4: and attends over a KV cache of `seq_len[b]` tokens stored as fixed-size pages in
5: a global pool. Pages for batch element b are listed in `block_table[b]`.
6:
7: The reference performs the slow path:
8: 1. Gather pages -> contiguous (seq_len, num_kv_heads, head_dim) per batch element.
9: 2. Repeat KV heads for grouped-query (broadcast num_kv_heads -> num_heads).
10: 3. Manual softmax(QK^T / sqrt(d)) @ V in fp32, cast back to bf16.
11:
12: This avoids torch.nn.functional.scaled_dot_product_attention (which is on the
13: forbidden list) so the agent cannot dispatch through SDPA either.
14: """
15: import math
16:
17: import torch
18: import torch.nn as nn
19:
20: OP_TYPE = "attention"
21: SUPPORTED_PRECISIONS = ["bf16"]
22: HARDWARE_REQUIRED = ["RTX_PRO_6000", "H100", "B200"]
23:
24:
25: # --- Shape knobs (overridden by check.py / benchmark.py from shapes.py) ----
26: BATCH = 8
27: NUM_HEADS = 32
28: NUM_KV_HEADS = 8
29: HEAD_DIM = 128
30: SEQ_LEN = 1024
31: PAGE_SIZE = 16
32:
33:
34: class Model(nn.Module):
35: """Single-query paged attention decode.
36:
37: Forward inputs (all on device):
38: query: (batch, num_heads, head_dim) bf16
3
... (4426 more chars){
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/check.py"
}<path>/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/check.py</path>
<type>file</type>
<content>
1: """Correctness runner for paged-attention decode.
2:
3: Runs solution.Model vs reference.Model across all shapes in shapes.py, 3 seeds
4: each, with per-dtype atol/rtol. Also rejects forbidden ops by grep.
5: """
6: import re
7: import sys
8: from pathlib import Path
9:
10: import torch
11: import yaml
12:
13: REPO_ROOT = Path(__file__).resolve().parents[2]
14: sys.path.insert(0, str(REPO_ROOT))
15:
16: from src.eval.correctness import check_correctness # noqa: E402
17:
18:
19: def _apply_shape(reference, shape: dict) -> None:
20: reference.BATCH = shape["batch"]
21: reference.NUM_HEADS = shape["num_heads"]
22: reference.NUM_KV_HEADS = shape["num_kv_heads"]
23: reference.HEAD_DIM = shape["head_dim"]
24: reference.SEQ_LEN = shape["seq_len"]
25: reference.PAGE_SIZE = shape["page_size"]
26:
27:
28: def main():
29: try:
30: import reference
31: import shapes
32: import solution
33: except Exception as e:
34: print(f"FAIL: import error: {e}")
35: sys.exit(1)
36:
37: problem_yaml = Path("problem.yaml")
38: meta = yaml.safe_load(problem_yaml.read_text()) if problem_yaml.exists() else {}
39:
40: sol_src = Path("solution.py").read_text() if Path("solution.py").exists() else ""
41: for forbidden in meta.get("forbidden", []):
42: pat = re.escape(forbidden)
43:
... (2537 more chars){
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/benchmark.py"
}<path>/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/benchmark.py</path>
<type>file</type>
<content>
1: """Roofline benchmark for paged-attention decode.
2:
3: For each shape: times eager reference, compiled reference, SOTA (if available),
4: and the agent's solution. Reports achieved TFLOPS, GB/s, and peak_fraction.
5:
6: Decode is memory-bound, so peak_fraction is computed from achieved GB/s vs
7: the GPU's peak DRAM bandwidth.
8: """
9: import sys
10: from math import exp, log
11: from pathlib import Path
12:
13: import torch
14: import yaml
15:
16: REPO_ROOT = Path(__file__).resolve().parents[2]
17: sys.path.insert(0, str(REPO_ROOT))
18:
19: from src.eval.roofline import compute_gbps, compute_tflops, peak_fraction # noqa: E402
20: from src.eval.timing import time_fn # noqa: E402
21: from src.hardware import get as get_hw # noqa: E402
22:
23:
24: def _eval_formula(expr: str, vars: dict) -> float:
25: return float(eval(expr, {"__builtins__": {}}, vars))
26:
27:
28: def _apply_shape(reference, shape: dict) -> None:
29: reference.BATCH = shape["batch"]
30: reference.NUM_HEADS = shape["num_heads"]
31: reference.NUM_KV_HEADS = shape["num_kv_heads"]
32: reference.HEAD_DIM = shape["head_dim"]
33: reference.SEQ_LEN = shape["seq_len"]
34: reference.PAGE_SIZE = shape["page_size"]
35:
36:
37: def main():
38: import reference
39: import shapes
40: import solution
41:
42: meta = yaml.safe_load(Path("problem.y
... (3563 more chars){
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/solution.py"
}{
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/shapes.py"
}<path>/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/shapes.py</path>
<type>file</type>
<content>
1: """Shape sweep for paged attention decode.
2:
3: Mix targets:
4: - small batch / long context (Llama-3 8B-style decode)
5: - large batch / medium context (server batched decode)
6: - GQA wide ratio (Llama-3 70B: 64 heads / 8 kv-heads)
7: - non-power-of-2 seq_len (forces predicated tail handling)
8: - head_dim=64 small-head case
9: """
10:
11: SHAPES = [
12: # (B, H, Hkv, D, L, P)
13: {"batch": 8, "num_heads": 32, "num_kv_heads": 8, "head_dim": 128, "seq_len": 1024, "page_size": 16},
14: {"batch": 32, "num_heads": 32, "num_kv_heads": 8, "head_dim": 128, "seq_len": 2048, "page_size": 16},
15: {"batch": 4, "num_heads": 64, "num_kv_heads": 8, "head_dim": 128, "seq_len": 4096, "page_size": 16},
16: {"batch": 16, "num_heads": 32, "num_kv_heads": 8, "head_dim": 128, "seq_len": 1535, "page_size": 16}, # non-pow2
17: {"batch": 8, "num_heads": 16, "num_kv_heads": 4, "head_dim": 64, "seq_len": 2000, "page_size": 16}, # small-D, non-pow2
18: ]
(End of file - total 18 lines)
</content>{
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/problem.yaml"
}<path>/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/problem.yaml</path>
<type>file</type>
<content>
1: name: 03_paged_attention
2: display_name: "Paged Attention Decode"
3: precision: bf16
4: regime: memory # decode is bandwidth-bound (KV cache streamed once per token)
5:
6: # Effective FLOPs per call: QK^T + softmax*V across all batches/heads/seq.
7: # Sum-formula evaluated per-shape; the benchmark substitutes batch/num_heads/etc.
8: # 2 * B * H * L * D for QK^T plus 2 * B * H * L * D for AV => 4 * B * H * L * D
9: flops_formula: "4 * batch * num_heads * seq_len * head_dim"
10:
11: # Bytes moved (the real bottleneck): the KV cache must be streamed end-to-end.
12: # K and V each: batch * seq_len * num_kv_heads * head_dim * 2 bytes (bf16)
13: # Q: batch * num_heads * head_dim * 2 (negligible vs KV)
14: # Out: same as Q
15: # So total ~ 2 * (B * L * Hkv * D * 2) + small.
16: bytes_formula: "2 * batch * seq_len * num_kv_heads * head_dim * 2 + batch * num_heads * head_dim * 2 * 2"
17:
18: hardware: [RTX_PRO_6000]
19: peak_tflops_key: bf16 # informational; regime=memory uses bandwidth
20: peak_bandwidth_key: dram # 1.8 TB/s GDDR7 on RTX PRO 6000 Blackwell
21:
22: tolerance:
23: bfloat16: 0.02
24: float16: 0.02
25:
26: # Forbidden ops -- using any of these in solution.py fails correctness post-hoc.
27: # vllm.attention / flashinfer.batch_decode_with_paged_kv_cache: dispatching to
28: # the SOTA library is the trivial cheat. SD
... (875 more chars){
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/sota.py"
}<path>/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/sota.py</path>
<type>file</type>
<content>
1: """SOTA reference for paged-attention decode.
2:
3: Tries, in order:
4: 1. FlashInfer's BatchDecodeWithPagedKVCacheWrapper (preferred -- portable,
5: supports SM120, GQA, arbitrary head_dim).
6: 2. vLLM's paged_attention_v2 CUDA op (requires its KV-cache layout, more
7: finicky; we adapt the layout on the fly when possible).
8:
9: If neither is importable, is_available() returns False and the benchmark just
10: reports eager + compiled + solution.
11:
12: Agents are FORBIDDEN from importing these in solution.py (see problem.yaml).
13: This file is only for the benchmark's reference line.
14: """
15: from __future__ import annotations
16:
17: import torch
18:
19:
20: def _try_flashinfer(
21: query: torch.Tensor,
22: kv_cache: torch.Tensor,
23: block_table: torch.Tensor,
24: seq_lens: torch.Tensor,
25: num_kv_heads: int,
26: head_dim: int,
27: page_size: int,
28: ) -> torch.Tensor | None:
29: try:
30: import flashinfer # noqa: F401
31: from flashinfer.decode import BatchDecodeWithPagedKVCacheWrapper
32: except Exception:
33: return None
34:
35: B, H, D = query.shape
36: # FlashInfer expects K and V as separate (num_blocks, page_size, num_kv_heads, head_dim) tensors.
37: # Our reference packs [K|V] on the last dim -- split here.
38: k_cache = kv_cache[..., :D].contig
... (1803 more chars){
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/src/eval/correctness.py"
}<path>/home/infatoshi/cuda/KernelBench-Hard/src/eval/correctness.py</path>
<type>file</type>
<content>
1: """Per-dtype correctness tolerance.
2:
3: Stanford's KernelBench uses 1e-4 atol/rtol for fp32 and 1e-2 for fp16/bf16.
4: We extend to fp8 (0.1, permissive) and int (exact).
5: """
6: from __future__ import annotations
7:
8: import torch
9:
10: DEFAULT_TOLERANCE = {
11: torch.float32: {"atol": 1e-4, "rtol": 1e-4},
12: torch.float16: {"atol": 1e-2, "rtol": 1e-2},
13: torch.bfloat16: {"atol": 1e-2, "rtol": 1e-2},
14: torch.float8_e4m3fn: {"atol": 1e-1, "rtol": 1e-1},
15: torch.float8_e5m2: {"atol": 1e-1, "rtol": 1e-1},
16: torch.int8: {"atol": 0, "rtol": 0},
17: torch.int32: {"atol": 0, "rtol": 0},
18: torch.int64: {"atol": 0, "rtol": 0},
19: }
20:
21:
22: def _coerce_float(x):
23: """Accept int/float/str scientific-notation as float; pass through dicts."""
24: if isinstance(x, (int, float)):
25: return float(x)
26: if isinstance(x, str):
27: try:
28: return float(x)
29: except ValueError:
30: return x
31: return x
32:
33:
34: def tolerance_for_dtype(dtype: torch.dtype, override: dict | None = None) -> dict:
35: """Lookup atol/rtol for a given dtype, with optional per-problem override.
36:
37: The override dict is keyed by str(dtype). Values may be:
38: - a single number (or a string parseable as float, e.g. "5e-2") -> used
39: as both atol and rtol
4
... (2580 more chars){
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/src/hardware.py"
}{
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/PROMPT.txt"
}<path>/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/PROMPT.txt</path>
<type>file</type>
<content>
1: I need you to write a paged attention decode kernel for the RTX PRO 6000 (SM120 Blackwell, GDDR7, 1.8 TB/s). The reference implementation is in reference.py and your kernel goes in solution.py. You can make whatever mess you want in this directory — scratch files, intermediate kernels, build artifacts, profiling traces — but the final answer has to be in solution.py with the same Model, get_inputs, and get_init_inputs interface as reference.py.
2:
3: This is single-query decode: each batch element has a query of shape (num_heads, head_dim) and attends over a KV cache of seq_len[b] tokens stored as fixed-size pages in a global pool, with block_table[b] listing which pages belong to batch element b. The KV cache is laid out as (num_blocks, page_size, num_kv_heads, head_dim * 2) where the last dim packs [K | V] so a single gather pulls both. You return the bf16 output of shape (batch, num_heads, head_dim). Correctness tolerance is 0.02 abs/rel on bf16/fp16. The shapes you have to handle are batch=8 num_heads=32 num_kv_heads=8 head_dim=128 seq_len=1024 page_size=16 (Llama-3 8B-style decode), batch=32 num_heads=32 num_kv_heads=8 head_dim=128 seq_len=2048 page_size=16 (server-batched decode), batch=4 num_heads=64 num_kv_heads=8 head_dim=128 seq_len=4096 page_size=16 (Llama-3 70B GQA wide ratio with long context), batch=16 num_heads=32 num_kv_heads=8 head_dim=128
... (1253 more chars){
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/src/hardware"
}<path>/home/infatoshi/cuda/KernelBench-Hard/src/hardware</path>
<type>directory</type>
<entries>
__init__.py
__pycache__/
m4_max.py
rtx_pro_6000.py
(4 entries)
</entries>{
"command": "nvidia-smi --query-gpu=name,compute_cap,memory.total --format=csv,noheader 2>&1 | head -5",
"description": "Check GPU info"
}NVIDIA RTX PRO 6000 Blackwell Workstation Edition, 12.0, 97887 MiB
{
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/src/hardware/rtx_pro_6000.py"
}<path>/home/infatoshi/cuda/KernelBench-Hard/src/hardware/rtx_pro_6000.py</path>
<type>file</type>
<content>
1: """RTX PRO 6000 Blackwell Workstation — SM120, consumer-lineage Blackwell.
2:
3: Peak tensor-core throughputs are dense-matrix advertised peaks. Actual kernels
4: will see 60-85% of peak on well-tuned code.
5: """
6: from dataclasses import dataclass
7:
8:
9: @dataclass(frozen=True)
10: class HardwareTarget:
11: name: str
12: sm: str
13: vram_gb: int
14: peak_bandwidth_gb_s: float # DRAM
15: peak_tflops_dense: dict[str, float] # dtype -> TFLOPS
16:
17:
18: RTX_PRO_6000 = HardwareTarget(
19: name="RTX PRO 6000 Blackwell Workstation",
20: sm="sm_120a",
21: vram_gb=96,
22: peak_bandwidth_gb_s=1800.0,
23: peak_tflops_dense={
24: "fp4": 800.0,
25: "nvfp4": 800.0,
26: "mxfp4": 800.0,
27: "fp6": 800.0,
28: "fp8": 400.0,
29: "bf16": 200.0,
30: "fp16": 200.0,
31: "tf32": 100.0,
32: "fp32": 12.0, # non-tensor-core SIMT fp32
33: "int8": 400.0,
34: "int4": 800.0,
35: },
36: )
(End of file - total 36 lines)
</content>{
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/src/hardware/__init__.py"
}<path>/home/infatoshi/cuda/KernelBench-Hard/src/hardware/__init__.py</path>
<type>file</type>
<content>
1: """Hardware peak-throughput lookup tables."""
2: from src.hardware.m4_max import M4_MAX
3: from src.hardware.rtx_pro_6000 import RTX_PRO_6000
4:
5: TARGETS = {
6: "RTX_PRO_6000": RTX_PRO_6000,
7: "M4_MAX": M4_MAX,
8: }
9:
10:
11: def get(name: str):
12: if name not in TARGETS:
13: raise ValueError(f"Unknown hardware {name!r}; available: {list(TARGETS)}")
14: return TARGETS[name]
(End of file - total 14 lines)
</content>{
"description": "Check torch load_inline API",
"command": "python3 -c \"import torch.utils.cpp_extension; help(torch.utils.cpp_extension.load_inline)\" 2>&1 | head -80"
}Help on function load_inline in module torch.utils.cpp_extension:
load_inline(name, cpp_sources, cuda_sources=None, sycl_sources=None, functions=None, extra_cflags=None, extra_cuda_cflags=None, extra_sycl_cflags=None, extra_ldflags=None, extra_include_paths=None, build_directory=None, verbose=False, with_cuda=None, with_sycl=None, is_python_module=True, with_pytorch_error_handling=True, keep_intermediates=True, use_pch=False, no_implicit_headers=False)
Load a PyTorch C++ extension just-in-time (JIT) from string sources.
This function behaves exactly like :func:`load`, but takes its sources as
strings rather than filenames. These strings are stored to files in the
build directory, after which the behavior of :func:`load_inline` is
identical to :func:`load`.
See `the
tests <https://github.com/pytorch/pytorch/blob/master/test/test_cpp_extensions_jit.py>`_
for good examples of using this function.
Sources may omit two required parts of a typical non-inline C++ extension:
the necessary header includes, as well as the (pybind11) binding code. More
precisely, strings passed to ``cpp_sources`` are first concatenated into a
single ``.cpp`` file. This file is then prepended with ``#include
<torch/extension.h>``
Furthermore, if the ``functions`` argument is supplied, bindings will be
automatically generated for each function specified. ``functions`` can
either be a list of function names, or a dictionary mapping from f
... (3323 more chars){
"description": "Search load_inline usage patterns",
"prompt": "Search the directory /home/infatoshi/cuda/KernelBench-Hard for any files that use `load_inline` or `cpp_extension.load_inline` to understand the calling convention - specifically how grid, block, and shared_mem are passed. Also look for how the kernel functions returned by load_inline are called. Return any relevant code snippets.",
"subagent_type": "explore"
}task_id: ses_23088b86cffeX6aN6lxvfBg8m2 (for resuming to continue this task if needed)
<task_result>
Now I have a complete picture. Let me provide a comprehensive answer with all findings.
---
## Results: `load_inline` / `cpp_extension.load_inline` Calling Convention in KernelBench-Hard
### Key Finding: No Existing Usage in the Repo
There are **zero actual calls to `load_inline`** anywhere in the KernelBench-Hard repository. The only references are:
1. **PROMPT.txt files** (7 of them) -- suggesting agents use `load_inline` as one possible approach:
- `/home/infatoshi/cuda/KernelBench-Hard/problems/01_fp8_gemm/PROMPT.txt` (line 5)
- `/home/infatoshi/cuda/KernelBench-Hard/problems/02_kda_cutlass/PROMPT.txt` (line 5)
- `/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/PROMPT.txt` (line 5)
- `/home/infatoshi/cuda/KernelBench-Hard/problems/04_kahan_softmax/PROMPT.txt` (line 5)
- `/home/infatoshi/cuda/KernelBench-Hard/problems/05_topk_bitonic/PROMPT.txt` (line 5)
- `/home/infatoshi/cuda/KernelBench-Hard/problems/06_sonic_moe_swiglu/PROMPT.txt` (line 5)
- `/home/infatoshi/cuda/KernelBench-Hard/problems/07_w4a16_gemm/PROMPT.txt` (line 5)
2. **check.py files** (7 of them) -- using it only as a regex detection pattern to label the framework:
```python
("cuda_raw", r"torch\.utils\.cpp_extension\.load_inline|__global__\s+void")
```
All 7 check.py files share this identical pattern (the file paths are listed above, all under `/home
... (6925 more chars){
"todos": [
{
"content": "Write initial solution.py with CUDA C++ kernel + C++ wrapper via load_inline",
"status": "in_progress",
"priority": "high"
},
{
"content": "Run check.py for correctness verification",
"status": "pending",
"priority": "high"
},
{
"content": "Run benchmark.py for performance measurement",
"status": "pending",
"priority": "high"
},
{
"content": "Profile with ncu and iterate if performance is below target",
"status": "pending",
"priority": "medium"
}
]
}content: 'Write initial solution.py with CUDA C++ kernel + C++ wrapper via load_inline'
status: 'in_progress'
priority: 'high'
content: 'Run check.py for correctness verification'
status: 'pending'
priority: 'high'
content: 'Run benchmark.py for performance measurement'
status: 'pending'
priority: 'high'
content: 'Profile with ncu and iterate if performance is below target'
status: 'pending'
priority: 'medium'{
"command": "ls -la /home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/",
"description": "Check current directory contents"
}total 44
drwxrwxr-x 2 infatoshi infatoshi 4096 Apr 27 08:52 .
drwxrwxr-x 10 infatoshi infatoshi 4096 Apr 24 00:30 ..
-rw-rw-r-- 1 infatoshi infatoshi 4353 Apr 26 03:11 benchmark.py
-rw-rw-r-- 1 infatoshi infatoshi 3441 Apr 25 22:20 check.py
-rw-rw-r-- 1 infatoshi infatoshi 2030 Apr 25 22:19 problem.yaml
-rw-rw-r-- 1 infatoshi infatoshi 2585 Apr 27 03:21 PROMPT.txt
-rw-rw-r-- 1 infatoshi infatoshi 5151 Apr 25 22:19 reference.py
-rw-rw-r-- 1 infatoshi infatoshi 933 Apr 25 22:19 shapes.py
-rw-rw-r-- 1 infatoshi infatoshi 2819 Apr 25 22:24 sota.py
{
"command": "<440 chars \u2014 see below>",
"description": "Verify CUDA environment"
}python3 -c "
import torch
# Verify bf16 support and compute capability
print('CUDA available:', torch.cuda.is_available())
print('Device:', torch.cuda.get_device_name(0))
print('Compute cap:', torch.cuda.get_device_capability(0))
print('bf16 supported:', torch.cuda.is_bf16_supported())
# Check nvcc
import subprocess
r = subprocess.run(['nvcc', '--version'], capture_output=True, text=True)
print('nvcc version:', r.stdout.strip()[:200])
"CUDA available: True
Device: NVIDIA RTX PRO 6000 Blackwell Workstation Edition
Compute cap: (12, 0)
bf16 supported: True
nvcc version: nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2026 NVIDIA Corporation
Built on Mon_Mar_02_09:52:23_PM_PST_2026
Cuda compilation tools, release 13.2, V13.2.51
Build cuda_13.2.r13.2/compiler.
--- /dev/null
+++ b//home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/solution.py
@@ -0,0 +1,320 @@
+"""Paged attention decode kernel for RTX PRO 6000 (SM120 Blackwell).
+
+Single-query decode with grouped-query attention (GQA).
+Uses online softmax to process KV cache in a single pass without gathering pages.
+One block per (batch, kv_head) pair; each thread handles one head_dim element.
+"""
+from __future__ import annotations
+
+import math
+
+import torch
+import torch.nn as nn
+from torch.utils.cpp_extension import load_inline
+
+OP_TYPE = "attention"
+SUPPORTED_PRECISIONS = ["bf16"]
+HARDWARE_REQUIRED = ["RTX_PRO_6000", "H100", "B200"]
+
+# Language-visible shape knobs (overridden by check.py/benchmark.py)
+BATCH = 8
+NUM_HEADS = 32
+NUM_KV_HEADS = 8
+HEAD_DIM = 128
+SEQ_LEN = 1024
+PAGE_SIZE = 16
+
+# ---------------------------------------------------------------------------
+# CUDA kernel + wrapper (compiled once per process)
+# ---------------------------------------------------------------------------
+
+_cpp_src = r"""
+torch::Tensor launch_paged_attention_decode(
+ torch::Tensor query,
+ torch::Tensor kv_cache,
+ torch::Tensor block_table,
+ torch::Tensor seq_lens,
+ int B, int H, int Hkv, int D, int P, int max_blocks, float scale);
+"""
+
+_cuda_src = r"""
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include <cfloat>
+
+#define WARP_SIZE 32
+#define MAX_G 16
+
+extern "C" __global__ void paged_attention_decode_kernel(
+ const __nv_bfloat16* __restrict__ query,
+ const __nv_bfloat16* __restrict__ kv_cache,
+ const int* __restrict__ block_table,
+ const int* __restrict__ seq_lens,
+ __nv_bfloat16* __restrict__ output,
+ int B, int H, int Hkv, int D, int P, int max_blocks, float scale)
+{
+ int linear_bid = blockIdx.x;
+ int b = linear_bid / Hkv;
+ int kv_h = linear_bid % Hkv;
+ if (b >= B) return;
+
+ int L = seq_lens[b];
+ if (L <= 0) return;
+
+ int num_pages = (L + P - 1) / P;
+ int G = H / Hkv;
+ int tid = threadIdx.x;
+ int num_threads = blockDim.x;
+ int warp_id = tid / WARP_SIZE;
+ int lane_id = tid % WARP_SIZE;
+ int num_warps = num_threads / WARP_SIZE;
+
+ extern __shared__ float smem[];
+ float* scratch = smem; // num_warps * G
+ float* weight = smem + num_warps * G; // G
+ float* rescale = smem + num_warps * G + G; // G
+
+ // Load queries for all heads in this KV group into fp32 registers
+ float q_reg[MAX_G];
+ #pragma unroll
+ for (int g = 0; g < G; g++) {
+ int q_h = kv_h * G + g;
+ q_reg[g] = __bfloat162float(query[b * H * D + q_h * D + tid]);
+ }
+
+ // Online softmax state — maintained by lane 0 only
+ float m[MAX_G], l[MAX_G], acc[MAX_G];
+ #pragma unroll
+ for (int g = 0; g < G; g++) {
+ m[g] = -1e10f; // effectively -inf for softmax
+ l[g] = 0.0f;
+ acc[g] = 0.0f;
+ }
+
+ // Pre-compute offsets
+ const int* bt_base = block_table + b * max_blocks;
+ int page_stride = P * Hkv * 2 * D; // bytes per page in kv_cache
+ int token_stride = Hkv * 2 * D; // bytes per token (stride across kv_heads)
+ int k_off = kv_h * 2 * D; // offset to K for this kv_head
+ int v_off = k_off + D; // offset to V = K + D
+
+ for (int pg = 0; pg < num_pages; pg++) {
+ int block_idx = bt_base[pg];
+ int page_base = block_idx * page_stride;
+ int tokens = (pg == num_pages - 1) ? (L - pg * P) : P;
+
+ for (int t = 0; t < tokens; t++) {
+ int base = page_base + t * token_stride;
+
+ // Coalesced loads: adjacent threads read adjacent bf16 values
+ float k_val = __bfloat162float(kv_cache[base + k_off + tid]);
+ float v_val = __bfloat162float(kv_cache[base + v_off + tid]);
+
+ // ---- Part 1: partial dot products ----
+ float dot[MAX_G];
+ #pragma unroll
+ for (int g = 0; g < G; g++) {
+ dot[g] = q_reg[g] * k_val;
+ }
+
+ // Warp-level butterfly reduction
+ #pragma unroll
+ for (int g = 0; g < G; g++) {
+ #pragma unroll
+ for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1) {
+ dot[g] += __shfl_xor_sync(0xffffffff, dot[g], offset);
+ }
+ }
+
+ // Inter-warp reduction: each warp writes its partial to shared memory
+ if (lane_id == 0) {
+ #pragma unroll
+ for (int g = 0; g < G; g++) {
+ scratch[g * num_warps + warp_id] = dot[g];
+ }
+ }
+ __syncthreads();
+
+ // Warp 0 collates inter-warp partials into full dot products
+ if (warp_id == 0) {
+ #pragma unroll
+ for (int g = 0; g < G; g++) {
+ float sum = 0.0f;
+ #pragma unroll
+ for (int w = 0; w < num_warps; w++) {
+ sum += scratch[g * num_warps + w];
+ }
+ dot[g] = sum * scale;
+ }
+ }
+
+ // ---- Part 2: online softmax (lane 0 of warp 0) ----
+ if (warp_id == 0 && lane_id == 0) {
+ #pragma unroll
+ for (int g = 0; g < G; g++) {
+ float s = dot[g];
+ float m_new = fmaxf(m[g], s);
+ float old_scale = expf(m[g] - m_new);
+ float exp_val = expf(s - m_new);
+ weight[g] = exp_val;
+ rescale[g] = old_scale;
+ l[g] = old_scale * l[g] + exp_val;
+ m[g] = m_new;
+ }
+ }
+ __syncthreads();
+
+ // ---- Part 3: all threads update
... (5048 more chars){
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/solution.py",
"content": "<10612 chars \u2014 see diff>"
}Wrote file successfully.{
"description": "Run check.py correctness test",
"command": "cd /home/infatoshi/cuda/KernelBench-Hard && /home/infatoshi/.local/bin/uv run python problems/03_paged_attention/check.py 2>&1",
"timeout": 300000
}Traceback (most recent call last):
File "/home/infatoshi/cuda/KernelBench-Hard/.venv/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2774, in _run_ninja_build
subprocess.run(
File "/home/infatoshi/.local/share/uv/python/cpython-3.11.15-linux-x86_64-gnu/lib/python3.11/subprocess.py", line 571, in run
raise CalledProcessError(retcode, process.args,
subprocess.CalledProcessError: Command '['ninja', '-v']' returned non-zero exit status 2.
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/check.py", line 109, in <module>
main()
File "/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/check.py", line 72, in main
sol_out = sol_model(*inputs)
^^^^^^^^^^^^^^^^^^
File "/home/infatoshi/cuda/KernelBench-Hard/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1779, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/infatoshi/cuda/KernelBench-Hard/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1790, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/solution.py", line 283, in forward
mod = _get_module()
^^^^^^^^^^^^^
File "/home/infatoshi/cuda/Kernel
... (3641 more chars)--- a//home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/solution.py
+++ b//home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/solution.py
@@ -208,8 +208,7 @@
int G = H / Hkv;
size_t smem_bytes = (num_warps + 2) * G * sizeof(float);
- auto stream = at::cuda::getCurrentCUDAStream();
- paged_attention_decode_kernel<<<num_blocks, num_threads, smem_bytes, stream>>>(
+ paged_attention_decode_kernel<<<num_blocks, num_threads, smem_bytes>>>(
reinterpret_cast<const __nv_bfloat16*>(query.data_ptr<torch::BFloat16>()),
reinterpret_cast<const __nv_bfloat16*>(kv_cache.data_ptr<torch::BFloat16>()),
block_table.data_ptr<int>(),
{
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/solution.py",
"oldString": "<498 chars \u2014 see below>",
"newString": "<438 chars \u2014 see below>"
} auto stream = at::cuda::getCurrentCUDAStream();
paged_attention_decode_kernel<<<num_blocks, num_threads, smem_bytes, stream>>>(
reinterpret_cast<const __nv_bfloat16*>(query.data_ptr<torch::BFloat16>()),
reinterpret_cast<const __nv_bfloat16*>(kv_cache.data_ptr<torch::BFloat16>()),
block_table.data_ptr<int>(),
seq_lens.data_ptr<int>(),
reinterpret_cast<__nv_bfloat16*>(output.data_ptr<torch::BFloat16>()),
B, H, Hkv, D, P, max_blocks, scale); paged_attention_decode_kernel<<<num_blocks, num_threads, smem_bytes>>>(
reinterpret_cast<const __nv_bfloat16*>(query.data_ptr<torch::BFloat16>()),
reinterpret_cast<const __nv_bfloat16*>(kv_cache.data_ptr<torch::BFloat16>()),
block_table.data_ptr<int>(),
seq_lens.data_ptr<int>(),
reinterpret_cast<__nv_bfloat16*>(output.data_ptr<torch::BFloat16>()),
B, H, Hkv, D, P, max_blocks, scale);Edit applied successfully.{
"description": "Retry check.py after stream fix",
"command": "cd /home/infatoshi/cuda/KernelBench-Hard && /home/infatoshi/.local/bin/uv run python problems/03_paged_attention/check.py 2>&1",
"timeout": 300000
}PASS
{
"todos": [
{
"content": "Write initial solution.py with CUDA C++ kernel + C++ wrapper via load_inline",
"priority": "high",
"status": "completed"
},
{
"content": "Run check.py for correctness verification",
"priority": "high",
"status": "completed"
},
{
"content": "Run benchmark.py for performance measurement",
"priority": "high",
"status": "in_progress"
},
{
"content": "Profile with ncu and iterate if performance is below target",
"priority": "medium",
"status": "pending"
}
]
}content: 'Write initial solution.py with CUDA C++ kernel + C++ wrapper via load_inline'
status: 'completed'
priority: 'high'
content: 'Run check.py for correctness verification'
status: 'completed'
priority: 'high'
content: 'Run benchmark.py for performance measurement'
status: 'in_progress'
priority: 'high'
content: 'Profile with ncu and iterate if performance is below target'
status: 'pending'
priority: 'medium'{
"description": "Run benchmark.py for performance",
"command": "cd /home/infatoshi/cuda/KernelBench-Hard && /home/infatoshi/.local/bin/uv run python problems/03_paged_attention/benchmark.py 2>&1",
"timeout": 600000
}Traceback (most recent call last):
File "/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/benchmark.py", line 131, in <module>
main()
File "/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/benchmark.py", line 42, in main
meta = yaml.safe_load(Path("problem.yaml").read_text())
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/infatoshi/.local/share/uv/python/cpython-3.11.15-linux-x86_64-gnu/lib/python3.11/pathlib.py", line 1058, in read_text
with self.open(mode='r', encoding=encoding, errors=errors) as f:
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/infatoshi/.local/share/uv/python/cpython-3.11.15-linux-x86_64-gnu/lib/python3.11/pathlib.py", line 1044, in open
return io.open(self, mode, buffering, encoding, errors, newline)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
FileNotFoundError: [Errno 2] No such file or directory: 'problem.yaml'
{
"description": "Run benchmark from problem directory",
"command": "cd /home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention && /home/infatoshi/.local/bin/uv run python benchmark.py 2>&1",
"timeout": 600000
}W0427 09:12:19.341000 1130296 .venv/lib/python3.11/site-packages/torch/_dynamo/variables/tensor.py:1379] [0/0] Graph break from `Tensor.item()`, consider setting:
W0427 09:12:19.341000 1130296 .venv/lib/python3.11/site-packages/torch/_dynamo/variables/tensor.py:1379] [0/0] torch._dynamo.config.capture_scalar_outputs = True
W0427 09:12:19.341000 1130296 .venv/lib/python3.11/site-packages/torch/_dynamo/variables/tensor.py:1379] [0/0] or:
W0427 09:12:19.341000 1130296 .venv/lib/python3.11/site-packages/torch/_dynamo/variables/tensor.py:1379] [0/0] env TORCHDYNAMO_CAPTURE_SCALAR_OUTPUTS=1
W0427 09:12:19.341000 1130296 .venv/lib/python3.11/site-packages/torch/_dynamo/variables/tensor.py:1379] [0/0] to include these operations in the captured graph.
W0427 09:12:19.341000 1130296 .venv/lib/python3.11/site-packages/torch/_dynamo/variables/tensor.py:1379] [0/0]
W0427 09:12:19.341000 1130296 .venv/lib/python3.11/site-packages/torch/_dynamo/variables/tensor.py:1379] [0/0] Graph break: from user code at:
W0427 09:12:19.341000 1130296 .venv/lib/python3.11/site-packages/torch/_dynamo/variables/tensor.py:1379] [0/0] File "/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/reference.py", line 89, in forward
W0427 09:12:19.341000 1130296 .venv/lib/python3.11/site-packages/torch/_dynamo/variables/tensor.py:1379] [0/0] L = int(seq_lens[b].item())
W0427 09:12:19.341000 1130296 .venv/lib/python3.11/site-packages/torch/_dynamo/variables/tensor.py:1379] [0/0]
W0427 09
... (1876 more chars){
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/solution.py"
}<path>/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/solution.py</path>
<type>file</type>
<content>
1: """Paged attention decode kernel for RTX PRO 6000 (SM120 Blackwell).
2:
3: Single-query decode with grouped-query attention (GQA).
4: Uses online softmax to process KV cache in a single pass without gathering pages.
5: One block per (batch, kv_head) pair; each thread handles one head_dim element.
6: """
7: from __future__ import annotations
8:
9: import math
10:
11: import torch
12: import torch.nn as nn
13: from torch.utils.cpp_extension import load_inline
14:
15: OP_TYPE = "attention"
16: SUPPORTED_PRECISIONS = ["bf16"]
17: HARDWARE_REQUIRED = ["RTX_PRO_6000", "H100", "B200"]
18:
19: # Language-visible shape knobs (overridden by check.py/benchmark.py)
20: BATCH = 8
21: NUM_HEADS = 32
22: NUM_KV_HEADS = 8
23: HEAD_DIM = 128
24: SEQ_LEN = 1024
25: PAGE_SIZE = 16
26:
27: # ---------------------------------------------------------------------------
28: # CUDA kernel + wrapper (compiled once per process)
29: # ---------------------------------------------------------------------------
30:
31: _cpp_src = r"""
32: torch::Tensor launch_paged_attention_decode(
33: torch::Tensor query,
34: torch::Tensor kv_cache,
35: torch::Tensor block_table,
36: torch::Tensor seq_lens,
37: int B, int H, int Hkv, int D, int P, int max_blocks, float scale);
38: """
39:
40: _cuda_src = r"""
41: #include <cuda_runtime.h>
42: #include <cuda_fp16.h>
43: #in
... (10701 more chars)--- a//home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/solution.py
+++ b//home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/solution.py
@@ -1,8 +1,9 @@
"""Paged attention decode kernel for RTX PRO 6000 (SM120 Blackwell).
Single-query decode with grouped-query attention (GQA).
-Uses online softmax to process KV cache in a single pass without gathering pages.
-One block per (batch, kv_head) pair; each thread handles one head_dim element.
+Warp-only kernel: one warp per (batch, kv_head), no shared memory, no barriers.
+Each lane handles D/32 dimensions; all reduction via __shfl_xor_sync.
+Online softmax processes KV cache in a single streaming pass.
"""
from __future__ import annotations
@@ -16,7 +17,6 @@
SUPPORTED_PRECISIONS = ["bf16"]
HARDWARE_REQUIRED = ["RTX_PRO_6000", "H100", "B200"]
-# Language-visible shape knobs (overridden by check.py/benchmark.py)
BATCH = 8
NUM_HEADS = 32
NUM_KV_HEADS = 8
@@ -41,10 +41,10 @@
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cuda_bf16.h>
-#include <cfloat>
#define WARP_SIZE 32
#define MAX_G 16
+#define MAX_STRIDE 8 // max D/32 = 128/32 = 4, plus safety
extern "C" __global__ void paged_attention_decode_kernel(
const __nv_bfloat16* __restrict__ query,
@@ -64,41 +64,56 @@
int num_pages = (L + P - 1) / P;
int G = H / Hkv;
- int tid = threadIdx.x;
- int num_threads = blockDim.x;
- int warp_id = tid / WARP_SIZE;
- int lane_id = tid % WARP_SIZE;
- int num_warps = num_threads / WARP_SIZE;
-
- extern __shared__ float smem[];
- float* scratch = smem; // num_warps * G
- float* weight = smem + num_warps * G; // G
- float* rescale = smem + num_warps * G + G; // G
-
- // Load queries for all heads in this KV group into fp32 registers
- float q_reg[MAX_G];
+ int lane_id = threadIdx.x; // 0..31
+ int stride = D / WARP_SIZE; // elements per lane per head (4 for D=128, 2 for D=64)
+ unsigned mask = __activemask();
+
+ // ------------------------------------------------------------------
+ // Load queries for all G heads into registers (fp32).
+ // Each lane loads 'stride' elements per query head, stride-separated.
+ // ------------------------------------------------------------------
+ float q_reg[MAX_G][MAX_STRIDE];
#pragma unroll
for (int g = 0; g < G; g++) {
int q_h = kv_h * G + g;
- q_reg[g] = __bfloat162float(query[b * H * D + q_h * D + tid]);
- }
-
- // Online softmax state — maintained by lane 0 only
- float m[MAX_G], l[MAX_G], acc[MAX_G];
+ int q_base = b * H * D + q_h * D;
+ #pragma unroll
+ for (int s = 0; s < stride; s++) {
+ int idx = q_base + lane_id + s * WARP_SIZE;
+ q_reg[g][s] = __bfloat162float(query[idx]);
+ }
+ }
+
+ // Online softmax state (maintained by lane 0)
+ float m[MAX_G], l[MAX_G];
#pragma unroll
for (int g = 0; g < G; g++) {
- m[g] = -1e10f; // effectively -inf for softmax
- l[g] = 0.0f;
- acc[g] = 0.0f;
- }
-
+ m[g] = -1e10f;
+ l[g] = 0.0f;
+ }
+ // Output accumulator: one float per (g, stride_s)
+ float acc[MAX_G][MAX_STRIDE];
+ #pragma unroll
+ for (int g = 0; g < G; g++) {
+ #pragma unroll
+ for (int s = 0; s < stride; s++) {
+ acc[g][s] = 0.0f;
+ }
+ }
+
+ // ------------------------------------------------------------------
// Pre-compute offsets
+ // ------------------------------------------------------------------
const int* bt_base = block_table + b * max_blocks;
- int page_stride = P * Hkv * 2 * D; // bytes per page in kv_cache
- int token_stride = Hkv * 2 * D; // bytes per token (stride across kv_heads)
- int k_off = kv_h * 2 * D; // offset to K for this kv_head
- int v_off = k_off + D; // offset to V = K + D
-
+ int page_stride = P * Hkv * 2 * D; // bf16 elements per page
+ int token_stride = Hkv * 2 * D; // bf16 elements per token
+ int head_off = kv_h * 2 * D; // offset to this KV head's K
+ int k_base = head_off;
+ int v_base = head_off + D;
+
+ // ------------------------------------------------------------------
+ // Main loop: iterate over pages, then tokens
+ // ------------------------------------------------------------------
for (int pg = 0; pg < num_pages; pg++) {
int block_idx = bt_base[pg];
int page_base = block_idx * page_stride;
@@ -107,91 +122,92 @@
for (int t = 0; t < tokens; t++) {
int base = page_base + t * token_stride;
- // Coalesced loads: adjacent threads read adjacent bf16 values
- float k_val = __bfloat162float(kv_cache[base + k_off + tid]);
- float v_val = __bfloat162float(kv_cache[base + v_off + tid]);
-
- // ---- Part 1: partial dot products ----
+ // ---- load K and V for this token, strided across lanes ----
+ float k_vals[MAX_STRIDE];
+ float v_vals[MAX_STRIDE];
+ #pragma unroll
+ for (int s = 0; s < stride; s++) {
+ int idx = base + k_base + lane_id + s * WARP_SIZE;
+ k_vals[s] = __bfloat162float(kv_cache[idx]);
+ v_vals[s] = __bfloat162float(kv_cache[base + v_base + lane_id + s * WARP_SIZE]);
+ }
+
+ // ---- compute dot product Q @ K for each query head ----
float dot[MAX_G];
#pragma unroll
for (int g = 0; g < G; g++) {
- dot[g] = q_reg[g] * k_val;
- }
-
- // Warp-level butterfly reduction
+ float s = 0.0f;
+ #pragma unroll
+ for (int j = 0; j < stride; j++) {
+ s = fmaf(q_reg[g][j], k_vals[j], s);
+
... (5805 more chars){
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/solution.py",
"content": "<11232 chars \u2014 see diff>"
}Wrote file successfully.--- a//home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/solution.py
+++ b//home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/solution.py
@@ -139,7 +139,7 @@
float s = 0.0f;
#pragma unroll
for (int j = 0; j < stride; j++) {
- s = fmaf(q_reg[g][j], k_vals[j], s);
+ s += q_reg[g][j] * k_vals[j];
}
dot[g] = s;
}
@@ -156,6 +156,11 @@
// ---- online softmax (lane 0 computes; broadcast via shuffle) ----
float weight[MAX_G], rescale[MAX_G];
+ #pragma unroll
+ for (int g = 0; g < G; g++) {
+ weight[g] = 0.0f;
+ rescale[g] = 0.0f;
+ }
if (lane_id == 0) {
#pragma unroll
for (int g = 0; g < G; g++) {
@@ -182,7 +187,7 @@
for (int g = 0; g < G; g++) {
#pragma unroll
for (int s = 0; s < stride; s++) {
- acc[g][s] = fmaf(rescale[g], acc[g][s], weight[g] * v_vals[s]);
+ acc[g][s] = rescale[g] * acc[g][s] + weight[g] * v_vals[s];
}
}
}
{
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/solution.py",
"oldString": "<2022 chars \u2014 see below>",
"newString": "<2164 chars \u2014 see below>"
} // ---- compute dot product Q @ K for each query head ----
float dot[MAX_G];
#pragma unroll
for (int g = 0; g < G; g++) {
float s = 0.0f;
#pragma unroll
for (int j = 0; j < stride; j++) {
s = fmaf(q_reg[g][j], k_vals[j], s);
}
dot[g] = s;
}
// Warp-level butterfly sum reduction
#pragma unroll
for (int g = 0; g < G; g++) {
#pragma unroll
for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1) {
dot[g] += __shfl_xor_sync(mask, dot[g], offset);
}
dot[g] *= scale;
}
// ---- online softmax (lane 0 computes; broadcast via shuffle) ----
float weight[MAX_G], rescale[MAX_G];
if (lane_id == 0) {
#pragma unroll
for (int g = 0; g < G; g++) {
float s = dot[g];
float m_new = fmaxf(m[g], s);
float old_scale = __expf(m[g] - m_new);
float exp_val = __expf(s - m_new);
weight[g] = exp_val;
rescale[g] = old_scale;
l[g] = old_scale * l[g] + exp_val;
m[g] = m_new;
}
}
// Broadcast weight and rescale from lane 0 to all lanes
#pragma unroll
for (int g = 0; g < G; g++) {
weight[g] = __shfl_sync(mask, weight[g], 0);
rescale[g] = __shfl_sync(mask, rescale[g], 0);
}
// ---- all lanes update output accumulator ----
#pragma unroll
for (int g = 0; g < G; g++) {
#pragma unroll
for (int s = 0; s < stride; s++) {
acc[g][s] = fmaf(rescale[g], acc[g][s], weight[g] * v_vals[s]);
}
} // ---- compute dot product Q @ K for each query head ----
float dot[MAX_G];
#pragma unroll
for (int g = 0; g < G; g++) {
float s = 0.0f;
#pragma unroll
for (int j = 0; j < stride; j++) {
s += q_reg[g][j] * k_vals[j];
}
dot[g] = s;
}
// Warp-level butterfly sum reduction
#pragma unroll
for (int g = 0; g < G; g++) {
#pragma unroll
for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1) {
dot[g] += __shfl_xor_sync(mask, dot[g], offset);
}
dot[g] *= scale;
}
// ---- online softmax (lane 0 computes; broadcast via shuffle) ----
float weight[MAX_G], rescale[MAX_G];
#pragma unroll
for (int g = 0; g < G; g++) {
weight[g] = 0.0f;
rescale[g] = 0.0f;
}
if (lane_id == 0) {
#pragma unroll
for (int g = 0; g < G; g++) {
float s = dot[g];
float m_new = fmaxf(m[g], s);
float old_scale = __expf(m[g] - m_new);
float exp_val = __expf(s - m_new);
weight[g] = exp_val;
rescale[g] = old_scale;
l[g] = old_scale * l[g] + exp_val;
m[g] = m_new;
}
}
// Broadcast weight and rescale from lane 0 to all lanes
#pragma unroll
for (int g = 0; g < G; g++) {
weight[g] = __shfl_sync(mask, weight[g], 0);
rescale[g] = __shfl_sync(mask, rescale[g], 0);
}
// ---- all lanes update output accumulator ----
#pragma unroll
for (int g = 0; g < G; g++) {
#pragma unroll
for (int s = 0; s < stride; s++) {
acc[g][s] = rescale[g] * acc[g][s] + weight[g] * v_vals[s];
}
}Edit applied successfully.{
"description": "Check correctness then benchmark",
"command": "cd /home/infatoshi/cuda/KernelBench-Hard && uv run python problems/03_paged_attention/check.py 2>&1 && echo \"---\" && cd /home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention && uv run python benchmark.py 2>&1",
"timeout": 600000
}PASS
---
W0427 09:18:45.298000 1130835 .venv/lib/python3.11/site-packages/torch/_dynamo/variables/tensor.py:1379] [0/0] Graph break from `Tensor.item()`, consider setting:
W0427 09:18:45.298000 1130835 .venv/lib/python3.11/site-packages/torch/_dynamo/variables/tensor.py:1379] [0/0] torch._dynamo.config.capture_scalar_outputs = True
W0427 09:18:45.298000 1130835 .venv/lib/python3.11/site-packages/torch/_dynamo/variables/tensor.py:1379] [0/0] or:
W0427 09:18:45.298000 1130835 .venv/lib/python3.11/site-packages/torch/_dynamo/variables/tensor.py:1379] [0/0] env TORCHDYNAMO_CAPTURE_SCALAR_OUTPUTS=1
W0427 09:18:45.298000 1130835 .venv/lib/python3.11/site-packages/torch/_dynamo/variables/tensor.py:1379] [0/0] to include these operations in the captured graph.
W0427 09:18:45.298000 1130835 .venv/lib/python3.11/site-packages/torch/_dynamo/variables/tensor.py:1379] [0/0]
W0427 09:18:45.298000 1130835 .venv/lib/python3.11/site-packages/torch/_dynamo/variables/tensor.py:1379] [0/0] Graph break: from user code at:
W0427 09:18:45.298000 1130835 .venv/lib/python3.11/site-packages/torch/_dynamo/variables/tensor.py:1379] [0/0] File "/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/reference.py", line 89, in forward
W0427 09:18:45.298000 1130835 .venv/lib/python3.11/site-packages/torch/_dynamo/variables/tensor.py:1379] [0/0] L = int(seq_lens[b].item())
W0427 09:18:45.298000 1130835 .venv/lib/python3.11/site-packages/torch/_dynamo/variables/tensor.py:1379] [0/0]
... (1885 more chars)--- a//home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/solution.py
+++ b//home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/solution.py
@@ -1,7 +1,7 @@
"""Paged attention decode kernel for RTX PRO 6000 (SM120 Blackwell).
Single-query decode with grouped-query attention (GQA).
-Warp-only kernel: one warp per (batch, kv_head), no shared memory, no barriers.
+One 32-thread warp-block per query head for maximum parallelism.
Each lane handles D/32 dimensions; all reduction via __shfl_xor_sync.
Online softmax processes KV cache in a single streaming pass.
"""
@@ -25,7 +25,7 @@
PAGE_SIZE = 16
# ---------------------------------------------------------------------------
-# CUDA kernel + wrapper (compiled once per process)
+# CUDA kernel + wrapper
# ---------------------------------------------------------------------------
_cpp_src = r"""
@@ -43,8 +43,7 @@
#include <cuda_bf16.h>
#define WARP_SIZE 32
-#define MAX_G 16
-#define MAX_STRIDE 8 // max D/32 = 128/32 = 4, plus safety
+#define MAX_STRIDE 8
extern "C" __global__ void paged_attention_decode_kernel(
const __nv_bfloat16* __restrict__ query,
@@ -52,68 +51,49 @@
const int* __restrict__ block_table,
const int* __restrict__ seq_lens,
__nv_bfloat16* __restrict__ output,
- int B, int H, int Hkv, int D, int P, int max_blocks, float scale)
+ int B, int H, int Hkv, int D, int P, int max_blocks, float scale,
+ int G)
{
+ // One block per query head
int linear_bid = blockIdx.x;
- int b = linear_bid / Hkv;
- int kv_h = linear_bid % Hkv;
+ int b = linear_bid / H;
+ int q_head = linear_bid % H;
if (b >= B) return;
int L = seq_lens[b];
if (L <= 0) return;
+ int kv_h = q_head / G; // which KV head this query head maps to
int num_pages = (L + P - 1) / P;
- int G = H / Hkv;
- int lane_id = threadIdx.x; // 0..31
- int stride = D / WARP_SIZE; // elements per lane per head (4 for D=128, 2 for D=64)
+ int lane_id = threadIdx.x;
+ int stride = D / WARP_SIZE;
unsigned mask = __activemask();
- // ------------------------------------------------------------------
- // Load queries for all G heads into registers (fp32).
- // Each lane loads 'stride' elements per query head, stride-separated.
- // ------------------------------------------------------------------
- float q_reg[MAX_G][MAX_STRIDE];
+ // Load this query head (fp32)
+ float q_val[MAX_STRIDE];
+ int q_base = b * H * D + q_head * D;
#pragma unroll
- for (int g = 0; g < G; g++) {
- int q_h = kv_h * G + g;
- int q_base = b * H * D + q_h * D;
- #pragma unroll
- for (int s = 0; s < stride; s++) {
- int idx = q_base + lane_id + s * WARP_SIZE;
- q_reg[g][s] = __bfloat162float(query[idx]);
- }
- }
-
- // Online softmax state (maintained by lane 0)
- float m[MAX_G], l[MAX_G];
+ for (int s = 0; s < stride; s++) {
+ q_val[s] = __bfloat162float(query[q_base + lane_id + s * WARP_SIZE]);
+ }
+
+ // Online softmax state for this single query head
+ float m = -1e10f;
+ float l = 0.0f;
+ float acc[MAX_STRIDE];
#pragma unroll
- for (int g = 0; g < G; g++) {
- m[g] = -1e10f;
- l[g] = 0.0f;
- }
- // Output accumulator: one float per (g, stride_s)
- float acc[MAX_G][MAX_STRIDE];
- #pragma unroll
- for (int g = 0; g < G; g++) {
- #pragma unroll
- for (int s = 0; s < stride; s++) {
- acc[g][s] = 0.0f;
- }
- }
-
- // ------------------------------------------------------------------
- // Pre-compute offsets
- // ------------------------------------------------------------------
+ for (int s = 0; s < stride; s++) {
+ acc[s] = 0.0f;
+ }
+
+ // Offsets into kv_cache for this KV head
const int* bt_base = block_table + b * max_blocks;
- int page_stride = P * Hkv * 2 * D; // bf16 elements per page
- int token_stride = Hkv * 2 * D; // bf16 elements per token
- int head_off = kv_h * 2 * D; // offset to this KV head's K
- int k_base = head_off;
- int v_base = head_off + D;
-
- // ------------------------------------------------------------------
- // Main loop: iterate over pages, then tokens
- // ------------------------------------------------------------------
+ int page_stride = P * Hkv * 2 * D;
+ int token_stride = Hkv * 2 * D;
+ int k_off = kv_h * 2 * D;
+ int v_off = k_off + D;
+
+ // Main loop over pages and tokens
for (int pg = 0; pg < num_pages; pg++) {
int block_idx = bt_base[pg];
int page_base = block_idx * page_stride;
@@ -122,92 +102,58 @@
for (int t = 0; t < tokens; t++) {
int base = page_base + t * token_stride;
- // ---- load K and V for this token, strided across lanes ----
+ // Load K and V values for this token (strided across lanes)
float k_vals[MAX_STRIDE];
float v_vals[MAX_STRIDE];
#pragma unroll
for (int s = 0; s < stride; s++) {
- int idx = base + k_base + lane_id + s * WARP_SIZE;
- k_vals[s] = __bfloat162float(kv_cache[idx]);
- v_vals[s] = __bfloat162float(kv_cache[base + v_base + lane_id + s * WARP_SIZE]);
- }
-
- // ---- compute dot product Q @ K for each query head ----
- float dot[MAX_G];
- #pragma unroll
- for (int g = 0; g < G; g++) {
- float s = 0.0f;
- #pragma unroll
- for (int j = 0; j < stride; j++) {
- s += q_reg[g][j] * k_vals[j];
- }
- dot[g] = s;
- }
-
- // Warp-level butterfly sum reduction
- #pragma unroll
- for (int g = 0; g < G; g++) {
- #pragma unroll
-
... (5201 more chars){
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention/solution.py",
"content": "<8857 chars \u2014 see diff>"
}Wrote file successfully.{
"description": "Clean build cache, test correctness, benchmark",
"command": "rm -rf /home/infatoshi/.cache/torch_extensions/py311_cu130/paged_attention_decode_ext && cd /home/infatoshi/cuda/KernelBench-Hard && uv run python problems/03_paged_attention/check.py 2>&1 && echo \"---BENCH---\" && cd /home/infatoshi/cuda/KernelBench-Hard/problems/03_paged_attention && uv run python benchmark.py 2>&1",
"timeout": 600000
}