"""Custom KDA forward (chunk form) using Triton kernels for SM120.
Implementation notes:
- g is rescaled by RCP_LN2 so we can use exp2 throughout (faster than exp).
- In-chunk cumsum on g is fused into the intra kernel via tl.cumsum.
- Phase 1 (intra): per-chunk forward substitution on the (I + L) factor and
materialize the chunk-local Akk_inv, w, u tensors.
- Phase 2 (inter): sequential per-(b, h) loop over chunks computing the state
h and the output o, with the V dim split into BV-tiles for parallelism.
"""
from __future__ import annotations
import torch
import torch.nn as nn
import triton
import triton.language as tl
RCP_LN2 = 1.4426950408889634 # log2(e), so exp(x) = exp2(x * RCP_LN2)
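
# Illustration only (not used by the kernels): multiplying the log-decay by
# RCP_LN2 lets every decay be evaluated with exp2 instead of exp, since
# exp(x) == 2 ** (x * log2(e)). A quick sanity check using only the standard
# library:
#
#   >>> import math
#   >>> x = -0.3
#   >>> math.isclose(math.exp(x), 2.0 ** (x * RCP_LN2))
#   True
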
@triton.jit
def _kda_intra_kernel(
    k_ptr,
    v_ptr,
    g_in_ptr,
    g_out_ptr,
    beta_ptr,
    A_ptr,
    w_ptr,
    u_ptr,
    T,
    H: tl.constexpr,
    K: tl.constexpr,
    V: tl.constexpr,
    BT: tl.constexpr,
    SCALE_LOG: tl.constexpr,
):
    """Per-chunk preparation: cumsum g; build Akk_inv*beta_col; w, u.

    Grid: (B*H, NT)
    """
    i_bh = tl.program_id(0)
    i_n = tl.program_id(1)
    i_b = i_bh // H
    i_h = i_bh % H
    NT = T // BT

    offs_t = tl.arange(0, BT)
    offs_k = tl.arange(0, K)
    offs_v = tl.arange(0, V)

    # ---------- Pointers ----------
    # k, g: (B, T, H, K) — strides (T*H*K, H*K, K, 1)
    k_base = i_b * T * H * K + i_n * BT * H * K + i_h * K
    g_base = k_base
    # v: (B, T, H, V)
    v_base = i_b * T * H * V + i_n * BT * H * V + i_h * V
    # beta: (B, T, H)
    beta_base = i_b * T * H + i_n * BT * H + i_h
    # A: (B, NT, H, BT, BT) contiguous on the last two dims
    A_base = i_b * NT * H * BT * BT + i_n * H * BT * BT + i_h * BT * BT

    # ---------- Load k, g, beta ----------
    kg_ptrs_t = offs_t[:, None] * H * K + offs_k[None, :]
    b_k = tl.load(k_ptr + k_base + kg_ptrs_t)  # (BT, K) bf16
    b_g_raw = tl.load(g_in_ptr + g_base + kg_ptrs_t).to(tl.float32) * SCALE_LOG
    b_g = tl.cumsum(b_g_raw, axis=0)  # (BT, K) fp32

    # Save the cumsumed g back to global (used by inter kernel).
    tl.store(g_out_ptr + g_base + kg_ptrs_t, b_g)

    b_beta = tl.load(beta_ptr + beta_base + offs_t * H).to(tl.float32)  # (BT,)

    # ---------- Akk_raw = -beta_row * (k*exp(g)) @ (k*exp(-g)).T ----------
    b_eg = tl.exp2(b_g)  # (BT, K) fp32
    b_eng = tl.exp2(-b_g)  # (BT, K) fp32
    b_kf = b_k.to(tl.float32) * b_eg
    b_kg = b_k.to(tl.float32) * b_eng
    b_A = tl.dot(b_kf.to(b_k.dtype), tl.trans(b_kg).to(b_k.dtype))  # (BT, BT) fp32
    mask_strict_lower = offs_t[:, None] > offs_t[None, :]
    b_A = tl.where(mask_strict_lower, -b_A * b_beta[:, None], 0.0)

    # ---------- Forward substitute (BT iterations) ----------
    # for i in 1..BT-1: A[i, :i] += sum_m A[i, m] * A[m, :i] for m < i
    for i in tl.static_range(1, BT):
        is_row = (offs_t == i)
        # row i of A (BT,)
        row_i = tl.sum(tl.where(is_row[:, None], b_A, 0.0), axis=0)
        # (row_i @ A)[j] = sum_m row_i[m] * A[m, j]
        update = tl.sum(row_i[:, None] * b_A, axis=0)
        # write only the lower part of row i
        write_mask = is_row[:, None] & (offs_t[None, :] < i)
        b_A = tl.where(write_mask, b_A + update[None, :], b_A)

    # Add identity on the diagonal
    b_A = tl.where(offs_t[:, None] == offs_t[None, :], b_A + 1.0, b_A)
    # Multiply by beta_col: A *= beta[None, :]
    b_A = b_A * b_beta[None, :]

    # ---------- Store A_inv (fp32 for downstream precision) ----------
    A_ptrs = offs_t[:, None] * BT + offs_t[None, :]
    tl.store(A_ptr + A_base + A_ptrs, b_A)

    # ---------- w = A @ (g.exp() * k) ----------
    b_w = tl.dot(b_A.to(b_k.dtype), b_kf.to(b_k.dtype))  # (BT, K) fp32
    tl.store(w_ptr + k_base + kg_ptrs_t, b_w.to(b_k.dtype))

    # ---------- u = A @ v ----------
    v_ptrs_t = offs_t[:, None] * H * V + offs_v[None, :]
    b_v = tl.load(v_ptr + v_base + v_ptrs_t)  # (BT, V) bf16
    b_u = tl.dot(b_A.to(b_v.dtype), b_v)  # (BT, V) fp32
    tl.store(u_ptr + v_base + v_ptrs_t, b_u.to(b_v.dtype))
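
# In matrix form (per chunk; a sketch of the intent, not a separate
# implementation): with M = (k * exp(g)) @ (k * exp(-g)).T and
# L = tril(diag(beta) @ M, -1), the forward substitution above builds
#   A = (I + L)^-1 @ diag(beta)
# and then
#   w = A @ (k * exp(g))   -- write directions applied to the state
#   u = A @ v              -- corrected values
# which is the usual WY-style factorization used by chunked delta-rule kernels.

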
@triton.jit
def _kda_inter_kernel(
    q_ptr,
    k_ptr,
    g_ptr,
    w_ptr,
    u_ptr,
    o_ptr,
    scale,
    T,
    H: tl.constexpr,
    K: tl.constexpr,
    V: tl.constexpr,
    BT: tl.constexpr,
    BV: tl.constexpr,
    SCALE_LOG: tl.constexpr,
):
    """Inter-chunk recurrence: maintain state S, compute o.

    Grid: (B*H, V/BV)
    Each program handles one (b, h) pair and one V-slice. Sequential over chunks.
    """
    i_bh = tl.program_id(0)
    i_v = tl.program_id(1)
    i_b = i_bh // H
    i_h = i_bh % H
    NT = T // BT

    offs_t = tl.arange(0, BT)
    offs_k = tl.arange(0, K)
    offs_v = tl.arange(0, BV)

    v_lo = i_v * BV

    # Strict-upper mask for Aqk (zero out strict-upper, keep diag and below)
    mask_su = offs_t[:, None] < offs_t[None, :]

    # State S: (K, BV) fp32
    b_S = tl.zeros((K, BV), dtype=tl.float32)

    for i_n in range(NT):
        # Pointers
        k_base = i_b * T * H * K + i_n * BT * H * K + i_h * K
        v_base = i_b * T * H * V + i_n * BT * H * V + i_h * V + v_lo

        kg_ptrs_t = offs_t[:, None] * H * K + offs_k[None, :]
        v_ptrs_t = offs_t[:, None] * H * V + offs_v[None, :]

        # Load q, k, g, w, u_slice
        b_q = tl.load(q_ptr + k_base + kg_ptrs_t)  # (BT, K) bf16
        b_k = tl.load(k_ptr + k_base + kg_ptrs_t)  # (BT, K) bf16
        b_g = tl.load(g_ptr + k_base + kg_ptrs_t)  # (BT, K) fp32
        b_w = tl.load(w_ptr + k_base + kg_ptrs_t)  # (BT, K) bf16
        b_u = tl.load(u_ptr + v_base + v_ptrs_t)  # (BT, BV) bf16

        # Aqk[c, c'] = q[c] * exp(g[c] - g[c']) * k[c']
        #            = (q * exp(g))[c] @ (k * exp(-g))[c'].T
        b_eg = tl.exp2(b_g)
        b_eng = tl.exp2(-b_g)
        # apply scale to q here so q @ S also gets scaled
        b_qe = (b_q.to(tl.float32) * b_eg * scale).to(b_q.dtype)
        b_kf = (b_k.to(tl.float32) * b_eng).to(b_k.dtype)
        b_Aqk = tl.dot(b_qe, tl.trans(b_kf))  # (BT, BT) fp32
        b_Aqk = tl.where(mask_su, 0.0, b_Aqk)

        # v_new = u - w @ S  (BT, BV)
        b_v_new = b_u.to(tl.float32) - tl.dot(b_w, b_S.to(b_w.dtype))

        # o = (q*exp(g)*scale) @ S + Aqk @ v_new
        b_o = tl.dot(b_qe, b_S.to(b_qe.dtype))
        b_o += tl.dot(b_Aqk.to(b_w.dtype), b_v_new.to(b_w.dtype))

        # Store o
        tl.store(o_ptr + v_base + v_ptrs_t, b_o.to(b_q.dtype))

        # Update S: S = exp(g_last) * S + (k * exp(g_last - g)).T @ v_new
        # g_last = g[BT-1, :]
        g_last_off = (BT - 1) * H * K
        b_g_last = tl.load(g_ptr + k_base + g_last_off + offs_k)  # (K,) fp32
        b_S = b_S * tl.exp2(b_g_last)[:, None]
        b_state_term = (b_k.to(tl.float32) * tl.exp2(b_g_last[None, :] - b_g)).to(b_k.dtype)
        b_S = b_S + tl.dot(tl.trans(b_state_term), b_v_new.to(b_state_term.dtype))
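
# Per-chunk recurrence implemented above (sketch; g is the in-chunk cumulative
# log-decay and g_last its last row):
#   Aqk    = tril(scale * (q * exp(g)) @ (k * exp(-g)).T)   # diag kept
#   v_new  = u - w @ S
#   o      = scale * (q * exp(g)) @ S + Aqk @ v_new
#   S_next = diag(exp(g_last)) @ S + (k * exp(g_last - g)).T @ v_new
# with S = 0 before the first chunk (no initial state in this problem).

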
def kda_forward(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    g: torch.Tensor,
    beta: torch.Tensor,
    scale: float,
    chunk_size: int = 64,
) -> torch.Tensor:
    """KDA forward (chunk form). Returns o of shape (B, T, H, V) with v's dtype."""
    B, T, H, K = q.shape
    V = v.shape[-1]
    BT = chunk_size
    assert T % BT == 0
    NT = T // BT
    assert q.is_contiguous() and k.is_contiguous() and v.is_contiguous() and beta.is_contiguous()

    # Workspaces
    g_cum = torch.empty_like(g)  # fp32, (B, T, H, K)
    A_inv = torch.empty(B, NT, H, BT, BT, device=q.device, dtype=torch.float32)
    w = torch.empty_like(k)  # bf16, (B, T, H, K)
    u = torch.empty_like(v)  # bf16, (B, T, H, V)
    o = torch.empty_like(v)  # bf16, (B, T, H, V)

    # Phase 1: intra-chunk preparation
    grid_intra = (B * H, NT)
    _kda_intra_kernel[grid_intra](
        k, v, g, g_cum, beta, A_inv, w, u,
        T,
        H=H, K=K, V=V, BT=BT,
        SCALE_LOG=RCP_LN2,
        num_warps=4,
        num_stages=2,
    )

    # Phase 2: inter-chunk recurrence
    BV = 32  # split V into BV-sized tiles
    assert V % BV == 0
    grid_inter = (B * H, V // BV)
    _kda_inter_kernel[grid_inter](
        q, k, g_cum, w, u, o, scale,
        T,
        H=H, K=K, V=V, BT=BT, BV=BV,
        SCALE_LOG=RCP_LN2,
        num_warps=4,
        num_stages=2,
    )
    return o
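
# Example call (illustration only; assumes a CUDA device and the bf16/fp32
# layouts described in the module docstring):
#
#   q = torch.randn(2, 2048, 8, 128, device="cuda", dtype=torch.bfloat16)
#   k = torch.randn(2, 2048, 8, 128, device="cuda", dtype=torch.bfloat16)
#   v = torch.randn(2, 2048, 8, 128, device="cuda", dtype=torch.bfloat16)
#   g = torch.randn(2, 2048, 8, 128, device="cuda", dtype=torch.float32) * 0.1 - 0.05
#   beta = torch.sigmoid(torch.randn(2, 2048, 8, device="cuda", dtype=torch.bfloat16))
#   o = kda_forward(q, k, v, g, beta, scale=128 ** -0.5)  # (2, 2048, 8, 128) bf16

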
class Model(nn.Module):
    """KDA forward (chunk form). Mirror of reference.Model — no learned params."""

    def __init__(self, B: int, T: int, H: int, K: int, V: int, chunk_size: int = 64):
        super().__init__()
        self.B, self.T, self.H, self.K, self.V = B, T, H, K, V
        self.chunk_size = chunk_size
        self.scale = float(K) ** -0.5
        self.register_buffer("_dummy", torch.zeros(1), persistent=False)

    def forward(
        self,
        q: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
        g: torch.Tensor,
        beta: torch.Tensor,
    ) -> torch.Tensor:
        return kda_forward(q, k, v, g, beta, scale=self.scale, chunk_size=self.chunk_size)


# Module-level shape shims (overridden by check.py / benchmark.py per shape).
B = 2
T = 1024
H = 8
K = 128
V = 128
CHUNK_SIZE = 64


def get_inputs():
    torch.manual_seed(0)
    q = torch.randn(B, T, H, K, dtype=torch.bfloat16) * 0.1
    k = torch.randn(B, T, H, K, dtype=torch.bfloat16) * 0.1
    v = torch.randn(B, T, H, V, dtype=torch.bfloat16) * 0.1
    g = (torch.randn(B, T, H, K, dtype=torch.float32) * 0.1 - 0.05)
    beta = torch.sigmoid(torch.randn(B, T, H, dtype=torch.bfloat16))
    return [q, k, v, g, beta]


def get_init_inputs():
    return [B, T, H, K, V, CHUNK_SIZE]
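
# Quick self-check sketch (assumes the problem directory's reference.py is
# importable and a CUDA device is available; it mirrors the smoke test that
# appears later in this log):
#
#   import reference, solution, torch
#   ref = reference.Model(*reference.get_init_inputs()).cuda().eval()
#   sol = solution.Model(*solution.get_init_inputs()).cuda().eval()
#   inputs = [t.cuda() for t in reference.get_inputs()]
#   with torch.no_grad():
#       print((ref(*inputs).float() - sol(*inputs).float()).abs().max().item())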
/home/infatoshi/cuda/KernelBench-Hard/.venv/lib/python3.11/site-packages/torch/_inductor/compile_fx.py:322: UserWarning: TensorFloat32 tensor cores for float32 matrix multiplication available but not enabled. Consider setting `torch.set_float32_matmul_precision('high')` for better performance.
warnings.warn(
PASS
{
"problem": "02_kda_cutlass",
"harness": "claude",
"model": "claude-opus-4-7",
"reasoning_effort": "max",
"has_solution": true,
"correct": true,
"peak_fraction": null,
"elapsed_seconds": 887,
"harness_exit_code": 143,
"session_complete": false,
"usage": {"input_tokens": 106, "output_tokens": 3620, "cache_read_tokens": 7305478, "cache_creation_tokens": 336586, "reasoning_tokens": null, "total_cost_usd": null}
}
{
"command": "pwd && ls -la",
"description": "Check current directory and contents"
}/home/infatoshi/cuda/KernelBench-Hard
drwxrwxr-x infatoshi infatoshi 4.0 KB Mon Apr 27 21:56:40 2026 .
drwxr-xr-x infatoshi infatoshi 4.0 KB Tue Apr 28 00:55:24 2026 ..
.rw-rw-r-- infatoshi infatoshi 1.6 KB Fri Apr 24 00:32:28 2026 .cursorrules
drwxrwxr-x infatoshi infatoshi 4.0 KB Tue Apr 28 02:09:38 2026 .git
.rw-rw-r-- infatoshi infatoshi 262 B Fri Apr 24 00:30:12 2026 .gitignore
drwxrwxr-x infatoshi infatoshi 4.0 KB Fri Apr 24 00:41:00 2026 .pytest_cache
.rw-rw-r-- infatoshi infatoshi 5 B Fri Apr 24 00:30:15 2026 .python-version
drwxrwxr-x infatoshi infatoshi 4.0 KB Fri Apr 24 00:40:58 2026 .ruff_cache
drwxrwxr-x infatoshi infatoshi 4.0 KB Fri Apr 24 00:40:54 2026 .venv
.rw-rw-r-- infatoshi infatoshi 1012 B Mon Apr 27 02:25:28 2026 AGENTS.md
.rw-rw-r-- infatoshi infatoshi 6.9 KB Mon Apr 27 02:24:34 2026 CLAUDE.md
.rw-rw-r-- infatoshi infatoshi 45 KB Mon Apr 27 21:56:40 2026 DEVLOG.md
drwxrwxr-x infatoshi infatoshi 4.0 KB Fri Apr 24 00:30:02 2026 docs
.rw-rw-r-- infatoshi infatoshi 1.0 KB Mon Apr 27 04:26:18 2026 LICENSE
drwxrwxr-x infatoshi infatoshi 4.0 KB Fri Apr 24 00:30:02 2026 outputs
drwxrwxr-x infatoshi infatoshi 4.0 KB Fri Apr 24 00:30:02 2026 problems
.rw-rw-r-- infatoshi infatoshi 663 B Sat Apr 25 22:54:16 2026 pyproject.toml
.rw-rw-r-- infatoshi infatoshi 8.0 KB Sun Apr 26 00:07:33 2026 README.md
drwxrwxr-x infatoshi infatoshi 4.0 KB Tue Apr 28 04:05:34 2026 scripts
.rw-rw-r-- infatoshi infatoshi 6.5 KB Mon Apr 27 02:25:12 2026 SPEC.md
... (210 more chars){
"command": "ls problems/",
"description": "List problems"
}01_fp8_gemm
02_kda_cutlass
03_paged_attention
04_kahan_softmax
05_topk_bitonic
06_sonic_moe_swiglu
07_w4a16_gemm
08_metal_lightning_attn{
"command": "ls problems/02_kda_cutlass/",
"description": "List KDA problem files"
}benchmark.py
check.py
problem.yaml
PROMPT.txt
reference.py
shapes.py
sota.py{
"file_path": "/home/infatoshi/cuda/KernelBench-Hard/problems/02_kda_cutlass/reference.py"
}1 """Naive PyTorch reference for Kimi Delta Attention (KDA) forward, chunk form.
2
3 This is the correctness oracle, NOT the SOTA baseline. It mirrors the
4 chunk-parallel formulation in fla/ops/kda/naive.py (Songlin Yang et al.)
5 without any Triton or CUDA optimization.
6
7 Inputs (per the FLA convention):
8 q, k : (B, T, H, K) bf16 -- queries / keys
9 v : (B, T, H, V) bf16 -- values
10 g : (B, T, H, K) fp32 -- per-channel log-decay (in-chunk cumsum applied)
11 beta : (B, T, H) bf16 -- write strength
12
13 Output:
14 o : (B, T, H, V) bf16
15
16 The agent must reproduce this output (within bf16 tolerance) using a CUTLASS
17 CuTe kernel on SM120 -- NOT by calling fla.ops.chunk_kda directly.
18 """
19 from __future__ import annotations
20
21 import torch
22 import torch.nn as nn
23 from einops import rearrange
24
25 OP_TYPE = "linear_attention"
26 SUPPORTED_PRECISIONS = ["bf16"]
27 HARDWARE_REQUIRED = ["RTX_PRO_6000", "H100", "B200"]
28
29
30 def _naive_chunk_kda(
31 q: torch.Tensor,
32 k: torch.Tensor,
33 v: torch.Tensor,
34 g: torch.Tensor,
35 beta: torch.Tensor,
36 scale: float,
37 chunk_size: int = 64,
38 ) -> torch.Tensor:
39 """KDA forward, no initial state, no final state. Returns o with v's dtype."""
40 dtype = v.dtype
41 B, T, H, K = q.shape
42 V = v.shape[-1]
43 BT = chunk_size
44 assert T % BT == 0, f"T={T} must be a multiple of chunk_size={BT}"
45 NT = T // BT
46
... (4061 more chars){
"file_path": "/home/infatoshi/cuda/KernelBench-Hard/problems/02_kda_cutlass/PROMPT.txt"
}1 I need you to implement Kimi Delta Attention forward (chunk form) for the RTX PRO 6000 (SM120 Blackwell, GDDR7, 1.8 TB/s). The reference implementation is in reference.py and your kernel goes in solution.py. You can make whatever mess you want in this directory — scratch files, intermediate kernels, build artifacts, profiling traces — but the final answer has to be in solution.py with the same Model, get_inputs, and get_init_inputs interface as reference.py.
2
3 The op is the chunk-parallel KDA forward from the FLA library: q and k of shape (B, T, H, K) in bf16, v of shape (B, T, H, V) in bf16, g of shape (B, T, H, K) in fp32 (per-channel log-decay with in-chunk cumsum already applied), beta of shape (B, T, H) in bf16, scale a python float, chunk_size 64, no initial state, no final state. You return o of shape (B, T, H, V) in bf16. Correctness tolerance is 0.05 abs/rel — the long recurrence accumulates more error than a single GEMM so the bar's a bit looser than default bf16. The shapes you have to handle are B=2 T=1024 H=8 K=128 V=128 (short-context training step), B=2 T=2048 H=8 K=128 V=128 (the headline shape from the Kimi Linear paper), B=1 T=4096 H=8 K=128 V=128 (long context that stresses the inter-chunk recurrence), and B=1 T=2048 H=4 K=128 V=128 (thin batch decode).
4
5 This needs to be a real custom kernel — the whole point of the problem is to write the chunk-parallel attention yourself, not call FLA's existing implementation. Don't import or call fla.ops.kda, f
... (816 more chars){
"file_path": "/home/infatoshi/cuda/KernelBench-Hard/problems/02_kda_cutlass/check.py"
}1 """Correctness runner for KDA forward (chunk form).
2
3 Runs solution.Model vs reference.Model across all shapes in shapes.py, 3 seeds
4 each, with per-dtype atol/rtol (bf16 default 1e-2 plus a 5e-2 override for
5 this problem). Also rejects forbidden ops by grep.
6 """
7 import re
8 import sys
9 from pathlib import Path
10
11 import torch
12 import yaml
13
14 # Make the repo's src/ importable
15 REPO_ROOT = Path(__file__).resolve().parents[2]
16 sys.path.insert(0, str(REPO_ROOT))
17
18 from src.eval.correctness import check_correctness # noqa: E402
19
20
21 def _apply_shape(reference, shape):
22 """Override reference's module-level shape shims so get_inputs/get_init_inputs match."""
23 for k, v in shape.items():
24 setattr(reference, k, v)
25
26
27 def main():
28 try:
29 import reference
30 import shapes
31 import solution
32 except Exception as e:
33 print(f"FAIL: import error: {e}")
34 sys.exit(1)
35
36 problem_yaml = Path("problem.yaml")
37 meta = yaml.safe_load(problem_yaml.read_text()) if problem_yaml.exists() else {}
38
39 # --- Forbidden-op check ------------------------------------------------
40 sol_src = Path("solution.py").read_text() if Path("solution.py").exists() else ""
41 for forbidden in meta.get("forbidden", []):
42 pat = re.escape(forbidden)
43 if re.search(pat, sol_src):
44 print(f"FAIL: forbidden op used: {forbidden}")
45
... (2670 more chars){
"file_path": "/home/infatoshi/cuda/KernelBench-Hard/problems/02_kda_cutlass/benchmark.py"
}1 """Roofline benchmark for KDA forward (chunk form).
2
3 For each shape: times eager reference, compiled reference, SOTA (FLA's Triton
4 chunk_kda, if available on this GPU), and the agent's solution. Reports
5 achieved TFLOPS, GB/s, and peak_fraction.
6
7 Output lines the harness picks up:
8 shape=<idx> variant=<name> tflops=<N> gbps=<N> ms=<N>
9 peak_fraction: <N> (geomean over shapes of solution's peak_fraction)
10 """
11 import sys
12 from math import exp, log
13 from pathlib import Path
14
15 import torch
16 import yaml
17
18 REPO_ROOT = Path(__file__).resolve().parents[2]
19 sys.path.insert(0, str(REPO_ROOT))
20
21 from src.eval.roofline import compute_gbps, compute_tflops, peak_fraction # noqa: E402
22 from src.eval.timing import time_fn # noqa: E402
23 from src.hardware import get as get_hw # noqa: E402
24
25
26 def _eval_formula(expr: str, vars: dict) -> float:
27 return float(eval(expr, {"__builtins__": {}}, vars))
28
29
30 def _apply_shape(reference, shape):
31 for k, v in shape.items():
32 setattr(reference, k, v)
33
34
35 def main():
36 import reference
37 import shapes
38 import solution
39
40 meta = yaml.safe_load(Path("problem.yaml").read_text())
41 hw = get_hw(meta["hardware"][0])
42 peak_tflops = hw.peak_tflops_dense.get(meta["peak_tflops_key"], 0.0)
43 peak_gbps = hw.peak_bandwidth_gb_s
44 regime = meta.get("regime", "compute")
45 flops_formula = meta["flops_formula"]
46 bytes_form
... (3308 more chars){
"file_path": "/home/infatoshi/cuda/KernelBench-Hard/problems/02_kda_cutlass/problem.yaml"
}1 name: 02_kda_cutlass
2 display_name: "Kimi Delta Attention (chunk forward) via CUTLASS CuTe"
3 precision: bf16
4 regime: compute
5
6 # Approximate FLOPs per forward call. KDA forward (chunk form) breakdown per
7 # token across all heads:
8 # - intra-chunk Aqk (q @ k.T with decay): 2*BT*K
9 # - intra-chunk Akk + cumulative invert : ~3*BT*K + BT^2
10 # - w = A @ (g.exp()*k): 2*BT*K
11 # - u = A @ v: 2*BT*V
12 # - inter-chunk q @ S: 2*K*V
13 # - inter-chunk (Aqk @ v_i): 2*BT*V
14 # - state update S += k.T @ v_i: 2*K*V
15 # Aggregating with a constant of 4 (typical for chunked linear attention) gives
16 # the formula below. Used only for roofline reporting -- not graded directly.
17 flops_formula: "4 * B * T * H * (K*V + CHUNK_SIZE*K + CHUNK_SIZE*V)"
18
19 # Bytes moved (lower bound): one read of q,k,v,g,beta and one write of o.
20 bytes_formula: "B*T*H*K*2 + B*T*H*K*2 + B*T*H*V*2 + B*T*H*K*4 + B*T*H*2 + B*T*H*V*2"
21
22 hardware: [RTX_PRO_6000]
23 peak_tflops_key: bf16 # KDA's heavy work is bf16 @ bf16 -> fp32 GEMM-equivalent
24 peak_bandwidth_key: dram
25
26 # bf16 default tolerance from src/eval/correctness.py is 1e-2 atol/rtol; KDA's
27 # long recurrence accumulates more error than a single GEMM, so we relax slightly.
28 tolerance:
29 torch.bfloat16: 0.05 # was "5e-2" -- PyYAML parses un-dotted sci-notation as string
30
31 # Forbidden ops -- using any in solution.py f
... (899 more chars){
"file_path": "/home/infatoshi/cuda/KernelBench-Hard/problems/02_kda_cutlass/sota.py"
}1 """SOTA reference for KDA forward: fla.ops.kda.chunk_kda (Triton).
2
3 The agent's solution is forbidden from importing this module path (see
4 problem.yaml.forbidden). This file is only used by benchmark.py to draw
5 the SOTA reference line.
6
7 If FLA's Triton kernel does not run on SM120 (Blackwell consumer-lineage --
8 some Triton kernels in FLA target Hopper TMA), is_available() returns False
9 and benchmark.py omits the SOTA variant. The H100 reference is documented
10 in problem.yaml for context.
11 """
12 from __future__ import annotations
13
14 import torch
15
16
17 def _import_fla():
18 try:
19 from fla.ops.kda import chunk_kda # noqa: F401
20 return chunk_kda
21 except Exception:
22 return None
23
24
25 def sota_forward(
26 q: torch.Tensor,
27 k: torch.Tensor,
28 v: torch.Tensor,
29 g: torch.Tensor,
30 beta: torch.Tensor,
31 scale: float | None = None,
32 ) -> torch.Tensor:
33 """Run FLA's Triton chunk_kda. Returns o (B, T, H, V) in v's dtype."""
34 chunk_kda = _import_fla()
35 if chunk_kda is None:
36 raise RuntimeError("fla.ops.kda.chunk_kda unavailable")
37 # FLA's chunk_kda has a richer signature (A_log, dt_bias, l2norm, gates, ...).
38 # We need the bare forward: pass A_log/dt_bias as None, gates off, no l2norm.
39 # The wrapper expects fp32 g; q/k/v/beta in bf16/fp16.
40 out = chunk_kda(
41 q=q,
42 k=k,
43 v=v,
44 g=g,
45
... (1131 more chars){
"file_path": "/home/infatoshi/cuda/KernelBench-Hard/problems/02_kda_cutlass/shapes.py"
}1 """Canonical shape sweep for KDA forward (chunk form).
2
3 Mix of:
4 - short-context training-step scale (T=1024)
5 - mid-context (T=2048) which is the headline benchmark
6 - long-context that stresses the inter-chunk recurrence (T=4096)
7 - thin-batch decode-style (B=1, T=2048, fewer heads)
8
9 Constraints:
10 - T % chunk_size == 0 (chunk_size = 64)
11 - K, V are the per-head channel dims; KDA in Kimi Linear uses K=V=128
12 """
13
14 SHAPES = [
15 {"B": 2, "T": 1024, "H": 8, "K": 128, "V": 128, "CHUNK_SIZE": 64},
16 {"B": 2, "T": 2048, "H": 8, "K": 128, "V": 128, "CHUNK_SIZE": 64},
17 {"B": 1, "T": 4096, "H": 8, "K": 128, "V": 128, "CHUNK_SIZE": 64},
18 {"B": 1, "T": 2048, "H": 4, "K": 128, "V": 128, "CHUNK_SIZE": 64},
19 ]
20 {
"query": "select:TodoWrite,WebFetch,WebSearch",
"max_results": 5
}{
"todos": [
{
"content": "Study FLA's chunk_kda Triton kernel and reference impl",
"activeForm": "Studying FLA's chunk_kda Triton kernel and reference impl",
"status": "in_progress"
},
{
"content": "Check GPU state and torch/triton/cuda versions",
"activeForm": "Checking GPU state and toolchain versions",
"status": "pending"
},
{
"content": "Implement chunk-parallel KDA forward in Triton",
"activeForm": "Implementing chunk-parallel KDA forward in Triton",
"status": "pending"
},
{
"content": "Run check.py
... (312 more chars)Todos have been modified successfully. Ensure that you continue to use the todo list to track your progress. Please proceed with the current tasks if applicable{
"command": "nvidia-smi --query-gpu=name,memory.free,utilization.gpu,utilization.memory --format=csv",
"description": "Check GPU state"
}name, memory.free [MiB], utilization.gpu [%], utilization.memory [%]
NVIDIA RTX PRO 6000 Blackwell Workstation Edition, 96493 MiB, 0 %, 19 %{
"command": "uv run python -c \"import torch; print('torch:', torch.__version__, 'cuda:', torch.version.cuda); import triton; print('triton:', triton.__version__); print('device:', torch.cuda.get_device_name(), 'cc:', torch.cuda.get_device_capability())\"",
"description": "Check torch/triton versions"
}torch: 2.11.0+cu130 cuda: 13.0
triton: 3.6.0
device: NVIDIA RTX PRO 6000 Blackwell Workstation Edition cc: (12, 0){
"command": "uv run python -c \"import fla; print(fla.__version__); import fla.ops.kda; print(fla.ops.kda.__file__)\"",
"description": "Check FLA install"
}0.5.0
/home/infatoshi/cuda/KernelBench-Hard/.venv/lib/python3.11/site-packages/fla/ops/kda/__init__.py{
"command": "ls -la /home/infatoshi/cuda/KernelBench-Hard/.venv/lib/python3.11/site-packages/fla/ops/kda/",
"description": "List KDA implementation files"
}drwxrwxr-x infatoshi infatoshi 4.0 KB Sat Apr 25 22:52:30 2026 .
drwxrwxr-x infatoshi infatoshi 4.0 KB Sat Apr 25 22:52:29 2026 ..
.rw-rw-r-- infatoshi infatoshi 441 B Fri Apr 24 23:38:15 2026 __init__.py
drwxrwxr-x infatoshi infatoshi 4.0 KB Sat Apr 25 22:52:30 2026 __pycache__
drwxrwxr-x infatoshi infatoshi 4.0 KB Sat Apr 25 22:52:31 2026 backends
.rw-rw-r-- infatoshi infatoshi 16 KB Fri Apr 24 23:38:15 2026 chunk.py
.rw-rw-r-- infatoshi infatoshi 19 KB Fri Apr 24 23:38:15 2026 chunk_bwd.py
.rw-rw-r-- infatoshi infatoshi 4.3 KB Fri Apr 24 23:38:15 2026 chunk_fwd.py
.rw-rw-r-- infatoshi infatoshi 35 KB Fri Apr 24 23:38:15 2026 chunk_intra.py
.rw-rw-r-- infatoshi infatoshi 5.1 KB Fri Apr 24 23:38:15 2026 chunk_intra_token_parallel.py
.rw-rw-r-- infatoshi infatoshi 15 KB Fri Apr 24 23:38:15 2026 fused_recurrent.py
.rw-rw-r-- infatoshi infatoshi 14 KB Fri Apr 24 23:38:15 2026 gate.py
.rw-rw-r-- infatoshi infatoshi 6.2 KB Fri Apr 24 23:38:15 2026 naive.py
.rw-rw-r-- infatoshi infatoshi 12 KB Fri Apr 24 23:38:15 2026 wy_fast.py{
"file_path": "/home/infatoshi/cuda/KernelBench-Hard/.venv/lib/python3.11/site-packages/fla/ops/kda/__init__.py"
}1 # Copyright (c) 2023-2026, Songlin Yang, Yu Zhang, Zhiyuan Li
2 #
3 # This source code is licensed under the MIT license found in the
4 # LICENSE file in the root directory of this source tree.
5 # For a list of all contributors, visit:
6 # https://github.com/fla-org/flash-linear-attention/graphs/contributors
7
8 from .chunk import chunk_kda
9 from .fused_recurrent import fused_recurrent_kda
10
11 __all__ = [
12 "chunk_kda",
13 "fused_recurrent_kda",
14 ]
15 {
"file_path": "/home/infatoshi/cuda/KernelBench-Hard/.venv/lib/python3.11/site-packages/fla/ops/kda/naive.py"
}1 # Copyright (c) 2023-2026, Songlin Yang, Yu Zhang, Zhiyuan Li
2 #
3 # This source code is licensed under the MIT license found in the
4 # LICENSE file in the root directory of this source tree.
5 # For a list of all contributors, visit:
6 # https://github.com/fla-org/flash-linear-attention/graphs/contributors
7
8 import torch
9 from einops import rearrange
10
11
12 def naive_recurrent_kda(
13 q: torch.Tensor,
14 k: torch.Tensor,
15 v: torch.Tensor,
16 g: torch.Tensor,
17 beta: torch.Tensor,
18 scale: float | None = None,
19 initial_state: torch.Tensor | None = None,
20 output_final_state: bool = False,
21 ):
22 r"""
23 Args:
24 q (torch.Tensor):
25 Queries of shape ``[B, T, H, K]``.
26 k (torch.Tensor):
27 Keys of shape ``[B, T, H, K]``.
28 v (torch.Tensor):
29 Values of shape ``[B, T, HV, V]``. ``HV`` must be divisible by ``H``.
30 g (torch.Tensor):
31 Per-dimension decay gates (log-space) of shape ``[B, T, HV, K]``.
32 beta (torch.Tensor):
33 Beta scalars of shape ``[B, T, HV]``.
34 scale (Optional[float]):
35 Scale factor. Defaults to ``1 / sqrt(K)``.
36 initial_state (Optional[torch.Tensor]):
37 Initial state of shape ``[B, HV, K, V]``.
38 output_final_state (bool):
39 Whether to return the final state.
40
41 Returns:
42 A tuple ``(o, S)`` where ``o`` has
... (5399 more chars){
"file_path": "/home/infatoshi/cuda/KernelBench-Hard/.venv/lib/python3.11/site-packages/fla/ops/kda/chunk.py"
}1 # Copyright (c) 2023-2026, Songlin Yang, Yu Zhang, Zhiyuan Li
2 #
3 # This source code is licensed under the MIT license found in the
4 # LICENSE file in the root directory of this source tree.
5 # For a list of all contributors, visit:
6 # https://github.com/fla-org/flash-linear-attention/graphs/contributors
7
8 # Related files are modified and supported by the Moonshot AI Team
9
10 import torch
11
12 from fla.modules.l2norm import l2norm_bwd, l2norm_fwd
13 from fla.ops.backends import dispatch
14 from fla.ops.cp import FLACPContext
15 from fla.ops.kda.chunk_bwd import chunk_kda_bwd
16 from fla.ops.kda.chunk_fwd import chunk_kda_fwd
17 from fla.ops.utils.index import prepare_chunk_indices
18 from fla.utils import autocast_custom_bwd, autocast_custom_fwd, input_guard
19
20
21 class ChunkKDAFunction(torch.autograd.Function):
22 @staticmethod
23 @input_guard
24 @autocast_custom_fwd
25 def forward(
26 ctx,
27 q: torch.Tensor,
28 k: torch.Tensor,
29 v: torch.Tensor,
30 g: torch.Tensor,
31 beta: torch.Tensor,
32 A_log: torch.Tensor,
33 dt_bias: torch.Tensor,
34 scale: float,
35 initial_state: torch.Tensor,
36 output_final_state: bool = False,
37 use_qk_l2norm_in_kernel: bool = False,
38 use_gate_in_kernel: bool = False,
39 cu_seqlens: torch.LongTensor | None = None,
40 cu_seqlens_cpu: torch.LongTensor | None = None,
41 safe_gate:
... (16820 more chars){
"file_path": "/home/infatoshi/cuda/KernelBench-Hard/.venv/lib/python3.11/site-packages/fla/ops/kda/chunk_fwd.py"
}1 # Copyright (c) 2023-2026, Songlin Yang, Yu Zhang, Zhiyuan Li
2 #
3 # This source code is licensed under the MIT license found in the
4 # LICENSE file in the root directory of this source tree.
5 # For a list of all contributors, visit:
6 # https://github.com/fla-org/flash-linear-attention/graphs/contributors
7
8 import torch
9
10 from fla.ops.common.chunk_delta_h import chunk_gated_delta_rule_fwd_h
11 from fla.ops.cp import FLACPContext
12 from fla.ops.cp.chunk_delta_h import chunk_gated_delta_rule_fwd_h_pre_process, compress_h0
13 from fla.ops.gla.chunk import chunk_gla_fwd_o_gk
14 from fla.ops.kda.chunk_intra import chunk_kda_fwd_intra
15 from fla.ops.kda.gate import kda_gate_chunk_cumsum
16 from fla.ops.utils import chunk_local_cumsum
17 from fla.ops.utils.constant import RCP_LN2
18
19
20 def chunk_kda_fwd(
21 q: torch.Tensor,
22 k: torch.Tensor,
23 v: torch.Tensor,
24 g: torch.Tensor,
25 beta: torch.Tensor,
26 scale: float,
27 initial_state: torch.Tensor,
28 output_final_state: bool,
29 cu_seqlens: torch.LongTensor | None = None,
30 cu_seqlens_cpu: torch.LongTensor | None = None,
31 chunk_indices: torch.LongTensor | None = None,
32 chunk_size: int = 64,
33 safe_gate: bool = False,
34 lower_bound: float | None = None,
35 use_gate_in_kernel: bool = False,
36 A_log: torch.Tensor | None = None,
37 dt_bias: torch.Tensor | None = None,
38 disable_recompute: bool = False,
39 return_intermediate_
... (3298 more chars){
"command": "ls /home/infatoshi/cuda/KernelBench-Hard/.venv/lib/python3.11/site-packages/fla/ops/common/ | head -20",
"description": "List common FLA ops"
}__init__.py
__pycache__
backends
chunk_delta_h.py
chunk_h.py
chunk_h_parallel.py
chunk_h_split.py
chunk_o.py
chunk_scaled_dot_kkt.py
fused_chunk.py
fused_recurrent.py
intracard_cp.py{
"file_path": "/home/infatoshi/cuda/KernelBench-Hard/.venv/lib/python3.11/site-packages/fla/ops/kda/chunk_intra.py",
"limit": 100
}1 # Copyright (c) 2023-2026, Songlin Yang, Yu Zhang, Zhiyuan Li
2 #
3 # This source code is licensed under the MIT license found in the
4 # LICENSE file in the root directory of this source tree.
5 # For a list of all contributors, visit:
6 # https://github.com/fla-org/flash-linear-attention/graphs/contributors
7
8 import torch
9 import triton
10 import triton.language as tl
11
12 from fla.ops.kda.chunk_intra_token_parallel import chunk_kda_fwd_intra_token_parallel
13 from fla.ops.kda.wy_fast import recompute_w_u_fwd
14 from fla.ops.utils import prepare_chunk_indices
15 from fla.ops.utils.op import exp2, gather
16 from fla.utils import IS_GATHER_SUPPORTED, IS_TF32_SUPPORTED, autotune_cache_kwargs
17
18 if IS_TF32_SUPPORTED:
19 SOLVE_TRIL_DOT_PRECISION = tl.constexpr('tf32')
20 else:
21 SOLVE_TRIL_DOT_PRECISION = tl.constexpr('ieee')
22
23 ################################################################################
24 # Fused inter + solve_tril kernel: compute off-diagonal Akk and solve in one pass
25 ################################################################################
26
27
28 @triton.heuristics({
29 'IS_VARLEN': lambda args: args['cu_seqlens'] is not None,
30 })
31 @triton.autotune(
32 configs=[
33 triton.Config({'BK': BK}, num_warps=num_warps)
34 for BK in [32, 64]
35 for num_warps in [1, 2, 4]
36 ],
37 key=["H", "HV", "K", "BC"],
38 **autotune_cache_kwargs,
39 )
40 @triton.jit(do_not_specialize=['T
... (1794 more chars){
"todos": [
{
"content": "Study FLA's chunk_kda Triton kernel and reference impl",
"activeForm": "Studying FLA's chunk_kda Triton kernel and reference impl",
"status": "in_progress"
},
{
"content": "Check GPU state and torch/triton/cuda versions",
"activeForm": "Checking GPU state and toolchain versions",
"status": "completed"
},
{
"content": "Implement chunk-parallel KDA forward in Triton",
"activeForm": "Implementing chunk-parallel KDA forward in Triton",
"status": "pending"
},
{
"content": "Run check.
... (314 more chars)Todos have been modified successfully. Ensure that you continue to use the todo list to track your progress. Please proceed with the current tasks if applicable{
"file_path": "/home/infatoshi/cuda/KernelBench-Hard/src/hardware/rtx_pro_6000.py"
}1 """RTX PRO 6000 Blackwell Workstation — SM120, consumer-lineage Blackwell.
2
3 Peak tensor-core throughputs are dense-matrix advertised peaks. Actual kernels
4 will see 60-85% of peak on well-tuned code.
5 """
6 from dataclasses import dataclass
7
8
9 @dataclass(frozen=True)
10 class HardwareTarget:
11 name: str
12 sm: str
13 vram_gb: int
14 peak_bandwidth_gb_s: float # DRAM
15 peak_tflops_dense: dict[str, float] # dtype -> TFLOPS
16
17
18 RTX_PRO_6000 = HardwareTarget(
19 name="RTX PRO 6000 Blackwell Workstation",
20 sm="sm_120a",
21 vram_gb=96,
22 peak_bandwidth_gb_s=1800.0,
23 peak_tflops_dense={
24 "fp4": 800.0,
25 "nvfp4": 800.0,
26 "mxfp4": 800.0,
27 "fp6": 800.0,
28 "fp8": 400.0,
29 "bf16": 200.0,
30 "fp16": 200.0,
31 "tf32": 100.0,
32 "fp32": 12.0, # non-tensor-core SIMT fp32
33 "int8": 400.0,
34 "int4": 800.0,
35 },
36 )
37 {
"file_path": "/home/infatoshi/cuda/KernelBench-Hard/.venv/lib/python3.11/site-packages/fla/ops/common/chunk_delta_h.py",
"limit": 100
}1 # Copyright (c) 2023-2026, Songlin Yang, Yu Zhang, Zhiyuan Li
2 #
3 # This source code is licensed under the MIT license found in the
4 # LICENSE file in the root directory of this source tree.
5 # For a list of all contributors, visit:
6 # https://github.com/fla-org/flash-linear-attention/graphs/contributors
7
8 import torch
9 import triton
10 import triton.language as tl
11
12 from fla.ops.backends import dispatch
13 from fla.ops.utils import prepare_chunk_indices, prepare_chunk_offsets
14 from fla.ops.utils.op import exp, exp2
15 from fla.utils import IS_NVIDIA_HOPPER, USE_CUDA_GRAPH, autotune_cache_kwargs, check_shared_mem
16
17 NUM_WARPS = [2, 4] if IS_NVIDIA_HOPPER else [2, 4, 8, 16]
18
19
20 @triton.heuristics({
21 'USE_G': lambda args: args['g'] is not None,
22 'USE_GK': lambda args: args['gk'] is not None,
23 'USE_INITIAL_STATE': lambda args: args['h0'] is not None,
24 'STORE_FINAL_STATE': lambda args: args['ht'] is not None,
25 'SAVE_NEW_VALUE': lambda args: args['v_new'] is not None,
26 'IS_VARLEN': lambda args: args['cu_seqlens'] is not None,
27 })
28 @triton.autotune(
29 configs=[
30 triton.Config({'BV': BV}, num_warps=num_warps, num_stages=num_stages)
31 for num_warps in [2, 4]
32 for num_stages in ([2, 3, 4] if check_shared_mem('ampere') else [2, 1])
33 for BV in ([32, 64] if check_shared_mem('ada') else [32])
34 ],
35 key=['H', 'HV', 'K', 'V', 'BT', 'USE_EXP2', 'TRANSPOSE_STATE'],
36
... (2017 more chars){
"file_path": "/home/infatoshi/cuda/KernelBench-Hard/.venv/lib/python3.11/site-packages/fla/ops/kda/wy_fast.py"
}1 # Copyright (c) 2023-2026, Songlin Yang, Yu Zhang, Zhiyuan Li
2 #
3 # This source code is licensed under the MIT license found in the
4 # LICENSE file in the root directory of this source tree.
5 # For a list of all contributors, visit:
6 # https://github.com/fla-org/flash-linear-attention/graphs/contributors
7
8 import torch
9 import triton
10 import triton.language as tl
11
12 from fla.ops.utils import prepare_chunk_indices
13 from fla.ops.utils.op import exp2
14 from fla.utils import autotune_cache_kwargs, check_shared_mem
15
16
17 @triton.heuristics({
18 'STORE_QG': lambda args: args['qg'] is not None,
19 'STORE_KG': lambda args: args['kg'] is not None,
20 'IS_VARLEN': lambda args: args['cu_seqlens'] is not None,
21 })
22 @triton.autotune(
23 configs=[
24 triton.Config({}, num_warps=num_warps, num_stages=num_stages)
25 for num_warps in [2, 4, 8]
26 for num_stages in [2, 3, 4]
27 ],
28 key=['H', 'HV', 'K', 'V', 'BT', 'BK', 'BV', 'IS_VARLEN'],
29 **autotune_cache_kwargs,
30 )
31 @triton.jit(do_not_specialize=['T'])
32 def recompute_w_u_fwd_kda_kernel(
33 q,
34 k,
35 qg,
36 kg,
37 v,
38 beta,
39 w,
40 u,
41 A,
42 gk,
43 cu_seqlens,
44 chunk_indices,
45 T,
46 H: tl.constexpr,
47 HV: tl.constexpr,
48 K: tl.constexpr,
49 V: tl.constexpr,
50 BT: tl.constexpr,
51 BK: tl.constexpr,
52 BV: tl.constexpr,
53 STORE_QG: tl.constexpr,
54 STO
... (11760 more chars){
"file_path": "/home/infatoshi/cuda/KernelBench-Hard/.venv/lib/python3.11/site-packages/fla/ops/kda/chunk_intra.py",
"offset": 100,
"limit": 300
}100 o_i = tl.arange(0, BC)
101 m_tc1 = (i_tc1 + o_i) < T
102 m_tc2 = (i_tc2 + o_i) < T
103 m_tc3 = (i_tc3 + o_i) < T
104
105 b_Aqk10 = tl.zeros([BC, BC], dtype=tl.float32)
106 b_Akk10 = tl.zeros([BC, BC], dtype=tl.float32)
107
108 b_Aqk20 = tl.zeros([BC, BC], dtype=tl.float32)
109 b_Akk20 = tl.zeros([BC, BC], dtype=tl.float32)
110 b_Aqk21 = tl.zeros([BC, BC], dtype=tl.float32)
111 b_Akk21 = tl.zeros([BC, BC], dtype=tl.float32)
112
113 b_Aqk30 = tl.zeros([BC, BC], dtype=tl.float32)
114 b_Akk30 = tl.zeros([BC, BC], dtype=tl.float32)
115 b_Aqk31 = tl.zeros([BC, BC], dtype=tl.float32)
116 b_Akk31 = tl.zeros([BC, BC], dtype=tl.float32)
117 b_Aqk32 = tl.zeros([BC, BC], dtype=tl.float32)
118 b_Akk32 = tl.zeros([BC, BC], dtype=tl.float32)
119
120 ################################################################################
121 # off-diagonal blocks
122 ################################################################################
123 for i_k in range(tl.cdiv(K, BK)):
124 o_k = i_k * BK + tl.arange(0, BK)
125 m_k = o_k < K
126
127 p_k0 = tl.make_block_ptr(k, (T, K), (H*K, 1), (i_tc0, i_k * BK), (BC, BK), (1, 0))
128 p_g0 = tl.make_block_ptr(g, (T, K), (HV*K, 1), (i_tc0, i_k * BK), (BC, BK), (1, 0))
129 b_k0 = tl.load(p_k0, boundary_check=(0, 1)).to(tl.float32)
130 b_g0 = tl.load(p_g0, boundary_check=(0, 1)).to(tl.float32)
131
132 if i_tc1 <
... (13945 more chars){
"file_path": "/home/infatoshi/cuda/KernelBench-Hard/.venv/lib/python3.11/site-packages/fla/ops/kda/chunk_intra_token_parallel.py"
}1 # Copyright (c) 2023-2026, Songlin Yang, Yu Zhang, Zhiyuan Li
2 #
3 # This source code is licensed under the MIT license found in the
4 # LICENSE file in the root directory of this source tree.
5 # For a list of all contributors, visit:
6 # https://github.com/fla-org/flash-linear-attention/graphs/contributors
7
8 # Token-parallel implementation of KDA intra chunk kernel
9
10 import torch
11 import triton
12 import triton.language as tl
13
14 from fla.ops.utils.op import exp2
15 from fla.utils import autotune_cache_kwargs
16
17
18 @triton.heuristics({
19 'IS_VARLEN': lambda args: args['cu_seqlens'] is not None,
20 })
21 @triton.autotune(
22 configs=[
23 triton.Config({'BH': BH}, num_warps=num_warps)
24 for BH in [1, 2, 4, 8]
25 for num_warps in [1, 2, 4, 8]
26 ],
27 key=["K", "H", "HV"],
28 **autotune_cache_kwargs,
29 )
30 @triton.jit(do_not_specialize=['T', 'N'])
31 def chunk_kda_fwd_kernel_intra_token_parallel(
32 q,
33 k,
34 g,
35 beta,
36 Aqk,
37 Akk,
38 scale,
39 cu_seqlens,
40 N,
41 T,
42 H: tl.constexpr,
43 HV: tl.constexpr,
44 K: tl.constexpr,
45 BT: tl.constexpr,
46 BC: tl.constexpr,
47 BH: tl.constexpr,
48 IS_VARLEN: tl.constexpr,
49 ):
50 i_tg, i_hg = tl.program_id(0), tl.program_id(1)
51
52 if IS_VARLEN:
53 i_n = 0
54 left, right = 0, N
55
56 # Unrolled binary search (max B=2^32)
57 # We can limit iterati
... (4334 more chars){
"command": "<512 chars \u2014 see below>",
"description": "Verify reference runs"
}cd /home/infatoshi/cuda/KernelBench-Hard/problems/02_kda_cutlass && uv run python -c "
import sys
sys.path.insert(0, '.')
import torch
import reference
device = torch.device('cuda:0')
m = reference.Model(*reference.get_init_inputs()).to(device).eval()
inputs = [t.to(device) if hasattr(t, 'to') else t for t in reference.get_inputs()]
with torch.no_grad():
o = m(*inputs)
print('shape:', o.shape, 'dtype:', o.dtype)
print('o stats:', o.float().min().item(), o.float().max().item(), o.float().mean().item())
"shape: torch.Size([2, 1024, 8, 128]) dtype: torch.bfloat16
o stats: -0.01116943359375 0.01171875 -1.1375359463272616e-06--- /dev/null
+++ b//home/infatoshi/cuda/KernelBench-Hard/problems/02_kda_cutlass/solution.py
@@ -0,0 +1,290 @@
+"""Custom KDA forward (chunk form) using Triton kernels for SM120.
+
+Implementation notes:
+ - g is rescaled by RCP_LN2 so we can use exp2 throughout (faster than exp).
+ - In-chunk cumsum on g is fused into the intra kernel via tl.cumsum.
+ - Phase 1 (intra): per-chunk forward substitution on the (I + L) factor and
+ materialize the chunk-local Akk_inv, w, u tensors.
+ - Phase 2 (inter): sequential per-(b, h) loop over chunks computing the state
+ h and the output o, with the V dim split into BV-tiles for parallelism.
+"""
+from __future__ import annotations
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+RCP_LN2 = 1.4426950408889634 # log2(e), so exp(x) = exp2(x * RCP_LN2)
+
+
+@triton.jit
+def _kda_intra_kernel(
+ k_ptr,
+ v_ptr,
+ g_in_ptr,
+ g_out_ptr,
+ beta_ptr,
+ A_ptr,
+ w_ptr,
+ u_ptr,
+ T,
+ H: tl.constexpr,
+ K: tl.constexpr,
+ V: tl.constexpr,
+ BT: tl.constexpr,
+ SCALE_LOG: tl.constexpr,
+):
+ """Per-chunk preparation: cumsum g; build Akk_inv*beta_col; w, u.
+
+ Grid: (B*H, NT)
+ """
+ i_bh = tl.program_id(0)
+ i_n = tl.program_id(1)
+ i_b = i_bh // H
+ i_h = i_bh % H
+ NT = T // BT
+
+ offs_t = tl.arange(0, BT)
+ offs_k = tl.arange(0, K)
+ offs_v = tl.arange(0, V)
+
+ # ---------- Pointers ----------
+ # k, g: (B, T, H, K) — strides (T*H*K, H*K, K, 1)
+ k_base = i_b * T * H * K + i_n * BT * H * K + i_h * K
+ g_base = k_base
+ # v: (B, T, H, V)
+ v_base = i_b * T * H * V + i_n * BT * H * V + i_h * V
+ # beta: (B, T, H)
+ beta_base = i_b * T * H + i_n * BT * H + i_h
+ # A: (B, NT, H, BT, BT) contiguous on the last two dims
+ A_base = i_b * NT * H * BT * BT + i_n * H * BT * BT + i_h * BT * BT
+
+ # ---------- Load k, g, beta ----------
+ kg_ptrs_t = offs_t[:, None] * H * K + offs_k[None, :]
+ b_k = tl.load(k_ptr + k_base + kg_ptrs_t) # (BT, K) bf16
+ b_g_raw = tl.load(g_in_ptr + g_base + kg_ptrs_t).to(tl.float32) * SCALE_LOG
+ b_g = tl.cumsum(b_g_raw, axis=0) # (BT, K) fp32
+
+ # Save the cumsumed g back to global (used by inter kernel).
+ tl.store(g_out_ptr + g_base + kg_ptrs_t, b_g)
+
+ b_beta = tl.load(beta_ptr + beta_base + offs_t * H).to(tl.float32) # (BT,)
+
+ # ---------- Akk_raw = -beta_row * (k*exp(g)) @ (k*exp(-g)).T ----------
+ b_eg = tl.exp2(b_g) # (BT, K) fp32
+ b_eng = tl.exp2(-b_g) # (BT, K) fp32
+ b_kf = b_k.to(tl.float32) * b_eg
+ b_kg = b_k.to(tl.float32) * b_eng
+ b_A = tl.dot(b_kf.to(b_k.dtype), tl.trans(b_kg).to(b_k.dtype)) # (BT, BT) fp32
+ mask_strict_lower = offs_t[:, None] > offs_t[None, :]
+ b_A = tl.where(mask_strict_lower, -b_A * b_beta[:, None], 0.0)
+
+ # ---------- Forward substitute (BT iterations) ----------
+ # for i in 1..BT-1: A[i, :i] += sum_m A[i, m] * A[m, :i] for m < i
+ for i in tl.static_range(1, BT):
+ is_row = (offs_t == i)
+ # row i of A (BT,)
+ row_i = tl.sum(tl.where(is_row[:, None], b_A, 0.0), axis=0)
+ # (row_i @ A)[j] = sum_m row_i[m] * A[m, j]
+ update = tl.sum(row_i[:, None] * b_A, axis=0)
+ # write only the lower part of row i
+ write_mask = is_row[:, None] & (offs_t[None, :] < i)
+ b_A = tl.where(write_mask, b_A + update[None, :], b_A)
+
+ # Add identity on the diagonal
+ b_A = tl.where(offs_t[:, None] == offs_t[None, :], b_A + 1.0, b_A)
+ # Multiply by beta_col: A *= beta[None, :]
+ b_A = b_A * b_beta[None, :]
+
+ # ---------- Store A_inv (fp32 for downstream precision) ----------
+ A_ptrs = offs_t[:, None] * BT + offs_t[None, :]
+ tl.store(A_ptr + A_base + A_ptrs, b_A)
+
+ # ---------- w = A @ (g.exp() * k) ----------
+ b_w = tl.dot(b_A.to(b_k.dtype), b_kf.to(b_k.dtype)) # (BT, K) fp32
+ tl.store(w_ptr + k_base + kg_ptrs_t, b_w.to(b_k.dtype))
+
+ # ---------- u = A @ v ----------
+ v_ptrs_t = offs_t[:, None] * H * V + offs_v[None, :]
+ b_v = tl.load(v_ptr + v_base + v_ptrs_t) # (BT, V) bf16
+ b_u = tl.dot(b_A.to(b_v.dtype), b_v) # (BT, V) fp32
+ tl.store(u_ptr + v_base + v_ptrs_t, b_u.to(b_v.dtype))
+
+
+@triton.jit
+def _kda_inter_kernel(
+ q_ptr,
+ k_ptr,
+ g_ptr,
+ w_ptr,
+ u_ptr,
+ o_ptr,
+ scale,
+ T,
+ H: tl.constexpr,
+ K: tl.constexpr,
+ V: tl.constexpr,
+ BT: tl.constexpr,
+ BV: tl.constexpr,
+ SCALE_LOG: tl.constexpr,
+):
+ """Inter-chunk recurrence: maintain state S, compute o.
+
+ Grid: (B*H, V/BV)
+ Each program handles one (b, h) pair and one V-slice. Sequential over chunks.
+ """
+ i_bh = tl.program_id(0)
+ i_v = tl.program_id(1)
+ i_b = i_bh // H
+ i_h = i_bh % H
+ NT = T // BT
+
+ offs_t = tl.arange(0, BT)
+ offs_k = tl.arange(0, K)
+ offs_v = tl.arange(0, BV)
+
+ v_lo = i_v * BV
+
+ # Strict-upper mask for Aqk (zero out strict-upper, keep diag and below)
+ mask_su = offs_t[:, None] < offs_t[None, :]
+
+ # State S: (K, BV) fp32
+ b_S = tl.zeros((K, BV), dtype=tl.float32)
+
+ for i_n in range(NT):
+ # Pointers
+ k_base = i_b * T * H * K + i_n * BT * H * K + i_h * K
+ v_base = i_b * T * H * V + i_n * BT * H * V + i_h * V + v_lo
+
+ kg_ptrs_t = offs_t[:, None] * H * K + offs_k[None, :]
+ v_ptrs_t = offs_t[:, None] * H * V + offs_v[None, :]
+
+ # Load q, k, g, w, u_slice
+ b_q = tl.load(q_ptr + k_base + kg_ptrs_t) # (BT, K) bf16
+ b_k = tl.load(k_ptr + k_base + kg_ptrs_t) # (BT, K) bf16
+ b_g = tl.load(g_ptr + k_base + kg_ptrs_t) # (BT, K) fp32
+ b_w = tl.load(w_ptr + k_base + kg_ptrs_t) # (BT, K) bf16
+ b_u = tl.load(u_ptr + v_base + v_ptrs_t) # (BT, BV) bf16
+
+
... (4241 more chars){
"file_path": "/home/infatoshi/cuda/KernelBench-Hard/problems/02_kda_cutlass/solution.py",
"content": "<9839 chars \u2014 see diff>"
}File created successfully at: /home/infatoshi/cuda/KernelBench-Hard/problems/02_kda_cutlass/solution.py (file state is current in your context — no need to Read it back){
"todos": [
{
"content": "Study FLA's chunk_kda Triton kernel and reference impl",
"activeForm": "Studying FLA's chunk_kda Triton kernel and reference impl",
"status": "completed"
},
{
"content": "Check GPU state and torch/triton/cuda versions",
"activeForm": "Checking GPU state and toolchain versions",
"status": "completed"
},
{
"content": "Implement chunk-parallel KDA forward in Triton",
"activeForm": "Implementing chunk-parallel KDA forward in Triton",
"status": "in_progress"
},
{
"content": "Run chec
... (316 more chars)Todos have been modified successfully. Ensure that you continue to use the todo list to track your progress. Please proceed with the current tasks if applicable{
"command": "<896 chars \u2014 see below>",
"description": "Smoke test solution"
}uv run python -c "
import sys
sys.path.insert(0, '.')
import torch
import reference
import solution
device = torch.device('cuda:0')
ref_m = reference.Model(*reference.get_init_inputs()).to(device).eval()
sol_m = solution.Model(*solution.get_init_inputs()).to(device).eval()
torch.manual_seed(42)
inputs = [t.to(device) if hasattr(t, 'to') else t for t in reference.get_inputs()]
with torch.no_grad():
ref_out = ref_m(*inputs)
sol_out = sol_m(*inputs)
print('ref:', ref_out.shape, ref_out.dtype, 'sol:', sol_out.shape, sol_out.dtype)
diff = (ref_out.float() - sol_out.float()).abs()
print('max abs diff:', diff.max().item(), 'mean abs diff:', diff.mean().item())
print('ref max:', ref_out.float().abs().max().item(), 'sol max:', sol_out.float().abs().max().item())
print('ref[0,0,0,:5]:', ref_out[0,0,0,:5].tolist())
print('sol[0,0,0,:5]:', sol_out[0,0,0,:5].tolist())
" 2>&1 | head -100ref: torch.Size([2, 1024, 8, 128]) torch.bfloat16 sol: torch.Size([2, 1024, 8, 128]) torch.bfloat16
max abs diff: 6.103515625e-05 mean abs diff: 5.398797839006875e-06
ref max: 0.01171875 sol max: 0.01171875
ref[0,0,0,:5]: [0.0002288818359375, -0.000102996826171875, -3.2901763916015625e-05, 0.000164031982421875, -6.379559636116028e-08]
sol[0,0,0,:5]: [0.00022792816162109375, -0.00010251998901367188, -3.2901763916015625e-05, 0.000164031982421875, -6.379559636116028e-08]{
"command": "uv run python check.py 2>&1 | tail -30",
"description": "Run check.py"
}PASS{
"command": "uv run python benchmark.py 2>&1 | tail -50",
"description": "Run benchmark"
}Command running in background with ID: b6c9yuro1. Output is being written to: /tmp/claude-1002/-home-infatoshi-cuda-KernelBench-Hard/400239da-f68f-4aa1-8a16-bfbfbe931013/tasks/b6c9yuro1.output{
"command": "sleep 60 && tail -70 /tmp/claude-1002/-home-infatoshi-cuda-KernelBench-Hard/400239da-f68f-4aa1-8a16-bfbfbe931013/tasks/b6c9yuro1.output",
"description": "Wait for benchmark and show results"
}<tool_use_error>Blocked: sleep 60 followed by: tail -70 /tmp/claude-1002/-home-infatoshi-cuda-KernelBench-Hard/400239da-f68f-4aa1-8a16-bfbfbe931013/tasks/b6c9yuro1.output. To wait for a condition, use Monitor with an until-loop (e.g. `until <check>; do sleep 2; done`). To wait for a command you started, use run_in_background: true. Do not chain shorter sleeps to work around this block.</tool_use_error>{
"query": "select:Monitor",
"max_results": 1
}{
"command": "until ! pgrep -f \"benchmark.py\" > /dev/null; do sleep 5; done; tail -100 /tmp/claude-1002/-home-infatoshi-cuda-KernelBench-Hard/400239da-f68f-4aa1-8a16-bfbfbe931013/tasks/b6c9yuro1.output",
"description": "Wait for benchmark to finish",
"run_in_background": true
}Command running in background with ID: bld2hpt2m. Output is being written to: /tmp/claude-1002/-home-infatoshi-cuda-KernelBench-Hard/400239da-f68f-4aa1-8a16-bfbfbe931013/tasks/bld2hpt2m.output{
"command": "cat /tmp/claude-1002/-home-infatoshi-cuda-KernelBench-Hard/400239da-f68f-4aa1-8a16-bfbfbe931013/tasks/bld2hpt2m.output",
"description": "Check benchmark results"
}(Bash completed with no output){
"command": "ps aux | grep -E \"benchmark|python\" | grep -v grep | head -10",
"description": "Check running python processes"
}root 1907 0.0 0.0 126720 24796 ? Ssl 01:49 0:00 /usr/bin/python3 /usr/share/unattended-upgrades/unattended-upgrade-shutdown --wait-for-signal
infatos+ 14352 0.0 0.0 344112 95032 ? Ssl 01:50 0:01 /home/infatoshi/hermes-agent/.venv/bin/python -m hermes_cli.main gateway run --replace
infatos+ 101246 0.0 0.0 17404 2160 ? SN 04:15 0:00 timeout 35891 codex exec -m gpt-5.5 -c model_reasoning_effort="xhigh" --dangerously-bypass-approvals-and-sandbox --skip-git-repo-check You are running inside TerrariaBench with a live Terraria/tModLoader client. Your objective is open-ended long-horizon progress: make as much concrete progress as possible toward beating Terraria. Do not ask the user for confirmation. Do not stop just because a starter checkpoint passes. Keep playing until the external 10-hour budget ends, the game dies, or there is a fundamental unrecoverable harness failure. Operate through the TerrariaBench helper commands appended below. Use screenshots frequently. Treat checkpoint logs as authoritative metadata, not as the full objective. Maintain a concise progress.md in the current workspace after each meaningful milestone, including current position, inventory/gear, world discoveries, deaths, blockers, and next actions. Priorities: 1. Establish control: verify screenshots, movement, inventory, mining, placement, combat, and menu interaction. 2. Survive: avoid night deaths where possible, gather wood/stone/ore, make shelter/w
... (22014 more chars){
"command": "until ! pgrep -f \"benchmark.py\" > /dev/null 2>&1; do sleep 10; done; echo \"BENCHMARK_DONE\"; tail -100 /tmp/claude-1002/-home-infatoshi-cuda-KernelBench-Hard/400239da-f68f-4aa1-8a16-bfbfbe931013/tasks/b6c9yuro1.output",
"description": "Wait for first benchmark to complete",
"timeout_ms": 3600000,
"persistent": false
}Monitor started (task bpeclfxa4, timeout 3600000ms). You will be notified on each event. Keep working — do not poll or sleep. Events may arrive while you are waiting for the user — an event is not their reply.{
"file_path": "/home/infatoshi/cuda/KernelBench-Hard/problems/02_kda_cutlass/solution.py",
"offset": 1,
"limit": 130
}1 """Custom KDA forward (chunk form) using Triton kernels for SM120.
2
3 Implementation notes:
4 - g is rescaled by RCP_LN2 so we can use exp2 throughout (faster than exp).
5 - In-chunk cumsum on g is fused into the intra kernel via tl.cumsum.
6 - Phase 1 (intra): per-chunk forward substitution on the (I + L) factor and
7 materialize the chunk-local Akk_inv, w, u tensors.
8 - Phase 2 (inter): sequential per-(b, h) loop over chunks computing the state
9 h and the output o, with the V dim split into BV-tiles for parallelism.
10 """
11 from __future__ import annotations
12
13 import torch
14 import torch.nn as nn
15 import triton
16 import triton.language as tl
17
18 RCP_LN2 = 1.4426950408889634 # log2(e), so exp(x) = exp2(x * RCP_LN2)
19
20
21 @triton.jit
22 def _kda_intra_kernel(
23 k_ptr,
24 v_ptr,
25 g_in_ptr,
26 g_out_ptr,
27 beta_ptr,
28 A_ptr,
29 w_ptr,
30 u_ptr,
31 T,
32 H: tl.constexpr,
33 K: tl.constexpr,
34 V: tl.constexpr,
35 BT: tl.constexpr,
36 SCALE_LOG: tl.constexpr,
37 ):
38 """Per-chunk preparation: cumsum g; build Akk_inv*beta_col; w, u.
39
40 Grid: (B*H, NT)
41 """
42 i_bh = tl.program_id(0)
43 i_n = tl.program_id(1)
44 i_b = i_bh // H
45 i_h = i_bh % H
46 NT = T // BT
47
48 offs_t = tl.arange(0, BT)
49 offs_k = tl.arange(0, K)
50 offs_v = tl.arange(0, V)
51
52 # ---------- Pointers ----------
53 # k, g: (B, T, H, K) — strides (
... (3300 more chars){
"command": "cat /tmp/claude-1002/-home-infatoshi-cuda-KernelBench-Hard/400239da-f68f-4aa1-8a16-bfbfbe931013/tasks/b6c9yuro1.output",
"description": "Check current benchmark output"
}(Bash completed with no output){
"command": "<1321 chars \u2014 see below>",
"description": "Quick timing run on solution only"
}uv run python -c "
import sys, time
sys.path.insert(0, '.')
import torch
import solution
device = torch.device('cuda:0')
torch.manual_seed(42)
shapes = [
(2, 1024, 8, 128, 128, 64),
(2, 2048, 8, 128, 128, 64),
(1, 4096, 8, 128, 128, 64),
(1, 2048, 4, 128, 128, 64),
]
for B, T, H, K, V, CHUNK in shapes:
solution.B, solution.T, solution.H, solution.K, solution.V = B, T, H, K, V
sol_m = solution.Model(B, T, H, K, V, CHUNK).to(device).eval()
q = torch.randn(B, T, H, K, dtype=torch.bfloat16, device=device) * 0.1
k = torch.randn(B, T, H, K, dtype=torch.bfloat16, device=device) * 0.1
v = torch.randn(B, T, H, V, dtype=torch.bfloat16, device=device) * 0.1
g = torch.randn(B, T, H, K, dtype=torch.float32, device=device) * 0.1 - 0.05
beta = torch.sigmoid(torch.randn(B, T, H, dtype=torch.bfloat16, device=device))
# warmup
with torch.no_grad():
for _ in range(3):
sol_m(q, k, v, g, beta)
torch.cuda.synchronize()
# measure
n_iters = 50
t0 = time.perf_counter()
with torch.no_grad():
for _ in range(n_iters):
sol_m(q, k, v, g, beta)
torch.cuda.synchronize()
t1 = time.perf_counter()
ms = (t1 - t0) / n_iters * 1000
print(f'B={B} T={T} H={H} K={K} V={V}: {ms:.3f} ms')
" 2>&1 | tail -10B=2 T=1024 H=8 K=128 V=128: 0.183 ms
B=2 T=2048 H=8 K=128 V=128: 0.357 ms
B=1 T=4096 H=8 K=128 V=128: 0.671 ms
B=1 T=2048 H=4 K=128 V=128: 0.334 ms{
"command": "<129 chars \u2014 see below>",
"description": "Check if SOTA available"
}uv run python -c "
import sys
sys.path.insert(0, '.')
import sota
print('SOTA available:', sota.is_available())
" 2>&1 | tail -10SOTA available: True{
"command": "<1246 chars \u2014 see below>",
"description": "Time SOTA for comparison"
}uv run python -c "
import sys, time
sys.path.insert(0, '.')
import torch
import sota
device = torch.device('cuda:0')
torch.manual_seed(42)
shapes = [
(2, 1024, 8, 128, 128, 64),
(2, 2048, 8, 128, 128, 64),
(1, 4096, 8, 128, 128, 64),
(1, 2048, 4, 128, 128, 64),
]
for B, T, H, K, V, CHUNK in shapes:
q = torch.randn(B, T, H, K, dtype=torch.bfloat16, device=device) * 0.1
k = torch.randn(B, T, H, K, dtype=torch.bfloat16, device=device) * 0.1
v = torch.randn(B, T, H, V, dtype=torch.bfloat16, device=device) * 0.1
g = torch.randn(B, T, H, K, dtype=torch.float32, device=device) * 0.1 - 0.05
beta = torch.sigmoid(torch.randn(B, T, H, dtype=torch.bfloat16, device=device))
scale = K**-0.5
# warmup
with torch.no_grad():
for _ in range(3):
sota.sota_forward(q, k, v, g, beta, scale=scale)
torch.cuda.synchronize()
# measure
n_iters = 50
t0 = time.perf_counter()
with torch.no_grad():
for _ in range(n_iters):
sota.sota_forward(q, k, v, g, beta, scale=scale)
torch.cuda.synchronize()
t1 = time.perf_counter()
ms = (t1 - t0) / n_iters * 1000
print(f'B={B} T={T} H={H} K={K} V={V}: SOTA {ms:.3f} ms')
" 2>&1 | tail -10B=2 T=1024 H=8 K=128 V=128: SOTA 0.327 ms
B=2 T=2048 H=8 K=128 V=128: SOTA 0.295 ms
B=1 T=4096 H=8 K=128 V=128: SOTA 0.298 ms
B=1 T=2048 H=4 K=128 V=128: SOTA 0.283 ms{
"command": "cat /tmp/claude-1002/-home-infatoshi-cuda-KernelBench-Hard/400239da-f68f-4aa1-8a16-bfbfbe931013/tasks/b6c9yuro1.output 2>/dev/null | head -50",
"description": "Check benchmark progress"
}(Bash completed with no output){
"todos": [
{
"content": "Study FLA's chunk_kda Triton kernel and reference impl",
"activeForm": "Studying FLA's chunk_kda Triton kernel and reference impl",
"status": "completed"
},
{
"content": "Check GPU state and torch/triton/cuda versions",
"activeForm": "Checking GPU state and toolchain versions",
"status": "completed"
},
{
"content": "Implement chunk-parallel KDA forward in Triton (baseline working)",
"activeForm": "Baseline KDA Triton kernel done",
"status": "completed"
},
{
"content": "Run check
... (805 more chars)Todos have been modified successfully. Ensure that you continue to use the todo list to track your progress. Please proceed with the current tasks if applicable{
"task_id": "bpeclfxa4"
}message: 'Successfully stopped task: bpeclfxa4 (until ! pgrep -f "benchmark.py" > /dev/null 2>&1; do sleep 10; done; echo "BENCHMARK_DONE"; tail -100 /tmp/claude-1002/-home-infatoshi-cuda-KernelBench-Hard/400239da-f68f-4aa1-8a16-bfbfbe931013/tasks/b6c9yuro1.output)'
task_id: 'bpeclfxa4'
task_type: 'local_bash'
command: 'until ! pgrep -f "benchmark.py" > /dev/null 2>&1; do sleep 10; done; echo "BENCHMARK_DONE"; tail -100 /tmp/claude-1002/-home-infatoshi-cuda-KernelBench-Hard/400239da-f68f-4aa1-8a16-bfbfbe931013/tasks/b6c9yuro1.output'{
"query": "select:TaskStop",
"max_results": 1
}{
"command": "pkill -f benchmark.py 2>/dev/null; sleep 2; ps aux | grep -v grep | grep -E \"benchmark.py\" | head -3",
"description": "Stop ongoing benchmark"
}Exit code 144