{
"problem": "02_kda_cutlass",
"harness": "opencode",
"model": "openrouter-pinned/qwen/qwen3.6-plus",
"reasoning_effort": "",
"has_solution": false,
"correct": false,
"peak_fraction": null,
"elapsed_seconds": 717,
"harness_exit_code": 0,
"session_complete": true,
"usage": {"input_tokens": 194396, "output_tokens": 999, "cache_read_tokens": 0, "cache_creation_tokens": 0, "reasoning_tokens": 36539, "total_cost_usd": null}
}
{
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/problems/02_kda_cutlass"
}<path>/home/infatoshi/cuda/KernelBench-Hard/problems/02_kda_cutlass</path>
<type>directory</type>
<entries>
benchmark.py
check.py
problem.yaml
PROMPT.txt
reference.py
shapes.py
sota.py
(7 entries)
</entries>{
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/problems/02_kda_cutlass/reference.py"
}<path>/home/infatoshi/cuda/KernelBench-Hard/problems/02_kda_cutlass/reference.py</path>
<type>file</type>
<content>
1: """Naive PyTorch reference for Kimi Delta Attention (KDA) forward, chunk form.
2:
3: This is the correctness oracle, NOT the SOTA baseline. It mirrors the
4: chunk-parallel formulation in fla/ops/kda/naive.py (Songlin Yang et al.)
5: without any Triton or CUDA optimization.
6:
7: Inputs (per the FLA convention):
8: q, k : (B, T, H, K) bf16 -- queries / keys
9: v : (B, T, H, V) bf16 -- values
10: g : (B, T, H, K) fp32 -- per-channel log-decay (in-chunk cumsum applied)
11: beta : (B, T, H) bf16 -- write strength
12:
13: Output:
14: o : (B, T, H, V) bf16
15:
16: The agent must reproduce this output (within bf16 tolerance) using a CUTLASS
17: CuTe kernel on SM120 -- NOT by calling fla.ops.chunk_kda directly.
18: """
19: from __future__ import annotations
20:
21: import torch
22: import torch.nn as nn
23: from einops import rearrange
24:
25: OP_TYPE = "linear_attention"
26: SUPPORTED_PRECISIONS = ["bf16"]
27: HARDWARE_REQUIRED = ["RTX_PRO_6000", "H100", "B200"]
28:
29:
30: def _naive_chunk_kda(
31: q: torch.Tensor,
32: k: torch.Tensor,
33: v: torch.Tensor,
34: g: torch.Tensor,
35: beta: torch.Tensor,
36: scale: float,
37: chunk_size: int = 64,
38: ) -> torch.Tensor:
39: """KDA forward, no initial state, no final state. Returns o with v's dtype."""
40: dtype = v.dtype
41: B, T,
... (4359 more chars){
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/problems/02_kda_cutlass/check.py"
}<path>/home/infatoshi/cuda/KernelBench-Hard/problems/02_kda_cutlass/check.py</path>
<type>file</type>
<content>
1: """Correctness runner for KDA forward (chunk form).
2:
3: Runs solution.Model vs reference.Model across all shapes in shapes.py, 3 seeds
4: each, with per-dtype atol/rtol (bf16 default 1e-2 plus a 5e-2 override for
5: this problem). Also rejects forbidden ops by grep.
6: """
7: import re
8: import sys
9: from pathlib import Path
10:
11: import torch
12: import yaml
13:
14: # Make the repo's src/ importable
15: REPO_ROOT = Path(__file__).resolve().parents[2]
16: sys.path.insert(0, str(REPO_ROOT))
17:
18: from src.eval.correctness import check_correctness # noqa: E402
19:
20:
21: def _apply_shape(reference, shape):
22: """Override reference's module-level shape shims so get_inputs/get_init_inputs match."""
23: for k, v in shape.items():
24: setattr(reference, k, v)
25:
26:
27: def main():
28: try:
29: import reference
30: import shapes
31: import solution
32: except Exception as e:
33: print(f"FAIL: import error: {e}")
34: sys.exit(1)
35:
36: problem_yaml = Path("problem.yaml")
37: meta = yaml.safe_load(problem_yaml.read_text()) if problem_yaml.exists() else {}
38:
39: # --- Forbidden-op check ------------------------------------------------
40: sol_src = Path("solution.py").read_text() if Path("solution.py").exists() else ""
41: for forbidden in meta.get("forbidden", [])
... (2934 more chars){
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/problems/02_kda_cutlass/benchmark.py"
}<path>/home/infatoshi/cuda/KernelBench-Hard/problems/02_kda_cutlass/benchmark.py</path>
<type>file</type>
<content>
1: """Roofline benchmark for KDA forward (chunk form).
2:
3: For each shape: times eager reference, compiled reference, SOTA (FLA's Triton
4: chunk_kda, if available on this GPU), and the agent's solution. Reports
5: achieved TFLOPS, GB/s, and peak_fraction.
6:
7: Output lines the harness picks up:
8: shape=<idx> variant=<name> tflops=<N> gbps=<N> ms=<N>
9: peak_fraction: <N> (geomean over shapes of solution's peak_fraction)
10: """
11: import sys
12: from math import exp, log
13: from pathlib import Path
14:
15: import torch
16: import yaml
17:
18: REPO_ROOT = Path(__file__).resolve().parents[2]
19: sys.path.insert(0, str(REPO_ROOT))
20:
21: from src.eval.roofline import compute_gbps, compute_tflops, peak_fraction # noqa: E402
22: from src.eval.timing import time_fn # noqa: E402
23: from src.hardware import get as get_hw # noqa: E402
24:
25:
26: def _eval_formula(expr: str, vars: dict) -> float:
27: return float(eval(expr, {"__builtins__": {}}, vars))
28:
29:
30: def _apply_shape(reference, shape):
31: for k, v in shape.items():
32: setattr(reference, k, v)
33:
34:
35: def main():
36: import reference
37: import shapes
38: import solution
39:
40: meta = yaml.safe_load(Path("problem.yaml").read_text())
41: hw = get_hw(meta["hardware"][0])
42: peak_tflops = hw.peak_tflops_dense.get(meta["peak_tflops_key"
... (3596 more chars){
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/problems/02_kda_cutlass/shapes.py"
}<path>/home/infatoshi/cuda/KernelBench-Hard/problems/02_kda_cutlass/shapes.py</path>
<type>file</type>
<content>
1: """Canonical shape sweep for KDA forward (chunk form).
2:
3: Mix of:
4: - short-context training-step scale (T=1024)
5: - mid-context (T=2048) which is the headline benchmark
6: - long-context that stresses the inter-chunk recurrence (T=4096)
7: - thin-batch decode-style (B=1, T=2048, fewer heads)
8:
9: Constraints:
10: - T % chunk_size == 0 (chunk_size = 64)
11: - K, V are the per-head channel dims; KDA in Kimi Linear uses K=V=128
12: """
13:
14: SHAPES = [
15: {"B": 2, "T": 1024, "H": 8, "K": 128, "V": 128, "CHUNK_SIZE": 64},
16: {"B": 2, "T": 2048, "H": 8, "K": 128, "V": 128, "CHUNK_SIZE": 64},
17: {"B": 1, "T": 4096, "H": 8, "K": 128, "V": 128, "CHUNK_SIZE": 64},
18: {"B": 1, "T": 2048, "H": 4, "K": 128, "V": 128, "CHUNK_SIZE": 64},
19: ]
(End of file - total 19 lines)
</content>{
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/problems/02_kda_cutlass/problem.yaml"
}<path>/home/infatoshi/cuda/KernelBench-Hard/problems/02_kda_cutlass/problem.yaml</path>
<type>file</type>
<content>
1: name: 02_kda_cutlass
2: display_name: "Kimi Delta Attention (chunk forward) via CUTLASS CuTe"
3: precision: bf16
4: regime: compute
5:
6: # Approximate FLOPs per forward call. KDA forward (chunk form) breakdown per
7: # token across all heads:
8: # - intra-chunk Aqk (q @ k.T with decay): 2*BT*K
9: # - intra-chunk Akk + cumulative invert : ~3*BT*K + BT^2
10: # - w = A @ (g.exp()*k): 2*BT*K
11: # - u = A @ v: 2*BT*V
12: # - inter-chunk q @ S: 2*K*V
13: # - inter-chunk (Aqk @ v_i): 2*BT*V
14: # - state update S += k.T @ v_i: 2*K*V
15: # Aggregating with a constant of 4 (typical for chunked linear attention) gives
16: # the formula below. Used only for roofline reporting -- not graded directly.
17: flops_formula: "4 * B * T * H * (K*V + CHUNK_SIZE*K + CHUNK_SIZE*V)"
18:
19: # Bytes moved (lower bound): one read of q,k,v,g,beta and one write of o.
20: bytes_formula: "B*T*H*K*2 + B*T*H*K*2 + B*T*H*V*2 + B*T*H*K*4 + B*T*H*2 + B*T*H*V*2"
21:
22: hardware: [RTX_PRO_6000]
23: peak_tflops_key: bf16 # KDA's heavy work is bf16 @ bf16 -> fp32 GEMM-equivalent
24: peak_bandwidth_key: dram
25:
26: # bf16 default tolerance from src/eval/correctness.py is 1e-2 atol/rtol; KDA's
27: # long recurrence accumulates more error than a single GEMM, so we relax slightly.
28: tolerance
... (1108 more chars){
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/problems/02_kda_cutlass/sota.py"
}<path>/home/infatoshi/cuda/KernelBench-Hard/problems/02_kda_cutlass/sota.py</path>
<type>file</type>
<content>
1: """SOTA reference for KDA forward: fla.ops.kda.chunk_kda (Triton).
2:
3: The agent's solution is forbidden from importing this module path (see
4: problem.yaml.forbidden). This file is only used by benchmark.py to draw
5: the SOTA reference line.
6:
7: If FLA's Triton kernel does not run on SM120 (Blackwell consumer-lineage --
8: some Triton kernels in FLA target Hopper TMA), is_available() returns False
9: and benchmark.py omits the SOTA variant. The H100 reference is documented
10: in problem.yaml for context.
11: """
12: from __future__ import annotations
13:
14: import torch
15:
16:
17: def _import_fla():
18: try:
19: from fla.ops.kda import chunk_kda # noqa: F401
20: return chunk_kda
21: except Exception:
22: return None
23:
24:
25: def sota_forward(
26: q: torch.Tensor,
27: k: torch.Tensor,
28: v: torch.Tensor,
29: g: torch.Tensor,
30: beta: torch.Tensor,
31: scale: float | None = None,
32: ) -> torch.Tensor:
33: """Run FLA's Triton chunk_kda. Returns o (B, T, H, V) in v's dtype."""
34: chunk_kda = _import_fla()
35: if chunk_kda is None:
36: raise RuntimeError("fla.ops.kda.chunk_kda unavailable")
37: # FLA's chunk_kda has a richer signature (A_log, dt_bias, l2norm, gates, ...).
38: # We need the bare forward: pass A_log/dt_bias as None, gates off, no l2norm.
39: #
... (1352 more chars){
"command": "nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv 2>&1 | head -5",
"description": "Check GPU info"
}name, driver_version, memory.total [MiB]
NVIDIA RTX PRO 6000 Blackwell Workstation Edition, 595.58.03, 97887 MiB
{
"command": "uv run python -c \"import triton; print(triton.__version__)\" 2>&1",
"description": "Check Triton version"
}3.6.0
{
"command": "ls /home/infatoshi/cuda/KernelBench-Hard/src/eval/",
"description": "Check src eval structure"
}correctness.py
__init__.py
__pycache__
report.py
roofline.py
shapes.py
timing.py
{
"command": "uv run python -c \"from fla.ops.kda import chunk_kda; print('FLA available')\" 2>&1",
"description": "Check if FLA is available"
}Traceback (most recent call last):
File "<string>", line 1, in <module>
ModuleNotFoundError: No module named 'fla'
{
"command": "find /home/infatoshi/cuda -path \"*/fla/ops/kda*\" -type f 2>/dev/null | head -20",
"description": "Find FLA KDA source files"
}/home/infatoshi/cuda/KernelBench-v3/.venv/lib/python3.11/site-packages/fla/ops/kda/__init__.py
/home/infatoshi/cuda/KernelBench-v3/.venv/lib/python3.11/site-packages/fla/ops/kda/chunk_intra_token_parallel.py
/home/infatoshi/cuda/KernelBench-v3/.venv/lib/python3.11/site-packages/fla/ops/kda/naive.py
/home/infatoshi/cuda/KernelBench-v3/.venv/lib/python3.11/site-packages/fla/ops/kda/chunk_bwd.py
/home/infatoshi/cuda/KernelBench-v3/.venv/lib/python3.11/site-packages/fla/ops/kda/fused_recurrent.py
/home/infatoshi/cuda/KernelBench-v3/.venv/lib/python3.11/site-packages/fla/ops/kda/gate.py
/home/infatoshi/cuda/KernelBench-v3/.venv/lib/python3.11/site-packages/fla/ops/kda/chunk.py
/home/infatoshi/cuda/KernelBench-v3/.venv/lib/python3.11/site-packages/fla/ops/kda/chunk_fwd.py
/home/infatoshi/cuda/KernelBench-v3/.venv/lib/python3.11/site-packages/fla/ops/kda/chunk_intra.py
/home/infatoshi/cuda/KernelBench-v3/.venv/lib/python3.11/site-packages/fla/ops/kda/__pycache__/gate.cpython-311.pyc
/home/infatoshi/cuda/KernelBench-v3/.venv/lib/python3.11/site-packages/fla/ops/kda/__pycache__/wy_fast.cpython-311.pyc
/home/infatoshi/cuda/KernelBench-v3/.venv/lib/python3.11/site-packages/fla/ops/kda/__pycache__/gate.cpython-312.pyc
/home/infatoshi/cuda/KernelBench-v3/.venv/lib/python3.11/site-packages/fla/ops/kda/__pycache__/chunk_fwd.cpython-312.pyc
/home/infatoshi/cuda/KernelBench-v3/.venv/lib/python3.11/site-packages/fla/ops/kda/__pycache__/fused_recurrent.cpython-311.pyc
/home/infatoshi/cuda/Kerne
... (711 more chars){
"todos": [
{
"content": "Write initial Triton kernel for chunk KDA forward",
"status": "in_progress",
"priority": "high"
},
{
"content": "Run check.py to verify correctness",
"status": "pending",
"priority": "high"
},
{
"content": "Iterate on bugs until PASS",
"status": "pending",
"priority": "high"
},
{
"content": "Benchmark with benchmark.py",
"status": "pending",
"priority": "medium"
}
]
}content: 'Write initial Triton kernel for chunk KDA forward'
status: 'in_progress'
priority: 'high'
content: 'Run check.py to verify correctness'
status: 'pending'
priority: 'high'
content: 'Iterate on bugs until PASS'
status: 'pending'
priority: 'high'
content: 'Benchmark with benchmark.py'
status: 'pending'
priority: 'medium'{
"filePath": "/home/infatoshi/cuda/KernelBench-v3/.venv/lib/python3.11/site-packages/fla/ops/kda/naive.py"
}{
"filePath": "/home/infatoshi/cuda/KernelBench-v3/.venv/lib/python3.11/site-packages/fla/ops/kda/chunk_fwd.py"
}{
"name": "kernel-optimization"
}<skill_content name="kernel-optimization">
# Skill: kernel-optimization
# Kernel Optimization
Unified reference for GPU/TPU kernel work across all platforms. This skill routes to specialized sub-references based on what you need.
## When this skill fires
- Writing or optimizing CUDA, Metal, Triton, HIP, or Pallas kernels
- Porting kernels between platforms (CUDA <-> Metal <-> Triton <-> HIP <-> Pallas)
- Debugging GPU code (OOM, NaN, race conditions, wrong results, hangs)
- Performance profiling and optimization
- Numerical precision issues (FP formats, mixed precision, tolerance)
- AMD MI300/MI350 or TPU v4/v5/v6/v7 specific work
## Sub-references (read on demand)
The following files contain deep platform-specific knowledge. Read the relevant one(s) based on the task:
| File | When to read |
|------|-------------|
| `~/.claude/skills/amd-gpu/SKILL.md` | AMD CDNA/RDNA, ROCm, HIP, MFMA, LDS, chiplet scheduling, HipKittens, AITER |
| `~/.claude/skills/tpu-opt/SKILL.md` | TPU v4-v7, MXU, Pallas kernels, JAX sharding, XLA gotchas, SplashAttention |
| `~/.claude/skills/port-kernel/SKILL.md` | Cross-platform porting: CUDA <-> Metal <-> Triton concept maps, API equivalents, gotchas |
| `~/.claude/skills/debug-gpu/SKILL.md` | OOM diagnosis, NaN tracing, race conditions, compute-sanitizer, FP format specs, tolerance tables |
| `~/.claude/skills/perf-optim/SKILL.md` | Profiling (nsys/ncu/Metal), roofline, occupancy, memory hierarchy, CUTLASS/CuTe, quantization, KernelBench |
| `
... (1462 more chars)