{
"problem": "02_kda_cutlass",
"harness": "opencode",
"model": "openrouter-pinned/minimax/minimax-m2.7",
"reasoning_effort": "",
"has_solution": false,
"correct": false,
"peak_fraction": null,
"elapsed_seconds": 2700,
"harness_exit_code": 124,
"session_complete": false,
"usage": {"input_tokens": 61606, "output_tokens": 672, "cache_read_tokens": 167008, "cache_creation_tokens": 0, "reasoning_tokens": 1946, "total_cost_usd": null}
}
{
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/problems/02_kda_cutlass/reference.py"
}<path>/home/infatoshi/cuda/KernelBench-Hard/problems/02_kda_cutlass/reference.py</path>
<type>file</type>
<content>
1: """Naive PyTorch reference for Kimi Delta Attention (KDA) forward, chunk form.
2:
3: This is the correctness oracle, NOT the SOTA baseline. It mirrors the
4: chunk-parallel formulation in fla/ops/kda/naive.py (Songlin Yang et al.)
5: without any Triton or CUDA optimization.
6:
7: Inputs (per the FLA convention):
8: q, k : (B, T, H, K) bf16 -- queries / keys
9: v : (B, T, H, V) bf16 -- values
10: g : (B, T, H, K) fp32 -- per-channel log-decay (in-chunk cumsum applied)
11: beta : (B, T, H) bf16 -- write strength
12:
13: Output:
14: o : (B, T, H, V) bf16
15:
16: The agent must reproduce this output (within bf16 tolerance) using a CUTLASS
17: CuTe kernel on SM120 -- NOT by calling fla.ops.chunk_kda directly.
18: """
19: from __future__ import annotations
20:
21: import torch
22: import torch.nn as nn
23: from einops import rearrange
24:
25: OP_TYPE = "linear_attention"
26: SUPPORTED_PRECISIONS = ["bf16"]
27: HARDWARE_REQUIRED = ["RTX_PRO_6000", "H100", "B200"]
28:
29:
30: def _naive_chunk_kda(
31: q: torch.Tensor,
32: k: torch.Tensor,
33: v: torch.Tensor,
34: g: torch.Tensor,
35: beta: torch.Tensor,
36: scale: float,
37: chunk_size: int = 64,
38: ) -> torch.Tensor:
39: """KDA forward, no initial state, no final state. Returns o with v's dtype."""
40: dtype = v.dtype
41: B, T,
... (4359 more chars){
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/problems/02_kda_cutlass/check.py"
}<path>/home/infatoshi/cuda/KernelBench-Hard/problems/02_kda_cutlass/check.py</path>
<type>file</type>
<content>
1: """Correctness runner for KDA forward (chunk form).
2:
3: Runs solution.Model vs reference.Model across all shapes in shapes.py, 3 seeds
4: each, with per-dtype atol/rtol (bf16 default 1e-2 plus a 5e-2 override for
5: this problem). Also rejects forbidden ops by grep.
6: """
7: import re
8: import sys
9: from pathlib import Path
10:
11: import torch
12: import yaml
13:
14: # Make the repo's src/ importable
15: REPO_ROOT = Path(__file__).resolve().parents[2]
16: sys.path.insert(0, str(REPO_ROOT))
17:
18: from src.eval.correctness import check_correctness # noqa: E402
19:
20:
21: def _apply_shape(reference, shape):
22: """Override reference's module-level shape shims so get_inputs/get_init_inputs match."""
23: for k, v in shape.items():
24: setattr(reference, k, v)
25:
26:
27: def main():
28: try:
29: import reference
30: import shapes
31: import solution
32: except Exception as e:
33: print(f"FAIL: import error: {e}")
34: sys.exit(1)
35:
36: problem_yaml = Path("problem.yaml")
37: meta = yaml.safe_load(problem_yaml.read_text()) if problem_yaml.exists() else {}
38:
39: # --- Forbidden-op check ------------------------------------------------
40: sol_src = Path("solution.py").read_text() if Path("solution.py").exists() else ""
41: for forbidden in meta.get("forbidden", [])
... (2934 more chars){
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/problems/02_kda_cutlass/benchmark.py"
}<path>/home/infatoshi/cuda/KernelBench-Hard/problems/02_kda_cutlass/benchmark.py</path>
<type>file</type>
<content>
1: """Roofline benchmark for KDA forward (chunk form).
2:
3: For each shape: times eager reference, compiled reference, SOTA (FLA's Triton
4: chunk_kda, if available on this GPU), and the agent's solution. Reports
5: achieved TFLOPS, GB/s, and peak_fraction.
6:
7: Output lines the harness picks up:
8: shape=<idx> variant=<name> tflops=<N> gbps=<N> ms=<N>
9: peak_fraction: <N> (geomean over shapes of solution's peak_fraction)
10: """
11: import sys
12: from math import exp, log
13: from pathlib import Path
14:
15: import torch
16: import yaml
17:
18: REPO_ROOT = Path(__file__).resolve().parents[2]
19: sys.path.insert(0, str(REPO_ROOT))
20:
21: from src.eval.roofline import compute_gbps, compute_tflops, peak_fraction # noqa: E402
22: from src.eval.timing import time_fn # noqa: E402
23: from src.hardware import get as get_hw # noqa: E402
24:
25:
26: def _eval_formula(expr: str, vars: dict) -> float:
27: return float(eval(expr, {"__builtins__": {}}, vars))
28:
29:
30: def _apply_shape(reference, shape):
31: for k, v in shape.items():
32: setattr(reference, k, v)
33:
34:
35: def main():
36: import reference
37: import shapes
38: import solution
39:
40: meta = yaml.safe_load(Path("problem.yaml").read_text())
41: hw = get_hw(meta["hardware"][0])
42: peak_tflops = hw.peak_tflops_dense.get(meta["peak_tflops_key"
... (3596 more chars){
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/problems/02_kda_cutlass/problem.yaml"
}<path>/home/infatoshi/cuda/KernelBench-Hard/problems/02_kda_cutlass/problem.yaml</path>
<type>file</type>
<content>
1: name: 02_kda_cutlass
2: display_name: "Kimi Delta Attention (chunk forward) via CUTLASS CuTe"
3: precision: bf16
4: regime: compute
5:
6: # Approximate FLOPs per forward call. KDA forward (chunk form) breakdown per
7: # token across all heads:
8: # - intra-chunk Aqk (q @ k.T with decay): 2*BT*K
9: # - intra-chunk Akk + cumulative invert : ~3*BT*K + BT^2
10: # - w = A @ (g.exp()*k): 2*BT*K
11: # - u = A @ v: 2*BT*V
12: # - inter-chunk q @ S: 2*K*V
13: # - inter-chunk (Aqk @ v_i): 2*BT*V
14: # - state update S += k.T @ v_i: 2*K*V
15: # Aggregating with a constant of 4 (typical for chunked linear attention) gives
16: # the formula below. Used only for roofline reporting -- not graded directly.
17: flops_formula: "4 * B * T * H * (K*V + CHUNK_SIZE*K + CHUNK_SIZE*V)"
18:
19: # Bytes moved (lower bound): one read of q,k,v,g,beta and one write of o.
20: bytes_formula: "B*T*H*K*2 + B*T*H*K*2 + B*T*H*V*2 + B*T*H*K*4 + B*T*H*2 + B*T*H*V*2"
21:
22: hardware: [RTX_PRO_6000]
23: peak_tflops_key: bf16 # KDA's heavy work is bf16 @ bf16 -> fp32 GEMM-equivalent
24: peak_bandwidth_key: dram
25:
26: # bf16 default tolerance from src/eval/correctness.py is 1e-2 atol/rtol; KDA's
27: # long recurrence accumulates more error than a single GEMM, so we relax slightly.
28: tolerance
... (1108 more chars){
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/problems/02_kda_cutlass/shapes.py"
}<path>/home/infatoshi/cuda/KernelBench-Hard/problems/02_kda_cutlass/shapes.py</path>
<type>file</type>
<content>
1: """Canonical shape sweep for KDA forward (chunk form).
2:
3: Mix of:
4: - short-context training-step scale (T=1024)
5: - mid-context (T=2048) which is the headline benchmark
6: - long-context that stresses the inter-chunk recurrence (T=4096)
7: - thin-batch decode-style (B=1, T=2048, fewer heads)
8:
9: Constraints:
10: - T % chunk_size == 0 (chunk_size = 64)
11: - K, V are the per-head channel dims; KDA in Kimi Linear uses K=V=128
12: """
13:
14: SHAPES = [
15: {"B": 2, "T": 1024, "H": 8, "K": 128, "V": 128, "CHUNK_SIZE": 64},
16: {"B": 2, "T": 2048, "H": 8, "K": 128, "V": 128, "CHUNK_SIZE": 64},
17: {"B": 1, "T": 4096, "H": 8, "K": 128, "V": 128, "CHUNK_SIZE": 64},
18: {"B": 1, "T": 2048, "H": 4, "K": 128, "V": 128, "CHUNK_SIZE": 64},
19: ]
(End of file - total 19 lines)
</content>{
"name": "kernel-optimization"
}<skill_content name="kernel-optimization">
# Skill: kernel-optimization
# Kernel Optimization
Unified reference for GPU/TPU kernel work across all platforms. This skill routes to specialized sub-references based on what you need.
## When this skill fires
- Writing or optimizing CUDA, Metal, Triton, HIP, or Pallas kernels
- Porting kernels between platforms (CUDA <-> Metal <-> Triton <-> HIP <-> Pallas)
- Debugging GPU code (OOM, NaN, race conditions, wrong results, hangs)
- Performance profiling and optimization
- Numerical precision issues (FP formats, mixed precision, tolerance)
- AMD MI300/MI350 or TPU v4/v5/v6/v7 specific work
## Sub-references (read on demand)
The following files contain deep platform-specific knowledge. Read the relevant one(s) based on the task:
| File | When to read |
|------|-------------|
| `~/.claude/skills/amd-gpu/SKILL.md` | AMD CDNA/RDNA, ROCm, HIP, MFMA, LDS, chiplet scheduling, HipKittens, AITER |
| `~/.claude/skills/tpu-opt/SKILL.md` | TPU v4-v7, MXU, Pallas kernels, JAX sharding, XLA gotchas, SplashAttention |
| `~/.claude/skills/port-kernel/SKILL.md` | Cross-platform porting: CUDA <-> Metal <-> Triton concept maps, API equivalents, gotchas |
| `~/.claude/skills/debug-gpu/SKILL.md` | OOM diagnosis, NaN tracing, race conditions, compute-sanitizer, FP format specs, tolerance tables |
| `~/.claude/skills/perf-optim/SKILL.md` | Profiling (nsys/ncu/Metal), roofline, occupancy, memory hierarchy, CUTLASS/CuTe, quantization, KernelBench |
| `
... (1462 more chars){
"filePath": "/home/infatoshi/.claude/skills/kernel/SKILL.md"
}<path>/home/infatoshi/.claude/skills/kernel/SKILL.md</path>
<type>file</type>
<content>
1: ---
2: name: kernel-optimization
3: description: Unified GPU/TPU kernel skill. Covers CUDA, Metal, Triton, AMD HIP, TPU Pallas, kernel porting, debugging, numerical precision, and performance optimization. Use when writing, optimizing, debugging, porting, or benchmarking GPU/TPU kernels on any platform.
4: ---
5:
6: # Kernel Optimization
7:
8: Unified reference for GPU/TPU kernel work across all platforms. This skill routes to specialized sub-references based on what you need.
9:
10: ## When this skill fires
11:
12: - Writing or optimizing CUDA, Metal, Triton, HIP, or Pallas kernels
13: - Porting kernels between platforms (CUDA <-> Metal <-> Triton <-> HIP <-> Pallas)
14: - Debugging GPU code (OOM, NaN, race conditions, wrong results, hangs)
15: - Performance profiling and optimization
16: - Numerical precision issues (FP formats, mixed precision, tolerance)
17: - AMD MI300/MI350 or TPU v4/v5/v6/v7 specific work
18:
19: ## Sub-references (read on demand)
20:
21: The following files contain deep platform-specific knowledge. Read the relevant one(s) based on the task:
22:
23: | File | When to read |
24: |------|-------------|
25: | `~/.claude/skills/amd-gpu/SKILL.md` | AMD CDNA/RDNA, ROCm, HIP, MFMA, LDS, chiplet scheduling, HipKittens, AITER |
26: | `~/.claude/skills/tpu-opt/SKILL.md` | TPU v4-v7, MXU, Pallas kernels, JAX sharding, XLA gotchas, SplashAttention |
27: | `~/.claude/s
... (1791 more chars){
"filePath": "/home/infatoshi/.claude/skills/perf-optim/SKILL.md"
}<path>/home/infatoshi/.claude/skills/perf-optim/SKILL.md</path>
<type>file</type>
<content>
1: ---
2: name: perf-optim
3: description: "[SUB-REFERENCE of kernel skill] Performance optimization. Do NOT invoke directly -- use /kernel instead, which reads this file on demand."
4: ---
5:
6: # Performance Optimization Atlas
7:
8: Comprehensive, hierarchical reference for optimizing GPU workloads. Pick combinations from this atlas based on your bottleneck profile. **Always profile before optimizing.**
9:
10: ---
11:
12: ## 0. FIRST PRINCIPLES: SPEED OF LIGHT, THEN PROFILE
13:
14: ### Rule
15: Never optimize blind. But before you even profile, compute the **speed of light** -- the theoretical physical limit for this operation. Then profile to measure the gap. The gap IS the optimization opportunity.
16:
17: ### Speed of Light Analysis
18: 1. **Compute the limit**: What does physics allow? Peak FLOPS, peak bandwidth, minimum latency for this exact workload
19: 2. **Profile current state**: Where are we now?
20: 3. **Measure the gap**: speed_of_light - measured = optimization budget
21: 4. **Don't continuous-improve from current state**: Strip to zero. If you built this from scratch knowing only the physical limits, what would you build? Then reason about every piece of the gap between that and current
22: - "It takes 74ms" -- don't optimize to 72ms. Compute that physics allows 6ms. Now reason about the 6-to-74 gap. The rest might be well-reasoned compromises, but at least yo
... (53580 more chars){
"command": "nvidia-smi --query-gpu=name,compute_cap,memory.total,driver --format=csv,noheader",
"description": "Check GPU info"
}Field "driver" is not a valid field to query.
{
"command": "nvcc --version 2>/dev/null | head -3",
"description": "Check CUDA compiler version"
}nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2026 NVIDIA Corporation
Built on Mon_Mar_02_09:52:23_PM_PST_2026
{
"command": "nvidia-smi -L",
"description": "List GPUs"
}GPU 0: NVIDIA RTX PRO 6000 Blackwell Workstation Edition (UUID: GPU-d594fb7a-abcb-bad5-1fc4-675b478a47df)
{
"command": "python3 -c \"import torch; print(f'CUDA: {torch.version.cuda}, Device: {torch.cuda.get_device_name()}, BF16: {torch.cuda.is_bf16_supported()}')\"",
"description": "Check PyTorch CUDA support"
}CUDA: 13.0, Device: NVIDIA RTX PRO 6000 Blackwell Workstation Edition, BF16: True
{
"command": "which ncu nsys 2>/dev/null || echo \"Profiling tools location check\"",
"description": "Check profiling tools"
}/usr/local/cuda-13/bin/ncu
/usr/local/cuda-13/bin/nsys