{
"problem": "01_fp8_gemm",
"harness": "opencode",
"model": "openrouter-pinned/minimax/minimax-m2.7",
"reasoning_effort": "",
"has_solution": false,
"correct": false,
"peak_fraction": null,
"elapsed_seconds": 160,
"harness_exit_code": 0,
"session_complete": true,
"usage": {"input_tokens": 143665, "output_tokens": 1449, "cache_read_tokens": 192128, "cache_creation_tokens": 0, "reasoning_tokens": 1291, "total_cost_usd": null}
}
{
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/problems/01_fp8_gemm/reference.py"
}<path>/home/infatoshi/cuda/KernelBench-Hard/problems/01_fp8_gemm/reference.py</path>
<type>file</type>
<content>
1: """Naive FP8 e4m3 GEMM reference (correctness only, NOT the SOTA baseline).
2:
3: We cast inputs to bf16 and use torch.matmul. The agent's solution must match
4: this numerically within the fp8 tolerance declared in problem.yaml.
5: """
6: import torch
7: import torch.nn as nn
8:
9: OP_TYPE = "gemm"
10: SUPPORTED_PRECISIONS = ["fp8_e4m3"]
11: HARDWARE_REQUIRED = ["RTX_PRO_6000", "H100", "B200"]
12:
13:
14: class Model(nn.Module):
15: """y = (x @ w.T).to(bf16), where x is fp8_e4m3 (M, K), w is fp8_e4m3 (N, K)."""
16:
17: def __init__(self, M: int, N: int, K: int):
18: super().__init__()
19: self.M, self.N, self.K = M, N, K
20: # Weights stored as parameters so state_dict is well-defined.
21: # We initialize in bf16 then cast; the fp8 dtype is set by get_inputs.
22: self.weight = nn.Parameter(torch.empty(N, K, dtype=torch.bfloat16))
23: nn.init.normal_(self.weight, std=0.02)
24:
25: def forward(self, x: torch.Tensor) -> torch.Tensor:
26: # Upcast to bf16 for the naive reference; the kernel equivalent would
27: # use mma.sync f8f6f4 kind directly.
28: x_bf = x.to(torch.bfloat16)
29: w_bf = self.weight.to(torch.bfloat16)
30: return x_bf @ w_bf.T # (M, N) bf16
31:
32:
33: M = 4096
34: N = 4096
35: K = 4096
36:
37:
38: def get_inputs():
39: # fp8_e4m3 inp
... (227 more chars){
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/problems/01_fp8_gemm/check.py"
}<path>/home/infatoshi/cuda/KernelBench-Hard/problems/01_fp8_gemm/check.py</path>
<type>file</type>
<content>
1: """Correctness runner for FP8 GEMM.
2:
3: Runs solution.Model vs reference.Model across all shapes in shapes.py, 3 seeds
4: each, with per-dtype atol/rtol. Also rejects forbidden ops by grep.
5: """
6: import re
7: import sys
8: from pathlib import Path
9:
10: import torch
11: import yaml
12:
13: # Make the repo's src/ importable
14: REPO_ROOT = Path(__file__).resolve().parents[2]
15: sys.path.insert(0, str(REPO_ROOT))
16:
17: from src.eval.correctness import check_correctness # noqa: E402
18:
19:
20: def main():
21: try:
22: import reference
23: import shapes
24: import solution
25: except Exception as e:
26: print(f"FAIL: import error: {e}")
27: sys.exit(1)
28:
29: problem_yaml = Path("problem.yaml")
30: meta = yaml.safe_load(problem_yaml.read_text()) if problem_yaml.exists() else {}
31:
32: # --- Forbidden-op check ------------------------------------------------
33: sol_src = Path("solution.py").read_text() if Path("solution.py").exists() else ""
34: for forbidden in meta.get("forbidden", []):
35: pat = re.escape(forbidden)
36: if re.search(pat, sol_src):
37: print(f"FAIL: forbidden op used: {forbidden}")
38: sys.exit(1)
39:
40: device = torch.device("cuda:0")
41: tol_override = meta.get("tolerance") or None
42:
43: # --- Per-shape c
... (2907 more chars){
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/problems/01_fp8_gemm/benchmark.py"
}<path>/home/infatoshi/cuda/KernelBench-Hard/problems/01_fp8_gemm/benchmark.py</path>
<type>file</type>
<content>
1: """Roofline benchmark for FP8 GEMM.
2:
3: For each shape: times eager reference, compiled reference, SOTA (if available),
4: and the agent's solution. Reports achieved TFLOPS, GB/s, and peak_fraction.
5:
6: Output lines the harness picks up:
7: shape=<idx> variant=<name> tflops=<N> gbps=<N> ms=<N>
8: peak_fraction: <N> (geomean over shapes of solution's peak_fraction)
9: """
10: import sys
11: from math import exp, log
12: from pathlib import Path
13:
14: import torch
15: import yaml
16:
17: REPO_ROOT = Path(__file__).resolve().parents[2]
18: sys.path.insert(0, str(REPO_ROOT))
19:
20: from src.eval.roofline import compute_gbps, compute_tflops, peak_fraction # noqa: E402
21: from src.eval.timing import time_fn # noqa: E402
22: from src.hardware import get as get_hw # noqa: E402
23:
24:
25: def _eval_formula(expr: str, vars: dict) -> float:
26: # Very small eval: only names from `vars` are valid.
27: return float(eval(expr, {"__builtins__": {}}, vars))
28:
29:
30: def main():
31: import reference
32: import shapes
33: import solution
34:
35: meta = yaml.safe_load(Path("problem.yaml").read_text())
36: hw = get_hw(meta["hardware"][0])
37: peak_tflops = hw.peak_tflops_dense.get(meta["peak_tflops_key"], 0.0)
38: peak_gbps = hw.peak_bandwidth_gb_s
39: regime = meta.get("regime", "compute")
40: flops_formula
... (3416 more chars){
"name": "kernel-optimization"
}<skill_content name="kernel-optimization">
# Skill: kernel-optimization
# Kernel Optimization
Unified reference for GPU/TPU kernel work across all platforms. This skill routes to specialized sub-references based on what you need.
## When this skill fires
- Writing or optimizing CUDA, Metal, Triton, HIP, or Pallas kernels
- Porting kernels between platforms (CUDA <-> Metal <-> Triton <-> HIP <-> Pallas)
- Debugging GPU code (OOM, NaN, race conditions, wrong results, hangs)
- Performance profiling and optimization
- Numerical precision issues (FP formats, mixed precision, tolerance)
- AMD MI300/MI350 or TPU v4/v5/v6/v7 specific work
## Sub-references (read on demand)
The following files contain deep platform-specific knowledge. Read the relevant one(s) based on the task:
| File | When to read |
|------|-------------|
| `~/.claude/skills/amd-gpu/SKILL.md` | AMD CDNA/RDNA, ROCm, HIP, MFMA, LDS, chiplet scheduling, HipKittens, AITER |
| `~/.claude/skills/tpu-opt/SKILL.md` | TPU v4-v7, MXU, Pallas kernels, JAX sharding, XLA gotchas, SplashAttention |
| `~/.claude/skills/port-kernel/SKILL.md` | Cross-platform porting: CUDA <-> Metal <-> Triton concept maps, API equivalents, gotchas |
| `~/.claude/skills/debug-gpu/SKILL.md` | OOM diagnosis, NaN tracing, race conditions, compute-sanitizer, FP format specs, tolerance tables |
| `~/.claude/skills/perf-optim/SKILL.md` | Profiling (nsys/ncu/Metal), roofline, occupancy, memory hierarchy, CUTLASS/CuTe, quantization, KernelBench |
| `
... (1462 more chars){
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/problems/01_fp8_gemm/shapes.py"
}<path>/home/infatoshi/cuda/KernelBench-Hard/problems/01_fp8_gemm/shapes.py</path>
<type>file</type>
<content>
1: """Canonical shape sweep for FP8 GEMM.
2:
3: Mix of:
4: - square aligned (the easy case)
5: - off-alignment K (common real-world failure mode for tile-quantized kernels)
6: - skinny (decode-like, memory-bound)
7: - rectangular (prefill with grouped attention)
8: """
9:
10: SHAPES = [
11: {"M": 4096, "N": 4096, "K": 4096}, # square aligned
12: {"M": 4096, "N": 4096, "K": 4127}, # K not multiple of 128 -> forces predicated tails
13: {"M": 32, "N": 8192, "K": 8192}, # skinny M (decode)
14: {"M": 4096, "N": 14336, "K": 4096}, # Llama3 up-proj shape
15: ]
(End of file - total 15 lines)
</content>{
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/problems/01_fp8_gemm/problem.yaml"
}<path>/home/infatoshi/cuda/KernelBench-Hard/problems/01_fp8_gemm/problem.yaml</path>
<type>file</type>
<content>
1: name: 01_fp8_gemm
2: display_name: "FP8 e4m3 GEMM"
3: precision: fp8_e4m3
4: regime: compute
5:
6: # Dense-equivalent FLOPs per invocation. Evaluated per-shape; the benchmark
7: # substitutes M/N/K from each shapes.py entry.
8: flops_formula: "2 * M * N * K"
9:
10: # Bytes moved (approximate): fp8 inputs in, bf16 output.
11: bytes_formula: "M*K + K*N + M*N*2"
12:
13: hardware: [RTX_PRO_6000]
14: peak_tflops_key: fp8 # index into src/hardware/rtx_pro_6000.py peak_tflops_dense
15: peak_bandwidth_key: dram
16:
17: # Per-dtype tolerance override (optional). Defaults are in src/eval/correctness.py.
18: tolerance:
19: fp8_e4m3fn: 0.15
20:
21: # Forbidden ops — using any of these in solution.py fails correctness post-hoc.
22: # torch._scaled_mm is Stanford-style cheating (just call the vendor library).
23: forbidden:
24: - "torch._scaled_mm"
25: - "torch.ops.aten._scaled_mm"
26:
27: sota:
28: name: "FlashInfer FP8 GEMM"
29: url: "https://github.com/flashinfer-ai/flashinfer"
30: function: "flashinfer.gemm.fp8_gemm"
31: deps:
32: - "flashinfer>=0.6.8"
33: # Documented H100 throughput for this shape (informational, not graded):
34: reference_throughput_tflops_h100: 550
35:
36: num_correct_trials: 3
37: num_perf_trials: 30
(End of file - total 37 lines)
</content>{
"filePath": "/home/infatoshi/.claude/skills/perf-optim/SKILL.md"
}<path>/home/infatoshi/.claude/skills/perf-optim/SKILL.md</path>
<type>file</type>
<content>
1: ---
2: name: perf-optim
3: description: "[SUB-REFERENCE of kernel skill] Performance optimization. Do NOT invoke directly -- use /kernel instead, which reads this file on demand."
4: ---
5:
6: # Performance Optimization Atlas
7:
8: Comprehensive, hierarchical reference for optimizing GPU workloads. Pick combinations from this atlas based on your bottleneck profile. **Always profile before optimizing.**
9:
10: ---
11:
12: ## 0. FIRST PRINCIPLES: SPEED OF LIGHT, THEN PROFILE
13:
14: ### Rule
15: Never optimize blind. But before you even profile, compute the **speed of light** -- the theoretical physical limit for this operation. Then profile to measure the gap. The gap IS the optimization opportunity.
16:
17: ### Speed of Light Analysis
18: 1. **Compute the limit**: What does physics allow? Peak FLOPS, peak bandwidth, minimum latency for this exact workload
19: 2. **Profile current state**: Where are we now?
20: 3. **Measure the gap**: speed_of_light - measured = optimization budget
21: 4. **Don't continuous-improve from current state**: Strip to zero. If you built this from scratch knowing only the physical limits, what would you build? Then reason about every piece of the gap between that and current
22: - "It takes 74ms" -- don't optimize to 72ms. Compute that physics allows 6ms. Now reason about the 6-to-74 gap. The rest might be well-reasoned compromises, but at least yo
... (53580 more chars){
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/src/hardware/rtx_pro_6000.py"
}<path>/home/infatoshi/cuda/KernelBench-Hard/src/hardware/rtx_pro_6000.py</path>
<type>file</type>
<content>
1: """RTX PRO 6000 Blackwell Workstation — SM120, consumer-lineage Blackwell.
2:
3: Peak tensor-core throughputs are dense-matrix advertised peaks. Actual kernels
4: will see 60-85% of peak on well-tuned code.
5: """
6: from dataclasses import dataclass
7:
8:
9: @dataclass(frozen=True)
10: class HardwareTarget:
11: name: str
12: sm: str
13: vram_gb: int
14: peak_bandwidth_gb_s: float # DRAM
15: peak_tflops_dense: dict[str, float] # dtype -> TFLOPS
16:
17:
18: RTX_PRO_6000 = HardwareTarget(
19: name="RTX PRO 6000 Blackwell Workstation",
20: sm="sm_120a",
21: vram_gb=96,
22: peak_bandwidth_gb_s=1800.0,
23: peak_tflops_dense={
24: "fp4": 800.0,
25: "nvfp4": 800.0,
26: "mxfp4": 800.0,
27: "fp6": 800.0,
28: "fp8": 400.0,
29: "bf16": 200.0,
30: "fp16": 200.0,
31: "tf32": 100.0,
32: "fp32": 12.0, # non-tensor-core SIMT fp32
33: "int8": 400.0,
34: "int4": 800.0,
35: },
36: )
(End of file - total 36 lines)
</content>{
"command": "ls -la /home/infatoshi/cuda/KernelBench-Hard/problems/01_fp8_gemm/",
"description": "List problem directory contents"
}total 40
drwxrwxr-x 2 infatoshi infatoshi 4096 Apr 27 05:59 .
drwxrwxr-x 10 infatoshi infatoshi 4096 Apr 24 00:30 ..
-rw-rw-r-- 1 infatoshi infatoshi 4228 Apr 26 03:11 benchmark.py
-rw-rw-r-- 1 infatoshi infatoshi 3805 Apr 24 00:37 check.py
-rw-rw-r-- 1 infatoshi infatoshi 1128 Apr 24 00:36 problem.yaml
-rw-rw-r-- 1 infatoshi infatoshi 1850 Apr 27 03:21 PROMPT.txt
-rw-rw-r-- 1 infatoshi infatoshi 1401 Apr 24 00:40 reference.py
-rw-rw-r-- 1 infatoshi infatoshi 564 Apr 24 00:36 shapes.py
-rw-rw-r-- 1 infatoshi infatoshi 1725 Apr 24 00:37 sota.py
{
"command": "nvidia-smi --query-gpu=name,memory.total,compute_cap --format=csv,noheader",
"description": "Check GPU name and compute capability"
}NVIDIA RTX PRO 6000 Blackwell Workstation Edition, 97887 MiB, 12.0
{
"command": "cd /home/infatoshi/cuda/KernelBench-Hard && nvcc --version && python -c \"import torch; print(torch.__version__); print(torch.cuda.is_available()); print(torch.cuda.get_device_name(0))\"",
"description": "Check CUDA and PyTorch versions"
}nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2026 NVIDIA Corporation
Built on Mon_Mar_02_09:52:23_PM_PST_2026
Cuda compilation tools, release 13.2, V13.2.51
Build cuda_13.2.r13.2/compiler.37434383_0
2.11.0+cu130
True
NVIDIA RTX PRO 6000 Blackwell Workstation Edition
{
"command": "<601 chars \u2014 see below>",
"description": "Run baseline benchmark"
}python -c "
import torch
import sys
sys.path.insert(0, '.')
from problems.01_fp8_gemm import reference, shapes
from src.eval.timing import time_fn
device = torch.device('cuda:0')
shape = shapes.SHAPES[0]
M, N, K = shape['M'], shape['N'], shape['K']
reference.M, reference.N, reference.K = M, N, K
init_args = reference.get_init_inputs()
model = reference.Model(*init_args).to(device).eval()
inputs = [t.to(device) for t in reference.get_inputs()]
ms = time_fn(model, inputs, iters=30)
print(f'M={M} N={N} K={K}: {ms:.3f} ms')
print(f'FLOPs: {2*M*N*K}')
print(f'TFLOPS: {2*M*N*K / ms / 1e9:.1f}')
"
File "<string>", line 5
from problems.01_fp8_gemm import reference, shapes
^
SyntaxError: invalid decimal literal
{
"command": "<704 chars \u2014 see below>",
"description": "Run baseline benchmark for all shapes"
}cd /home/infatoshi/cuda/KernelBench-Hard/problems/01_fp8_gemm && python -c "
import torch
import reference, shapes
from pathlib import Path
import sys
sys.path.insert(0, str(Path('../..').resolve()))
from src.eval.timing import time_fn
device = torch.device('cuda:0')
for shape in shapes.SHAPES:
M, N, K = shape['M'], shape['N'], shape['K']
reference.M, reference.N, reference.K = M, N, K
init_args = reference.get_init_inputs()
model = reference.Model(*init_args).to(device).eval()
inputs = [t.to(device) for t in reference.get_inputs()]
ms = time_fn(model, inputs, iters=30)
tflops = 2*M*N*K / ms / 1e9
print(f'M={M} N={N} K={K}: {ms:.3f} ms, {tflops:.1f} TFLOPS')
"
M=4096 N=4096 K=4096: 0.395 ms, 348.0 TFLOPS
M=4096 N=4096 K=4127: 0.806 ms, 171.8 TFLOPS
M=32 N=8192 K=8192: 0.111 ms, 38.8 TFLOPS
M=4096 N=14336 K=4096: 1.237 ms, 388.8 TFLOPS
{
"command": "ls /usr/local/cuda/",
"description": "Check CUDA installation"
}bin
compute-sanitizer
DOCS
EULA.txt
extras
gds
gds-12.8
include
lib64
libnvvp
nsight-compute-2025.1.1
nsightee_plugins
nsight-systems-2024.6.2
nvml
nvvm
README
share
src
targets
tools
version.json
{
"command": "python -c \"import torch; print([attr for attr in dir(torch.cuda) if 'fp8' in attr.lower() or 'mma' in attr.lower() or 'tensor' in attr.lower()])\"",
"description": "Check CUDA tensor core support"
}['BFloat16Tensor', 'BoolTensor', 'ByteTensor', 'CharTensor', 'DoubleTensor', 'FloatTensor', 'HalfTensor', 'IntTensor', 'LongTensor', 'ShortTensor', '_check_bf16_tensor_supported', 'memory_summary']
{
"command": "ls /usr/local/cuda/samples/ | head -30",
"description": "Check CUDA samples"
}ls: cannot access '/usr/local/cuda/samples/': No such file or directory
{
"command": "python -c \"import torch; print(torch.float8_e4m3fn); print(torch.float8_e5m2)\"",
"description": "Verify float8 dtypes available"
}torch.float8_e4m3fn
torch.float8_e5m2
{
"command": "<429 chars \u2014 see below>",
"description": "Check CUDA capabilities"
}python -c "
# Quick test to see what PyTorch can do with float8 and what CUDA support exists
import torch
print('CUDA version:', torch.version.cuda)
print('cuDNN version:', torch.backends.cudnn.version() if torch.backends.cudnn.is_available() else 'N/A')
# Check if torch._scaled_mm is available
try:
print('torch._scaled_mm:', torch._scaled_mm)
except AttributeError as e:
print('torch._scaled_mm not available:', e)
"
CUDA version: 13.0
cuDNN version: 91900
torch._scaled_mm: <built-in method _scaled_mm of type object at 0x70b0b613e180>
{
"command": "ls /usr/local/cuda/DOCS/",
"description": "Check CUDA docs"
}ls: cannot access '/usr/local/cuda/DOCS/': Not a directory
{
"pattern": "mma\\.sync.*fp8",
"path": "/usr/local/cuda",
"include": "*.html"
}