{
"problem": "07_w4a16_gemm",
"harness": "opencode",
"model": "openrouter-pinned/qwen/qwen3.6-27b",
"reasoning_effort": "",
"has_solution": false,
"correct": false,
"peak_fraction": null,
"elapsed_seconds": 20,
"harness_exit_code": 0,
"session_complete": true,
"usage": {"input_tokens": 30022, "output_tokens": 429, "cache_read_tokens": 0, "cache_creation_tokens": 0, "reasoning_tokens": 133, "total_cost_usd": null}
}
{
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/problems/07_w4a16_gemm"
}<path>/home/infatoshi/cuda/KernelBench-Hard/problems/07_w4a16_gemm</path>
<type>directory</type>
<entries>
benchmark.py
check.py
problem.yaml
PROMPT.txt
reference.py
shapes.py
sota.py
(7 entries)
</entries>{
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/CLAUDE.md"
}<path>/home/infatoshi/cuda/KernelBench-Hard/CLAUDE.md</path>
<type>file</type>
<content>
1: # KernelBench-Hard — Developer Instructions
2:
3: Last updated: 2026-04-27.
4:
5: This file is for **coding agents editing the repo** (you, via Claude Code). Do not confuse it with `problems/<X>/PROMPT.txt` — those are the human-voice queries fed to agents _under test_.
6:
7: For the journey behind the current design, read [DEVLOG.md](./DEVLOG.md).
8:
9: ## What this repo is
10:
11: Small kernel benchmark. Frontier coding agents are given URLs to SOTA implementations (sonic-moe, flashinfer, marlin) and asked to write a competitive kernel on RTX PRO 6000 Blackwell (SM120) in 45 minutes. Roofline-graded. Published artifact is the best kernel per (problem × model × harness), plus the agent trace.
12:
13: See [SPEC.md](./SPEC.md) for methodology. See [README.md](./README.md) for the model matrix and quick start.
14:
15: ## Non-negotiable rules
16:
17: - **uv only.** No bare `python`, no `pip`. Use `uv run ...`, `uv add ...`, `uv pip install ...`.
18: - **Before committing:** `uv run ruff check . --fix && uv run pytest`.
19: - **Never edit `problems/*/solution.py`**. Those files are agent output; they're gitignored for a reason. If you need to inspect one, read it from `outputs/runs/<run>/<problem>/solution.py`.
20: - **Never modify `problems/*/reference.py`, `check.py`, `benchmark.py`, `problem.yaml`, `shapes.py`, or `PROMPT.txt`** once a sweep has been published. Those define the ben
... (5941 more chars){
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/problems/07_w4a16_gemm/reference.py"
}<path>/home/infatoshi/cuda/KernelBench-Hard/problems/07_w4a16_gemm/reference.py</path>
<type>file</type>
<content>
1: """Naive W4A16 weight-only quantized GEMM reference (correctness only).
2:
3: AWQ/GPTQ-style scheme:
4: x: (M, K) bf16
5: w_q: (K // 2, N) uint8 -- two int4 weights packed per byte (low nibble = even-K, high = odd-K)
6: scales: (K // group, N) bf16
7: zeros: (K // group, N) bf16 -- asymmetric (already stored as a float zero-point)
8: out: (M, N) bf16
9:
10: Dequant (per group along K):
11: w_bf[k, n] = (w_q[k, n] - zeros[k // group, n]) * scales[k // group, n]
12: where w_q[k, n] here denotes the unpacked 4-bit value (0..15) for row k, not the packed byte.
13:
14: This reference unpacks to a full bf16 matrix and then runs torch.matmul. Slow and
15: memory-heavy on the dequant; the agent's solution must fuse unpack+GEMM.
16: """
17: from __future__ import annotations
18:
19: import torch
20: import torch.nn as nn
21:
22: OP_TYPE = "gemm_w4a16"
23: SUPPORTED_PRECISIONS = ["int4_bf16"]
24: HARDWARE_REQUIRED = ["RTX_PRO_6000", "H100", "B200"]
25:
26: GROUP_SIZE = 128
27:
28:
29: def _pack_int4(w_q: torch.Tensor) -> torch.Tensor:
30: """Pack (K, N) uint8 in [0,15] into (K//2, N) uint8.
31:
32: Even rows go in the low nibble, odd rows in the high nibble.
33: """
34: K, N = w_q.shape
35: assert K % 2 == 0
36: lo = w_q[0::2].to(torch.uint8) & 0xF
37: hi = w_q[1::2].to(torch.uint8) & 0xF
38: ret
... (3312 more chars){
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/problems/07_w4a16_gemm/check.py"
}<path>/home/infatoshi/cuda/KernelBench-Hard/problems/07_w4a16_gemm/check.py</path>
<type>file</type>
<content>
1: """Correctness runner for W4A16 GEMM.
2:
3: Runs solution.Model vs reference.Model across all shapes in shapes.py, 3 seeds
4: each, with per-dtype atol/rtol. Also rejects forbidden ops by grep.
5: """
6: import re
7: import sys
8: from pathlib import Path
9:
10: import torch
11: import yaml
12:
13: # Make the repo's src/ importable
14: REPO_ROOT = Path(__file__).resolve().parents[2]
15: sys.path.insert(0, str(REPO_ROOT))
16:
17: from src.eval.correctness import check_correctness # noqa: E402
18:
19:
20: def main():
21: try:
22: import reference
23: import shapes
24: import solution
25: except Exception as e:
26: print(f"FAIL: import error: {e}")
27: sys.exit(1)
28:
29: problem_yaml = Path("problem.yaml")
30: meta = yaml.safe_load(problem_yaml.read_text()) if problem_yaml.exists() else {}
31:
32: # --- Forbidden-op check ------------------------------------------------
33: sol_src = Path("solution.py").read_text() if Path("solution.py").exists() else ""
34: for forbidden in meta.get("forbidden", []):
35: pat = re.escape(forbidden)
36: if re.search(pat, sol_src):
37: print(f"FAIL: forbidden op used: {forbidden}")
38: sys.exit(1)
39:
40: device = torch.device("cuda:0")
41: tol_override = meta.get("tolerance") or None
42:
43: # --- Per-shape
... (2909 more chars){
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/problems/07_w4a16_gemm/shapes.py"
}<path>/home/infatoshi/cuda/KernelBench-Hard/problems/07_w4a16_gemm/shapes.py</path>
<type>file</type>
<content>
1: """Shape sweep for W4A16 GEMM.
2:
3: Llama-style up_proj / qkv_proj shapes. Decode (M=1) is the bandwidth-bound
4: case every inference engine optimizes -- it's the bar to beat.
5: """
6:
7: SHAPES = [
8: {"M": 1, "N": 12288, "K": 4096}, # decode: memory-bound on int4 weight read
9: {"M": 32, "N": 12288, "K": 4096}, # small prefill: mixed regime
10: {"M": 256, "N": 12288, "K": 4096}, # larger prefill: approaching compute
11: {"M": 1, "N": 4096, "K": 4096}, # decode: square shape
12: {"M": 16, "N": 14336, "K": 4096}, # speculative-decode-ish
13: ]
(End of file - total 13 lines)
</content>{
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/problems/07_w4a16_gemm/problem.yaml"
}<path>/home/infatoshi/cuda/KernelBench-Hard/problems/07_w4a16_gemm/problem.yaml</path>
<type>file</type>
<content>
1: name: 07_w4a16_gemm
2: display_name: "W4A16 Weight-only Quantized GEMM"
3: precision: int4_bf16
4: regime: memory # decode-dominant; M=1 is bandwidth-bound on the int4 weight stream
5:
6: # Dense-equivalent FLOPs (matmul work, ignoring dequant arithmetic).
7: flops_formula: "2 * M * N * K"
8:
9: # Bytes moved per call (memory roofline):
10: # x: M*K*2 (bf16 activations, streamed in once)
11: # w_q: (K/2)*N (packed int4, 0.5 B/elem)
12: # scales: (K/128)*N*2 (bf16 scales)
13: # zeros: (K/128)*N*2 (bf16 zero-points)
14: # out: M*N*2 (bf16 store)
15: bytes_formula: "M*K*2 + (K/2)*N + (K/128)*N*2 + (K/128)*N*2 + M*N*2"
16:
17: hardware: [RTX_PRO_6000]
18: peak_tflops_key: bf16
19: peak_bandwidth_key: dram
20:
21: tolerance:
22: bfloat16: 0.10 # group-quant adds noise on top of bf16 accumulator slop
23:
24: # Forbidden ops -- agent must write the unpack + GEMM themselves, not call a
25: # vendor library that does both.
26: forbidden:
27: - "bitsandbytes.functional.dequantize_4bit"
28: - "bitsandbytes.functional.gemv_4bit"
29: - "marlin_kernel.gemm"
30: - "torch.nn.functional.linear"
31:
32: sota:
33: name: "bitsandbytes NF4 (gemv_4bit / dequantize_4bit + matmul)"
34: url: "https://github.com/TimDettmers/bitsandbytes"
35: function: "bitsandbytes.functional.gemv_4bit"
36: notes: |
37:
... (757 more chars){
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/problems/07_w4a16_gemm/benchmark.py"
}<path>/home/infatoshi/cuda/KernelBench-Hard/problems/07_w4a16_gemm/benchmark.py</path>
<type>file</type>
<content>
1: """Roofline benchmark for W4A16 GEMM.
2:
3: For each shape: times eager reference, compiled reference, SOTA (if available),
4: and the agent's solution. Reports achieved TFLOPS, GB/s, and peak_fraction.
5:
6: Output lines the harness picks up:
7: shape=<idx> variant=<name> tflops=<N> gbps=<N> ms=<N>
8: peak_fraction: <N> (geomean over shapes of solution's peak_fraction)
9: """
10: import sys
11: from math import exp, log
12: from pathlib import Path
13:
14: import torch
15: import yaml
16:
17: REPO_ROOT = Path(__file__).resolve().parents[2]
18: sys.path.insert(0, str(REPO_ROOT))
19:
20: from src.eval.roofline import compute_gbps, compute_tflops, peak_fraction # noqa: E402
21: from src.eval.timing import time_fn # noqa: E402
22: from src.hardware import get as get_hw # noqa: E402
23:
24:
25: def _eval_formula(expr: str, vars: dict) -> float:
26: # Very small eval: only names from `vars` are valid.
27: return float(eval(expr, {"__builtins__": {}}, vars))
28:
29:
30: def main():
31: import reference
32: import shapes
33: import solution
34:
35: meta = yaml.safe_load(Path("problem.yaml").read_text())
36: hw = get_hw(meta["hardware"][0])
37: peak_tflops = hw.peak_tflops_dense.get(meta["peak_tflops_key"], 0.0)
38: peak_gbps = hw.peak_bandwidth_gb_s
39: regime = meta.get("regime", "compute")
40: flops_formul
... (3415 more chars){
"filePath": "/home/infatoshi/cuda/KernelBench-Hard/problems/07_w4a16_gemm/sota.py"
}<path>/home/infatoshi/cuda/KernelBench-Hard/problems/07_w4a16_gemm/sota.py</path>
<type>file</type>
<content>
1: """SOTA reference for W4A16 GEMM.
2:
3: Library survey on RTX PRO 6000 Blackwell (SM120, CC 12.0):
4:
5: - Marlin (IST-DASLab): no SM120 kernels (Ampere/Hopper only). Skip.
6: - GPTQ-Triton (fpgaminer): unmaintained; pure Triton path works on SM120
7: but is not faster than Marlin on its target HW
8: and has no Blackwell tuning. Skip as primary.
9: - AWQ (mit-han-lab/llm-awq): CUDA kernels not built for SM120 in the wheel.
10: Skip.
11: - bitsandbytes >= 0.49.2: CUDA kernels compile and run on SM120 (verified
12: on this machine). Different quant scheme (NF4,
13: symmetric, blocksize 64) than our reference's
14: AWQ-style asymmetric INT4 with group_size 128,
15: but it occupies the same memory regime and is
16: the only tuned W4A16-class kernel that runs on
17: SM120 today. Used here as an *informational*
18: SOTA line, not as a numerical reference.
19:
20: The benchmark calls `sota_forward(x, ref_model)` and times it; correctness is
21: NOT checked against this path (the quant scheme differs).
22: """
23: from
... (2725 more chars)