"""Triton grouped GEMM + fused SwiGLU for top-K MoE up-projection.

Per-expert we compute:
    h_e = silu(x_e @ W_gate[e]) * (x_e @ W_up[e])
where x_e is the slice of permuted hidden states routed to expert e.

The kernel tiles the (T_perm, I) output space.  Each output tile belongs to
exactly one expert, with row boundaries aligned to expert boundaries so that
all rows in a tile share the same gate/up weight matrix.
"""
from __future__ import annotations

import math
from typing import List, Tuple

import torch
import torch.nn as nn
import triton
import triton.language as tl

OP_TYPE = "grouped_gemm_swiglu"
SUPPORTED_PRECISIONS = ["bf16"]
HARDWARE_REQUIRED = ["RTX_PRO_6000", "H100", "B200"]


# --------------------------------------------------------------------------- #
# Triton kernel
# --------------------------------------------------------------------------- #
@triton.jit
def grouped_gemm_swiglu_kernel(
    hidden_ptr,
    W_gate_ptr,
    W_up_ptr,
    out_ptr,
    tile_expert_ptr,
    tile_row_start_ptr,
    expert_offsets_ptr,
    H: tl.constexpr,
    I: tl.constexpr,
    E: tl.constexpr,
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_K: tl.constexpr,
):
    """One grouped-GEMM tile: rows within one expert, contiguous columns of I.

    Weight tensors are passed as transposed views of shape (E, I, H) so that the
    K dimension is contiguous in memory for the B matrix of each dot product.
    """
    EVEN_K: tl.constexpr = (H % BLOCK_K == 0)
    pid = tl.program_id(0)
    num_n_tiles = tl.cdiv(I, BLOCK_N)
    tile_m = pid // num_n_tiles
    tile_n = pid % num_n_tiles

    expert = tl.load(tile_expert_ptr + tile_m).to(tl.int32)
    row_start = tl.load(tile_row_start_ptr + tile_m).to(tl.int32)
    expert_end = tl.load(expert_offsets_ptr + expert + 1).to(tl.int32)
    row_end = tl.minimum(row_start + BLOCK_M, expert_end)
    n_start = tile_n * BLOCK_N

    # Pointer bases for this tile.  W_gate/W_up are (E, I, H) transposed views,
    # so element (e, n, k) is at offset e*H*I + n + k*I.
    a_ptr = hidden_ptr + row_start * H
    b_gate_ptr = W_gate_ptr + expert * H * I + n_start
    b_up_ptr = W_up_ptr + expert * H * I + n_start
    c_ptr = out_ptr + row_start * I + n_start

    # Tile offsets
    offs_m = tl.arange(0, BLOCK_M)
    offs_n = tl.arange(0, BLOCK_N)
    offs_k = tl.arange(0, BLOCK_K)

    row_mask = offs_m < (row_end - row_start)
    col_mask = offs_n < (I - n_start)

    acc_gate = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
    acc_up = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)

    for k in range(0, H, BLOCK_K):
        if EVEN_K:
            a = tl.load(
                a_ptr + offs_m[:, None] * H + (k + offs_k)[None, :],
                mask=row_mask[:, None],
                other=0.0,
            )
            b_gate = tl.load(
                b_gate_ptr + (k + offs_k)[:, None] * I + offs_n[None, :],
                mask=col_mask[None, :],
                other=0.0,
            )
            b_up = tl.load(
                b_up_ptr + (k + offs_k)[:, None] * I + offs_n[None, :],
                mask=col_mask[None, :],
                other=0.0,
            )
        else:
            k_mask = (k + offs_k) < H
            a = tl.load(
                a_ptr + offs_m[:, None] * H + (k + offs_k)[None, :],
                mask=row_mask[:, None] & k_mask[None, :],
                other=0.0,
            )
            b_gate = tl.load(
                b_gate_ptr + (k + offs_k)[:, None] * I + offs_n[None, :],
                mask=k_mask[:, None] & col_mask[None, :],
                other=0.0,
            )
            b_up = tl.load(
                b_up_ptr + (k + offs_k)[:, None] * I + offs_n[None, :],
                mask=k_mask[:, None] & col_mask[None, :],
                other=0.0,
            )

        acc_gate = tl.dot(a, b_gate, acc_gate)
        acc_up = tl.dot(a, b_up, acc_up)

    # Fused SwiGLU epilogue in float32, then store bf16
    gate = acc_gate
    up = acc_up
    silu = gate * tl.sigmoid(gate)
    out = (silu * up).to(tl.bfloat16)

    tl.store(
        c_ptr + offs_m[:, None] * I + offs_n[None, :],
        out,
        mask=row_mask[:, None] & col_mask[None, :],
    )


# --------------------------------------------------------------------------- #
# Tile scheduling (CPU-side)
# --------------------------------------------------------------------------- #
def _build_tile_metadata(
    expert_offsets: torch.Tensor,
    block_m: int,
    device: torch.device,
) -> Tuple[torch.Tensor, torch.Tensor, int]:
    """Return (tile_expert, tile_row_start, num_tiles) on *device*.

    Each tile is confined to a single expert and contains at most BLOCK_M rows.
    """
    offsets = expert_offsets.cpu().tolist()
    E = len(offsets) - 1
    tile_expert: List[int] = []
    tile_row_start: List[int] = []
    for e in range(E):
        start = int(offsets[e])
        end = int(offsets[e + 1])
        count = end - start
        if count <= 0:
            continue
        num_tiles = math.ceil(count / block_m)
        for t in range(num_tiles):
            tile_expert.append(e)
            tile_row_start.append(start + t * block_m)
    tile_expert_t = torch.tensor(tile_expert, dtype=torch.int32, device=device)
    tile_row_start_t = torch.tensor(tile_row_start, dtype=torch.int32, device=device)
    return tile_expert_t, tile_row_start_t, len(tile_expert)


# --------------------------------------------------------------------------- #
# Model
# --------------------------------------------------------------------------- #
class Model(nn.Module):
    """Up-projection of a top-K MoE FFN with grouped GEMM + fused SwiGLU."""

    def __init__(self, T_total: int, H: int, I: int, E: int, K: int):  # noqa: E741
        super().__init__()
        self.T_total = T_total
        self.H = H
        self.I = I
        self.E = E
        self.K = K
        self.W_gate = nn.Parameter(torch.empty(E, H, I, dtype=torch.bfloat16))
        self.W_up = nn.Parameter(torch.empty(E, H, I, dtype=torch.bfloat16))
        nn.init.normal_(self.W_gate, std=0.02)
        nn.init.normal_(self.W_up, std=0.02)

    def forward(
        self,
        hidden_states: torch.Tensor,
        expert_offsets: torch.Tensor,
    ) -> torch.Tensor:
        T_perm, H = hidden_states.shape
        I = self.I
        E = self.E
        out = torch.empty(T_perm, I, dtype=torch.bfloat16, device=hidden_states.device)

        # Tile scheduling: one tile never spans experts.  Cache per offsets tensor
        # since benchmark.py invokes forward many times with identical routing.
        BLOCK_M = 256
        if not hasattr(self, "_tile_cache") or not self._tile_cache_matches(expert_offsets):
            self._tile_cache = _build_tile_metadata(expert_offsets, BLOCK_M, hidden_states.device)
            self._tile_cache_key = expert_offsets.data_ptr()

        tile_expert, tile_row_start, num_tiles = self._tile_cache
        if num_tiles == 0:
            return out

        num_n_tiles = math.ceil(I / 64)
        grid = (num_tiles * num_n_tiles,)

        W_gate_t = self._cached_transpose(self.W_gate, "_W_gate_t")
        W_up_t = self._cached_transpose(self.W_up, "_W_up_t")

        grouped_gemm_swiglu_kernel[grid](
            hidden_states,
            W_gate_t,
            W_up_t,
            out,
            tile_expert,
            tile_row_start,
            expert_offsets,
            H=H,
            I=I,
            E=E,
            BLOCK_M=256,
            BLOCK_N=64,
            BLOCK_K=64,
            num_warps=8,
            num_stages=3,
        )
        return out

    def _cached_transpose(self, weight: nn.Parameter, attr: str) -> torch.Tensor:
        key = weight.data_ptr()
        cached = getattr(self, attr, None)
        if cached is None or getattr(self, f"{attr}_key", None) != key:
            cached = weight.transpose(1, 2)
            setattr(self, attr, cached)
            setattr(self, f"{attr}_key", key)
        return cached

    def _tile_cache_matches(self, expert_offsets: torch.Tensor):
        key = getattr(self, "_tile_cache_key", None)
        if key is None:
            return False
        return key == expert_offsets.data_ptr()


# --------------------------------------------------------------------------- #
# Shape shims for check.py / benchmark.py
# --------------------------------------------------------------------------- #
T_total = 32768
H = 4096
I = 1536  # noqa: E741
E = 128
K = 8


def _build_routing(T_total: int, E: int, K: int, device: str = "cpu") -> torch.Tensor:
    T_perm = T_total * K
    base = T_perm // E
    rem = T_perm - base * E
    counts = torch.full((E,), base, dtype=torch.int32, device=device)
    counts[:rem] += 1
    offsets = torch.zeros(E + 1, dtype=torch.int32, device=device)
    offsets[1:] = torch.cumsum(counts, dim=0)
    return offsets


def get_inputs():
    T_perm = T_total * K
    hidden_states = torch.randn(T_perm, H, dtype=torch.bfloat16) * 0.1
    expert_offsets = _build_routing(T_total, E, K)
    return [hidden_states, expert_offsets]


def get_init_inputs():
    return [T_total, H, I, E, K]