#!/usr/bin/env python3
"""
GLM-4.6 AWQ Quantization Script

Quantizes GLM-4.6 (357B MoE) to 4-bit AWQ for efficient inference with vLLM.

Requirements:
    - 1× GPU with 48GB+ VRAM (single GPU is optimal)
    - 768GB+ system RAM (DDR4/DDR5)
    - 300GB+ swap space (will be actively used)
    - PyTorch with CUDA support
    - llm-compressor
    - transformers
    - datasets

Hardware Notes:
    - Multi-GPU provides NO quantization speedup (process is RAM-bound, not GPU-bound)
    - The full BF16 model (~714GB) will be offloaded to system RAM/swap
    - Quantized using: 1× RTX PRO 6000 Blackwell Max-Q (96GB) + 768GB RAM
    - Quantization time: ~5 hours (includes calibration, smoothing, compression, and saving)

Usage:
    python quantize_glm46_awq.py --model zai-org/GLM-4.6 --output ./GLM-4.6-AWQ

Advanced options:
    python quantize_glm46_awq.py \
        --model zai-org/GLM-4.6 \
        --output ./GLM-4.6-AWQ \
        --device-map sequential \
        --max-cpu-memory 750GiB \
        --cal-samples 512
"""

import os
import argparse
import json
import shutil
import pathlib
from typing import List

import torch
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier


def add_no_split(cfg: AutoConfig, classes: List[str]) -> AutoConfig:
    """Prevent splitting specific module classes across devices."""
    ns = set(getattr(cfg, "no_split_module_classes", []) or [])
    ns.update(classes)
    cfg.no_split_module_classes = list(ns)
    return cfg


def compute_batch_size(seq_len: int, target_tokens: int) -> int:
    """Calculate batch size to achieve target tokens per calibration step."""
    return max(1, target_tokens // seq_len)


def clone_and_fix_index(src_dir: str) -> str:
    """
    Clone model directory and fix empty-string key in weight_map if present.
    This prevents device_map='auto' errors with some sharded checkpoints.
    """
    src = pathlib.Path(src_dir)
    dst = src.parent / (src.name + "_fixed_index")
    if dst.exists():
        shutil.rmtree(dst)
    shutil.copytree(src, dst)

    candidates = ["model.safetensors.index.json", "pytorch_model.bin.index.json"]
    found = None
    for c in candidates:
        p = dst / c
        if p.exists():
            found = p
            break
    if not found:
        return str(dst)

    with open(found, "r") as f:
        idx = json.load(f)
    wm = idx.get("weight_map", {})
    if "" in wm:
        del wm[""]
        idx["weight_map"] = wm
        with open(found, "w") as f:
            json.dump(idx, f)
    return str(dst)


def main():
    parser = argparse.ArgumentParser(description="Quantize GLM-4.6 to 4-bit AWQ")
    parser.add_argument("--model", required=True, help="Path or HF ID of GLM-4.6 model (e.g., zai-org/GLM-4.6)")
    parser.add_argument("--output", required=True, help="Output directory for quantized model")
    parser.add_argument("--cal-samples", type=int, default=512, help="Number of calibration samples (default: 512)")
    parser.add_argument("--cal-seq-len", type=int, default=2048, help="Calibration sequence length (default: 2048)")
    parser.add_argument("--batch-tokens", type=int, default=131072, help="Tokens per calibration step (default: 131072)")
    parser.add_argument("--dataset", default="neuralmagic/LLM_compression_calibration", help="Calibration dataset")
    parser.add_argument("--dataset-split", default="train", help="Dataset split to use")
    parser.add_argument("--device-map", choices=["auto", "sequential"], default="auto",
                        help="Device placement strategy: 'auto' (recommended) or 'sequential' (robust)")
    parser.add_argument("--max-memory-per-gpu", type=str, default="92GiB",
                        help="Max memory per GPU (default: 92GiB for 96GB GPUs)")
    parser.add_argument("--max-cpu-memory", type=str, default="500GiB",
                        help="Max CPU memory for offloading (default: 500GiB)")
    args = parser.parse_args()

    # Environment setup
    os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
    os.environ.setdefault("PYTORCH_ALLOC_CONF", "expandable_segments:True,max_split_size_mb:512")

    # Use only GPU 0 for quantization (multi-GPU provides no benefit - process is RAM-bound)
    os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0")

    # Enable TF32 for faster computation on Ampere+ GPUs
    try:
        torch.backends.cuda.matmul.fp32_precision = "tf32"
        torch.backends.cudnn.conv.fp32_precision = "tf32"
    except Exception:
        pass

    torch.set_num_threads(8)

    # Verify CUDA availability
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is not available. This script requires GPU(s).")

    num_gpus = torch.cuda.device_count()
    print(f"✓ Found {num_gpus} CUDA device(s)")
    print(f"✓ Using GPU 0 for quantization (CUDA_VISIBLE_DEVICES={os.environ.get('CUDA_VISIBLE_DEVICES', 'all')})")
    print(f"\nNote: Multi-GPU provides NO speedup for quantization - the process is RAM-bound.")
    print(f"      The full BF16 model (~714GB) will be offloaded to system RAM/swap.")

    # Load configuration
    print(f"Loading config from: {args.model}")
    cfg = AutoConfig.from_pretrained(args.model, trust_remote_code=True)

    # Prevent splitting merged linear layers across devices
    cfg = add_no_split(cfg, ["MergedColumnParallelLinear"])

    # Load tokenizer
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True, use_fast=True)

    # Load model with device placement
    print(f"Loading model weights from: {args.model}")
    load_dir = args.model

    if args.device_map == "auto":
        try:
            load_dir = clone_and_fix_index(args.model)
        except Exception as e:
            print(f"Index sanitization skipped: {e}")

    # Configure memory allocation
    max_mem = {i: args.max_memory_per_gpu for i in range(num_gpus)}
    max_mem["cpu"] = args.max_cpu_memory

    try:
        model = AutoModelForCausalLM.from_pretrained(
            load_dir,
            torch_dtype=torch.bfloat16,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            device_map=args.device_map,
            config=cfg,
            max_memory=max_mem,
            offload_folder=None,
            offload_state_dict=False,
        )
    except KeyError as e:
        if args.device_map == "auto":
            print(f"Auto device_map failed with {e}; falling back to sequential...")
            model = AutoModelForCausalLM.from_pretrained(
                load_dir,
                torch_dtype=torch.bfloat16,
                low_cpu_mem_usage=True,
                trust_remote_code=True,
                device_map="sequential",
                config=cfg,
                max_memory=max_mem,
            )
        else:
            raise

    print("✓ Model loaded successfully")

    # Print GPU memory usage
    print("\nGPU Memory Usage:")
    for i in range(num_gpus):
        allocated = torch.cuda.memory_allocated(i) / 1e9
        peak = torch.cuda.max_memory_allocated(i) / 1e9
        print(f"  GPU {i}: {allocated:.2f} GB allocated / {peak:.2f} GB peak")

    # Load calibration dataset
    print(f"\nLoading calibration dataset: {args.dataset}")
    ds = load_dataset(args.dataset, split=args.dataset_split)
    ds = ds.shuffle(seed=42).select(range(args.cal_samples))
    print(f"✓ Selected {len(ds)} calibration samples")

    seq_len = args.cal_seq_len
    batch_size = compute_batch_size(seq_len, args.batch_tokens)
    print(f"Calibration config: seq_len={seq_len}, batch_size={batch_size}")

    # AWQ quantization recipe
    # Keep critical layers at full precision for quality
    ignore_patterns = [
        "lm_head",
        "model.embed_tokens",
        "re:.*input_layernorm$",
        "re:.*post_attention_layernorm$",
        "model.norm",
        "re:.*q_norm$",
        "re:.*k_norm$",
        "re:.*shared_experts.*",         # Always-active experts
        "re:.*mlp\\.gate\\.weight$",     # MoE router
        "re:.*mlp\\.gate\\..*bias$",     # MoE router bias
        "re:model.layers.[0-2]\\.",      # First 3 layers for quality
    ]

    # Target patterns for quantization
    targets = [
        "re:.*gate_proj.*",   # MLP projections
        "re:.*up_proj.*",
        "re:.*down_proj.*",
        "re:.*k_proj.*",      # Attention projections
        "re:.*q_proj.*",
        "re:.*v_proj.*",
        "re:.*o_proj.*",
    ]

    recipe = [
        AWQModifier(
            ignore=ignore_patterns,
            config_groups={
                "group_0": {
                    "targets": targets,
                    "weights": {
                        "num_bits": 4,
                        "type": "int",
                        "symmetric": True,
                        "group_size": 128,
                        "strategy": "group",
                        "dynamic": False,
                    },
                    "input_activations": None,
                    "output_activations": None,
                    "format": None,
                }
            },
        )
    ]

    # Run AWQ quantization
    print("\n" + "="*80)
    print("Starting AWQ quantization...")
    print("="*80)

    with torch.inference_mode():
        oneshot_args = {
            "model": model,
            "dataset": ds,
            "recipe": recipe,
            "max_seq_length": seq_len,
            "num_calibration_samples": len(ds),
        }

        # Add batch_size if supported
        try:
            from inspect import signature
            if "batch_size" in signature(oneshot).parameters:
                oneshot_args["batch_size"] = batch_size
        except Exception:
            pass

        oneshot(**oneshot_args)

    print("\n✓ AWQ quantization completed successfully")

    # Save quantized model
    print(f"\nSaving quantized model to: {args.output}")
    os.makedirs(args.output, exist_ok=True)

    model.save_pretrained(args.output, save_compressed=True)
    tokenizer.save_pretrained(args.output)

    print("\n" + "="*80)
    print("QUANTIZATION COMPLETE")
    print("="*80)
    print(f"Quantized model saved to: {args.output}")
    print(f"\nModel size on disk: ~176 GB (39 safetensors files)")
    print(f"\nTo use with vLLM:")
    print(f"  vllm serve {args.output} \\")
    print(f"    --tensor-parallel-size 4 \\")
    print(f"    --enable-expert-parallel \\")
    print(f"    --trust-remote-code")
    print("="*80)


if __name__ == "__main__":
    main()