GPTQ Quantization of YanoljaNEXT-Rosetta-27B-2511

On a single RTX 3090 (24 GB), you can set the context length up to 49152 tokens with vLLM.
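A minimal offline-load sketch, assuming vLLM is installed (the same max_model_len applies when launching vllm serve):

from vllm import LLM

# Load the quantized checkpoint at the maximum context that fits on 24 GB.
llm = LLM(
    model="Bedovyy/YanoljaNEXT-Rosetta-27B-2511-W4A16-G128",
    max_model_len=49152,
)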

Quantization method

Quantized with llmcompressor using the script below. The W4A16 scheme quantizes weights to 4-bit with group size 128 while keeping activations in 16-bit; lm_head is left unquantized.

import sys
import random
import string
from transformers import AutoProcessor, AutoModelForCausalLM
from datasets import load_dataset
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
import torch

MODEL_ID = sys.argv[1]  # source model to quantize
NUM_CALIBRATION_SAMPLES = 1024
MAX_SEQUENCE_LENGTH = 2048

model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto")
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

# GPTQ Hessian dampening, raised above the 0.01 default for numerical stability.
dampening_frac = 0.07

# English-Japanese parallel corpus used for calibration.
ds = load_dataset("Helsinki-NLP/opus-100", "en-ja", split=f"train[:{NUM_CALIBRATION_SAMPLES}]")
def preprocess_function(example):
    # Wrap each sentence pair in the XML segment format the model expects,
    # using a random segment id so calibration prompts mirror real usage.
    en = example["translation"].get("en", "")
    ja = example["translation"].get("ja", "")
    rid = ''.join(random.choices(string.ascii_lowercase + string.digits, k=8))
    messages = [
        {"role": "system", "content": "Translate the user's text to Japanese.\nOutput format:XML\nProvide the final translation immediately without any other text."},
        {"role": "user", "content": f'<seg id="{rid}" type="calib"><field key="content">{en}</field></seg>'},
        {"role": "assistant", "content": f'<seg id="{rid}" type="calib"><field key="content">{ja}</field></seg>'},
    ]
    # Tokenize the full conversation, including the assistant turn, with no
    # trailing generation prompt, so calibration sees complete sequences.
    return processor.apply_chat_template(
        messages,
        return_tensors="pt",
        padding=False,
        truncation=True,
        max_length=MAX_SEQUENCE_LENGTH,
        tokenize=True,
        add_special_tokens=False,
        return_dict=True,
        add_generation_prompt=False,
    )
ds = ds.map(preprocess_function, batched=False, remove_columns=ds.column_names)

def data_collator(batch):
    # oneshot feeds one sample at a time; the pixel_values branch is carried
    # over from multimodal examples and never triggers for this text-only model.
    assert len(batch) == 1
    return {
        key: (
            torch.tensor(value)
            if key != "pixel_values"
            else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
        )
        for key, value in batch[0].items()
    }

# Quantize all Linear layers to 4-bit weights with 16-bit activations (W4A16),
# leaving the lm_head output projection in full precision.
recipe = [
    GPTQModifier(
        targets="Linear",
        scheme="W4A16",
        ignore=["lm_head"],
        dampening_frac=dampening_frac,
    )
]

SAVE_DIR = sys.argv[2]

# Run calibration and GPTQ quantization layer by layer over the Gemma 3 decoder stack.
oneshot(
    model=model,
    processor=processor,
    recipe=recipe,
    dataset=ds,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    data_collator=data_collator,
    sequential_targets=["Gemma3DecoderLayer"],
    tie_word_embeddings=True,
)

# Save as a compressed-tensors checkpoint that vLLM and transformers can load directly.
model.save_pretrained(SAVE_DIR, save_compressed=True)
processor.save_pretrained(SAVE_DIR)
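The script takes the source model ID and the output directory as its two positional arguments; the script name and paths below are illustrative:

python quantize.py yanolja/YanoljaNEXT-Rosetta-27B-2511 ./YanoljaNEXT-Rosetta-27B-2511-W4A16-G128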