# prepare_donor_v2.py
import torch
import os
import argparse
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from accelerate import init_empty_weights

def main(foundation_model_id, donor_model_id, output_path):
    """
    Creates a new 'Aligned' donor model.
    1. Defines a target Qwen3 80-layer architecture using the foundation config.
    2. Creates an empty model 'shell' with this pure Qwen3 architecture.
    3. Fills the shell with weights from the Qwen2.5 donor, discarding incompatible tensors.
    """
    print("--- Phase 1: Building the target Qwen3 80-Layer Architecture ---")
    
    # Load the CONFIG from our Qwen3 foundation to get the correct blueprint
    foundation_config = AutoConfig.from_pretrained(
        foundation_model_id, trust_remote_code=True
    )
    
    # Modify the config to match the target 72B size and 80 layers
    foundation_config.num_hidden_layers = 80
    foundation_config.hidden_size = 8192
    foundation_config.intermediate_size = 29568
    foundation_config.torch_dtype = torch.bfloat16
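
    # Quick blueprint report (hedged: attribute names follow the Qwen config
    # convention; head_dim falls back to hidden_size // num_attention_heads):
    head_dim = getattr(foundation_config, "head_dim",
                       foundation_config.hidden_size // foundation_config.num_attention_heads)
    print(f"Target blueprint: {foundation_config.num_hidden_layers} layers, "
          f"{foundation_config.num_attention_heads} heads, head_dim={head_dim}")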
    
    # Create an empty 'shell' of the final model. This is instant and memory-efficient.
    # Its config.json will be a pure Qwen3 config.
    print("Creating empty Qwen3 80-layer model shell...")
    with init_empty_weights():
        aligned_model = AutoModelForCausalLM.from_config(
            foundation_config, trust_remote_code=True
        )
    aligned_model.tie_weights()
    print("Empty shell created successfully.")

    print("\n--- Phase 2: Loading Donor Weights ---")
    print(f"Loading weights from donor: {donor_model_id}")
    
    # Load the donor model on CPU to get its state dict
    donor_model = AutoModelForCausalLM.from_pretrained(
        donor_model_id, torch_dtype=torch.bfloat16, device_map="cpu", trust_remote_code=True
    )
    donor_state_dict = donor_model.state_dict()
    del donor_model  # Drop the module wrapper; tensors stay referenced by donor_state_dict
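
    # Defensive filter (hedged: donor and foundation vocab sizes may differ, and
    # load_state_dict() raises on any shape mismatch even with strict=False).
    # Dropping mismatched donor tensors implements the "discarding incompatible
    # tensors" step described in the docstring.
    shell_shapes = {k: v.shape for k, v in aligned_model.state_dict().items()}
    donor_state_dict = {
        k: v for k, v in donor_state_dict.items()
        if k not in shell_shapes or shell_shapes[k] == v.shape
    }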
    
    # We will now load the donor weights into our pure Qwen3 shell.
    # strict=False is essential because the donor has '.bias' tensors that our
    # shell doesn't, and our shell has '.norm' tensors the donor doesn't.
    # This will load all matching weights and ignore the rest.
    print("Loading donor state_dict into the Qwen3 shell (strict=False)...")
    load_result = aligned_model.load_state_dict(donor_state_dict, strict=False, assign=True)

    # The shell was built under init_empty_weights(), so any tensor the donor did
    # not supply (e.g. Qwen3's q_norm/k_norm weights) is still an uninitialized
    # meta tensor and would break save_pretrained(). Materialize each one with
    # the neutral RMSNorm initialization of 1.0.
    shell_state = aligned_model.state_dict()
    placeholders = {
        name: torch.ones(shell_state[name].shape, dtype=torch.bfloat16)
        for name in load_result.missing_keys
    }
    aligned_model.load_state_dict(placeholders, strict=False, assign=True)
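
    # Fail-fast check (a minimal sketch): every parameter should now hold real
    # data; anything still on the meta device would corrupt the saved checkpoint.
    leftover = [n for n, p in aligned_model.named_parameters() if p.is_meta]
    assert not leftover, f"Uninitialized tensors remain: {leftover[:5]}"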

    print("\n--- Phase 3: Saving the Aligned Donor ---")
    tokenizer = AutoTokenizer.from_pretrained(foundation_model_id, trust_remote_code=True)
    
    print(f"Saving the architecturally aligned model to: {output_path}")
    os.makedirs(output_path, exist_ok=True)
    aligned_model.save_pretrained(output_path)
    tokenizer.save_pretrained(output_path)
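
    # Smoke test (a minimal sketch: re-reading the saved config is enough to
    # confirm the pure Qwen3 blueprint survived serialization):
    saved_config = AutoConfig.from_pretrained(output_path, trust_remote_code=True)
    print(f"Saved config: model_type={saved_config.model_type}, "
          f"layers={saved_config.num_hidden_layers}, hidden={saved_config.hidden_size}")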

    print("\nDonor preparation complete! This is the definitive donor model.")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Prepare a Qwen2.5 donor model for merging with Qwen3.")
    parser.add_argument("--foundation_model", type=str, default="Qwen/Qwen3-32B", help="Model to use for the Qwen3 architecture blueprint.")
    parser.add_argument("--donor_model", type=str, default="Qwen/Qwen2.5-72B-Instruct", help="The donor model providing the weights.")
    parser.add_argument("--output_path", type=str, required=True, help="The local directory path to save the prepared donor model.")
    args = parser.parse_args()
    
    main(args.foundation_model, args.donor_model, args.output_path)