|
|
|
import torch |
|
import os |
|
import argparse |
|
from tqdm import tqdm |
|
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer |
|
from accelerate import init_empty_weights |
|
|
|
def _partition_donor_tensors(target_state_dict, donor_state_dict):
    """Split donor tensors by whether the target shell can accept them.

    Args:
        target_state_dict: Mapping of parameter name -> tensor for the shell.
            Only the keys and ``.shape`` of each tensor are read.
        donor_state_dict: Mapping of parameter name -> tensor from the donor.

    Returns:
        A ``(compatible, mismatched, unexpected, missing)`` tuple where
        ``compatible`` maps names to donor tensors whose name and shape both
        match the shell, ``mismatched`` lists names present on both sides with
        different shapes, ``unexpected`` lists donor-only names, and
        ``missing`` lists every shell name not covered by ``compatible``
        (so it includes the ``mismatched`` names).
    """
    compatible = {}
    mismatched = []
    unexpected = []
    for name, tensor in donor_state_dict.items():
        target = target_state_dict.get(name)
        if target is None:
            unexpected.append(name)
        elif tuple(target.shape) == tuple(tensor.shape):
            compatible[name] = tensor
        else:
            mismatched.append(name)
    missing = [name for name in target_state_dict if name not in compatible]
    return compatible, mismatched, unexpected, missing


def _fresh_tensor(name, shape, dtype, init_std):
    """Create a real (non-meta) CPU tensor for a shell entry the donor lacks.

    Names ending in ``norm.weight`` get ones (a neutral scale), names ending
    in ``.bias`` get zeros, and everything else is drawn from
    ``N(0, init_std**2)``.
    """
    if name.endswith("norm.weight"):
        return torch.ones(shape, dtype=dtype)
    if name.endswith(".bias"):
        return torch.zeros(shape, dtype=dtype)
    return torch.empty(shape, dtype=dtype).normal_(mean=0.0, std=init_std)


def main(foundation_model_id: str, donor_model_id: str, output_path: str) -> None:
    """Create a new 'Aligned' donor model and save it to ``output_path``.

    1. Defines a target Qwen3 80-layer architecture using the foundation config.
    2. Creates an empty model 'shell' with this pure Qwen3 architecture.
    3. Fills the shell with weights from the Qwen2.5 donor, discarding
       incompatible tensors (donor-only names and shape mismatches).
    4. Initialises every shell tensor the donor could not provide, so that no
       parameter is left on the ``meta`` device when the model is saved.

    Args:
        foundation_model_id: Hub id or local path of the model whose config
            (and tokenizer) define the target architecture.
        donor_model_id: Hub id or local path of the model providing weights.
        output_path: Directory the aligned model and tokenizer are written to;
            created if it does not exist.
    """
    dtype = torch.bfloat16

    print("--- Phase 1: Building the target Qwen3 80-Layer Architecture ---")

    foundation_config = AutoConfig.from_pretrained(
        foundation_model_id, trust_remote_code=True
    )

    # Resize the foundation blueprint to the donor's dimensions.
    # NOTE(review): these values are hard-coded; presumably they mirror the
    # default donor's config -- confirm before using a different donor.
    foundation_config.num_hidden_layers = 80
    foundation_config.hidden_size = 8192
    foundation_config.intermediate_size = 29568
    foundation_config.torch_dtype = dtype

    print("Creating empty Qwen3 80-layer model shell...")
    # Parameters are created on the meta device: shapes only, no storage.
    with init_empty_weights():
        aligned_model = AutoModelForCausalLM.from_config(
            foundation_config, trust_remote_code=True
        )
    aligned_model.tie_weights()
    print("Empty shell created successfully.")

    print("\n--- Phase 2: Loading Donor Weights ---")
    print(f"Loading weights from donor: {donor_model_id}")

    donor_model = AutoModelForCausalLM.from_pretrained(
        donor_model_id, torch_dtype=dtype, device_map="cpu", trust_remote_code=True
    )
    donor_state_dict = donor_model.state_dict()
    # The state dict keeps the tensors alive; drop the module wrapper.
    del donor_model

    # load_state_dict(strict=False) tolerates missing/unexpected keys but
    # still raises on a shape mismatch, so incompatible tensors have to be
    # filtered out before loading.
    target_state_dict = aligned_model.state_dict()
    compatible, mismatched, unexpected, missing = _partition_donor_tensors(
        target_state_dict, donor_state_dict
    )
    del donor_state_dict

    print(f"Donor tensors transferred: {len(compatible)}")
    print(f"Donor tensors discarded (not in Qwen3 shell): {len(unexpected)}")
    print(f"Donor tensors discarded (shape mismatch): {len(mismatched)}")
    for name in mismatched:
        print(f"  shape mismatch: {name}")

    # Any shell entry without donor data would stay on the meta device after
    # an assign=True load and could not be serialised, so give each one real
    # storage here.
    init_std = getattr(foundation_config, "initializer_range", 0.02)
    for name in missing:
        compatible[name] = _fresh_tensor(
            name, tuple(target_state_dict[name].shape), dtype, init_std
        )
    print(f"Shell tensors freshly initialised: {len(missing)}")
    del target_state_dict

    print("Loading donor state_dict into the Qwen3 shell (strict=False)...")
    aligned_model.load_state_dict(compatible, strict=False, assign=True)

    print("\n--- Phase 3: Saving the Aligned Donor ---")
    tokenizer = AutoTokenizer.from_pretrained(foundation_model_id, trust_remote_code=True)

    print(f"Saving the architecturally aligned model to: {output_path}")
    os.makedirs(output_path, exist_ok=True)
    aligned_model.save_pretrained(output_path)
    tokenizer.save_pretrained(output_path)

    print("\nDonor preparation complete! This is the definitive donor model.")
|
|
|
if __name__ == "__main__":
    # Command-line entry point: collect the three model/path options and
    # hand them to main().
    cli = argparse.ArgumentParser(
        description="Prepare a Qwen2.5 donor model for merging with Qwen3."
    )
    cli.add_argument(
        "--foundation_model",
        type=str,
        default="Qwen/Qwen3-32B",
        help="Model to use for the Qwen3 architecture blueprint.",
    )
    cli.add_argument(
        "--donor_model",
        type=str,
        default="Qwen/Qwen2.5-72B-Instruct",
        help="The donor model providing the weights.",
    )
    cli.add_argument(
        "--output_path",
        type=str,
        required=True,
        help="The local directory path to save the prepared donor model.",
    )
    options = cli.parse_args()

    main(
        foundation_model_id=options.foundation_model,
        donor_model_id=options.donor_model,
        output_path=options.output_path,
    )
|
|