Qwen3-72B-Synthesis / prepare_donor_v3.py

Upload folder using huggingface_hub

7e1725c verified 16 days ago

4.56 kB

	# prepare_donor_v3.py
	import torch
	import os
	import argparse
	from tqdm import tqdm
	from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
	from accelerate import init_empty_weights

	def main(foundation_model_id, donor_model_id, output_path):
	    """Build a Qwen3-architecture copy of the Qwen2.5 donor and save it.

	    The work happens in three phases:

	    1. Load the config of ``foundation_model_id`` and override it to
	       80 layers, hidden size 8192, intermediate size 29568, vocabulary
	       size 151936 and bfloat16, then instantiate an empty model shell
	       from that config under ``init_empty_weights()``.
	    2. Load the donor on CPU in bfloat16 and assemble a state dict keyed by
	       the shell's tensor names. Donor tensors with the same shape are
	       copied, vocabulary tensors are truncated along dim 0, and tensors
	       the donor does not have are created fresh.
	    3. Load that state dict into the shell and write the model together
	       with the foundation model's tokenizer to ``output_path``.

	    Args:
	        foundation_model_id: Model id or path providing the Qwen3 config
	            blueprint and the tokenizer.
	        donor_model_id: Model id or path providing the weights.
	        output_path: Directory the aligned model and tokenizer are saved to;
	            created if it does not exist.

	    Raises:
	        ValueError: If a donor tensor differs from the shell in a way that
	            is not a plain vocabulary truncation, or if the shell needs a
	            tensor that the donor lacks and that has no known default.
	    """
	    print("--- Phase 1: Building the target Qwen3 80-Layer Architecture ---")

	    # The foundation config object is mutated in place into the target config.
	    target_config = AutoConfig.from_pretrained(foundation_model_id, trust_remote_code=True)

	    # Target architecture: 80 layers, 72B dimensions, and Qwen3's vocab size
	    target_config.num_hidden_layers = 80
	    target_config.hidden_size = 8192
	    target_config.intermediate_size = 29568
	    target_config.vocab_size = 151936  # Explicitly set Qwen3 vocab size
	    target_config.torch_dtype = torch.bfloat16

	    print("Creating empty Qwen3 80-layer model shell...")
	    with init_empty_weights():
	        aligned_model = AutoModelForCausalLM.from_config(target_config, trust_remote_code=True)
	    aligned_model.tie_weights()
	    print("Empty shell created successfully.")

	    print("\n--- Phase 2: Loading and Manually Aligning Donor Weights ---")
	    print(f"Loading weights from donor: {donor_model_id}")

	    donor_model = AutoModelForCausalLM.from_pretrained(
	        donor_model_id, torch_dtype=torch.bfloat16, device_map="cpu", trust_remote_code=True
	    )
	    donor_state_dict = donor_model.state_dict()
	    del donor_model

	    # The shell's state dict supplies the tensor names and the correct shapes.
	    target_state_dict = aligned_model.state_dict()
	    new_state_dict = {}

	    print("Copying and aligning tensors one-by-one...")
	    for name, target_tensor in tqdm(target_state_dict.items(), desc="Aligning Tensors"):
	        # Pop rather than index: each donor tensor is released as soon as its
	        # copy exists, so the donor and the copy never coexist in full.
	        donor_tensor = donor_state_dict.pop(name, None)
	        if donor_tensor is None:
	            # Tensors that exist in the Qwen3 shell but not in the Qwen2.5
	            # donor (i.e., q_norm.weight and k_norm.weight).
	            print(f" - Keeping initialized tensor for {name} (not in donor)")
	            new_state_dict[name] = _materialize_missing_tensor(name, target_tensor)
	        else:
	            new_state_dict[name] = _align_donor_tensor(
	                name, donor_tensor, target_tensor.shape, target_config.vocab_size
	            )
	        del donor_tensor

	    # Whatever is still in the donor dict has no counterpart in the shell and
	    # is not carried over; say so instead of discarding it silently.
	    if donor_state_dict:
	        example = next(iter(donor_state_dict))
	        print(
	            f" - Dropped {len(donor_state_dict)} donor tensors that have no "
	            f"counterpart in the Qwen3 shell (e.g. {example})"
	        )
	    del donor_state_dict

	    print("Loading the fully aligned state_dict into the Qwen3 shell...")
	    # Every tensor now has the shell's shape and real data; assign=True swaps
	    # the shell's placeholder parameters for these tensors.
	    aligned_model.load_state_dict(new_state_dict, strict=True, assign=True)

	    print("\n--- Phase 3: Saving the Aligned Donor ---")
	    tokenizer = AutoTokenizer.from_pretrained(foundation_model_id, trust_remote_code=True)

	    print(f"Saving the architecturally aligned model to: {output_path}")
	    os.makedirs(output_path, exist_ok=True)
	    aligned_model.save_pretrained(output_path)
	    tokenizer.save_pretrained(output_path)

	    print("\nDonor preparation complete! This is the definitive donor model.")


	def _align_donor_tensor(name, donor_tensor, target_shape, vocab_size):
	    """Return a copy of ``donor_tensor`` that has exactly ``target_shape``.

	    Equal shapes are cloned unchanged. The only mismatch that is resolved is
	    a vocabulary truncation: the target's leading dimension equals
	    ``vocab_size``, all trailing dimensions agree, and the donor has more
	    rows than the target. Anything else raises ``ValueError`` rather than
	    slicing a tensor that was never meant to be truncated.
	    """
	    if donor_tensor.shape == target_shape:
	        return donor_tensor.clone()

	    print(f" - Resolving shape mismatch for {name}:")
	    print(f" Donor shape: {donor_tensor.shape}, Target shape: {target_shape}")

	    is_vocab_truncation = (
	        len(target_shape) >= 1
	        and donor_tensor.dim() == len(target_shape)
	        and target_shape[0] == vocab_size
	        and donor_tensor.shape[1:] == target_shape[1:]
	        and donor_tensor.shape[0] > target_shape[0]
	    )
	    if not is_vocab_truncation:
	        raise ValueError(
	            f"Cannot align {name}: donor shape {tuple(donor_tensor.shape)} differs "
	            f"from target shape {tuple(target_shape)} by more than a truncation "
	            f"of the vocabulary dimension (dim 0)."
	        )

	    # Slicing only dim 0 works for any rank; clone() gives the slice its own
	    # contiguous storage so the full-size donor tensor can be freed.
	    return donor_tensor[: target_shape[0]].clone()


	def _materialize_missing_tensor(name, target_tensor):
	    """Return a data-carrying tensor for a shell entry the donor lacks.

	    The shell is created under ``init_empty_weights()``, so its parameters
	    sit on the meta device and hold no values; cloning one yields another
	    data-less tensor. Normalization scales are therefore created as ones,
	    which leaves a multiplicative norm weight as a no-op. Other meta tensors
	    have no safe default and raise ``ValueError``.
	    """
	    if target_tensor.device.type != "meta":
	        # Real data is present (e.g. a buffer that was actually allocated).
	        return target_tensor.clone()

	    if name.endswith("norm.weight"):
	        return torch.ones(target_tensor.shape, dtype=target_tensor.dtype)

	    raise ValueError(
	        f"Tensor {name} is missing from the donor and the empty shell holds no "
	        f"data for it; refusing to save an uninitialized tensor."
	    )

	if __name__ == "__main__":
	    # Command-line entry point: collect the two model ids and the output
	    # directory, then hand them to main().
	    cli = argparse.ArgumentParser(
	        description="Prepare a Qwen2.5 donor model for merging with Qwen3.",
	    )
	    cli.add_argument(
	        "--foundation_model",
	        type=str,
	        default="Qwen/Qwen3-32B",
	        help="Model to use for the Qwen3 architecture blueprint.",
	    )
	    cli.add_argument(
	        "--donor_model",
	        type=str,
	        default="Qwen/Qwen2.5-72B-Instruct",
	        help="The donor model providing the weights.",
	    )
	    cli.add_argument(
	        "--output_path",
	        type=str,
	        required=True,
	        help="The local directory path to save the prepared donor model.",
	    )
	    options = cli.parse_args()

	    main(
	        foundation_model_id=options.foundation_model,
	        donor_model_id=options.donor_model,
	        output_path=options.output_path,
	    )