cognitivecomputations
/

Qwen3-72B-Synthesis

Model card Files Files and versions

Qwen3-72B-Synthesis / prepare_donor_v2.py

ehartford's picture

Upload folder using huggingface_hub

7e1725c verified 16 days ago

history blame contribute delete

3.62 kB

	# prepare_donor_v2.py
	import torch
	import os
	import argparse
	from tqdm import tqdm
	from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
	from accelerate import init_empty_weights

	def _materialize_placeholder_norms(model, dtype):
	    """Give every still-unloaded norm weight real storage filled with 1.0.

	    Parameters created under ``init_empty_weights()`` live on the ``meta``
	    device and carry no data. Any parameter that the loaded checkpoint did not
	    supply is therefore still a data-less placeholder after
	    ``load_state_dict(..., assign=True)`` and cannot be serialized.

	    Args:
	        model: Module whose parameters are inspected and patched in place.
	        dtype: dtype of the newly allocated tensors.

	    Returns:
	        The names of the parameters that were materialized.

	    Raises:
	        RuntimeError: If a parameter other than a ``*norm.weight`` is still on
	            the meta device; there is no neutral value to give such a tensor.
	    """
	    # Snapshot first: the loop below swaps parameters on the module tree.
	    pending = [(name, param) for name, param in model.named_parameters() if param.is_meta]

	    unfillable = [name for name, _ in pending if not name.endswith("norm.weight")]
	    if unfillable:
	        preview = ", ".join(unfillable[:5])
	        raise RuntimeError(
	            f"{len(unfillable)} non-norm parameter(s) were not provided by the donor "
	            f"and have no neutral default (first few: {preview})."
	        )

	    for name, placeholder in pending:
	        owner_path, _, attr_name = name.rpartition(".")
	        owner = model.get_submodule(owner_path)
	        ones = torch.ones(placeholder.shape, dtype=dtype, device="cpu")
	        # Assigning an nn.Parameter re-registers it under the same name.
	        setattr(owner, attr_name, torch.nn.Parameter(ones, requires_grad=placeholder.requires_grad))

	    return [name for name, _ in pending]


	def main(foundation_model_id, donor_model_id, output_path):
	    """Create an architecturally 'aligned' donor model and save it to disk.

	    1. Defines a target Qwen3 80-layer architecture using the foundation config.
	    2. Creates an empty model 'shell' with this pure Qwen3 architecture.
	    3. Fills the shell with weights from the Qwen2.5 donor, discarding donor
	       tensors that have no counterpart in the shell.
	    4. Fills the norm weights the donor lacks with 1.0 as neutral placeholders.
	    5. Saves the model together with the foundation model's tokenizer.

	    Args:
	        foundation_model_id: Model id or path providing the config blueprint
	            and the tokenizer.
	        donor_model_id: Model id or path providing the weights.
	        output_path: Directory the aligned model and tokenizer are written to;
	            created if it does not exist.

	    Raises:
	        RuntimeError: If, after loading, a non-norm parameter of the shell is
	            still without data (see ``_materialize_placeholder_norms``).
	    """
	    print("--- Phase 1: Building the target Qwen3 80-Layer Architecture ---")

	    # Load the CONFIG from our Qwen3 foundation to get the correct blueprint
	    foundation_config = AutoConfig.from_pretrained(
	        foundation_model_id, trust_remote_code=True
	    )

	    # Modify the config to match the target 72B size and 80 layers
	    # (presumably the donor's own dimensions -- confirm against its config).
	    foundation_config.num_hidden_layers = 80
	    foundation_config.hidden_size = 8192
	    foundation_config.intermediate_size = 29568
	    foundation_config.torch_dtype = torch.bfloat16
	    # NOTE(review): every field not overridden above (vocab_size, head counts,
	    # ...) is inherited from the foundation config. load_state_dict raises on
	    # shape mismatches even with strict=False, so this assumes the donor's
	    # tensors have the shapes this config produces -- confirm.

	    # Create an empty 'shell' of the final model. Its parameters live on the
	    # meta device, so this allocates no weight memory.
	    # Its config.json will be a pure Qwen3 config.
	    print("Creating empty Qwen3 80-layer model shell...")
	    with init_empty_weights():
	        aligned_model = AutoModelForCausalLM.from_config(
	            foundation_config, trust_remote_code=True
	        )
	    aligned_model.tie_weights()
	    print("Empty shell created successfully.")

	    print("\n--- Phase 2: Loading Donor Weights ---")
	    print(f"Loading weights from donor: {donor_model_id}")

	    # Load the donor model on CPU to get its state dict
	    donor_model = AutoModelForCausalLM.from_pretrained(
	        donor_model_id, torch_dtype=torch.bfloat16, device_map="cpu", trust_remote_code=True
	    )
	    donor_state_dict = donor_model.state_dict()
	    # Only the module wrapper is dropped here; the tensors stay alive through
	    # donor_state_dict and are handed over to the shell below.
	    del donor_model

	    # We will now load the donor weights into our pure Qwen3 shell.
	    # strict=False is essential because the donor has '.bias' tensors that our
	    # shell doesn't, and our shell has '.norm' tensors the donor doesn't.
	    # assign=True makes the shell adopt the donor tensors directly, which is
	    # required because the shell's own meta parameters cannot be copied into.
	    print("Loading donor state_dict into the Qwen3 shell (strict=False)...")
	    load_result = aligned_model.load_state_dict(donor_state_dict, strict=False, assign=True)
	    print(f"Donor tensors with no counterpart in the shell (discarded): {len(load_result.unexpected_keys)}")
	    print(f"Shell tensors not supplied by the donor: {len(load_result.missing_keys)}")

	    # The '.norm' weights the donor lacks are still data-less meta tensors at
	    # this point (the shell was never initialized), so they must be given real
	    # storage before saving. Ones are the neutral placeholder we want.
	    filled = _materialize_placeholder_norms(aligned_model, torch.bfloat16)
	    print(f"Initialized {len(filled)} placeholder norm weight(s) to 1.0.")

	    print("\n--- Phase 3: Saving the Aligned Donor ---")
	    tokenizer = AutoTokenizer.from_pretrained(foundation_model_id, trust_remote_code=True)

	    print(f"Saving the architecturally aligned model to: {output_path}")
	    os.makedirs(output_path, exist_ok=True)
	    aligned_model.save_pretrained(output_path)
	    tokenizer.save_pretrained(output_path)

	    print("\nDonor preparation complete! This is the definitive donor model.")

	if __name__ == "__main__":
	    # Command-line entry point: collect the two model ids and the output
	    # directory, then hand them to main().
	    cli = argparse.ArgumentParser(
	        description="Prepare a Qwen2.5 donor model for merging with Qwen3."
	    )
	    cli.add_argument(
	        "--foundation_model",
	        type=str,
	        default="Qwen/Qwen3-32B",
	        help="Model to use for the Qwen3 architecture blueprint.",
	    )
	    cli.add_argument(
	        "--donor_model",
	        type=str,
	        default="Qwen/Qwen2.5-72B-Instruct",
	        help="The donor model providing the weights.",
	    )
	    cli.add_argument(
	        "--output_path",
	        type=str,
	        required=True,
	        help="The local directory path to save the prepared donor model.",
	    )
	    opts = cli.parse_args()

	    main(
	        foundation_model_id=opts.foundation_model,
	        donor_model_id=opts.donor_model,
	        output_path=opts.output_path,
	    )