cognitivecomputations
/

Qwen3-72B-Synthesis

Model card Files Files and versions

Qwen3-72B-Synthesis / prepare_donor.py

ehartford's picture

Upload folder using huggingface_hub

7e1725c verified 16 days ago

history blame contribute delete

3.09 kB

	# prepare_donor.py
	import torch
	import os
	import argparse
	from tqdm import tqdm
	from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

	def main(source_model_id, output_path):
	    """
	    Build a Qwen3-shaped donor checkpoint from a Qwen2.5 model.

	    The source weights are loaded on CPU, every tensor whose name ends in
	    '.bias' is dropped, and a placeholder 'q_norm.weight' / 'k_norm.weight'
	    pair (all ones) is added for each decoder layer. The edited tensor
	    mapping is then written to `output_path` together with the tokenizer.

	    Args:
	        source_model_id: Hugging Face model ID (or local path) of the
	            Qwen2.5 model to convert.
	        output_path: Directory that receives the converted weights, the
	            config and the tokenizer files. Created if it does not exist.

	    NOTE(review): the config is saved unmodified, so config.json presumably
	    still describes the source architecture; only the weight files carry the
	    Qwen3-style tensor layout. Confirm the downstream merge tool reads the
	    tensors directly.
	    """
	    print(f"Loading source donor model: {source_model_id}")
	    # Load on CPU to save VRAM
	    model = AutoModelForCausalLM.from_pretrained(
	        source_model_id,
	        torch_dtype=torch.bfloat16,
	        device_map="cpu",
	        trust_remote_code=True,
	    )
	    tokenizer = AutoTokenizer.from_pretrained(source_model_id, trust_remote_code=True)
	    config = model.config

	    source_state_dict = model.state_dict()

	    # --- Part 1: Remove '.bias' tensors ---
	    print("Removing all '.bias' tensors...")
	    new_state_dict = {
	        name: tensor
	        for name, tensor in tqdm(source_state_dict.items(), desc="Filtering Tensors")
	        if not name.endswith(".bias")
	    }

	    # --- Part 2: Add placeholder 'q_norm' and 'k_norm' tensors ---
	    print("Adding placeholder 'q_norm' and 'k_norm' tensors...")
	    # These norms are 1D vectors of size `head_dim`. Prefer an explicit
	    # `head_dim` from the config when it is set; otherwise derive it from the
	    # hidden size (8192 // 64 == 128 for Qwen2.5-72B).
	    norm_dim = getattr(config, "head_dim", None) or (
	        config.hidden_size // config.num_attention_heads
	    )
	    # A value of 1.0 is a standard, neutral initialization for a norm weight.
	    placeholder_norm = torch.ones(norm_dim, dtype=torch.bfloat16)

	    for i in tqdm(range(config.num_hidden_layers), desc="Adding Norm Tensors"):
	        # Each entry gets its own clone: safetensors refuses to serialize
	        # tensors that alias the same storage.
	        new_state_dict[f"model.layers.{i}.self_attn.q_norm.weight"] = placeholder_norm.clone()
	        new_state_dict[f"model.layers.{i}.self_attn.k_norm.weight"] = placeholder_norm.clone()

	    # Do NOT round-trip the edited mapping through model.load_state_dict():
	    # with strict=False the module tree keeps its existing '.bias' parameters
	    # (missing keys are left untouched) and silently discards the new
	    # 'q_norm'/'k_norm' entries (unexpected keys are ignored), so a plain
	    # save_pretrained() would write the unmodified source model. Passing the
	    # mapping via `state_dict=` makes save_pretrained serialize exactly the
	    # tensors prepared above.
	    print(f"Saving the architecturally aligned model to: {output_path}")
	    os.makedirs(output_path, exist_ok=True)
	    model.save_pretrained(output_path, state_dict=new_state_dict)
	    tokenizer.save_pretrained(output_path)

	    print("\nDonor preparation complete!")
	    print(f"The aligned donor is ready at '{output_path}'.")

	if __name__ == "__main__":
	    # Command-line entry point.
	    # Example: python prepare_donor.py --output_path ./Qwen2.5-72B-Instruct-Aligned
	    cli = argparse.ArgumentParser(
	        description="Prepare a Qwen2.5 donor model for merging with Qwen3.",
	    )
	    # Optional: which model to convert (defaults to the 72B instruct model).
	    cli.add_argument(
	        "--source_model",
	        type=str,
	        default="Qwen/Qwen2.5-72B-Instruct",
	        help="The Hugging Face model ID of the source model.",
	    )
	    # Required: where the converted donor is written.
	    cli.add_argument(
	        "--output_path",
	        type=str,
	        required=True,
	        help="The local directory path to save the prepared donor model.",
	    )
	    options = cli.parse_args()

	    main(source_model_id=options.source_model, output_path=options.output_path)