Safetensors
qwen3
Qwen3-72B-Embiggened / stage2_v3.py
ehartford's picture
Update stage2_v3.py
aa9e8d9 verified
#!/usr/bin/env python
"""
Stage 2: Expand Qwen3 from 64 to 80 layers using simple duplication
Mapping:
- Layers 0-23 → 0-23 (unchanged)
- Layers 24-39 → 24-55 (each layer duplicated once)
- Layers 40-63 → 56-79 (weights unchanged, indices shifted by 16)
"""
import torch
import os
import json
from tqdm import tqdm
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from safetensors.torch import load_file, save_file
import numpy as np
from collections import OrderedDict
import gc
import shutil
# Configuration
# Directory of the checkpoint that main() reads (weights, config, tokenizer files).
INPUT_DIR = "./Qwen3-58B-Embiggened" # Output from stage 1
# Directory that main() writes the expanded checkpoint and metadata into.
OUTPUT_DIR = "./Qwen3-72B-Embiggened"
# Number of decoder layers the expanded model must contain.
TARGET_LAYERS = 80
# Number of decoder layers in the input model (per the module docstring: 64 -> 80).
SOURCE_LAYERS = 64
def load_model_sharted(model_path):
    """Load all tensors of a sharded safetensors checkpoint into one dict.

    Reads ``model.safetensors.index.json`` inside ``model_path``, collects the
    distinct shard file names from its ``weight_map`` and loads each shard
    with ``load_file``.

    Args:
        model_path: Directory containing the index file and the shard files.

    Returns:
        A dict mapping tensor name to tensor, merged over all shards.

    Raises:
        FileNotFoundError: If the index file does not exist in ``model_path``.
    """
    print("\nπŸ’© Loading sharted weights...")
    index_path = os.path.join(model_path, "model.safetensors.index.json")
    if not os.path.exists(index_path):
        raise FileNotFoundError(f"No index file found at {index_path}")
    with open(index_path, 'r', encoding="utf-8") as f:
        index = json.load(f)
    weight_map = index['weight_map']
    # Sort the shard names: iterating a bare set of strings yields a
    # different order from run to run (hash randomisation), which would make
    # the tensor order -- and therefore the re-sharded output -- unstable.
    shard_files = sorted(set(weight_map.values()))
    all_weights = {}
    for file in tqdm(shard_files, desc="Loading sharts"):
        file_path = os.path.join(model_path, file)
        all_weights.update(load_file(file_path))
    return all_weights
def _parse_shart_size(max_shart_size):
    """Convert a size such as ``"5GB"`` (or a plain byte count) to an int number of bytes."""
    if isinstance(max_shart_size, (int, float)) and not isinstance(max_shart_size, bool):
        max_bytes = int(max_shart_size)
    else:
        text = str(max_shart_size).strip().upper()
        # Decimal units, matching the original 'GB': 1e9 / 'MB': 1e6 table.
        units = {'TB': 1e12, 'GB': 1e9, 'MB': 1e6, 'KB': 1e3}
        for unit, multiplier in units.items():
            if text.endswith(unit):
                number = text[:-len(unit)].strip()
                break
        else:
            raise ValueError(
                f"Unrecognised shard size {max_shart_size!r}; expected e.g. '5GB' or '500MB'"
            )
        try:
            max_bytes = int(float(number) * multiplier)
        except ValueError:
            raise ValueError(
                f"Unrecognised shard size {max_shart_size!r}; expected e.g. '5GB' or '500MB'"
            ) from None
    if max_bytes <= 0:
        raise ValueError(f"Shard size must be positive, got {max_shart_size!r}")
    return max_bytes


def save_model_sharted(state_dict, output_dir, max_shart_size="5GB"):
    """Save model in sharted safetensors format.

    Tensors are packed greedily, in ``state_dict`` iteration order, into
    shards whose summed byte size stays at or below ``max_shart_size``; a
    single tensor larger than the limit gets a shard of its own. Each shard
    is written as ``model-XXXXX-of-YYYYY.safetensors`` and a
    ``model.safetensors.index.json`` with ``metadata.total_size`` and the
    name-to-shard ``weight_map`` is written next to them.

    Args:
        state_dict: Mapping of tensor name to tensor.
        output_dir: Destination directory; created if it does not exist.
        max_shart_size: Size limit per shard, e.g. ``"5GB"`` or ``"500MB"``
            (decimal units), or a plain number of bytes.

    Raises:
        ValueError: If ``max_shart_size`` cannot be interpreted. Raised
            before anything is written to disk.
    """
    print("\nπŸ’© Sharting model weights...")
    # Validate the limit first so a typo fails fast instead of surfacing as
    # an unbound-variable error halfway through the grouping loop.
    max_bytes = _parse_shart_size(max_shart_size)
    os.makedirs(output_dir, exist_ok=True)
    # Group weights into sharts
    sharts = []
    current_shart = {}
    current_size = 0
    total_size = 0
    for name, tensor in state_dict.items():
        tensor_size = tensor.numel() * tensor.element_size()
        total_size += tensor_size
        # Close the current shard when adding this tensor would overflow it;
        # the "and current_shart" guard avoids emitting an empty shard.
        if current_size + tensor_size > max_bytes and current_shart:
            sharts.append(current_shart)
            current_shart = {}
            current_size = 0
        current_shart[name] = tensor
        current_size += tensor_size
    if current_shart:
        sharts.append(current_shart)
    # Save sharts
    weight_map = {}
    for i, shart in enumerate(tqdm(sharts, desc="Saving sharts")):
        shart_name = f"model-{i+1:05d}-of-{len(sharts):05d}.safetensors"
        # Tag the file as a PyTorch checkpoint.
        # NOTE(review): some transformers releases refuse safetensors files
        # whose header lacks a "format" entry -- confirm against the version in use.
        save_file(shart, os.path.join(output_dir, shart_name), metadata={"format": "pt"})
        for name in shart:
            weight_map[name] = shart_name
    # Save index
    index = {
        "metadata": {"total_size": total_size},
        "weight_map": weight_map
    }
    with open(os.path.join(output_dir, "model.safetensors.index.json"), 'w', encoding="utf-8") as f:
        json.dump(index, f, indent=2)
    print(f"πŸ’© Successfully sharted into {len(sharts)} files!")
def extract_layer_weights(weights, layer_idx):
    """Collect the tensors belonging to one decoder layer.

    Args:
        weights: Mapping of full tensor name to tensor.
        layer_idx: Index of the layer to pick out.

    Returns:
        An OrderedDict, in ``weights`` iteration order, keyed by the part of
        the name that follows ``model.layers.<layer_idx>.``; the values are
        the very same tensor objects (no copy is made).
    """
    marker = f"model.layers.{layer_idx}."
    cut = len(marker)
    # The trailing dot in the marker keeps layer 1 from matching layer 10, 11, ...
    return OrderedDict(
        (key[cut:], value)
        for key, value in weights.items()
        if key.startswith(marker)
    )
def create_layer_weights(layer_weights, new_layer_idx):
    """Re-key one layer's tensors under a new layer index.

    Args:
        layer_weights: Mapping of component name (the part after the layer
            prefix) to tensor.
        new_layer_idx: Layer index to place the tensors under.

    Returns:
        An OrderedDict keyed ``model.layers.<new_layer_idx>.<component>``
        whose values are clones of the inputs, so the result never shares
        storage with ``layer_weights``.
    """
    target = f"model.layers.{new_layer_idx}."
    return OrderedDict(
        (target + suffix, source.clone())
        for suffix, source in layer_weights.items()
    )
def verify_architecture(model_path):
    """Verify the model architecture matches expected Qwen3-72B dimensions.

    Loads the model on CPU in bfloat16, compares the shapes of the
    parameters of a sample of layers (and of the non-layer weights that are
    present) against a fixed table, and checks the configured layer count
    against ``TARGET_LAYERS``.

    Args:
        model_path: Directory of the checkpoint to load.

    Returns:
        True only when every checked shape matches, no checked layer
        parameter is missing and ``num_hidden_layers`` equals
        ``TARGET_LAYERS``; False otherwise.
    """
    print("\n" + "="*60)
    print("ARCHITECTURE VERIFICATION")
    print("="*60)
    print("\nLoading model for verification...")
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map="cpu",
        trust_remote_code=True
    )
    expected = {
        "lm_head.weight": (151936, 8192),
        "model.embed_tokens.weight": (151936, 8192),
        "model.layers.0.input_layernorm.weight": (8192,),
        "model.layers.0.mlp.down_proj.weight": (8192, 29568),
        "model.layers.0.mlp.gate_proj.weight": (29568, 8192),
        "model.layers.0.mlp.up_proj.weight": (29568, 8192),
        "model.layers.0.post_attention_layernorm.weight": (8192,),
        "model.layers.0.self_attn.k_norm.weight": (128,),
        "model.layers.0.self_attn.k_proj.weight": (1024, 8192),
        "model.layers.0.self_attn.o_proj.weight": (8192, 8192),
        "model.layers.0.self_attn.q_norm.weight": (128,),
        "model.layers.0.self_attn.q_proj.weight": (8192, 8192),
        "model.layers.0.self_attn.v_proj.weight": (1024, 8192),
        "model.norm.weight": (8192,),
    }
    # Snapshot the shapes once instead of rebuilding the full parameter dict
    # for every (layer, component) pair.
    param_shapes = {name: tuple(param.shape) for name, param in model.named_parameters()}
    per_layer = {k: v for k, v in expected.items() if "layers.0." in k}
    non_layer = {k: v for k, v in expected.items() if "layers.0." not in k}
    all_correct = True
    # Check specific layers including duplicated ones
    check_layers = [0, 24, 25, 39, 40, 56, 79] # Original and duplicated layers
    for layer_idx in check_layers:
        print(f"\nπŸ“ Checking layer {layer_idx}:")
        for base_name, expected_shape in per_layer.items():
            name = base_name.replace("layers.0.", f"layers.{layer_idx}.")
            actual_shape = param_shapes.get(name)
            if actual_shape is None:
                # A sampled layer lacking a parameter means the expansion
                # lost weights; previously this was skipped silently.
                print(f" βœ— {name}: missing (expected {expected_shape})")
                all_correct = False
            elif actual_shape == expected_shape:
                print(f" βœ“ {name.split('.')[-1]}: {actual_shape}")
            else:
                print(f" βœ— {name}: {actual_shape} (expected {expected_shape})")
                all_correct = False
    # The non-layer entries of the table were never compared before.
    print("\nπŸ“ Checking non-layer weights:")
    for name, expected_shape in non_layer.items():
        actual_shape = param_shapes.get(name)
        if actual_shape is None:
            # NOTE(review): presumably a tied lm_head is not listed by
            # named_parameters(); absence is therefore not treated as an error.
            continue
        if actual_shape == expected_shape:
            print(f" βœ“ {name}: {actual_shape}")
        else:
            print(f" βœ— {name}: {actual_shape} (expected {expected_shape})")
            all_correct = False
    num_layers = model.config.num_hidden_layers
    print(f"\nTotal layers: {num_layers} (expected: {TARGET_LAYERS})")
    # The layer count is part of the verdict: returning only all_correct
    # let the function print FAILED and still report success to the caller.
    passed = all_correct and num_layers == TARGET_LAYERS
    if passed:
        print("\nβœ… Architecture verification PASSED!")
    else:
        print("\n❌ Architecture verification FAILED!")
    del model
    gc.collect()
    torch.cuda.empty_cache()
    return passed
def run_diagnostics(model_path):
    """Run comprehensive diagnostics on the expanded model.

    Loads the model with ``device_map="auto"`` and runs four checks:
    sampled generation on fixed prompts (keyword and repetition heuristics),
    perplexity on four short texts, cosine similarity of the outputs of
    layers 24/25 and 39/40, and a scan of all parameters for NaN, Inf or
    zero variance. Generation uses sampling, so the coherence rate can vary
    from run to run.

    Args:
        model_path: Directory of the checkpoint to load.

    Returns:
        True when the coherence rate is at least 60%, the average perplexity
        is below 100 and no weight anomaly was found; False otherwise. The
        layer-similarity result is printed but does not affect the verdict.
    """
    print("\n" + "="*60)
    print("COMPREHENSIVE DIAGNOSTICS")
    print("="*60)
    # Load model and tokenizer
    print("\nLoading model for diagnostics...")
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    # Test generation quality
    print("\nπŸ§ͺ Generation Quality Tests:")
    # (prompt, keywords of which at least one should appear; None = no keyword check)
    test_cases = [
        ("The capital of France is", ["Paris"]),
        ("2 + 2 =", ["4", "four"]),
        ("The quick brown fox", ["jumps", "jumped", "lazy", "dog"]),
        ("Hello, my name is", None),
        ("Water boils at", ["100", "212", "degrees"]),
        ("The Earth orbits the", ["Sun", "solar"]),
        ("Machine learning is a type of", ["artificial intelligence", "AI"]),
        ("Python is a", ["programming", "language", "snake"]),
        ("The largest planet is", ["Jupiter"]),
        ("DNA stands for", ["deoxyribonucleic", "acid"]),
        # Additional tests
        ("The derivative of x squared is", ["2x", "two"]),
        ("Shakespeare wrote", ["plays", "Hamlet", "Romeo"]),
        ("The speed of light is", ["299", "300", "fast"]),
        ("Photosynthesis converts", ["light", "energy", "carbon"]),
        ("The Pythagorean theorem states", ["aΒ²", "squared", "hypotenuse"]),
    ]
    device = model.device
    coherent_count = 0
    total_tests = len(test_cases)
    for prompt, expected in test_cases:
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=20,
                do_sample=True,
                temperature=0.7,
                top_k=50,
                top_p=0.95,
                pad_token_id=tokenizer.pad_token_id,
            )
        # Drop the prompt by token count rather than by character count:
        # the decoded text is not guaranteed to reproduce the prompt string
        # character for character, which would misalign a text slice.
        prompt_len = inputs["input_ids"].shape[1]
        generated_only = tokenizer.decode(
            outputs[0][prompt_len:], skip_special_tokens=True
        ).strip()
        print(f"\n Prompt: '{prompt}'")
        print(f" Generated: '{generated_only}'")
        # Check coherence
        is_coherent = True
        # Check for repetition
        words = generated_only.split()
        if len(words) > 3:
            # Fewer than half of the words being distinct counts as repetition.
            if len(set(words)) < len(words) / 2:
                print(" ⚠️ High repetition detected")
                is_coherent = False
        # Check for expected content
        if expected and len(generated_only) > 0:
            found = any(kw.lower() in generated_only.lower() for kw in expected)
            if found:
                print(" βœ“ Contains expected content")
            else:
                print(" ⚠️ Missing expected keywords")
                is_coherent = False
        if is_coherent and len(words) >= 2:
            coherent_count += 1
    coherence_rate = (coherent_count / total_tests) * 100
    print(f"\nπŸ“Š Overall coherence rate: {coherence_rate:.1f}%")
    # Perplexity test
    print("\nπŸ“ˆ Perplexity Test:")
    test_texts = [
        "The quick brown fox jumps over the lazy dog.",
        "In the beginning was the Word, and the Word was with God.",
        "To be or not to be, that is the question.",
        "E equals m c squared is Einstein's famous equation.",
    ]
    perplexities = []
    for test_text in test_texts:
        inputs = tokenizer(test_text, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model(**inputs, labels=inputs["input_ids"])
        # Perplexity is the exponential of the loss the model reports.
        perplexity = torch.exp(outputs.loss).item()
        perplexities.append(perplexity)
        print(f" '{test_text[:30]}...': {perplexity:.2f}")
    avg_perplexity = np.mean(perplexities)
    print(f"\n Average perplexity: {avg_perplexity:.2f}")
    if avg_perplexity > 100:
        print(" ⚠️ Very high perplexity")
    elif avg_perplexity > 50:
        print(" ⚠️ Moderately high perplexity")
    else:
        print(" βœ“ Reasonable perplexity")
    # Test duplicate layer behavior
    print("\nπŸ”¬ Duplicate Layer Analysis:")
    print("Checking if duplicated layers maintain reasonable behavior...")
    # Get activations from a few layers
    test_input = "The meaning of life is"
    inputs = tokenizer(test_input, return_tensors="pt").to(device)
    activations = {}
    hooks = []
    def get_activation(name):
        def hook(module, args, output):
            # NOTE(review): depending on the transformers version a decoder
            # layer may return a tuple or a bare tensor -- accept both.
            hidden = output[0] if isinstance(output, (tuple, list)) else output
            # Copy to CPU/float32: with device_map="auto" the hooked layers
            # can sit on different devices, and cosine_similarity needs both
            # operands on the same one.
            activations[name] = hidden.detach().float().cpu()
        return hook
    try:
        # Register hooks for duplicate pairs
        for layer_idx in [24, 25, 39, 40]:
            hook = model.model.layers[layer_idx].register_forward_hook(
                get_activation(f'layer_{layer_idx}')
            )
            hooks.append(hook)
        with torch.no_grad():
            _ = model(**inputs)
    finally:
        # Remove hooks even when the forward pass fails
        for hook in hooks:
            hook.remove()
    # Check similarity of duplicates
    if len(activations) >= 4:
        # Check 24 vs 25 (both are copies of input layer 24)
        act_24 = activations['layer_24'].flatten()
        act_25 = activations['layer_25'].flatten()
        similarity_24_25 = torch.cosine_similarity(act_24.unsqueeze(0), act_25.unsqueeze(0)).item()
        # Check 39 vs 40: per the stage-2 mapping these are the second copy
        # of input layer 31 and the first copy of input layer 32, i.e. two
        # different source layers.
        act_39 = activations['layer_39'].flatten()
        act_40 = activations['layer_40'].flatten()
        similarity_39_40 = torch.cosine_similarity(act_39.unsqueeze(0), act_40.unsqueeze(0)).item()
        print(f" Cosine similarity layer 24 vs 25 (duplicate): {similarity_24_25:.4f}")
        print(f" Cosine similarity layer 39 vs 40 (different): {similarity_39_40:.4f}")
        if similarity_24_25 > 0.95:
            print(" βœ“ Duplicate layers show expected high similarity")
        else:
            print(" ⚠️ Duplicate layers diverged more than expected")
    # Weight statistics check
    print("\nπŸ” Weight Statistics (checking for anomalies):")
    anomalies = 0
    for name, param in model.named_parameters():
        if torch.isnan(param).any():
            print(f" ⚠️ {name}: Contains NaN!")
            anomalies += 1
        elif torch.isinf(param).any():
            print(f" ⚠️ {name}: Contains Inf!")
            anomalies += 1
        elif param.std() < 1e-8:
            print(f" ⚠️ {name}: Zero variance!")
            anomalies += 1
    if anomalies == 0:
        print(" βœ“ No anomalies detected in weights")
    # Final summary
    success = coherence_rate >= 60 and avg_perplexity < 100 and anomalies == 0
    print("\n" + "="*60)
    print("DIAGNOSTIC SUMMARY")
    print("="*60)
    if success:
        print("βœ… Model passed all diagnostics!")
        print(" - Good coherence rate")
        print(" - Reasonable perplexity")
        print(" - No weight anomalies")
        print(" - Duplicate layers functioning correctly")
    else:
        print("⚠️ Some issues detected:")
        if coherence_rate < 60:
            print(f" - Low coherence rate: {coherence_rate:.1f}%")
        if avg_perplexity >= 100:
            print(f" - High average perplexity: {avg_perplexity:.2f}")
        if anomalies > 0:
            print(f" - Weight anomalies: {anomalies}")
    del model
    gc.collect()
    torch.cuda.empty_cache()
    return success
def _build_layer_plan(source_layers, dup_start=24, dup_end=40):
    """Return, for every output layer in order, the input layer index it copies.

    Input layers in ``[dup_start, dup_end)`` appear twice in a row; all
    others appear once.
    """
    plan = []
    for old_idx in range(source_layers):
        plan.append(old_idx)
        if dup_start <= old_idx < dup_end:
            plan.append(old_idx)
    return plan


def main():
    """Expand the stage-1 checkpoint from SOURCE_LAYERS to TARGET_LAYERS layers.

    Loads the weights from ``INPUT_DIR``, copies the non-layer tensors,
    rebuilds the layer stack (layers 0-23 copied, 24-39 each written twice,
    40-63 copied to 56-79), writes config, tokenizer files, sharded weights
    and a ``stage2_metadata.json`` into ``OUTPUT_DIR``, then runs
    ``verify_architecture`` and ``run_diagnostics`` on the result.

    Returns:
        True when both verification steps pass. False when they do not, or
        when the layer plan or the input checkpoint is inconsistent (in which
        case nothing is written).
    """
    print("="*60)
    print("Stage 2: Simple Layer Duplication")
    print("64 layers β†’ 80 layers")
    print("="*60)
    # One source of truth for the old->new mapping; checked before any heavy
    # work so a constant mismatch does not cost a full checkpoint load.
    layer_plan = _build_layer_plan(SOURCE_LAYERS)
    if len(layer_plan) != TARGET_LAYERS:
        print(f"\n❌ ERROR: Layer plan yields {len(layer_plan)} layers but expected {TARGET_LAYERS}")
        print("Layer creation failed. Exiting.")
        return False
    # Load weights from stage 1
    print(f"\nπŸ“₯ Loading model from: {INPUT_DIR}")
    weights = load_model_sharted(INPUT_DIR)
    print(f"\nπŸ“Š Loaded {len(weights)} tensors")
    # Layers beyond SOURCE_LAYERS would be dropped without notice.
    if extract_layer_weights(weights, SOURCE_LAYERS):
        print(f"\n❌ ERROR: Input has more than {SOURCE_LAYERS} layers")
        print("Layer creation failed. Exiting.")
        return False
    # Create new weight dictionary
    new_weights = OrderedDict()
    # Copy non-layer weights
    print("\nπŸ“‹ Copying non-layer weights...")
    for name, tensor in weights.items():
        if not name.startswith("model.layers."):
            new_weights[name] = tensor.clone()
    # Layer expansion with progress bar
    print("\nπŸ”„ Expanding layers with simple duplication...")
    print(" Layers 0-23: Direct copy")
    print(" Layers 24-39: Each layer duplicated once")
    print(" Layers 40-63: Direct copy (shifted to 56-79)")
    layer_weights = None
    previous_old_idx = None
    with tqdm(total=TARGET_LAYERS, desc="Creating layers") as pbar:
        for new_layer_idx, old_idx in enumerate(layer_plan):
            if old_idx == previous_old_idx:
                # Second copy of the same input layer: reuse the tensors
                # already extracted; create_layer_weights clones them.
                print(f"\n Duplicating layer {old_idx} β†’ layer {new_layer_idx}")
            else:
                layer_weights = extract_layer_weights(weights, old_idx)
                if not layer_weights:
                    # An absent input layer would otherwise yield an output
                    # layer with no tensors at all.
                    print(f"\n❌ ERROR: No weights found for input layer {old_idx}")
                    print("Layer creation failed. Exiting.")
                    return False
            new_weights.update(create_layer_weights(layer_weights, new_layer_idx))
            previous_old_idx = old_idx
            pbar.update(1)
    new_layer_idx = len(layer_plan)
    print(f"\nβœ“ Created {new_layer_idx} layers")
    # Update config
    print("\nπŸ“ Updating model configuration...")
    config_path = os.path.join(INPUT_DIR, "config.json")
    with open(config_path, 'r', encoding="utf-8") as f:
        config = json.load(f)
    config['num_hidden_layers'] = TARGET_LAYERS
    # NOTE(review): if the config carries a per-layer "layer_types" list it
    # must grow with the layer stack -- expand it along the same mapping.
    layer_types = config.get('layer_types')
    if isinstance(layer_types, list) and len(layer_types) == SOURCE_LAYERS:
        config['layer_types'] = [layer_types[old_idx] for old_idx in layer_plan]
    # NOTE(review): presumably "max_window_layers" equal to the old depth
    # means "all layers"; keep that meaning for the new depth -- confirm.
    if config.get('max_window_layers') == SOURCE_LAYERS:
        config['max_window_layers'] = TARGET_LAYERS
    # Save everything
    print(f"\nπŸ’Ύ Saving expanded model to: {OUTPUT_DIR}")
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    # Save config
    with open(os.path.join(OUTPUT_DIR, "config.json"), 'w', encoding="utf-8") as f:
        json.dump(config, f, indent=2)
    # Copy tokenizer files (each one only if it exists in the input)
    tokenizer_files = [
        'tokenizer.json', 'tokenizer_config.json',
        'special_tokens_map.json', 'generation_config.json',
        'vocab.json', 'merges.txt', 'added_tokens.json',
    ]
    for file in tokenizer_files:
        src = os.path.join(INPUT_DIR, file)
        dst = os.path.join(OUTPUT_DIR, file)
        if os.path.exists(src):
            shutil.copy(src, dst)
    # Save weights in sharted format
    save_model_sharted(new_weights, OUTPUT_DIR)
    # Save metadata
    metadata = {
        "stage": "2-duplicate",
        "source_model": INPUT_DIR,
        "method": "Simple layer duplication",
        "layer_mapping": {
            "0-23": "0-23 (unchanged)",
            "24-39": "24-55 (each duplicated once)",
            "40-63": "56-79 (unchanged)"
        },
        "duplication_info": {
            "method": "exact_copy",
            "layers_duplicated": list(range(24, 40))
        },
        "final_layers": TARGET_LAYERS
    }
    with open(os.path.join(OUTPUT_DIR, "stage2_metadata.json"), 'w', encoding="utf-8") as f:
        json.dump(metadata, f, indent=2)
    print("\nβœ… Stage 2 duplication complete!")
    # Quick verification
    print("\nπŸ” Quick verification:")
    print(f" Total weights: {len(new_weights)}")
    # Count layers (one input_layernorm weight per layer)
    layer_count = sum(
        1
        for name in new_weights
        if name.startswith("model.layers.") and ".input_layernorm.weight" in name
    )
    print(f" Layer count: {layer_count} (expected: {TARGET_LAYERS})")
    # Check duplicate similarity
    print("\nπŸ”¬ Checking layer duplication:")
    test_component = "self_attn.q_proj.weight"
    key_24 = f"model.layers.24.{test_component}"
    key_25 = f"model.layers.25.{test_component}"
    # Check first duplicate pair
    if key_24 in new_weights and key_25 in new_weights:
        # Should be identical
        if torch.equal(new_weights[key_24], new_weights[key_25]):
            print(" βœ“ Layer 24 and 25 are identical (as expected)")
        else:
            print(" ⚠️ Layer 24 and 25 differ (unexpected!)")
    print(f"\nπŸŽ‰ SUCCESS! Model expanded to {TARGET_LAYERS} layers.")
    print(f"πŸ“ Output saved to: {OUTPUT_DIR}")
    # Release both in-memory copies of the weights before the verification
    # steps load the saved model again; otherwise three full copies of the
    # model can be resident at once.
    weights = None
    new_weights = None
    layer_weights = None
    gc.collect()
    # Run full diagnostics
    arch_ok = verify_architecture(OUTPUT_DIR)
    diag_ok = run_diagnostics(OUTPUT_DIR)
    if arch_ok and diag_ok:
        print("\n🎊 FINAL SUCCESS! Your Qwen3-72B-DupeLayers model is ready and verified!")
        print("\nπŸ“ Final architecture:")
        print(" Hidden size: 8192")
        print(" Intermediate size: 29568")
        print(" Attention heads: 64")
        print(" KV heads: 8")
        print(" Layers: 80")
        print(" Vocabulary: 151936")
        print("\nπŸ’‘ The model has passed all quality checks and is ready for use!")
    else:
        print("\n⚠️ Some verification issues detected. Please review the diagnostics above.")
    return arch_ok and diag_ok
if __name__ == "__main__":
    # Exit status 0 when main() reports success, 1 otherwise. SystemExit is
    # raised directly because the bare exit() helper is installed by the
    # site module and is not guaranteed to exist (e.g. under `python -S`).
    raise SystemExit(0 if main() else 1)