|
|
|
""" |
|
Stage 2: Expand Qwen3 from 64 to 80 layers using simple duplication |
|
Mapping: |
|
- Layers 0-23 β 0-23 (unchanged) |
|
- Layers 24-39 β 24-55 (each layer duplicated once) |
|
- Layers 40-63 β 56-79 (unchanged) |
|
""" |
|
|
|
import torch |
|
import os |
|
import json |
|
from tqdm import tqdm |
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
from safetensors.torch import load_file, save_file |
|
import numpy as np |
|
from collections import OrderedDict |
|
import gc |
|
import shutil |
|
|
|
|
|
INPUT_DIR = "./Qwen3-58B-Embiggened" |
|
OUTPUT_DIR = "./Qwen3-72B-Embiggened" |
|
TARGET_LAYERS = 80 |
|
SOURCE_LAYERS = 64 |
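# Layer math: 24 passthrough + (16 source layers x 2 copies = 32) + 24 passthrough = 80.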
|
|
|
def load_model_sharted(model_path): |
|
"""Load model weights from sharted safetensors files.""" |
|
print("\nπ© Loading sharted weights...") |
|
|
|
index_path = os.path.join(model_path, "model.safetensors.index.json") |
|
|
|
if not os.path.exists(index_path): |
|
raise FileNotFoundError(f"No index file found at {index_path}") |
|
|
|
with open(index_path, 'r') as f: |
|
index = json.load(f) |
|
|
|
weight_map = index['weight_map'] |
|
unique_files = set(weight_map.values()) |
|
|
|
all_weights = {} |
|
for file in tqdm(unique_files, desc="Loading sharts"): |
|
file_path = os.path.join(model_path, file) |
|
weights = load_file(file_path) |
|
all_weights.update(weights) |
|
|
|
return all_weights |
|
|
|
def save_model_sharted(state_dict, output_dir, max_shart_size="5GB"): |
|
"""Save model in sharted safetensors format.""" |
|
print("\nπ© Sharting model weights...") |
|
|
|
os.makedirs(output_dir, exist_ok=True) |
|
|
|
|
|
    # Parse a human-readable size limit like "5GB" or "500MB" into bytes.
    size_map = {'GB': 1e9, 'MB': 1e6}
    for unit, multiplier in size_map.items():
        if unit in max_shart_size:
            max_bytes = int(float(max_shart_size.replace(unit, '')) * multiplier)
            break
    else:
        raise ValueError(f"Unrecognized unit in max_shart_size={max_shart_size!r}; use 'GB' or 'MB'.")
|
|
|
|
|
sharts = [] |
|
current_shart = {} |
|
current_size = 0 |
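    # Greedy first-fit packing: start a new file once the current one would exceed max_bytes.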
|
|
|
for name, tensor in state_dict.items(): |
|
tensor_size = tensor.numel() * tensor.element_size() |
|
|
|
if current_size + tensor_size > max_bytes and current_shart: |
|
sharts.append(current_shart) |
|
current_shart = {} |
|
current_size = 0 |
|
|
|
current_shart[name] = tensor |
|
current_size += tensor_size |
|
|
|
if current_shart: |
|
sharts.append(current_shart) |
|
|
|
|
|
weight_map = {} |
|
for i, shart in enumerate(tqdm(sharts, desc="Saving sharts")): |
|
shart_name = f"model-{i+1:05d}-of-{len(sharts):05d}.safetensors" |
|
save_file(shart, os.path.join(output_dir, shart_name)) |
|
|
|
for name in shart: |
|
weight_map[name] = shart_name |
|
|
|
|
|
index = { |
|
"metadata": {"total_size": sum(t.numel() * t.element_size() for t in state_dict.values())}, |
|
"weight_map": weight_map |
|
} |
|
|
|
with open(os.path.join(output_dir, "model.safetensors.index.json"), 'w') as f: |
|
json.dump(index, f, indent=2) |
|
|
|
print(f"π© Successfully sharted into {len(sharts)} files!") |
|
|
|
def extract_layer_weights(weights, layer_idx): |
|
"""Extract all weights for a specific layer.""" |
|
layer_weights = OrderedDict() |
|
prefix = f"model.layers.{layer_idx}." |
|
|
|
for name, tensor in weights.items(): |
|
if name.startswith(prefix): |
|
|
|
component_name = name[len(prefix):] |
|
layer_weights[component_name] = tensor |
|
|
|
return layer_weights |
|
|
|
def create_layer_weights(layer_weights, new_layer_idx): |
|
"""Create weight dict with new layer index.""" |
|
result = OrderedDict() |
|
prefix = f"model.layers.{new_layer_idx}." |
|
|
|
for component_name, tensor in layer_weights.items(): |
|
full_name = prefix + component_name |
|
result[full_name] = tensor.clone() |
|
|
|
return result |
|
|
|
def verify_architecture(model_path): |
|
"""Verify the model architecture matches expected Qwen3-72B dimensions.""" |
|
print("\n" + "="*60) |
|
print("ARCHITECTURE VERIFICATION") |
|
print("="*60) |
|
|
|
print("\nLoading model for verification...") |
|
model = AutoModelForCausalLM.from_pretrained( |
|
model_path, |
|
torch_dtype=torch.bfloat16, |
|
device_map="cpu", |
|
trust_remote_code=True |
|
) |
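
    # Reference geometry: hidden 8192, MLP 29568, 64 query heads / 8 KV heads (head_dim 128).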
|
|
|
expected = { |
|
"lm_head.weight": (151936, 8192), |
|
"model.embed_tokens.weight": (151936, 8192), |
|
"model.layers.0.input_layernorm.weight": (8192,), |
|
"model.layers.0.mlp.down_proj.weight": (8192, 29568), |
|
"model.layers.0.mlp.gate_proj.weight": (29568, 8192), |
|
"model.layers.0.mlp.up_proj.weight": (29568, 8192), |
|
"model.layers.0.post_attention_layernorm.weight": (8192,), |
|
"model.layers.0.self_attn.k_norm.weight": (128,), |
|
"model.layers.0.self_attn.k_proj.weight": (1024, 8192), |
|
"model.layers.0.self_attn.o_proj.weight": (8192, 8192), |
|
"model.layers.0.self_attn.q_norm.weight": (128,), |
|
"model.layers.0.self_attn.q_proj.weight": (8192, 8192), |
|
"model.layers.0.self_attn.v_proj.weight": (1024, 8192), |
|
"model.norm.weight": (8192,), |
|
} |
|
|
|
all_correct = True |
|
|
|
|
|
check_layers = [0, 24, 25, 39, 40, 56, 79] |
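    # Spans the untouched head (0), the duplicated middle (24/25, 39/40), and the shifted tail (56, 79).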
|
|
|
    # Build the parameter dict once; reconstructing it per parameter is very slow.
    param_dict = dict(model.named_parameters())

    for layer_idx in check_layers:
        print(f"\n🔍 Checking layer {layer_idx}:")
        for base_name, expected_shape in expected.items():
            if "layers.0." not in base_name:
                continue
            name = base_name.replace("layers.0.", f"layers.{layer_idx}.")
            if name not in param_dict:
                print(f"  ❌ {name}: missing from model")
                all_correct = False
                continue
            actual_shape = tuple(param_dict[name].shape)
            if actual_shape == expected_shape:
                print(f"  ✓ {name.split('.')[-1]}: {actual_shape}")
            else:
                print(f"  ❌ {name}: {actual_shape} (expected {expected_shape})")
                all_correct = False
|
|
|
    num_layers = model.config.num_hidden_layers
    print(f"\nTotal layers: {num_layers} (expected: {TARGET_LAYERS})")

    if all_correct and num_layers == TARGET_LAYERS:
        print("\n✅ Architecture verification PASSED!")
    else:
        print("\n❌ Architecture verification FAILED!")
|
|
|
    del model
    gc.collect()
    torch.cuda.empty_cache()
|
return all_correct |
|
|
|
def run_diagnostics(model_path): |
|
"""Run comprehensive diagnostics on the expanded model.""" |
|
print("\n" + "="*60) |
|
print("COMPREHENSIVE DIAGNOSTICS") |
|
print("="*60) |
|
|
|
|
|
print("\nLoading model for diagnostics...") |
|
model = AutoModelForCausalLM.from_pretrained( |
|
model_path, |
|
torch_dtype=torch.bfloat16, |
|
device_map="auto", |
|
trust_remote_code=True |
|
) |
|
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) |
|
|
|
|
|
print("\nπ§ͺ Generation Quality Tests:") |
|
test_cases = [ |
|
("The capital of France is", ["Paris"]), |
|
("2 + 2 =", ["4", "four"]), |
|
("The quick brown fox", ["jumps", "jumped", "lazy", "dog"]), |
|
("Hello, my name is", None), |
|
("Water boils at", ["100", "212", "degrees"]), |
|
("The Earth orbits the", ["Sun", "solar"]), |
|
("Machine learning is a type of", ["artificial intelligence", "AI"]), |
|
("Python is a", ["programming", "language", "snake"]), |
|
("The largest planet is", ["Jupiter"]), |
|
("DNA stands for", ["deoxyribonucleic", "acid"]), |
|
|
|
("The derivative of x squared is", ["2x", "two"]), |
|
("Shakespeare wrote", ["plays", "Hamlet", "Romeo"]), |
|
("The speed of light is", ["299", "300", "fast"]), |
|
("Photosynthesis converts", ["light", "energy", "carbon"]), |
|
("The Pythagorean theorem states", ["aΒ²", "squared", "hypotenuse"]), |
|
] |
|
|
|
device = model.device |
|
coherent_count = 0 |
|
total_tests = len(test_cases) |
|
|
|
for prompt, expected in test_cases: |
|
inputs = tokenizer(prompt, return_tensors="pt").to(device) |
|
|
|
with torch.no_grad(): |
|
outputs = model.generate( |
|
**inputs, |
|
max_new_tokens=20, |
|
do_sample=True, |
|
temperature=0.7, |
|
top_k=50, |
|
top_p=0.95, |
|
                pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id,
|
) |
|
|
|
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True) |
|
generated_only = generated_text[len(prompt):].strip() |
|
|
|
print(f"\n Prompt: '{prompt}'") |
|
print(f" Generated: '{generated_only}'") |
|
|
|
|
|
is_coherent = True |
|
|
|
|
|
words = generated_only.split() |
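        # Crude repetition heuristic: flag generations where more than half the words repeat.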
|
if len(words) > 3: |
|
if len(set(words)) < len(words) / 2: |
|
print(" β οΈ High repetition detected") |
|
is_coherent = False |
|
|
|
|
|
if expected and len(generated_only) > 0: |
|
found = any(kw.lower() in generated_only.lower() for kw in expected) |
|
if found: |
|
print(" β Contains expected content") |
|
else: |
|
print(" β οΈ Missing expected keywords") |
|
is_coherent = False |
|
|
|
if is_coherent and len(generated_only.split()) >= 2: |
|
coherent_count += 1 |
|
|
|
coherence_rate = (coherent_count / total_tests) * 100 |
|
print(f"\nπ Overall coherence rate: {coherence_rate:.1f}%") |
|
|
|
|
|
print("\nπ Perplexity Test:") |
|
test_texts = [ |
|
"The quick brown fox jumps over the lazy dog.", |
|
"In the beginning was the Word, and the Word was with God.", |
|
"To be or not to be, that is the question.", |
|
"E equals m c squared is Einstein's famous equation.", |
|
] |
|
|
|
perplexities = [] |
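    # Perplexity = exp(mean cross-entropy); HF causal LMs return the mean CE loss when labels are supplied.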
|
for test_text in test_texts: |
|
inputs = tokenizer(test_text, return_tensors="pt").to(device) |
|
|
|
with torch.no_grad(): |
|
outputs = model(**inputs, labels=inputs["input_ids"]) |
|
perplexity = torch.exp(outputs.loss).item() |
|
perplexities.append(perplexity) |
|
|
|
print(f" '{test_text[:30]}...': {perplexity:.2f}") |
|
|
|
avg_perplexity = np.mean(perplexities) |
|
print(f"\n Average perplexity: {avg_perplexity:.2f}") |
|
|
|
    if avg_perplexity > 100:
        print("  ⚠️ Very high perplexity")
    elif avg_perplexity > 50:
        print("  ⚠️ Moderately high perplexity")
    else:
        print("  ✓ Reasonable perplexity")
|
|
|
|
|
print("\n㪠Duplicate Layer Analysis:") |
|
print("Checking if duplicated layers maintain reasonable behavior...") |
|
|
|
|
|
test_input = "The meaning of life is" |
|
inputs = tokenizer(test_input, return_tensors="pt").to(device) |
|
|
|
activations = {} |
|
hooks = [] |
|
|
|
def get_activation(name): |
|
def hook(model, input, output): |
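            # HF decoder layers return a tuple; element 0 is the hidden-states tensor.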
|
activations[name] = output[0].detach() |
|
return hook |
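
    # New layers 24/25 are both copies of source layer 24, while new layers 39/40 come from
    # different source layers (31 and 32), so the first pair should look far more similar.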
|
|
|
|
|
for layer_idx in [24, 25, 39, 40]: |
|
hook = model.model.layers[layer_idx].register_forward_hook( |
|
get_activation(f'layer_{layer_idx}') |
|
) |
|
hooks.append(hook) |
|
|
|
with torch.no_grad(): |
|
_ = model(**inputs) |
|
|
|
|
|
for hook in hooks: |
|
hook.remove() |
|
|
|
|
|
if len(activations) >= 4: |
|
|
|
act_24 = activations['layer_24'].flatten() |
|
act_25 = activations['layer_25'].flatten() |
|
similarity_24_25 = torch.cosine_similarity(act_24.unsqueeze(0), act_25.unsqueeze(0)).item() |
|
|
|
|
|
act_39 = activations['layer_39'].flatten() |
|
act_40 = activations['layer_40'].flatten() |
|
similarity_39_40 = torch.cosine_similarity(act_39.unsqueeze(0), act_40.unsqueeze(0)).item() |
|
|
|
print(f" Cosine similarity layer 24 vs 25 (duplicate): {similarity_24_25:.4f}") |
|
print(f" Cosine similarity layer 39 vs 40 (different): {similarity_39_40:.4f}") |
|
|
|
        if similarity_24_25 > 0.95:
            print("  ✓ Duplicate layers show expected high similarity")
        else:
            print("  ⚠️ Duplicate layers diverged more than expected")
|
|
|
|
|
print("\nπ Weight Statistics (checking for anomalies):") |
|
anomalies = 0 |
|
|
|
    for name, param in model.named_parameters():
        if torch.isnan(param).any():
            print(f"  ⚠️ {name}: Contains NaN!")
            anomalies += 1
        elif torch.isinf(param).any():
            print(f"  ⚠️ {name}: Contains Inf!")
            anomalies += 1
        elif param.std() < 1e-8:
            print(f"  ⚠️ {name}: Zero variance!")
            anomalies += 1
|
|
|
if anomalies == 0: |
|
print(" β No anomalies detected in weights") |
|
|
|
|
|
success = coherence_rate >= 60 and avg_perplexity < 100 and anomalies == 0 |
|
|
|
print("\n" + "="*60) |
|
print("DIAGNOSTIC SUMMARY") |
|
print("="*60) |
|
|
|
    if success:
        print("✅ Model passed all diagnostics!")
|
print(" - Good coherence rate") |
|
print(" - Reasonable perplexity") |
|
print(" - No weight anomalies") |
|
print(" - Duplicate layers functioning correctly") |
|
else: |
|
print("β οΈ Some issues detected:") |
|
if coherence_rate < 60: |
|
print(f" - Low coherence rate: {coherence_rate:.1f}%") |
|
if avg_perplexity >= 100: |
|
print(f" - High average perplexity: {avg_perplexity:.2f}") |
|
if anomalies > 0: |
|
print(f" - Weight anomalies: {anomalies}") |
|
|
|
    del model
    gc.collect()
    torch.cuda.empty_cache()
|
return success |
|
|
|
def main(): |
|
print("="*60) |
|
print("Stage 2: Simple Layer Duplication") |
|
print("64 layers β 80 layers") |
|
print("="*60) |
|
|
|
|
|
print(f"\nπ₯ Loading model from: {INPUT_DIR}") |
|
weights = load_model_sharted(INPUT_DIR) |
|
|
|
print(f"\nπ Loaded {len(weights)} tensors") |
|
|
|
|
|
new_weights = OrderedDict() |
|
|
|
|
|
print("\nπ Copying non-layer weights...") |
|
for name, tensor in weights.items(): |
|
if not name.startswith("model.layers."): |
|
new_weights[name] = tensor.clone() |
|
|
|
|
|
print("\nπ Expanding layers with simple duplication...") |
|
print(" Layers 0-23: Direct copy") |
|
print(" Layers 24-39: Each layer duplicated once") |
|
print(" Layers 40-63: Direct copy (shifted to 56-79)") |
|
|
|
new_layer_idx = 0 |
|
|
|
with tqdm(total=TARGET_LAYERS, desc="Creating layers") as pbar: |
|
|
|
for old_idx in range(24): |
|
layer_weights = extract_layer_weights(weights, old_idx) |
|
new_weights.update(create_layer_weights(layer_weights, new_layer_idx)) |
|
new_layer_idx += 1 |
|
pbar.update(1) |
|
|
|
|
|
for old_idx in range(24, 40): |
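            # Each source layer 24-39 is written twice: once here, then again as its duplicate below.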
|
|
|
layer_weights = extract_layer_weights(weights, old_idx) |
|
new_weights.update(create_layer_weights(layer_weights, new_layer_idx)) |
|
new_layer_idx += 1 |
|
pbar.update(1) |
|
|
|
|
|
print(f"\n Duplicating layer {old_idx} β layer {new_layer_idx}") |
|
new_weights.update(create_layer_weights(layer_weights, new_layer_idx)) |
|
new_layer_idx += 1 |
|
pbar.update(1) |
|
|
|
|
|
        for old_idx in range(40, SOURCE_LAYERS):
|
layer_weights = extract_layer_weights(weights, old_idx) |
|
new_weights.update(create_layer_weights(layer_weights, new_layer_idx)) |
|
new_layer_idx += 1 |
|
pbar.update(1) |
|
|
|
print(f"\nβ Created {new_layer_idx} layers") |
|
|
|
|
|
if new_layer_idx != TARGET_LAYERS: |
|
print(f"\nβ ERROR: Created {new_layer_idx} layers but expected {TARGET_LAYERS}") |
|
print("Layer creation failed. Exiting.") |
|
return False |
|
|
|
|
|
print("\nπ Updating model configuration...") |
|
config_path = os.path.join(INPUT_DIR, "config.json") |
|
with open(config_path, 'r') as f: |
|
config = json.load(f) |
|
|
|
config['num_hidden_layers'] = TARGET_LAYERS |
|
|
|
|
|
print(f"\nπΎ Saving expanded model to: {OUTPUT_DIR}") |
|
os.makedirs(OUTPUT_DIR, exist_ok=True) |
|
|
|
|
|
with open(os.path.join(OUTPUT_DIR, "config.json"), 'w') as f: |
|
json.dump(config, f, indent=2) |
|
|
|
|
|
tokenizer_files = [ |
|
'tokenizer.json', 'tokenizer_config.json', |
|
'special_tokens_map.json', 'generation_config.json' |
|
] |
|
|
|
for file in tokenizer_files: |
|
src = os.path.join(INPUT_DIR, file) |
|
dst = os.path.join(OUTPUT_DIR, file) |
|
if os.path.exists(src): |
|
shutil.copy(src, dst) |
|
|
|
|
|
save_model_sharted(new_weights, OUTPUT_DIR) |
|
|
|
|
|
metadata = { |
|
"stage": "2-duplicate", |
|
"source_model": INPUT_DIR, |
|
"method": "Simple layer duplication", |
|
"layer_mapping": { |
|
"0-23": "0-23 (unchanged)", |
|
"24-39": "24-55 (each duplicated once)", |
|
"40-63": "56-79 (unchanged)" |
|
}, |
|
"duplication_info": { |
|
"method": "exact_copy", |
|
"layers_duplicated": list(range(24, 40)) |
|
}, |
|
"final_layers": TARGET_LAYERS |
|
} |
|
|
|
with open(os.path.join(OUTPUT_DIR, "stage2_metadata.json"), 'w') as f: |
|
json.dump(metadata, f, indent=2) |
|
|
|
print("\nβ
Stage 2 duplication complete!") |
|
|
|
|
|
print("\nπ Quick verification:") |
|
print(f" Total weights: {len(new_weights)}") |
|
|
|
|
|
    layer_count = sum(
        1 for name in new_weights
        if name.startswith("model.layers.") and ".input_layernorm.weight" in name
    )
|
|
|
print(f" Layer count: {layer_count} (expected: {TARGET_LAYERS})") |
|
|
|
|
|
print("\n㪠Checking layer duplication:") |
|
test_component = "self_attn.q_proj.weight" |
|
|
|
|
|
if f"model.layers.24.{test_component}" in new_weights and f"model.layers.25.{test_component}" in new_weights: |
|
layer24 = new_weights[f"model.layers.24.{test_component}"] |
|
layer25 = new_weights[f"model.layers.25.{test_component}"] |
|
|
|
|
|
        if torch.equal(layer24, layer25):
            print("  ✓ Layers 24 and 25 are identical (as expected)")
        else:
            print("  ⚠️ Layers 24 and 25 differ (unexpected!)")
|
|
|
print(f"\nπ SUCCESS! Model expanded to {TARGET_LAYERS} layers.") |
|
print(f"π Output saved to: {OUTPUT_DIR}") |
|
|
|
|
|
arch_ok = verify_architecture(OUTPUT_DIR) |
|
diag_ok = run_diagnostics(OUTPUT_DIR) |
|
|
|
if arch_ok and diag_ok: |
|
print("\nπ FINAL SUCCESS! Your Qwen3-72B-DupeLayers model is ready and verified!") |
|
print("\nπ Final architecture:") |
|
print(" Hidden size: 8192") |
|
print(" Intermediate size: 29568") |
|
print(" Attention heads: 64") |
|
print(" KV heads: 8") |
|
print(" Layers: 80") |
|
print(" Vocabulary: 151936") |
|
print("\nπ‘ The model has passed all quality checks and is ready for use!") |
|
else: |
|
print("\nβ οΈ Some verification issues detected. Please review the diagnostics above.") |
|
|
|
return arch_ok and diag_ok |
|
|
|
if __name__ == "__main__": |
|
success = main() |
|
exit(0 if success else 1) |
|
|