from typing import Any, Dict

import torch
import torch.nn as nn

from .model import BitTransformerLM


def expand_model(model: BitTransformerLM, new_params: Dict[str, Any]) -> BitTransformerLM:
"""Return a new model with updated params and copied weights.""" | |
new_model = BitTransformerLM(**new_params) | |
new_state = new_model.state_dict() | |
old_state = model.state_dict() | |
    # Copy parameters shared by both models, handling shape mismatches.
    for k, v in old_state.items():
        if k in new_state:
            dest = new_state[k]
            # Overlapping region: the element-wise minimum of the two shapes.
            slices = tuple(slice(0, min(d, s)) for d, s in zip(dest.shape, v.shape))
            dest[slices].copy_(v[slices])
            if dest.shape != v.shape:
                # Initialize the newly added region outside the copied block.
                mask = torch.ones_like(dest, dtype=torch.bool)
                mask[slices] = False
                if "bias" in k:
                    dest[mask] = 0.0
                else:
                    dest[mask] = 0.001 * torch.randn_like(dest[mask])
    # Initialize parameters that only exist in the expanded model.
    for k, v in new_state.items():
        if k not in old_state:
            if v.dim() > 1 and "bias" not in k:
                # Weight matrices start as small random values.
                nn.init.normal_(v, mean=0.0, std=1e-3)
            else:
                # Biases and other 1-D parameters start at zero.
                v.zero_()
    new_model.load_state_dict(new_state)
    return new_model
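

# A minimal usage sketch, not part of the library API: the constructor keyword
# names below (d_model, nhead, num_layers, dim_feedforward) are assumptions for
# illustration only; substitute the actual BitTransformerLM parameters defined
# in .model when calling expand_model.
if __name__ == "__main__":
    base_params = dict(d_model=64, nhead=4, num_layers=2, dim_feedforward=256)  # assumed names
    small = BitTransformerLM(**base_params)
    # Grow the hidden size: shared weight regions are copied, new slices start near zero.
    wider = expand_model(small, {**base_params, "d_model": 128})
    print(sum(p.numel() for p in wider.parameters()), "parameters after expansion")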