| import torch | |
| from typing import Dict | |
| from .model import BitTransformerLM | |
| import torch.nn as nn | |
def expand_model(model: BitTransformerLM, new_params: Dict) -> BitTransformerLM:
    """Return a new model built from ``new_params`` seeded with ``model``'s weights.

    A fresh ``BitTransformerLM(**new_params)`` is created and its state dict
    is filled in two passes:

    1. Keys present in both models (with the same number of dimensions):
       the overlapping region ``[0:min(new, old)]`` along every dimension is
       copied from the old tensor. If the shapes differ, the entries outside
       that region are set to zero for keys containing ``"bias"`` and to
       Gaussian noise scaled by 1e-3 otherwise.
    2. Keys present only in the new model: keys containing ``"bias"`` and
       tensors with at most one dimension are zeroed; other tensors are
       drawn from a normal distribution with std 1e-3.

    Tensors that cannot hold Gaussian noise (integer/bool dtypes) keep the
    values produced by the constructor wherever noise would have been
    written, instead of raising.

    Args:
        model: Source model whose weights are reused.
        new_params: Keyword arguments forwarded to ``BitTransformerLM``.

    Returns:
        The newly constructed model with the merged state loaded.
    """
    new_model = BitTransformerLM(**new_params)
    new_state = new_model.state_dict()
    old_state = model.state_dict()

    # Pass 1: carry over everything the two models have in common.
    for name, old in old_state.items():
        if name not in new_state:
            continue
        dest = new_state[name]

        if dest.dim() != old.dim():
            # Different ranks cannot be aligned dimension by dimension; a
            # truncated index would either raise or silently broadcast old
            # values. Leave the constructor's initialisation untouched.
            continue

        overlap = tuple(
            slice(0, min(new_len, old_len))
            for new_len, old_len in zip(dest.shape, old.shape)
        )
        dest[overlap].copy_(old[overlap])

        if dest.shape == old.shape:
            continue

        # Everything outside the copied region is newly added capacity.
        added = torch.ones_like(dest, dtype=torch.bool)
        added[overlap] = False
        if "bias" in name:
            dest[added] = 0.0
        elif dest.is_floating_point() or dest.is_complex():
            dest[added] = 0.001 * torch.randn_like(dest[added])
        # Otherwise (integer/bool tensors): random normal values are not
        # defined for these dtypes, so the constructor's values are kept.

    # Pass 2: initialise tensors that did not exist in the old model.
    for name, tensor in new_state.items():
        if name in old_state:
            continue
        if "bias" in name or tensor.dim() <= 1:
            tensor.zero_()
        elif tensor.is_floating_point() or tensor.is_complex():
            nn.init.normal_(tensor, mean=0.0, std=1e-3)
        # Otherwise: multi-dimensional integer/bool tensors keep the
        # constructor's values for the same dtype reason as above.

    new_model.load_state_dict(new_state)
    return new_model