# Ref: https://github.com/MiniMax-AI/MiniMax-01
from typing import Optional

import torch
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    GenerationConfig,
    QuantoConfig,
)

from .base import AbstractLLMModel
from .registry import register_llm_model


class MiniMaxLLM(AbstractLLMModel):
    def __init__(
        self, model_id: str, device: str = "cuda", cache_dir: str = "cache", **kwargs
    ):
        try:
            if not torch.cuda.is_available():
                raise RuntimeError("MiniMax model only supports CUDA device")
            super().__init__(model_id, device, cache_dir, **kwargs)
            # load hf config; its layer count sizes the maps built below
            hf_config = AutoConfig.from_pretrained(
                "MiniMaxAI/MiniMax-Text-01",
                trust_remote_code=True,
                cache_dir=cache_dir,
            )
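            # NOTE: per the reference int8 recipe linked above, the embedding and
            # output projections, the per-layer `coefficient` modules, and the
            # MoE router gates are small but numerically sensitive, so they are
            # excluded from quantization and kept in bf16.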
            # quantization config, int8 is recommended
            quantization_config = QuantoConfig(
                weights="int8",
                modules_to_not_convert=[
                    "lm_head",
                    "embed_tokens",
                ]
                + [
                    f"model.layers.{i}.coefficient"
                    for i in range(hf_config.num_hidden_layers)
                ]
                + [
                    f"model.layers.{i}.block_sparse_moe.gate"
                    for i in range(hf_config.num_hidden_layers)
                ],
            )
            # shard the decoder layers evenly across all visible GPUs
            world_size = torch.cuda.device_count()
            layers_per_device = hf_config.num_hidden_layers // world_size
            # embeddings sit on the first GPU; final norm and lm_head on the last
            device_map = {
                "model.embed_tokens": "cuda:0",
                "model.norm": f"cuda:{world_size - 1}",
                "lm_head": f"cuda:{world_size - 1}",
            }
            for i in range(world_size):
                for j in range(layers_per_device):
                    device_map[f"model.layers.{i * layers_per_device + j}"] = f"cuda:{i}"
            # if the layer count is not divisible, map the leftovers to the last GPU
            for k in range(world_size * layers_per_device, hf_config.num_hidden_layers):
                device_map[f"model.layers.{k}"] = f"cuda:{world_size - 1}"
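            # e.g. with MiniMax-Text-01's 80 hidden layers on 8 GPUs, this places
            # layers 0-9 on cuda:0, 10-19 on cuda:1, ..., 70-79 on cuda:7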
            # load tokenizer
            self.tokenizer = AutoTokenizer.from_pretrained(
                "MiniMaxAI/MiniMax-Text-01", cache_dir=cache_dir
            )
            # load bfloat16 model, shard it per device_map, and apply quantization
            self.quantized_model = AutoModelForCausalLM.from_pretrained(
                "MiniMaxAI/MiniMax-Text-01",
                torch_dtype="bfloat16",
                device_map=device_map,
                quantization_config=quantization_config,
                trust_remote_code=True,
                offload_buffers=True,
                cache_dir=cache_dir,
            )
        except Exception as e:
            print(f"Failed to load MiniMax model: {e}")
            import traceback

            print(traceback.format_exc())
            # re-raise with the original traceback intact
            raise

    def generate(
        self,
        prompt: str,
        system_prompt: Optional[
            str
        ] = "You are a helpful assistant created by MiniMax based on MiniMax-Text-01 model.",
        max_new_tokens: int = 20,
        **kwargs,
    ) -> str:
        messages = []
        if system_prompt:
            messages.append(
                {
                    "role": "system",
                    "content": [{"type": "text", "text": system_prompt}],
                }
            )
        messages.append(
            {"role": "user", "content": [{"type": "text", "text": prompt}]}
        )
        text = self.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        # tokenize and move inputs to cuda:0, where embed_tokens lives
        model_inputs = self.tokenizer(text, return_tensors="pt").to("cuda:0")
        generation_config = GenerationConfig(
            max_new_tokens=max_new_tokens,
            # end-of-text token id used by the MiniMax-Text-01 reference code
            eos_token_id=200020,
            use_cache=True,
        )
        generated_ids = self.quantized_model.generate(
            **model_inputs, generation_config=generation_config
        )
        # drop the prompt tokens so only the newly generated text is decoded
        generated_ids = [
            output_ids[len(input_ids):]
            for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]
        response = self.tokenizer.batch_decode(
            generated_ids, skip_special_tokens=True
        )[0]
        return response
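

# Minimal smoke-test sketch. Assumptions: a multi-GPU CUDA host, the quanto
# backend installed, and invocation via `python -m <package>.<module>` so the
# relative imports above resolve; the prompt is purely illustrative.
if __name__ == "__main__":
    llm = MiniMaxLLM(model_id="MiniMaxAI/MiniMax-Text-01")
    print(
        llm.generate(
            "Give a one-sentence summary of linear attention.",
            max_new_tokens=64,
        )
    )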