RuntimeError: NVML_SUCCESS == r INTERNAL ASSERT FAILED at "../c10/cuda/CUDACachingAllocator.cpp":995, please report a bug to PyTorch.

#3 · opened by CKK0331

Hi, may I ask how to solve this error? It appears while loading unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit in 4-bit:
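For reference, the failing cell boils down to the call below (a minimal sketch reconstructed from the traceback; the import line is assumed, since only the from_pretrained call is visible in the trace):

```python
from unsloth import FastVisionModel  # assumed import; only the call below appears in the traceback

# 4-bit pre-quantized vision model from the unsloth collection
model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit",
    load_in_4bit=True,                      # use 4-bit quantization
    use_gradient_checkpointing="unsloth",
    device_map="cuda",                      # put the layers on the GPU
)
```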


RuntimeError Traceback (most recent call last)
Cell In[1], line 23
5 # 4bit pre quantized models we support for 4x faster downloading + no OOMs.
6 fourbit_models = [
7 "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit", # Llama 3.2 vision support
8 "unsloth/Llama-3.2-11B-Vision-bnb-4bit",
(...)
20 "unsloth/llava-1.5-7b-hf-bnb-4bit",
21 ] # More models at https://huggingface.co/unsloth
---> 23 model, tokenizer = FastVisionModel.from_pretrained(
24 "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit",
25 load_in_4bit=True, # Use 4-bit quantization
26 use_gradient_checkpointing="unsloth",
27 device_map="cuda", # Automatically assign layers to GPU/CPU
28 )

File /opt/conda/lib/python3.11/site-packages/unsloth/models/loader.py:654, in FastModel.from_pretrained(model_name, max_seq_length, dtype, load_in_4bit, load_in_8bit, full_finetuning, token, device_map, rope_scaling, fix_tokenizer, trust_remote_code, use_gradient_checkpointing, resize_model_vocab, revision, return_logits, fullgraph, use_exact_model_name, *args, **kwargs)
651 is_vlm = is_vlm or hasattr(model_config, "vision_config")
652 auto_model = AutoModelForVision2Seq if is_vlm else AutoModelForCausalLM
--> 654 model, tokenizer = FastBaseModel.from_pretrained(
655 model_name = model_name,
656 max_seq_length = max_seq_length,
657 dtype = _get_dtype(dtype),
658 load_in_4bit = load_in_4bit,
659 load_in_8bit = load_in_8bit,
660 full_finetuning = full_finetuning,
661 token = token,
662 device_map = device_map,
663 trust_remote_code = trust_remote_code,
664 revision = revision if not is_peft else None,
665 model_types = model_types,
666 tokenizer_name = tokenizer_name,
667 auto_model = auto_model,
668 use_gradient_checkpointing = use_gradient_checkpointing,
669 *args, **kwargs,
670 )
672 if resize_model_vocab is not None:
673 model.resize_token_embeddings(resize_model_vocab)

File /opt/conda/lib/python3.11/site-packages/unsloth/models/vision.py:245, in FastBaseModel.from_pretrained(model_name, max_seq_length, dtype, load_in_4bit, load_in_8bit, full_finetuning, token, device_map, trust_remote_code, model_types, tokenizer_name, auto_model, use_gradient_checkpointing, **kwargs)
242 # Cannot be None, since HF now checks for the config
243 if load_in_4bit: kwargs["quantization_config"] = bnb_config
--> 245 model = auto_model.from_pretrained(
246 model_name,
247 device_map = device_map,
248 torch_dtype = dtype,
249 # quantization_config = bnb_config,
250 token = token,
251 trust_remote_code = trust_remote_code,
252 attn_implementation = "sdpa", #[TODO] Pixtral for eg fails
253 **kwargs,
254 )
255 # Return old flag
256 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = old_hf_transfer

File /opt/conda/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py:564, in _BaseAutoModelClass.from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
562 elif type(config) in cls._model_mapping.keys():
563 model_class = _get_model_class(config, cls._model_mapping)
--> 564 return model_class.from_pretrained(
565 pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs
566 )
567 raise ValueError(
568 f"Unrecognized configuration class {config.class} for this kind of AutoModel: {cls.name}.\n"
569 f"Model type should be one of {', '.join(c.name for c in cls._model_mapping.keys())}."
570 )

File /opt/conda/lib/python3.11/site-packages/transformers/modeling_utils.py:262, in restore_default_torch_dtype.<locals>._wrapper(*args, **kwargs)
260 old_dtype = torch.get_default_dtype()
261 try:
--> 262 return func(*args, **kwargs)
263 finally:
264 torch.set_default_dtype(old_dtype)

File /opt/conda/lib/python3.11/site-packages/transformers/modeling_utils.py:4185, in PreTrainedModel.from_pretrained(cls, pretrained_model_name_or_path, config, cache_dir, ignore_mismatched_sizes, force_download, local_files_only, token, revision, use_safetensors, weights_only, *model_args, **kwargs)
4179 config = cls._autoset_attn_implementation(
4180 config, use_flash_attention_2=use_flash_attention_2, torch_dtype=torch_dtype, device_map=device_map
4181 )
4183 with ContextManagers(init_contexts):
4184 # Let's make sure we don't run the init function of buffer modules
-> 4185 model = cls(config, *model_args, **model_kwargs)
4187 # make sure we use the model's config since the init call might have copied it
4188 config = model.config

File ~/unsloth_compiled_cache/unsloth_compiled_module_mllama.py:1124, in MllamaForConditionalGeneration.__init__(self, config)
1121 self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
1123 self.vision_model = MllamaVisionModel._from_config(config.vision_config)
-> 1124 self.language_model = MllamaForCausalLM._from_config(config.text_config)
1125 if self.language_model._tied_weights_keys is not None:
1126 self._tied_weights_keys = [f"language_model.{k}" for k in self.language_model._tied_weights_keys]

File /opt/conda/lib/python3.11/site-packages/transformers/modeling_utils.py:262, in restore_default_torch_dtype.<locals>._wrapper(*args, **kwargs)
260 old_dtype = torch.get_default_dtype()
261 try:
--> 262 return func(*args, **kwargs)
263 finally:
264 torch.set_default_dtype(old_dtype)

File /opt/conda/lib/python3.11/site-packages/transformers/modeling_utils.py:1501, in PreTrainedModel._from_config(cls, config, **kwargs)
1498 model = cls(config, **kwargs)
1500 else:
-> 1501 model = cls(config, **kwargs)
1503 # restore default dtype if it was modified
1504 if dtype_orig is not None:

File ~/unsloth_compiled_cache/unsloth_compiled_module_mllama.py:1062, in MllamaForCausalLM.__init__(self, config)
1060 self.text_config = config.get_text_config()
1061 self.vocab_size = self.text_config.vocab_size
-> 1062 self.model = MllamaTextModel._from_config(self.text_config)
1063 self.lm_head = nn.Linear(self.text_config.hidden_size, self.vocab_size, bias=False)
1065 self.post_init()

File /opt/conda/lib/python3.11/site-packages/transformers/modeling_utils.py:262, in restore_default_torch_dtype.<locals>._wrapper(*args, **kwargs)
260 old_dtype = torch.get_default_dtype()
261 try:
--> 262 return func(*args, **kwargs)
263 finally:
264 torch.set_default_dtype(old_dtype)

File /opt/conda/lib/python3.11/site-packages/transformers/modeling_utils.py:1501, in PreTrainedModel._from_config(cls, config, **kwargs)
1498 model = cls(config, **kwargs)
1500 else:
-> 1501 model = cls(config, **kwargs)
1503 # restore default dtype if it was modified
1504 if dtype_orig is not None:

File /opt/conda/lib/python3.11/site-packages/transformers/models/mllama/modeling_mllama.py:1667, in MllamaTextModel.__init__(self, config)
1665 self.padding_idx = config.pad_token_id
1666 self.vocab_size = config.vocab_size
-> 1667 self.embed_tokens = nn.Embedding(config.vocab_size + 8, config.hidden_size, self.padding_idx)
1668 self.cross_attention_layers = config.cross_attention_layers
1670 layers = []

File /opt/conda/lib/python3.11/site-packages/torch/nn/modules/sparse.py:167, in Embedding.__init__(self, num_embeddings, embedding_dim, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse, _weight, _freeze, device, dtype)
164 self.scale_grad_by_freq = scale_grad_by_freq
165 if _weight is None:
166 self.weight = Parameter(
--> 167 torch.empty((num_embeddings, embedding_dim), **factory_kwargs),
168 requires_grad=not _freeze,
169 )
170 self.reset_parameters()
171 else:

File /opt/conda/lib/python3.11/site-packages/torch/utils/_device.py:106, in DeviceContext.__torch_function__(self, func, types, args, kwargs)
104 if func in _device_constructors() and kwargs.get('device') is None:
105 kwargs['device'] = self.device
--> 106 return func(*args, **kwargs)

RuntimeError: NVML_SUCCESS == r INTERNAL ASSERT FAILED at "../c10/cuda/CUDACachingAllocator.cpp":995, please report a bug to PyTorch.
(Screenshot attached: Screenshot 2025-03-14 154236.png)
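The assert fires inside torch.empty while the Mllama text model builds its embedding table, so the failure comes from the CUDA caching allocator's NVML query rather than from unsloth or transformers themselves. A quick way to check whether PyTorch can see and allocate on the GPU at all (a diagnostic sketch, independent of the notebook above) would be:

```python
import torch

# Version and visibility checks; if these fail or hang, the problem is in the
# NVML/driver layer (e.g. a driver/runtime mismatch), not in the model-loading code.
print(torch.__version__, torch.version.cuda)
print("cuda available:", torch.cuda.is_available())
print("device:", torch.cuda.get_device_name(0))

# Tiny allocation through the same caching allocator that raises the assert above.
x = torch.empty((1024, 1024), device="cuda")
print("allocated:", x.shape, x.device)
```

If these checks themselves fail, restarting the kernel or the machine (especially after a driver update) is worth trying before re-running the loading cell.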
