DragonLineageAI
/

Vi-SparkTTS-0.5B

Text-to-Speech

Safetensors

spark-tts

custom_code

Model card · Files and versions · Community

ancv committed on Apr 5

Commit

97682b0

verified ·

1 Parent(s): 2143f77

Update modeling_spark_tts.py

Browse files

Files changed (1) hide show

modeling_spark_tts.py +64 -74

modeling_spark_tts.py CHANGED Viewed

@@ -3011,7 +3011,6 @@ class SparkTTSModel(PreTrainedModel, GenerationMixin):
         return outputs # Should be CausalLMOutputWithPast or tuple
     @classmethod
-    @torch.no_grad() # Decorator often used for loading, though internal ops might need grads later
     def from_pretrained(
         cls,
         pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
@@ -3021,46 +3020,39 @@ class SparkTTSModel(PreTrainedModel, GenerationMixin):
         ignore_mismatched_sizes: bool = False,
         force_download: bool = False,
         local_files_only: bool = False,
-        token: Optional[Union[str, bool]] = None,
         revision: str = "main",
         use_safetensors: Optional[bool] = None,
         # New args from base class signature to pass down if relevant
-        state_dict = None, # Pass state_dict explicitly is usually avoided with component loading
-        device_map = None, # Simplified handling
-        low_cpu_mem_usage = None, # Simplified handling
-        torch_dtype = "auto", # Keep "auto" as default
-        quantization_config = None, # Pass down if needed by components
-        trust_remote_code = None, # Default to None, will be set below
         # Add other relevant args from base class if needed: subfolder, variant, etc.
-        subfolder: str = "",
         variant: Optional[str] = None,
         **kwargs,
     ):
         # --- Argument Handling & Initial Setup ---
-        # Pop device map and dtype early - handle placement later
         if device_map:
             logger.warning("`device_map` is not directly supported for this composite model. Use .to(device) after loading.")
         if low_cpu_mem_usage:
              logger.info("`low_cpu_mem_usage` is set, but simplified loading is used. Memory usage might not be optimized.")
-        # Handle trust_remote_code explicitly for custom code loading
         if trust_remote_code is None:
-             logger.warning(
-                 "Loading SparkTTSModel requires custom code. Setting `trust_remote_code=True`. "
-                 "Make sure you trust the source of the code you are loading."
-             )
              trust_remote_code = True
         elif not trust_remote_code:
              raise ValueError("Loading SparkTTSModel requires `trust_remote_code=True`.")
-        # Pop unused kwargs specific to base class loading logic if not handled here
         kwargs.pop("output_loading_info", None)
         kwargs.pop("_from_auto", None)
-        kwargs.pop("attn_implementation", None) # LLM loader might handle this
         # --- 1. Resolve the main model directory ---
         if state_dict is not None:
-             raise ValueError("Explicitly passing `state_dict` is not supported for this composite model. Load components individually if needed.")
         if pretrained_model_name_or_path is None:
             raise ValueError("`pretrained_model_name_or_path` must be provided.")
@@ -3075,6 +3067,7 @@ class SparkTTSModel(PreTrainedModel, GenerationMixin):
             logger.info(f"{pretrained_model_name_or_path} is not a local directory. Assuming Hub ID and downloading.")
             try:
                 # Use snapshot_download to get all necessary files
                 resolved_model_path_str = snapshot_download(
                     repo_id=str(pretrained_model_name_or_path),
                     cache_dir=cache_dir,
@@ -3082,82 +3075,85 @@ class SparkTTSModel(PreTrainedModel, GenerationMixin):
                     local_files_only=local_files_only,
                     token=token,
                     revision=revision,
-                    allow_patterns=[ # Be more specific if possible
                         "*.json", "*.safetensors", "*.bin", "*.yaml", "*.txt",
-                        "README.md", ".gitattributes", # Common files
-                        "LLM/*", "BiCodec/*", "wav2vec2-large-xlsr-53/*" # Component folders
                         ],
-                    ignore_patterns=["*.git*", "*.h5", "*.ot", "*.msgpack"], # Ignore unnecessary files
-                    subfolder=subfolder, # Pass subfolder to snapshot_download
-                    repo_type="model", # Specify repo type
                 )
                 resolved_model_path = Path(resolved_model_path_str)
                 logger.info(f"Model files downloaded to cache: {resolved_model_path}")
             except Exception as e:
                 raise OSError(
-                    f"Failed to download model '{pretrained_model_name_or_path}' (subfolder: '{subfolder}') from Hugging Face Hub. "
                     f"Error: {e}"
                 )
         if not resolved_model_path.is_dir():
              raise EnvironmentError(f"Resolved model path is not a directory: {resolved_model_path}")
-        # If subfolder is used, update resolved_model_path to point inside it
         if subfolder:
-             resolved_model_path = resolved_model_path / subfolder
-             if not resolved_model_path.is_dir():
-                  raise EnvironmentError(f"Subfolder '{subfolder}' not found within the resolved path: {resolved_model_path.parent}")
         # --- 2. Load the main configuration ---
         if not isinstance(config, PretrainedConfig):
             config_path = config if config is not None else resolved_model_path
             try:
                 loaded_config, model_kwargs = SparkTTSConfig.from_pretrained(
-                    config_path,
-                    *model_args, # Pass model_args here
                     cache_dir=cache_dir,
                     force_download=force_download if not is_local else False,
                     local_files_only=local_files_only or is_local,
                     token=token,
-                    revision=revision,
-                    trust_remote_code=trust_remote_code, # Crucial if config class is remote
-                    #subfolder="", # Config is usually at the root, not subfolder
                     return_unused_kwargs=True,
-                    **kwargs, # Pass remaining kwargs for config loading
                 )
                 config = loaded_config
-                kwargs = model_kwargs # Update kwargs with unused ones
             except OSError as e:
-                 raise OSError(f"Cannot load config for '{pretrained_model_name_or_path}'. Check `config.json` exists and is correctly formatted. Error: {e}")
-        # else: config object was passed directly
         # --- Determine final torch_dtype ---
-        final_torch_dtype = torch_dtype # Explicit arg has highest prio
         if final_torch_dtype == "auto":
-            final_torch_dtype = getattr(config, "torch_dtype", None) # Use config value if present
-        # Convert string to torch.dtype object if needed
         if isinstance(final_torch_dtype, str) and final_torch_dtype != "auto":
             try:
                 final_torch_dtype = getattr(torch, final_torch_dtype)
             except AttributeError:
                 logger.warning(f"Invalid torch_dtype string: {final_torch_dtype}. Falling back to default.")
-                final_torch_dtype = None # Fallback to None (which means float32 usually)
         elif final_torch_dtype == "auto":
-             final_torch_dtype = None # Treat "auto" as None for component loading
-        # --- Helper function to resolve paths relative to the main model directory ---
-        # (This handles components potentially being in subfolders specified in config)
         def _resolve_sub_path(sub_path_str):
             p = Path(sub_path_str)
             if p.is_absolute():
                 if not p.exists(): logger.warning(f"Absolute path specified for sub-component does not exist: {p}")
                 return str(p)
             else:
-                # Resolve relative to the main model path (which might be in cache or local)
                 resolved = resolved_model_path / p
                 if not resolved.exists():
-                     # Check if the path exists without the leading './' often found in configs
                      resolved_alt = resolved_model_path / sub_path_str.lstrip('./')
                      if resolved_alt.exists():
                           resolved = resolved_alt
@@ -3171,26 +3167,24 @@ class SparkTTSModel(PreTrainedModel, GenerationMixin):
             "force_download": force_download,
             "local_files_only": local_files_only,
             "token": token,
-            "revision": revision,
-            "trust_remote_code": trust_remote_code, # Pass this down
-            "torch_dtype": final_torch_dtype, # Pass resolved dtype
             "use_safetensors": use_safetensors,
-             # Pass quantization config if provided and relevant to component
             "quantization_config": quantization_config if quantization_config else None,
-            # Pass variant if needed for specific component checkpoints
             "variant": variant,
-            # Filter kwargs? For now, pass all remaining, component loaders should ignore unused ones.
-            **kwargs,
         }
         # --- 3. Load Sub-components ---
         # --- Load LLM ---
         llm_path = _resolve_sub_path(config.llm_model_name_or_path)
         logger.info(f"Loading LLM from resolved path: {llm_path}")
         try:
             llm = AutoModelForCausalLM.from_pretrained(
-                llm_path, **component_loading_kwargs
             )
         except Exception as e:
             raise OSError(f"Failed to load LLM from {llm_path}: {e}")
@@ -3199,47 +3193,46 @@ class SparkTTSModel(PreTrainedModel, GenerationMixin):
         w2v_path = _resolve_sub_path(config.wav2vec2_model_name_or_path)
         logger.info(f"Loading Wav2Vec2 components from resolved path: {w2v_path}")
         try:
-            # Use specific class for extractor, Auto* might not work if only config is present
             wav2vec2_processor = Wav2Vec2FeatureExtractor.from_pretrained(
                 w2v_path,
-                cache_dir=cache_dir, # Pass relevant args
                 force_download=force_download,
                 local_files_only=local_files_only,
                 token=token,
                 revision=revision,
-                # No trust_remote_code needed usually for feature extractors
             )
             wav2vec2_model = Wav2Vec2Model.from_pretrained(
-                w2v_path, **component_loading_kwargs # Pass full kwargs here
             )
-            wav2vec2_model.config.output_hidden_states = True # Ensure this is set
         except Exception as e:
             raise OSError(f"Failed to load Wav2Vec2 components from {w2v_path}: {e}")
         # --- Load BiCodec ---
         bicodec_path = _resolve_sub_path(config.bicodec_model_name_or_path)
         logger.info(f"Loading BiCodec from resolved path: {bicodec_path}")
-        if not config.bicodec_config: # Check if the nested config object exists
-            raise ValueError("BiCodec configuration (`bicodec_config`) not found or properly instantiated in SparkTTSConfig.")
         try:
-            # Pass the SparkTTSBiCodecConfig *object* directly
             bicodec = BiCodec.load_from_config_and_checkpoint(
                 model_dir=Path(bicodec_path),
-                bicodec_config_object=config.bicodec_config # Pass the object
             )
             if not isinstance(bicodec, torch.nn.Module):
                  logger.warning("Loaded BiCodec component is not an instance of torch.nn.Module.")
-            # Apply torch_dtype to BiCodec if it's an nn.Module and dtype is set
             if isinstance(bicodec, torch.nn.Module) and final_torch_dtype:
                  bicodec = bicodec.to(dtype=final_torch_dtype)
         except FileNotFoundError as e:
-             raise OSError(f"Failed to load BiCodec: A required file was not found in {bicodec_path}. Original error: {e}")
         except Exception as e:
              logger.error(f"Raw error loading BiCodec: {type(e).__name__}: {e}")
              import traceback
              traceback.print_exc()
-             raise OSError(f"Failed to load BiCodec from {bicodec_path}. Check BiCodec implementation, config, and file paths. Error: {e}")
         # --- 4. Instantiate the main model wrapper ---
         model = cls(
@@ -3251,20 +3244,17 @@ class SparkTTSModel(PreTrainedModel, GenerationMixin):
         )
         # --- 5. Handle device placement (Simplified) ---
-        # Determine target device (simple logic: CUDA > MPS > CPU)
         if torch.cuda.is_available():
              final_device = torch.device("cuda")
-        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): # Check MPS availability
              final_device = torch.device("mps")
         else:
              final_device = torch.device("cpu")
         logger.info(f"Placing SparkTTSModel and components on device: {final_device}")
         try:
              model.to(final_device)
         except Exception as e:
              logger.error(f"Failed to move model to device {final_device}. Error: {e}")
-             logger.warning("Device placement might be incomplete. Check component types and implementations.")
         # --- 6. Return the loaded and prepared model ---
         return model

         return outputs # Should be CausalLMOutputWithPast or tuple
     @classmethod
     def from_pretrained(
         cls,
         pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
         ignore_mismatched_sizes: bool = False,
         force_download: bool = False,
         local_files_only: bool = False,
+        token: Optional[Union[bool, str]] = None,
         revision: str = "main",
         use_safetensors: Optional[bool] = None,
         # New args from base class signature to pass down if relevant
+        state_dict = None,
+        device_map = None,
+        low_cpu_mem_usage = None,
+        torch_dtype = "auto",
+        quantization_config = None,
+        trust_remote_code = None,
         # Add other relevant args from base class if needed: subfolder, variant, etc.
+        subfolder: str = "", # Keep subfolder arg for overall loading logic
         variant: Optional[str] = None,
         **kwargs,
     ):
         # --- Argument Handling & Initial Setup ---
         if device_map:
             logger.warning("`device_map` is not directly supported for this composite model. Use .to(device) after loading.")
         if low_cpu_mem_usage:
              logger.info("`low_cpu_mem_usage` is set, but simplified loading is used. Memory usage might not be optimized.")
         if trust_remote_code is None:
+             logger.warning("Loading SparkTTSModel requires custom code. Setting `trust_remote_code=True`.")
              trust_remote_code = True
         elif not trust_remote_code:
              raise ValueError("Loading SparkTTSModel requires `trust_remote_code=True`.")
         kwargs.pop("output_loading_info", None)
         kwargs.pop("_from_auto", None)
+        kwargs.pop("attn_implementation", None)
         # --- 1. Resolve the main model directory ---
         if state_dict is not None:
+             raise ValueError("Explicitly passing `state_dict` is not supported for this composite model.")
         if pretrained_model_name_or_path is None:
             raise ValueError("`pretrained_model_name_or_path` must be provided.")
             logger.info(f"{pretrained_model_name_or_path} is not a local directory. Assuming Hub ID and downloading.")
             try:
                 # Use snapshot_download to get all necessary files
+                # REMOVED subfolder=subfolder from this call
                 resolved_model_path_str = snapshot_download(
                     repo_id=str(pretrained_model_name_or_path),
                     cache_dir=cache_dir,
                     local_files_only=local_files_only,
                     token=token,
                     revision=revision,
+                    allow_patterns=[
                         "*.json", "*.safetensors", "*.bin", "*.yaml", "*.txt",
+                        "README.md", ".gitattributes",
+                        "LLM/*", "BiCodec/*", "wav2vec2-large-xlsr-53/*"
                         ],
+                    ignore_patterns=["*.git*", "*.h5", "*.ot", "*.msgpack"],
+                    repo_type="model", # Explicitly set repo_type
+                    # max_workers=..., # Can adjust workers if needed
+                    # user_agent=..., # Can add user agent
                 )
                 resolved_model_path = Path(resolved_model_path_str)
                 logger.info(f"Model files downloaded to cache: {resolved_model_path}")
             except Exception as e:
+                # Catch potential TypeErrors from snapshot_download if args change again
+                if isinstance(e, TypeError) and 'unexpected keyword argument' in str(e):
+                     logger.error(f"snapshot_download() received an unexpected keyword argument. Check huggingface_hub version compatibility. Error: {e}")
                 raise OSError(
+                    f"Failed to download model '{pretrained_model_name_or_path}' (revision: '{revision}') from Hugging Face Hub. "
                     f"Error: {e}"
                 )
         if not resolved_model_path.is_dir():
              raise EnvironmentError(f"Resolved model path is not a directory: {resolved_model_path}")
+        # If subfolder was specified for from_pretrained, adjust the path *after* download
         if subfolder:
+             resolved_model_path_with_subfolder = resolved_model_path / subfolder
+             if not resolved_model_path_with_subfolder.is_dir():
+                  raise EnvironmentError(f"Subfolder '{subfolder}' not found within the resolved path: {resolved_model_path}")
+             resolved_model_path = resolved_model_path_with_subfolder # Update path to include subfolder
+             logger.info(f"Using subfolder within resolved path: {resolved_model_path}")
         # --- 2. Load the main configuration ---
         if not isinstance(config, PretrainedConfig):
+            # Load config from the potentially subfolder-adjusted path
             config_path = config if config is not None else resolved_model_path
             try:
                 loaded_config, model_kwargs = SparkTTSConfig.from_pretrained(
+                    config_path, # Load from the final resolved path
+                    *model_args,
                     cache_dir=cache_dir,
                     force_download=force_download if not is_local else False,
                     local_files_only=local_files_only or is_local,
                     token=token,
+                    revision=revision, # Pass revision for config loading too
+                    trust_remote_code=trust_remote_code,
+                    subfolder="", # Config is expected at the root of resolved_model_path
                     return_unused_kwargs=True,
+                    **kwargs,
                 )
                 config = loaded_config
+                kwargs = model_kwargs
             except OSError as e:
+                 raise OSError(f"Cannot load config from {config_path}. Check `config.json` exists and is correctly formatted. Error: {e}")
         # --- Determine final torch_dtype ---
+        final_torch_dtype = torch_dtype
         if final_torch_dtype == "auto":
+            final_torch_dtype = getattr(config, "torch_dtype", None)
         if isinstance(final_torch_dtype, str) and final_torch_dtype != "auto":
             try:
                 final_torch_dtype = getattr(torch, final_torch_dtype)
             except AttributeError:
                 logger.warning(f"Invalid torch_dtype string: {final_torch_dtype}. Falling back to default.")
+                final_torch_dtype = None
         elif final_torch_dtype == "auto":
+             final_torch_dtype = None
+        # --- Helper function to resolve component paths relative to the final resolved_model_path ---
         def _resolve_sub_path(sub_path_str):
             p = Path(sub_path_str)
             if p.is_absolute():
                 if not p.exists(): logger.warning(f"Absolute path specified for sub-component does not exist: {p}")
                 return str(p)
             else:
+                # Resolve relative to the potentially subfolder-adjusted main model path
                 resolved = resolved_model_path / p
                 if not resolved.exists():
                      resolved_alt = resolved_model_path / sub_path_str.lstrip('./')
                      if resolved_alt.exists():
                           resolved = resolved_alt
             "force_download": force_download,
             "local_files_only": local_files_only,
             "token": token,
+            "revision": revision, # Pass revision to component loaders
+            "trust_remote_code": trust_remote_code,
+            "torch_dtype": final_torch_dtype,
             "use_safetensors": use_safetensors,
             "quantization_config": quantization_config if quantization_config else None,
             "variant": variant,
+            **kwargs, # Pass remaining kwargs
         }
         # --- 3. Load Sub-components ---
+        # (LLM, Wav2Vec2, BiCodec loading logic remains the same as previous version)
         # --- Load LLM ---
         llm_path = _resolve_sub_path(config.llm_model_name_or_path)
         logger.info(f"Loading LLM from resolved path: {llm_path}")
         try:
+            # Pass subfolder="" because llm_path is now absolute or correctly relative
             llm = AutoModelForCausalLM.from_pretrained(
+                llm_path, subfolder="", **component_loading_kwargs
             )
         except Exception as e:
             raise OSError(f"Failed to load LLM from {llm_path}: {e}")
         w2v_path = _resolve_sub_path(config.wav2vec2_model_name_or_path)
         logger.info(f"Loading Wav2Vec2 components from resolved path: {w2v_path}")
         try:
+            # Load extractor without full component_loading_kwargs if they cause issues
             wav2vec2_processor = Wav2Vec2FeatureExtractor.from_pretrained(
                 w2v_path,
+                cache_dir=cache_dir,
                 force_download=force_download,
                 local_files_only=local_files_only,
                 token=token,
                 revision=revision,
+                subfolder="", # Path is resolved
             )
+            # Load model with full kwargs
             wav2vec2_model = Wav2Vec2Model.from_pretrained(
+                w2v_path, subfolder="", **component_loading_kwargs
             )
+            wav2vec2_model.config.output_hidden_states = True
         except Exception as e:
             raise OSError(f"Failed to load Wav2Vec2 components from {w2v_path}: {e}")
         # --- Load BiCodec ---
         bicodec_path = _resolve_sub_path(config.bicodec_model_name_or_path)
         logger.info(f"Loading BiCodec from resolved path: {bicodec_path}")
+        if not config.bicodec_config:
+            raise ValueError("BiCodec configuration (`bicodec_config`) not found in SparkTTSConfig.")
         try:
             bicodec = BiCodec.load_from_config_and_checkpoint(
                 model_dir=Path(bicodec_path),
+                bicodec_config_object=config.bicodec_config
             )
             if not isinstance(bicodec, torch.nn.Module):
                  logger.warning("Loaded BiCodec component is not an instance of torch.nn.Module.")
             if isinstance(bicodec, torch.nn.Module) and final_torch_dtype:
                  bicodec = bicodec.to(dtype=final_torch_dtype)
         except FileNotFoundError as e:
+             raise OSError(f"Failed to load BiCodec: Required file not found in {bicodec_path}. Error: {e}")
         except Exception as e:
              logger.error(f"Raw error loading BiCodec: {type(e).__name__}: {e}")
              import traceback
              traceback.print_exc()
+             raise OSError(f"Failed to load BiCodec from {bicodec_path}. Error: {e}")
         # --- 4. Instantiate the main model wrapper ---
         model = cls(
         )
         # --- 5. Handle device placement (Simplified) ---
         if torch.cuda.is_available():
              final_device = torch.device("cuda")
+        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
              final_device = torch.device("mps")
         else:
              final_device = torch.device("cpu")
         logger.info(f"Placing SparkTTSModel and components on device: {final_device}")
         try:
              model.to(final_device)
         except Exception as e:
              logger.error(f"Failed to move model to device {final_device}. Error: {e}")
         # --- 6. Return the loaded and prepared model ---
         return model