LandyGuo committed on
Commit
81a8221
·
1 Parent(s): 9a10e16

update 20250516 version

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -1
  2. .gitignore +3 -0
  3. audio_detokenizer/__pycache__/__init__.cpython-38.pyc +0 -0
  4. audio_detokenizer/cli/__pycache__/__init__.cpython-38.pyc +0 -0
  5. audio_detokenizer/cli/__pycache__/model.cpython-38.pyc +0 -0
  6. audio_detokenizer/flow/__pycache__/__init__.cpython-38.pyc +0 -0
  7. audio_detokenizer/flow/__pycache__/decoder.cpython-38.pyc +0 -0
  8. audio_detokenizer/flow/__pycache__/flow.cpython-38.pyc +0 -0
  9. audio_detokenizer/flow/__pycache__/flow_matching.cpython-38.pyc +0 -0
  10. audio_detokenizer/flow/__pycache__/length_regulator.cpython-38.pyc +0 -0
  11. audio_detokenizer/hifigan/__pycache__/__init__.cpython-38.pyc +0 -0
  12. audio_detokenizer/hifigan/__pycache__/f0_predictor.cpython-38.pyc +0 -0
  13. audio_detokenizer/hifigan/__pycache__/generator.cpython-38.pyc +0 -0
  14. audio_detokenizer/transformer/__pycache__/__init__.cpython-38.pyc +0 -0
  15. audio_detokenizer/transformer/__pycache__/activation.cpython-38.pyc +0 -0
  16. audio_detokenizer/transformer/__pycache__/attention.cpython-38.pyc +0 -0
  17. audio_detokenizer/transformer/__pycache__/convolution.cpython-38.pyc +0 -0
  18. audio_detokenizer/transformer/__pycache__/embedding.cpython-38.pyc +0 -0
  19. audio_detokenizer/transformer/__pycache__/encoder.cpython-38.pyc +0 -0
  20. audio_detokenizer/transformer/__pycache__/encoder_layer.cpython-38.pyc +0 -0
  21. audio_detokenizer/transformer/__pycache__/positionwise_feed_forward.cpython-38.pyc +0 -0
  22. audio_detokenizer/transformer/__pycache__/subsampling.cpython-38.pyc +0 -0
  23. audio_detokenizer/utils/__pycache__/__init__.cpython-38.pyc +0 -0
  24. audio_detokenizer/utils/__pycache__/class_utils.cpython-38.pyc +0 -0
  25. audio_detokenizer/utils/__pycache__/common.cpython-38.pyc +0 -0
  26. audio_detokenizer/utils/__pycache__/mask.cpython-38.pyc +0 -0
  27. audio_processing_bailingmm.py +41 -10
  28. bailingmm_utils.py +20 -7
  29. config.json +12 -5
  30. configuration_bailing_moe.py +3 -1
  31. configuration_bailingmm.py +4 -0
  32. configuration_whisper_encoder.py +37 -0
  33. connector/LICENSE +54 -0
  34. connector/README.md +111 -0
  35. connector/config.json +29 -0
  36. connector/generation_config.json +14 -0
  37. connector/gitattributes +35 -0
  38. connector/merges.txt +0 -0
  39. model-00001-of-00015.safetensors → connector/model-00001-of-00003.safetensors +2 -2
  40. connector/model-00002-of-00003.safetensors +3 -0
  41. connector/model-00003-of-00003.safetensors +3 -0
  42. connector/model.safetensors.index.json +441 -0
  43. connector/tokenizer.json +0 -0
  44. connector/tokenizer_config.json +207 -0
  45. connector/vocab.json +0 -0
  46. data/openai_whisper-20240930-py3-none-any.whl +3 -0
  47. data/wavs/speechQA_sample.wav +0 -0
  48. diffusion/__init__.py +0 -0
  49. diffusion/pipeline_sana.py +1011 -0
  50. diffusion/sana_loss.py +303 -0
.gitattributes CHANGED
@@ -37,6 +37,6 @@ data/matcha_tts-0.0.5.1-cp38-cp38-linux_x86_64.whl filter=lfs diff=lfs merge=lfs
37
  data/wavs/BAC009S0915W0292.wav filter=lfs diff=lfs merge=lfs -text
38
  out.wav filter=lfs diff=lfs merge=lfs -text
39
  talker/tokenizer.json filter=lfs diff=lfs merge=lfs -text
40
-
41
 
42
 
 
37
  data/wavs/BAC009S0915W0292.wav filter=lfs diff=lfs merge=lfs -text
38
  out.wav filter=lfs diff=lfs merge=lfs -text
39
  talker/tokenizer.json filter=lfs diff=lfs merge=lfs -text
40
+ data/openai_whisper-20240930-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
41
 
42
 
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ __pycache__/
2
+ *.DS_Store
3
+
audio_detokenizer/__pycache__/__init__.cpython-38.pyc DELETED
Binary file (178 Bytes)
 
audio_detokenizer/cli/__pycache__/__init__.cpython-38.pyc DELETED
Binary file (182 Bytes)
 
audio_detokenizer/cli/__pycache__/model.cpython-38.pyc DELETED
Binary file (2.02 kB)
 
audio_detokenizer/flow/__pycache__/__init__.cpython-38.pyc DELETED
Binary file (183 Bytes)
 
audio_detokenizer/flow/__pycache__/decoder.cpython-38.pyc DELETED
Binary file (5.28 kB)
 
audio_detokenizer/flow/__pycache__/flow.cpython-38.pyc DELETED
Binary file (4.15 kB)
 
audio_detokenizer/flow/__pycache__/flow_matching.cpython-38.pyc DELETED
Binary file (6.1 kB)
 
audio_detokenizer/flow/__pycache__/length_regulator.cpython-38.pyc DELETED
Binary file (1.48 kB)
 
audio_detokenizer/hifigan/__pycache__/__init__.cpython-38.pyc DELETED
Binary file (186 Bytes)
 
audio_detokenizer/hifigan/__pycache__/f0_predictor.cpython-38.pyc DELETED
Binary file (1.37 kB)
 
audio_detokenizer/hifigan/__pycache__/generator.cpython-38.pyc DELETED
Binary file (11.3 kB)
 
audio_detokenizer/transformer/__pycache__/__init__.cpython-38.pyc DELETED
Binary file (190 Bytes)
 
audio_detokenizer/transformer/__pycache__/activation.cpython-38.pyc DELETED
Binary file (2.51 kB)
 
audio_detokenizer/transformer/__pycache__/attention.cpython-38.pyc DELETED
Binary file (10.9 kB)
 
audio_detokenizer/transformer/__pycache__/convolution.cpython-38.pyc DELETED
Binary file (3.1 kB)
 
audio_detokenizer/transformer/__pycache__/embedding.cpython-38.pyc DELETED
Binary file (9.78 kB)
 
audio_detokenizer/transformer/__pycache__/encoder.cpython-38.pyc DELETED
Binary file (19.6 kB)
 
audio_detokenizer/transformer/__pycache__/encoder_layer.cpython-38.pyc DELETED
Binary file (8.64 kB)
 
audio_detokenizer/transformer/__pycache__/positionwise_feed_forward.cpython-38.pyc DELETED
Binary file (3.79 kB)
 
audio_detokenizer/transformer/__pycache__/subsampling.cpython-38.pyc DELETED
Binary file (10.6 kB)
 
audio_detokenizer/utils/__pycache__/__init__.cpython-38.pyc DELETED
Binary file (184 Bytes)
 
audio_detokenizer/utils/__pycache__/class_utils.cpython-38.pyc DELETED
Binary file (1.45 kB)
 
audio_detokenizer/utils/__pycache__/common.cpython-38.pyc DELETED
Binary file (4.17 kB)
 
audio_detokenizer/utils/__pycache__/mask.cpython-38.pyc DELETED
Binary file (5.12 kB)
 
audio_processing_bailingmm.py CHANGED
@@ -7,11 +7,11 @@ import torch
7
  import torch.utils.data
8
  import torchaudio
9
  import torchaudio.compliance.kaldi as kaldi
 
10
  from torch.nn.utils.rnn import pad_sequence
11
 
12
  from transformers.utils import TensorType
13
- from transformers.feature_extraction_utils import FeatureExtractionMixin
14
- from image_processing_bailingmm import BatchFeature
15
 
16
  NORM_FACTOR_FOR_DTYPE = {
17
  torch.int8: 2**7,
@@ -49,10 +49,13 @@ DEFAULT_TTS_TOKEN = '<tts>'
49
 
50
 
51
  class BailingMMAudioProcessor(FeatureExtractionMixin):
52
- def __init__(self, wav_frontend_args: Dict[str, Any], **kwargs):
53
  super().__init__(**kwargs)
54
  self.sample_rate = 16000
55
- self.wav_frontend = WavFrontend(**wav_frontend_args)
 
 
 
56
 
57
  def to_dict(self) -> Dict[str, Any]:
58
  output = copy.deepcopy(self.__dict__)
@@ -60,6 +63,8 @@ class BailingMMAudioProcessor(FeatureExtractionMixin):
60
  output["wav_frontend"]["cmvn"] = output["wav_frontend"]["cmvn"].tolist()
61
  output["wav_frontend"]["_non_persistent_buffers_set"] = list(output["wav_frontend"]["_non_persistent_buffers_set"])
62
  output["audio_processor_type"] = self.__class__.__name__
 
 
63
  return output
64
 
65
  @classmethod
@@ -85,13 +90,22 @@ class BailingMMAudioProcessor(FeatureExtractionMixin):
85
  """Preprocess an audio or a batch of audios."""
86
  return self.preprocess(audios, **kwargs)
87
 
88
- def _preprocess_audio(self, waveform: torch.Tensor, sample_rate: int) -> torch.Tensor:
89
  waveform = normalize_audio_tensor(waveform, sample_rate, target_sample_rate=self.sample_rate)
90
- audio_feat = self.wav_frontend(waveform.unsqueeze(0), [len(waveform)])[0].squeeze(0)
 
 
 
91
  return audio_feat
92
 
93
- def _make_batched_audios(self, audio_feat_list: List[torch.Tensor]) -> Dict[str, Any]:
94
  audio_feats_lengths = torch.tensor([[audio_feat.shape[0]] for audio_feat in audio_feat_list], dtype=torch.long)
 
 
 
 
 
 
95
  max_length = max(audio_feat.shape[0] for audio_feat in audio_feat_list)
96
  audio_feats = torch.stack(
97
  [
@@ -101,7 +115,7 @@ class BailingMMAudioProcessor(FeatureExtractionMixin):
101
  ) for audio_feat in audio_feat_list
102
  ], dim=0,
103
  )
104
- return {"audio_feats": audio_feats.numpy(), "audio_feats_lengths": audio_feats_lengths.numpy()}
105
 
106
  def preprocess(
107
  self,
@@ -110,10 +124,10 @@ class BailingMMAudioProcessor(FeatureExtractionMixin):
110
  **kwargs,
111
  ) -> BatchFeature:
112
  if isinstance(audios, List):
113
- audio_inputs = self._make_batched_audios([self._preprocess_audio(waveform, sr) for waveform, sr in audios])
114
  else:
115
  waveform, sr = audios
116
- audio_inputs = self._make_batched_audios([self._preprocess_audio(waveform, sr)])
117
  return BatchFeature(data=audio_inputs, tensor_type=return_tensors)
118
 
119
 
@@ -252,6 +266,23 @@ class WavFrontend(torch.nn.Module):
252
  return feats_pad, feats_lens
253
 
254
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
255
  def load_cmvn(cmvn_file):
256
  with open(cmvn_file, 'r', encoding='utf-8') as f:
257
  lines = f.readlines()
 
7
  import torch.utils.data
8
  import torchaudio
9
  import torchaudio.compliance.kaldi as kaldi
10
+ import whisper
11
  from torch.nn.utils.rnn import pad_sequence
12
 
13
  from transformers.utils import TensorType
14
+ from transformers.feature_extraction_utils import FeatureExtractionMixin, BatchFeature
 
15
 
16
  NORM_FACTOR_FOR_DTYPE = {
17
  torch.int8: 2**7,
 
49
 
50
 
51
  class BailingMMAudioProcessor(FeatureExtractionMixin):
52
+ def __init__(self, wav_frontend_args: Dict[str, Any]=None, whisper_frontend_args: Dict[str, Any]=None, **kwargs):
53
  super().__init__(**kwargs)
54
  self.sample_rate = 16000
55
+ if wav_frontend_args is not None:
56
+ self.wav_frontend = WavFrontend(**wav_frontend_args)
57
+ if whisper_frontend_args is not None:
58
+ self.whisper_frontend = WhisperFrontend(**whisper_frontend_args)
59
 
60
  def to_dict(self) -> Dict[str, Any]:
61
  output = copy.deepcopy(self.__dict__)
 
63
  output["wav_frontend"]["cmvn"] = output["wav_frontend"]["cmvn"].tolist()
64
  output["wav_frontend"]["_non_persistent_buffers_set"] = list(output["wav_frontend"]["_non_persistent_buffers_set"])
65
  output["audio_processor_type"] = self.__class__.__name__
66
+ if 'whisper_frontend' in output:
67
+ output["whisper_frontend"] = output["whisper_frontend"].__dict__
68
  return output
69
 
70
  @classmethod
 
90
  """Preprocess an audio or a batch of audios."""
91
  return self.preprocess(audios, **kwargs)
92
 
93
+ def _preprocess_audio(self, waveform: torch.Tensor, sample_rate: int, use_whisper_encoder: bool=False) -> torch.Tensor:
94
  waveform = normalize_audio_tensor(waveform, sample_rate, target_sample_rate=self.sample_rate)
95
+ if not use_whisper_encoder:
96
+ audio_feat = self.wav_frontend(waveform.unsqueeze(0), [len(waveform)])[0].squeeze(0)
97
+ else:
98
+ audio_feat = self.whisper_frontend(waveform.unsqueeze(0), [len(waveform)])[0].squeeze(0)
99
  return audio_feat
100
 
101
+ def _make_batched_audios(self, audio_feat_list: List[torch.Tensor], use_whisper_encoder=False) -> Dict[str, Any]:
102
  audio_feats_lengths = torch.tensor([[audio_feat.shape[0]] for audio_feat in audio_feat_list], dtype=torch.long)
103
+ if not use_whisper_encoder:
104
+ encoder_feats_lengths = audio_feats_lengths
105
+ else:
106
+ # whisper + project layer has two conv
107
+ encoder_feats_lengths = ((audio_feats_lengths-3+2*1)//2+1-3+2*1)//2+1
108
+
109
  max_length = max(audio_feat.shape[0] for audio_feat in audio_feat_list)
110
  audio_feats = torch.stack(
111
  [
 
115
  ) for audio_feat in audio_feat_list
116
  ], dim=0,
117
  )
118
+ return {"audio_feats": audio_feats.numpy(), "audio_feats_lengths": audio_feats_lengths.numpy(), "encoder_feats_lengths": encoder_feats_lengths}
119
 
120
  def preprocess(
121
  self,
 
124
  **kwargs,
125
  ) -> BatchFeature:
126
  if isinstance(audios, List):
127
+ audio_inputs = self._make_batched_audios([self._preprocess_audio(waveform, sr, use_whisper_encoder=kwargs.get('use_whisper_encoder', False)) for waveform, sr in audios], use_whisper_encoder=kwargs.get('use_whisper_encoder', False))
128
  else:
129
  waveform, sr = audios
130
+ audio_inputs = self._make_batched_audios([self._preprocess_audio(waveform, sr, use_whisper_encoder=kwargs.get('use_whisper_encoder', False))])
131
  return BatchFeature(data=audio_inputs, tensor_type=return_tensors)
132
 
133
 
 
266
  return feats_pad, feats_lens
267
 
268
 
269
+ class WhisperFrontend:
270
+ def __init__(self, n_mels: int=128):
271
+ self.n_mels = n_mels
272
+
273
+ def __call__(self, input: torch.Tensor, input_lengths: List[int]):
274
+ """
275
+ input: [B, T]
276
+ input_lengths: [B]
277
+ """
278
+
279
+ assert input.size(0) == 1
280
+
281
+ mel = whisper.log_mel_spectrogram(input.squeeze(0), n_mels=self.n_mels).to(input.device) # [n_mels, T]
282
+ feats_pad = mel.transpose(0, 1).unsqueeze(0) # [B=1, T, n_mels]
283
+ feats_lens = torch.tensor([mel.size(1)], dtype=torch.long) # [B=1]
284
+ return feats_pad, feats_lens
285
+
286
  def load_cmvn(cmvn_file):
287
  with open(cmvn_file, 'r', encoding='utf-8') as f:
288
  lines = f.readlines()
bailingmm_utils.py CHANGED
@@ -268,10 +268,13 @@ def _read_video_decord(
268
  total_frames, video_fps = len(vr), vr.get_avg_fps()
269
  logger.info(f"decord: {video_path=}, {total_frames=}, {video_fps=}, time={time.time() - st:.3f}s")
270
 
271
- sample_method = ele.get("sample", "uniform")
272
  # if sample_method == "sequence":
273
  # total_frames = int(total_frames / video_fps * 2)
274
- num_frames = get_frames(ele, int(total_frames / video_fps * 2))
 
 
 
275
  frame_indices = sample_frames(
276
  num_frames=num_frames, total_frames=total_frames, sample=sample_method
277
  )
@@ -353,14 +356,19 @@ def fetch_video(ele: dict, image_factor: int = IMAGE_FACTOR, return_video_sample
353
  fetch_image({"image": video_element, **process_info}, size_factor=image_factor)
354
  for video_element in ele["video"]
355
  ]
356
- if len(images) > ele["max_frames"]:
357
- num_frames_target = ele["max_frames"]
358
- print(ele["max_frames"])
359
- interval = len(images) // num_frames_target # 计算抽取间隔
360
- images = [images[i] for i in range(0, len(images), interval)][:num_frames_target]
361
  num_frames = ceil_by_factor(len(images), FRAME_FACTOR)
362
  if len(images) < num_frames:
363
  images.extend([images[-1]] * (num_frames - len(images)))
 
 
 
 
 
364
  if return_video_sample_fps:
365
  return images, process_info.pop("sample_fps", 2.0)
366
  return images
@@ -421,6 +429,11 @@ def process_vision_info(
421
  else:
422
  image_inputs.append(fetch_image(vision_info))
423
  elif "video" in vision_info or "video_url" in vision_info:
 
 
 
 
 
424
  video_inputs.append(fetch_video(vision_info))
425
  elif "audio" in vision_info or "audio_url" in vision_info:
426
  if isinstance(vision_info["audio"], (tuple, list)):
 
268
  total_frames, video_fps = len(vr), vr.get_avg_fps()
269
  logger.info(f"decord: {video_path=}, {total_frames=}, {video_fps=}, time={time.time() - st:.3f}s")
270
 
271
+ sample_method = ele.get("sample", "sequence")
272
  # if sample_method == "sequence":
273
  # total_frames = int(total_frames / video_fps * 2)
274
+ if video_fps > 2.0 and total_frames / float(video_fps) > 5.0:
275
+ num_frames = get_frames(ele, int(total_frames / float(video_fps) * 2))
276
+ else:
277
+ num_frames = get_frames(ele, total_frames)
278
  frame_indices = sample_frames(
279
  num_frames=num_frames, total_frames=total_frames, sample=sample_method
280
  )
 
356
  fetch_image({"image": video_element, **process_info}, size_factor=image_factor)
357
  for video_element in ele["video"]
358
  ]
359
+ # if len(images) > ele["max_frames"]:
360
+ # num_frames_target = ele["max_frames"]
361
+ # print(ele["max_frames"])
362
+ # interval = len(images) // num_frames_target # 计算抽取间隔
363
+ # images = [images[i] for i in range(0, len(images), interval)][:num_frames_target]
364
  num_frames = ceil_by_factor(len(images), FRAME_FACTOR)
365
  if len(images) < num_frames:
366
  images.extend([images[-1]] * (num_frames - len(images)))
367
+ if len(images) > ele["max_frames"]:
368
+ frame_indices = sample_frames(
369
+ num_frames=ele["max_frames"], total_frames=len(images), sample="uniform",
370
+ )
371
+ images = [images[i] for i in frame_indices]
372
  if return_video_sample_fps:
373
  return images, process_info.pop("sample_fps", 2.0)
374
  return images
 
429
  else:
430
  image_inputs.append(fetch_image(vision_info))
431
  elif "video" in vision_info or "video_url" in vision_info:
432
+ if is_video(vision_info['video']):
433
+ data_value = vision_info['video']
434
+ else:
435
+ data_value = [os.path.join(vision_info['video'], frame) for frame in os.listdir(vision_info['video'])]
436
+ vision_info['video']=data_value
437
  video_inputs.append(fetch_video(vision_info))
438
  elif "audio" in vision_info or "audio_url" in vision_info:
439
  if isinstance(vision_info["audio"], (tuple, list)):
config.json CHANGED
@@ -1,5 +1,4 @@
1
  {
2
- "_name_or_path": ".",
3
  "architectures": [
4
  "BailingMMNativeForConditionalGeneration"
5
  ],
@@ -39,7 +38,6 @@
39
  "AutoConfig": "configuration_bailingmm.BailingMMConfig"
40
  },
41
  "llm_config": {
42
- "_name_or_path": "",
43
  "add_cross_attention": false,
44
  "architectures": [
45
  "BailingMoeForCausalLM"
@@ -93,7 +91,7 @@
93
  "moe_intermediate_size": 1408,
94
  "multi_gate": true,
95
  "no_repeat_ngram_size": 0,
96
- "norm_head": true,
97
  "norm_softmax": false,
98
  "norm_topk_prob": true,
99
  "num_attention_heads": 16,
@@ -146,7 +144,6 @@
146
  "mlp_depth": 2,
147
  "model_type": "bailingmm",
148
  "talker_config": {
149
- "_name_or_path": "./talker",
150
  "add_cross_attention": false,
151
  "architectures": null,
152
  "audio_vocab_size": 32768,
@@ -220,7 +217,6 @@
220
  "torch_dtype": "float32",
221
  "transformers_version": "4.45.0",
222
  "vision_config": {
223
- "_name_or_path": "",
224
  "add_cross_attention": false,
225
  "architectures": [
226
  "Qwen2_5_VisionTransformer"
@@ -307,5 +303,16 @@
307
  "typical_p": 1.0,
308
  "use_bfloat16": false,
309
  "window_size": 112
 
 
 
 
 
 
 
 
 
 
 
310
  }
311
  }
 
1
  {
 
2
  "architectures": [
3
  "BailingMMNativeForConditionalGeneration"
4
  ],
 
38
  "AutoConfig": "configuration_bailingmm.BailingMMConfig"
39
  },
40
  "llm_config": {
 
41
  "add_cross_attention": false,
42
  "architectures": [
43
  "BailingMoeForCausalLM"
 
91
  "moe_intermediate_size": 1408,
92
  "multi_gate": true,
93
  "no_repeat_ngram_size": 0,
94
+ "norm_head": false,
95
  "norm_softmax": false,
96
  "norm_topk_prob": true,
97
  "num_attention_heads": 16,
 
144
  "mlp_depth": 2,
145
  "model_type": "bailingmm",
146
  "talker_config": {
 
147
  "add_cross_attention": false,
148
  "architectures": null,
149
  "audio_vocab_size": 32768,
 
217
  "torch_dtype": "float32",
218
  "transformers_version": "4.45.0",
219
  "vision_config": {
 
220
  "add_cross_attention": false,
221
  "architectures": [
222
  "Qwen2_5_VisionTransformer"
 
303
  "typical_p": 1.0,
304
  "use_bfloat16": false,
305
  "window_size": 112
306
+ },
307
+ "whisper_config": {
308
+ "ds_kernel_size": 3,
309
+ "ds_stride": 2,
310
+ "whisper_encoder_config": {
311
+ "n_ctx": 15000,
312
+ "n_head": 20,
313
+ "n_layer": 32,
314
+ "n_mels": 128,
315
+ "n_state": 1280
316
+ }
317
  }
318
  }
configuration_bailing_moe.py CHANGED
@@ -42,6 +42,7 @@ class BailingMoeConfig(PretrainedConfig):
42
  output_router_logits=False,
43
  multi_gate=False,
44
  image_patch_token=126346,
 
45
  **kwargs,
46
  ):
47
  self.num_hidden_layers = num_hidden_layers
@@ -78,4 +79,5 @@ class BailingMoeConfig(PretrainedConfig):
78
  self.output_router_logits = output_router_logits
79
  self.multi_gate = multi_gate
80
  self.image_patch_token = image_patch_token
81
- super().__init__(pad_token_id=pad_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs)
 
 
42
  output_router_logits=False,
43
  multi_gate=False,
44
  image_patch_token=126346,
45
+ _attn_implementation="flash_attention_2",
46
  **kwargs,
47
  ):
48
  self.num_hidden_layers = num_hidden_layers
 
79
  self.output_router_logits = output_router_logits
80
  self.multi_gate = multi_gate
81
  self.image_patch_token = image_patch_token
82
+ super().__init__(pad_token_id=pad_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs)
83
+ self._attn_implementation = _attn_implementation
configuration_bailingmm.py CHANGED
@@ -18,6 +18,8 @@ from qwen2_5_vit import Qwen2_5_VLVisionConfig
18
  from configuration_audio import GLMAudioConfig
19
  from configuration_bailing_moe import BailingMoeConfig
20
  from configuration_bailing_talker import BailingTalkerConfig
 
 
21
 
22
  class BailingMMConfig(PretrainedConfig):
23
  model_type = "bailingmm"
@@ -28,6 +30,7 @@ class BailingMMConfig(PretrainedConfig):
28
  llm_config: BailingMoeConfig = None,
29
  vision_config: Qwen2_5_VLVisionConfig = None,
30
  audio_config: GLMAudioConfig = None,
 
31
  talker_config: BailingTalkerConfig = None,
32
  **kwargs
33
  ):
@@ -36,4 +39,5 @@ class BailingMMConfig(PretrainedConfig):
36
  self.llm_config = BailingMoeConfig(**llm_config) if isinstance(llm_config, dict) else llm_config
37
  self.mlp_depth = mlp_depth
38
  self.talker_config = BailingTalkerConfig(**talker_config) if isinstance(talker_config, dict) else talker_config
 
39
  super().__init__(**kwargs)
 
18
  from configuration_audio import GLMAudioConfig
19
  from configuration_bailing_moe import BailingMoeConfig
20
  from configuration_bailing_talker import BailingTalkerConfig
21
+ from configuration_whisper_encoder import WhisperEncoderConfig
22
+
23
 
24
  class BailingMMConfig(PretrainedConfig):
25
  model_type = "bailingmm"
 
30
  llm_config: BailingMoeConfig = None,
31
  vision_config: Qwen2_5_VLVisionConfig = None,
32
  audio_config: GLMAudioConfig = None,
33
+ whisper_config: WhisperEncoderConfig = None,
34
  talker_config: BailingTalkerConfig = None,
35
  **kwargs
36
  ):
 
39
  self.llm_config = BailingMoeConfig(**llm_config) if isinstance(llm_config, dict) else llm_config
40
  self.mlp_depth = mlp_depth
41
  self.talker_config = BailingTalkerConfig(**talker_config) if isinstance(talker_config, dict) else talker_config
42
+ self.whisper_config = WhisperEncoderConfig(**whisper_config) if isinstance(whisper_config, dict) else whisper_config
43
  super().__init__(**kwargs)
configuration_whisper_encoder.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2022 shunxing1234 and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ GLMAudio model configuration """
16
+
17
+ from transformers.configuration_utils import PretrainedConfig
18
+ from transformers.utils import logging
19
+
20
+ logger = logging.get_logger(__name__)
21
+
22
+
23
+ class WhisperEncoderConfig(PretrainedConfig):
24
+ def __init__(
25
+ self,
26
+ whisper_encoder_config: dict = None,
27
+ ds_kernel_size=3,
28
+ ds_stride=2,
29
+ **kwargs
30
+ ):
31
+ self.whisper_encoder_config = whisper_encoder_config
32
+ self.ds_kernel_size = ds_kernel_size
33
+ self.ds_stride = ds_stride
34
+
35
+ super().__init__(
36
+ **kwargs
37
+ )
connector/LICENSE ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Qwen RESEARCH LICENSE AGREEMENT
2
+
3
+ Qwen RESEARCH LICENSE AGREEMENT Release Date: September 19, 2024
4
+
5
+ By clicking to agree or by using or distributing any portion or element of the Qwen Materials, you will be deemed to have recognized and accepted the content of this Agreement, which is effective immediately.
6
+
7
+ 1. Definitions
8
+ a. This Qwen RESEARCH LICENSE AGREEMENT (this "Agreement") shall mean the terms and conditions for use, reproduction, distribution and modification of the Materials as defined by this Agreement.
9
+ b. "We" (or "Us") shall mean Alibaba Cloud.
10
+ c. "You" (or "Your") shall mean a natural person or legal entity exercising the rights granted by this Agreement and/or using the Materials for any purpose and in any field of use.
11
+ d. "Third Parties" shall mean individuals or legal entities that are not under common control with us or you.
12
+ e. "Qwen" shall mean the large language models, and software and algorithms, consisting of trained model weights, parameters (including optimizer states), machine-learning model code, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing distributed by us.
13
+ f. "Materials" shall mean, collectively, Alibaba Cloud's proprietary Qwen and Documentation (and any portion thereof) made available under this Agreement.
14
+ g. "Source" form shall mean the preferred form for making modifications, including but not limited to model source code, documentation source, and configuration files.
15
+ h. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.
16
+ i. "Non-Commercial" shall mean for research or evaluation purposes only.
17
+
18
+ 2. Grant of Rights
19
+ a. You are granted a non-exclusive, worldwide, non-transferable and royalty-free limited license under Alibaba Cloud's intellectual property or other rights owned by us embodied in the Materials to use, reproduce, distribute, copy, create derivative works of, and make modifications to the Materials FOR NON-COMMERCIAL PURPOSES ONLY.
20
+ b. If you are commercially using the Materials, you shall request a license from us.
21
+
22
+ 3. Redistribution
23
+ You may distribute copies or make the Materials, or derivative works thereof, available as part of a product or service that contains any of them, with or without modifications, and in Source or Object form, provided that you meet the following conditions:
24
+ a. You shall give any other recipients of the Materials or derivative works a copy of this Agreement;
25
+ b. You shall cause any modified files to carry prominent notices stating that you changed the files;
26
+ c. You shall retain in all copies of the Materials that you distribute the following attribution notices within a "Notice" text file distributed as a part of such copies: "Qwen is licensed under the Qwen RESEARCH LICENSE AGREEMENT, Copyright (c) Alibaba Cloud. All Rights Reserved."; and
27
+ d. You may add your own copyright statement to your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of your modifications, or for any such derivative works as a whole, provided your use, reproduction, and distribution of the work otherwise complies with the terms and conditions of this Agreement.
28
+
29
+ 4. Rules of use
30
+ a. The Materials may be subject to export controls or restrictions in China, the United States or other countries or regions. You shall comply with applicable laws and regulations in your use of the Materials.
31
+ b. If you use the Materials or any outputs or results therefrom to create, train, fine-tune, or improve an AI model that is distributed or made available, you shall prominently display “Built with Qwen” or “Improved using Qwen” in the related product documentation.
32
+
33
+ 5. Intellectual Property
34
+ a. We retain ownership of all intellectual property rights in and to the Materials and derivatives made by or for us. Conditioned upon compliance with the terms and conditions of this Agreement, with respect to any derivative works and modifications of the Materials that are made by you, you are and will be the owner of such derivative works and modifications.
35
+ b. No trademark license is granted to use the trade names, trademarks, service marks, or product names of us, except as required to fulfill notice requirements under this Agreement or as required for reasonable and customary use in describing and redistributing the Materials.
36
+ c. If you commence a lawsuit or other proceedings (including a cross-claim or counterclaim in a lawsuit) against us or any entity alleging that the Materials or any output therefrom, or any part of the foregoing, infringe any intellectual property or other right owned or licensable by you, then all licenses granted to you under this Agreement shall terminate as of the date such lawsuit or other proceeding is commenced or brought.
37
+
38
+ 6. Disclaimer of Warranty and Limitation of Liability
39
+ a. We are not obligated to support, update, provide training for, or develop any further version of the Qwen Materials or to grant any license thereto.
40
+ b. THE MATERIALS ARE PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. WE MAKE NO WARRANTY AND ASSUME NO RESPONSIBILITY FOR THE SAFETY OR STABILITY OF THE MATERIALS AND ANY OUTPUT THEREFROM.
41
+ c. IN NO EVENT SHALL WE BE LIABLE TO YOU FOR ANY DAMAGES, INCLUDING, BUT NOT LIMITED TO ANY DIRECT, OR INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM YOUR USE OR INABILITY TO USE THE MATERIALS OR ANY OUTPUT OF IT, NO MATTER HOW IT’S CAUSED.
42
+ d. You will defend, indemnify and hold harmless us from and against any claim by any third party arising out of or related to your use or distribution of the Materials.
43
+
44
+ 7. Survival and Termination.
45
+ a. The term of this Agreement shall commence upon your acceptance of this Agreement or access to the Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein.
46
+ b. We may terminate this Agreement if you breach any of the terms or conditions of this Agreement. Upon termination of this Agreement, you must delete and cease use of the Materials. Sections 6 and 8 shall survive the termination of this Agreement.
47
+
48
+ 8. Governing Law and Jurisdiction.
49
+ a. This Agreement and any dispute arising out of or relating to it will be governed by the laws of China, without regard to conflict of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement.
50
+ b. The People's Courts in Hangzhou City shall have exclusive jurisdiction over any dispute arising out of this Agreement.
51
+
52
+ 9. Other Terms and Conditions.
53
+ a. Any arrangements, understandings, or agreements regarding the Material not stated herein are separate from and independent of the terms and conditions of this Agreement. You shall request a separate license from us, if you use the Materials in ways not expressly agreed to in this Agreement.
54
+ b. We shall not be bound by any additional or different terms or conditions communicated by you unless expressly agreed.
connector/README.md ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: other
3
+ license_name: qwen-research
4
+ license_link: https://huggingface.co/Qwen/Qwen2.5-3B-Instruct/blob/main/LICENSE
5
+ language:
6
+ - en
7
+ pipeline_tag: text-generation
8
+ base_model: Qwen/Qwen2.5-3B
9
+ tags:
10
+ - chat
11
+ library_name: transformers
12
+ ---
13
+
14
+ # Qwen2.5-3B-Instruct
15
+
16
+ ## Introduction
17
+
18
+ Qwen2.5 is the latest series of Qwen large language models. For Qwen2.5, we release a number of base language models and instruction-tuned language models ranging from 0.5 to 72 billion parameters. Qwen2.5 brings the following improvements upon Qwen2:
19
+
20
+ - Significantly **more knowledge** and has greatly improved capabilities in **coding** and **mathematics**, thanks to our specialized expert models in these domains.
21
+ - Significant improvements in **instruction following**, **generating long texts** (over 8K tokens), **understanding structured data** (e.g., tables), and **generating structured outputs**, especially JSON. **More resilient to the diversity of system prompts**, enhancing role-play implementation and condition-setting for chatbots.
22
+ - **Long-context Support** up to 128K tokens and can generate up to 8K tokens.
23
+ - **Multilingual support** for over 29 languages, including Chinese, English, French, Spanish, Portuguese, German, Italian, Russian, Japanese, Korean, Vietnamese, Thai, Arabic, and more.
24
+
25
+ **This repo contains the instruction-tuned 3B Qwen2.5 model**, which has the following features:
26
+ - Type: Causal Language Models
27
+ - Training Stage: Pretraining & Post-training
28
+ - Architecture: transformers with RoPE, SwiGLU, RMSNorm, Attention QKV bias and tied word embeddings
29
+ - Number of Parameters: 3.09B
30
+ - Number of Parameters (Non-Embedding): 2.77B
31
+ - Number of Layers: 36
32
+ - Number of Attention Heads (GQA): 16 for Q and 2 for KV
33
+ - Context Length: Full 32,768 tokens and generation 8192 tokens
34
+
35
+ For more details, please refer to our [blog](https://qwenlm.github.io/blog/qwen2.5/), [GitHub](https://github.com/QwenLM/Qwen2.5), and [Documentation](https://qwen.readthedocs.io/en/latest/).
36
+
37
+ ## Requirements
38
+
39
+ The code for Qwen2.5 has been merged into the latest Hugging Face `transformers`, and we advise you to use the latest version of `transformers`.
40
+
41
+ With `transformers<4.37.0`, you will encounter the following error:
42
+ ```
43
+ KeyError: 'qwen2'
44
+ ```
45
+
46
+ ## Quickstart
47
+
48
+ Here is a code snippet using `apply_chat_template` that shows how to load the tokenizer and model and how to generate content.
49
+
50
+ ```python
51
+ from transformers import AutoModelForCausalLM, AutoTokenizer
52
+
53
+ model_name = "Qwen/Qwen2.5-3B-Instruct"
54
+
55
+ model = AutoModelForCausalLM.from_pretrained(
56
+ model_name,
57
+ torch_dtype="auto",
58
+ device_map="auto"
59
+ )
60
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
61
+
62
+ prompt = "Give me a short introduction to large language model."
63
+ messages = [
64
+ {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
65
+ {"role": "user", "content": prompt}
66
+ ]
67
+ text = tokenizer.apply_chat_template(
68
+ messages,
69
+ tokenize=False,
70
+ add_generation_prompt=True
71
+ )
72
+ model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
73
+
74
+ generated_ids = model.generate(
75
+ **model_inputs,
76
+ max_new_tokens=512
77
+ )
78
+ generated_ids = [
79
+ output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
80
+ ]
81
+
82
+ response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
83
+ ```
84
+
85
+
86
+ ## Evaluation & Performance
87
+
88
+ Detailed evaluation results are reported in this [📑 blog](https://qwenlm.github.io/blog/qwen2.5/).
89
+
90
+ For requirements on GPU memory and the respective throughput, see results [here](https://qwen.readthedocs.io/en/latest/benchmark/speed_benchmark.html).
91
+
92
+ ## Citation
93
+
94
+ If you find our work helpful, feel free to cite us.
95
+
96
+ ```
97
+ @misc{qwen2.5,
98
+ title = {Qwen2.5: A Party of Foundation Models},
99
+ url = {https://qwenlm.github.io/blog/qwen2.5/},
100
+ author = {Qwen Team},
101
+ month = {September},
102
+ year = {2024}
103
+ }
104
+
105
+ @article{qwen2,
106
+ title={Qwen2 Technical Report},
107
+ author={An Yang and Baosong Yang and Binyuan Hui and Bo Zheng and Bowen Yu and Chang Zhou and Chengpeng Li and Chengyuan Li and Dayiheng Liu and Fei Huang and Guanting Dong and Haoran Wei and Huan Lin and Jialong Tang and Jialin Wang and Jian Yang and Jianhong Tu and Jianwei Zhang and Jianxin Ma and Jin Xu and Jingren Zhou and Jinze Bai and Jinzheng He and Junyang Lin and Kai Dang and Keming Lu and Keqin Chen and Kexin Yang and Mei Li and Mingfeng Xue and Na Ni and Pei Zhang and Peng Wang and Ru Peng and Rui Men and Ruize Gao and Runji Lin and Shijie Wang and Shuai Bai and Sinan Tan and Tianhang Zhu and Tianhao Li and Tianyu Liu and Wenbin Ge and Xiaodong Deng and Xiaohuan Zhou and Xingzhang Ren and Xinyu Zhang and Xipin Wei and Xuancheng Ren and Yang Fan and Yang Yao and Yichang Zhang and Yu Wan and Yunfei Chu and Yuqiong Liu and Zeyu Cui and Zhenru Zhang and Zhihao Fan},
108
+ journal={arXiv preprint arXiv:2407.10671},
109
+ year={2024}
110
+ }
111
+ ```
connector/config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/video_hy2/modelzoo/Qwen2.5-3B-Instruct/",
3
+ "architectures": [
4
+ "Qwen2ForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 151643,
8
+ "eos_token_id": 151645,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 2048,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 11008,
13
+ "max_position_embeddings": 32768,
14
+ "max_window_layers": 70,
15
+ "model_type": "qwen2",
16
+ "num_attention_heads": 16,
17
+ "num_hidden_layers": 36,
18
+ "num_key_value_heads": 2,
19
+ "rms_norm_eps": 1e-06,
20
+ "rope_scaling": null,
21
+ "rope_theta": 1000000.0,
22
+ "sliding_window": null,
23
+ "tie_word_embeddings": true,
24
+ "torch_dtype": "float32",
25
+ "transformers_version": "4.46.1",
26
+ "use_cache": true,
27
+ "use_sliding_window": false,
28
+ "vocab_size": 151936
29
+ }
connector/generation_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "repetition_penalty": 1.05,
10
+ "temperature": 0.7,
11
+ "top_k": 20,
12
+ "top_p": 0.8,
13
+ "transformers_version": "4.46.1"
14
+ }
connector/gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
connector/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00015.safetensors → connector/model-00001-of-00003.safetensors RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cf878ba731271e7ec124eb0c6b8f1f2567da16693d6575fcf4bb418f2b247c22
3
- size 4989626072
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a83d6ee4f16eaddeb3531ddda142489eb8e11f5b8b0f06c2b2494154693cc8c4
3
+ size 4982131536
connector/model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65dcc8192c84451c559229ccadf78a8040c9a9aa4571f8fe9728a52b547ba32a
3
+ size 4932949336
connector/model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f0f64f6662c7059f6578f03a0ed65c11c2a73f23c6c55d0872edb1423840b21
3
+ size 2428723160
connector/model.safetensors.index.json ADDED
@@ -0,0 +1,441 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 12343754752
4
+ },
5
+ "weight_map": {
6
+ "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
7
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
8
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
9
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
10
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
11
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
12
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
13
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
14
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
15
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
16
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
17
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
18
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
19
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
20
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
21
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
22
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
23
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
24
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
25
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
26
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
27
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
28
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
29
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
30
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
31
+ "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors",
32
+ "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
33
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
34
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
35
+ "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
36
+ "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
37
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
38
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
39
+ "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
40
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
41
+ "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
42
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
43
+ "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors",
44
+ "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
45
+ "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
46
+ "model.layers.11.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
47
+ "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
48
+ "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
49
+ "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
50
+ "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
51
+ "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
52
+ "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
53
+ "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
54
+ "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
55
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors",
56
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
57
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
58
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
59
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
60
+ "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
61
+ "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
62
+ "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
63
+ "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
64
+ "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
65
+ "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
66
+ "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
67
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors",
68
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
69
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
70
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
71
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
72
+ "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
73
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
74
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
75
+ "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
76
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
77
+ "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
78
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
79
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
80
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
81
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
82
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
83
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
84
+ "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
85
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
86
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
87
+ "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
88
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
89
+ "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
90
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
91
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
92
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
93
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
94
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
95
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
96
+ "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
97
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
98
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
99
+ "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
100
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
101
+ "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
102
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
103
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
104
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
105
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
106
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
107
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
108
+ "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
109
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
110
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
111
+ "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
112
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
113
+ "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
114
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
115
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
116
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
117
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
118
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
119
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
120
+ "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
121
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
122
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
123
+ "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
124
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
125
+ "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
126
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
127
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
128
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
129
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
130
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
131
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
132
+ "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
133
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
134
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
135
+ "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
136
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
137
+ "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
138
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
139
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
140
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
141
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
142
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
143
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
144
+ "model.layers.19.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
145
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
146
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
147
+ "model.layers.19.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
148
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
149
+ "model.layers.19.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
150
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
151
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
152
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
153
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
154
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
155
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
156
+ "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
157
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
158
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
159
+ "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
160
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
161
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
162
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
163
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
164
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
165
+ "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
166
+ "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
167
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
168
+ "model.layers.20.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
169
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
170
+ "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
171
+ "model.layers.20.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
172
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
173
+ "model.layers.20.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
174
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
175
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
176
+ "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
177
+ "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
178
+ "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
179
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
180
+ "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
181
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
182
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
183
+ "model.layers.21.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
184
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
185
+ "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
186
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
187
+ "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors",
188
+ "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
189
+ "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
190
+ "model.layers.22.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
191
+ "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
192
+ "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
193
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
194
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
195
+ "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
196
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
197
+ "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
198
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
199
+ "model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors",
200
+ "model.layers.23.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
201
+ "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
202
+ "model.layers.23.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
203
+ "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
204
+ "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
205
+ "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
206
+ "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
207
+ "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
208
+ "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
209
+ "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
210
+ "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
211
+ "model.layers.24.input_layernorm.weight": "model-00002-of-00003.safetensors",
212
+ "model.layers.24.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
213
+ "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
214
+ "model.layers.24.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
215
+ "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
216
+ "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
217
+ "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
218
+ "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
219
+ "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
220
+ "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
221
+ "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
222
+ "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
223
+ "model.layers.25.input_layernorm.weight": "model-00002-of-00003.safetensors",
224
+ "model.layers.25.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
225
+ "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
226
+ "model.layers.25.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
227
+ "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
228
+ "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
229
+ "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
230
+ "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
231
+ "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
232
+ "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
233
+ "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
234
+ "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
235
+ "model.layers.26.input_layernorm.weight": "model-00002-of-00003.safetensors",
236
+ "model.layers.26.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
237
+ "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
238
+ "model.layers.26.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
239
+ "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
240
+ "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
241
+ "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
242
+ "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
243
+ "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
244
+ "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
245
+ "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
246
+ "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
247
+ "model.layers.27.input_layernorm.weight": "model-00002-of-00003.safetensors",
248
+ "model.layers.27.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
249
+ "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
250
+ "model.layers.27.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
251
+ "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
252
+ "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
253
+ "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
254
+ "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
255
+ "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
256
+ "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
257
+ "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
258
+ "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
259
+ "model.layers.28.input_layernorm.weight": "model-00003-of-00003.safetensors",
260
+ "model.layers.28.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
261
+ "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
262
+ "model.layers.28.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
263
+ "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
264
+ "model.layers.28.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
265
+ "model.layers.28.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
266
+ "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
267
+ "model.layers.28.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
268
+ "model.layers.28.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
269
+ "model.layers.28.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
270
+ "model.layers.28.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
271
+ "model.layers.29.input_layernorm.weight": "model-00003-of-00003.safetensors",
272
+ "model.layers.29.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
273
+ "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
274
+ "model.layers.29.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
275
+ "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
276
+ "model.layers.29.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
277
+ "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
278
+ "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
279
+ "model.layers.29.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
280
+ "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
281
+ "model.layers.29.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
282
+ "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
283
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
284
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
285
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
286
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
287
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
288
+ "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
289
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
290
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
291
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
292
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
293
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
294
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
295
+ "model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors",
296
+ "model.layers.30.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
297
+ "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
298
+ "model.layers.30.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
299
+ "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
300
+ "model.layers.30.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
301
+ "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
302
+ "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
303
+ "model.layers.30.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
304
+ "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
305
+ "model.layers.30.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
306
+ "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
307
+ "model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors",
308
+ "model.layers.31.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
309
+ "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
310
+ "model.layers.31.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
311
+ "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
312
+ "model.layers.31.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
313
+ "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
314
+ "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
315
+ "model.layers.31.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
316
+ "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
317
+ "model.layers.31.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
318
+ "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
319
+ "model.layers.32.input_layernorm.weight": "model-00003-of-00003.safetensors",
320
+ "model.layers.32.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
321
+ "model.layers.32.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
322
+ "model.layers.32.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
323
+ "model.layers.32.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
324
+ "model.layers.32.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
325
+ "model.layers.32.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
326
+ "model.layers.32.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
327
+ "model.layers.32.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
328
+ "model.layers.32.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
329
+ "model.layers.32.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
330
+ "model.layers.32.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
331
+ "model.layers.33.input_layernorm.weight": "model-00003-of-00003.safetensors",
332
+ "model.layers.33.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
333
+ "model.layers.33.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
334
+ "model.layers.33.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
335
+ "model.layers.33.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
336
+ "model.layers.33.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
337
+ "model.layers.33.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
338
+ "model.layers.33.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
339
+ "model.layers.33.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
340
+ "model.layers.33.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
341
+ "model.layers.33.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
342
+ "model.layers.33.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
343
+ "model.layers.34.input_layernorm.weight": "model-00003-of-00003.safetensors",
344
+ "model.layers.34.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
345
+ "model.layers.34.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
346
+ "model.layers.34.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
347
+ "model.layers.34.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
348
+ "model.layers.34.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
349
+ "model.layers.34.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
350
+ "model.layers.34.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
351
+ "model.layers.34.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
352
+ "model.layers.34.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
353
+ "model.layers.34.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
354
+ "model.layers.34.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
355
+ "model.layers.35.input_layernorm.weight": "model-00003-of-00003.safetensors",
356
+ "model.layers.35.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
357
+ "model.layers.35.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
358
+ "model.layers.35.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
359
+ "model.layers.35.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
360
+ "model.layers.35.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
361
+ "model.layers.35.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
362
+ "model.layers.35.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
363
+ "model.layers.35.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
364
+ "model.layers.35.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
365
+ "model.layers.35.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
366
+ "model.layers.35.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
367
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
368
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
369
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
370
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
371
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
372
+ "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
373
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
374
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
375
+ "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
376
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
377
+ "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
378
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
379
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
380
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
381
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
382
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
383
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
384
+ "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
385
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
386
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
387
+ "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
388
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
389
+ "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
390
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
391
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
392
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
393
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
394
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
395
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
396
+ "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
397
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
398
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
399
+ "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
400
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
401
+ "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
402
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
403
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
404
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
405
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
406
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
407
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
408
+ "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
409
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
410
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
411
+ "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
412
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
413
+ "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
414
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
415
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
416
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
417
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
418
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
419
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
420
+ "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
421
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
422
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
423
+ "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
424
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
425
+ "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
426
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
427
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
428
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
429
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
430
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
431
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
432
+ "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
433
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
434
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
435
+ "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
436
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
437
+ "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
438
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
439
+ "model.norm.weight": "model-00003-of-00003.safetensors"
440
+ }
441
+ }
connector/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
connector/tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
199
+ "clean_up_tokenization_spaces": false,
200
+ "eos_token": "<|im_end|>",
201
+ "errors": "replace",
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "split_special_tokens": false,
205
+ "tokenizer_class": "Qwen2Tokenizer",
206
+ "unk_token": null
207
+ }
connector/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
data/openai_whisper-20240930-py3-none-any.whl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9616410f3c1e0ce500bc34f0231d1e56ce59a0081baba22253d153f01e8938f0
3
+ size 803668
data/wavs/speechQA_sample.wav ADDED
Binary file (51.6 kB). View file
 
diffusion/__init__.py ADDED
File without changes
diffusion/pipeline_sana.py ADDED
@@ -0,0 +1,1011 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 PixArt-Sigma Authors and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import html
16
+ import inspect
17
+ import re
18
+ import urllib.parse as ul
19
+ import warnings
20
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
21
+
22
+ import torch
23
+ from transformers import Gemma2PreTrainedModel, GemmaTokenizer, GemmaTokenizerFast
24
+
25
+ from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
26
+ from diffusers.image_processor import PixArtImageProcessor
27
+ from diffusers.loaders import SanaLoraLoaderMixin
28
+ from diffusers.models import AutoencoderDC, SanaTransformer2DModel
29
+ from diffusers.schedulers import DPMSolverMultistepScheduler
30
+ from diffusers.utils import (
31
+ BACKENDS_MAPPING,
32
+ USE_PEFT_BACKEND,
33
+ is_bs4_available,
34
+ is_ftfy_available,
35
+ is_torch_xla_available,
36
+ logging,
37
+ replace_example_docstring,
38
+ scale_lora_layers,
39
+ unscale_lora_layers,
40
+ )
41
+ from diffusers.utils.torch_utils import randn_tensor
42
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
43
+ from diffusers.pipelines.pixart_alpha.pipeline_pixart_alpha import (
44
+ ASPECT_RATIO_512_BIN,
45
+ ASPECT_RATIO_1024_BIN,
46
+ )
47
+ from diffusers.pipelines.pixart_alpha.pipeline_pixart_sigma import ASPECT_RATIO_2048_BIN
48
+ from diffusers.pipelines.sana.pipeline_output import SanaPipelineOutput
49
+
50
+
51
+ if is_torch_xla_available():
52
+ import torch_xla.core.xla_model as xm
53
+
54
+ XLA_AVAILABLE = True
55
+ else:
56
+ XLA_AVAILABLE = False
57
+
58
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
59
+
60
+ if is_bs4_available():
61
+ from bs4 import BeautifulSoup
62
+
63
+ if is_ftfy_available():
64
+ import ftfy
65
+
66
+
67
+ ASPECT_RATIO_4096_BIN = {
68
+ "0.25": [2048.0, 8192.0],
69
+ "0.26": [2048.0, 7936.0],
70
+ "0.27": [2048.0, 7680.0],
71
+ "0.28": [2048.0, 7424.0],
72
+ "0.32": [2304.0, 7168.0],
73
+ "0.33": [2304.0, 6912.0],
74
+ "0.35": [2304.0, 6656.0],
75
+ "0.4": [2560.0, 6400.0],
76
+ "0.42": [2560.0, 6144.0],
77
+ "0.48": [2816.0, 5888.0],
78
+ "0.5": [2816.0, 5632.0],
79
+ "0.52": [2816.0, 5376.0],
80
+ "0.57": [3072.0, 5376.0],
81
+ "0.6": [3072.0, 5120.0],
82
+ "0.68": [3328.0, 4864.0],
83
+ "0.72": [3328.0, 4608.0],
84
+ "0.78": [3584.0, 4608.0],
85
+ "0.82": [3584.0, 4352.0],
86
+ "0.88": [3840.0, 4352.0],
87
+ "0.94": [3840.0, 4096.0],
88
+ "1.0": [4096.0, 4096.0],
89
+ "1.07": [4096.0, 3840.0],
90
+ "1.13": [4352.0, 3840.0],
91
+ "1.21": [4352.0, 3584.0],
92
+ "1.29": [4608.0, 3584.0],
93
+ "1.38": [4608.0, 3328.0],
94
+ "1.46": [4864.0, 3328.0],
95
+ "1.67": [5120.0, 3072.0],
96
+ "1.75": [5376.0, 3072.0],
97
+ "2.0": [5632.0, 2816.0],
98
+ "2.09": [5888.0, 2816.0],
99
+ "2.4": [6144.0, 2560.0],
100
+ "2.5": [6400.0, 2560.0],
101
+ "2.89": [6656.0, 2304.0],
102
+ "3.0": [6912.0, 2304.0],
103
+ "3.11": [7168.0, 2304.0],
104
+ "3.62": [7424.0, 2048.0],
105
+ "3.75": [7680.0, 2048.0],
106
+ "3.88": [7936.0, 2048.0],
107
+ "4.0": [8192.0, 2048.0],
108
+ }
109
+
110
+ EXAMPLE_DOC_STRING = """
111
+ Examples:
112
+ ```py
113
+ >>> import torch
114
+ >>> from diffusers import SanaPipeline
115
+
116
+ >>> pipe = SanaPipeline.from_pretrained(
117
+ ... "Efficient-Large-Model/Sana_1600M_1024px_BF16_diffusers", torch_dtype=torch.float32
118
+ ... )
119
+ >>> pipe.to("cuda")
120
+ >>> pipe.text_encoder.to(torch.bfloat16)
121
+ >>> pipe.transformer = pipe.transformer.to(torch.bfloat16)
122
+
123
+ >>> image = pipe(prompt='a cyberpunk cat with a neon sign that says "Sana"')[0]
124
+ >>> image[0].save("output.png")
125
+ ```
126
+ """
127
+
128
+
129
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
130
def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    sigmas: Optional[List[float]] = None,
    **kwargs,
):
    r"""
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`List[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`List[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.

    Raises:
        ValueError: If both `timesteps` and `sigmas` are given, or if a custom schedule is requested but the
            scheduler's `set_timesteps` has no matching `timesteps` / `sigmas` parameter.
    """
    if timesteps is not None and sigmas is not None:
        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")

    # Default path: no custom schedule, so the caller's step count is forwarded positionally and returned as given.
    if timesteps is None and sigmas is None:
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        return scheduler.timesteps, num_inference_steps

    # Custom schedule: look the signature up once; the parameter mapping supports `in` directly.
    supported_params = inspect.signature(scheduler.set_timesteps).parameters

    if timesteps is not None:
        if "timesteps" not in supported_params:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                " timestep schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
    else:
        if "sigmas" not in supported_params:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                " sigmas schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)

    # With a custom schedule the step count is derived from what the scheduler actually produced,
    # overriding any `num_inference_steps` the caller passed.
    timesteps = scheduler.timesteps
    return timesteps, len(timesteps)
187
+
188
+
189
+ class SanaPipeline(DiffusionPipeline, SanaLoraLoaderMixin):
190
+ r"""
191
+ Pipeline for text-to-image generation using [Sana](https://huggingface.co/papers/2410.10629).
192
+ """
193
+
194
+ # fmt: off
195
+ bad_punct_regex = re.compile(r"[" + "#®•©™&@·º½¾¿¡§~" + r"\)" + r"\(" + r"\]" + r"\[" + r"\}" + r"\{" + r"\|" + "\\" + r"\/" + r"\*" + r"]{1,}")
196
+ # fmt: on
197
+
198
+ model_cpu_offload_seq = "text_encoder->transformer->vae"
199
+ _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
200
+
201
def __init__(
    self,
    tokenizer: Union[GemmaTokenizer, GemmaTokenizerFast],
    text_encoder: Gemma2PreTrainedModel,
    vae: AutoencoderDC,
    transformer: SanaTransformer2DModel,
    scheduler: DPMSolverMultistepScheduler,
):
    r"""
    Register the pipeline components and build the image processor.

    Args:
        tokenizer: Tokenizer used to turn prompts into token ids.
        text_encoder: Text encoder that produces the prompt embeddings.
        vae: Autoencoder registered as `self.vae`.
        transformer: Transformer registered as `self.transformer`.
        scheduler: Scheduler registered as `self.scheduler`.
    """
    super().__init__()

    components = {
        "tokenizer": tokenizer,
        "text_encoder": text_encoder,
        "vae": vae,
        "transformer": transformer,
        "scheduler": scheduler,
    }
    self.register_modules(**components)

    # The scale factor is 2 ** (number of VAE encoder blocks - 1); fall back to 32 when no VAE is registered.
    if hasattr(self, "vae") and self.vae is not None:
        num_encoder_blocks = len(self.vae.config.encoder_block_out_channels)
        self.vae_scale_factor = 2 ** (num_encoder_blocks - 1)
    else:
        self.vae_scale_factor = 32

    self.image_processor = PixArtImageProcessor(vae_scale_factor=self.vae_scale_factor)
221
+
222
def enable_vae_slicing(self):
    r"""
    Switch on sliced VAE decoding by delegating to `self.vae.enable_slicing()`.

    With slicing enabled, the VAE splits the input tensor into slices and computes decoding in several steps,
    which saves some memory and allows larger batch sizes.
    """
    autoencoder = self.vae
    autoencoder.enable_slicing()
228
+
229
def disable_vae_slicing(self):
    r"""
    Switch off sliced VAE decoding by delegating to `self.vae.disable_slicing()`.

    If `enable_vae_slicing` was called earlier, decoding goes back to being computed in one step.
    """
    autoencoder = self.vae
    autoencoder.disable_slicing()
235
+
236
def enable_vae_tiling(self):
    r"""
    Switch on tiled VAE processing by delegating to `self.vae.enable_tiling()`.

    With tiling enabled, the VAE splits the input tensor into tiles and computes decoding and encoding in several
    steps, which saves a large amount of memory and allows processing larger images.
    """
    autoencoder = self.vae
    autoencoder.enable_tiling()
243
+
244
def disable_vae_tiling(self):
    r"""
    Switch off tiled VAE processing by delegating to `self.vae.disable_tiling()`.

    If `enable_vae_tiling` was called earlier, decoding goes back to being computed in one step.
    """
    autoencoder = self.vae
    autoencoder.disable_tiling()
250
+
251
+ def _get_gemma_prompt_embeds(
252
+ self,
253
+ prompt: Union[str, List[str]],
254
+ device: torch.device,
255
+ dtype: torch.dtype,
256
+ clean_caption: bool = False,
257
+ max_sequence_length: int = 300,
258
+ complex_human_instruction: Optional[List[str]] = None,
259
+ ):
260
+ r"""
261
+ Encodes the prompt into text encoder hidden states.
262
+
263
+ Args:
264
+ prompt (`str` or `List[str]`, *optional*):
265
+ prompt to be encoded
266
+ device: (`torch.device`, *optional*):
267
+ torch device to place the resulting embeddings on
268
+ clean_caption (`bool`, defaults to `False`):
269
+ If `True`, the function will preprocess and clean the provided caption before encoding.
270
+ max_sequence_length (`int`, defaults to 300): Maximum sequence length to use for the prompt.
271
+ complex_human_instruction (`list[str]`, defaults to `complex_human_instruction`):
272
+ If `complex_human_instruction` is not empty, the function will use the complex Human instruction for
273
+ the prompt.
274
+ """
275
+ prompt = [prompt] if isinstance(prompt, str) else prompt
276
+
277
+ if getattr(self, "tokenizer", None) is not None:
278
+ self.tokenizer.padding_side = "right"
279
+
280
+ prompt = self._text_preprocessing(prompt, clean_caption=clean_caption)
281
+
282
+ # prepare complex human instruction
283
+ if not complex_human_instruction:
284
+ max_length_all = max_sequence_length
285
+ else:
286
+ chi_prompt = "\n".join(complex_human_instruction)
287
+ prompt = [chi_prompt + p for p in prompt]
288
+ num_chi_prompt_tokens = len(self.tokenizer.encode(chi_prompt))
289
+ max_length_all = num_chi_prompt_tokens + max_sequence_length - 2
290
+
291
+ text_inputs = self.tokenizer(
292
+ prompt,
293
+ padding="max_length",
294
+ max_length=max_length_all,
295
+ truncation=True,
296
+ add_special_tokens=True,
297
+ return_tensors="pt",
298
+ )
299
+ text_input_ids = text_inputs.input_ids
300
+
301
+ prompt_attention_mask = text_inputs.attention_mask
302
+ prompt_attention_mask = prompt_attention_mask.to(device)
303
+
304
+ prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=prompt_attention_mask)
305
+ prompt_embeds = prompt_embeds[0].to(dtype=dtype, device=device)
306
+
307
+ return prompt_embeds, prompt_attention_mask
308
+
309
def encode_prompt(
    self,
    prompt: Union[str, List[str]],
    do_classifier_free_guidance: bool = True,
    negative_prompt: str = "",
    num_images_per_prompt: int = 1,
    device: Optional[torch.device] = None,
    prompt_embeds: Optional[torch.Tensor] = None,
    negative_prompt_embeds: Optional[torch.Tensor] = None,
    prompt_attention_mask: Optional[torch.Tensor] = None,
    negative_prompt_attention_mask: Optional[torch.Tensor] = None,
    clean_caption: bool = False,
    max_sequence_length: int = 300,
    complex_human_instruction: Optional[List[str]] = None,
    lora_scale: Optional[float] = None,
):
    r"""
    Encodes the prompt (and, for classifier-free guidance, the negative prompt) into text encoder hidden states.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            prompt to be encoded
        do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
            whether to use classifier free guidance or not
        negative_prompt (`str` or `List[str]`, *optional*):
            The prompt not to guide the image generation. Only used when `do_classifier_free_guidance` is `True`
            and `negative_prompt_embeds` is not passed. `None` is treated as the empty string `""`.
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            number of images that should be generated per prompt
        device: (`torch.device`, *optional*):
            torch device to place the resulting embeddings on. Defaults to `self._execution_device`.
        prompt_embeds (`torch.Tensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
            provided, text embeddings will be generated from `prompt` input argument.
        negative_prompt_embeds (`torch.Tensor`, *optional*):
            Pre-generated negative text embeddings. For Sana, these should be the embeddings of the "" string.
        prompt_attention_mask (`torch.Tensor`, *optional*):
            Attention mask belonging to `prompt_embeds`; required when `prompt_embeds` is passed.
        negative_prompt_attention_mask (`torch.Tensor`, *optional*):
            Attention mask belonging to `negative_prompt_embeds`; required when those are passed.
        clean_caption (`bool`, defaults to `False`):
            If `True`, the function will preprocess and clean the provided caption before encoding.
        max_sequence_length (`int`, defaults to 300): Maximum sequence length to use for the prompt.
        complex_human_instruction (`list[str]`, *optional*):
            If `complex_human_instruction` is not empty, the function will use the complex Human instruction for
            the prompt. It is never applied to the negative prompt.
        lora_scale (`float`, *optional*):
            When given, stored on `self._lora_scale` and, with the PEFT backend, applied to the text encoder LoRA
            layers for the duration of this call.

    Returns:
        `tuple`: `(prompt_embeds, prompt_attention_mask, negative_prompt_embeds, negative_prompt_attention_mask)`.
        The two negative entries are `None` when `do_classifier_free_guidance` is `False`.
    """

    if device is None:
        device = self._execution_device

    if self.transformer is not None:
        dtype = self.transformer.dtype
    elif self.text_encoder is not None:
        dtype = self.text_encoder.dtype
    else:
        dtype = None

    # set lora scale so that monkey patched LoRA
    # function of text encoder can correctly access it
    if lora_scale is not None and isinstance(self, SanaLoraLoaderMixin):
        self._lora_scale = lora_scale

        # dynamically adjust the LoRA scale
        if self.text_encoder is not None and USE_PEFT_BACKEND:
            scale_lora_layers(self.text_encoder, lora_scale)

    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    if getattr(self, "tokenizer", None) is not None:
        self.tokenizer.padding_side = "right"

    # See Section 3.1. of the paper.
    # Keep the first position plus the last `max_length - 1` positions of the encoded sequence.
    max_length = max_sequence_length
    select_index = [0] + list(range(-max_length + 1, 0))

    if prompt_embeds is None:
        prompt_embeds, prompt_attention_mask = self._get_gemma_prompt_embeds(
            prompt=prompt,
            device=device,
            dtype=dtype,
            clean_caption=clean_caption,
            max_sequence_length=max_sequence_length,
            complex_human_instruction=complex_human_instruction,
        )

        prompt_embeds = prompt_embeds[:, select_index]
        prompt_attention_mask = prompt_attention_mask[:, select_index]

    bs_embed, seq_len, _ = prompt_embeds.shape
    # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
    prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
    prompt_attention_mask = prompt_attention_mask.view(bs_embed, -1)
    prompt_attention_mask = prompt_attention_mask.repeat(num_images_per_prompt, 1)

    # get unconditional embeddings for classifier free guidance
    if do_classifier_free_guidance and negative_prompt_embeds is None:
        # `__call__` forwards `negative_prompt=None` by default; without this guard the `None` would reach
        # `_text_preprocessing` and fail on `None.lower()`. Treat it as the empty negative prompt.
        if negative_prompt is None:
            negative_prompt = ""
        negative_prompt = [negative_prompt] * batch_size if isinstance(negative_prompt, str) else negative_prompt
        negative_prompt_embeds, negative_prompt_attention_mask = self._get_gemma_prompt_embeds(
            prompt=negative_prompt,
            device=device,
            dtype=dtype,
            clean_caption=clean_caption,
            max_sequence_length=max_sequence_length,
            complex_human_instruction=False,
        )

    if do_classifier_free_guidance:
        # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
        seq_len = negative_prompt_embeds.shape[1]

        negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)

        negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
        negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

        negative_prompt_attention_mask = negative_prompt_attention_mask.view(bs_embed, -1)
        negative_prompt_attention_mask = negative_prompt_attention_mask.repeat(num_images_per_prompt, 1)
    else:
        negative_prompt_embeds = None
        negative_prompt_attention_mask = None

    if self.text_encoder is not None:
        if isinstance(self, SanaLoraLoaderMixin) and USE_PEFT_BACKEND:
            # Retrieve the original scale by scaling back the LoRA layers
            unscale_lora_layers(self.text_encoder, lora_scale)

    return prompt_embeds, prompt_attention_mask, negative_prompt_embeds, negative_prompt_attention_mask
440
+
441
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
442
def prepare_extra_step_kwargs(self, generator, eta):
    """
    Build the optional keyword arguments for `self.scheduler.step`.

    Not all schedulers share the same `step` signature, so `eta` and `generator` are only included when the
    scheduler's `step` declares a parameter with that name. eta (η) is only used with the DDIMScheduler and
    corresponds to η in the DDIM paper (https://arxiv.org/abs/2010.02502); it should be between [0, 1].

    Returns:
        `dict`: containing `"eta"` and/or `"generator"` for the parameters the scheduler accepts.
    """
    step_parameters = set(inspect.signature(self.scheduler.step).parameters.keys())
    optional_arguments = (("eta", eta), ("generator", generator))
    return {name: value for name, value in optional_arguments if name in step_parameters}
458
+
459
def check_inputs(
    self,
    prompt,
    height,
    width,
    callback_on_step_end_tensor_inputs=None,
    negative_prompt=None,
    prompt_embeds=None,
    negative_prompt_embeds=None,
    prompt_attention_mask=None,
    negative_prompt_attention_mask=None,
):
    """
    Validate the user-facing generation arguments.

    Raises:
        ValueError: if `height`/`width` are not divisible by 32, if a callback tensor name is not listed in
            `self._callback_tensor_inputs`, if `prompt`/`prompt_embeds` (or the negative counterparts) are
            combined inconsistently, if embeddings are passed without their attention mask, or if directly
            passed positive and negative embeddings/masks differ in shape.
    """
    if height % 32 != 0 or width % 32 != 0:
        raise ValueError(f"`height` and `width` have to be divisible by 32 but are {height} and {width}.")

    if callback_on_step_end_tensor_inputs is not None:
        rejected_keys = [k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]
        if rejected_keys:
            raise ValueError(
                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {rejected_keys}"
            )

    has_prompt = prompt is not None
    has_prompt_embeds = prompt_embeds is not None
    has_negative_embeds = negative_prompt_embeds is not None

    # Exactly one of `prompt` / `prompt_embeds` must be supplied, and `prompt` must be a str or a list.
    if has_prompt and has_prompt_embeds:
        raise ValueError(
            f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
            " only forward one of the two."
        )
    if not has_prompt and not has_prompt_embeds:
        raise ValueError(
            "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
        )
    if has_prompt and not isinstance(prompt, (str, list)):
        raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

    if has_prompt and has_negative_embeds:
        raise ValueError(
            f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:"
            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )

    if negative_prompt is not None and has_negative_embeds:
        raise ValueError(
            f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )

    # Pre-computed embeddings must come with their attention masks.
    if has_prompt_embeds and prompt_attention_mask is None:
        raise ValueError("Must provide `prompt_attention_mask` when specifying `prompt_embeds`.")

    if has_negative_embeds and negative_prompt_attention_mask is None:
        raise ValueError("Must provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.")

    if not (has_prompt_embeds and has_negative_embeds):
        return

    if prompt_embeds.shape != negative_prompt_embeds.shape:
        raise ValueError(
            "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
            f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
            f" {negative_prompt_embeds.shape}."
        )
    if prompt_attention_mask.shape != negative_prompt_attention_mask.shape:
        raise ValueError(
            "`prompt_attention_mask` and `negative_prompt_attention_mask` must have the same shape when passed directly, but"
            f" got: `prompt_attention_mask` {prompt_attention_mask.shape} != `negative_prompt_attention_mask`"
            f" {negative_prompt_attention_mask.shape}."
        )
524
+
525
+ # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing
526
+ def _text_preprocessing(self, text, clean_caption=False):
527
+ if clean_caption and not is_bs4_available():
528
+ logger.warning(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`"))
529
+ logger.warning("Setting `clean_caption` to False...")
530
+ clean_caption = False
531
+
532
+ if clean_caption and not is_ftfy_available():
533
+ logger.warning(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`"))
534
+ logger.warning("Setting `clean_caption` to False...")
535
+ clean_caption = False
536
+
537
+ if not isinstance(text, (tuple, list)):
538
+ text = [text]
539
+
540
+ def process(text: str):
541
+ if clean_caption:
542
+ text = self._clean_caption(text)
543
+ text = self._clean_caption(text)
544
+ else:
545
+ text = text.lower().strip()
546
+ return text
547
+
548
+ return [process(t) for t in text]
549
+
550
+ # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption
551
def _clean_caption(self, caption):
    """
    Clean a raw caption with a fixed, order-dependent sequence of text substitutions.

    The caption is converted to `str`, URL-unquoted, lower-cased and stripped, then URLs, HTML markup,
    @-mentions, CJK character ranges, IP addresses, numeric ids, file names, marketing phrases and stray
    punctuation are removed or normalized. The stripped result is returned.

    NOTE(review): relies on `ul`, `BeautifulSoup`, `ftfy` and `html` being imported outside this view
    (`ul` is presumably `urllib.parse` - confirm).
    """
    caption = str(caption)
    caption = ul.unquote_plus(caption)
    caption = caption.strip().lower()
    caption = re.sub("<person>", "person", caption)
    # urls:
    caption = re.sub(
        r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
        "",
        caption,
    )  # regex for urls
    caption = re.sub(
        r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
        "",
        caption,
    )  # regex for urls
    # html: keep only the text content of any markup
    caption = BeautifulSoup(caption, features="html.parser").text

    # @<nickname>
    caption = re.sub(r"@[\w\d]+\b", "", caption)

    # Remove characters from the following Unicode blocks:
    # 31C0—31EF CJK Strokes
    # 31F0—31FF Katakana Phonetic Extensions
    # 3200—32FF Enclosed CJK Letters and Months
    # 3300—33FF CJK Compatibility
    # 3400—4DBF CJK Unified Ideographs Extension A
    # 4DC0—4DFF Yijing Hexagram Symbols
    # 4E00—9FFF CJK Unified Ideographs
    caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
    caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
    caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
    caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
    caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
    caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
    caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
    #######################################################

    # all types of dash --> "-"
    caption = re.sub(
        r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+",  # noqa
        "-",
        caption,
    )

    # normalize quotation marks to one standard
    caption = re.sub(r"[`´«»“”¨]", '"', caption)
    caption = re.sub(r"[‘’]", "'", caption)

    # &quot;
    caption = re.sub(r"&quot;?", "", caption)
    # &amp
    caption = re.sub(r"&amp", "", caption)

    # ip addresses:
    caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)

    # article ids:
    caption = re.sub(r"\d:\d\d\s+$", "", caption)

    # \n (the literal two-character sequence backslash + "n", not a newline character)
    caption = re.sub(r"\\n", " ", caption)

    # "#123"
    caption = re.sub(r"#\d{1,3}\b", "", caption)
    # "#12345.."
    caption = re.sub(r"#\d{5,}\b", "", caption)
    # "123456.."
    caption = re.sub(r"\b\d{6,}\b", "", caption)
    # filenames:
    caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)

    # collapse runs of quotes into one double quote and runs of dots into a space
    caption = re.sub(r"[\"\']{2,}", r'"', caption)  # """AUSVERKAUFT"""
    caption = re.sub(r"[\.]{2,}", r" ", caption)  # """AUSVERKAUFT"""

    caption = re.sub(self.bad_punct_regex, r" ", caption)  # ***AUSVERKAUFT***, #AUSVERKAUFT
    caption = re.sub(r"\s+\.\s+", r" ", caption)  # " . "

    # this-is-my-cute-cat / this_is_my_cute_cat
    # Only when more than three "-" / "_" separators occur are they all replaced by spaces.
    regex2 = re.compile(r"(?:\-|\_)")
    if len(re.findall(regex2, caption)) > 3:
        caption = re.sub(regex2, " ", caption)

    caption = ftfy.fix_text(caption)
    # unescape twice so that doubly-escaped HTML entities are resolved as well
    caption = html.unescape(html.unescape(caption))

    caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption)  # jc6640
    caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption)  # jc6640vc
    caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption)  # 6640vc231

    # marketing / boilerplate phrases
    caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
    caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
    caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
    caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
    caption = re.sub(r"\bpage\s+\d+\b", "", caption)

    caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption)  # j2d1a2a...

    # dimensions such as "1920x1080" (latin "x", cyrillic "х" or "×")
    caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)

    caption = re.sub(r"\b\s+\:\s+", r": ", caption)
    caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
    caption = re.sub(r"\s+", " ", caption)

    # NOTE(review): the result of this call is discarded, so it has no effect; the quote/punctuation trimming
    # below therefore sees any leading/trailing space. Assigning it would change the cleaned output - confirm
    # the intended behavior before changing.
    caption.strip()

    # drop one pair of enclosing quotes, then a single leading / trailing punctuation character
    caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
    caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
    caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
    # a caption that is a single token starting with "." is dropped entirely
    caption = re.sub(r"^\.\S+$", "", caption)

    return caption.strip()
664
+
665
def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
    """
    Return the initial latents for generation.

    Latents passed by the caller are only moved to `device` and cast to `dtype`. Otherwise a tensor of shape
    `(batch_size, num_channels_latents, height // vae_scale_factor, width // vae_scale_factor)` is drawn with
    `randn_tensor`.

    Raises:
        ValueError: if `generator` is a list whose length differs from `batch_size` (only checked when no
            latents are passed).
    """
    if latents is None:
        latent_height = int(height) // self.vae_scale_factor
        latent_width = int(width) // self.vae_scale_factor
        shape = (batch_size, num_channels_latents, latent_height, latent_width)

        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        return randn_tensor(shape, generator=generator, device=device, dtype=dtype)

    return latents.to(device=device, dtype=dtype)
683
+
684
@property
def guidance_scale(self):
    """Return `self._guidance_scale`, which `__call__` sets from its `guidance_scale` argument."""
    return self._guidance_scale
687
+
688
@property
def attention_kwargs(self):
    """Return `self._attention_kwargs`, which `__call__` sets from its `attention_kwargs` argument."""
    return self._attention_kwargs
691
+
692
@property
def do_classifier_free_guidance(self):
    """Return `True` when the stored guidance scale is strictly greater than 1.0."""
    return self._guidance_scale > 1.0
695
+
696
@property
def num_timesteps(self):
    """Return `self._num_timesteps` (presumably set during `__call__`; the assignment is outside this view)."""
    return self._num_timesteps
699
+
700
@property
def interrupt(self):
    """Return `self._interrupt`, which `__call__` resets to `False` at the start of a run."""
    return self._interrupt
703
+
704
+ @torch.no_grad()
705
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
706
+ def __call__(
707
+ self,
708
+ prompt: Union[str, List[str]] = None,
709
+ negative_prompt: Union[str, List[str]] = None,
710
+ num_inference_steps: int = 20,
711
+ timesteps: List[int] = None,
712
+ sigmas: List[float] = None,
713
+ guidance_scale: float = 4.5,
714
+ num_images_per_prompt: Optional[int] = 1,
715
+ height: int = 1024,
716
+ width: int = 1024,
717
+ eta: float = 0.0,
718
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
719
+ latents: Optional[torch.Tensor] = None,
720
+ prompt_embeds: Optional[torch.Tensor] = None,
721
+ prompt_attention_mask: Optional[torch.Tensor] = None,
722
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
723
+ negative_prompt_attention_mask: Optional[torch.Tensor] = None,
724
+ output_type: Optional[str] = "pil",
725
+ device: Optional[Union[str, torch.device]] = None,
726
+ return_dict: bool = True,
727
+ clean_caption: bool = False,
728
+ use_resolution_binning: bool = True,
729
+ attention_kwargs: Optional[Dict[str, Any]] = None,
730
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
731
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
732
+ max_sequence_length: int = 300,
733
+ complex_human_instruction: List[str] = [
734
+ "Given a user prompt, generate an 'Enhanced prompt' that provides detailed visual descriptions suitable for image generation. Evaluate the level of detail in the user prompt:",
735
+ "- If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, and spatial relationships to create vivid and concrete scenes.",
736
+ "- If the prompt is already detailed, refine and enhance the existing details slightly without overcomplicating.",
737
+ "Here are examples of how to transform or refine prompts:",
738
+ "- User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat curled up in a round shape, sleeping peacefully on a warm sunny windowsill, surrounded by pots of blooming red flowers.",
739
+ "- User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring glowing street lamps, a diverse crowd of people in colorful clothing, and a double-decker bus passing by towering glass skyscrapers.",
740
+ "Please generate only the enhanced description for the prompt below and avoid including any additional commentary or evaluations:",
741
+ "User Prompt: ",
742
+ ],
743
+ ) -> Union[SanaPipelineOutput, Tuple]:
744
+ """
745
+ Function invoked when calling the pipeline for generation.
746
+
747
+ Args:
748
+ prompt (`str` or `List[str]`, *optional*):
749
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
750
+ instead.
751
+ negative_prompt (`str` or `List[str]`, *optional*):
752
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
753
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
754
+ less than `1`).
755
+ num_inference_steps (`int`, *optional*, defaults to 20):
756
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
757
+ expense of slower inference.
758
+ timesteps (`List[int]`, *optional*):
759
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
760
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
761
+ passed will be used. Must be in descending order.
762
+ sigmas (`List[float]`, *optional*):
763
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
764
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
765
+ will be used.
766
+ guidance_scale (`float`, *optional*, defaults to 4.5):
767
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
768
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
769
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
770
+ 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
771
+ usually at the expense of lower image quality.
772
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
773
+ The number of images to generate per prompt.
774
+ height (`int`, *optional*, defaults to self.unet.config.sample_size):
775
+ The height in pixels of the generated image.
776
+ width (`int`, *optional*, defaults to self.unet.config.sample_size):
777
+ The width in pixels of the generated image.
778
+ eta (`float`, *optional*, defaults to 0.0):
779
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
780
+ [`schedulers.DDIMScheduler`], will be ignored for others.
781
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
782
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
783
+ to make generation deterministic.
784
+ latents (`torch.Tensor`, *optional*):
785
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
786
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
787
+ tensor will ge generated by sampling using the supplied random `generator`.
788
+ prompt_embeds (`torch.Tensor`, *optional*):
789
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
790
+ provided, text embeddings will be generated from `prompt` input argument.
791
+ prompt_attention_mask (`torch.Tensor`, *optional*): Pre-generated attention mask for text embeddings.
792
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
793
+ Pre-generated negative text embeddings. For PixArt-Sigma this negative prompt should be "". If not
794
+ provided, negative_prompt_embeds will be generated from `negative_prompt` input argument.
795
+ negative_prompt_attention_mask (`torch.Tensor`, *optional*):
796
+ Pre-generated attention mask for negative text embeddings.
797
+ output_type (`str`, *optional*, defaults to `"pil"`):
798
+ The output format of the generate image. Choose between
799
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
800
+ return_dict (`bool`, *optional*, defaults to `True`):
801
+ Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple.
802
+ attention_kwargs:
803
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
804
+ `self.processor` in
805
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
806
+ clean_caption (`bool`, *optional*, defaults to `True`):
807
+ Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to
808
+ be installed. If the dependencies are not installed, the embeddings will be created from the raw
809
+ prompt.
810
+ use_resolution_binning (`bool` defaults to `True`):
811
+ If set to `True`, the requested height and width are first mapped to the closest resolutions using
812
+ `ASPECT_RATIO_1024_BIN`. After the produced latents are decoded into images, they are resized back to
813
+ the requested resolution. Useful for generating non-square images.
814
+ callback_on_step_end (`Callable`, *optional*):
815
+ A function that calls at the end of each denoising steps during the inference. The function is called
816
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
817
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
818
+ `callback_on_step_end_tensor_inputs`.
819
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
820
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
821
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
822
+ `._callback_tensor_inputs` attribute of your pipeline class.
823
+ max_sequence_length (`int` defaults to `300`):
824
+ Maximum sequence length to use with the `prompt`.
825
+ complex_human_instruction (`List[str]`, *optional*):
826
+ Instructions for complex human attention:
827
+ https://github.com/NVlabs/Sana/blob/main/configs/sana_app_config/Sana_1600M_app.yaml#L55.
828
+
829
+ Examples:
830
+
831
+ Returns:
832
+ [`~pipelines.sana.pipeline_output.SanaPipelineOutput`] or `tuple`:
833
+ If `return_dict` is `True`, [`~pipelines.sana.pipeline_output.SanaPipelineOutput`] is returned,
834
+ otherwise a `tuple` is returned where the first element is a list with the generated images
835
+ """
836
+
837
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
838
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
839
+
840
+ # 1. Check inputs. Raise error if not correct
841
+ if use_resolution_binning:
842
+ if self.transformer.config.sample_size == 128:
843
+ aspect_ratio_bin = ASPECT_RATIO_4096_BIN
844
+ elif self.transformer.config.sample_size == 64:
845
+ aspect_ratio_bin = ASPECT_RATIO_2048_BIN
846
+ elif self.transformer.config.sample_size == 32:
847
+ aspect_ratio_bin = ASPECT_RATIO_1024_BIN
848
+ elif self.transformer.config.sample_size == 16:
849
+ aspect_ratio_bin = ASPECT_RATIO_512_BIN
850
+ else:
851
+ raise ValueError("Invalid sample size")
852
+ orig_height, orig_width = height, width
853
+ height, width = self.image_processor.classify_height_width_bin(height, width, ratios=aspect_ratio_bin)
854
+
855
+ self.check_inputs(
856
+ prompt,
857
+ height,
858
+ width,
859
+ callback_on_step_end_tensor_inputs,
860
+ negative_prompt,
861
+ prompt_embeds,
862
+ negative_prompt_embeds,
863
+ prompt_attention_mask,
864
+ negative_prompt_attention_mask,
865
+ )
866
+
867
+ self._guidance_scale = guidance_scale
868
+ self._attention_kwargs = attention_kwargs
869
+ self._interrupt = False
870
+
871
+ # 2. Default height and width to transformer
872
+ if prompt is not None and isinstance(prompt, str):
873
+ batch_size = 1
874
+ elif prompt is not None and isinstance(prompt, list):
875
+ batch_size = len(prompt)
876
+ else:
877
+ batch_size = prompt_embeds.shape[0]
878
+
879
+ device = device or self._execution_device
880
+ lora_scale = self.attention_kwargs.get("scale", None) if self.attention_kwargs is not None else None
881
+
882
+ # 3. Encode input prompt
883
+ (
884
+ prompt_embeds,
885
+ prompt_attention_mask,
886
+ negative_prompt_embeds,
887
+ negative_prompt_attention_mask,
888
+ ) = self.encode_prompt(
889
+ prompt,
890
+ self.do_classifier_free_guidance,
891
+ negative_prompt=negative_prompt,
892
+ num_images_per_prompt=num_images_per_prompt,
893
+ device=device,
894
+ prompt_embeds=prompt_embeds,
895
+ negative_prompt_embeds=negative_prompt_embeds,
896
+ prompt_attention_mask=prompt_attention_mask,
897
+ negative_prompt_attention_mask=negative_prompt_attention_mask,
898
+ clean_caption=clean_caption,
899
+ max_sequence_length=max_sequence_length,
900
+ complex_human_instruction=complex_human_instruction,
901
+ lora_scale=lora_scale,
902
+ )
903
+ if self.do_classifier_free_guidance:
904
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
905
+ prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)
906
+
907
+ # 4. Prepare timesteps
908
+ timesteps, num_inference_steps = retrieve_timesteps(
909
+ self.scheduler, num_inference_steps, device, timesteps, sigmas
910
+ )
911
+
912
+ # 5. Prepare latents.
913
+ latent_channels = self.transformer.config.in_channels
914
+ latents = self.prepare_latents(
915
+ batch_size * num_images_per_prompt,
916
+ latent_channels,
917
+ height,
918
+ width,
919
+ torch.float32,
920
+ device,
921
+ generator,
922
+ latents,
923
+ )
924
+
925
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
926
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
927
+
928
+ # 7. Denoising loop
929
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
930
+ self._num_timesteps = len(timesteps)
931
+
932
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
933
+ for i, t in enumerate(timesteps):
934
+ if self.interrupt:
935
+ continue
936
+
937
+ latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
938
+ latent_model_input = latent_model_input.to(prompt_embeds.dtype)
939
+
940
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
941
+ timestep = t.expand(latent_model_input.shape[0]).to(latents.dtype)
942
+ timestep = timestep * self.transformer.config.timestep_scale
943
+
944
+ # predict noise model_output
945
+ noise_pred = self.transformer(
946
+ latent_model_input,
947
+ encoder_hidden_states=prompt_embeds,
948
+ encoder_attention_mask=prompt_attention_mask,
949
+ timestep=timestep,
950
+ return_dict=False,
951
+ attention_kwargs=self.attention_kwargs,
952
+ )[0]
953
+
954
+ noise_pred = noise_pred.float()
955
+
956
+ # perform guidance
957
+ if self.do_classifier_free_guidance:
958
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
959
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
960
+
961
+ # learned sigma
962
+ if self.transformer.config.out_channels // 2 == latent_channels:
963
+ noise_pred = noise_pred.chunk(2, dim=1)[0]
964
+ else:
965
+ noise_pred = noise_pred
966
+
967
+ # compute previous image: x_t -> x_t-1
968
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
969
+
970
+ if callback_on_step_end is not None:
971
+ callback_kwargs = {}
972
+ for k in callback_on_step_end_tensor_inputs:
973
+ callback_kwargs[k] = locals()[k]
974
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
975
+
976
+ latents = callback_outputs.pop("latents", latents)
977
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
978
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
979
+
980
+ # call the callback, if provided
981
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
982
+ progress_bar.update()
983
+
984
+ if XLA_AVAILABLE:
985
+ xm.mark_step()
986
+
987
+ if output_type == "latent":
988
+ image = latents
989
+ else:
990
+ latents = latents.to(self.vae.dtype)
991
+ try:
992
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
993
+ except torch.cuda.OutOfMemoryError as e:
994
+ warnings.warn(
995
+ f"{e}. \n"
996
+ f"Try to use VAE tiling for large images. For example: \n"
997
+ f"pipe.vae.enable_tiling(tile_sample_min_width=512, tile_sample_min_height=512)"
998
+ )
999
+ if use_resolution_binning:
1000
+ image = self.image_processor.resize_and_crop_tensor(image, orig_width, orig_height)
1001
+
1002
+ if not output_type == "latent":
1003
+ image = self.image_processor.postprocess(image, output_type=output_type)
1004
+
1005
+ # Offload all models
1006
+ self.maybe_free_model_hooks()
1007
+
1008
+ if not return_dict:
1009
+ return (image,)
1010
+
1011
+ return SanaPipelineOutput(images=image)
diffusion/sana_loss.py ADDED
@@ -0,0 +1,303 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import torch
3
+
4
+ import copy
5
+ from diffusers import DPMSolverMultistepScheduler
6
+ import os
7
+ from collections import OrderedDict
8
+ import logging
9
+ from safetensors.torch import load_file
10
+ from diffusers import (
11
+ AutoencoderDC,
12
+ FlowMatchEulerDiscreteScheduler,
13
+ SanaTransformer2DModel
14
+ )
15
+ import torch.nn as nn
16
+ from .pipeline_sana import SanaPipeline
17
+ # from flux_encoder import tokenize_prompt, encode_prompt
18
+
19
+ logging.basicConfig(level=logging.INFO)
20
+ logger = logging.getLogger(__name__)
21
+
22
class ToClipMLP(nn.Module):
    """Two-layer projection MLP: Linear -> LayerNorm -> ReLU -> Linear -> LayerNorm.

    The file previously defined this class twice with identical bodies (the
    second definition silently shadowed the first); this is the single
    remaining definition. Submodule names are unchanged, so existing state
    dicts keep loading.

    Args:
        input_dim: Size of the last dimension of the input features.
        output_dim: Size of the last dimension of the projected output.
        hidden_dim: Width of the intermediate layer. Defaults to 2048, the
            value that used to be hard-coded.
    """

    def __init__(self, input_dim, output_dim, hidden_dim=2048):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.layer_norm1 = nn.LayerNorm(hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.layer_norm2 = nn.LayerNorm(output_dim)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Project ``hidden_states`` along its last dimension to ``output_dim``."""
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.layer_norm1(hidden_states)
        hidden_states = self.relu(hidden_states)
        hidden_states = self.fc2(hidden_states)
        hidden_states = self.layer_norm2(hidden_states)
        return hidden_states
57
+
58
+
59
class SanaModel_withMLP(nn.Module):
    """Wrap a SANA transformer so conditioning features pass through a ToClipMLP first.

    Args:
        sana: The wrapped transformer; it must expose ``config`` and
            ``enable_gradient_checkpointing()`` and accept the keyword
            arguments forwarded in ``forward``.
        vision_dim: Input width of the projection MLP (output width is 2304).
    """

    def __init__(self, sana, vision_dim=1152):
        super().__init__()
        # Registration order (sana, then mlp) is kept so parameter ordering
        # and state-dict key ordering stay the same.
        self.sana = sana
        self.dtype = torch.bfloat16
        self.mlp = ToClipMLP(vision_dim, 2304)
        # self.mlp_pool = ToClipMLP(vision_dim, 768)
        # Re-export the wrapped model's config on the wrapper itself.
        self.config = self.sana.config

    def forward(
        self,
        hidden_states,
        timestep,
        encoder_hidden_states,
        return_dict,
        encoder_attention_mask=None,
        **extra_kwargs
    ):
        """Project the conditioning features, then run the wrapped transformer.

        NOTE: ``return_dict`` is accepted for signature compatibility but is
        not forwarded; the wrapped model is always called with
        ``return_dict=False`` and its result is returned unchanged.
        """
        projected_states = self.mlp(encoder_hidden_states)
        sana_inputs = dict(
            hidden_states=hidden_states,
            encoder_hidden_states=projected_states,
            encoder_attention_mask=encoder_attention_mask,
            timestep=timestep,
            return_dict=False,
        )
        return self.sana(**sana_inputs, **extra_kwargs)

    def enable_gradient_checkpointing(self):
        """Delegate gradient checkpointing to the wrapped transformer."""
        self.sana.enable_gradient_checkpointing()
88
+
89
def inference_load_denoising_pretrained_weights(
    net,
    weights_path,
    names=None,
    prefix_to_remove=None,
):
    """Load a safetensors checkpoint into ``net`` non-strictly.

    Args:
        net: Module whose ``load_state_dict`` receives the weights.
        weights_path: Path of the safetensors file read with ``load_file``.
        names: Unused; kept for signature parity with
            ``load_denoising_pretrained_weights``.
        prefix_to_remove: Unused; kept for signature parity.

    Returns:
        None. Keys that could not be matched are reported through the module
        logger instead of being dropped silently.
    """
    # state_dict = load_file(weights_path, map_location="cpu")
    state_dict = load_file(weights_path)
    # strict=False tolerates partial checkpoints, so surface any mismatch;
    # otherwise a wrong file loads "successfully" with nothing applied.
    load_result = net.load_state_dict(state_dict, strict=False)
    missing_keys = getattr(load_result, "missing_keys", None)
    unexpected_keys = getattr(load_result, "unexpected_keys", None)
    if missing_keys:
        logger.warning("Missing keys when loading %s: %s", weights_path, missing_keys)
    if unexpected_keys:
        logger.warning("Unexpected keys when loading %s: %s", weights_path, unexpected_keys)
    return
99
+
100
+
101
def load_denoising_pretrained_weights(
    net,
    weights_path,
    names=None,
    prefix_to_remove=None,
):
    """Load a ``torch.save`` checkpoint into ``net`` with strict key matching.

    Args:
        net: Module whose ``load_state_dict`` receives the weights.
        weights_path: Path of the checkpoint file.
        names: Optional iterable of checkpoint keys to load. When given, only
            these entries are passed to ``net``.
        prefix_to_remove: Optional prefix stripped from each selected key
            before loading. Keys that do not start with the prefix are kept
            unchanged (previously the first ``len(prefix)`` characters were
            cut off unconditionally, corrupting such keys).

    Raises:
        KeyError: If an entry of ``names`` is not present in the checkpoint.
        RuntimeError: From ``load_state_dict`` when keys or shapes mismatch.
    """
    # NOTE(security): torch.load unpickles the file, which can execute
    # arbitrary code. Only load checkpoints from trusted sources.
    state_dict = torch.load(weights_path, map_location="cpu")
    # Unwrap the checkpoint when the weights are nested under "model" or
    # "net" ("model" takes precedence when both exist).
    for wrapper_key in ("model", "net"):
        if wrapper_key in state_dict:
            state_dict = state_dict[wrapper_key]
            break

    if names is not None:
        selected_state_dict = OrderedDict()
        for ori_name in names:
            name = ori_name
            if prefix_to_remove and ori_name.startswith(prefix_to_remove):
                name = ori_name[len(prefix_to_remove):]
            selected_state_dict[name] = state_dict[ori_name]
        state_dict = selected_state_dict

    net.load_state_dict(state_dict, strict=True)
    return
127
+
128
+
129
class SANALoss(torch.nn.Module):
    """Bundle a frozen VAE, a trainable SANA transformer (wrapped with a
    projection MLP), and the schedulers used for training and sampling.

    Args:
        model_path: Pretrained model location; the ``scheduler``, ``vae`` and
            ``transformer`` subfolders are loaded from it.
        scheduler_path: Location whose ``scheduler`` subfolder provides the
            ``FlowMatchEulerDiscreteScheduler`` used as noise scheduler.
        vision_dim: Input width of the projection MLP in ``SanaModel_withMLP``.
        diffusion_type: Accepted for compatibility; not used by this class.
        convert_vpred_to_xpred: Accepted for compatibility; not used.
        checkpoint_path: Optional transformer checkpoint loaded with
            ``load_denoising_pretrained_weights`` before wrapping.
        mlp_state_dict: Required state dict for the projection MLP.
        trainable_params: ``'all'`` or name substring(s) selecting which
            submodules of the wrapped model are trainable.
        device: Fallback device used only when CUDA is unavailable; with CUDA
            the current CUDA device is used, as before.
        guidance_scale: Accepted for compatibility; not used by this class.
        revision: Forwarded to ``from_pretrained`` for the VAE and transformer.
        variant: Forwarded to ``from_pretrained`` for the VAE and transformer.
        repa_loss: When true, build the auxiliary mid-layer MLP.
        mid_layer_idx: Stored as ``self.mid_layer_idx``.
        mid_loss_weight: Stored as ``self.mid_loss_weight``.

    Raises:
        FileNotFoundError: If ``checkpoint_path`` is given but does not exist.
        ValueError: If ``mlp_state_dict`` is ``None``.
    """

    def __init__(
        self,
        model_path, scheduler_path, vision_dim=3584, diffusion_type='flow_matching', convert_vpred_to_xpred=True,
        checkpoint_path=None,
        # checkpoint_path_withmlp=None,
        # mlp_checkpoint_path=None,
        mlp_state_dict=None,
        trainable_params='all', device='cpu', guidance_scale=3.5, revision=None, variant=None, repa_loss=False, mid_layer_idx=10, mid_loss_weight=1.0
    ):
        super().__init__()
        self.torch_type = torch.bfloat16
        self.base_model_path = model_path
        self.use_mid_loss = repa_loss
        self.mid_loss_weight = mid_loss_weight
        self.mid_layer_idx = mid_layer_idx
        self.scheduler = DPMSolverMultistepScheduler.from_pretrained(model_path, subfolder="scheduler")

        # Keep the historical behaviour (current CUDA device) when CUDA is
        # present; fall back to the `device` argument instead of crashing on
        # CPU-only hosts.
        if torch.cuda.is_available():
            self.device = torch.device(torch.cuda.current_device())
        else:
            self.device = torch.device(device)
        self.scheduler_path = scheduler_path

        self.vae = AutoencoderDC.from_pretrained(
            model_path,
            subfolder="vae",
            revision=revision,
            variant=variant,
        )
        # self.vae.to(self.torch_type).to(self.device)
        self.vae.requires_grad_(False)

        self.train_model = SanaTransformer2DModel.from_pretrained(
            model_path, subfolder="transformer", revision=revision, variant=variant
        )

        if checkpoint_path is not None:
            if not os.path.exists(checkpoint_path):
                raise FileNotFoundError(f"checkpoint_path does not exist: {checkpoint_path}")
            load_denoising_pretrained_weights(self.train_model, checkpoint_path)

        self.train_model = SanaModel_withMLP(self.train_model, vision_dim=vision_dim)
        if mlp_state_dict is None:
            raise ValueError("mlp_state_dict is required to initialise the projection MLP")
        self.train_model.mlp.load_state_dict(mlp_state_dict, strict=True)

        # Create the MLP that processes mid-layer features.
        # NOTE(review): 2240 presumably matches the transformer's hidden
        # width -- confirm against the model config.
        hidden_dim = 2240
        self.mid_layer_mlp = None
        if self.use_mid_loss:
            self.mid_layer_mlp = torch.nn.Sequential(
                torch.nn.Linear(hidden_dim, hidden_dim * 2),
                torch.nn.GELU(),
                torch.nn.Linear(hidden_dim * 2, 32),
                torch.nn.LayerNorm(32)
            )

            # Initialise the MLP weights.
            for m in self.mid_layer_mlp.modules():
                if isinstance(m, torch.nn.Linear):
                    # Kaiming-normal initialisation for the weights.
                    torch.nn.init.kaiming_normal_(m.weight, a=0, mode='fan_in', nonlinearity='leaky_relu')
                    if m.bias is not None:
                        # Biases start at zero.
                        torch.nn.init.zeros_(m.bias)

        self.train_model.enable_gradient_checkpointing()

        self.set_trainable_params(trainable_params)

        # The counts used to be computed and thrown away; report them instead.
        num_parameters, num_parameters_trainable = self._count_parameters()
        logger.info(
            "train_model parameters: %d total, %d trainable",
            num_parameters,
            num_parameters_trainable,
        )

        self.noise_scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
            self.scheduler_path, subfolder="scheduler"
        )
        self.noise_scheduler_copy = copy.deepcopy(self.noise_scheduler)

        logger.info("Preparation done. Starting training diffusion ...")

    def _count_parameters(self):
        """Return ``(total, trainable)`` element counts of ``self.train_model``."""
        total = 0
        trainable = 0
        for param in self.train_model.parameters():
            count = param.numel()
            total += count
            if param.requires_grad:
                trainable += count
        return total, trainable

    def get_sigmas(self, timesteps, n_dim=4, dtype=torch.float32):
        """Look up the scheduler sigma for each timestep.

        Args:
            timesteps: 1-D tensor of timesteps; each value must occur exactly
                once in ``self.noise_scheduler_copy.timesteps``.
            n_dim: Trailing singleton dimensions are appended until the result
                has this many dimensions.
            dtype: Dtype of the returned tensor (previously ignored).

        Returns:
            Tensor of sigmas on ``timesteps.device`` with ``n_dim`` dimensions.
        """
        # Match the device of `timesteps` and honour `dtype`, so the result
        # can be combined with the caller's tensors directly.
        sigmas = self.noise_scheduler_copy.sigmas.to(device=timesteps.device, dtype=dtype)
        schedule_timesteps = self.noise_scheduler_copy.timesteps.to(device=timesteps.device)
        step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]

        sigma = sigmas[step_indices].flatten()
        while sigma.ndim < n_dim:
            sigma = sigma.unsqueeze(-1)
        return sigma

    def compute_text_embeddings(self, prompt, text_encoders, tokenizers):
        """Encode ``prompt`` with the module-level ``encode_prompt`` helper.

        The helper's import is commented out at the top of this file, so the
        method fails with an explicit error until it is restored. The pooled
        embeddings are moved to ``self.device`` (the original referenced an
        undefined ``local_rank``).

        Raises:
            NotImplementedError: If ``encode_prompt`` is not defined in this
                module.
        """
        encode_fn = globals().get("encode_prompt")
        if encode_fn is None:
            raise NotImplementedError(
                "compute_text_embeddings requires an `encode_prompt` helper, "
                "which is not imported in this module"
            )
        with torch.no_grad():
            prompt_embeds, pooled_prompt_embeds, text_ids = encode_fn(
                [text_encoders], [tokenizers], prompt, 77
            )
            pooled_prompt_embeds = pooled_prompt_embeds.to(self.device)
        return prompt_embeds, pooled_prompt_embeds, text_ids

    def set_trainable_params(self, trainable_params):
        """Freeze the VAE and choose which ``train_model`` parameters train.

        Args:
            trainable_params: ``'all'`` to train everything, otherwise a name
                substring or an iterable of substrings; every submodule whose
                qualified name contains one of them is made trainable. A bare
                string is treated as one pattern (it used to be iterated
                character by character).
        """
        self.vae.requires_grad_(False)

        if trainable_params == 'all':
            self.train_model.requires_grad_(True)
            return

        if isinstance(trainable_params, str):
            patterns = [trainable_params]
        else:
            patterns = list(trainable_params)

        self.train_model.requires_grad_(False)
        for name, module in self.train_model.named_modules():
            if any(pattern in name for pattern in patterns):
                for param in module.parameters():
                    param.requires_grad = True

    def sample(self, encoder_hidden_states, steps=20, cfg=7.0, seed=42, height=512, width=512):
        """Generate one image from precomputed conditioning features.

        Args:
            encoder_hidden_states: Conditioning tensor passed as
                ``prompt_embeds``; its first two dimensions define the
                attention-mask shape.
            steps: Number of inference steps.
            cfg: Guidance scale passed to the pipeline.
            seed: Seed of the private generator used for sampling.
            height: Requested image height.
            width: Requested image width.

        Returns:
            The first entry of the pipeline output's ``images``.
        """
        self.pipelines = SanaPipeline(
            vae=self.vae,
            transformer=self.train_model,
            text_encoder=None,
            tokenizer=None,
            scheduler=self.noise_scheduler,
        ).to(self.device)

        mask_shape = encoder_hidden_states.shape[:2]
        prompt_attention_mask = torch.ones(mask_shape).to(self.device)
        negative_attention_mask = torch.ones(mask_shape).to(self.device)

        # A dedicated generator yields the same seeded stream without
        # resetting the global RNG, which would otherwise make every
        # training step after a sampling call replay the same noise.
        generator = torch.Generator(device="cpu").manual_seed(seed)

        image = self.pipelines(
            prompt_embeds=encoder_hidden_states,
            prompt_attention_mask=prompt_attention_mask,
            # All-zero embeddings serve as the negative prompt.
            negative_prompt_embeds=torch.zeros_like(encoder_hidden_states),
            negative_prompt_attention_mask=negative_attention_mask,
            guidance_scale=cfg,
            generator=generator,
            num_inference_steps=steps,
            device=self.device,
            height=height,
            width=width,
            max_sequence_length=300,
        ).images[0]

        return image