update diffusers weights
- image_encoder/config.json +3 -3
- image_encoder/model.safetensors +2 -2
- model_index.json +3 -4
- prior/config.json +39 -36
- prior/diffusion_pytorch_model.safetensors +2 -2
- scheduler/scheduler_config.json +1 -1
- text_encoder/config.json +3 -3
- text_encoder/model.safetensors +2 -2
- tokenizer/tokenizer.json +2 -16
image_encoder/config.json
CHANGED
@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "
+  "_name_or_path": "openai/clip-vit-large-patch14",
   "architectures": [
     "CLIPVisionModelWithProjection"
   ],
@@ -18,6 +18,6 @@
   "num_hidden_layers": 24,
   "patch_size": 14,
   "projection_dim": 768,
-  "torch_dtype": "
-  "transformers_version": "4.38.
+  "torch_dtype": "float32",
+  "transformers_version": "4.38.2"
 }
image_encoder/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:77b33d2a3a643650857672e880ccf73adbaf114fbbadec36d142ee9d48af7e20
+size 1215912728
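The weight files in this commit are Git LFS pointers, so the diff only swaps the oid and size. A quick way to confirm a downloaded file matches the new pointer is to hash it locally; this is a minimal sketch, assuming the file already sits at image_encoder/model.safetensors:

import hashlib
import os

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    """Stream the file through SHA-256 so multi-GB weights never load into RAM."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

path = "image_encoder/model.safetensors"
assert os.path.getsize(path) == 1215912728, "size does not match the LFS pointer"
assert sha256_of(path) == "77b33d2a3a643650857672e880ccf73adbaf114fbbadec36d142ee9d48af7e20"

The same check applies to the prior and text encoder weights below, using their respective oids and sizes.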
model_index.json
CHANGED
@@ -1,7 +1,6 @@
 {
   "_class_name": "StableCascadePriorPipeline",
-  "_diffusers_version": "0.
-  "_name_or_path": "StableCascade-prior/",
+  "_diffusers_version": "0.27.0.dev0",
   "feature_extractor": [
     "transformers",
     "CLIPImageProcessor"
@@ -11,8 +10,8 @@
     "CLIPVisionModelWithProjection"
   ],
   "prior": [
-    "
-    "
+    "diffusers",
+    "StableCascadeUNet"
   ],
   "resolution_multiple": 42.67,
   "scheduler": [
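With the prior now registered as a diffusers StableCascadeUNet, the whole pipeline resolves through from_pretrained. A minimal sketch; the repo id below is a placeholder for whichever repository hosts this commit:

import torch
from diffusers import StableCascadePriorPipeline

# "stabilityai/stable-cascade-prior" is a placeholder repo id, not confirmed by this diff.
prior_pipe = StableCascadePriorPipeline.from_pretrained(
    "stabilityai/stable-cascade-prior",
    torch_dtype=torch.bfloat16,
).to("cuda")

out = prior_pipe(prompt="an astronaut riding a horse", num_inference_steps=20)
image_embeddings = out.image_embeddings  # consumed by the decoder stage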
prior/config.json
CHANGED
@@ -1,61 +1,64 @@
 {
-  "_class_name": "
-  "_diffusers_version": "0.
-  "
-
-
-    1,
-    1
-  ],
-  [
-    1,
-    1
-  ]
+  "_class_name": "StableCascadeUNet",
+  "_diffusers_version": "0.27.0.dev0",
+  "block_out_channels": [
+    2048,
+    2048
   ],
-  "
+  "block_types_per_layer": [
     [
-
-
+      "SDCascadeResBlock",
+      "SDCascadeTimestepBlock",
+      "SDCascadeAttnBlock"
     ],
     [
-
-
+      "SDCascadeResBlock",
+      "SDCascadeTimestepBlock",
+      "SDCascadeAttnBlock"
     ]
   ],
-  "
-  "
-  "
-  "
-  "
-  "
-
-
-
+  "clip_image_in_channels": 768,
+  "clip_seq": 4,
+  "clip_text_in_channels": 1280,
+  "clip_text_pooled_in_channels": 1280,
+  "conditioning_dim": 2048,
+  "down_blocks_repeat_mappers": [
+    1,
+    1
+  ],
+  "down_num_layers_per_block": [
+    8,
+    24
   ],
-  "c_in": 16,
-  "c_out": 16,
-  "c_pixels": null,
-  "c_r": 64,
   "dropout": [
     0.1,
     0.1
   ],
+  "effnet_in_channels": null,
+  "in_channels": 16,
   "kernel_size": 3,
-  "
-    "CTA",
-    "CTA"
-  ],
-  "nhead": [
+  "num_attention_heads": [
     32,
     32
   ],
+  "out_channels": 16,
   "patch_size": 1,
+  "pixel_mapper_in_channels": null,
   "self_attn": true,
   "switch_level": [
    false
   ],
-  "
+  "timestep_conditioning_type": [
     "sca",
     "crp"
-  ]
+  ],
+  "timestep_ratio_embedding_dim": 64,
+  "up_blocks_repeat_mappers": [
+    1,
+    1
+  ],
+  "up_num_layers_per_block": [
+    24,
+    8
+  ]
 }
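The visible removals show the research-era keys (c_in, c_out, c_pixels, c_r, nhead) being replaced by StableCascadeUNet's names, with values such as the 16-channel latents, the 64-dim timestep embedding, and the 32 attention heads per level carried over unchanged. A sketch of the rename as a migration helper; the pairings are inferred from matching values, not stated in the diff:

# Inferred old-key -> new-key pairings (the values in the diff match one-to-one).
PRIOR_KEY_RENAMES = {
    "c_in": "in_channels",                   # 16
    "c_out": "out_channels",                 # 16
    "c_pixels": "pixel_mapper_in_channels",  # null
    "c_r": "timestep_ratio_embedding_dim",   # 64
    "nhead": "num_attention_heads",          # [32, 32]
}

def migrate_prior_config(old_config: dict) -> dict:
    """Map legacy Stable Cascade prior keys onto StableCascadeUNet names."""
    return {PRIOR_KEY_RENAMES.get(key, key): value for key, value in old_config.items()}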
prior/diffusion_pytorch_model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:0a2c7aa62c503780b85f74fd513b1b99c12ea4f83422bdbad5ac264aa68efb4b
+size 14356584672
scheduler/scheduler_config.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "_class_name": "DDPMWuerstchenScheduler",
-  "_diffusers_version": "0.
+  "_diffusers_version": "0.27.0.dev0",
   "s": 0.008,
   "scaler": 1.0
 }
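The scheduler config only picks up the new diffusers version string; s = 0.008 is the usual cosine-schedule offset and scaler = 1.0 leaves the timestep ratio untouched. For intuition, the cumulative alpha of such a schedule looks roughly like the sketch below (an illustration of the squared-cosine form, not DDPMWuerstchenScheduler's exact code):

import math

S = 0.008     # "s" from scheduler_config.json, the cosine offset
SCALER = 1.0  # "scaler"; 1.0 means the timestep ratio t in [0, 1] is used as-is

def alpha_cumprod(t: float) -> float:
    """Squared-cosine schedule, normalized so alpha_cumprod(0.0) == 1.0."""
    init = math.cos(S / (1 + S) * math.pi / 2) ** 2
    return math.cos((t + S) / (1 + S) * math.pi / 2) ** 2 / init

print(alpha_cumprod(0.0))  # 1.0 -> no noise at the start
print(alpha_cumprod(1.0))  # 0.0 -> pure noise at the end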
text_encoder/config.json
CHANGED
@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "
+  "_name_or_path": "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k",
   "architectures": [
     "CLIPTextModelWithProjection"
   ],
@@ -19,7 +19,7 @@
   "num_hidden_layers": 32,
   "pad_token_id": 1,
   "projection_dim": 1280,
-  "torch_dtype": "
-  "transformers_version": "4.38.
+  "torch_dtype": "float32",
+  "transformers_version": "4.38.2",
   "vocab_size": 49408
 }
text_encoder/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:fa5b2e6f4c2efc2d82e4b8312faec1a5540eabfc6415126c9a05c8436a530ef4
+size 2778702264
tokenizer/tokenizer.json
CHANGED
@@ -1,21 +1,7 @@
 {
   "version": "1.0",
-  "truncation": {
-    "direction": "Right",
-    "max_length": 77,
-    "strategy": "LongestFirst",
-    "stride": 0
-  },
-  "padding": {
-    "strategy": {
-      "Fixed": 77
-    },
-    "direction": "Right",
-    "pad_to_multiple_of": null,
-    "pad_id": 49407,
-    "pad_type_id": 0,
-    "pad_token": "<|endoftext|>"
-  },
+  "truncation": null,
+  "padding": null,
   "added_tokens": [
     {
       "id": 49406,
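Nulling out truncation and padding means the fixed 77-token CLIP behavior is no longer baked into tokenizer.json; callers (the pipeline included) request it per encode. A minimal sketch with transformers, again with a placeholder repo id:

from transformers import CLIPTokenizerFast

# Placeholder repo id; use the repository this commit belongs to.
tokenizer = CLIPTokenizerFast.from_pretrained(
    "stabilityai/stable-cascade-prior", subfolder="tokenizer"
)

# The behavior previously serialized in tokenizer.json, now passed explicitly:
enc = tokenizer(
    "an astronaut riding a horse",
    padding="max_length",   # right-pads with <|endoftext|> (id 49407)
    max_length=77,
    truncation=True,
    return_tensors="pt",
)
print(enc.input_ids.shape)  # torch.Size([1, 77])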