Update app.py
app.py CHANGED
@@ -1,6 +1,7 @@
 import gradio as gr
 import spaces
 import torch
+from torch.cuda.amp import autocast
 import subprocess
 from huggingface_hub import InferenceClient
 import os
@@ -46,7 +47,7 @@ with init_empty_weights():
     attn_implementation="flash_attention_2",
     # low_cpu_mem_usage=True,
     # llm_int8_enable_fp32_cpu_offload=True,
-    device_map="cuda",
+    # device_map="cuda",
 
 )
 
@@ -57,7 +58,9 @@ with init_empty_weights():
 device_map = infer_auto_device_map(model, max_memory={0: "80GB", 1: "80GB", "cpu": "65GB"})
 
 # Load the model with the inferred device map
-model = load_checkpoint_and_dispatch(model, "path_to_checkpoint", device_map=device_map
+model = load_checkpoint_and_dispatch(model, "path_to_checkpoint", device_map=device_map)
+model.half()
+
 
 
 @spaces.GPU(duration=60)
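The second and third hunks drop the hard-coded `device_map="cuda"` in favor of a placement planned by accelerate. A minimal sketch of the full pattern the commit appears to be moving toward, assuming a transformers causal LM; the model id is a placeholder, and `"path_to_checkpoint"` plus the memory budget are taken verbatim from the diff:

```python
import torch
from accelerate import init_empty_weights, infer_auto_device_map, load_checkpoint_and_dispatch
from transformers import AutoConfig, AutoModelForCausalLM

# Build the model skeleton on the meta device; no weights are allocated yet.
config = AutoConfig.from_pretrained("model_id")  # placeholder model id
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config)

# Plan a layer placement that fits the declared per-device memory budget,
# spilling whatever does not fit on the two GPUs onto the CPU.
device_map = infer_auto_device_map(
    model, max_memory={0: "80GB", 1: "80GB", "cpu": "65GB"}
)

# Materialize the real weights from the checkpoint folder and dispatch each
# layer to the device chosen for it; "path_to_checkpoint" is the diff's own
# placeholder, not a real path.
model = load_checkpoint_and_dispatch(model, "path_to_checkpoint", device_map=device_map)
model.half()  # cast parameters to fp16, matching new line 62
```

Note that `load_checkpoint_and_dispatch` also accepts a `dtype` argument (e.g. `dtype=torch.float16`), which would load the weights in half precision directly instead of casting the already-placed parameters afterwards with `.half()`.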
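The first hunk only adds the `autocast` import; the diff does not show where it is used. A hedged guess at the intended use inside the `@spaces.GPU`-decorated handler, where `generate_response`, `tokenizer`, and the prompt handling are all hypothetical and `model` is the dispatched global from above:

```python
import spaces
import torch
from torch.cuda.amp import autocast

@spaces.GPU(duration=60)  # same decorator as in app.py
def generate_response(prompt: str, max_new_tokens: int = 256) -> str:
    # Hypothetical handler; `tokenizer` and `model` are assumed globals.
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    # Run the forward pass under fp16 autocast, consistent with model.half().
    with torch.no_grad(), autocast(dtype=torch.float16):
        output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
```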