Update app.py
app.py CHANGED
@@ -45,10 +45,10 @@ def respond(
     # attn_implementation="flash_attention_2",
     # low_cpu_mem_usage=True,
     # llm_int8_enable_fp32_cpu_offload=True,
-    device_map="
+    device_map="cuda"
 )
 messages = [{"role": "user", "content": "Hello, how are you?"}]
-input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
+input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to('cuda')
 ## <BOS_TOKEN><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello, how are you?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>

 gen_tokens = model.generate(
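The two changes pin both sides of the computation to the GPU: device_map="cuda" loads the model weights directly on the CUDA device, and .to('cuda') moves the tokenized chat prompt onto the same device so model.generate() does not hit a CPU/GPU device mismatch. Below is a minimal sketch of the updated snippet in context; the checkpoint name, dtype, and generation arguments are assumptions for illustration (the chat-template tokens in the diff suggest a Cohere Command-R style model, but the diff itself only shows the device placement).

# Sketch only: model ID, dtype, and generate() arguments are assumed, not taken from the diff.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "CohereForAI/c4ai-command-r-v01"  # assumed checkpoint, matching the template tokens above
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    # attn_implementation="flash_attention_2",
    # low_cpu_mem_usage=True,
    # llm_int8_enable_fp32_cpu_offload=True,
    device_map="cuda",  # place all weights on the GPU, as in the updated line 48
)

messages = [{"role": "user", "content": "Hello, how are you?"}]
# apply_chat_template returns CPU tensors; .to("cuda") moves the prompt to the
# model's device, as in the updated line 51
input_ids = tokenizer.apply_chat_template(
    messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
).to("cuda")

gen_tokens = model.generate(
    input_ids,
    max_new_tokens=100,   # assumed generation settings; not part of the diff
    do_sample=True,
    temperature=0.3,
)
print(tokenizer.decode(gen_tokens[0]))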