Update app.py
app.py CHANGED
@@ -45,10 +45,10 @@ def respond(
     # attn_implementation="flash_attention_2",
     # low_cpu_mem_usage=True,
     # llm_int8_enable_fp32_cpu_offload=True,
-    device_map="
+    device_map="cuda"
 )
 messages = [{"role": "user", "content": "Hello, how are you?"}]
-input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
+input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to('cuda')
 ## <BOS_TOKEN><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello, how are you?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>

 gen_tokens = model.generate(
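The two changes pin both sides of the computation to the GPU: device_map="cuda" loads the model weights directly on the CUDA device, and .to('cuda') moves the tokenized chat prompt onto the same device so model.generate() does not hit a CPU/GPU device mismatch. Below is a minimal sketch of the updated snippet in context; the checkpoint name, dtype, and generation arguments are assumptions for illustration (the chat-template tokens in the diff suggest a Cohere Command-R style model, but the diff itself only shows the device placement).

# Sketch only: model ID, dtype, and generate() arguments are assumed, not taken from the diff.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "CohereForAI/c4ai-command-r-v01"  # assumed checkpoint, matching the template tokens above
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    # attn_implementation="flash_attention_2",
    # low_cpu_mem_usage=True,
    # llm_int8_enable_fp32_cpu_offload=True,
    device_map="cuda",  # place all weights on the GPU, as in the updated line 48
)

messages = [{"role": "user", "content": "Hello, how are you?"}]
# apply_chat_template returns CPU tensors; .to("cuda") moves the prompt to the
# model's device, as in the updated line 51
input_ids = tokenizer.apply_chat_template(
    messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
).to("cuda")

gen_tokens = model.generate(
    input_ids,
    max_new_tokens=100,   # assumed generation settings; not part of the diff
    do_sample=True,
    temperature=0.3,
)
print(tokenizer.decode(gen_tokens[0]))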