Spaces:

Emova-ollm
/

RACRO-demo

Running on Zero

KaiChen1998 committed on Jun 16

Commit

694f7e2

verified ·

1 Parent(s): 80524d9

use flash attention

Files changed (1) hide show

app.py CHANGED Viewed

@@ -35,8 +35,8 @@ LLM_MODEL_PATH = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
 processor = AutoProcessor.from_pretrained(MLLM_MODEL_PATH)
 tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_PATH)
-mllm = Qwen2_5_VLForConditionalGeneration.from_pretrained(MLLM_MODEL_PATH, torch_dtype=torch.bfloat16, device_map="auto")
-llm = AutoModelForCausalLM.from_pretrained(LLM_MODEL_PATH, torch_dtype=torch.bfloat16, device_map="auto")
 mllm_sampling = dict(do_sample=False, temperature=0, max_new_tokens=8192)
 llm_sampling = dict(temperature=0.6, top_p=0.95, max_new_tokens=8192)

 processor = AutoProcessor.from_pretrained(MLLM_MODEL_PATH)
 tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_PATH)
+mllm = Qwen2_5_VLForConditionalGeneration.from_pretrained(MLLM_MODEL_PATH, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2", device_map="auto")
+llm = AutoModelForCausalLM.from_pretrained(LLM_MODEL_PATH, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2", device_map="auto")
 mllm_sampling = dict(do_sample=False, temperature=0, max_new_tokens=8192)
 llm_sampling = dict(temperature=0.6, top_p=0.95, max_new_tokens=8192)