MilanCalegari committed
Commit 267cb21 · 1 Parent(s): fc534ec

fix: use less cpu in inference

Files changed (1)
  1. modules/llm/card_interpreter.py +4 -3
modules/llm/card_interpreter.py CHANGED
@@ -20,9 +20,11 @@ class CardInterpreter(CardInterpreterInterface):
             model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
             device_map="cpu",  # Force CPU for HF Spaces compatibility
             pad_token_id=2,
-            model_kwargs={"low_cpu_mem_usage": True}  # Reduce memory usage
+            model_kwargs={
+                "low_cpu_mem_usage": False,
+                "use_cache": False
+            }
         )
-
         # Base prompt template
         self._base_content = """
         You are a powerful occultist and exceptional tarot reader. Provide a concise reading based on the given cards.
@@ -101,6 +103,5 @@ class CardInterpreter(CardInterpreterInterface):
             max_new_tokens=256,  # Reduced token limit for faster inference
             num_return_sequences=1,
             do_sample=False,
-            temperature=0.7  # Add some randomness while keeping coherence
         )
         return result[0]["generated_text"][-1]["content"]
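For context, here is a minimal sketch of how the updated pipeline might be constructed and called after this commit. The task name, the example messages, and the surrounding code are assumptions for illustration; only the model name and the kwargs shown in the diff come from the commit itself. Note that "use_cache": False disables the key/value cache at generation time, and temperature was dropped because greedy decoding (do_sample=False) ignores it; recent transformers versions emit a warning when both are set.

    # Minimal sketch, assuming a transformers chat-style text-generation pipeline.
    # The task name, messages, and call site are illustrative, not from the commit.
    from transformers import pipeline

    generator = pipeline(
        "text-generation",  # assumed task for a chat-tuned model
        model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        device_map="cpu",  # Force CPU for HF Spaces compatibility
        pad_token_id=2,
        model_kwargs={
            "low_cpu_mem_usage": False,
            "use_cache": False,  # disables the KV cache during generation
        },
    )

    # Hypothetical invocation mirroring the generate call in the diff.
    messages = [{"role": "user", "content": "Give a reading for: The Fool, The Tower."}]
    result = generator(
        messages,
        max_new_tokens=256,
        num_return_sequences=1,
        do_sample=False,  # greedy decoding; temperature would be ignored here
    )
    print(result[0]["generated_text"][-1]["content"])

The final indexing matches the diff's own return line: for chat-style inputs, generated_text holds the full message list, so [-1]["content"] extracts the assistant's reply.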