Spaces:

as-cle-bert
/

SearchPhi

Runtime error

App Files Community

as-cle-bert committed on Jul 27, 2024

Commit

70230bc

verified ·

1 Parent(s): 2baf278

Update llama_cpp_inf.py

Browse files

Files changed (1) hide show

llama_cpp_inf.py +10 -23

llama_cpp_inf.py CHANGED Viewed

@@ -2,37 +2,24 @@
 from llama_cpp import Llama
 import re
 from huggingface_hub import hf_hub_download
-## Download the GGUF model file via hf_hub_download (module level, so this runs at import time)
-model_name = "microsoft/Phi-3-mini-4k-instruct-gguf"
-model_file = "Phi-3-mini-4k-instruct-q4.gguf" # this is the specific model file we'll use in this example. It's a 4-bit quant, but other levels of quantization are available in the model repo if preferred
-model_path = hf_hub_download(model_name, filename=model_file)
-## Instantiate the llama.cpp model from the downloaded file
-llm = Llama(
-    model_path=model_path,
-    n_ctx=4096,  # Context length to use
-    n_threads=14,            # Number of CPU threads to use
-    n_gpu_layers=3        # Number of model layers to offload to GPU
-)
-## Generation kwargs, unpacked into every llm(...) call made by run_inference_lcpp
-generation_kwargs = {
-    "max_tokens":1024, # Upper bound on the number of generated tokens
-    "stop":["<|end|>"], # Generation stops when this marker is produced
-    "echo":False, # Whether to echo the prompt in the output (disabled here)
-    "top_k":1 # This is essentially greedy decoding, since the model will always return the highest-probability token. Set this value > 1 for sampling decoding
-}
 # Generator (pre-commit version): builds a summarisation prompt from the search
 # results, runs it through the module-level llama.cpp model `llm`, appends the
 # list of reference websites and yields the resulting text piece by piece.
 #   jsonstr     -- string that evaluates to a dict; its keys are listed as the
 #                  reference websites (presumably URL -> keywords; confirm with caller)
 #   user_search -- the user's query, interpolated verbatim into the prompt
 #   Yields      -- successive fragments of the final text: a run of word
 #                  characters, a run of whitespace, or one other character
 def run_inference_lcpp(jsonstr, user_search):
     prompt = f"""Instructions for the assistant: Starting from the URLs and the keywords deriving from Google search results and provided to you in JSON format, generate a meaningful summary of the search results that satisfies the user's query.
     URLs and keywords in JSON format: {jsonstr}.
     User's query to satisfy: {user_search}"""
     # Run the local model and keep the text of the first completion choice.
-    res = llm(prompt, **generation_kwargs)
-    response = res["choices"][0]["text"]
     # NOTE(review): eval() executes whatever expression jsonstr contains. If
     # jsonstr can be influenced by external data this is a code-execution
     # risk -- consider ast.literal_eval or json.loads instead.
     jsondict = eval(jsonstr)
     addon = "Reference websites:\n- "+ '\n- '.join(list(jsondict.keys()))
     # Strip the "<|assistant|>" marker from the model output before appending the references.
-    input_string = response.replace("<|assistant|>", "") + "\n\n" + addon
     # Tokenise into word runs, whitespace runs and single non-word,
     # non-space characters; concatenating the fragments reproduces input_string.
     frag_res = re.findall(r'\w+|\s+|[^\w\s]', input_string)
     for word in frag_res:
         yield word

 from llama_cpp import Llama
 import re
 from huggingface_hub import hf_hub_download
+from gradio_client import Client
+api_client = Client("eswardivi/Phi-3-mini-128k-instruct")  # gradio_client.Client built once at module level; the argument is presumably a Hugging Face Space id -- confirm
 # Generator (post-commit version): builds a summarisation prompt from the search
 # results, sends it to the "/chat" endpoint through the module-level
 # `api_client`, appends the list of reference websites and yields the
 # resulting text piece by piece.
 #   jsonstr     -- string that evaluates to a dict; its keys are listed as the
 #                  reference websites (presumably URL -> keywords; confirm with caller)
 #   user_search -- the user's query, interpolated verbatim into the prompt
 #   Yields      -- successive fragments of the final text: a run of word
 #                  characters, a run of whitespace, or one other character
 def run_inference_lcpp(jsonstr, user_search):
     prompt = f"""Instructions for the assistant: Starting from the URLs and the keywords deriving from Google search results and provided to you in JSON format, generate a meaningful summary of the search results that satisfies the user's query.
     URLs and keywords in JSON format: {jsonstr}.
     User's query to satisfy: {user_search}"""
     # Remote inference call; the positional arguments map onto the endpoint's
     # input components as described in the per-argument comments below.
+    response = api_client.predict(
+        prompt,	# str  in 'Message' Textbox component
+        0.2,	# float (numeric value between 0 and 1) in 'Temperature' Slider component
+        True,	# bool  in 'Sampling' Checkbox component
+        512,	# float (numeric value between 128 and 4096) in 'Max new tokens' Slider component
+        api_name="/chat"
+    )
     # NOTE(review): eval() executes whatever expression jsonstr contains. If
     # jsonstr can be influenced by external data this is a code-execution
     # risk -- consider ast.literal_eval or json.loads instead.
     jsondict = eval(jsonstr)
     addon = "Reference websites:\n- "+ '\n- '.join(list(jsondict.keys()))
     # assumes api_client.predict returns a plain str for this endpoint -- TODO confirm
+    input_string = response + "\n\n" + addon
     # Tokenise into word runs, whitespace runs and single non-word,
     # non-space characters; concatenating the fragments reproduces input_string.
     frag_res = re.findall(r'\w+|\s+|[^\w\s]', input_string)
     for word in frag_res:
         yield word