as-cle-bert committed on
Commit
70230bc
·
verified ·
1 Parent(s): 2baf278

Update llama_cpp_inf.py

Browse files
Files changed (1) hide show
  1. llama_cpp_inf.py +10 -23
llama_cpp_inf.py CHANGED
@@ -2,37 +2,24 @@
2
  from llama_cpp import Llama
3
  import re
4
  from huggingface_hub import hf_hub_download
 
5
 
6
- ## Download the GGUF model
7
- model_name = "microsoft/Phi-3-mini-4k-instruct-gguf"
8
- model_file = "Phi-3-mini-4k-instruct-q4.gguf" # this is the specific model file we'll use in this example. It's a 4-bit quant, but other levels of quantization are available in the model repo if preferred
9
- model_path = hf_hub_download(model_name, filename=model_file)
10
-
11
- ## Instantiate model from downloaded file
12
- llm = Llama(
13
- model_path=model_path,
14
- n_ctx=4096, # Context length to use
15
- n_threads=14, # Number of CPU threads to use
16
- n_gpu_layers=3 # Number of model layers to offload to GPU
17
- )
18
-
19
- ## Generation kwargs
20
- generation_kwargs = {
21
- "max_tokens":1024,
22
- "stop":["<|end|>"],
23
- "echo":False, # Echo the prompt in the output
24
- "top_k":1 # This is essentially greedy decoding, since the model will always return the highest-probability token. Set this value > 1 for sampling decoding
25
- }
26
 
27
  def run_inference_lcpp(jsonstr, user_search):
28
  prompt = f"""Instructions for the assistant: Starting from the URLs and the keywords deriving from Google search results and provided to you in JSON format, generate a meaningful summary of the search results that satisfies the user's query.
29
  URLs and keywords in JSON format: {jsonstr}.
30
  User's query to satisfy: {user_search}"""
31
- res = llm(prompt, **generation_kwargs)
32
- response = res["choices"][0]["text"]
 
 
 
 
 
33
  jsondict = eval(jsonstr)
34
  addon = "Reference websites:\n- "+ '\n- '.join(list(jsondict.keys()))
35
- input_string = response.replace("<|assistant|>", "") + "\n\n" + addon
36
  frag_res = re.findall(r'\w+|\s+|[^\w\s]', input_string)
37
  for word in frag_res:
38
  yield word
 
2
  from llama_cpp import Llama
3
  import re
4
  from huggingface_hub import hf_hub_download
5
+ from gradio_client import Client
6
 
7
+ api_client = Client("eswardivi/Phi-3-mini-128k-instruct")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
  def run_inference_lcpp(jsonstr, user_search):
10
  prompt = f"""Instructions for the assistant: Starting from the URLs and the keywords deriving from Google search results and provided to you in JSON format, generate a meaningful summary of the search results that satisfies the user's query.
11
  URLs and keywords in JSON format: {jsonstr}.
12
  User's query to satisfy: {user_search}"""
13
+ response = api_client.predict(
14
+ prompt, # str in 'Message' Textbox component
15
+ 0.2, # float (numeric value between 0 and 1) in 'Temperature' Slider component
16
+ True, # bool in 'Sampling' Checkbox component
17
+ 512, # float (numeric value between 128 and 4096) in 'Max new tokens' Slider component
18
+ api_name="/chat"
19
+ )
20
  jsondict = eval(jsonstr)
21
  addon = "Reference websites:\n- "+ '\n- '.join(list(jsondict.keys()))
22
+ input_string = response + "\n\n" + addon
23
  frag_res = re.findall(r'\w+|\s+|[^\w\s]', input_string)
24
  for word in frag_res:
25
  yield word