Commit 7bef3fd · Update README.md
Parent(s): 8fed7be

README.md CHANGED
@@ -28,6 +28,9 @@ This model is [**LlaMa2-7b**](https://huggingface.co/meta-llama/Llama-2-7b) whi
 # You can load the LlaMa2-CodeGen model on google colab.
 
 
+
+
+
 ### Example
 ```py
 
@@ -46,9 +49,10 @@ model = PeftModel.from_pretrained(model, peft_model_id)
 
 
 
+
 def create_prompt(instruction):
-    system = "You are a coding assistant that will help the user to resolve the following instruction
-    instruction = "
+    system = "You are using the Llam2-CodeGen model, a coding assistant that will help the user to resolve the following instruction:\n"
+    instruction = "### Input: " + instruction
     return system + "\n" + instruction + "\n\n" + "### Response:" + "\n"
 
 def generate(
@@ -62,9 +66,10 @@ def generate(
 ):
     prompt = create_prompt(instruction)
     print(prompt)
-    inputs = tokenizer(prompt, return_tensors="pt")
-    input_ids = inputs["input_ids"].to("cuda")
-    attention_mask = inputs["attention_mask"].to("cuda")
+    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
+    #input_ids = inputs["input_ids"].to("cuda")
+    #attention_mask = inputs["attention_mask"].to("cuda")
+
     generation_config = GenerationConfig(
         temperature=temperature,
         top_p=top_p,
@@ -74,17 +79,36 @@
     )
     with torch.no_grad():
        generation_output = model.generate(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
+            #input_ids=input_ids,
+            #attention_mask=attention_mask,
+            **inputs,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
            early_stopping=True
        )
-
-
-
+
+
+
+    generated_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    stop_output = "### Input"
+    gen_response = (generated_response.split(stop_output))[0]
+
+
+    #s = generation_output.sequences[0]
+    #output = tokenizer.decode(s, skip_special_tokens=True)
+    #stop_output = "### Input"
+
+    #gen_response = (output.split(stop_output))[0]
+
+
+    #return output.split("### Response:")[1].lstrip("\n")
+    return gen_response
+
+
+
+
 
 
 instruction = """
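Net effect of the commit: create_prompt now wraps the user instruction in an "### Input:" / "### Response:" frame, the prompt is tokenized once and moved to the GPU as a whole BatchEncoding that is passed to model.generate via **inputs (replacing the separate input_ids / attention_mask arguments), and the decoded output is trimmed at the next "### Input" marker before being returned. Note that the added code decodes outputs[0], a name that is not defined in the example; the commented-out lines suggest generation_output.sequences[0] was meant. The sketch below shows how the example would run after this commit under that assumption; the model-loading lines, the placeholder adapter id, the simplified generate signature, and the default sampling values are illustrative assumptions, not the verbatim README.

```py
# Hedged sketch of the README example as it stands after this commit.
# Assumptions (not from the diff itself): how the base model and adapter are
# loaded, the placeholder adapter repo id, the simplified generate() signature,
# and the default sampling values.
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

base_model_id = "meta-llama/Llama-2-7b"     # base model linked in the README
peft_model_id = "<LlaMa2-CodeGen-adapter>"  # placeholder; use the repo id from the README

tokenizer = AutoTokenizer.from_pretrained(base_model_id)
model = AutoModelForCausalLM.from_pretrained(base_model_id, device_map="auto")
model = PeftModel.from_pretrained(model, peft_model_id)  # as in the diff's hunk context

def create_prompt(instruction):
    # Prompt frame introduced by this commit (text kept verbatim).
    system = "You are using the Llam2-CodeGen model, a coding assistant that will help the user to resolve the following instruction:\n"
    instruction = "### Input: " + instruction
    return system + "\n" + instruction + "\n\n" + "### Response:" + "\n"

def generate(instruction, temperature=0.7, top_p=0.9, max_new_tokens=256):
    prompt = create_prompt(instruction)
    # Tokenize once and move the whole BatchEncoding to the GPU; `**inputs`
    # then supplies input_ids and attention_mask to generate() together.
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    generation_config = GenerationConfig(temperature=temperature, top_p=top_p)
    with torch.no_grad():
        generation_output = model.generate(
            **inputs,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )

    # The commit decodes `outputs[0]`; the generated token ids actually live in
    # `generation_output.sequences`, so that is what is decoded here.
    generated_response = tokenizer.decode(
        generation_output.sequences[0], skip_special_tokens=True
    )
    # Trim at the next "### Input" marker so only this response is returned.
    stop_output = "### Input"
    return generated_response.split(stop_output)[0]

# Example call (hypothetical instruction):
print(generate("Write a Python function that checks whether a number is prime."))
```

Passing the whole encoding with **inputs keeps input_ids and attention_mask in sync, which is why the commit comments out the two separate .to("cuda") lines instead of keeping them.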