efefe

Changed files:
- app.py (+19, -17)
- requirements.txt (+2, -1)
app.py CHANGED

@@ -1,6 +1,6 @@
 import gradio as gr
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 import traceback
 
 model_name_or_path = "stephenlzc/dolphin-llama3-zh-cn-uncensored"
@@ -10,25 +10,24 @@ tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Using device: {device}")
 
+# Configure quantization
+quantization_config = BitsAndBytesConfig(
+    load_in_8bit=True,
+    llm_int8_threshold=6.0,
+    llm_int8_has_fp16_weight=False,
+)
+
+# Load the model with quantization
 model = AutoModelForCausalLM.from_pretrained(
-    model_name_or_path,
-
+    model_name_or_path,
+    quantization_config=quantization_config,
+    device_map="auto",
     trust_remote_code=True
-)
+)
 
 print("Tokenizer loaded successfully")
 print("Model loaded successfully")
 
-# Test inference
-test_messages = [
-    {"role": "system", "content": "You are a helpful assistant."},
-    {"role": "user", "content": "Hello, who are you?"},
-]
-test_input_ids = tokenizer.apply_chat_template(conversation=test_messages, tokenize=True, return_tensors="pt").to(device)
-test_output = model.generate(inputs=test_input_ids, max_new_tokens=50)
-test_response = tokenizer.decode(test_output[0])
-print("Test response:", test_response)
-
 def generate_response(system_message, user_message):
     try:
         messages = [
@@ -37,13 +36,16 @@ def generate_response(system_message, user_message):
         ]
 
         input_ids = tokenizer.apply_chat_template(conversation=messages, tokenize=True, return_tensors="pt").to(device)
+        attention_mask = torch.ones_like(input_ids).to(device)
 
         output = model.generate(
             inputs=input_ids,
-
+            attention_mask=attention_mask,
+            max_new_tokens=512,
+            pad_token_id=tokenizer.eos_token_id
         )
 
-        generated_response = tokenizer.decode(output[0])
+        generated_response = tokenizer.decode(output[0], skip_special_tokens=True)
         return generated_response
     except Exception as e:
         error_message = f"An error occurred: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
@@ -56,7 +58,7 @@ iface = gr.Interface(
         gr.Textbox(label="User Message")
     ],
     outputs=gr.Textbox(label="Generated Response"),
-    title="llama3 cn uncensored Chatbot (GPU-enabled)"
+    title="llama3 cn uncensored Chatbot (GPU-enabled, 8-bit quantized)"
 )
 
 if __name__ == "__main__":
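For reference, the loading pattern this commit introduces can be exercised on its own. The sketch below is a minimal standalone version, not the exact Space code: the model id and BitsAndBytesConfig values mirror the diff, while the surrounding script is reassembled from the visible context and assumes a CUDA-capable machine with bitsandbytes installed.

# Minimal sketch of the commit's 8-bit loading path (reassembled, not the exact file).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name_or_path = "stephenlzc/dolphin-llama3-zh-cn-uncensored"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

device = "cuda" if torch.cuda.is_available() else "cpu"

quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,               # int8 weights: roughly half the VRAM of fp16
    llm_int8_threshold=6.0,          # outlier activations above this stay in fp16
    llm_int8_has_fp16_weight=False,  # keep only the int8 copy of the weights
)

model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    quantization_config=quantization_config,
    device_map="auto",   # accelerate places the layers; no manual model.to(device)
    trust_remote_code=True,
)

With device_map="auto" the model is dispatched by accelerate, so only the input tensors still need an explicit .to(device); that is presumably why the diff keeps the device variable around.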
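The generation-side changes are worth a note too: passing an explicit attention_mask and pad_token_id avoids the transformers warning that both are being guessed (Llama 3 defines no pad token), and skip_special_tokens=True strips the chat-template markers from the decoded text. Here is a sketch of the updated path, reusing model, tokenizer, and device from the block above; the messages list is reconstructed from the function's signature and the Gradio labels, since the diff elides it.

def generate_response(system_message, user_message):
    # Reconstructed body of the updated function; assumes the two arguments
    # map straight onto the Gradio "System Message" / "User Message" boxes.
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
    ]
    input_ids = tokenizer.apply_chat_template(
        conversation=messages, tokenize=True, return_tensors="pt"
    ).to(device)
    # Every prompt token is real (no padding), so an all-ones mask is correct;
    # ones_like already inherits input_ids' device.
    attention_mask = torch.ones_like(input_ids)
    output = model.generate(
        inputs=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=512,
        pad_token_id=tokenizer.eos_token_id,  # reuse EOS as the pad token
    )
    # Note: output[0] still contains the prompt, so the returned string is
    # prompt plus reply; slicing off the first input_ids.shape[-1] tokens
    # before decoding would return the reply alone.
    return tokenizer.decode(output[0], skip_special_tokens=True)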
requirements.txt CHANGED

@@ -3,4 +3,5 @@ huggingface_hub
 torch
 transformers
 accelerate
-sentencepiece
+sentencepiece
+bitsandbytes
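One deployment caveat on the new dependency: bitsandbytes ships CUDA kernels, so load_in_8bit=True will generally fail on CPU-only hardware. A quick sanity check before starting the app, assuming the package installed cleanly:

# Confirm the quantization dependencies are importable and a GPU is visible.
import torch
import bitsandbytes as bnb

print("bitsandbytes:", bnb.__version__)
print("CUDA available:", torch.cuda.is_available())  # should be True for load_in_8bit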