An instruction-tuned model, finetuned on the WizardLMTeam/WizardLM_evol_instruct_V2_196k dataset.

  • Small enough to run on a phone
  • 124 million parameters
  • Comparable performance to TinyLlama-Chat
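
If you want to inspect the finetuning data yourself, it is public on the Hugging Face Hub. A minimal sketch using the datasets library (this assumes you have datasets installed; only the dataset ID comes from this card):

from datasets import load_dataset

# Load the instruction-tuning data used to finetune Lazarus Instruct
ds = load_dataset("WizardLMTeam/WizardLM_evol_instruct_V2_196k", split="train")

# Print one record to see the conversation format
print(ds[0])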

We ran some zero-shot tests comparing Lazarus Instruct with the much larger TinyLlama-Chat.
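
The comparison itself is straightforward to reproduce. Below is a minimal sketch that prompts both models zero-shot and prints their completions side by side; the TinyLlama checkpoint name is our assumption, so substitute whichever chat variant you want to compare against:

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

prompt = "Explain why the sky is blue in one sentence."

# Assumed checkpoints: this model vs. a TinyLlama chat variant
for name in ["Aclevo/Lazarus-Instruct", "TinyLlama/TinyLlama-1.1B-Chat-v1.0"]:
    tok = AutoTokenizer.from_pretrained(name)
    mdl = AutoModelForCausalLM.from_pretrained(name)
    mdl.eval()
    inputs = tok(prompt, return_tensors="pt")
    with torch.no_grad():
        out = mdl.generate(**inputs, max_new_tokens=100, pad_token_id=tok.eos_token_id)
    completion = tok.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
    print(f"{name}:\n{completion}\n")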

πŸš€ Usage

You can interact with Lazarus using the script below:

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

print("CUDA Available:", torch.cuda.is_available())

# Load the tokenizer and model weights from the Hugging Face Hub
model_name = "Aclevo/Lazarus-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()  # disable dropout for inference

# Use the GPU if one is available, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# System prompt prepended to every conversation turn
system_prompt = (
    "Your name is Lazarus. You are an intelligent AI assistant. You help users with whatever they need. "
    "You always think before answering, and explain your reasoning out loud step by step.\n"
)

# Rolling transcript of the conversation, as alternating "You:"/"AI:" lines
chat_history = []

def chat():
    print("Chatting with Lazarus (type 'exit' to quit)\n")

    while True:
        user_input = input("You: ")
        if user_input.lower() == "exit":
            break

        chat_history.append(f"You: {user_input}")
        # Keep only the last 6 lines of history so the prompt stays short
        recent_history = chat_history[-6:]
        full_prompt = system_prompt + "\n".join(recent_history) + "\nAI:"

        # Tokenize the prompt and move it to the model's device
        inputs = tokenizer(full_prompt, return_tensors="pt", truncation=True).to(device)

        # Retry generation until the response passes the quality filter
        while True:
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=150,
                    pad_token_id=tokenizer.eos_token_id,
                    do_sample=True,
                    top_k=100,
                    top_p=0.92,
                    temperature=0.7,
                    eos_token_id=tokenizer.eos_token_id
                )

            # Decode only the newly generated tokens, not the prompt
            response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
            response = response.strip()

            # Reject a few known low-quality outputs and regenerate
            bad_responses = {"I hope that", "I don't know", "", "I'm excited"}
            if response in bad_responses:
                print("AI: [Regenerating due to low-quality response]")
                continue
            break

        print(f"AI: {response}")
        chat_history.append(f"AI: {response}")

if __name__ == "__main__":
    chat()
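
At 124M parameters the FP32 checkpoint is roughly 500 MB, so on constrained devices you can roughly halve the memory footprint by loading the weights in half precision. A minimal sketch (actual on-phone deployment usually also involves an export step, e.g. to ONNX or GGUF, which is not covered here):

from transformers import AutoModelForCausalLM
import torch

# Load the weights in FP16 to roughly halve memory use vs. the FP32 checkpoint
model = AutoModelForCausalLM.from_pretrained(
    "Aclevo/Lazarus-Instruct",
    torch_dtype=torch.float16,
)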

Please consider citing us if you find this model useful.

Aclevo is not responsible for misuse of this model.
