Talk with the model:
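
  • Install the dependencies first (the script uses transformers and torch, and device_map="auto" additionally requires the accelerate package):
pip install transformers torch accelerate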

  • Paste the following code into a Python file:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import time

MODEL_NAME = "VortexIntelligence/VLM-1.1-K1-Preview"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Register the chat-format markers used in the prompt below as special tokens
tokenizer.add_special_tokens({'additional_special_tokens': ['<|system|>', '<|user|>', '<|assistant|>']})
tokenizer.eos_token = "<|endoftext|>"
# generate() needs a pad token id; fall back to EOS if none is defined
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",       # place the model on GPU if available (requires accelerate)
    low_cpu_mem_usage=True
)

# Grow the embedding matrix to cover the newly added special tokens
model.resize_token_embeddings(len(tokenizer))

def stream_response(user_input):
    # Despite the name, this prints the full reply once generation finishes;
    # see the TextStreamer sketch below for true token-by-token streaming.
    system_prompt = "You are a helpful assistant."
    prompt = f"<|system|>\n{system_prompt}\n<|user|>\n{user_input}\n<|assistant|>\n"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    input_len = inputs['input_ids'].shape[-1]  # prompt length, used to strip the echoed prompt
    max_new_tokens = 128

    start_time = time.time()

    output = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,                       # sample rather than decode greedily
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        repetition_penalty=2.1,               # strong penalty to curb repetition loops
        temperature=0.7
    )

    # Keep only the newly generated tokens and decode them
    output_tokens = output[0][input_len:]
    generated_text = tokenizer.decode(output_tokens, skip_special_tokens=True)
    print(generated_text, end='', flush=True)

    # Simple throughput stats
    end_time = time.time()
    duration = end_time - start_time
    total_tokens = len(output_tokens)
    tps = total_tokens / duration
    tpm = tps * 60

    print("\n" + "-"*20)
    print(f"Time taken: {duration:.2f}s")
    print(f"Total tokens: {total_tokens}")
    print(f"Tokens/sec: {tps:.2f}")
    print(f"Tokens/min: {tpm:.2f}")

print("VLM 1.1 Chat - Type 'exit' to quit")
while True:
    user_input = input("User: ").strip()
    if user_input.lower() == 'exit':
        print("Exiting chat. Goodbye!")
        break
    print("VLM: ", end="", flush=True)
    stream_response(user_input)
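
For genuine incremental output, transformers ships a TextStreamer helper that prints each token as soon as it is generated. Below is a minimal sketch reusing the tokenizer and model loaded above; the function name stream_response_live is just an illustration, not part of the original script:

from transformers import TextStreamer

streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

def stream_response_live(user_input):
    prompt = f"<|system|>\nYou are a helpful assistant.\n<|user|>\n{user_input}\n<|assistant|>\n"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # The streamer decodes and prints tokens as they are sampled
    model.generate(
        **inputs,
        max_new_tokens=128,
        do_sample=True,
        temperature=0.7,
        repetition_penalty=2.1,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        streamer=streamer,
    )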
Model details: 355M parameters, F32 tensors, stored as Safetensors.
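
Since the checkpoint is stored in full F32 precision, you can roughly halve GPU memory by loading it in half precision. A sketch, reusing the imports above, as a drop-in replacement for the from_pretrained call in the script (on CPU you will likely want to keep F32, since float16 inference there can be slow or unsupported):

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    torch_dtype=torch.float16,  # load the stored F32 weights as FP16 to halve memory
    low_cpu_mem_usage=True
)
model.resize_token_embeddings(len(tokenizer))  # still required after adding the special tokens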

Model tree for VortexIntelligence/VLM-1.1-K1-Preview: 1 merge, 2 quantizations.