
Talk with the model:
- Paste the following code into a Python file:
```python
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import time

MODEL_NAME = "VortexIntelligence/VLM-1.1-K1-Preview"

# Load the tokenizer and register the chat-role special tokens
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.add_special_tokens({'additional_special_tokens': ['<|system|>', '<|user|>', '<|assistant|>']})
tokenizer.eos_token = "<|endoftext|>"
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # fall back to EOS so generate() has a pad token

# Load the model and resize its embeddings to match the extended vocabulary
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    low_cpu_mem_usage=True
)
model.resize_token_embeddings(len(tokenizer))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def stream_response(user_input):
    # Build the prompt in the <|system|>/<|user|>/<|assistant|> chat format
    system_prompt = "You are a helpful assistant."
    prompt = f"<|system|>\n{system_prompt}\n<|user|>\n{user_input}\n<|assistant|>\n"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    input_len = inputs['input_ids'].shape[-1]
    max_new_tokens = 128

    # Generate a reply and time the call
    start_time = time.time()
    output = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        repetition_penalty=2.1,
        temperature=0.7
    )

    # Decode only the newly generated tokens (drop the echoed prompt)
    output_tokens = output[0][input_len:]
    generated_text = tokenizer.decode(output_tokens, skip_special_tokens=True)
    print(generated_text, end='', flush=True)
    end_time = time.time()

    # Report simple throughput statistics
    duration = end_time - start_time
    total_tokens = len(output_tokens)
    tps = total_tokens / duration
    tpm = tps * 60
    print("\n" + "-" * 20)
    print(f"Time taken: {duration:.2f}s")
    print(f"Total tokens: {total_tokens}")
    print(f"Tokens/sec: {tps:.2f}")
    print(f"Tokens/min: {tpm:.2f}")

print("VLM 1.1 Chat - Type 'exit' to quit")
while True:
    user_input = input("User: ")
    if user_input.lower() == 'exit':
        print("Exiting chat. Goodbye!")
        break
    print("VLM: ", end="", flush=True)
    stream_response(user_input)
```
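Note that `stream_response` above decodes and prints the completion only after `generate` returns. If token-by-token output is preferred, `transformers` provides `TextStreamer`, which can be passed to `generate`. The sketch below is a minimal example reusing the `model`, `tokenizer`, and `inputs` objects from the script above; the sampling parameters simply mirror the ones already used:

```python
from transformers import TextStreamer

# Prints tokens to stdout as they are generated; skip_prompt hides the echoed prompt,
# and skip_special_tokens is forwarded to the tokenizer's decode call.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

output = model.generate(
    **inputs,
    max_new_tokens=128,
    do_sample=True,
    temperature=0.7,
    repetition_penalty=2.1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    streamer=streamer,
)
```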