"""Gradio chat demo that serves the ``MODEL_ID`` causal language model.

Importing/running this module loads the tokenizer and model, then launches
a ``gr.ChatInterface`` whose replies are trimmed to a single sentence.
"""

import re

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "thoughtcast/marketing-spiked-lassie-experiment"

# ─────────────────── Load model (fp16 on GPU) ───────────────────────────
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",          # puts it on the available GPU
    torch_dtype=torch.float16,  # fp16 = small & fast enough for T4-small
)

# First sentence-ending punctuation mark; compiled once, reused per reply.
_SENTENCE_END = re.compile(r"[.!?]")


# ─────────────────── Post-processing helper ─────────────────────────────
def _shorten(txt: str) -> str:
    """Return the first sentence of the last line of *txt*, lightly deduped.

    Steps:
      1. Keep only the last non-blank line of ``txt``.
      2. Cut it after the first ``.``, ``!`` or ``?`` (if any).
      3. Drop a word when it equals (case-insensitively) the word
         immediately before it in the original sequence.

    Returns an empty string when ``txt`` contains no words, instead of
    raising ``IndexError`` as an unguarded ``words[0]`` would.
    """
    # Strip first so trailing newlines/blank lines do not make the "last
    # line" empty.
    last_line = txt.strip().split("\n")[-1].strip()

    # Keep up to (and including) the first sentence-ending punctuation.
    match = _SENTENCE_END.search(last_line)
    if match:
        last_line = last_line[: match.end()]

    words = last_line.split()
    if not words:
        return ""

    # Compare each word with its predecessor in the *original* list, so a
    # run like "go go go" collapses to a single "go".
    deduped = [words[0]]
    deduped.extend(
        current
        for previous, current in zip(words, words[1:])
        if current.lower() != previous.lower()
    )
    return " ".join(deduped)


# ─────────────────── Chat function ──────────────────────────────────────
def _build_prompt(message, history) -> str:
    """Flatten *history* plus the new *message* into the prompt string.

    Every turn is rendered as ``"\\n<text>\\n"``; the new message is followed
    by an extra newline so the model continues with the reply.

    ``history`` items may be ``(user, assistant)`` pairs or dicts carrying a
    ``"content"`` key (NOTE(review): the dict shape is what Gradio's
    "messages" history format is expected to pass — confirm against the
    installed Gradio version).
    """
    parts = []
    for turn in history or ():
        if isinstance(turn, dict):
            parts.append(f"\n{turn['content']}\n")
        else:
            user_text, bot_text = turn
            parts.append(f"\n{user_text}\n\n{bot_text}\n")
    parts.append(f"\n{message}\n\n")
    return "".join(parts)


def chat(message, history=None):
    """Generate a one-sentence reply to *message* given the chat *history*.

    Args:
        message: The new user message.
        history: Previous turns (see ``_build_prompt``); ``None`` or empty
            means a fresh conversation.

    Returns:
        The model's reply after ``_shorten`` post-processing (may be an
        empty string if the model produced no words).
    """
    prompt = _build_prompt(message, history)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(
        **inputs,
        max_new_tokens=80,
        do_sample=True,
        temperature=0.6,
        top_p=0.85,
        repetition_penalty=1.25,
        no_repeat_ngram_size=3,
        eos_token_id=tokenizer.eos_token_id,
        # Reuse the EOS id as the padding id for generation.
        pad_token_id=tokenizer.eos_token_id,
    )
    # The decoded text contains the prompt followed by the continuation;
    # _shorten keeps only the final line of it.
    decoded = tokenizer.decode(output[0], skip_special_tokens=True)
    return _shorten(decoded)


demo = gr.ChatInterface(
    chat,
    title="Marketing Lassie 🐾 (Trained on Lassie's Website Marketing Information in FAQ / Conversational Form C=8, S_c=3, S=1)",
)
demo.launch()