from optimum.intel.openvino import OVModelForCausalLM
from transformers import AutoTokenizer, AutoConfig, TextIteratorStreamer
from threading import Thread
import streamlit as st
import warnings
warnings.filterwarnings(action='ignore')
import datetime
import os
import random
import string
from time import sleep
import tiktoken

# requirements.txt for the Space:
# optimum[openvino]
# transformers
# streamlit
# tiktoken

# Encoding used for approximate token counting in the speed stats
# (cl100k_base is not Gemma's tokenizer, but close enough for throughput)
encoding = tiktoken.get_encoding("cl100k_base")

# Model name and Hugging Face Hub model ID
model_name = "Gemma2-2B-it"
model_id = "AIFunOver/gemma-2-2b-it-openvino-4bit"

# Basic page settings
st.set_page_config(
    page_title=f"Your LocalGPT ✨ with {model_name}",
    page_icon="🌟",
    layout="wide")


def writehistory(filename, text):
    """Append one line of chat history to the log file."""
    try:
        with open(filename, 'a', encoding='utf-8') as f:
            f.write(text)
            f.write('\n')
    except Exception as e:
        print(f"Error writing to log file: {e}")  # log errors to the console


def genRANstring(n):
    """Return a random string of n uppercase letters and digits."""
    return ''.join(random.choices(string.ascii_uppercase + string.digits, k=n))


# Session State initialization (state survives Streamlit reruns on the Space)
if "hf_model" not in st.session_state:
    st.session_state.hf_model = model_name
if "messages" not in st.session_state:
    st.session_state.messages = []
if "chatMessages" not in st.session_state:
    st.session_state.chatMessages = []
if "repeat" not in st.session_state:
    st.session_state.repeat = 1.35
if "temperature" not in st.session_state:
    st.session_state.temperature = 0.1
if "maxlength" not in st.session_state:
    st.session_state.maxlength = 500
if "speed" not in st.session_state:
    st.session_state.speed = 0.0
if "numOfTurns" not in st.session_state:
    st.session_state.numOfTurns = 0
if "maxTurns" not in st.session_state:
    st.session_state.maxTurns = 5  # must be an odd number >= 5
if "logfilename" not in st.session_state:
    # Logger file, stored in a logs/ folder at the Space root
    os.makedirs('logs', exist_ok=True)
    st.session_state.logfilename = f'logs/Gemma2-2B_{genRANstring(5)}_log.txt'
    # Write the first two messages into the history
    writehistory(st.session_state.logfilename,
                 f'{str(datetime.datetime.now())}\n\nYour own LocalGPT with 🌀 {model_name}\n---\n🧠🫡: You are a helpful assistant.')
    writehistory(st.session_state.logfilename, '🌀: How may I help you today?')


@st.cache_resource
def create_chat():
    """Load the tokenizer and OpenVINO model once and cache them across reruns."""
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        ov_model = OVModelForCausalLM.from_pretrained(
            model_id,
            device='CPU',
            ov_config={"PERFORMANCE_HINT": "LATENCY",
                       "NUM_STREAMS": "1", "CACHE_DIR": ""},
            config=AutoConfig.from_pretrained(model_id)
        )
        return tokenizer, ov_model
    except Exception as e:
        st.error(f"Error loading model: {e}")
        return None, None  # signal failure to the caller


def countTokens(text):
    """Approximate token count using the cl100k_base encoding."""
    return len(encoding.encode(text))
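# Quick sanity check for the counter above (hypothetical REPL session;
# cl100k_base only approximates Gemma's SentencePiece vocabulary, so counts
# are indicative, not exact):
#   >>> countTokens("How may I help you today?")
#   7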
# AVATARS - emojis instead of image files
av_us = "👤"   # user avatar
av_ass = "🤖"  # assistant avatar
nCTX = 8192    # Gemma2 context window

### START STREAMLIT UI
# Header - markdown instead of an image banner
st.header(f"🌟 {model_name} Chatbot")
st.markdown(f"> *🌟 {model_name} with {nCTX} tokens Context window* - Turn based Chat available with max capacity of :orange[**{st.session_state.maxTurns} messages**].", unsafe_allow_html=True)
st.markdown("#### Powered by OpenVINO")

# SIDEBAR - markdown and text instead of images
with st.sidebar:
    st.subheader("Configuration")
    st.markdown("---")
    st.markdown("**Model Parameters**")
    st.session_state.temperature = st.slider('Temperature:', min_value=0.0, max_value=1.0, value=0.65, step=0.01)
    st.session_state.maxlength = st.slider('Reply length:', min_value=150, max_value=2000, value=550, step=50)
    st.session_state.repeat = st.slider('Repeat Penalty:', min_value=0.0, max_value=2.0, value=1.176, step=0.02)
    st.markdown("---")
    st.markdown("**Chat Options**")
    st.session_state.turns = st.toggle('Turn based', value=False,
                                       help='Activate Conversational Turn Chat with History')
    st.markdown(f"*Number of Max Turns*: {st.session_state.maxTurns}")
    actualTurns = st.markdown("*Chat History Length*: :green[Good]")
    statspeed = st.markdown(f'💫 speed: {st.session_state.speed} t/s')
    btnClear = st.button("Clear History", type="primary", use_container_width=True)
    st.markdown("---")
    st.markdown("**Logs**")
    st.markdown(f"**Logfile**: {st.session_state.logfilename}")

tokenizer, ov_model = create_chat()

if tokenizer and ov_model:  # only proceed if model loading succeeded
    # Replay the chat history on every app rerun
    for message in st.session_state.chatMessages:
        avatar = av_us if message["role"] == "user" else av_ass
        with st.chat_message(message["role"], avatar=avatar):
            st.markdown(message["content"])

    # Accept user input; clear_on_submit=False keeps the text after submission,
    # and every Streamlit form needs an explicit submit button
    with st.form(key='chat_form', clear_on_submit=False):
        myprompt = st.text_area("Your message", key="prompt_input", height=100,
                                placeholder="What is an AI model?")
        submitted = st.form_submit_button("Send")
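    # Turn trimming, illustrated (a sketch assuming maxTurns=5): with seven
    # stored messages [u1, a1, u2, a2, u3, a3, u4], the slice messages[-5:]
    # == [u2, a2, u3, a3, u4] is sent to the model. This is why maxTurns must
    # be odd: an odd-length window over the alternating history always starts
    # with a user message, as chat templates expect.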
    if submitted and myprompt:
        # Add the user message to both histories
        st.session_state.messages.append({"role": "user", "content": myprompt})
        st.session_state.chatMessages.append({"role": "user", "content": myprompt})
        st.session_state.numOfTurns = len(st.session_state.messages)
        # Display the user message in a chat message container
        with st.chat_message("user", avatar=av_us):
            st.markdown(myprompt)
        writehistory(st.session_state.logfilename, f"user: {myprompt}")

        # Display the assistant response in a chat message container
        with st.chat_message("assistant", avatar=av_ass):
            message_placeholder = st.empty()
            with st.spinner("Thinking..."):
                # Select the conversation window to send to the model
                if st.session_state.turns:
                    if st.session_state.numOfTurns > st.session_state.maxTurns:
                        conv_messages = st.session_state.messages[-st.session_state.maxTurns:]
                        actualTurns.markdown("*Chat History Length*: :red[Trimmed]")
                    else:
                        conv_messages = st.session_state.messages
                else:
                    conv_messages = [st.session_state.messages[-1]]

                model_inputs = tokenizer.apply_chat_template(conv_messages,
                                                             add_generation_prompt=True,
                                                             tokenize=True,
                                                             return_tensors="pt")
                # Fresh streamer per request; generate() runs in a background
                # thread while this thread consumes the streamed chunks.
                # Credit to https://github.com/openvino-dev-samples/chatglm3.openvino/blob/main/chat.py
                streamer = TextIteratorStreamer(tokenizer, timeout=60.0,
                                                skip_prompt=True,
                                                skip_special_tokens=True)
                generate_kwargs = dict(input_ids=model_inputs,
                                       max_new_tokens=st.session_state.maxlength,
                                       temperature=st.session_state.temperature,
                                       do_sample=True,
                                       top_p=0.5,
                                       repetition_penalty=st.session_state.repeat,
                                       streamer=streamer)
                t1 = Thread(target=ov_model.generate, kwargs=generate_kwargs)
                t1.start()

                start_time = datetime.datetime.now()
                full_response = ""
                ttft = None
                for chunk in streamer:
                    if ttft is None:
                        ttft = datetime.datetime.now() - start_time  # time to first token (not displayed)
                    for char in chunk:
                        full_response += char
                        message_placeholder.markdown(full_response + "🟡")
                        sleep(0.005)  # typewriter effect; tune as needed
                total_seconds = (datetime.datetime.now() - start_time).total_seconds()

                # Throughput stats, approximated with cl100k_base
                total_tokens = countTokens(myprompt) + countTokens(full_response)
                st.session_state.speed = total_tokens / total_seconds
                statspeed.markdown(f'💫 speed: {st.session_state.speed:.2f} t/s')

            message_placeholder.markdown(full_response)  # final response only, without stats
            writehistory(st.session_state.logfilename, f"assistant: {full_response}")
            st.session_state.messages.append({"role": "assistant", "content": full_response})
            st.session_state.chatMessages.append({"role": "assistant", "content": full_response})
            st.session_state.numOfTurns = len(st.session_state.messages)

    if btnClear:  # Clear History button pressed
        st.session_state.messages = []
        st.session_state.chatMessages = []
        st.session_state.numOfTurns = 0
        st.rerun()
else:
    st.error("Model initialization failed. Please check the logs for details.")
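# How to run locally (a minimal sketch; the app.py filename is an assumption):
#   pip install "optimum[openvino]" transformers streamlit tiktoken
#   streamlit run app.py
# On Hugging Face Spaces, use the Streamlit SDK and list the same packages in
# requirements.txt; chat logs are appended under logs/ (created at startup).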