Update src/streamlit_app.py

src/streamlit_app.py (CHANGED, +83 -41)
@@ -11,39 +11,50 @@ st.set_page_config(
 )
 
 # --- Model Loading ---
-# Choose your OpenBioLLM model. The 8B parameter model is more manageable for typical Hugging Face Spaces resources.
-# For larger models like 70B, you might need upgraded hardware on Spaces.
 MODEL_NAME = "aaditya/Llama3-OpenBioLLM-8B"
 
 @st.cache_resource
 def load_model_and_tokenizer():
     """Loads the pre-trained model and tokenizer."""
+    st.info(f"Attempting to load model: {MODEL_NAME}")
     try:
         tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-        # Load the model with torch_dtype=torch.float16 for potentially faster inference and lower memory,
-        # and device_map='auto' to leverage available hardware (CPU/GPU) efficiently.
         model = AutoModelForCausalLM.from_pretrained(
             MODEL_NAME,
             torch_dtype=torch.float16,
             device_map="auto",
+            trust_remote_code=True  # Crucial for some custom Llama variants or if config.json is minimal
         )
 
-        #
+        # If the model uses a specific chat template (common for Llama3),
+        # ensure the tokenizer has it or set it.
+        # For Llama-3, the template is often pre-configured.
+        # if tokenizer.chat_template is None:
+        #     # This is a generic Llama3 chat template example; the specific model might have its own nuance.
+        #     # However, OpenBioLLM-8B seems to be a fine-tune, so its tokenizer should ideally have this.
+        #     tokenizer.chat_template = "{% for message in messages %}{% if message['role'] == 'system' %}{{ '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }}{% elif message['role'] == 'user' %}{{ '<|start_header_id|>user<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }}{% elif message['role'] == 'assistant' %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
+
         qa_pipeline = pipeline(
             "text-generation",
             model=model,
             tokenizer=tokenizer,
             max_new_tokens=512,
             do_sample=True,
             temperature=0.7,
             top_p=0.9,
+            # Explicitly setting pad_token_id if not already set by the tokenizer
+            # Llama models typically use eos_token_id as pad_token_id
+            pad_token_id=tokenizer.eos_token_id
         )
+        st.success("Model and tokenizer loaded successfully!")
         return qa_pipeline
     except Exception as e:
         st.error(f"Error loading model: {e}")
-        st.error("This could be due to model availability, network issues,
-        st.error(f"Attempted to load: {MODEL_NAME}")
-        st.info("
+        st.error("This could be due to model availability, network issues, resource limitations, or a configuration issue with the model on Hugging Face Hub.")
+        st.error(f"Attempted to load: {MODEL_NAME} with 'trust_remote_code=True'.")
+        st.info("Please ensure your Hugging Face Space has enough resources (RAM, CPU). The 8B model is large.")
+        st.info("You might also want to check the 'Files and versions' tab of the model on Hugging Face Hub for any specific loading instructions or issues reported by others.")
         return None
 
 qa_pipeline = load_model_and_tokenizer()
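Note on the loading hunk above: the commit pins torch_dtype=torch.float16 and device_map="auto" regardless of hardware, but float16 inference on a CPU-only Space is often slow or unsupported for some ops, and device_map="auto" needs the accelerate package. A minimal sketch of a hardware-aware variant, not part of this commit:

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    MODEL_NAME = "aaditya/Llama3-OpenBioLLM-8B"

    def load_model(model_name: str = MODEL_NAME):
        # Use half precision only when a GPU is present; CPU float16 kernels are limited.
        dtype = torch.float16 if torch.cuda.is_available() else torch.float32
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=dtype,
            device_map="auto",  # requires accelerate to be installed
        )
        return model, tokenizer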
@@ -51,7 +62,12 @@ qa_pipeline = load_model_and_tokenizer()
 # --- Application Interface ---
 st.title("⚕️ Medical Question Answering with OpenBioLLM")
 st.markdown("Ask a medical-related question and get an answer from the OpenBioLLM model.")
 
+if qa_pipeline:
+    st.markdown(f"**Model used:** `{MODEL_NAME}` (Loaded)")
+else:
+    st.markdown(f"**Model used:** `{MODEL_NAME}` (Failed to load)")
+
 st.sidebar.header("⚠️ Disclaimer")
 st.sidebar.warning(
@@ -70,49 +86,75 @@ question = st.text_area("Enter your medical question here:", height=100, key="qu
 
 if st.button("Get Answer", key="get_answer_button"):
     if qa_pipeline and question:
-        with st.spinner("Generating answer...
+        with st.spinner("Generating answer... This may take a moment for an 8B model on CPU."):
             try:
-                #
-                #
-                # We adapt this for a direct question.
+                # For Llama 3 style models, the pipeline's tokenizer should handle the template.
+                # If not, you'd apply the template manually using tokenizer.apply_chat_template.
                 messages = [
-                    {"role": "system", "content": "You are a helpful medical information assistant.
+                    {"role": "system", "content": "You are a knowledgeable and helpful medical information assistant. Your goal is to provide clear, accurate, and concise answers to medical questions based on the information you have been trained on. Do not provide medical advice or diagnoses. State that you are an AI assistant if asked about your nature."},
                     {"role": "user", "content": question}
                 ]
 
-                #
-                #
+                # The text-generation pipeline for Llama 3 models often expects the chat formatted as a string,
+                # or it can take the list of messages if the tokenizer is correctly configured with a chat_template.
+                # Try applying the template explicitly in case the pipeline does not do it implicitly.
+                try:
+                    prompt = qa_pipeline.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+                except Exception as e:
+                    st.warning(f"Could not apply chat template directly, using a basic prompt structure: {e}")
+                    # Fallback prompt structure - this may be less effective than the proper chat template.
+                    prompt = f"System: You are a helpful medical information assistant. Please answer the user's question based on your knowledge. Provide informative and clear answers.\nUser: {question}\nAssistant:"
 
                 response = qa_pipeline(prompt)
 
-                # The output from the text-generation pipeline is usually a list of dictionaries.
                 if response and isinstance(response, list) and len(response) > 0 and "generated_text" in response[0]:
+                    generated_answer_full = response[0]["generated_text"]
+
+                    # Extract only the assistant's response after the prompt.
+                    # The prompt built by apply_chat_template typically ends with the assistant turn signal,
+                    # so the answer starts right after it; with the fallback prompt, it starts after "Assistant:".
+                    assistant_signal_templated = "<|start_header_id|>assistant<|end_header_id|>"
+
+                    # If the pipeline echoes the input, generated_answer_full is prompt + actual_answer;
+                    # otherwise the output is only the newly generated text.
+                    if generated_answer_full.startswith(prompt):
+                        answer_text = generated_answer_full[len(prompt):].strip()
                     else:
+                        # The prompt is not exactly at the beginning (e.g., the pipeline added a prefix, or it
+                        # returned only the new tokens, as Llama 3 pipelines typically do for templated input).
+                        answer_text = generated_answer_full  # Assume the pipeline output is only the new text
+
+                    # Clean up the end-of-text token if present.
+                    if qa_pipeline.tokenizer.eos_token and qa_pipeline.tokenizer.eos_token in answer_text:
+                        answer_text = answer_text.split(qa_pipeline.tokenizer.eos_token)[0].strip()
+
+                    # Sometimes other special tokens linger.
+                    answer_text = answer_text.replace("<|eot_id|>", "").strip()
 
+                    st.subheader("📝 Model's Answer:")
+                    st.info(answer_text)
                 else:
-                    st.error("The model did not return a valid response.")
+                    st.error("The model did not return a valid response structure.")
                     st.write("Raw response:", response)
 
             except Exception as e:
                 st.error(f"An error occurred during answer generation: {e}")
                 st.info("This might be due to the complexity of the question, model limitations, or resource constraints.")
+                # More detailed error logging could be added here if needed, e.g. print(traceback.format_exc()).
 
     elif not qa_pipeline:
-        st.error("Model
+        st.error("Model is not loaded. Cannot generate an answer. Please check the error messages above.")
     elif not question:
         st.warning("Please enter a question.")
 
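Note on the answer-extraction logic in the last hunk: the transformers text-generation pipeline also accepts return_full_text=False, which makes it return only the newly generated text, so the startswith(prompt) check and manual slicing become unnecessary. A minimal sketch, assuming the qa_pipeline and messages objects defined in the diff above:

    # Sketch only; qa_pipeline and messages come from the code above.
    prompt = qa_pipeline.tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    outputs = qa_pipeline(prompt, return_full_text=False)  # do not echo the prompt
    answer_text = outputs[0]["generated_text"].strip()
    # Depending on the tokenizer settings, trailing special tokens such as "<|eot_id|>"
    # may still need to be stripped.
    answer_text = answer_text.replace("<|eot_id|>", "").strip()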
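Similarly, rather than deleting "<|eot_id|>" from the output after the fact, generation can be told to stop at Llama 3's end-of-turn token. A hedged sketch, assuming the tokenizer actually defines "<|eot_id|>" (true for standard Llama 3 tokenizers):

    # Sketch only; qa_pipeline and prompt come from the code above.
    terminators = [
        qa_pipeline.tokenizer.eos_token_id,
        qa_pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]
    response = qa_pipeline(prompt, eos_token_id=terminators, return_full_text=False)
    answer_text = response[0]["generated_text"].strip()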