|
|
import streamlit as st |
|
|
import pandas as pd |
|
|
import matplotlib.pyplot as plt |
|
|
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer |
|
|
from wordcloud import WordCloud |
|
|
import os |
|
|
import shutil |
|
|
|
|
|
|
|
|
# Configuration for the locally stored fine-tuned sentiment model.
MODEL_CONFIG = {
    # Directory where the TripAdvisor sentiment checkpoint is expected.
    "local_path": "./models/tripadvisor_sentiment_model",
    # Artifact files checked by verify_model_files(); a model.safetensors
    # checkpoint is accepted in place of pytorch_model.bin (see that function).
    "required_files": [
        "config.json",
        "pytorch_model.bin",
        "tokenizer.json",
        "vocab.txt",
        "special_tokens_map.json",
        "tokenizer_config.json"
    ]
}
|
|
|
|
|
def setup_model_directory():
    """Create the local model directory and seed it with a README.

    Ensures ``MODEL_CONFIG["local_path"]`` exists, then writes a README.md
    listing the required model files — only if one is not already present,
    so user edits are never clobbered.
    """
    os.makedirs(MODEL_CONFIG["local_path"], exist_ok=True)

    readme_path = os.path.join(MODEL_CONFIG["local_path"], "README.md")
    if not os.path.exists(readme_path):
        lines = [
            "# TripAdvisor Sentiment Model\n\n",
            "Place the following files in this directory:\n",
        ]
        lines.extend(f"- {file}\n" for file in MODEL_CONFIG["required_files"])
        # Explicit encoding keeps the README byte-stable across
        # platforms/locales instead of depending on the system default.
        with open(readme_path, "w", encoding="utf-8") as f:
            f.writelines(lines)
|
|
|
|
|
def verify_model_files():
    """Return the required model files missing from the model directory.

    A ``model.safetensors`` checkpoint is accepted as a substitute for
    ``pytorch_model.bin``, so the latter is not reported as missing when
    a safetensors file is present.

    Returns:
        list[str]: names of missing files; empty when the model is complete.
    """
    base = MODEL_CONFIG["local_path"]
    # Loop-invariant: stat the safetensors substitute once, not per file.
    has_safetensors = os.path.exists(os.path.join(base, "model.safetensors"))

    missing_files = []
    for file in MODEL_CONFIG["required_files"]:
        if os.path.exists(os.path.join(base, file)):
            continue
        if file == "pytorch_model.bin" and has_safetensors:
            continue
        missing_files.append(file)
    return missing_files
|
|
|
|
|
@st.cache_resource
def load_models():
    """Build the sentiment-analysis pipeline from the local checkpoint.

    Ensures the model directory exists and all required artifact files are
    present, then returns a transformers text-classification pipeline.
    Returns ``None`` (after surfacing setup instructions in the UI) when
    files are missing or loading fails.
    """
    setup_model_directory()
    missing_files = verify_model_files()

    # Bail out early with guidance when the checkpoint is incomplete.
    if missing_files:
        st.warning(f"Missing model files: {', '.join(missing_files)}")
        show_model_instructions(missing_files)
        return None

    try:
        st.success("Loading local sentiment model...")
        local_path = MODEL_CONFIG["local_path"]
        classifier = pipeline(
            "text-classification",
            model=AutoModelForSequenceClassification.from_pretrained(local_path),
            tokenizer=AutoTokenizer.from_pretrained(local_path),
        )
        return classifier
    except Exception as e:
        # Loading can fail for many reasons (corrupt weights, version
        # mismatch); report the error and fall back to setup instructions.
        st.error(f"Model loading failed: {str(e)}")
        show_model_instructions()
        return None
|
|
|
|
|
def show_model_instructions(missing_files=None):
    """Render an expandable panel walking the user through model setup.

    Args:
        missing_files: optional list of file names to highlight as
            currently absent from the model directory.
    """
    with st.expander("Model Setup Instructions", expanded=True):
        st.markdown("""
        ### Option 1: Set Up Local Model

        1. **Create the model folder**:
        ```bash
        mkdir -p models/tripadvisor_sentiment_model
        ```

        2. **Add these required files** to the folder:
        """)

        file_groups = (
            ("**Essential Files**",
             ["config.json", "pytorch_model.bin", "model.safetensors"]),
            ("**Tokenizer Files**",
             ["tokenizer.json", "vocab.txt", "special_tokens_map.json", "tokenizer_config.json"]),
        )
        # Render one file group per column, side by side.
        for column, (heading, names) in zip(st.columns(2), file_groups):
            with column:
                st.markdown(heading)
                for name in names:
                    st.write(f"- {name}")

        if missing_files:
            st.warning(f"Currently missing: {', '.join(missing_files)}")

        st.markdown("""
        ### Option 2: Use Test Model
        For testing, you can use this sample model:
        ```python
        MODEL_CONFIG["hf_model_name"] = "distilbert-base-uncased-finetuned-sst-2-english"
        ```
        """)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---- Streamlit app entry point (runs top-to-bottom on every rerun) ----
st.title("TripAdvisor Review Analysis System")

# Load the cached sentiment pipeline before accepting any input.
model = load_models()
if model is None:
    # load_models has already rendered setup instructions; halt this run.
    st.error("Please set up the sentiment model first")
    st.stop()

# Accept review data for analysis in either CSV or Parquet form.
uploaded_file = st.file_uploader("Upload review data (CSV or Parquet)", type=["csv", "parquet"])
|
|
if uploaded_file: |
|
|
df = load_file(uploaded_file) |
|
|
|
|
|
if df is not None: |
|
|
|