import gradio as gr
import torch
import faiss
import os
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification
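# Dependency note: faiss is installed as the faiss-cpu (or faiss-gpu) pip package;
# torch, transformers, sentence-transformers, and gradio cover the remaining imports.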
# Model paths
SENTIMENT_MODEL = "dnzblgn/Sentiment-Analysis-Customer-Reviews"
SARCASM_MODEL = "dnzblgn/Sarcasm-Detection-Customer-Reviews"
CLASSIFICATION_MODEL = "dnzblgn/Customer-Reviews-Classification"

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Fine-tuned classifier models (no auth token, no fast tokenizer)
sentiment_tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_MODEL, use_fast=False)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_MODEL)
sarcasm_tokenizer = AutoTokenizer.from_pretrained(SARCASM_MODEL, use_fast=False)
sarcasm_model = AutoModelForSequenceClassification.from_pretrained(SARCASM_MODEL)
classification_tokenizer = AutoTokenizer.from_pretrained(CLASSIFICATION_MODEL, use_fast=False)
classification_model = AutoModelForSequenceClassification.from_pretrained(CLASSIFICATION_MODEL)
# Mistral model for RAG answer generation
mistral_model_name = "mistralai/Mistral-7B-v0.1"
causal_tokenizer = AutoTokenizer.from_pretrained(mistral_model_name)
causal_model = AutoModelForCausalLM.from_pretrained(mistral_model_name, torch_dtype=torch.float16).eval()
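# Note: float16 weights assume enough RAM for a 7B model, and generation on CPU
# is slow. If a GPU is available, moving the model over is a reasonable option:
#   if torch.cuda.is_available():
#       causal_model = causal_model.to("cuda")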
# Paths and files
UPLOAD_FOLDER = "uploads"
SUMMARY_FILE = "summary.txt"
FAISS_INDEX_PATH = "faiss_index"
DOCUMENTS_FILE = "documents.txt"
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
categories = {
    0: "Shipping and Delivery",
    1: "Customer Service",
    2: "Price and Value",
    3: "Quality and Performance",
    4: "Use and Design",
    5: "Other",
}
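# These indices are assumed to match the label order of the fine-tuned
# classification head; the argmax over its logits is looked up directly here.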
# Helper functions
def analyze_sentiment(sentence):
    inputs = sentiment_tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = sentiment_model(**inputs)
    logits = outputs.logits
    sentiment = torch.argmax(logits, dim=-1).item()
    return "Positive" if sentiment == 0 else "Negative"
def detect_sarcasm(sentence):
    inputs = sarcasm_tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = sarcasm_model(**inputs)
    logits = outputs.logits
    sarcasm = torch.argmax(logits, dim=-1).item()
    return sarcasm == 1  # label 1 = sarcastic
def classify_document(sentence):
    inputs = classification_tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = classification_model(**inputs)
    logits = outputs.logits
    category = torch.argmax(logits, dim=-1).item()
    return categories[category]
def preprocess_summary(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        lines = file.readlines()

    chunks = []
    current_chunk = []
    for line in lines:
        line = line.strip()
        if not line:
            continue
        # A line ending with ":" starts a new section; flush the previous chunk.
        if line.endswith(":") and current_chunk:
            chunks.append("\n".join(current_chunk))
            current_chunk = []
        current_chunk.append(line)
    if current_chunk:
        chunks.append("\n".join(current_chunk))
    return chunks
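# Expected layout for the chunked file: a heading line ending with ":" opens a
# section and the lines under it belong to that section, e.g. (hypothetical):
#   Shipping and Delivery:
#   Fast delivery, arrived a day early.
# Each heading plus its following lines becomes one retrieval chunk.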
def create_faiss_index(chunks):
    # Encode all chunks in one batch; encode() returns a float32 ndarray that
    # FAISS can ingest directly.
    embeddings_np = embedding_model.encode(chunks, normalize_embeddings=True)
    embedding_dimension = embeddings_np.shape[1]
    faiss_index = faiss.IndexFlatL2(embedding_dimension)
    faiss_index.add(embeddings_np)
    faiss.write_index(faiss_index, FAISS_INDEX_PATH)
    with open(DOCUMENTS_FILE, "w", encoding="utf-8") as doc_file:
        for chunk in chunks:
            doc_file.write(chunk + "\n--END--\n")
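# Since the embeddings are unit-normalized, L2 distance ranks neighbors the same
# way as cosine similarity, so IndexFlatL2 works here; faiss.IndexFlatIP on the
# same vectors would yield cosine scores directly.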
def handle_uploaded_file(file):
    # Save the contents directly from the NamedString.
    file_path = os.path.join(UPLOAD_FOLDER, "uploaded_comments.txt")
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(file)  # `file` arrives as the uploaded file's text content
    with open(file_path, "r", encoding="utf-8") as f:
        comments = f.readlines()

    results = []
    for comment in comments:
        comment = comment.strip()
        if not comment:
            continue
        sentiment = analyze_sentiment(comment)
        # A sarcastic "positive" comment is treated as negative.
        if sentiment == "Positive" and detect_sarcasm(comment):
            sentiment = "Negative"
        category = classify_document(comment)
        results.append({"comment": comment, "sentiment": sentiment, "category": category})

    chunks = preprocess_summary(file_path)
    create_faiss_index(chunks)

    # Surface the per-comment analysis in the processing report rather than discarding it.
    report = [f"[{r['sentiment']}] ({r['category']}) {r['comment']}" for r in results]
    return "File uploaded and processed successfully.\n\n" + "\n".join(report)
def causal_generate_response(prompt):
    inputs = causal_tokenizer(prompt, return_tensors="pt")  # default CPU
    with torch.no_grad():
        # max_new_tokens bounds the answer itself; the original max_length=500
        # also counted the (long) RAG prompt and could leave no room to generate.
        outputs = causal_model.generate(**inputs, max_new_tokens=256, do_sample=True,
                                        temperature=0.7, pad_token_id=causal_tokenizer.eos_token_id)
    # Decode only the newly generated tokens so the prompt is not echoed back.
    return causal_tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
def query_chatbot(query):
    top_k = 5
    faiss_index = faiss.read_index(FAISS_INDEX_PATH)
    with open(DOCUMENTS_FILE, "r", encoding="utf-8") as doc_file:
        documents = doc_file.read().split("\n--END--\n")

    query_embedding = embedding_model.encode([query], normalize_embeddings=True)
    distances, indices = faiss_index.search(np.array(query_embedding), top_k)
    # FAISS pads with -1 when fewer than top_k vectors exist, so filter those out.
    relevant_docs = [documents[idx] for idx in indices[0] if 0 <= idx < len(documents)]
    context = "\n\n".join(relevant_docs)

    # Custom prompt for RAG
    final_prompt = (
        "You are a business data analyst. Analyze the feedback data and identify the overall sentiment trends. "
        "Focus on determining whether positive or negative feedback dominates in each category, and avoid overstating less significant trends. "
        "Provide clear, data-driven insights.\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {query}\n\n"
        "Your Answer (based on the data and context):"
    )
    return causal_generate_response(final_prompt)
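# The index and document store are re-read from disk on every query, which keeps
# the handler stateless across uploads; for larger corpora, loading them once and
# caching (invalidated on re-upload) would avoid the repeated disk reads.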
# Gradio interface
with gr.Blocks() as interface:
    gr.Markdown("# Sentiment Analysis Powered by Sarcasm Detection")

    with gr.Row():
        upload = gr.File(label="Upload .txt File")
        chatbot_output = gr.Textbox(label="Processing Report", lines=10, interactive=False)
    upload_btn = gr.Button("Process File")

    with gr.Row():
        query_input = gr.Textbox(label="Ask a Question")
        answer_output = gr.Textbox(label="Answer", lines=5, interactive=False)
    query_btn = gr.Button("Get Answer")

    upload_btn.click(handle_uploaded_file, inputs=upload, outputs=chatbot_output)
    query_btn.click(query_chatbot, inputs=query_input, outputs=answer_output)
# Run Gradio app
if __name__ == "__main__":
    interface.launch()
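# For a temporary public link when running outside Hugging Face Spaces, Gradio
# supports interface.launch(share=True).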