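"""Gradio app: sentiment analysis with sarcasm correction, plus RAG Q&A.

A user uploads a .txt file of customer comments. Each comment is scored for
sentiment and sarcasm (a sarcastic "positive" is flipped to negative) and
assigned a topic category. The processed text is chunked, embedded with
all-MiniLM-L6-v2, and stored in a FAISS index; questions are then answered
by Mistral-7B conditioned on the retrieved chunks.
"""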
import gradio as gr
import torch
import faiss
import os
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification

# Model identifiers
SENTIMENT_MODEL = "dnzblgn/Sentiment-Analysis-Customer-Reviews"
SARCASM_MODEL = "dnzblgn/Sarcasm-Detection-Customer-Reviews"
CLASSIFICATION_MODEL = "dnzblgn/Customer-Reviews-Classification"

# Sentence embedder used for the FAISS index
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Classification models (public checkpoints; slow tokenizers sidestep
# fast-tokenizer conversion issues)
sentiment_tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_MODEL, use_fast=False)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_MODEL)

sarcasm_tokenizer = AutoTokenizer.from_pretrained(SARCASM_MODEL, use_fast=False)
sarcasm_model = AutoModelForSequenceClassification.from_pretrained(SARCASM_MODEL)

classification_tokenizer = AutoTokenizer.from_pretrained(CLASSIFICATION_MODEL, use_fast=False)
classification_model = AutoModelForSequenceClassification.from_pretrained(CLASSIFICATION_MODEL)

# Mistral model for RAG answer generation
mistral_model_name = "mistralai/Mistral-7B-v0.1"
causal_tokenizer = AutoTokenizer.from_pretrained(mistral_model_name)
# float16 roughly halves memory (~14 GB for 7B params) but is poorly
# supported on CPU, so fall back to float32 there
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32
causal_model = AutoModelForCausalLM.from_pretrained(mistral_model_name, torch_dtype=dtype).to(device).eval()

# Paths and files
UPLOAD_FOLDER = "uploads"
SUMMARY_FILE = "summary.txt"
FAISS_INDEX_PATH = "faiss_index"
DOCUMENTS_FILE = "documents.txt"

if not os.path.exists(UPLOAD_FOLDER):
    os.makedirs(UPLOAD_FOLDER)

# Ids are assumed to match the classification checkpoint's label order
categories = {
    0: "Shipping and Delivery",
    1: "Customer Service",
    2: "Price and Value",
    3: "Quality and Performance",
    4: "Use and Design",
    5: "Other"
}

# Helper functions
def analyze_sentiment(sentence):
    inputs = sentiment_tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = sentiment_model(**inputs)
    # Label order follows the checkpoint's config: index 0 = Positive
    sentiment = torch.argmax(outputs.logits, dim=-1).item()
    return "Positive" if sentiment == 0 else "Negative"

def detect_sarcasm(sentence):
    inputs = sarcasm_tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = sarcasm_model(**inputs)
    # Index 1 = sarcastic
    return torch.argmax(outputs.logits, dim=-1).item() == 1

def classify_document(sentence):
    inputs = classification_tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = classification_model(**inputs)
    return categories[torch.argmax(outputs.logits, dim=-1).item()]

def preprocess_summary(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        lines = file.readlines()

    chunks = []
    current_chunk = []

    for line in lines:
        line = line.strip()
        if not line:
            continue
        if line.endswith(":") and current_chunk:
            chunks.append("\n".join(current_chunk))
            current_chunk = []
        current_chunk.append(line)

    if current_chunk:
        chunks.append("\n".join(current_chunk))

    return chunks
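
# Illustrative input for preprocess_summary (each header line ending with a
# colon starts a new chunk):
#
#   Shipping and Delivery:
#   [Negative] Package arrived two weeks late.
#   Customer Service:
#   [Positive] Support resolved my issue quickly.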

def create_faiss_index(chunks):
    # Encode all chunks in one batch; normalized vectors make L2 distance
    # rank-equivalent to cosine similarity
    embeddings_np = embedding_model.encode(chunks, normalize_embeddings=True)
    embedding_dimension = embeddings_np.shape[1]

    faiss_index = faiss.IndexFlatL2(embedding_dimension)
    faiss_index.add(np.asarray(embeddings_np, dtype=np.float32))
    faiss.write_index(faiss_index, FAISS_INDEX_PATH)

    with open(DOCUMENTS_FILE, "w", encoding="utf-8") as doc_file:
        for chunk in chunks:
            doc_file.write(chunk + "\n--END--\n")

def handle_uploaded_file(file):
    # Depending on the Gradio version, `file` may arrive as a temp-file path,
    # a file wrapper with a .name attribute, or the raw text itself
    content = file if isinstance(file, str) else file.name
    if os.path.exists(content):
        with open(content, "r", encoding="utf-8") as f:
            content = f.read()

    # Keep a copy of the raw upload
    file_path = os.path.join(UPLOAD_FOLDER, "uploaded_comments.txt")
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(content)

    results = []
    for comment in content.splitlines():
        comment = comment.strip()
        if not comment:
            continue
        sentiment = analyze_sentiment(comment)
        # A sarcastic "positive" comment is treated as negative
        if sentiment == "Positive" and detect_sarcasm(comment):
            sentiment = "Negative"
        category = classify_document(comment)
        results.append({"comment": comment, "sentiment": sentiment, "category": category})

    # Group the results into the category-headed summary layout that
    # preprocess_summary expects, then chunk and index it
    with open(SUMMARY_FILE, "w", encoding="utf-8") as summary:
        for name in categories.values():
            in_category = [r for r in results if r["category"] == name]
            if not in_category:
                continue
            summary.write(name + ":\n")
            for r in in_category:
                summary.write(f"[{r['sentiment']}] {r['comment']}\n")

    chunks = preprocess_summary(SUMMARY_FILE)
    create_faiss_index(chunks)

    return f"File uploaded and processed successfully ({len(results)} comments indexed)."

def causal_generate_response(prompt):
    inputs = causal_tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        # max_new_tokens bounds the answer length regardless of prompt size
        outputs = causal_model.generate(**inputs, max_new_tokens=256, do_sample=True,
                                        temperature=0.7, pad_token_id=causal_tokenizer.eos_token_id)
    # Decode only the newly generated tokens, not the echoed prompt
    return causal_tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)

def query_chatbot(query):
    top_k = 5
    if not os.path.exists(FAISS_INDEX_PATH):
        return "Please upload and process a file before asking questions."
    faiss_index = faiss.read_index(FAISS_INDEX_PATH)

    with open(DOCUMENTS_FILE, "r", encoding="utf-8") as doc_file:
        documents = doc_file.read().split("\n--END--\n")

    query_embedding = embedding_model.encode([query], normalize_embeddings=True)
    distances, indices = faiss_index.search(np.array(query_embedding), top_k)

    # FAISS returns -1 for unfilled slots when the index holds fewer than top_k entries
    relevant_docs = [documents[idx] for idx in indices[0] if 0 <= idx < len(documents)]
    context = "\n\n".join(relevant_docs[:top_k])

    # Custom Prompt for RAG
    final_prompt = (
        f"You are a business data analyst. Analyze the feedback data and identify the overall sentiment trends. "
        f"Focus on determining whether positive feedback or negative feedback dominates in each category, and avoid overstating less significant trends. "
        f"Provide clear, data-driven insights.\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {query}\n\n"
        f"Your Answer (based on the data and context):"
    )

    return causal_generate_response(final_prompt)
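
# Example (after an index has been built from an uploaded file):
#   query_chatbot("Which category receives the most negative feedback?")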


# Gradio interface
with gr.Blocks() as interface:
    gr.Markdown("# Sentiment Analysis Powered by Sarcasm Detection")
    with gr.Row():
        upload = gr.File(label="Upload .txt File")
        chatbot_output = gr.Textbox(label="Processing Report", lines=10, interactive=False)

    upload_btn = gr.Button("Process File")

    with gr.Row():
        query_input = gr.Textbox(label="Ask a Question")
        answer_output = gr.Textbox(label="Answer", lines=5, interactive=False)

    query_btn = gr.Button("Get Answer")

    # Bind the handlers directly; the pass-through wrappers add nothing
    upload_btn.click(handle_uploaded_file, inputs=upload, outputs=chatbot_output)
    query_btn.click(query_chatbot, inputs=query_input, outputs=answer_output)

# Run Gradio app
if __name__ == "__main__":
    interface.launch()
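    # To expose the app publicly (e.g. from a notebook or remote host),
    # interface.launch(share=True) is an alternative.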