Spaces:

mohamedrasheqA
/

Deepseek-R1-PF

Runtime error

App Files Files Community

MRasheq commited on Jan 31

Commit

07be9b4

1 Parent(s): 1332eb3

Third Commit

Browse files

Files changed (2) hide show

app.py +189 -124
requirements.txt +13 -2

app.py CHANGED Viewed

@@ -1,157 +1,222 @@
 import gradio as gr
 import torch
-from typing import List
 from sentence_transformers import SentenceTransformer
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import psycopg2
 import numpy as np
 class RAGPipeline:
     def __init__(self):
-        # Database connection string
         self.connection_string = "postgresql://Data_owner:JsxygNDC15IO@ep-cool-hill-a5k13m05-pooler.us-east-2.aws.neon.tech/Data?sslmode=require"
-        # Initialize embedding model
         self.embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
-        # Initialize LLM
         self.llm_model = AutoModelForCausalLM.from_pretrained(
             "deepseek-ai/DeepSeek-R1",
             trust_remote_code=True,
-            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
         )
         self.llm_tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1", trust_remote_code=True)
-        # Move model to GPU if available
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        self.llm_model = self.llm_model.to(self.device)
-        # Initialize prompt template
-        self.prompt_template = """
-        Use the following context to answer the question. If you cannot answer the question based on the context, say so.
-        Context: {context}
-        Question: {question}
-        Answer: Let me help you with that.
-        """
     def generate_embedding(self, text: str) -> List[float]:
-        """Generate embeddings for input text."""
-        embedding = self.embedding_model.encode(text)
-        return embedding.tolist()
-    def similarity_search(self, query_embedding: List[float], top_k: int = 3) -> List[dict]:
-        """Perform similarity search in PostgreSQL using vector comparison."""
-        with psycopg2.connect(self.connection_string) as conn:
-            with conn.cursor() as cur:
-                embedding_array = np.array(query_embedding)
-                query = """
-                SELECT text, title, url,
-                       1 - (vector <=> %s) as similarity
-                FROM bents
-                ORDER BY vector <=> %s
-                LIMIT %s;
-                """
-                cur.execute(query, (embedding_array.tolist(), embedding_array.tolist(), top_k))
-                results = cur.fetchall()
-                similar_docs = [
-                    {
-                        'text': row[0],
-                        'title': row[1],
-                        'url': row[2],
-                        'similarity': row[3]
-                    }
-                    for row in results
-                ]
-                return similar_docs
-    def generate_response(self, query: str, context: str,
-                         max_tokens: int = 512,
-                         temperature: float = 0.7,
-                         top_p: float = 0.95) -> str:
-        """Generate response using the LLM."""
-        prompt = self.prompt_template.format(context=context, question=query)
-        inputs = self.llm_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to(self.device)
-        with torch.no_grad():
-            outputs = self.llm_model.generate(
-                **inputs,
-                max_new_tokens=max_tokens,
-                do_sample=True,
-                temperature=temperature,
-                top_p=top_p,
-                pad_token_id=self.llm_tokenizer.eos_token_id,
-            )
-        response = self.llm_tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
-        return response.strip()
-    def process_query(self,
-                     query: str,
-                     max_tokens: int = 512,
-                     temperature: float = 0.7,
-                     top_p: float = 0.95,
-                     top_k: int = 3) -> dict:
-        """Process user query through the complete RAG pipeline."""
-        query_embedding = self.generate_embedding(query)
-        similar_docs = self.similarity_search(query_embedding, top_k=top_k)
-        context = "\n".join([doc['text'] for doc in similar_docs])
-        response = self.generate_response(query, context, max_tokens, temperature, top_p)
-        # Format sources for display
-        sources = "\n\nSources:\n" + "\n".join([
             f"- {doc['title']} (Similarity: {doc['similarity']:.2f})\n  URL: {doc['url']}"
-            for doc in similar_docs
         ])
-        return response + sources
-# Initialize RAG pipeline globally
-rag_pipeline = RAGPipeline()
-def process_message(
-    message: str,
-    history: List[tuple[str, str]],
-    max_tokens: int,
-    temperature: float,
-    top_p: float,
-    top_k: int
-) -> str:
-    """Process message and maintain chat history."""
-    try:
-        response = rag_pipeline.process_query(
-            message,
             max_tokens=max_tokens,
             temperature=temperature,
             top_p=top_p,
-            top_k=top_k
         )
-        return response
-    except Exception as e:
-        return f"An error occurred: {str(e)}"
-# Create Gradio interface
-demo = gr.ChatInterface(
-    process_message,
-    additional_inputs=[
-        gr.Slider(minimum=64, maximum=2048, value=512, step=64, label="Max tokens"),
-        gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
-        gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Top-k documents"),
-    ],
-    title="RAG-Powered Chat Assistant",
-    description="""This chat interface uses RAG (Retrieval Augmented Generation) to provide informed responses
-                   based on the content in the database. The assistant retrieves relevant documents and uses them
-                   as context for generating responses.""",
-    theme="soft"
-)
-# Launch the interface
 if __name__ == "__main__":
-    demo.launch(share=True)  # Set share=False in production

 import gradio as gr
 import torch
+from typing import List, Dict, Any
 from sentence_transformers import SentenceTransformer
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import psycopg2
 import numpy as np
+from dataclasses import dataclass
+from datetime import datetime
+import json
+import os
+@dataclass
+class ChatConfig:
+    max_tokens: int = 512
+    temperature: float = 0.7
+    top_p: float = 0.95
+    top_k: int = 3
+    system_prompt: str = "You are a helpful AI assistant that provides accurate information based on the given context."
 class RAGPipeline:
     def __init__(self):
         self.connection_string = "postgresql://Data_owner:JsxygNDC15IO@ep-cool-hill-a5k13m05-pooler.us-east-2.aws.neon.tech/Data?sslmode=require"
         self.embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+        self.load_llm()
+        self.chat_config = ChatConfig()
+    def load_llm(self):
         self.llm_model = AutoModelForCausalLM.from_pretrained(
             "deepseek-ai/DeepSeek-R1",
             trust_remote_code=True,
+            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+            load_in_4bit=True,
+            device_map="auto",
+            quantization_config={
+                "load_in_4bit": True,
+                "bnb_4bit_compute_dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
+                "bnb_4bit_quant_type": "nf4",
+                "bnb_4bit_use_double_quant": True
+            }
         )
         self.llm_tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1", trust_remote_code=True)
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
     def generate_embedding(self, text: str) -> List[float]:
+        return self.embedding_model.encode(text).tolist()
+    def similarity_search(self, query_embedding: List[float]) -> List[dict]:
+        try:
+            with psycopg2.connect(self.connection_string) as conn:
+                with conn.cursor() as cur:
+                    embedding_array = np.array(query_embedding)
+                    query = """
+                    SELECT text, title, url,
+                           1 - (vector <=> %s) as similarity
+                    FROM bents
+                    ORDER BY vector <=> %s
+                    LIMIT %s;
+                    """
+                    cur.execute(query, (embedding_array.tolist(), embedding_array.tolist(), self.chat_config.top_k))
+                    results = cur.fetchall()
+                    return [
+                        {
+                            'text': row[0],
+                            'title': row[1],
+                            'url': row[2],
+                            'similarity': row[3]
+                        }
+                        for row in results
+                    ]
+        except Exception as e:
+            print(f"Database error: {str(e)}")
+            return []
+    def format_conversation(self, messages: List[Dict[str, str]]) -> str:
+        formatted = f"System: {self.chat_config.system_prompt}\n\n"
+        for msg in messages:
+            role = msg["role"].capitalize()
+            content = msg["content"]
+            formatted += f"{role}: {content}\n\n"
+        return formatted.strip()
+    def generate_response(self, messages: List[Dict[str, str]], context: str) -> str:
+        try:
+            conversation = self.format_conversation(messages)
+            context_prompt = f"Context:\n{context}\n\nCurrent conversation:\n{conversation}\n\nAssistant:"
+            inputs = self.llm_tokenizer(context_prompt, return_tensors="pt", truncation=True, max_length=2048).to(self.device)
+            with torch.no_grad():
+                outputs = self.llm_model.generate(
+                    **inputs,
+                    max_new_tokens=self.chat_config.max_tokens,
+                    do_sample=True,
+                    temperature=self.chat_config.temperature,
+                    top_p=self.chat_config.top_p,
+                    pad_token_id=self.llm_tokenizer.eos_token_id,
+                )
+            response = self.llm_tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
+            return response.strip()
+        except Exception as e:
+            return f"Error generating response: {str(e)}"
+    def process_query(self, message: str, chat_history: List[Dict[str, str]]) -> tuple[str, List[dict]]:
+        query_embedding = self.generate_embedding(message)
+        similar_docs = self.similarity_search(query_embedding)
+        context = "\n".join([doc['text'] for doc in similar_docs])
+        messages = chat_history + [{"role": "user", "content": message}]
+        response = self.generate_response(messages, context)
+        return response, similar_docs
+class GradioRAGChat:
+    def __init__(self):
+        self.rag = RAGPipeline()
+        self.chat_history = []
+    def process_message(self, message: str, history: List[tuple[str, str]]) -> tuple[str, List[dict]]:
+        # Convert Gradio history format to our format
+        chat_history = []
+        for user_msg, assistant_msg in history:
+            if user_msg:
+                chat_history.append({"role": "user", "content": user_msg})
+            if assistant_msg:
+                chat_history.append({"role": "assistant", "content": assistant_msg})
+        response, sources = self.rag.process_query(message, chat_history)
+        # Format response with sources
+        formatted_sources = "\n\nSources:\n" + "\n".join([
             f"- {doc['title']} (Similarity: {doc['similarity']:.2f})\n  URL: {doc['url']}"
+            for doc in sources
         ])
+        return response + formatted_sources
+    def update_config(
+        self,
+        max_tokens: int,
+        temperature: float,
+        top_p: float,
+        top_k: int,
+        system_prompt: str
+    ) -> str:
+        self.rag.chat_config = ChatConfig(
             max_tokens=max_tokens,
             temperature=temperature,
             top_p=top_p,
+            top_k=top_k,
+            system_prompt=system_prompt
         )
+        return f"Configuration updated successfully at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
+    def create_interface(self):
+        with gr.Blocks(theme=gr.themes.Soft()) as interface:
+            gr.Markdown("# RAG-Powered Chat Assistant")
+            with gr.Tabs():
+                with gr.Tab("Chat"):
+                    chatbot = gr.ChatInterface(
+                        fn=self.process_message,
+                        title="",
+                        description="Ask questions about the content in the database."
+                    )
+                with gr.Tab("Configuration"):
+                    with gr.Group():
+                        gr.Markdown("### Model Parameters")
+                        with gr.Row():
+                            max_tokens = gr.Slider(
+                                minimum=64, maximum=2048, value=512, step=64,
+                                label="Max Tokens"
+                            )
+                            temperature = gr.Slider(
+                                minimum=0.1, maximum=2.0, value=0.7, step=0.1,
+                                label="Temperature"
+                            )
+                        with gr.Row():
+                            top_p = gr.Slider(
+                                minimum=0.1, maximum=1.0, value=0.95, step=0.05,
+                                label="Top-p"
+                            )
+                            top_k = gr.Slider(
+                                minimum=1, maximum=10, value=3, step=1,
+                                label="Top-k Documents"
+                            )
+                        system_prompt = gr.Textbox(
+                            value=self.rag.chat_config.system_prompt,
+                            label="System Prompt",
+                            lines=3
+                        )
+                        update_btn = gr.Button("Update Configuration")
+                        config_status = gr.Textbox(label="Status", interactive=False)
+                        update_btn.click(
+                            fn=self.update_config,
+                            inputs=[max_tokens, temperature, top_p, top_k, system_prompt],
+                            outputs=[config_status]
+                        )
+            gr.Markdown("""
+            ### About
+            This chat interface uses RAG (Retrieval Augmented Generation) to provide informed responses based on the content in the database.
+            The assistant retrieves relevant documents and uses them as context for generating responses.
+            - Use the Chat tab for asking questions
+            - Use the Configuration tab to adjust model parameters
+            """)
+        return interface
+def main():
+    chat_app = GradioRAGChat()
+    interface = chat_app.create_interface()
+    interface.launch(share=False)  # Set share=True for public URL
 if __name__ == "__main__":
+    main()

requirements.txt CHANGED Viewed

@@ -5,6 +5,11 @@ torch>=2.0.0
 transformers>=4.36.0
 sentence-transformers>=2.2.2
 # Database
 psycopg2-binary>=2.9.9
 pgvector>=0.2.3
@@ -15,9 +20,15 @@ pandas>=2.0.0
 # Deep learning
 accelerate>=0.24.0
-bitsandbytes>=0.41.0
 safetensors>=0.4.0
 # Utilities
 tqdm>=4.65.0
-python-dotenv>=1.0.0

 transformers>=4.36.0
 sentence-transformers>=2.2.2
+# Web UI
+gradio>=4.13.0
+uvicorn>=0.27.0
+fastapi>=0.109.0
 # Database
 psycopg2-binary>=2.9.9
 pgvector>=0.2.3
 # Deep learning
 accelerate>=0.24.0
+bitsandbytes>=0.41.3
 safetensors>=0.4.0
+transformers>=4.36.2  # Specific version for compatibility
 # Utilities
 tqdm>=4.65.0
+python-dotenv>=1.0.0
+# Optional: for better performance
+httpx>=0.26.0
+websockets>=12.0
+aiohttp>=3.9.0