# Import modules and classes
from llama_index.core import VectorStoreIndex, StorageContext, load_index_from_storage
from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings, NVIDIARerank
from llama_index.core.indices.query.query_transform import HyDEQueryTransform
from llama_index.core.query_engine import TransformQueryEngine
from langchain_core.documents import Document as LangDocument
from llama_index.core import Document as LlamaDocument
from llama_index.core import Settings
from llama_parse import LlamaParse
import streamlit as st
import os
# Read the API keys from environment variables
nvidia_api_key = os.getenv("NVIDIA_KEY")
llamaparse_api_key = os.getenv("PARSE_KEY")
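# Both keys must exist in the environment before the app starts, e.g. on Linux/macOS:
#   export NVIDIA_KEY=<your NVIDIA API key>
#   export PARSE_KEY=<your LlamaParse API key>
# (on Windows, use `set` or `$env:` instead of `export`)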
# Initialize ChatNVIDIA, NVIDIARerank, and NVIDIAEmbeddings
client = ChatNVIDIA(
    model="meta/llama-3.1-8b-instruct",
    api_key=nvidia_api_key,
    temperature=0.2,
    top_p=0.7,
    max_tokens=1024
)

embed_model = NVIDIAEmbeddings(
    model="nvidia/nv-embedqa-e5-v5",
    api_key=nvidia_api_key,
    truncate="NONE"
)

reranker = NVIDIARerank(
    model="nvidia/nv-rerankqa-mistral-4b-v3",
    api_key=nvidia_api_key,
)

# Set the NVIDIA models globally
Settings.embed_model = embed_model
Settings.llm = client
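# Optional sanity check before indexing (commented out; the prompt strings are only examples):
# print(len(embed_model.embed_query("connectivity test")))   # prints the embedding dimension
# print(client.invoke("Reply with the single word OK").content)  # prints a short completion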
| # Parse the local PDF document | |
| parser = LlamaParse( | |
| api_key=llamaparse_api_key, | |
| result_type="markdown", | |
| verbose=True | |
| ) | |
| documents = parser.load_data("C:\\Users\\user\\Documents\\Jan 2024\\Projects\\RAGs\\Files\\PhilDataset.pdf") | |
| print("Document Parsed") | |
# Split parsed text into chunks for the embedding model
# (the length budget is measured in characters plus spaces, not actual tokens,
# which keeps each chunk comfortably within the embedding model's limit)
def split_text(text, max_tokens=512):
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0
    for word in words:
        word_length = len(word)
        if current_length + word_length + 1 > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_length = word_length + 1
        else:
            current_chunk.append(word)
            current_length += word_length + 1
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
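# Illustrative behaviour of the chunker (toy values):
# split_text("a b c", max_tokens=4)  ->  ["a b", "c"]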
| # Generate embeddings for document chunks | |
| all_embeddings = [] | |
| all_documents = [] | |
| for doc in documents: | |
| text_chunks = split_text(doc.text) | |
| for chunk in text_chunks: | |
| embedding = embed_model.embed_query(chunk) | |
| all_embeddings.append(embedding) | |
| all_documents.append(LlamaDocument(text=chunk)) | |
| print("Embeddings generated") | |
| # Create and persist index with NVIDIAEmbeddings | |
| index = VectorStoreIndex.from_documents(all_documents, embeddings=all_embeddings, embed_model=embed_model) | |
| index.set_index_id("vector_index") | |
| index.storage_context.persist("./storage") | |
| print("Index created") | |
# Load index from storage
storage_context = StorageContext.from_defaults(persist_dir="storage")
index = load_index_from_storage(storage_context, index_id="vector_index")
print("Index loaded")

# Initialize HyDEQueryTransform and TransformQueryEngine
hyde = HyDEQueryTransform(include_original=True)
query_engine = index.as_query_engine()
hyde_query_engine = TransformQueryEngine(query_engine, hyde)
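# HyDE asks the configured LLM (Settings.llm) to write a hypothetical answer to the
# question and retrieves against that text; include_original=True keeps the original
# query string alongside the hypothetical document.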
# Query the index with HyDE and use the output as LLM context
def query_model_with_context(question):
    # Generate a hypothetical document using HyDE
    hyde_response = hyde_query_engine.query(question)
    print(f"HyDE Response: {hyde_response}")
    if isinstance(hyde_response, str):
        hyde_query = hyde_response
    else:
        hyde_query = hyde_response.response

    # Use the hypothetical document to retrieve relevant documents
    retriever = index.as_retriever(similarity_top_k=3)
    nodes = retriever.retrieve(hyde_query)
    for node in nodes:
        print(node)

    # Rerank the retrieved documents
    ranked_documents = reranker.compress_documents(
        query=question,
        documents=[LangDocument(page_content=node.text) for node in nodes]
    )

    # Print the most relevant node
    print(f"Most relevant node: {ranked_documents[0].page_content}")

    # Use the most relevant node as context
    context = ranked_documents[0].page_content

    # Send context and question to the client (NVIDIA Llama 3.1 8B model)
    messages = [
        {"role": "system", "content": context},
        {"role": "user", "content": str(question)}
    ]
    completion = client.stream(messages)

    # Accumulate the streamed chunks into a single response string
    response_text = ""
    for chunk in completion:
        if chunk.content is not None:
            response_text += chunk.content
    return response_text
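# The function can also be exercised without the UI, e.g. from a Python shell
# (the question string below is only a placeholder):
# print(query_model_with_context("What topics does the dataset cover?"))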
# Streamlit UI
st.title("Chat with HyDE + Rerank RAG")

question = st.text_input("Enter your question:")

if st.button("Submit"):
    if question:
        st.write("**RAG Response:**")
        response = query_model_with_context(question)
        st.write(response)
    else:
        st.warning("Please enter a question.")