# 📊 Evaluasi RAG Politeknik Negeri Padang dengan LangSmith

Evaluasi menggunakan metrik:
- **CP**: Context Precision
- **CR**: Context Recall
- **F1**: Rata-rata harmonik dari CP dan CR
- **AR**: Answer Relevance (semantic similarity)

Menggunakan LangChain + Replicate + Vectorstore (FAISS), dan logging ke LangSmith.


In [6]:
pip install openai

Collecting openai
  Downloading openai-1.88.0-py3-none-any.whl.metadata (25 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Using cached distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.10.0-cp312-cp312-win_amd64.whl.metadata (5.3 kB)
Downloading openai-1.88.0-py3-none-any.whl (734 kB)
   ---------------------------------------- 0.0/734.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/734.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/734.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/734.3 kB ? eta -:--:--
   -------------- ------------------------- 262.1/734.3 kB ? eta -:--:--
   -------------- ------------------------- 262.1/734.3 kB ? eta -:--:--
   -------------- ------------------------- 262.1/734.3 kB ? eta -:--:--
   -------------- ------------------------- 262.1/734.3 kB ? eta -:--:--
   -------------- ------------------------- 262.1/734.3 kB ? eta -:--:--
  


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import os
import dotenv
import numpy as np
import pandas as pd
from langsmith import traceable
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.evaluation import load_evaluator
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import ConversationalRetrievalChain
from langchain.llms import Replicate
from dataset_rag_pnp import dataset  # pastikan file ini ada

# Load variables from the local .env file into the process environment.
dotenv.load_dotenv()

# LangSmith settings: values are read from LANGSMITH_* variables and
# exported under the LANGCHAIN_* names, falling back to defaults.
os.environ['LANGCHAIN_PROJECT'] = os.getenv('LANGSMITH_PROJECT', 'PNP RAG Eval')
os.environ['LANGCHAIN_ENDPOINT'] = os.getenv('LANGSMITH_ENDPOINT', 'https://api.smith.langchain.com')

# Secrets have no sensible default. Assigning None to os.environ raises an
# opaque "TypeError: str expected, not NoneType", so check up front and
# report every missing variable by name instead.
_SECRET_SOURCES = {
    'LANGCHAIN_API_KEY': 'LANGSMITH_API_KEY',
    'OPENAI_API_KEY': 'OPENAI_API_KEY',
    'REPLICATE_API_TOKEN': 'REPLICATE_API_TOKEN',
}
_missing_secrets = [
    source for source in _SECRET_SOURCES.values() if os.getenv(source) is None
]
if _missing_secrets:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_secrets)
        + ". Define them in the .env file or in the shell before running."
    )
for _target, _source in _SECRET_SOURCES.items():
    os.environ[_target] = os.getenv(_source)


ModuleNotFoundError: No module named 'langchain_core.evaluation'

In [8]:
# Embedding model used to query the saved FAISS index; runs on CPU and
# returns normalized vectors.
embeddings = HuggingFaceEmbeddings(
    model_name="LazarusNLP/all-indo-e5-small-v4",
    model_kwargs=dict(device="cpu"),
    encode_kwargs=dict(normalize_embeddings=True),
)

# SECURITY NOTE: allow_dangerous_deserialization=True opts in to loading a
# pickled index from disk. Only keep this flag for a "vector_store_data"
# directory you created yourself; never point it at untrusted files.
vectorstore = FAISS.load_local(
    "vector_store_data",
    embeddings,
    allow_dangerous_deserialization=True,
)

# Return the 3 nearest chunks for every query.
retriever = vectorstore.as_retriever(search_kwargs=dict(k=3))

# Generator model hosted on Replicate.
llm = Replicate(
    model="fauziisyrinapridal/sahabat-ai-v1:afb9fa89fe786362f619fd4fef34bd1f7a4a4da23073d8a6fbf54dcbe458f216",
    model_kwargs=dict(temperature=0.1, top_p=0.9, max_new_tokens=6000),
)

# Retrieval chain that also returns the documents it retrieved, so the
# contexts can be scored during evaluation.
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
)

# Shared (question, answer) history passed to the chain on every call.
chat_history = []

def run_rag(question: str):
    """Ask the retrieval chain one question and collect its answer and contexts.

    The (question, answer) pair is appended to the module-level
    ``chat_history`` list, and that same list is passed to the chain, so
    every call is made with all earlier exchanges as history.

    Args:
        question: The question forwarded to ``qa_chain``.

    Returns:
        A dict with ``"answer"`` (the chain's answer) and ``"contexts"``
        (the ``page_content`` of each returned source document).
    """
    response = qa_chain({"question": question, "chat_history": chat_history})
    reply = response['answer']
    # The list is mutated in place, never rebound, so no ``global`` is needed.
    chat_history.append((question, reply))
    passages = []
    for document in response['source_documents']:
        passages.append(document.page_content)
    return {"answer": reply, "contexts": passages}


In [None]:
# load_evaluator lives in langchain.evaluation; there is no
# langchain_core.evaluation module (importing it raises ModuleNotFoundError).
from langchain.evaluation import load_evaluator
from langchain.embeddings.openai import OpenAIEmbeddings
import numpy as np


def _cosine_similarity(a, b):
    """Return the cosine similarity of two vectors, or 0.0 if either norm is zero."""
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    if not norm_a or not norm_b:
        return 0.0
    return float(np.dot(a, b) / (norm_a * norm_b))


@traceable(name="Evaluate RAG Question")
def evaluate_question(q, ground_truth, *, relevance_threshold=0.78):
    """Run the RAG pipeline on one question and score the result.

    Metrics:
        CP: share of retrieved contexts whose cosine similarity to the
            ground truth is at least ``relevance_threshold``.
        CR: 1.0 if at least one context reaches the threshold, else 0.0.
        F1: harmonic mean of CP and CR (0.0 when both are 0).
        AR: 1 minus the score returned by the ``embedding_distance``
            evaluator for the answer against the ground truth.

    Args:
        q: The question passed to ``run_rag``.
        ground_truth: Reference answer used for all four metrics.
        relevance_threshold: Minimum cosine similarity for a context to
            count as relevant. Defaults to 0.78.

    Returns:
        A dict with keys ``question``, ``answer``, ``CP``, ``CR``, ``F1``,
        ``AR`` (each metric rounded to 3 decimals) and ``contexts``.
    """
    emb = OpenAIEmbeddings(model="text-embedding-3-small")
    # Loaded before calling the pipeline so a missing dependency fails
    # before any generation request is made.
    # NOTE(review): no embeddings object is passed here, so the evaluator
    # falls back to its own default OpenAI embeddings rather than `emb` —
    # confirm that scoring AR with a separate embeddings object is intended.
    ar_eval = load_evaluator("embedding_distance")

    rag_out = run_rag(q)
    answer = rag_out['answer']
    contexts = rag_out['contexts']

    gt_emb = emb.embed_query(ground_truth)
    flags = [
        _cosine_similarity(gt_emb, emb.embed_query(ctx)) >= relevance_threshold
        for ctx in contexts
    ]
    cp = sum(flags) / len(contexts) if contexts else 0.0
    cr = 1.0 if any(flags) else 0.0
    f1 = 2 * cp * cr / (cp + cr) if cp + cr > 0 else 0.0

    distance = ar_eval.evaluate_strings(
        prediction=answer,
        reference=ground_truth,
        input=q,
    )["score"]
    ar_score = 1.0 - distance

    return {
        "question": q,
        "answer": answer,
        "CP": round(cp, 3),
        "CR": round(cr, 3),
        "F1": round(f1, 3),
        "AR": round(ar_score, 3),
        "contexts": contexts,
    }


In [12]:
# Score every dataset item and collect one row per question.
results = []
for item in dataset:
    result = evaluate_question(item['question'], item['ground_truth'])
    # evaluate_question does not return the reference answer; attach it here
    # so the "ground_truth" column requested below exists (otherwise the
    # column selection raises KeyError).
    result['ground_truth'] = item['ground_truth']
    results.append(result)

# Passing the column list to the constructor selects and orders the columns
# and also yields an empty, correctly-labelled table for an empty dataset.
pd.DataFrame(results, columns=["question", "ground_truth", "CP", "CR", "F1", "AR"])


ImportError: The tiktoken library is required to use the default OpenAI embeddings with embedding distance evaluators. Please either manually select a different Embeddings object or install tiktoken using `pip install tiktoken`.