RAG Evaluation Fundamentals
Step 1: Connect and Initialize Evaluators
# Connect to Fiddler and construct the two LLM-as-judge RAG evaluators.
import pandas as pd
from fiddler_evals import init
from fiddler_evals.evaluators import RAGFaithfulness, AnswerRelevance

URL = 'https://your-org.fiddler.ai'
TOKEN = 'your-access-token'
LLM_CREDENTIAL_NAME = 'your-llm-credential'  # From Settings > LLM Gateway
LLM_MODEL_NAME = 'openai/gpt-4o'  # Or your preferred model

# Authenticate the SDK against your Fiddler instance.
init(url=URL, token=TOKEN)

# Initialize evaluators; both are backed by the gateway credential and model
# configured above. (Fixed: a stray step number "2" was fused onto the end of
# the second line, which made this a SyntaxError.)
faithfulness = RAGFaithfulness(model=LLM_MODEL_NAME, credential=LLM_CREDENTIAL_NAME)
relevance = AnswerRelevance(model=LLM_MODEL_NAME, credential=LLM_CREDENTIAL_NAME)
Step 2: Create Test Cases
# Three hand-built scenarios: a grounded answer, a hallucinated answer, and an
# on-topic-but-unresponsive answer. Each row carries the query, the retrieved
# context, and the RAG system's response. (Fixed: a stray step number "3" was
# fused onto the closing parenthesis, which made this a SyntaxError.)
test_cases = pd.DataFrame(
    [
        {
            'scenario': 'Perfect Match',
            'user_query': 'What is the capital of France?',
            'retrieved_documents': ['Paris is the capital of France.'],
            'rag_response': 'The capital of France is Paris.',
        },
        {
            # Response contradicts the retrieved context.
            'scenario': 'Hallucination',
            'user_query': 'What are the office hours?',
            'retrieved_documents': ['We are closed on weekends.'],
            'rag_response': 'We are open 9 AM to 5 PM every day.',
        },
        {
            # Response ignores the question entirely.
            'scenario': 'Irrelevant Answer',
            'user_query': 'How do I reset my password?',
            'retrieved_documents': ['To reset, click "Forgot Password".'],
            'rag_response': 'Our system is very secure and uses 256-bit encryption.',
        },
    ]
)
Step 3: Evaluate Each Test Case
def evaluate_row(row):
    """Score one test case with both evaluators and summarize the outcome.

    Returns a Series with the faithfulness label, the relevance label, and an
    overall 'Status' flag for the row.
    """
    faith = faithfulness.score(
        user_query=row['user_query'],
        rag_response=row['rag_response'],
        retrieved_documents=row['retrieved_documents'],
    )
    rel = relevance.score(
        user_query=row['user_query'],
        rag_response=row['rag_response'],
    )
    # NOTE(review): the health check mixes faith.label (binary) with rel.value
    # (numeric threshold) while the report column shows rel.label — presumably
    # intentional; confirm against the evaluator API.
    is_healthy = faith.label == 'yes' and rel.value >= 0.5
    status = 'HEALTHY' if is_healthy else 'ISSUE DETECTED'
    return pd.Series(
        {
            'Faithfulness': faith.label,
            'Relevance': rel.label,
            'Status': status,
        }
    )
results = test_cases.join(test_cases.apply(evaluate_row, axis=1))