Running RAG Experiments at Scale
1. Set Up the Experiment Infrastructure
import pandas as pd
from fiddler_evals import Application, Dataset, Project, evaluate, init
from fiddler_evals.evaluators import AnswerRelevance, ContextRelevance, RAGFaithfulness
URL = 'https://your-org.fiddler.ai'
TOKEN = 'your-access-token'
LLM_CREDENTIAL_NAME = 'your-llm-credential'
LLM_MODEL_NAME = 'openai/gpt-4o'
init(url=URL, token=TOKEN)
project = Project.get_or_create(name='rag_experiments')
application = Application.get_or_create(
    name='rag-pipeline-comparison',
    project_id=project.id,
)
dataset = Dataset.get_or_create(
    name='rag-test-cases',
    application_id=application.id,
)
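The URL, token, and credential name are hardcoded above for readability. In practice you would likely pull them from the environment instead; a minimal sketch, where the environment variable names are illustrative rather than anything Fiddler requires:

import os

# Hypothetical variable names; use whatever convention your deployment follows.
URL = os.environ.get('FIDDLER_URL', 'https://your-org.fiddler.ai')
TOKEN = os.environ['FIDDLER_API_TOKEN']  # fail fast if the token is missing
LLM_CREDENTIAL_NAME = os.environ.get('FIDDLER_LLM_CREDENTIAL', 'your-llm-credential')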
2. Create Test Cases with Golden Labels
rag_data = pd.DataFrame(
    [
        {
            'scenario': 'Perfect Match',
            'expected_quality': 'good',
            'user_query': 'What is the capital of France?',
            'retrieved_documents': [
                'Paris is the capital and largest city of France.',
                'France is located in Western Europe.',
            ],
            'rag_response': 'The capital of France is Paris.',
        },
        {
            'scenario': 'Irrelevant Context',
            'expected_quality': 'bad',
            'user_query': 'How do I reset my password?',
            'retrieved_documents': [
                'To make pasta, boil water and add salt.',
                'Italian cuisine features many pasta dishes.',
            ],
            'rag_response': 'To reset your password, go to the login page '
            'and click Forgot Password.',
        },
        {
            'scenario': 'Hallucination',
            'expected_quality': 'bad',
            'user_query': 'What are the business hours?',
            'retrieved_documents': [
                'Our office is located at 123 Main Street.',
                'We are closed on federal holidays.',
            ],
            'rag_response': 'Our business hours are Monday through Friday, '
            '9 AM to 5 PM.',
        },
        {
            'scenario': 'Irrelevant Answer',
            'expected_quality': 'bad',
            'user_query': 'What is your return policy?',
            'retrieved_documents': [
                'Returns are accepted within 30 days of purchase.',
                'Items must be unused and in original packaging.',
            ],
            'rag_response': 'We offer free shipping on orders over $50. '
            'Delivery takes 3-5 business days.',
        },
    ]
)
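Before inserting, a quick sanity check that the DataFrame carries every column the later steps reference can save a confusing failure mid-run; this check is optional and purely illustrative:

required = {'user_query', 'retrieved_documents', 'rag_response', 'expected_quality', 'scenario'}
missing = required - set(rag_data.columns)
assert not missing, f'Test-case DataFrame is missing columns: {missing}'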
3. Insert Data into the Dataset
if not list(dataset.get_items()):
    dataset.insert_from_pandas(
        df=rag_data,
        input_columns=['user_query', 'retrieved_documents', 'rag_response'],
        expected_output_columns=['expected_quality'],
        metadata_columns=['scenario'],
    )
    print(f'Inserted {len(rag_data)} test cases')
else:
    print('Dataset already has items, skipping insert')

Inserted 4 test cases
4. Run the Experiment
def rag_task(inputs: dict, extras: dict, metadata: dict) -> dict:
    """Return the pre-recorded RAG response.

    Replace with your actual RAG pipeline in production.
    """
    return {'rag_response': inputs['rag_response']}


evaluators = [
    ContextRelevance(model=LLM_MODEL_NAME, credential=LLM_CREDENTIAL_NAME),
    RAGFaithfulness(model=LLM_MODEL_NAME, credential=LLM_CREDENTIAL_NAME),
    AnswerRelevance(model=LLM_MODEL_NAME, credential=LLM_CREDENTIAL_NAME),
]

result = evaluate(
    dataset=dataset,
    task=rag_task,
    evaluators=evaluators,
    score_fn_kwargs_mapping={
        'user_query': lambda x: x['inputs']['user_query'],
        'retrieved_documents': lambda x: x['inputs']['retrieved_documents'],
        'rag_response': 'rag_response',
    },
)

print(f'Experiment: {result.experiment.name}')
print(f'Evaluated {len(result.results)} test cases')

Experiment: rag-test-cases-2026-02-07-001
Evaluated 4 test cases
5. Validate Against Golden Labels
from fiddler_evals.pydantic_models.experiment import ExperimentItemResult
from fiddler_evals.pydantic_models.score import Score

validation_results = []
correct = 0

for r in result.results:
    expected = r.dataset_item.expected_outputs.get('expected_quality')

    # Predict 'bad' if any evaluator scored this case below 0.5, else 'good'.
    has_problem = any(s.value < 0.5 for s in r.scores)
    predicted = 'bad' if has_problem else 'good'
    if expected == predicted:
        correct += 1

    # Attach the prediction back to the experiment as a custom score.
    validation_results.append(
        ExperimentItemResult(
            experiment_item=r.experiment_item,
            dataset_item=r.dataset_item,
            scores=[
                Score(
                    name='predicted_quality',
                    evaluator_name='OverallQuality',
                    value=1.0 if predicted == 'good' else 0.0,
                    label=predicted,
                    reasoning=f'Expected: {expected}',
                )
            ],
        )
    )

result.experiment.add_results(validation_results)

print(
    f'Evaluator Accuracy: {correct}/{len(result.results)} '
    f'({100 * correct / len(result.results):.0f}%)'
)

Evaluator Accuracy: 4/4 (100%)
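To see which evaluator flagged each failing case, rather than only the combined verdict, you can iterate over the same result objects again; a small optional sketch reusing the attributes already used above:

for r in result.results:
    scenario = r.dataset_item.metadata.get('scenario')
    flagged = [s.evaluator_name for s in r.scores if s.value < 0.5]
    flagged_str = ', '.join(flagged) if flagged else 'no evaluators'
    print(f'{scenario}: flagged by {flagged_str}')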
6. View Results
print(f'View in Fiddler: {URL}/evals/experiments/{result.experiment.id}')

# Build results DataFrame
rows = []
for r, v in zip(result.results, validation_results):
    row = {
        'scenario': r.dataset_item.metadata.get('scenario'),
        'expected': r.dataset_item.expected_outputs.get('expected_quality'),
        'predicted': v.scores[0].label,
    }
    row.update({s.evaluator_name: s.value for s in r.scores})
    rows.append(row)

pd.DataFrame(rows)

The resulting DataFrame has one row per test case with the columns: scenario, expected, predicted, ContextRelevance, RAGFaithfulness, AnswerRelevance.
Comparing Pipeline Configurations
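The two task functions below, rag_pipeline_v1 and rag_pipeline_v2, stand in for your real pipelines and are not defined earlier in this guide. A minimal sketch of their shape, with retrieval, re-ranking, and generation left as hypothetical helpers you would replace with your own code:

# Hypothetical helpers standing in for your own retrieval and generation code.
def retrieve(query: str, top_k: int) -> list[str]:
    raise NotImplementedError('plug in your vector store search here')

def rerank(query: str, docs: list[str]) -> list[str]:
    raise NotImplementedError('plug in your re-ranker here')

def generate_answer(query: str, docs: list[str]) -> str:
    raise NotImplementedError('plug in your LLM call here')


def rag_pipeline_v1(inputs: dict, extras: dict, metadata: dict) -> dict:
    """Baseline: plain vector retrieval, no re-ranking (illustrative only)."""
    docs = retrieve(inputs['user_query'], top_k=5)
    return {'rag_response': generate_answer(inputs['user_query'], docs)}


def rag_pipeline_v2(inputs: dict, extras: dict, metadata: dict) -> dict:
    """Candidate: wider retrieval plus re-ranking before generation (illustrative only)."""
    docs = rerank(inputs['user_query'], retrieve(inputs['user_query'], top_k=20))[:5]
    return {'rag_response': generate_answer(inputs['user_query'], docs)}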
# Experiment 1: Default retrieval
result_v1 = evaluate(
    dataset=dataset,
    task=rag_pipeline_v1,
    evaluators=evaluators,
    score_fn_kwargs_mapping={
        'user_query': lambda x: x['inputs']['user_query'],
        'retrieved_documents': lambda x: x['inputs']['retrieved_documents'],
        'rag_response': 'rag_response',
    },
)

# Experiment 2: Improved retrieval with re-ranking
result_v2 = evaluate(
    dataset=dataset,
    task=rag_pipeline_v2,
    evaluators=evaluators,
    score_fn_kwargs_mapping={
        'user_query': lambda x: x['inputs']['user_query'],
        'retrieved_documents': lambda x: x['inputs']['retrieved_documents'],
        'rag_response': 'rag_response',
    },
)

# Compare results side-by-side in the Fiddler UI
print(f'V1: {URL}/evals/experiments/{result_v1.experiment.id}')
print(f'V2: {URL}/evals/experiments/{result_v2.experiment.id}')
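Beyond eyeballing the two experiments in the UI, you can also aggregate evaluator scores per run with pandas; a minimal sketch that reuses the score attributes (evaluator_name, value) seen in the validation step:

def mean_scores(result) -> pd.Series:
    """Average each evaluator's score across all test cases in one experiment run."""
    rows = [{s.evaluator_name: s.value for s in r.scores} for r in result.results]
    return pd.DataFrame(rows).mean()

comparison = pd.DataFrame({'v1': mean_scores(result_v1), 'v2': mean_scores(result_v2)})
comparison['delta'] = comparison['v2'] - comparison['v1']
print(comparison)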