from fiddler_evals import init, Project, Application, Dataset, evaluate
from fiddler_evals.pydantic_models.dataset import NewDatasetItem
from fiddler_evals.evaluators import (
RAGFaithfulness,
AnswerRelevance,
ContextRelevance,
Conciseness
)
# Step 1: Initialize the SDK with the deployment URL and API token
init(
    url="https://your-org.fiddler.ai",
    token="your-token",
)
# Step 2: Set up organization (project -> application -> dataset)
project = Project.get_or_create(name="rag_experiments")
app = Application.get_or_create(
    name="doc_qa_system",
    project_id=project.id,
)
# NOTE(review): unlike the get_or_create calls above, create() presumably
# fails if 'qa_test_set' already exists on a re-run -- confirm.
dataset = Dataset.create(
    name="qa_test_set",
    application_id=app.id,
)
# Step 3: Create test cases (one item: query + retrieved context,
# the expected answer, and a difficulty tag) and insert them
test_cases = [
    NewDatasetItem(
        inputs={
            "user_query": "What is machine learning?",
            "retrieved_documents": (
                "Machine learning is a subset of AI that enables "
                "systems to learn from data."
            ),
        },
        expected_outputs={"rag_response": "Machine learning is a subset of AI."},
        metadata={"difficulty": "easy"},
    ),
]
dataset.insert(test_cases)
# Step 4: Define RAG task
def rag_task(inputs, extras, metadata):
    """Run the RAG system on one dataset item and return its outputs.

    Args:
        inputs: Mapping for the dataset item. Must contain the keys
            ``"user_query"`` and ``"retrieved_documents"``.
        extras: Not used by this task; accepted to match the task signature.
        metadata: Not used by this task; accepted to match the task signature.

    Returns:
        A dict with ``"rag_response"`` (the generated answer) and
        ``"retrieved_documents"`` (passed through unchanged from ``inputs``).

    Raises:
        KeyError: If ``inputs`` lacks ``"user_query"`` or
            ``"retrieved_documents"``.
    """
    user_query = inputs["user_query"]
    retrieved_documents = inputs["retrieved_documents"]
    # Call your RAG system (simplified example). generate_answer is a
    # placeholder: supply your own implementation before running.
    rag_response = generate_answer(user_query, retrieved_documents)
    return {
        "rag_response": rag_response,
        "retrieved_documents": retrieved_documents,
    }
# Step 5: Run comprehensive evaluation
# Every evaluator below uses the same judge model and credential.
_llm_judge = {"model": "openai/gpt-4o", "credential": "your-llm-credential"}
results = evaluate(
    dataset=dataset,
    task=rag_task,
    evaluators=[
        # Check factual grounding (Yes/No)
        RAGFaithfulness(**_llm_judge),
        # Check relevance to query (High/Medium/Low)
        AnswerRelevance(**_llm_judge),
        # Check retrieval quality (High/Medium/Low)
        ContextRelevance(**_llm_judge),
        # Check for verbosity
        Conciseness(**_llm_judge),
    ],
    name_prefix="rag_eval_v1",
    # Evaluator argument name -> where its value comes from: either a key
    # name (string) or a callable that extracts it from the record.
    score_fn_kwargs_mapping={
        "rag_response": "rag_response",
        "retrieved_documents": "retrieved_documents",
        "user_query": lambda x: x["inputs"]["user_query"],
    },
)
# Step 6: Analyze -- print every score for every evaluated dataset item
print(f"Evaluated {len(results.results)} test cases")
for result in results.results:
    print(f"\n{result.dataset_item_id}:")
    for score in result.scores:
        value = score.value
        # A score value may be missing or non-numeric (e.g. when an
        # evaluation did not produce a number); fall back to str() instead
        # of crashing on the float format spec.
        if isinstance(value, (int, float)):
            rendered = f"{value:.3f}"
        else:
            rendered = str(value)
        print(f" {score.name}: {rendered}")