from fiddler_evals import Application, Dataset, Project, evaluate, init
from fiddler_evals.evaluators import (
    AnswerRelevance,
    Conciseness,
    FTLResponseFaithfulness,
)
# Model identifier and credential name handed to the evaluators that take
# `model=` / `credential=` arguments further down in this script.
MODEL = "openai/gpt-4o"
CREDENTIAL = "your-credential-name"

# Initialize connection
init(url='https://your-org.fiddler.ai', token='your-access-token')

# Resolve the project and application that own the dataset.
# `Dataset.get_by_name` below needs `application.id`; without these two lookups
# `application` is an undefined name and the script stops with a NameError.
# Replace the placeholder names with your own project / application names.
project = Project.get_or_create(name='your-project-name')
application = Application.get_or_create(
    name='your-application-name',
    project_id=project.id,
)

# Get existing dataset
dataset = Dataset.get_by_name(
    name='llm_outputs',
    application_id=application.id,
)
# Define your LLM task
def evaluate_llm(inputs, extras, metadata):
    """Generate an answer for one dataset item and return the fields to score.

    Args:
        inputs: Mapping that must contain a 'question' entry.
        extras: Mapping-like object; its optional 'context' entry is used,
            falling back to an empty string when absent.
        metadata: Accepted as part of the task signature but not used here.

    Returns:
        A dict with the keys 'answer', 'question' and 'context'.

    Note:
        `my_llm_model` is not defined in this script; it stands in for your
        own model client and must expose `generate(question, context)`.
    """
    prompt = inputs['question']
    retrieved_context = extras.get('context', '')

    # Your LLM call here
    generated = my_llm_model.generate(prompt, retrieved_context)

    return dict(
        answer=generated,
        question=prompt,
        context=retrieved_context,
    )
# Run the experiment: every dataset item goes through `evaluate_llm`, and the
# listed evaluators are applied to what it produces.
results = evaluate(
    dataset=dataset,
    task=evaluate_llm,
    evaluators=[
        AnswerRelevance(model=MODEL, credential=CREDENTIAL),
        Conciseness(model=MODEL, credential=CREDENTIAL),
        FTLResponseFaithfulness(),  # Centor Models don't require model= parameter
    ],
    name_prefix="llm-eval",
    description="Comprehensive LLM experiment",
    metadata=dict(model_version="v2.1", environment="production"),
    # Maps evaluator argument names to their source. Callables receive an
    # object exposing "inputs" and "extras"; plain strings presumably name a
    # key of the dict returned by the task -- confirm against the SDK docs.
    score_fn_kwargs_mapping=dict(
        user_query=lambda row: row["inputs"]["question"],
        rag_response="answer",
        response="answer",
        context=lambda row: row["extras"].get("context", ""),
    ),
    max_workers=4,  # Parallel processing
)
# Walk the per-item results and print a short report for each one:
# item id, status, duration, then one line per evaluator score.
for outcome in results.results:
    record = outcome.experiment_item
    print(f"\nTest Case: {record.dataset_item_id}")
    print(f"Status: {record.status}")
    print(f"Duration: {record.duration_ms}ms")
    for metric in outcome.scores:
        print(f" {metric.name}: {metric.value}")