from fiddler_evals import init
# Initialize connection
init(
    url='https://your-org.fiddler.ai',
    token='your-access-token'
)

from fiddler_evals import Project, Application, Dataset
from fiddler_evals.pydantic_models.dataset import NewDatasetItem
# Create organizational structure
project = Project.get_or_create(name='my_eval_project')
application = Application.get_or_create(
    name='my_llm_app',
    project_id=project.id
)
# Create evaluation dataset
dataset = Dataset.create(
    name='evaluation_dataset',
    application_id=application.id,
    description='Test cases for LLM evaluation'
)

# Define test cases
test_cases = [
    NewDatasetItem(
        inputs={"question": "What is the capital of France?"},
        expected_outputs={"answer": "Paris is the capital of France"},
        metadata={"type": "Factual", "category": "Geography"}
    ),
    NewDatasetItem(
        inputs={"question": "Explain photosynthesis"},
        expected_outputs={"answer": "Photosynthesis is the process by which plants convert sunlight into energy"},
        metadata={"type": "Explanation", "category": "Science"}
    ),
]
# Insert test cases into dataset
dataset.insert(test_cases)
print(f"✅ Added {len(test_cases)} test cases")def my_llm_task(inputs, extras, metadata):
"""Your LLM application logic."""
question = inputs.get("question", "")
# Call your LLM here (example uses placeholder)
# In production, call OpenAI, Anthropic, or your LLM
answer = f"Mock response to: {question}"
return {"answer": answer}from fiddler_evals import evaluate
from fiddler_evals.evaluators import (
    AnswerRelevance,
    Conciseness,
    Toxicity,
    PIIDetection
)
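# Note: based on how it is used throughout this guide, score_fn_kwargs_mapping
# appears to map each evaluator argument either to a key in the task's output
# dict (string value) or to a callable that extracts the value from the full
# evaluation item. Treat this description as an assumption, not API reference.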
# Run evaluation
results = evaluate(
    dataset=dataset,
    task=my_llm_task,
    evaluators=[
        AnswerRelevance(),
        Conciseness(),
        Toxicity(),
        PIIDetection()
    ],
    name_prefix="my_evaluation",
    score_fn_kwargs_mapping={
        "response": "answer",
        "text": "answer",
        "prompt": lambda x: x["inputs"]["question"]
    }
)
print(f"✅ Evaluated {len(results.results)} test cases")

# Access results programmatically
for item_result in results.results:
print(f"\nTest Case: {item_result.dataset_item_id}")
print(f"Task Output: {item_result.task_output}")
# View scores from each evaluator
for score in item_result.scores:
print(f" {score.name}: {score.value} - {score.reasoning}")
# View results in Fiddler UI
print(f"\n🔗 View results: https://your-org.fiddler.ai")from fiddler_evals.evaluators import Faithfulness, ContextRelevance
# Add context to test cases
rag_test_cases = [
    NewDatasetItem(
        inputs={
            "question": "What is the capital of France?",
            "context": "Paris is the capital and largest city of France."
        },
        expected_outputs={"answer": "Paris"}
    ),
]
dataset.insert(rag_test_cases)
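
# `my_rag_task` is used below but not defined earlier in this guide; the sketch
# here is only a placeholder, assuming the same (inputs, extras, metadata)
# signature as `my_llm_task`. Swap in your real retrieval + generation pipeline.
def my_rag_task(inputs, extras, metadata):
    """Placeholder RAG task that answers a question from the provided context."""
    question = inputs["question"]
    context = inputs.get("context", "")
    answer = f"Mock answer to: {question}"  # replace with retriever + LLM call
    return {"answer": answer}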
# Evaluate with RAG-specific evaluators
rag_results = evaluate(
    dataset=dataset,
    task=my_rag_task,  # Your RAG system
    evaluators=[
        Faithfulness(),
        ContextRelevance(),
        AnswerRelevance()
    ],
    score_fn_kwargs_mapping={
        "response": "answer",
        "context": lambda x: x["inputs"]["context"],
        "question": lambda x: x["inputs"]["question"]
    }
)

from fiddler_evals.evaluators.base import Evaluator
from fiddler_evals.pydantic_models.score import Score
class CustomToneEvaluator(Evaluator):
"""Evaluates if response matches desired tone."""
def score(self, response: str, desired_tone: str = "professional") -> Score:
# Your custom evaluation logic
is_professional = self._check_tone(response, desired_tone)
return Score(
name="tone_match",
value=1.0 if is_professional else 0.0,
reasoning=f"Response {'matches' if is_professional else 'does not match'} {desired_tone} tone"
)
def _check_tone(self, text: str, tone: str) -> bool:
# Implement your tone detection logic
# Could use keyword matching, LLM-as-judge, or ML model
return True # Placeholder
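
# A minimal sketch of what `_check_tone` could do, assuming a simple keyword
# blocklist (the word list is made up for illustration); a real implementation
# might use an LLM-as-judge prompt or a trained classifier instead.
def keyword_tone_check(text: str, tone: str) -> bool:
    informal_markers = {"lol", "gonna", "wanna", "kinda"}  # hypothetical blocklist
    if tone == "professional":
        return set(text.lower().split()).isdisjoint(informal_markers)
    return True  # other tones are not covered in this sketch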
# Use custom evaluator
results = evaluate(
    dataset=dataset,
    task=my_llm_task,
    evaluators=[CustomToneEvaluator()],
    score_fn_kwargs_mapping={
        "response": "answer",
        "desired_tone": "professional"
    }
)

# Evaluate with parallel workers for faster execution
results = evaluate(
    dataset=dataset,
    task=my_llm_task,
    evaluators=[AnswerRelevance(), Toxicity()],
    max_workers=5  # Process 5 test cases in parallel
)

# From CSV
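# test_cases.csv is assumed to contain at least the columns referenced below, e.g.:
#   question,answer
#   What is the capital of France?,Paris is the capital of France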
dataset.insert_from_csv_file(
    csv_file_path='test_cases.csv',
    inputs_columns=['question'],
    expected_outputs_columns=['answer']
)

# From JSONL
dataset.insert_from_jsonl_file(
    jsonl_file_path='test_cases.jsonl'
)

# From Pandas DataFrame
import pandas as pd

df = pd.DataFrame({
    'question': ['Q1', 'Q2'],
    'expected_answer': ['A1', 'A2']
})
dataset.insert_from_pandas(
    dataframe=df,
    inputs_columns=['question'],
    expected_outputs_columns=['expected_answer']
)

# Add experiment metadata for tracking
results = evaluate(
    dataset=dataset,
    task=my_llm_task,
    evaluators=[AnswerRelevance()],
    name_prefix="experiment_v2",  # Version your experiments
    score_fn_kwargs_mapping={"response": "answer"}
)
# Results are automatically tracked in Fiddler
# View experiment history in the Fiddler UI

from fiddler_evals import init, Project, Application, Dataset, evaluate
from fiddler_evals.pydantic_models.dataset import NewDatasetItem
from fiddler_evals.evaluators import (
    Faithfulness,
    AnswerRelevance,
    ContextRelevance,
    Conciseness,
    PIIDetection
)
# Step 1: Initialize
init(url='https://your-org.fiddler.ai', token='your-token')
# Step 2: Set up organization
project = Project.get_or_create(name='rag_evaluation')
app = Application.get_or_create(name='doc_qa_system', project_id=project.id)
dataset = Dataset.create(name='qa_test_set', application_id=app.id)
# Step 3: Create test cases
test_cases = [
    NewDatasetItem(
        inputs={
            "question": "What is machine learning?",
            "context": "Machine learning is a subset of AI that enables systems to learn from data."
        },
        expected_outputs={"answer": "Machine learning is a subset of AI."},
        metadata={"difficulty": "easy"}
    ),
]
dataset.insert(test_cases)
# Step 4: Define RAG task
def rag_task(inputs, extras, metadata):
"""Your RAG system implementation."""
question = inputs["question"]
context = inputs["context"]
# Call your RAG system (simplified example)
answer = generate_answer(question, context)
return {"answer": answer, "context_used": context}
# Step 5: Run comprehensive evaluation
results = evaluate(
    dataset=dataset,
    task=rag_task,
    evaluators=[
        Faithfulness(),       # Check factual accuracy
        AnswerRelevance(),    # Check relevance to question
        ContextRelevance(),   # Check context quality
        Conciseness(),        # Check for verbosity
        PIIDetection()        # Check for sensitive data
    ],
    name_prefix="rag_eval_v1",
    score_fn_kwargs_mapping={
        "response": "answer",
        "context": "context_used",
        "question": lambda x: x["inputs"]["question"]
    }
)
# Step 6: Analyze
print(f"Evaluated {len(results.results)} test cases")
for result in results.results:
print(f"\n{result.dataset_item_id}:")
for score in result.scores:
print(f" {score.name}: {score.value:.3f}")