> ## Documentation Index
> Fetch the complete documentation index at: https://docs.fiddler.ai/llms.txt
> Use this file to discover all available pages before exploring further.

# Evaluators & Metrics Guide

> Interactive guide for selecting the right Fiddler evaluators and metrics for your use case. Filter by observability type, use case, and rating to find what to deploy.

export const EvaluatorsGuide = ({data}) => {
  // How many use-case / ML-task pills are rendered before the rest collapse behind "+ N more".
  const PILLS_VISIBLE = 7;
  // Display hook for model-provider values; currently returns the value unchanged.
  const displayHosting = provider => {
    return provider;
  };
  /**
   * Maps a rating label to its CSS modifier class.
   * @param {?string} r - Rating label, or null/undefined when there is no rating.
   * @returns {string} "" for no rating, "rec" / "con" for the two named ratings, "opt" for anything else.
   */
  const ratingClass = r => {
    if (r == null) return "";
    if (r === "Recommended") return "rec";
    if (r === "Consider") return "con";
    return "opt";
  };
  /**
   * Sort/compare rank for a rating label; lower is stronger.
   * @param {?string} r - Rating label, or null/undefined.
   * @returns {number} 0 for "Recommended", 1 for "Consider", 2 for any other label, 99 for no rating.
   */
  const ratingRank = r => {
    if (r == null) return 99;
    switch (r) {
      case "Recommended":
        return 0;
      case "Consider":
        return 1;
      default:
        return 2;
    }
  };
  /**
   * @param {{modelType: string}} state - Guide state.
   * @returns {boolean} True when the selected model type is "Guardrails".
   */
  const isGuardrailsView = state => {
    const { modelType } = state;
    return modelType === "Guardrails";
  };
  /**
   * @param {{query: ?string}} state - Guide state.
   * @returns {boolean} True when the query contains at least one non-whitespace character.
   */
  const isSearchActive = state => {
    const { query } = state;
    if (!query) return false;
    return Boolean(query.trim());
  };
  /**
   * Resolves which rating labels pass the rating filter.
   * The Guardrails view ignores `state.rating` and always allows every rating.
   * @param {{modelType: string, rating: string}} state - Guide state.
   * @returns {Set<string>} A fresh Set of allowed rating labels.
   */
  const visibleRatings = state => {
    const everyRating = ["Recommended", "Consider", "Optional"];
    if (isGuardrailsView(state)) return new Set(everyRating);
    switch (state.rating) {
      case "rec":
        return new Set(["Recommended"]);
      case "recCon":
        return new Set(["Recommended", "Consider"]);
      default:
        return new Set(everyRating);
    }
  };
  /**
   * Case-insensitive substring match of the search query against a metric name.
   *
   * The query is trimmed before matching so this agrees with `isSearchActive`:
   * a whitespace-only query counts as "no search" (everything matches), and
   * leading/trailing spaces never prevent a match.
   *
   * @param {{query: ?string}} state - Guide state.
   * @param {?string} name - Metric name to test.
   * @returns {boolean} True when no search is active or `name` contains the query.
   */
  const matchSearch = (state, name) => {
    const needle = typeof state.query === "string" ? state.query.trim().toLowerCase() : "";
    if (!needle) return true;
    // A row without a name cannot match an active search; treat it as the empty string.
    const haystack = String(name == null ? "" : name).toLowerCase();
    return haystack.includes(needle);
  };
  /**
   * Evaluator-type filter: passes everything when no type is selected,
   * otherwise requires the row's `implementationType` to be in the selection.
   * @param {{implementations: Set<string>}} state - Guide state.
   * @param {{implementationType: ?string}} m - Metric row.
   * @returns {boolean}
   */
  const matchImplementation = (state, m) => {
    const { implementations } = state;
    if (implementations.size === 0) return true;
    const kind = m.implementationType;
    if (kind == null) return false;
    return implementations.has(kind);
  };
  /**
   * Model-provider filter: passes everything when no provider is selected,
   * otherwise requires at least one of the row's `providerOptions` to be selected.
   * @param {{hostings: Set<string>}} state - Guide state.
   * @param {{providerOptions: ?Array<string>}} m - Metric row.
   * @returns {boolean}
   */
  const matchHosting = (state, m) => {
    const { hostings } = state;
    if (hostings.size === 0) return true;
    const offered = m.providerOptions || [];
    return offered.some(option => hostings.has(option));
  };
  /**
   * Tooltip text for a row's "Type" chip.
   * @param {{evaluatorTypeTips: ?Object<string, string>}} data - Guide dataset.
   * @param {{type: ?string, implementationType: ?string}} m - Metric row.
   * @returns {string} Fixed text for metrics / "Either"; otherwise the tip registered for the
   *   row's implementation type, falling back to a generic evaluator description.
   */
  const chipTypeTip = (data, m) => {
    const kind = m.type;
    if (!kind || kind === "Metric") {
      return "Platform-level metric computed from spans — no LLM call required.";
    }
    if (kind === "Either") {
      return "Can be configured as a platform metric or an evaluator depending on setup.";
    }
    const tips = data.evaluatorTypeTips || {};
    return tips[m.implementationType] || "Evaluator run on matching spans.";
  };
  /**
   * Tooltip text for a row's "Provider" chip.
   * @param {{providerTips: ?Object<string, string>}} data - Guide dataset.
   * @param {string} p - Provider value shown on the chip.
   * @returns {string} The registered tip, or "" when none exists.
   */
  const chipProviderTip = (data, p) => {
    const tips = data.providerTips || {};
    return tips[p] || "";
  };
  /**
   * Builds the evaluator/metric rows for the non-ML views.
   *
   * Use cases considered: every use case in the dataset when the Guardrails view
   * or a search is active, otherwise only the ones selected in state.
   * Metrics kept: not `hidden`; in the Guardrails view only those flagged
   * `guardrails`, otherwise only those whose `models` list contains the current
   * model type.
   *
   * @param {object} state - Guide state (`modelType`, `useCases`, `query`).
   * @param {object} data - Guide dataset (`useCases`, `metrics`).
   * @returns {Array<object>} One row per kept metric: the metric's own fields plus
   *   `perCase` ({uc, rating, why} for each considered use case that has a rating
   *   cell) and `bestRating` (strongest rating found, starting from "Optional";
   *   null while searching).
   */
  const getEvaluatorRows = (state, data) => {
    // Both checks depend only on state, so resolve them once instead of per metric.
    const guardrailsOnly = isGuardrailsView(state);
    const searching = isSearchActive(state);
    const allUseCases = data.useCases || [];
    const selected = guardrailsOnly || searching ? allUseCases : Array.from(state.useCases);
    if (selected.length === 0) return [];
    const rows = [];
    for (const m of data.metrics || []) {
      if (m.hidden) continue;
      if (guardrailsOnly) {
        if (!m.guardrails) continue;
      } else if (!m.models || !m.models.includes(state.modelType)) {
        continue;
      }
      // A metric may carry no per-use-case ratings at all; treat that as "no cells"
      // rather than throwing on `undefined[uc]`.
      const ratingCells = m.uc || {};
      let bestRating = "Optional";
      const perCase = [];
      for (const uc of selected) {
        const cell = ratingCells[uc];
        if (!cell) continue;
        const [r, why] = cell;
        perCase.push({
          uc,
          rating: r,
          why
        });
        if (ratingRank(r) < ratingRank(bestRating)) bestRating = r;
      }
      rows.push({
        ...m,
        bestRating: searching ? null : bestRating,
        perCase
      });
    }
    return rows;
  };
  /**
   * Builds the metric rows for the ML (predictive model) view.
   *
   * Tasks considered: every ML task while a search is active, otherwise the
   * tasks selected in state. With no tasks considered the result is empty.
   * Otherwise the result is: one "Recommended" row per `mlMetrics` entry that
   * shares at least one task with the considered set, followed by one row per
   * `mlCrosscut` entry ("Recommended" when its objective is
   * "Operational Health & Efficiency", else "Consider").
   *
   * @param {object} state - Guide state (`mlTasks`, `query`).
   * @param {object} data - Guide dataset (`mlTasks`, `mlMetrics`, `mlCrosscut`).
   * @returns {Array<object>} Rows shaped like the evaluator rows.
   */
  const getMLRows = (state, data) => {
    const everyTask = data.mlTasks || [];
    const considered = isSearchActive(state) ? everyTask.slice() : Array.from(state.mlTasks);
    if (considered.length === 0) return [];
    const consideredSet = new Set(considered);
    const taskRows = [];
    for (const metric of data.mlMetrics || []) {
      const shared = metric.tasks.filter(task => consideredSet.has(task));
      if (shared.length > 0) {
        taskRows.push({
          name: metric.name,
          objective: metric.objective,
          q: metric.q,
          type: "Metric",
          category: "Performance",
          domain: "Platform",
          bestRating: "Recommended",
          perCase: [{
            uc: shared.join(", "),
            rating: "Recommended",
            why: ""
          }],
          notes: ""
        });
      }
    }
    const crosscutRows = [];
    for (const metric of data.mlCrosscut || []) {
      let rating = "Consider";
      if (metric.objective === "Operational Health & Efficiency") rating = "Recommended";
      crosscutRows.push({
        name: metric.name,
        objective: metric.objective,
        q: metric.q,
        type: "Metric",
        category: metric.category,
        domain: metric.domain,
        bestRating: rating,
        perCase: [{
          uc: "all ML tasks",
          rating,
          why: metric.notes
        }],
        notes: metric.notes
      });
    }
    return [...taskRows, ...crosscutRows];
  };
  /**
   * Looks up a metric inside the cookbook entry of one use case.
   * @param {object} data - Guide dataset (`cookbook.byUseCase`).
   * @param {string} useCase - Use-case name keying `cookbook.byUseCase`.
   * @param {string} metricName - Matched against each evaluator's `name` or `aka`.
   * @returns {?object} The first matching cookbook evaluator, or null.
   */
  const cookbookEvaluatorFor = (data, useCase, metricName) => {
    const hasIndex = data.cookbook && data.cookbook.byUseCase;
    const entry = hasIndex ? data.cookbook.byUseCase[useCase] : null;
    if (!entry || !entry.evaluators) return null;
    const hit = entry.evaluators.find(ev => ev.name === metricName || ev.aka === metricName);
    return hit || null;
  };
  /**
   * Lists the selected use cases whose cookbook entry includes the given metric.
   * @param {object} state - Guide state (`modelType`, `useCases`).
   * @param {object} data - Guide dataset.
   * @param {string} metricName - Metric to look for.
   * @returns {Array<string>} Matching use-case names in selection order; always empty in the ML view.
   */
  const cookbookCoverageFor = (state, data, metricName) => {
    if (state.modelType === "ML") return [];
    return Array.from(state.useCases).filter(uc => Boolean(cookbookEvaluatorFor(data, uc, metricName)));
  };
  /**
   * Selects the docs cookbooks to surface for one use case and model type.
   *
   * An entry is kept when it is not `embedded`, has `content`, lists the model
   * type in `models`, and applies to the use case (`useCases` is "all" or an
   * array containing it). Entries without a `models` list are skipped instead
   * of throwing.
   *
   * @param {object} data - Guide dataset (`cookbookDocs`).
   * @param {string} uc - Use-case name.
   * @param {string} modelType - Current model type.
   * @returns {Array<object>} Matching cookbook doc entries, in dataset order.
   */
  const getDocsCookbooksFor = (data, uc, modelType) => (data.cookbookDocs || []).filter(d => {
    if (d.embedded) return false;
    if (!d.content) return false;
    // Same guard the metric rows use for `models`: missing list means "no match".
    if (!d.models || !d.models.includes(modelType)) return false;
    if (d.useCases === "all") return true;
    return Array.isArray(d.useCases) && d.useCases.includes(uc);
  });
  // Human-readable labels for the `state.rating` filter values ("rec" | "recCon" | "all"),
  // used when the current selection is exported as agent context.
  const RATING_FILTER_LABEL = {
    rec: "Recommended only",
    recCon: "Recommended + Consider",
    all: "All (Recommended, Consider, Optional)"
  };
  /**
   * Renders the current selection and its matching metrics as a Markdown
   * document meant to be pasted into an external agent's context.
   *
   * Sections, in order: suggested system prompt, current selection, matching
   * metrics, Bias & Accuracy Cookbook context (non-ML views only, for selected
   * use cases that have a cookbook entry), and a footer.
   *
   * Row filtering: the search query always applies; the rating, evaluator-type
   * and model-provider filters apply only in non-ML views while no search is
   * active.
   *
   * @param {object} state - Guide state.
   * @param {object} data - Guide dataset.
   * @returns {string} Markdown text, lines joined with "\n".
   */
  const buildAgentContextMarkdown = (state, data) => {
    const isLLMOrAgentic = state.modelType !== "ML";
    const selected = Array.from(isLLMOrAgentic ? state.useCases : state.mlTasks);
    const useCaseLabel = isLLMOrAgentic ? "Use case" : "ML task type";
    const objectives = Array.from(state.objectives);
    const searching = isSearchActive(state);
    const applyFacetFilters = isLLMOrAgentic && !searching;
    // Resolved once; it depends only on state, not on the row being tested.
    const allowedRatings = visibleRatings(state);
    const candidates = isLLMOrAgentic ? getEvaluatorRows(state, data) : getMLRows(state, data);
    const rows = candidates.filter(r => {
      if (applyFacetFilters) {
        if (!allowedRatings.has(r.bestRating)) return false;
        if (!matchImplementation(state, r)) return false;
        if (!matchHosting(state, r)) return false;
      }
      return matchSearch(state, r.name);
    });
    const lines = [];
    lines.push("# Fiddler Observability — Metric Selection Context");
    lines.push("");
    lines.push("## Suggested system prompt for your agent");
    lines.push("");
    lines.push("You are helping a Fiddler customer select observability metrics and evaluators for their GenAI application. Use the context below — the customer's current selection and the matching subset of Fiddler's metric guide — to recommend specific metrics, explain trade-offs (LLM-as-a-judge cost vs. non-AI fast-path, Fiddler-hosted vs. BYO models), and walk through cookbook scenarios when relevant. Keep recommendations grounded in the data provided; do not invent metrics that aren't listed.");
    lines.push("");
    lines.push("## Current selection");
    lines.push("");
    lines.push(`- Model type: ${state.modelType}`);
    lines.push(`- ${useCaseLabel}: ${selected.length ? selected.join(", ") : "(none)"}`);
    lines.push(`- Observability objectives: ${objectives.length ? objectives.join(", ") : "(none)"}`);
    lines.push(`- Rating filter: ${RATING_FILTER_LABEL[state.rating] || state.rating}`);
    lines.push(`- Evaluator Type filter: ${state.implementations.size ? Array.from(state.implementations).join(", ") : "(all)"}`);
    lines.push(`- Model Provider filter: ${state.hostings.size ? Array.from(state.hostings).map(displayHosting).join(", ") : "(all)"}`);
    lines.push("");
    lines.push(`## Matching metrics (${rows.length})`);
    lines.push("");
    if (rows.length === 0) {
      lines.push("_No metrics match the current filters._");
    } else {
      rows.forEach(m => {
        lines.push(`### ${m.name}${m.bestRating ? ` — ${m.bestRating}` : ""}`);
        if (m.q) lines.push(`> ${m.q}`);
        lines.push("");
        const bucket = m.implementationType;
        const typeBit = bucket ? `${m.type || "Metric"} (${bucket})` : m.type || "Metric";
        lines.push(`- Type: ${typeBit}`);
        if (m.provider && m.provider !== "N/A") lines.push(`- Runs on: ${m.provider}`);
        if (m.objective) lines.push(`- Observability objective: ${m.objective}`);
        if (Array.isArray(m.perCase)) {
          m.perCase.forEach(c => {
            // Rating cells may have no rationale (and ML task rows carry an empty one);
            // omit the suffix instead of emitting "undefined" or a dangling colon.
            const rationale = c.why ? `: ${c.why}` : "";
            lines.push(`- ${useCaseLabel} — ${c.uc} (${c.rating})${rationale}`);
          });
        }
        if (m.notes) lines.push(`- Notes: ${m.notes}`);
        lines.push("");
      });
    }
    if (isLLMOrAgentic && data.cookbook && data.cookbook.byUseCase) {
      const eligible = selected.filter(uc => data.cookbook.byUseCase[uc]);
      if (eligible.length > 0) {
        lines.push("## Bias & Accuracy Cookbook context");
        lines.push("");
        eligible.forEach(uc => {
          const cb = data.cookbook.byUseCase[uc];
          if (cb.workedExample) {
            const we = cb.workedExample;
            lines.push(`### Worked example — ${uc}: ${we.title}`);
            if (we.scenario) lines.push(`- Scenario: ${we.scenario}`);
            if (Array.isArray(we.evaluatorsApplied)) lines.push(`- Evaluators applied: ${we.evaluatorsApplied.join(", ")}`);
            if (Array.isArray(we.findings)) lines.push(`- Findings: ${we.findings.join("; ")}`);
            if (Array.isArray(we.rootCause)) lines.push(`- Root cause: ${we.rootCause.join("; ")}`);
            if (Array.isArray(we.remediation)) {
              lines.push(`- Remediation:`);
              we.remediation.forEach(step => {
                // A step is either a plain string or an object with `action` and optional `when`.
                const when = step.when ? `${step.when}: ` : "";
                lines.push(`  - ${when}${step.action || step}`);
              });
            }
            if (we.result) lines.push(`- Result: ${we.result}`);
            lines.push("");
          }
          if (cb.biasStrategy) {
            lines.push(`### Bias detection strategy — ${uc}`);
            if (Array.isArray(cb.biasStrategy.bullets)) cb.biasStrategy.bullets.forEach(b => lines.push(`- ${b}`));
            if (cb.biasStrategy.example) lines.push(`- Example question: ${cb.biasStrategy.example}`);
            lines.push("");
          }
          if (Array.isArray(cb.evaluators)) {
            lines.push(`### Cookbook evaluators — ${uc}`);
            cb.evaluators.forEach(ev => {
              const aka = ev.aka ? ` (aka ${ev.aka})` : "";
              const tag = ev.cookbookOnly ? " [cookbook-only]" : "";
              lines.push(`- **${ev.name}**${aka}${tag}: ${ev.measures || ""}${ev.value ? ` — ${ev.value}` : ""}`);
            });
            lines.push("");
          }
        });
      }
    }
    lines.push("---");
    lines.push("_Exported from the Fiddler Evaluators & Metrics Guide. Full dataset available as JSON via the same panel._");
    return lines.join("\n");
  };
  /**
   * Serializes the whole guide dataset as pretty-printed (2-space) JSON.
   * @param {object} data - Guide dataset.
   * @param {string} generatedAt - Timestamp recorded in the snapshot as-is.
   * @returns {string} JSON text with a schema tag, timestamp, note and the dataset sections.
   */
  const buildFullDatasetJSON = (data, generatedAt) => {
    const snapshot = {
      schema: "fiddler-metric-selection-guide-v1",
      generatedAt,
      note: "Self-contained snapshot of the Fiddler Evaluators & Metrics Guide. Drop into your agent's context, or wire into your own tooling.",
      objectives: data.objectives,
      agenticUseCases: data.useCases,
      mlTasks: data.mlTasks,
      metrics: data.metrics,
      mlMetrics: data.mlMetrics,
      mlCrossCut: data.mlCrosscut,
      cookbook: data.cookbook
    };
    return JSON.stringify(snapshot, null, 2);
  };
  /**
   * Returns a copy of `set` with `val` flipped: removed when present, added when absent.
   * The input set is never mutated.
   * @param {Set<*>} set - Source set.
   * @param {*} val - Member to toggle.
   * @returns {Set<*>} New set.
   */
  const withToggle = (set, val) => {
    const flipped = new Set(set);
    // Set#delete reports whether the value was present, so add it only when it was not.
    const wasPresent = flipped.delete(val);
    if (!wasPresent) flipped.add(val);
    return flipped;
  };
  /**
   * Builds the initial guide state: Agentic view, the "Q&A (RAG)" use case
   * selected, every objective from the dataset enabled, "recCon" rating filter,
   * no search, no type/provider filters, and no open drawer.
   * @param {{objectives: ?Array<{name: string}>}} data - Guide dataset.
   * @returns {object} Fresh state object.
   */
  const makeInitialState = data => {
    const objectiveNames = (data.objectives || []).map(objective => objective.name);
    return {
      modelType: "Agentic",
      useCases: new Set(["Q&A (RAG)"]),
      mlTasks: new Set(),
      objectives: new Set(objectiveNames),
      rating: "recCon",
      query: "",
      implementations: new Set(),
      hostings: new Set(),
      drawer: null
    };
  };
  /**
   * State reducer for the guide. Every recognized action returns a new state
   * object; unrecognized action types return the same state object.
   *
   * Switching model type or selecting a use case / ML task also closes the
   * drawer. CLEAR_SELECTION empties `mlTasks` in the ML view and `useCases`
   * otherwise.
   *
   * @param {object} state - Current guide state.
   * @param {{type: string}} action - Action with type-specific payload fields.
   * @returns {object} Next state.
   */
  const reducer = (state, action) => {
    // Each case only describes the fields it changes; they are merged over the old state below.
    let changes;
    switch (action.type) {
      case "SET_MODEL_TYPE":
        changes = { modelType: action.modelType, drawer: null };
        break;
      case "SELECT_USE_CASE":
        changes = { useCases: new Set([action.uc]), drawer: null };
        break;
      case "SELECT_ML_TASK":
        changes = { mlTasks: new Set([action.task]), drawer: null };
        break;
      case "CLEAR_SELECTION":
        changes = state.modelType === "ML" ? { mlTasks: new Set() } : { useCases: new Set() };
        break;
      case "TOGGLE_OBJECTIVE":
        changes = { objectives: withToggle(state.objectives, action.name) };
        break;
      case "CLEAR_OBJECTIVES":
        changes = { objectives: new Set() };
        break;
      case "RESET_OBJECTIVES":
        changes = { objectives: new Set(action.all) };
        break;
      case "SET_RATING":
        changes = { rating: action.rating };
        break;
      case "SET_QUERY":
        changes = { query: action.query };
        break;
      case "TOGGLE_IMPL":
        changes = { implementations: withToggle(state.implementations, action.val) };
        break;
      case "CLEAR_IMPL":
        changes = { implementations: new Set() };
        break;
      case "TOGGLE_HOSTING":
        changes = { hostings: withToggle(state.hostings, action.val) };
        break;
      case "CLEAR_HOSTING":
        changes = { hostings: new Set() };
        break;
      case "OPEN_DRAWER":
        changes = { drawer: action.drawer };
        break;
      case "CLOSE_DRAWER":
        changes = { drawer: null };
        break;
      default:
        return state;
    }
    return { ...state, ...changes };
  };
  // Option descriptors for the model-type choices. `val` holds the strings that the rest of
  // this component compares against `state.modelType` ("Agentic", "LLM", "ML", "Guardrails");
  // `label` is the display text. Only the Guardrails entry carries a `guardrails` flag and a `tip`.
  const MODEL_SEG = [{
    val: "Agentic",
    label: "Gen AI App"
  }, {
    val: "LLM",
    label: "LLM Model"
  }, {
    val: "ML",
    label: "Predictive Model"
  }, {
    val: "Guardrails",
    label: "🛡 Guardrails",
    guardrails: true,
    tip: "Fiddler Guardrails block unsafe inputs and outputs in real time, on top of monitoring. Switch to this view to see only the evaluators that can also run as a Guardrail. Use case and Rating filters are disabled in this view."
  }];
  // Option descriptors for the rating filter. `val` holds the `state.rating` values
  // ("rec", "recCon", "all"); `label` is the display text and `tip` the explanatory tooltip.
  const RATING_SEG = [{
    val: "rec",
    label: "Recommended only",
    tip: "Only metrics rated Recommended for the selected use case(s) — Fiddler's strongest endorsement. The fastest starter set."
  }, {
    val: "recCon",
    label: "Recommended + Consider",
    tip: "Metrics rated Recommended or Consider for the selected use case(s) — the core set plus add-ons worth thinking about."
  }, {
    val: "all",
    label: "All incl. Optional",
    tip: "Every metric in the matrix, including those rated Optional for the selected use case(s). For full inventory or exploration."
  }];
  const Badge = ({rating, large}) => <span className={`badge ${ratingClass(rating)}${large ? " badge-large" : ""}`}>{rating}</span>;
  const Paragraphs = ({text}) => {
    if (!text) return null;
    return String(text).split(/\n\n+/).map((para, i) => <p key={i}>{para}</p>);
  };
  const CookbookTable = ({table}) => {
    if (!table || !Array.isArray(table.headers) || !Array.isArray(table.rows)) return null;
    return <>
      {table.caption ? <h3>{table.caption}</h3> : null}
      <table>
        <thead>
          <tr>
            {table.headers.map((h, i) => <th key={i}>{h}</th>)}
          </tr>
        </thead>
        <tbody>
          {table.rows.map((row, ri) => <tr key={ri}>
              {row.map((c, ci) => <td key={ci}>{c}</td>)}
            </tr>)}
        </tbody>
      </table>
    </>;
  };
  const MetricCard = ({m, data, state, dispatch}) => {
    const cls = ratingClass(m.bestRating);
    const cbHits = cookbookCoverageFor(state, data, m.name);
    const bucket = m.implementationType;
    const openDetail = () => dispatch({
      type: "OPEN_DRAWER",
      drawer: {
        view: "metric",
        metric: m
      }
    });
    const onKeyDown = e => {
      if (e.key === "Enter" || e.key === " ") {
        e.preventDefault();
        openDetail();
      }
    };
    return <div className={`card ${cls} clickable`} role="button" tabIndex={0} aria-label={`${m.name} — open full details`} onClick={openDetail} onKeyDown={onKeyDown}>
      <span className="card-open-icon" aria-hidden="true">
        ↗
      </span>
      {m.bestRating ? <div className="card-rating">
          <Badge rating={m.bestRating} />
        </div> : null}
      <div className="card-head">
        <div className="metric-name">{m.name}</div>
      </div>
      {m.q ? <div className="metric-q">{m.q}</div> : null}
      <div className="chip-row">
        <span className="chip" title={chipTypeTip(data, m)} data-tip={chipTypeTip(data, m)}>
          <b>Type:</b> {m.type || "Metric"}
          {bucket ? ` (${bucket})` : ""}
        </span>
        {m.provider && m.provider !== "N/A" ? <span className="chip" title={chipProviderTip(data, m.provider)} data-tip={chipProviderTip(data, m.provider)}>
            <b>Provider:</b> {m.provider}
          </span> : null}
        {cbHits.length ? <button type="button" className="chip ck cookbook-chip" title="Open the Bias & Accuracy Cookbook entry — what this metric measures, what value it provides, and (for Q&amp;A RAG) a worked customer scenario." onClick={e => {
      e.stopPropagation();
      dispatch({
        type: "OPEN_DRAWER",
        drawer: {
          view: "cookbookMetric",
          metricName: m.name,
          useCases: cbHits
        }
      });
    }}>
            📖 in bias cookbook
          </button> : null}
        {m.guardrails ? <span className="chip guardrail-chip" title="This evaluator can run as a Fiddler Guardrail — used to block unsafe inputs/outputs in real time, in addition to monitoring.">
            🛡 Guardrails Available
          </span> : null}
      </div>
    </div>;
  };
  const ObjectiveSection = ({objective, items, data, state, dispatch}) => <section className="obj-section">
    <div className="obj-header">
      <div>
        <div className="obj-title">{objective.name} Metrics</div>
        <div className="obj-q">{objective.question}</div>
      </div>
      <div className="obj-counts">
        {items.length} metric{items.length === 1 ? "" : "s"}
      </div>
    </div>
    <div className="cards">
      {items.map(m => <MetricCard key={m.name} m={m} data={data} state={state} dispatch={dispatch} />)}
    </div>
  </section>;
  const FilterPills = ({state, data, dispatch, expanded, setExpanded}) => {
    const isLLMOrAgentic = state.modelType !== "ML";
    const opts = isLLMOrAgentic ? data.useCases || [] : data.mlTasks || [];
    const sel = isLLMOrAgentic ? state.useCases : state.mlTasks;
    const selectedName = sel.size > 0 ? Array.from(sel)[0] : null;
    const selectedIndex = selectedName ? opts.indexOf(selectedName) : -1;
    const needsExpandForSelection = selectedIndex >= PILLS_VISIBLE;
    const shouldOverflow = opts.length > PILLS_VISIBLE && !expanded && !needsExpandForSelection;
    const visibleOpts = shouldOverflow ? opts.slice(0, PILLS_VISIBLE) : opts;
    const hiddenCount = opts.length - PILLS_VISIBLE;
    const onSelect = name => {
      if (sel.has(name)) return;
      dispatch(isLLMOrAgentic ? {
        type: "SELECT_USE_CASE",
        uc: name
      } : {
        type: "SELECT_ML_TASK",
        task: name
      });
    };
    return <div className="pills">
      {visibleOpts.map(name => {
      const active = sel.has(name);
      const tip = isLLMOrAgentic ? (data.useCaseDescriptions || ({}))[name] : undefined;
      return <label key={name} className={"pill" + (active ? " active" : "")} title={tip} data-tip={tip} onClick={e => {
        e.preventDefault();
        onSelect(name);
      }}>
            <input type="radio" name="useCasePill" checked={active} readOnly /> {name}
          </label>;
    })}
      {shouldOverflow ? <button type="button" className="pill pill-more" title={`Show ${hiddenCount} more option${hiddenCount === 1 ? "" : "s"}`} onClick={() => setExpanded(true)}>
          + {hiddenCount} more
        </button> : null}
    </div>;
  };
  const MultiSelect = useMemo(() => ({label, tip, options, selectedSet, displayFn, onToggle, onClear}) => {
    const [open, setOpen] = useState(false);
    const [wrapEl, setWrapEl] = useState(null);
    useEffect(() => {
      if (!open) return undefined;
      const onDoc = e => {
        if (wrapEl && !wrapEl.contains(e.target)) setOpen(false);
      };
      const onKey = e => {
        if (e.key === "Escape") setOpen(false);
      };
      document.addEventListener("click", onDoc);
      document.addEventListener("keydown", onKey);
      return () => {
        document.removeEventListener("click", onDoc);
        document.removeEventListener("keydown", onKey);
      };
    }, [open, wrapEl]);
    const n = selectedSet.size;
    const triggerLabel = n === 0 ? "All" : n <= 2 ? Array.from(selectedSet).map(displayFn).join(", ") : `${n} selected`;
    return <div className="ms-cell">
      <div className="group-label" title={tip} data-tip={tip}>
        {label}
      </div>
      <div className="ms-wrap" ref={setWrapEl}>
        <button type="button" className={"ms-trigger" + (n > 0 ? " is-filtered" : "")} aria-haspopup="true" aria-expanded={open} onClick={e => {
      e.stopPropagation();
      setOpen(o => !o);
    }}>
          <span className="ms-trigger-label">{triggerLabel}</span>
          <span className="ms-chev" aria-hidden="true">
            ▾
          </span>
        </button>
        {open ? <div className="ms-popover" role="dialog" aria-label={`${label} filter`} onClick={e => e.stopPropagation()}>
            <div className="ms-popover-options">
              {options.map(val => <label key={val} className="ms-option">
                  <input type="checkbox" checked={selectedSet.has(val)} onChange={() => onToggle(val)} />{" "}
                  <span>{displayFn(val)}</span>
                </label>)}
            </div>
            <div className="ms-popover-foot">
              <button type="button" className="clear-btn" onClick={onClear}>
                Clear
              </button>
            </div>
          </div> : null}
      </div>
    </div>;
  }, []);
  const ActiveChips = ({state, data, dispatch}) => {
    const chips = [];
    if (state.query && state.query.trim()) {
      chips.push({
        label: `Search: "${state.query.trim()}"`,
        kind: "query",
        active: true
      });
    }
    if (state.implementations.size > 0) {
      chips.push({
        label: `Evaluator Type: ${Array.from(state.implementations).join(", ")}`,
        kind: "impl",
        active: true
      });
    } else {
      chips.push({
        label: "Evaluator Type: All",
        kind: "impl",
        active: false
      });
    }
    if (state.hostings.size > 0) {
      chips.push({
        label: `Model Provider: ${Array.from(state.hostings).map(displayHosting).join(", ")}`,
        kind: "hosting",
        active: true
      });
    } else {
      chips.push({
        label: "Model Provider: All",
        kind: "hosting",
        active: false
      });
    }
    const totalObjectives = (data.objectives || []).length;
    if (state.objectives.size < totalObjectives) {
      chips.push({
        label: `Objective: ${state.objectives.size} of ${totalObjectives}`,
        kind: "objective",
        active: true
      });
    }
    const clearHiddenFilter = kind => {
      if (kind === "query") dispatch({
        type: "SET_QUERY",
        query: ""
      }); else if (kind === "impl") dispatch({
        type: "CLEAR_IMPL"
      }); else if (kind === "hosting") dispatch({
        type: "CLEAR_HOSTING"
      }); else if (kind === "objective") dispatch({
        type: "RESET_OBJECTIVES",
        all: (data.objectives || []).map(o => o.name)
      });
    };
    return <div className="active-chips">
      {chips.map((c, i) => <span key={i} className={"active-chip" + (c.active ? "" : " is-default")}>
          <span className="active-chip-label">{c.label}</span>
          {c.active ? <button type="button" className="active-chip-clear" aria-label={`Clear ${c.kind} filter`} onClick={() => clearHiddenFilter(c.kind)}>
              ×
            </button> : null}
        </span>)}
    </div>;
  };
  const CookbookRow = ({state, data, dispatch}) => {
    if (state.modelType === "ML" || isGuardrailsView(state) || isSearchActive(state)) return null;
    const selected = Array.from(state.useCases);
    if (selected.length === 0) return null;
    const callouts = [];
    selected.forEach(uc => {
      const cb = data.cookbook && data.cookbook.byUseCase ? data.cookbook.byUseCase[uc] : null;
      if (cb) {
        const we = cb.workedExample;
        callouts.push(<div className="cookbook-callout" key={`bias-${uc}`}>
          <div className="ck-heading">
            <span className="ck-cookbook">Bias & Accuracy</span>{" "}
            {we ? we.title : "Bias detection strategy"}{" "}
            <button type="button" className="ck-link" onClick={() => dispatch({
          type: "OPEN_DRAWER",
          drawer: {
            view: "useCase",
            uc
          }
        })}>
              {we ? "Open worked example →" : "Open strategy →"}
            </button>
          </div>
        </div>);
      }
      getDocsCookbooksFor(data, uc, state.modelType).forEach(doc => {
        const label = doc.content && doc.content.workedExample ? "Open worked example →" : "Open snippet →";
        callouts.push(<div className="cookbook-callout" key={`${doc.id}-${uc}`}>
          <div className="ck-heading">
            <span className="ck-cookbook">{doc.name}</span> {doc.teaser}{" "}
            <button type="button" className="ck-link" onClick={() => dispatch({
          type: "OPEN_DRAWER",
          drawer: {
            view: "docsCookbook",
            cookbookId: doc.id,
            uc
          }
        })}>
              {label}
            </button>
          </div>
        </div>);
      });
    });
    if (callouts.length === 0) return null;
    return <section className="cookbook-row-section">
      <div className="ck-section-label">📖 Relevant Cookbook Snippets</div>
      <div className="cookbook-row-cards">{callouts}</div>
    </section>;
  };
  const SideColumn = ({dispatch}) => <div className="cookbook-column">
    <div className="ck-resource">
      <div className="ck-section-label">📖 Cookbooks</div>
      <button type="button" className="cookbook-ref-card" onClick={() => dispatch({
    type: "OPEN_DRAWER",
    drawer: {
      view: "browseCookbooks"
    }
  })}>
        <div className="crc-heading">Browse Cookbooks</div>
        <div className="crc-teaser">
          Explore Fiddler cookbooks for proven recipes to ensure your models deliver reliable, fair,
          high-quality outputs.
        </div>
        <div className="crc-link">Open →</div>
      </button>
    </div>

    <div className="ck-resource">
      <div className="ck-section-label">💬 Companion Tool</div>
      <button type="button" className="tool-card" onClick={() => dispatch({
    type: "OPEN_DRAWER",
    drawer: {
      view: "agentTools"
    }
  })}>
        <div className="tc-heading">Use Your Own Agent</div>
        <div className="tc-teaser">
          Copy as context for Claude or any agent. MCP support coming soon.
        </div>
        <div className="tc-link">See options →</div>
      </button>
    </div>
  </div>;
  const DrawerHead = ({eyebrow, title, onClose}) => <div className="drawer-head">
    <div>
      <div className="ck-eyebrow">{eyebrow}</div>
      <h2>{title}</h2>
    </div>
    <button type="button" className="close-btn" data-action="close" aria-label="Close" onClick={onClose}>
      ×
    </button>
  </div>;
  const AgentTools = ({state, data}) => {
    const [toast, setToast] = useState("");
    const copy = async () => {
      const markdown = buildAgentContextMarkdown(state, data);
      try {
        await navigator.clipboard.writeText(markdown);
        setToast("Copied to clipboard");
      } catch (err) {
        const ta = document.createElement("textarea");
        ta.value = markdown;
        ta.style.position = "fixed";
        ta.style.opacity = "0";
        document.body.appendChild(ta);
        ta.select();
        try {
          document.execCommand("copy");
          setToast("Copied to clipboard");
        } catch (e2) {
          setToast("Copy failed — select and copy manually");
        }
        document.body.removeChild(ta);
      }
    };
    const download = () => {
      const json = buildFullDatasetJSON(data, new Date().toISOString());
      const blob = new Blob([json], {
        type: "application/json"
      });
      const url = URL.createObjectURL(blob);
      const a = document.createElement("a");
      a.href = url;
      a.download = "fiddler_metric_selection.json";
      document.body.appendChild(a);
      a.click();
      document.body.removeChild(a);
      URL.revokeObjectURL(url);
      setToast("Downloaded fiddler_metric_selection.json");
    };
    return <div className="drawer-body">
      <p className="agent-intro">
        Bring this guide's content (and your current selections) into Claude, ChatGPT, Cursor, Claude
        Code, or your own agent. Both options below are self-contained — no API calls back to Fiddler.
      </p>
      <div className="agent-actions">
        <button type="button" className="primary" onClick={copy}>
          📋 Copy current selection as context
        </button>
        <button type="button" onClick={download}>
          ⬇ Download full dataset (JSON)
        </button>
      </div>
      {toast ? <p className="copy-toast">{toast}</p> : null}
    </div>;
  };
  const MetricDetailBody = ({m, data, state, dispatch}) => {
    const isLLMOrAgenticView = state.modelType !== "ML";
    const useCaseList = isLLMOrAgenticView ? data.useCases || [] : data.mlTasks || [];
    const selectedUcs = isLLMOrAgenticView ? state.useCases : state.mlTasks;
    const cbHits = cookbookCoverageFor(state, data, m.name);
    const bucket = m.implementationType;
    const sections = isLLMOrAgenticView ? useCaseList.filter(uc => selectedUcs.has(uc)).map(uc => {
      const cell = m.uc && m.uc[uc];
      if (!cell) {
        return <section className="md-section md-uc-section" key={uc}>
                <h3>
                  <span className="md-uc-label">Use case:</span> {uc}
                </h3>
                <div className="md-uc-rating-display">
                  <span className="md-uc-label">Rating:</span>{" "}
                  <span className="badge badge-large muted">No rating</span>
                </div>
                <p className="md-uc-empty">No rating in the selection sheet for this use case.</p>
              </section>;
      }
      const [rating, why] = cell;
      return <section className="md-section md-uc-section" key={uc}>
              <h3>
                <span className="md-uc-label">Use case:</span> {uc}
              </h3>
              <div className="md-uc-rating-display">
                <span className="md-uc-label">Rating:</span> <Badge rating={rating} large />
              </div>
              {why ? <Paragraphs text={why} /> : <p className="md-uc-empty">No rationale captured.</p>}
            </section>;
    }) : (m.perCase || []).map((c, i) => <section className="md-section md-uc-section" key={i}>
          <h3>
            <span className="md-uc-label">Use case:</span> {c.uc}
          </h3>
          <div className="md-uc-rating-display">
            <span className="md-uc-label">Rating:</span> <Badge rating={c.rating} large />
          </div>
          {c.why ? <p>{c.why}</p> : null}
        </section>);
    return <div className="drawer-body">
      {m.q ? <p className="md-question">{m.q}</p> : null}
      <div className="chip-row md-chips">
        <span className="chip" title={chipTypeTip(data, m)} data-tip={chipTypeTip(data, m)}>
          <b>Type:</b> {m.type || "Metric"}
          {bucket ? ` (${bucket})` : ""}
        </span>
        {m.provider && m.provider !== "N/A" ? <span className="chip" title={chipProviderTip(data, m.provider)} data-tip={chipProviderTip(data, m.provider)}>
            <b>Provider:</b> {m.provider}
          </span> : null}
        {m.guardrails ? <span className="chip guardrail-chip">🛡 Guardrails Available</span> : null}
        {cbHits.length ? <button type="button" className="chip ck cookbook-chip md-cookbook-chip" onClick={() => dispatch({
      type: "OPEN_DRAWER",
      drawer: {
        view: "cookbookMetric",
        metricName: m.name,
        useCases: cbHits
      }
    })}>
            📖 in bias cookbook
          </button> : null}
      </div>
      {sections}
      {m.docsUrl ? <section className="md-section md-docs">
          <a className="docs-link" href={m.docsUrl} target="_blank" rel="noopener noreferrer">
            📄 Read the official Fiddler docs ↗
          </a>
        </section> : null}
    </div>;
  };
  const CookbookMetricBody = ({metricName, useCases, data, dispatch}) => <div className="drawer-body">
    <p>
      How the <b>{metricName}</b> evaluator shows up in the Bias & Accuracy Cookbook for your
      selected use case{useCases.length > 1 ? "s" : ""}.
    </p>
    {useCases.map(uc => {
    const ev = cookbookEvaluatorFor(data, uc, metricName);
    if (!ev) return null;
    return <div className="metric-cookbook-card" key={uc}>
          <span className="uc-tag">{uc}</span>
          <dl>
            <dt>What it measures</dt>
            <dd>{ev.measures}</dd>
            <dt>What value it provides</dt>
            <dd>{ev.value}</dd>
          </dl>
          <button type="button" className="view-uc-btn" onClick={() => dispatch({
      type: "OPEN_DRAWER",
      drawer: {
        view: "useCase",
        uc
      }
    })}>
            View full {uc} cookbook entry →
          </button>
        </div>;
  })}
  </div>;
  const UseCaseBody = ({uc, data}) => {
    const cb = data.cookbook && data.cookbook.byUseCase ? data.cookbook.byUseCase[uc] : null;
    if (!cb) return <div className="drawer-body" />;
    const we = cb.workedExample;
    return <div className="drawer-body">
      {we ? <>
          <h3>Scenario</h3>
          <p>{we.scenario}</p>
          <p>
            <b>Evaluators applied:</b>{" "}
            {Array.isArray(we.evaluatorsApplied) ? we.evaluatorsApplied.map(e => <span className="pill-lite" key={e}>
                    {e}
                  </span>) : null}
          </p>
          <p>
            <b>Tags:</b>{" "}
            {Array.isArray(we.tags) ? we.tags.map(t => <span className="pill-lite" key={t}>
                    {t}
                  </span>) : null}
          </p>
          <CookbookTable table={we.beforeTable} />
          <h3>Findings</h3>
          <div className="findings-row">
            {Array.isArray(we.findings) ? we.findings.map(f => <span className="finding-pill" key={f}>
                    {f}
                  </span>) : null}
          </div>
          <h3>Root cause investigation</h3>
          <ul>
            {Array.isArray(we.rootCause) ? we.rootCause.map(r => <li key={r}>{r}</li>) : null}
          </ul>
          <h3>Remediation plan</h3>
          {Array.isArray(we.remediation) ? we.remediation.map((r, i) => <div className="remediation-row" key={i}>
                  <div className="when">{r.when}</div>
                  <div className="action">{r.action}</div>
                </div>) : null}
          <CookbookTable table={we.afterTable} />
          <div className="findings-row">
            <span className="improved-pill">{we.result}</span>
          </div>
        </> : null}

      {cb.evaluators && cb.evaluators.length > 0 ? <>
          <h3>{we ? "Recommended evaluators (cookbook)" : "Recommended evaluators"}</h3>
          {cb.evaluators.map(ev => <div className="metric-cookbook-card" key={ev.name}>
              <div style={{
      fontWeight: 600
    }}>
                {ev.name}
                {ev.cookbookOnly ? <span className="pill-lite"> cookbook-only</span> : null}
              </div>
              <dl>
                <dt>What it measures</dt>
                <dd>{ev.measures}</dd>
                <dt>What value it provides</dt>
                <dd>{ev.value}</dd>
              </dl>
            </div>)}
        </> : null}

      {cb.strategyNote ? <p style={{
      fontStyle: "italic"
    }}>{cb.strategyNote}</p> : null}
      {cb.note ? <p style={{
      fontStyle: "italic"
    }}>{cb.note}</p> : null}

      {cb.biasStrategy && Array.isArray(cb.biasStrategy.bullets) ? <>
          <h3>Bias detection strategy</h3>
          <ul>
            {cb.biasStrategy.bullets.map(b => <li key={b}>{b}</li>)}
          </ul>
          {cb.biasStrategy.example ? <p>
              <i>Example:</i> {cb.biasStrategy.example}
            </p> : null}
        </> : null}
    </div>;
  };
  const DocsCookbookBody = ({doc}) => {
    const c = doc.content;
    return <div className="drawer-body">
      {c.intro ? <p>{c.intro}</p> : null}

      {c.workedExample ? <>
          <h3>Worked example — {c.workedExample.title}</h3>
          {c.workedExample.scenario ? <p>{c.workedExample.scenario}</p> : null}
          {Array.isArray(c.workedExample.evaluatorsApplied) && c.workedExample.evaluatorsApplied.length ? <p>
              <b>Evaluators applied:</b>{" "}
              {c.workedExample.evaluatorsApplied.map(e => <span className="pill-lite" key={e}>
                  {e}
                </span>)}
            </p> : null}
          {c.workedExample.table ? <CookbookTable table={c.workedExample.table} /> : null}
          {c.workedExample.takeaway ? <div className="findings-row">
              <span className="improved-pill">{c.workedExample.takeaway}</span>
            </div> : null}
        </> : null}

      {Array.isArray(c.evaluators) && c.evaluators.length ? <>
          <h3>Evaluators this cookbook teaches</h3>
          {c.evaluators.map(ev => <div className="metric-cookbook-card" key={ev.name}>
              <div style={{
      fontWeight: 600
    }}>
                {ev.name}
                {ev.scoring ? <span className="pill-lite"> {ev.scoring}</span> : null}
              </div>
              <dl>
                <dt>What it measures</dt>
                <dd>{ev.measures}</dd>
                <dt>What value it provides</dt>
                <dd>{ev.value}</dd>
              </dl>
            </div>)}
        </> : null}

      {c.strategy && Array.isArray(c.strategy.bullets) ? <>
          <h3>{c.strategy.title || "Recommended approach"}</h3>
          <ul>
            {c.strategy.bullets.map(b => <li key={b}>{b}</li>)}
          </ul>
        </> : null}

      {Array.isArray(c.apiSurface) && c.apiSurface.length ? <>
          <h3>API surface</h3>
          <ul>
            {c.apiSurface.map(a => <li key={a}>
                <code>{a}</code>
              </li>)}
          </ul>
        </> : null}

      {Array.isArray(c.prerequisites) && c.prerequisites.length ? <>
          <h3>Prerequisites</h3>
          <ul>
            {c.prerequisites.map(p => <li key={p}>{p}</li>)}
          </ul>
        </> : null}

      {c.completionTimeMin ? <p style={{
      fontStyle: "italic"
    }}>Approx. completion time: {c.completionTimeMin} min.</p> : null}

      {doc.url && doc.url !== "#" ? <p>
          <a className="ck-link" href={doc.url} target="_blank" rel="noopener noreferrer">
            Read the full cookbook ↗
          </a>
        </p> : null}
    </div>;
  };
  const BrowseCookbooksBody = ({data}) => {
    const docs = data.cookbookDocs || [];
    const intro = docs.length > 1 ? "Browse the full source cookbook docs below — one section per cookbook." : "Browse the full source cookbook doc below.";
    return <div className="drawer-body">
      <p className="agent-intro">{intro}</p>
      <div className="cookbook-ref-list">
        {docs.map(doc => {
      const isPlaceholder = !doc.url || doc.url === "#";
      return <section className="cookbook-ref-section" key={doc.id}>
              <h3>{doc.name}</h3>
              <p>{doc.description}</p>
              {isPlaceholder ? <p className="placeholder-note">
                  Link coming soon — published URL will be wired in here.
                </p> : <a className="ck-link" href={doc.url} target="_blank" rel="noopener noreferrer">
                  Read the cookbook ↗
                </a>}
            </section>;
    })}
      </div>
    </div>;
  };
  const resolveDrawer = (drawer, data, state, dispatch) => {
    if (drawer.view === "metric") {
      const m = drawer.metric;
      return {
        eyebrow: `${m.objective} Metrics`,
        title: m.name,
        body: <MetricDetailBody m={m} data={data} state={state} dispatch={dispatch} />
      };
    }
    if (drawer.view === "cookbookMetric") {
      return {
        eyebrow: "Bias & Accuracy Cookbook",
        title: drawer.metricName,
        body: <CookbookMetricBody metricName={drawer.metricName} useCases={drawer.useCases} data={data} dispatch={dispatch} />
      };
    }
    if (drawer.view === "useCase") {
      const cb = data.cookbook && data.cookbook.byUseCase ? data.cookbook.byUseCase[drawer.uc] : null;
      return {
        eyebrow: `Bias & Accuracy Cookbook · ${drawer.uc}`,
        title: cb && cb.workedExample ? cb.workedExample.title : drawer.uc,
        body: <UseCaseBody uc={drawer.uc} data={data} />
      };
    }
    if (drawer.view === "docsCookbook") {
      const doc = (data.cookbookDocs || []).find(d => d.id === drawer.cookbookId);
      if (!doc || !doc.content) {
        return {
          eyebrow: "Cookbooks",
          title: "Browse Cookbooks",
          body: <BrowseCookbooksBody data={data} />
        };
      }
      return {
        eyebrow: `${doc.name} · ${drawer.uc}`,
        title: doc.content.workedExample ? doc.content.workedExample.title : doc.teaser,
        body: <DocsCookbookBody doc={doc} />
      };
    }
    if (drawer.view === "browseCookbooks") {
      return {
        eyebrow: "Cookbooks",
        title: "Browse Cookbooks",
        body: <BrowseCookbooksBody data={data} />
      };
    }
    if (drawer.view === "agentTools") {
      return {
        eyebrow: "Companion Tools",
        title: "Use Your Own Agent",
        body: <AgentTools state={state} data={data} />
      };
    }
    return {
      eyebrow: "",
      title: "",
      body: <div className="drawer-body" />
    };
  };
  const Drawer = useMemo(() => ({drawer, data, state, dispatch}) => {
    const [panelEl, setPanelEl] = useState(null);
    const close = () => dispatch({
      type: "CLOSE_DRAWER"
    });
    useEffect(() => {
      const previouslyFocused = document.activeElement;
      const prevOverflow = document.body.style.overflow;
      document.body.style.overflow = "hidden";
      return () => {
        document.body.style.overflow = prevOverflow;
        if (previouslyFocused && typeof previouslyFocused.focus === "function") {
          previouslyFocused.focus();
        }
      };
    }, []);
    useEffect(() => {
      if (!panelEl) return;
      const closeBtn = panelEl.querySelector("[data-action='close']");
      if (closeBtn) closeBtn.focus();
    }, [panelEl, drawer]);
    useEffect(() => {
      const onKey = e => {
        if (e.key === "Escape") {
          dispatch({
            type: "CLOSE_DRAWER"
          });
          return;
        }
        if (e.key !== "Tab" || !panelEl) return;
        const focusable = panelEl.querySelectorAll('a[href], button:not([disabled]), input:not([disabled]), [tabindex]:not([tabindex="-1"])');
        if (!focusable.length) return;
        const first = focusable[0];
        const last = focusable[focusable.length - 1];
        if (e.shiftKey && document.activeElement === first) {
          e.preventDefault();
          last.focus();
        } else if (!e.shiftKey && document.activeElement === last) {
          e.preventDefault();
          first.focus();
        }
      };
      document.addEventListener("keydown", onKey);
      return () => document.removeEventListener("keydown", onKey);
    }, [panelEl, dispatch]);
    const resolved = resolveDrawer(drawer, data, state, dispatch);
    return <>
      <div className="drawer-backdrop open" onClick={close} />
      <aside className="drawer open" role="dialog" aria-modal="true" aria-label={resolved.title} ref={setPanelEl} onClick={e => e.stopPropagation()}>
        <div className="drawer-content">
          <DrawerHead eyebrow={resolved.eyebrow} title={resolved.title} onClose={close} />
          {resolved.body}
        </div>
      </aside>
    </>;
  }, []);
  // Normalised `data` prop: a bare array is wrapped as { metrics: [...] }, a missing value
  // becomes an empty object, anything else is used as-is. Recomputed only when `data` changes.
  const D = useMemo(() => {
    if (Array.isArray(data)) {
      return {
        metrics: data
      };
    }
    return data || ({});
  }, [data]);
  // Filter/drawer state lives in a reducer seeded from the normalised data.
  const [state, dispatch] = useReducer(reducer, D, makeInitialState);
  // Local UI toggles: expanded pill list, "more filters" panel, objective pill panel.
  const [pillsExpanded, setPillsExpanded] = useState(false);
  const [moreOpen, setMoreOpen] = useState(false);
  const [objOpen, setObjOpen] = useState(false);
  // Reset the pill list to collapsed whenever the selected model type changes.
  useEffect(() => {
    setPillsExpanded(false);
  }, [state.modelType]);
  // Derived result set for the current filters. Returns the filtered rows, the same rows
  // grouped by objective name (each group sorted by rating rank, then by name), and a
  // per-rating tally. Recomputed when `state` or `D` changes.
  const view = useMemo(() => {
    const isLLMOrAgentic = state.modelType !== "ML";
    const rows = isLLMOrAgentic ? getEvaluatorRows(state, D) : getMLRows(state, D);
    const searching = isSearchActive(state);
    const rShow = visibleRatings(state);

    const filtered = rows.filter(row => {
      // An active search bypasses every other filter; only the name match applies.
      if (!searching) {
        if (!state.objectives.has(row.objective)) return false;
        // Rating, implementation and hosting filters apply to non-ML rows only.
        if (isLLMOrAgentic) {
          const passesEvaluatorFilters = rShow.has(row.bestRating) && matchImplementation(state, row) && matchHosting(state, row);
          if (!passesEvaluatorFilters) return false;
        }
      }
      return matchSearch(state, row.name);
    });

    // One bucket per known objective; rows whose objective is not listed are dropped
    // from the grouping (they still count towards `filtered`).
    const byObj = {};
    (D.objectives || []).forEach(objective => {
      byObj[objective.name] = [];
    });
    filtered.forEach(row => {
      const bucket = byObj[row.objective];
      if (bucket) bucket.push(row);
    });
    Object.values(byObj).forEach(bucket => {
      bucket.sort((left, right) => {
        const rankDelta = ratingRank(left.bestRating) - ratingRank(right.bestRating);
        return rankDelta !== 0 ? rankDelta : left.name.localeCompare(right.name);
      });
    });

    const counts = {
      Recommended: 0,
      Consider: 0,
      Optional: 0
    };
    for (const row of filtered) {
      if (row.bestRating) counts[row.bestRating] += 1;
    }

    return {
      isLLMOrAgentic,
      searching,
      filtered,
      byObj,
      counts
    };
  }, [state, D]);
  const guardrails = isGuardrailsView(state);
  const isML = state.modelType === "ML";
  const objectives = D.objectives || [];
  const totalObjectives = objectives.length;
  const activeSelectionSet = isML ? state.mlTasks : state.useCases;
  const heading = (() => {
    if (view.searching) return <>
          Search results for <span className="context-bit">"{state.query.trim()}"</span>
        </>;
    if (guardrails) return <>Guardrails Available Evaluators</>;
    const mt = (D.modelTypeDisplay || ({}))[state.modelType] || state.modelType;
    const selected = Array.from(activeSelectionSet);
    let suffix;
    if (selected.length === 0) suffix = mt; else if (selected.length <= 2) suffix = `${mt} · ${selected.join(", ")}`; else suffix = `${mt} · ${selected.length} ${isML ? "task types" : "use cases"}`;
    return <>
        Recommended Evaluators & Metrics for <span className="context-bit">{suffix}</span>
      </>;
  })();
  const summary = (() => {
    const n = view.filtered.length;
    if (view.searching) return <>
          <b>{n}</b> metric{n === 1 ? "" : "s"} matching <b>"{state.query.trim()}"</b>
        </>;
    if (guardrails) return <>
          <b>{n}</b> Guardrails Available evaluator{n === 1 ? "" : "s"}
        </>;
    return <>
        <b>{n}</b> metrics — <b className="rec">{view.counts.Recommended}</b> recommended,{" "}
        <b className="con">{view.counts.Consider}</b> consider,{" "}
        <b className="opt">{view.counts.Optional}</b> optional
      </>;
  })();
  const emptyState = (() => {
    if (guardrails) return <div className="empty">
          <b>No Guardrails Available evaluators match the current objectives or search.</b>
          <br />
          Try widening the objective filter or clearing the search.
        </div>;
    const selectionEmpty = view.isLLMOrAgentic ? state.useCases.size === 0 : state.mlTasks.size === 0;
    if (selectionEmpty) {
      const label = view.isLLMOrAgentic ? "use case" : "predictive task type";
      return <div className="empty">
          <b>Pick a {label} above to see the relevant Fiddler metrics & evaluators.</b>
          <br />
          Each {label} maps to a tailored set of recommended, consider, and optional metrics.
        </div>;
    }
    return <div className="empty">
        <b>No metrics match these filters.</b>
        <br />
        Try widening the rating, objective, implementation, or hosting filters.
      </div>;
  })();
  const filtersClass = ["filters", guardrails ? "guardrails-only" : "", isML ? "ml-mode" : "", view.searching ? "search-active" : ""].filter(Boolean).join(" ");
  const moreFiltersActive = view.searching || state.implementations.size > 0 || state.hostings.size > 0 || state.objectives.size < totalObjectives;
  return <div className="fdl-eval-guide">
      <div className="main-layout">
        <div className="main-column">
          <section className={filtersClass}>
            <div className="filter-row">
              <div>
                <div className="group-label">Observability Type</div>
                <div className="group-desc">What you're observing with Fiddler</div>
                <div className="seg" role="tablist" aria-label="Observability type">
                  {MODEL_SEG.flatMap(seg => {
    const btn = <button type="button" key={seg.val} className={(state.modelType === seg.val ? "active" : "") + (seg.guardrails ? " seg-guardrails" : "")} title={seg.tip} data-tip={seg.tip} aria-pressed={state.modelType === seg.val} onClick={() => dispatch({
      type: "SET_MODEL_TYPE",
      modelType: seg.val
    })}>
                        {seg.label}
                      </button>;
    return seg.guardrails ? [<span key={`${seg.val}-div`} className="seg-divider" aria-hidden="true" />, btn] : [btn];
  })}
                </div>
              </div>

              {!guardrails ? <div className="case-filter-cell">
                  <div className="label-row">
                    <div className="group-label">{isML ? "Predictive Task Type" : "Use case"}</div>
                    {activeSelectionSet.size > 0 ? <button type="button" className="clear-btn" onClick={() => dispatch({
    type: "CLEAR_SELECTION"
  })}>
                        Clear
                      </button> : null}
                  </div>
                  <div className="group-desc">
                    {isML ? "Pick the predictive task type" : "Pick the application's primary use case"}
                  </div>
                  <FilterPills state={state} data={D} dispatch={dispatch} expanded={pillsExpanded} setExpanded={setPillsExpanded} />
                </div> : null}

              {!guardrails && !isML ? <div className="rating-filter-cell">
                  <div className="group-label">Fiddler Evaluator Rating</div>
                  <div className="group-desc">
                    How strongly each metric is recommended by Fiddler
                  </div>
                  <div className="seg" role="tablist" aria-label="Rating filter">
                    {RATING_SEG.map(seg => <button type="button" key={seg.val} className={state.rating === seg.val ? "active" : ""} title={seg.tip} data-tip={seg.tip} aria-pressed={state.rating === seg.val} onClick={() => dispatch({
    type: "SET_RATING",
    rating: seg.val
  })}>
                        {seg.label}
                      </button>)}
                  </div>
                </div> : null}
            </div>

            <div className="more-filters-bar">
              <button type="button" className={"more-filters-toggle" + (moreFiltersActive ? " is-filtered" : "")} aria-expanded={moreOpen} onClick={() => setMoreOpen(o => !o)}>
                <span>{moreOpen ? "Hide more filters" : "Show more filters"}</span>
                <span className="more-filters-toggle-chev" aria-hidden="true">
                  ▾
                </span>
              </button>
              <ActiveChips state={state} data={D} dispatch={dispatch} />
            </div>

            {moreOpen ? <div className="more-filters">
                <div className="objective-footer">
                  <div className="objective-footer-left">
                    <MultiSelect label="Evaluator Type" tip="What powers the evaluator: an LLM judge (pre-built or custom), a non-LLM model (open-source classifier or Fiddler-trained model), or a non-AI heuristic / rule." options={D.implementations || []} selectedSet={state.implementations} displayFn={v => v} onToggle={val => dispatch({
    type: "TOGGLE_IMPL",
    val
  })} onClear={() => dispatch({
    type: "CLEAR_IMPL"
  })} />
                    <MultiSelect label="Model Provider" tip="Which model powers the evaluator at run time. Fiddler Centor evaluators run on a Fiddler-managed model. External lets you point the judge at your own LLM provider (BYO key)." options={D.hostings || []} selectedSet={state.hostings} displayFn={displayHosting} onToggle={val => dispatch({
    type: "TOGGLE_HOSTING",
    val
  })} onClear={() => dispatch({
    type: "CLEAR_HOSTING"
  })} />
                  </div>
                  <div className="objective-footer-right">
                    <div className="objective-toggle-row">
                      <button type="button" className={"objective-toggle" + (state.objectives.size < totalObjectives ? " is-filtered" : "")} aria-expanded={objOpen} onClick={() => setObjOpen(o => !o)}>
                        <span>
                          {state.objectives.size === totalObjectives ? `Filter by observability objective (showing all ${totalObjectives})` : `Filter by observability objective (showing ${state.objectives.size} of ${totalObjectives})`}
                        </span>
                        <span className="objective-toggle-chev" aria-hidden="true">
                          ▾
                        </span>
                      </button>
                      {state.objectives.size > 0 ? <button type="button" className="clear-btn" onClick={() => dispatch({
    type: "CLEAR_OBJECTIVES"
  })}>
                          Clear
                        </button> : null}
                    </div>
                    {objOpen ? <div className="objective-pills-wrap">
                        <div className="pills">
                          {objectives.map(o => {
    const active = state.objectives.has(o.name);
    return <label key={o.name} className={"pill" + (active ? " active" : "")} title={o.question} data-tip={o.question} onClick={e => {
      e.preventDefault();
      dispatch({
        type: "TOGGLE_OBJECTIVE",
        name: o.name
      });
    }}>
                                <input type="checkbox" checked={active} readOnly /> {o.name}
                              </label>;
  })}
                        </div>
                      </div> : null}
                  </div>
                </div>

                <div className="search-col">
                  <div className="group-label">Evaluator Search</div>
                  <div className="group-desc">
                    Find a specific metric or evaluator by name (ignores all other filters)
                  </div>
                  <div className="search">
                    <input type="search" placeholder="Optionally search for metrics by name" value={state.query} onChange={e => dispatch({
    type: "SET_QUERY",
    query: e.target.value
  })} />
                  </div>
                </div>
              </div> : null}
          </section>

          <h2 className="section-heading">
            <span className="heading-text">{heading}</span>
            <div className="summary">{summary}</div>
          </h2>

          <div className="results">
            {view.filtered.length === 0 ? emptyState : objectives.map(o => {
    const items = view.byObj[o.name];
    if (!items || !items.length) return null;
    return <ObjectiveSection key={o.name} objective={o} items={items} data={D} state={state} dispatch={dispatch} />;
  })}
          </div>
        </div>
      </div>

      {}
      <aside className="cookbook-rail" aria-label="Cookbooks and resources">
        <h2 className="section-heading rail-heading">Cookbooks & Resources</h2>
        <CookbookRow state={state} data={D} dispatch={dispatch} />
        <SideColumn dispatch={dispatch} />
      </aside>

      {state.drawer ? <Drawer drawer={state.drawer} data={D} state={state} dispatch={dispatch} /> : null}
    </div>;
};

export const EVAL_GUIDE_DATA = (() => {
  /**
   * Collapses a detailed evaluator-type label into one of three implementation buckets.
   *
   * - missing value or "N/A"                          -> null (no bucket)
   * - "LLM" or any label starting with "LLM "         -> "LLM"
   * - "Non-AI" in any letter case                     -> "non-AI"
   * - anything else                                   -> "non-LLM"
   *
   * The bucket labels themselves ("LLM", "non-LLM", "non-AI") map to themselves, so the
   * function can safely be applied to a value that has already been normalised.
   *
   * @param {string|null|undefined} evaluatorType - Detailed label, e.g. "LLM (Pre-built Judge)".
   * @returns {"LLM"|"non-LLM"|"non-AI"|null} The implementation bucket, or null when not applicable.
   */
  const implementationTypeOf = evaluatorType => {
    if (!evaluatorType || evaluatorType === "N/A") return null;
    // Accept the bare bucket name as well as qualified forms such as "LLM (Custom Judge)".
    if (evaluatorType === "LLM" || evaluatorType.startsWith("LLM ")) return "LLM";
    // Case-insensitive so both the source spelling "Non-AI" and the bucket spelling "non-AI" match.
    if (evaluatorType.toLowerCase() === "non-ai") return "non-AI";
    return "non-LLM";
  };
  /**
   * Lists the provider options mentioned in a provider label.
   *
   * @param {string|null|undefined} provider - Label such as "Fiddler Centor or External".
   * @returns {string[]} Subset of ["Fiddler Centor", "External"], in that order; empty when
   *   the label is missing, "N/A", or mentions neither option.
   */
  const providerOptionsOf = provider => {
    const hasProvider = Boolean(provider) && provider !== "N/A";
    if (!hasProvider) return [];
    return ["Fiddler Centor", "External"].filter(option => provider.includes(option));
  };
  // Observability objectives. Each entry has a short `id`, a display `name`, and the
  // `question` the objective answers. The `name` strings are the values the metric
  // tables below use in their `objective` field.
  // NOTE(review): presumably exposed to the component as `objectives` — confirm against
  // the object this IIFE returns.
  const _OBJECTIVES = [{
    id: "perf",
    name: "Model Performance",
    question: "Is my model accurate and producing quality output?"
  }, {
    id: "safety",
    name: "Model Safety & Trust",
    question: "Is my model producing safe, unbiased, policy-compliant output?"
  }, {
    id: "drift",
    name: "Performance Risk (Data Drift)",
    question: "Are inputs/outputs shifting in ways that may degrade performance?"
  }, {
    id: "domain",
    name: "Performance Risk (Domain Insight)",
    question: "What does my data tell me about how the model is being used?"
  }, {
    id: "ops",
    name: "Operational Health & Efficiency",
    question: "Is my pipeline healthy and cost-efficient?"
  }];
  // Use-case names. The same strings are the keys of _USE_CASE_DESCRIPTIONS below.
  const _AGENTIC_USE_CASES = ["Summarization", "Code Generation", "Content Generation", "Q&A (RAG)", "Chatbot", "Info Extraction", "Classification", "Autonomous Agents"];
  // Display label for each model-type key.
  const _MODEL_TYPE_DISPLAY = {
    Agentic: "Gen AI App",
    LLM: "LLM Model",
    ML: "Predictive Model",
    Guardrails: "Guardrails"
  };
  // One-sentence description per use case, keyed by the names in _AGENTIC_USE_CASES.
  const _USE_CASE_DESCRIPTIONS = {
    "Summarization": "Summarization models condense long-form content into concise summaries while preserving key information and maintaining factual accuracy.",
    "Code Generation": "Code generation models translate natural language descriptions into working code for developer tools or low-code/no-code platforms.",
    "Content Generation": "Content generation models create original written content tailored to specific audiences and purposes, such as marketing copy, blog posts, or social media.",
    "Q&A (RAG)": "RAG systems answer user questions by first retrieving relevant information from a knowledge base, then generating responses grounded in that context.",
    "Chatbot": "Conversational AI agents engage in multi-turn dialogues with users while maintaining conversation context and handling follow-up questions.",
    "Info Extraction": "These models parse unstructured or semi-structured text to identify and extract specific data fields like names, dates, amounts, or custom entities.",
    "Classification": "Classification models categorize text inputs into predefined classes, commonly applied to sentiment analysis, topic categorization, or intent detection.",
    "Autonomous Agents": "These AI systems independently plan, execute multi-step workflows, call external tools or APIs, and make decisions to accomplish complex goals."
  };
  // Predictive task types; the `tasks` arrays in _ML_METRICS use these strings.
  const _ML_TASKS = ["Binary Classification", "Multi-Class Classification", "Regression", "Ranking"];
  // The three non-null buckets implementationTypeOf can return.
  const _IMPLEMENTATIONS = ["LLM", "non-LLM", "non-AI"];
  // The two options providerOptionsOf can emit.
  const _HOSTINGS = ["Fiddler Centor", "External"];
  // Tooltip text per implementation bucket (same keys as _IMPLEMENTATIONS).
  const _EVALUATOR_TYPE_TIPS = {
    "LLM": "Evaluated by an LLM — either a Fiddler-authored prompt (pre-built judge) or one you write yourself (custom judge). Pick the judge model.",
    "non-LLM": "Runs on a non-LLM model — an open-source classifier or Fiddler's own proprietary model (Centor). No LLM API call; in-platform, fast, free.",
    "non-AI": "Deterministic rule or heuristic. No model call; instant and free."
  };
  // Tooltip text keyed by the full provider label. There is no entry for "External" alone.
  const _PROVIDER_TIPS = {
    "Fiddler Centor": "Runs on a Fiddler-managed model — no LLM API key, no per-eval cost.",
    "Fiddler Centor or External": "Use the batteries-included Fiddler-managed model at no extra cost, or swap in your own external model via the Fiddler gateway for more complex tasks (you cover its token costs)."
  };
  // Task-specific metrics for predictive (ML) models. Each entry carries a display
  // `name`, a `docsUrl`, the `tasks` (values from _ML_TASKS) it applies to, the question
  // `q` it answers, and its `objective`. Every entry here belongs to "Model Performance".
  const _ML_METRICS = [{
    name: "Accuracy",
    docsUrl: "/reference/ml-metrics-reference",
    tasks: ["Binary Classification", "Multi-Class Classification"],
    q: "How accurate is my model?",
    objective: "Model Performance"
  }, {
    name: "Precision",
    docsUrl: "/reference/ml-metrics-reference",
    tasks: ["Binary Classification"],
    q: "How accurate is my model?",
    objective: "Model Performance"
  }, {
    name: "Recall / TPR",
    docsUrl: "/reference/ml-metrics-reference",
    tasks: ["Binary Classification"],
    q: "How accurate is my model?",
    objective: "Model Performance"
  }, {
    name: "F1 Score",
    docsUrl: "/reference/ml-metrics-reference",
    tasks: ["Binary Classification"],
    q: "How accurate is my model?",
    objective: "Model Performance"
  }, {
    name: "False Positive Rate",
    docsUrl: "/reference/ml-metrics-reference",
    tasks: ["Binary Classification"],
    q: "How accurate is my model?",
    objective: "Model Performance"
  }, {
    name: "AUROC",
    docsUrl: "/reference/ml-metrics-reference",
    tasks: ["Binary Classification"],
    q: "How accurate is my model?",
    objective: "Model Performance"
  }, {
    name: "Binary Cross Entropy",
    docsUrl: "/reference/ml-metrics-reference",
    tasks: ["Binary Classification"],
    q: "How accurate is my model?",
    objective: "Model Performance"
  }, {
    name: "Geometric Mean",
    docsUrl: "/reference/ml-metrics-reference",
    tasks: ["Binary Classification"],
    q: "How accurate is my model?",
    objective: "Model Performance"
  }, {
    name: "Calibrated Threshold",
    docsUrl: "/reference/ml-metrics-reference",
    tasks: ["Binary Classification"],
    q: "How accurate is my model?",
    objective: "Model Performance"
  }, {
    name: "Expected Calibration Error",
    docsUrl: "/reference/ml-metrics-reference",
    tasks: ["Binary Classification"],
    q: "How accurate is my model?",
    objective: "Model Performance"
  }, {
    name: "Log Loss",
    docsUrl: "/reference/ml-metrics-reference",
    tasks: ["Multi-Class Classification"],
    q: "How accurate is my model?",
    objective: "Model Performance"
  }, {
    name: "R-squared",
    docsUrl: "/reference/ml-metrics-reference",
    tasks: ["Regression"],
    q: "How accurate is my model?",
    objective: "Model Performance"
  }, {
    name: "MSE",
    docsUrl: "/reference/ml-metrics-reference",
    tasks: ["Regression"],
    q: "How accurate is my model?",
    objective: "Model Performance"
  }, {
    name: "MAE",
    docsUrl: "/reference/ml-metrics-reference",
    tasks: ["Regression"],
    q: "How accurate is my model?",
    objective: "Model Performance"
  }, {
    name: "MAPE",
    docsUrl: "/reference/ml-metrics-reference",
    tasks: ["Regression"],
    q: "How accurate is my model?",
    objective: "Model Performance"
  }, {
    name: "WMAPE",
    docsUrl: "/reference/ml-metrics-reference",
    tasks: ["Regression"],
    q: "How accurate is my model?",
    objective: "Model Performance"
  }, {
    name: "MAP (Mean Average Precision)",
    docsUrl: "/reference/ml-metrics-reference",
    tasks: ["Ranking"],
    q: "How well does my model rank results?",
    objective: "Model Performance"
  }, {
    name: "NDCG",
    docsUrl: "/reference/ml-metrics-reference",
    tasks: ["Ranking"],
    q: "How well does my model rank results?",
    objective: "Model Performance"
  }];
  // Platform-domain ML metric records. Unlike the task-specific entries defined
  // just before this table, these carry no `tasks` field — presumably they apply
  // to every ML task type (TODO confirm against the code that consumes this).
  //
  // Each row is [name, docsUrl, q, objective, category, notes]; the mapper below
  // expands it into a record and stamps the shared `domain: "Platform"` value.
  // Key order in the emitted objects is: name, docsUrl, q, objective, category,
  // domain, notes.
  const _ML_CROSSCUT = [
    [
      "Data Drift (JSD / PSI)",
      "/observability/platform/data-drift-platform",
      "Are feature or prediction distributions shifting from baseline?",
      "Performance Risk (Data Drift)",
      "Drift",
      "Feature distribution shift detection"
    ],
    [
      "Statistics (user-defined fields)",
      "/observability/platform/statistics",
      "What are the descriptive statistics on my data?",
      "Performance Risk (Domain Insight)",
      "Statistics",
      "Descriptive stats on any column"
    ],
    [
      "Custom Metrics (FQL)",
      "/observability/platform/custom-metrics",
      "How do I track custom business KPIs?",
      "Performance Risk (Domain Insight)",
      "Custom",
      "User-defined via Fiddler Query Language"
    ],
    [
      "Traffic",
      "/observability/platform/traffic-platform",
      "How much traffic is my model receiving?",
      "Operational Health & Efficiency",
      "Traffic",
      "Event volume monitoring"
    ],
    [
      "Data Integrity",
      "/observability/platform/data-integrity-platform",
      "Is my data pipeline producing valid data?",
      "Operational Health & Efficiency",
      "Integrity",
      "Null/type/range violations"
    ]
  ].map(([name, docsUrl, q, objective, category, notes]) => ({
    name,
    docsUrl,
    q,
    objective,
    category,
    domain: "Platform",
    notes
  }));
  const _METRICS = [{
    name: "RAG Faithfulness",
    docsUrl: "/sdk-api/evals/rag-faithfulness",
    objective: "Model Performance",
    q: "Is my model output faithful to the retrieved documents?",
    models: ["Agentic"],
    type: "Evaluator",
    category: "Quality",
    domain: "Enrichment",
    evaluatorType: "LLM (Pre-built Judge)",
    provider: "Fiddler Centor or External",
    how: "Fiddler prompt + user-selected LLM",
    notes: "LLM Judge, binary",
    uc: {
      "Summarization": ["Recommended", "RAG Faithfulness scores whether the model's output is fully supported by the source content, sentence-by-sentence.\n\nFor Summarization, the source content is the original document being summarized — hallucinated facts that don't appear in the source are the #1 quality failure for summarization models.\n\nRecommended because faithfulness to the source is the defining quality criterion for any summary; a fluent but unfaithful summary is worse than no summary at all."],
      "Code Generation": ["Optional", "RAG Faithfulness scores whether output is grounded in retrieved context.\n\nCode generation typically operates on a user prompt + optional inline code context (open files, API signatures) rather than RAG retrieval over a knowledge base — so the 'is the output supported by retrieved docs?' framing rarely applies.\n\nOptional because most code-gen pipelines don't have a retrieval step to validate against; if you do retrieve API docs or code snippets before generating, it becomes more useful."],
      "Content Generation": ["Optional", "RAG Faithfulness scores whether output is grounded in retrieved context.\n\nStandard content generation (marketing copy, blog posts, social) starts from a brief or prompt without retrieval — so there's no retrieved context to score against.\n\nOptional because the typical content-gen pipeline doesn't fit the evaluator's input shape; consider it only if your content pipeline pulls source material via RAG before generating."],
      "Q&A (RAG)": ["Recommended", "RAG Faithfulness scores whether the model's answer is fully supported by the retrieved context, sentence-by-sentence.\n\nFor Q&A (RAG), hallucinated answers — facts that look plausible but aren't in the source docs — are the #1 quality failure customers report, and this metric directly catches them.\n\nRecommended because it targets the highest-risk and most use-case-defining failure mode for RAG; if you only deploy one evaluator on a RAG system, this is it."],
      "Chatbot": ["Recommended", "RAG Faithfulness scores whether the chatbot's response is fully supported by the retrieved knowledge-base content, sentence-by-sentence.\n\nFor Chatbots that ground responses in retrieved knowledge (the most common pattern for support and assistant bots), hallucinated answers erode user trust faster than any other quality failure.\n\nRecommended because grounding is the entire premise of a retrieval-augmented chatbot; without this metric, you can't tell when the model is making things up."],
      "Info Extraction": ["Consider", "RAG Faithfulness scores whether extracted values are supported by the retrieved or provided source documents.\n\nFor Info Extraction systems that use RAG to locate relevant document sections before extracting fields, it validates that the extracted values trace back to the source rather than being fabricated by the model.\n\nConsider when your extraction pipeline has a retrieval step; for direct extraction over a fixed document, per-field accuracy evaluators are more targeted."],
      "Classification": ["Optional", "RAG Faithfulness scores whether output text is supported by retrieved context.\n\nClassification produces a category label, not generated text — there's nothing to score for faithfulness because the output isn't a claim about source content.\n\nOptional because the metric's input shape (output text + retrieved context) doesn't match classification's input shape (input text → label); use precision/recall/F1 instead."],
      "Autonomous Agents": ["Recommended", "RAG Faithfulness scores whether the agent's responses or reasoning steps are supported by the retrieved context it pulled.\n\nFor Autonomous Agents that retrieve knowledge before acting (most non-trivial agents do), ungrounded reasoning leads to actions based on hallucinated facts — which compounds into bigger failures than a single hallucinated answer would.\n\nRecommended because agents take actions on the world; faithful grounding is a prerequisite for those actions to be safe and correct."]
    }
  }, {
    name: "Faithfulness",
    docsUrl: "/observability/llm/llm-based-metrics",
    objective: "Model Performance",
    q: "Is my model output faithful to the retrieved documents?",
    models: ["LLM"],
    type: "Evaluator",
    category: "Quality",
    domain: "Enrichment",
    evaluatorType: "LLM (Pre-built Judge)",
    provider: "Fiddler Centor or External",
    how: "Fiddler prompt + user-selected LLM",
    notes: "OpenAI-based, binary; removed from Agentic in favor of RAG Faithfulness",
    uc: {
      "Summarization": ["Recommended", "Faithfulness scores whether generated text is factually consistent with a reference text — for Summarization, the reference is the original document being summarized.\n\nIt's the Traditional / LLM Monitoring equivalent of RAG Faithfulness; both catch hallucinations, the #1 summarization quality failure customers report.\n\nRecommended for Summarization deployments running in LLM Monitoring; on Agentic Monitoring prefer RAG Faithfulness, which is purpose-built for retrieval flows but works equally well for summary-vs-source comparisons."],
      "Code Generation": ["Consider", "Faithfulness scores whether generated text matches a reference text — for Code Generation, the reference would be the specification, docstring, or requirements document.\n\nIt tells you whether the code reflects the intent stated in the spec, not whether the code is technically correct (other evaluators handle that).\n\nConsider when you have a written specification to compare against; for typical interactive code-gen without a formal spec, Answer Relevance + a custom code-quality judge are higher-value."],
      "Content Generation": ["Consider", "Faithfulness scores whether generated text is consistent with a reference — for Content Generation, the reference is whatever source material the writer was given (briefs, prior content, brand guidelines, factual sources).\n\nIt catches content that drifts from the source brief or invents facts the brief didn't include.\n\nConsider when your content generation has explicit source material to validate against; for fully open-ended creative generation, Answer Relevance and bias detection are more useful."],
      "Q&A (RAG)": ["Recommended", "Faithfulness scores whether a generated answer is factually consistent with a reference text — for Q&A (RAG), the reference is the retrieved context.\n\nIt's the older Traditional / LLM Monitoring equivalent of RAG Faithfulness; both catch hallucinations, the #1 quality failure in RAG systems.\n\nRecommended for Q&A deployments running in LLM Monitoring; on Agentic Monitoring prefer RAG Faithfulness, which is purpose-built for retrieval flows."],
      "Chatbot": ["Recommended", "Faithfulness scores whether responses are consistent with the reference content provided.\n\nFor Chatbots that ground answers in retrieved knowledge bases or provided context, it's the Traditional / LLM Monitoring equivalent of RAG Faithfulness — both catch hallucinated responses, the #1 trust-eroding failure in conversational AI.\n\nRecommended for Chatbots running in LLM Monitoring; on Agentic Monitoring prefer RAG Faithfulness, which is purpose-built for retrieval-augmented flows."],
      "Info Extraction": ["Recommended", "Faithfulness scores whether generated output is consistent with the source content.\n\nFor Info Extraction, this means every extracted value must trace back to the source document — fabricated values (a phone number that wasn't in the invoice, a name the model hallucinated) are the highest-stakes failure mode for extraction systems feeding downstream business processes.\n\nRecommended because extraction outputs typically feed systems that trust the values as ground truth; unfaithful extractions cause cascading downstream errors that are expensive to detect after the fact."],
      "Classification": ["Optional", "Faithfulness scores whether generated text is consistent with a reference.\n\nClassification produces a label, not generated text, so there's no text-to-text comparison to make.\n\nOptional because the metric's premise (compare output text to source text) doesn't apply to label-prediction tasks; use precision/recall/F1 to validate classification quality instead."],
      "Autonomous Agents": ["Recommended", "Faithfulness scores whether the agent's textual outputs (responses, reasoning steps, tool-call rationales) are consistent with the context it was given.\n\nFor Autonomous Agents, ungrounded reasoning steps compound — one hallucinated fact early in a chain becomes a wrong action later, harder to debug than a one-shot hallucination.\n\nRecommended for agent deployments running in LLM Monitoring; on Agentic Monitoring prefer RAG Faithfulness, which is purpose-built for retrieval flows."]
    }
  }, {
    name: "Fast Faithfulness",
    docsUrl: "/sdk-api/evals/ftl-response-faithfulness",
    objective: "Model Performance",
    q: "Is my model output faithful to the retrieved documents?",
    models: ["LLM", "Agentic"],
    type: "Evaluator",
    category: "Quality",
    domain: "Enrichment",
    evaluatorType: "Fiddler Centor Model",
    provider: "Fiddler Centor",
    how: "Proprietary BERT-scale SLM (FTL)",
    notes: "Fiddler Fast Trust Model, binary + score",
    guardrails: true,
    uc: {
      "Summarization": ["Recommended", "Fast Faithfulness is a low-latency variant of RAG Faithfulness powered by a smaller proprietary model (FTL).\n\nFor high-volume Summarization deployments — news pipelines, document processing systems — scoring every summary with a full LLM judge is too slow or expensive; Fast Faithfulness keeps the hallucination signal at a fraction of the cost.\n\nRecommended when you need to evaluate every summary (or wire as a real-time Guardrail before publication); otherwise stick with the more accurate RAG Faithfulness."],
      "Code Generation": ["Consider", "Fast Faithfulness is a low-latency variant of RAG Faithfulness powered by a smaller proprietary model.\n\nFor Code Generation it carries the same trade-offs as Faithfulness — useful only when you have a reference (spec, docstring) to compare against, with the SLM-vs-LLM accuracy/speed trade-off layered on top.\n\nConsider when your code-gen has spec-based reference content and you need to score every generation at low cost; for sampled or batch scoring, full RAG Faithfulness is more accurate."],
      "Content Generation": ["Consider", "Fast Faithfulness is the low-latency variant of RAG Faithfulness.\n\nFor Content Generation, it applies when you have source material to validate against and need to score every piece of generated content (vs sampling) — the smaller FTL model is faster and cheaper but slightly less accurate.\n\nConsider for high-volume content pipelines with source-material grounding; for low-volume or research/QA workflows, the more accurate full RAG Faithfulness is the better choice."],
      "Q&A (RAG)": ["Recommended", "Fast Faithfulness is a low-latency variant of RAG Faithfulness powered by a smaller proprietary model (FTL).\n\nFor high-volume Q&A (RAG) deployments where every request needs scoring inline, the standard LLM-judge variant is too slow or expensive — Fast Faithfulness keeps the hallucination signal at a fraction of the cost.\n\nRecommended when you need to evaluate every Q&A response (or wire it as a real-time Guardrail) rather than sample; otherwise stick with the more accurate RAG Faithfulness."],
      "Chatbot": ["Recommended", "Fast Faithfulness is a low-latency variant of RAG Faithfulness powered by a smaller proprietary model (FTL).\n\nFor real-time Chatbot interactions, the standard LLM-judge variant adds too much latency per turn and the cost per interaction adds up quickly at scale.\n\nRecommended when you want to evaluate every chatbot response (or wire it as a real-time Guardrail to block ungrounded responses); otherwise sample with full RAG Faithfulness."],
      "Info Extraction": ["Recommended", "Fast Faithfulness is the low-latency variant of RAG Faithfulness.\n\nFor high-volume Info Extraction pipelines (document processing, post-OCR validation), evaluating every extraction with an LLM judge is cost-prohibitive — Fast Faithfulness keeps the hallucination signal at SLM speeds.\n\nRecommended for production extraction pipelines where every output needs validation; the slight accuracy trade-off is worth the cost reduction at scale."],
      "Classification": ["Optional", "Fast Faithfulness is a low-latency variant of RAG Faithfulness — same applicability as Faithfulness for any given use case.\n\nClassification produces a label, not generated text, so faithfulness scoring (fast or otherwise) doesn't apply.\n\nOptional because the underlying evaluator's premise doesn't fit classification's output shape; use traditional precision/recall/F1 for classification quality."],
      "Autonomous Agents": ["Recommended", "Fast Faithfulness is the low-latency variant of RAG Faithfulness.\n\nFor Autonomous Agents running multi-step workflows, scoring faithfulness at every step with a full LLM judge multiplies cost and latency across the chain.\n\nRecommended when you need per-step grounding validation in production agent deployments; for offline evaluation or sampled scoring, full RAG Faithfulness is more accurate."]
    }
  }, {
    name: "Context Relevance",
    docsUrl: "/sdk-api/evals/context-relevance",
    objective: "Model Performance",
    q: "Are the retrieved documents relevant to the user query?",
    models: ["Agentic"],
    type: "Evaluator",
    category: "Quality",
    domain: "Enrichment",
    evaluatorType: "LLM (Pre-built Judge)",
    provider: "Fiddler Centor or External",
    how: "Fiddler prompt + user-selected LLM",
    notes: "LLM Judge, ordinal; Agentic & Experiments only",
    uc: {
      "Summarization": ["Optional", "Context Relevance scores whether retrieved documents are relevant to the query.\n\nSummarization typically operates on a provided document to summarize rather than retrieving context — there's no retrieval step to score.\n\nOptional because the standard summarization pipeline doesn't have a retrieval component; consider only if your pipeline retrieves additional reference material to enhance the summary (e.g. multi-document summarization)."],
      "Code Generation": ["Optional", "Context Relevance scores the quality of retrieved context against a query.\n\nCode generation typically uses provided code context (open files, function signatures, the user's prompt) rather than retrieved snippets from a knowledge base.\n\nOptional because the typical code-gen workflow doesn't have a retrieval step; if you're doing retrieval-augmented code-gen (pulling API docs, code examples), it becomes more relevant."],
      "Content Generation": ["Optional", "Context Relevance scores whether retrieved context is relevant to the input.\n\nStandard content generation works from a brief or prompt without retrieval, so there's no retrieved context to evaluate.\n\nOptional because retrieval isn't part of the typical content-gen pipeline; consider if your content workflow retrieves brand guidelines, prior content, or factual sources before generating."],
      "Q&A (RAG)": ["Recommended", "Context Relevance scores whether the documents your retriever pulled are actually relevant to the user's question, independent of what the model does next.\n\nFor Q&A (RAG), poor retrieval is the root cause of many hallucinations — the model can only ground in what you give it.\n\nRecommended because it surfaces retrieval problems separately from generation problems; without it, you can't tell whether to fix the retriever (indexing, embeddings, reranking) or the prompt."],
      "Chatbot": ["Recommended", "Context Relevance scores whether the documents your retriever pulled are actually relevant to the user's question, independent of what the model does next.\n\nFor Chatbots, poor retrieval is the root cause of many wrong or hallucinated answers — the model can only ground in what you give it.\n\nRecommended because it surfaces retrieval problems separately from generation problems; without it, you can't tell whether to fix the retriever (indexing, embeddings, reranking) or the prompt."],
      "Info Extraction": ["Optional", "Context Relevance scores whether retrieved context is relevant to the query.\n\nInfo Extraction typically runs against provided documents directly (the invoice being parsed, the contract being analyzed), without a retrieval step.\n\nOptional because most extraction pipelines don't retrieve — they process the document handed to them; consider only when extraction is preceded by a retrieval step (e.g. finding the right section of a long document before extracting fields)."],
      "Classification": ["Optional", "Context Relevance scores retrieval quality.\n\nClassification operates directly on input text → label and doesn't involve retrieval as part of the inference flow.\n\nOptional because there's no retrieval step in classification to evaluate; the metric's premise simply doesn't apply."],
      "Autonomous Agents": ["Recommended", "Context Relevance scores whether documents the agent retrieved are actually relevant to its current sub-goal, independent of what the agent does with them.\n\nFor Autonomous Agents that retrieve knowledge as part of multi-step planning, poor retrieval cascades — the agent acts on irrelevant context and produces wrong actions downstream.\n\nRecommended because it isolates retrieval failures from planning / tool-call failures; without it, you can't tell where in the chain things broke."]
    }
  }, {
    name: "Answer Relevance",
    docsUrl: "/sdk-api/evals/answer-relevance",
    objective: "Model Performance",
    q: "Does the response address the user's query?",
    models: ["LLM", "Agentic"],
    type: "Evaluator",
    category: "Quality",
    domain: "Enrichment",
    evaluatorType: "LLM (Pre-built Judge)",
    provider: "Fiddler Centor or External",
    how: "Fiddler prompt + user-selected LLM",
    notes: "OpenAI-based binary in Traditional Monitoring; v2.0 prompt (ordinal High/Med/Low) active in Agentic Monitoring post-26.2",
    uc: {
      "Summarization": ["Consider", "Answer Relevance scores whether the output addresses the input prompt, regardless of factual grounding.\n\nFor Summarization, the 'question' is implicit (summarize this document), so Answer Relevance mostly checks whether the summary actually covers the document vs going off on tangents.\n\nConsider as a secondary signal; for Summarization, Faithfulness (does the summary match the source?) is far more important than relevance (the summary is by definition about the source)."],
      "Code Generation": ["Recommended", "Answer Relevance scores whether the generated output addresses what the user asked for.\n\nFor Code Generation, this is the core quality question: does the generated code actually do what the user requested? — distinct from whether the code runs or is well-structured.\n\nRecommended because functional alignment with the user's intent is the primary success criterion for code generation; well-written code that solves the wrong problem is a worse failure than buggy code that solves the right problem."],
      "Content Generation": ["Recommended", "Answer Relevance scores whether generated content addresses the original brief or prompt.\n\nFor Content Generation, content that's well-written but misses the brief — wrong audience, wrong topic, wrong call-to-action — is the most common failure customers report.\n\nRecommended because brief alignment is the defining quality criterion; an off-brief piece of content is a worse failure than a stylistically rough one that's on-brief."],
      "Q&A (RAG)": ["Recommended", "Answer Relevance scores whether the generated answer actually addresses the user's question, regardless of whether it's factually grounded.\n\nFor Q&A (RAG), it's possible to have a faithful answer that still misses the question — quoting irrelevant context, hedging, or going off on tangents.\n\nRecommended because it captures the user-perceived quality of 'did I get what I asked for?', complementing Faithfulness's 'is what I got accurate?'. Together they're the core Q&A quality pair."],
      "Chatbot": ["Recommended", "Answer Relevance scores whether the chatbot's response addresses what the user actually asked.\n\nFor Chatbots, drifting off-topic, answering a related-but-different question, or hedging excessively are common failure modes — and ones that aren't caught by Faithfulness (the response can be grounded but still miss the question).\n\nRecommended because it captures user-perceived helpfulness; together with Faithfulness it's the core chatbot quality pair."],
      "Info Extraction": ["Consider", "Answer Relevance scores whether output addresses the input.\n\nFor Info Extraction driven by natural-language queries ('find the invoice total', 'extract the parties to the contract'), it catches extractions that return adjacent-but-wrong fields.\n\nConsider when your extraction interface accepts NL queries; for schema-driven extraction (extract all fields per a fixed schema), per-field accuracy is the more direct quality signal."],
      "Classification": ["Consider", "Answer Relevance scores whether the output addresses the input.\n\nFor Classification framed as an LLM prompt-response task ('which category does this belong to?'), it sanity-checks that the model is actually classifying vs. emitting unrelated text.\n\nConsider for LLM-judge-based classification; for native classifier models that emit labels directly, traditional precision/recall metrics are more direct."],
      "Autonomous Agents": ["Recommended", "Answer Relevance scores whether the agent's final output addresses the user's actual goal.\n\nFor Autonomous Agents that take many steps before producing an output, it's easy for the agent to drift mid-execution and produce output that's well-formed but no longer serves the original ask.\n\nRecommended because goal alignment is the only end-to-end success measure for an agent; passing every intermediate step but failing the final ask is still a failure."]
    }
  }, {
    name: "Coherence",
    docsUrl: "/sdk-api/evals/coherence",
    objective: "Model Performance",
    q: "Is the response logically structured and clear?",
    models: ["LLM", "Agentic"],
    type: "Evaluator",
    category: "Quality",
    domain: "Enrichment",
    evaluatorType: "LLM (Pre-built Judge)",
    provider: "Fiddler Centor or External",
    how: "Fiddler prompt + user-selected LLM",
    notes: "OpenAI-based, binary",
    uc: {
      "Summarization": ["Recommended", "Coherence judges whether the output reads logically — claims connect, structure makes sense, no abrupt jumps.\n\nFor Summarization, a coherent summary is the difference between a useful condensed view of the source and a disjointed list of facts the reader has to mentally reassemble.\n\nRecommended because summaries are read sequentially and rely on flow to communicate meaning; an incoherent summary fails its purpose even when factually accurate."],
      "Code Generation": ["Optional", "Coherence judges whether output text reads logically across sentences.\n\nCode is structured by syntax, not prose flow — code quality lives in correctness, readability, and security, not in narrative coherence.\n\nOptional because the metric's literary framing doesn't map to code; use code-specific signals (does it run? does it pass tests? does it match style guides?) instead."],
      "Content Generation": ["Recommended", "Coherence judges whether content flows logically — claims connect, paragraphs build on each other, the reader doesn't get lost.\n\nFor Content Generation, especially long-form content (blog posts, articles, white papers), coherence is what separates content that's actually readable from content that reads as AI-generated word salad.\n\nRecommended because professional-quality content requires structural and logical flow; this metric directly measures it."],
      "Q&A (RAG)": ["Consider", "Coherence judges whether the answer reads logically — claims connect, structure makes sense, no abrupt jumps.\n\nFor Q&A (RAG), this matters most when answers are multi-paragraph explanations rather than short factual lookups; on a one-sentence answer there's little surface area for incoherence.\n\nConsider when your Q&A produces long-form explanatory answers (technical support, education); for short factual Q&A, Faithfulness + Answer Relevance already cover the quality bar."],
      "Chatbot": ["Recommended", "Coherence judges whether responses read logically.\n\nFor Chatbots, coherence matters within a single response (the response makes sense on its own) and across turns (responses build on prior context, don't contradict earlier statements).\n\nRecommended because conversational quality depends on the user being able to follow the chat without re-reading; incoherent responses force users to re-ask or give up."],
      "Info Extraction": ["Optional", "Coherence judges narrative flow in generated text.\n\nInfo Extraction produces structured data (JSON, key-value fields, table rows), not prose — there's no narrative to evaluate for coherence.\n\nOptional because the output shape doesn't match the metric's input shape; use schema validation and per-field accuracy for extraction quality."],
      "Classification": ["Optional", "Coherence judges narrative flow in generated text.\n\nClassification produces a category label — a single token or short string — with no narrative structure to assess.\n\nOptional because the metric simply doesn't apply to label-prediction tasks."],
      "Autonomous Agents": ["Consider", "Coherence judges whether output text reads logically.\n\nFor Autonomous Agents, this matters for the reasoning chains and final responses the agent produces — incoherent reasoning is a sign of poor planning even when the final answer happens to be correct.\n\nConsider as a quality signal for agent reasoning visibility; for end-result correctness, Answer Relevance + tool-call accuracy are more direct measures."]
    }
  }, {
    name: "Conciseness",
    docsUrl: "/sdk-api/evals/conciseness",
    objective: "Model Performance",
    q: "Is the response concise and not verbose?",
    models: ["LLM", "Agentic"],
    type: "Evaluator",
    category: "Quality",
    domain: "Enrichment",
    evaluatorType: "LLM (Pre-built Judge)",
    provider: "Fiddler Centor or External",
    how: "Fiddler prompt + user-selected LLM",
    notes: "OpenAI-based, binary",
    uc: {
      "Summarization": ["Recommended", "Conciseness scores whether the output is appropriately brief for its purpose.\n\nFor Summarization, this is the defining quality criterion — a summary that's nearly as long as the source isn't really a summary, it's just a slightly-edited version.\n\nRecommended because conciseness IS the function of summarization; a summary that fails conciseness fails its primary job, regardless of how accurate it is."],
      "Code Generation": ["Optional", "Conciseness scores whether output text is appropriately brief.\n\nFor Code Generation, brevity is a style preference (some teams prefer terse code, others prefer verbose-but-clear) rather than a quality signal — both terse and verbose code can be functionally correct.\n\nOptional because the relationship between code length and code quality isn't direct; use correctness + readability evaluators instead."],
      "Content Generation": ["Consider", "Conciseness scores whether output is appropriately brief for its purpose.\n\nFor Content Generation, this depends heavily on content type — a social-media post that's verbose is a quality failure; a blog post that's too brief misses depth expectations.\n\nConsider when your content types have explicit length expectations (microcopy, ad copy, tweets); for long-form content where length is intentional, conciseness signals are misleading."],
      "Q&A (RAG)": ["Recommended", "Conciseness scores whether the answer is appropriately brief for the question — no padding, hedging, or burying the key point under preamble.\n\nFor Q&A (RAG), bloated answers degrade UX even when factually correct; users skip past long answers and miss the actual information.\n\nRecommended because conciseness directly impacts perceived answer quality and is often where RAG systems drift over time as prompts get hardened; it's a high-leverage, low-cost signal to track."],
      "Chatbot": ["Recommended", "Conciseness scores whether responses are appropriately brief for the question.\n\nFor Chatbots, bloated responses degrade conversational UX — users skim past long answers and miss key information, and lengthy responses feel less natural than concise ones.\n\nRecommended because conciseness directly impacts perceived quality of conversational responses; it's a high-leverage signal that's often where chatbots drift over time as prompts get hardened."],
      "Info Extraction": ["Optional", "Conciseness scores whether generated text is appropriately brief.\n\nInfo Extraction produces structured data fields, not prose — the output is already shaped by the schema and has no length dimension to score.\n\nOptional because conciseness doesn't apply to structured outputs; use schema completeness and per-field accuracy instead."],
      "Classification": ["Optional", "Conciseness scores brevity of generated text.\n\nClassification produces a category label, not prose — there's no length dimension to evaluate.\n\nOptional because the metric doesn't apply to label-prediction tasks."],
      "Autonomous Agents": ["Optional", "Conciseness scores brevity of generated text.\n\nFor Autonomous Agents, the primary quality concerns are correctness, planning soundness, and tool-call accuracy — response brevity is far down the priority list.\n\nOptional because conciseness isn't a meaningful quality signal for autonomous execution; focus on Answer Relevance and tool-call evaluators."]
    }
  }, {
    // Catalog entry for the "Custom Judge" evaluator: per `how` / `notes`, the
    // user supplies the prompt and selects the judge model. `models` lists both
    // "LLM" and "Agentic".
    name: "Custom Judge",
    docsUrl: "/sdk-api/evals/custom-judge",
    objective: "Model Performance",
    q: "Does the response meet my domain-specific quality criteria?",
    models: ["LLM", "Agentic"],
    type: "Evaluator",
    category: "Quality",
    domain: "Enrichment",
    evaluatorType: "LLM (Custom Judge)",
    provider: "Fiddler Centor or External",
    how: "User-supplied prompt + user-selected Judge model (Fiddler-hosted or via LLM Gateway)",
    notes: "Bring-your-own prompt + judge model. See Building Custom Judge Evaluators cookbook for end-to-end recipe.",
    // Per-use-case guidance. Each key is a use-case label; each value is a
    // [rating, rationale] pair. Rating is one of "Recommended", "Consider",
    // "Optional"; the rationale separates its paragraphs with "\n\n".
    uc: {
      "Summarization": ["Consider", "Custom Judge lets you write a bring-your-own prompt + judge LLM to score summaries against domain-specific quality criteria the built-ins don't cover.\n\nFor Summarization, the built-in evaluators (Faithfulness + Conciseness) handle the core quality dimensions for most teams; Custom Judge earns its keep when you have domain-specific summarization rules (regulatory language constraints, structured-summary requirements, must-include fields).\n\nConsider when your domain has formal summarization standards; otherwise the built-ins are higher-leverage and cheaper."],
      "Code Generation": ["Recommended", "Custom Judge lets you write a bring-your-own prompt + judge LLM to evaluate generated code against criteria the built-ins can't capture (style, security, framework conventions, organizational standards).\n\nCode quality is inherently domain-specific — what counts as 'good' Python in one team can be 'too clever' in another — and the cookbook's primary recommendation for code-gen evaluation is a custom judge.\n\nRecommended because there's no built-in 'code quality' evaluator that fits all teams; custom judges are the only path to encoding your team's actual standards."],
      "Content Generation": ["Recommended", "Custom Judge lets you write a bring-your-own prompt + judge LLM to score generated content against criteria built-ins don't cover (brand voice, audience alignment, content guidelines, tone).\n\nThese dimensions vary per company and per content type — there's no universal 'good content' definition that fits all brands.\n\nRecommended because the cookbook treats Custom Judge as the primary content-quality evaluator; built-in Answer Relevance covers brief-fit, but voice/brand alignment needs a custom prompt."],
      "Q&A (RAG)": ["Consider", "Custom Judge lets you write a bring-your-own prompt + judge LLM to evaluate Q&A responses against criteria the built-in evaluators don't cover.\n\nFor Q&A (RAG), the built-ins (Answer Relevance + RAG Faithfulness) handle the core quality dimensions; custom judges shine for domain-specific concerns like citation format, hedging-vs-confident tone, or compliance with internal style guides.\n\nConsider when your Q&A has bespoke quality rules; otherwise the built-in evaluators are higher-leverage and cheaper."],
      "Chatbot": ["Consider", "Custom Judge lets you write a bring-your-own prompt + judge LLM to score chatbot responses against custom criteria.\n\nFor Chatbots, the built-ins (Answer Relevance + RAG Faithfulness + Fast Safety) cover the core dimensions well; Custom Judge becomes useful for product-specific policies — scope boundaries ('only answer questions about our product'), escalation rules ('hand off to human if X'), or tone guidelines particular to your brand.\n\nConsider when you have explicit conversational policies to enforce; for general quality, the built-ins handle most needs."],
      "Info Extraction": ["Recommended", "Custom Judge lets you write a bring-your-own prompt + judge LLM to evaluate extracted fields against schema-specific quality criteria.\n\nFor Info Extraction, per-field accuracy is inherently domain-specific — what counts as 'correctly extracted' for an invoice total is different from a contract effective date or a resume work-history entry — and the Agentic Document Extraction cookbook treats custom judges as the primary evaluation method.\n\nRecommended because extraction quality lives at the field level, and only custom judges can encode field-specific validation rules."],
      "Classification": ["Recommended", "Custom Judge lets you write a bring-your-own prompt + judge LLM to evaluate classification outputs against criteria beyond simple label accuracy.\n\nFor Classification, traditional metrics (precision/recall/F1) tell you whether labels match a ground truth — but they don't capture nuanced quality like 'was the disambiguation reasoning correct?' or 'did the model handle this edge case the way the policy says?'.\n\nRecommended because classification policies often have nuances that label-accuracy doesn't capture; custom judges encode those nuances."],
      "Autonomous Agents": ["Recommended", "Custom Judge lets you write a bring-your-own prompt + judge LLM to evaluate agent behavior against criteria the built-ins don't cover.\n\nFor Autonomous Agents, the failure modes that matter most are agent-specific — tool-call correctness, sub-goal completion, reasoning-step quality, plan coherence — and no built-in evaluator targets these specifically.\n\nRecommended because agent quality is defined by behavioral criteria custom to the agent's job; without a custom judge, you're scoring agents on generic text-quality metrics that miss what actually matters."]
    }
  }, {
    // Catalog entry for the reference-text comparison evaluator
    // (BLEU / ROUGE / METEOR). Marked "Non-AI" with provider "N/A"; `models`
    // lists only "LLM".
    name: "Evaluate (BLEU/ROUGE/METEOR)",
    docsUrl: "/sdk-api/evals/eval-fn",
    objective: "Model Performance",
    q: "How does model output compare to a reference text?",
    models: ["LLM"],
    type: "Evaluator",
    category: "Text Stats",
    domain: "Enrichment",
    evaluatorType: "Non-AI",
    provider: "N/A",
    how: "`evaluate` library, n-gram comparison",
    notes: "N-gram comparison",
    // Per-use-case guidance as [rating, rationale] pairs keyed by use-case
    // label. Only "Summarization" is rated "Recommended"; every other use case
    // below is "Optional".
    uc: {
      "Summarization": ["Recommended", "BLEU, ROUGE, and METEOR compare generated text against reference text using n-gram overlap.\n\nFor Summarization, this is THE primary use case for n-gram metrics — summarization has clearly-defined reference summaries you can compare against, and lexical overlap with a reference is a reasonable proxy for summary quality.\n\nRecommended because Summarization is the one use case where reference-text comparison is both available and meaningful; pair with LLM-based evaluators for the nuanced quality dimensions n-grams miss."],
      "Code Generation": ["Optional", "BLEU, ROUGE, and METEOR compare generated text against reference text using n-gram overlap.\n\nFor Code Generation, lexical similarity to a reference is a poor proxy for code quality — semantically equivalent code can have wildly different tokens, and tokens-matching code can be broken.\n\nOptional because n-gram comparison fundamentally doesn't fit the code-quality question; execution-based evaluators (does the code run? does it pass tests?) are far more informative."],
      "Content Generation": ["Optional", "BLEU, ROUGE, and METEOR compare generated text against reference text using n-gram overlap.\n\nFor Content Generation, the whole point is often originality — penalizing valid-but-novel phrasings against a reference is the opposite of what you want for creative content.\n\nOptional because n-gram comparison rewards mimicry rather than quality; LLM-judge evaluators are better suited for evaluating creative output."],
      "Q&A (RAG)": ["Optional", "BLEU, ROUGE, and METEOR compare generated text against reference text using n-gram overlap.\n\nFor Q&A (RAG) — where answers are open-ended and a single question has many valid phrasings — these metrics penalize valid answers that don't lexically match the reference, making them a poor proxy for quality.\n\nOptional because they only apply when you have a curated test set with reference answers; for production Q&A monitoring, RAG Faithfulness and Answer Relevance are far better quality signals."],
      "Chatbot": ["Optional", "BLEU, ROUGE, and METEOR compare generated text against reference text using n-gram overlap.\n\nFor Chatbots, conversational responses to the same question can vary widely in phrasing while remaining equally good — n-gram comparison against a reference unfairly penalizes that variation.\n\nOptional because the open-ended nature of conversation makes reference-based metrics misleading; use Answer Relevance + Faithfulness instead."],
      "Info Extraction": ["Optional", "BLEU, ROUGE, and METEOR compare generated text against reference text using n-gram overlap.\n\nFor Info Extraction, the output is structured fields with exact-match expectations — not generated prose where n-gram overlap makes sense.\n\nOptional because the metric's premise (compare phrasing variations) doesn't apply to extraction's binary correct/incorrect field values; use per-field accuracy instead."],
      "Classification": ["Optional", "BLEU, ROUGE, and METEOR compare generated text against reference text using n-gram overlap.\n\nClassification produces a category label — there's no text to compare n-grams against.\n\nOptional because the metric simply doesn't apply to label-prediction tasks; use precision/recall/F1."],
      "Autonomous Agents": ["Optional", "BLEU, ROUGE, and METEOR compare generated text against reference text using n-gram overlap.\n\nFor Autonomous Agents, success is measured by whether the agent achieved the goal, not whether its reasoning or output matches a reference text token-for-token.\n\nOptional because n-gram comparison can't capture agent success criteria (tool-call correctness, plan completion, end-state achievement); use task-success metrics or custom judges instead."]
    }
  }, {
    // Catalog entry for the "Fast Safety" evaluator. `notes` enumerates the 11
    // scored dimensions; `models` lists both "LLM" and "Agentic".
    name: "Fast Safety (11 dims)",
    docsUrl: "/sdk-api/evals/ftl-prompt-safety",
    objective: "Model Safety & Trust",
    q: "Is the model producing unsafe or harmful content?",
    models: ["LLM", "Agentic"],
    type: "Evaluator",
    category: "Safety",
    domain: "Enrichment",
    evaluatorType: "Fiddler Centor Model",
    provider: "Fiddler Centor",
    how: "Proprietary BERT-scale SLM (FTL); 11 dimensions",
    notes: "11 dims: illegal, hateful, harassing, racist, sexist, violent, sexual, harmful, unethical, jailbreaking, roleplaying. Agentic Monitoring applies per-label thresholds.",
    // NOTE(review): presumably opts this entry into the "Guardrails" view
    // (state.modelType === "Guardrails") — confirm against the filter code.
    guardrails: true,
    // Per-use-case guidance as [rating, rationale] pairs keyed by use-case
    // label; rationale paragraphs are separated by "\n\n".
    uc: {
      "Summarization": ["Optional", "Fast Safety scores a response across 11 unsafe-content dimensions (illegal, hateful, harassing, sexual, violent, jailbreaking, etc.) using a fast proprietary model.\n\nSummarization condenses existing content rather than generating novel claims — if source documents are sanitized, the summaries inherit that safety; if not, the underlying problem is in the corpus, not the model.\n\nOptional because the risk surface is small and addressed upstream; consider only if you summarize user-generated content that may itself contain unsafe material."],
      "Code Generation": ["Consider", "Fast Safety scores a response across 11 unsafe-content dimensions using a fast proprietary model.\n\nFor Code Generation, the primary safety concern isn't natural-language toxicity — it's whether the model generates exploit code, malware patterns, or insecure code that could be weaponized; Fast Safety catches some of this via its 'illegal' and 'harmful' dimensions but isn't purpose-built for code security.\n\nConsider as a baseline safety check; for serious code-security validation, layer a custom code-security judge or static-analysis evaluator on top."],
      "Content Generation": ["Recommended", "Fast Safety scores a response across 11 unsafe-content dimensions (illegal, hateful, harassing, sexual, violent, jailbreaking, etc.) using a fast proprietary model.\n\nFor Content Generation, output is by definition user-facing — any unsafe content that ships represents a direct brand and reputational risk.\n\nRecommended because content-gen is the highest-risk surface for safety failures; promote to a real-time Guardrail to block unsafe content before it gets published."],
      "Q&A (RAG)": ["Consider", "Fast Safety scores a response across 11 unsafe-content dimensions (illegal, hateful, harassing, sexual, violent, jailbreaking, etc.) using a fast proprietary model.\n\nFor Q&A (RAG), accuracy is the primary concern — but safety still matters when the knowledge base contains sensitive content or the user-facing context calls for moderation.\n\nConsider for any customer-facing Q&A; promote to Recommended (or wire as a Guardrail) when the audience is external or regulated."],
      "Chatbot": ["Recommended", "Fast Safety scores a response across 11 unsafe-content dimensions (illegal, hateful, harassing, sexual, violent, jailbreaking, etc.) using a fast proprietary model.\n\nFor Chatbots, conversational responses to users are the most public surface a model has — a single unsafe response can become a screenshot that damages trust at scale.\n\nRecommended for any customer-facing chatbot; promote to a real-time Guardrail to block unsafe responses before they reach the user."],
      "Info Extraction": ["Optional", "Fast Safety scores a response across 11 unsafe-content dimensions.\n\nInfo Extraction returns structured data fields extracted from source documents — not freshly-generated natural language — so the model isn't introducing unsafe content; if the source has unsafe content, that's a corpus issue, not an extraction issue.\n\nOptional because the output shape (data fields) rarely triggers the safety dimensions; the metric's design assumes generative output."],
      "Classification": ["Optional", "Fast Safety scores a response across 11 unsafe-content dimensions.\n\nClassification produces a category label, not freshly-generated text, so there's nothing for the safety dimensions to score against.\n\nOptional because the metric's input shape doesn't match classification's output shape; safety concerns for classifier inputs (e.g. unsafe content being categorized) belong upstream."],
      "Autonomous Agents": ["Recommended", "Fast Safety scores a response across 11 unsafe-content dimensions (illegal, hateful, harassing, sexual, violent, jailbreaking, etc.) using a fast proprietary model.\n\nFor Autonomous Agents, the risk surface is the largest of any use case — agents take real-world actions via tools, so an unsafe response can become an unsafe email sent, an unsafe API call, or a destructive operation.\n\nRecommended because the blast radius is high; promote to a real-time Guardrail to block unsafe responses before the agent can act on them."]
    }
  }, {
    // Catalog entry for the "PII Detection" evaluator; `how` / `notes` describe
    // it as Presidio-based (NER + rules). Its docs link points at the
    // guardrails PII tutorial.
    name: "PII Detection",
    docsUrl: "/developers/tutorials/guardrails/guardrails-pii",
    objective: "Model Safety & Trust",
    q: "Did the model leak any sensitive data / PII?",
    // NOTE(review): `models` is empty while `guardrails` is true, unlike the
    // other guardrails-flagged entry nearby, which lists "LLM" and "Agentic".
    // Presumably this makes the entry Guardrails-only — confirm that is
    // intended, since `uc` below still carries ratings for every use case.
    models: [],
    guardrails: true,
    type: "Evaluator",
    category: "Safety",
    domain: "Enrichment",
    evaluatorType: "Open-Source Model",
    provider: "Fiddler Centor",
    how: "Presidio (NER + rules)",
    notes: "Presidio-based, 14+ entity types",
    // Per-use-case guidance as [rating, rationale] pairs keyed by use-case
    // label; rationale paragraphs are separated by "\n\n".
    uc: {
      "Summarization": ["Consider", "PII Detection flags personally identifiable information (names, emails, SSNs, addresses, etc.) in inputs or outputs.\n\nFor Summarization, source documents may contain PII that gets carried into the summary — and condensing identifiers into a short summary can actually reduce the anonymization a long document otherwise provided through dilution.\n\nConsider for Summarization over corpora that may include personal data (medical records, HR documents, customer correspondence); promote to Recommended for regulated or external-facing summary deployments."],
      "Code Generation": ["Consider", "PII Detection flags personally identifiable information in inputs or outputs.\n\nFor Code Generation, the risk is less about names/emails and more about hardcoded secrets — API keys, passwords, customer identifiers — that the model may emit if it has seen similar patterns in its training data.\n\nConsider for any code-gen deployment that could leak secrets in committed code; pair with secret-scanning tools for more complete coverage of the credential side of this risk."],
      "Content Generation": ["Consider", "PII Detection flags personally identifiable information in inputs or outputs.\n\nFor Content Generation, generated marketing copy or blog posts may inadvertently reference real personal information if the brief includes examples, or if the model pulls from training data containing such references.\n\nConsider for any content-gen deployment where the output is published; promote to Recommended (or wire as a Guardrail) for regulated industries or external-facing publication pipelines."],
      "Q&A (RAG)": ["Consider", "PII Detection flags personally identifiable information (names, emails, SSNs, etc.) in inputs or outputs.\n\nFor Q&A (RAG), responses may inadvertently surface PII pulled from the knowledge base — even when individual docs look sanitized, retrieval + summarization can recombine signals.\n\nConsider for any Q&A over corpora that contain real customer or employee data; promote to Recommended (or wire as a Guardrail) for regulated or external-facing deployments."],
      "Chatbot": ["Recommended", "PII Detection flags personally identifiable information (names, emails, SSNs, etc.) in inputs or outputs.\n\nFor Chatbots, both directions matter — users frequently share PII in their messages (sometimes accidentally), and the chatbot may surface PII from its knowledge base in responses.\n\nRecommended because conversational AI is a known privacy-leak vector; combined with input-side Guardrails it's the standard pattern for compliant customer-facing chat."],
      "Info Extraction": ["Recommended", "PII Detection flags personally identifiable information in inputs or outputs.\n\nFor Info Extraction, source documents are often densely full of PII by design — invoices, contracts, resumes, customer forms — and any extracted output likely contains it.\n\nRecommended because PII handling is a core requirement for most extraction workflows (regulatory compliance, downstream system requirements); knowing which extracted fields contain PII shapes how they get stored, transmitted, and accessed downstream."],
      "Classification": ["Optional", "PII Detection flags personally identifiable information in inputs or outputs.\n\nClassification produces a category label as output — there's no PII to leak through a label like 'positive' or 'urgent'.\n\nOptional because the output shape doesn't carry PII; concerns about PII in classification inputs belong upstream (input filtering, secure storage of the corpus being classified)."],
      "Autonomous Agents": ["Recommended", "PII Detection flags personally identifiable information in inputs or outputs.\n\nFor Autonomous Agents, the risk is amplified — agents may access PII through tool calls (CRM queries, database reads), include it in reasoning steps, and emit it in responses or downstream tool actions.\n\nRecommended because the surface area is broad and the blast radius of leaked PII is large (it could be sent to external services via tools); pair with Guardrails to block PII before it leaves the agent boundary."]
    }
  }, {
    // Catalog entry for the "Profanity" evaluator: a keyword-list match
    // ("Non-AI", provider "N/A"). `models` lists only "LLM".
    name: "Profanity",
    docsUrl: "/observability/llm/enrichments",
    objective: "Model Safety & Trust",
    q: "Is the output using offensive language?",
    models: ["LLM"],
    type: "Evaluator",
    category: "Safety",
    domain: "Enrichment",
    evaluatorType: "Non-AI",
    provider: "N/A",
    how: "Keyword list match",
    notes: "Keyword-based",
    // Per-use-case guidance as [rating, rationale] pairs keyed by use-case
    // label; rationale paragraphs are separated by "\n\n".
    // NOTE(review): some rationales below suggest wiring this as a Guardrail,
    // but this entry sets no `guardrails` flag — confirm that is intended.
    uc: {
      "Summarization": ["Optional", "Profanity flags responses containing offensive language.\n\nSummarization condenses existing content — the model rarely introduces profanity that wasn't already in the source.\n\nOptional because the typical Summarization pipeline operates on curated corpora; if profanity appears in a summary, the issue is the source content, not the model."],
      "Code Generation": ["Optional", "Profanity flags responses containing offensive language.\n\nCode Generation produces code — variable names, function definitions, comments — where profanity is extremely rare and easy to catch with simple linting tools.\n\nOptional because the metric's premise (natural-language offensive content) doesn't match code-gen's typical output shape; if you're worried about comments or string literals, a code-linting rule is more direct."],
      "Content Generation": ["Recommended", "Profanity flags responses containing offensive language.\n\nFor Content Generation, output is user-facing by definition — any profanity that ships represents a direct brand-safety risk, and even a single incident can become a screenshot that travels.\n\nRecommended because brand-safety standards for published content typically require zero profanity; promote to a real-time Guardrail to block before publication."],
      "Q&A (RAG)": ["Consider", "Profanity flags responses containing offensive language.\n\nFor Q&A (RAG), the model rarely generates profanity directly, but it can echo profane content if it appears in the retrieved docs (user-generated knowledge bases, forum content, etc.).\n\nConsider for Q&A over corpora that may include unsanitized user content; less critical for curated knowledge bases (technical docs, internal wikis) where profanity in the source is already filtered."],
      "Chatbot": ["Recommended", "Profanity flags responses containing offensive language.\n\nFor Chatbots, even rare profanity in responses damages trust disproportionately — users expect professional, consistent tone from customer-facing bots.\n\nRecommended because conversational outputs are real-time and user-facing; wire as a Guardrail to block unsafe responses before they reach the user."],
      "Info Extraction": ["Optional", "Profanity flags responses containing offensive language.\n\nInfo Extraction returns structured fields extracted from source documents — the model isn't generating natural language responses that could contain profanity.\n\nOptional because the output shape (data fields) doesn't fit the metric's input shape; if profanity in the source matters, scan the source corpus."],
      "Classification": ["Optional", "Profanity flags responses containing offensive language.\n\nClassification produces a category label, not natural language responses — labels are predefined values that don't contain profanity.\n\nOptional because the metric's premise doesn't apply to label-prediction tasks."],
      "Autonomous Agents": ["Consider", "Profanity flags responses containing offensive language.\n\nFor Autonomous Agents, user-facing responses can contain profanity if the agent retrieves or processes content that includes it.\n\nConsider for customer-facing agent deployments; for internal-only agent tasks (data processing, system administration), the user-facing surface is small enough that profanity monitoring rarely matters."]
    }
  }, {
    // Catalog entry for the "Banned Keywords" evaluator: a keyword-list match
    // against a user-configurable deny-list ("Non-AI", provider "N/A").
    // `models` lists only "LLM".
    name: "Banned Keywords",
    docsUrl: "/observability/llm/enrichments",
    objective: "Model Safety & Trust",
    q: "Does the output contain restricted terms?",
    models: ["LLM"],
    type: "Evaluator",
    category: "Safety",
    domain: "Enrichment",
    evaluatorType: "Non-AI",
    provider: "N/A",
    how: "Keyword list match",
    notes: "User-configurable",
    // Per-use-case guidance as [rating, rationale] pairs keyed by use-case
    // label; rationale paragraphs are separated by "\n\n".
    // NOTE(review): some rationales below suggest Guardrail-level enforcement,
    // but this entry sets no `guardrails` flag — confirm that is intended.
    uc: {
      "Summarization": ["Optional", "Banned Keywords flags responses containing terms from a deny-list you maintain (competitor names, deprecated product names, legal red flags).\n\nSummarization rarely introduces new terms that weren't in the source — the metric mostly triggers on the source rather than the summarization process.\n\nOptional because there's no model-generated novel language to police; if banned terms in the source matter, filter the corpus upstream."],
      "Code Generation": ["Optional", "Banned Keywords flags responses containing terms from a deny-list.\n\nFor Code Generation, the typical deny-list use cases (competitor names, legal red-flags) rarely appear in generated code.\n\nOptional because the output shape (code, not marketing prose) doesn't intersect well with deny-list patterns; if you're guarding against unsafe APIs or banned libraries in code, a code-specific allowlist/denylist tool is more targeted."],
      "Content Generation": ["Recommended", "Banned Keywords flags responses containing terms from a deny-list you maintain (competitor names, deprecated product names, legal red flags).\n\nFor Content Generation, output goes directly to customers, partners, and search engines — surfacing a competitor's name in your own marketing copy, or using a deprecated product name, has real brand and legal consequences.\n\nRecommended because content-gen is the highest-value surface for keyword discipline; promote to a Guardrail to block at publication time."],
      "Q&A (RAG)": ["Consider", "Banned Keywords flags responses containing terms from a deny-list you maintain (competitor names, deprecated product names, legal red flags).\n\nFor Q&A (RAG), this enforces messaging discipline — making sure the model doesn't surface things you don't want mentioned, even if they appear in retrieved context.\n\nConsider when your Q&A has explicit content guardrails (brand voice, competitive positioning); the value scales with how strict your editorial standards are."],
      "Chatbot": ["Recommended", "Banned Keywords flags responses containing terms from a deny-list.\n\nFor Chatbots, this enforces messaging discipline in real-time conversations — preventing the model from mentioning competitor products, deprecated names, or legally-sensitive terms even when those appear in retrieved context.\n\nRecommended because conversational responses can't be reviewed before they ship to users; Guardrails-level enforcement is the standard pattern for customer-facing chat."],
      "Info Extraction": ["Optional", "Banned Keywords flags responses containing terms from a deny-list.\n\nInfo Extraction returns extracted field values from source documents — if a banned term appears in a field, it was in the source document, not introduced by the model.\n\nOptional because the metric's premise (catching model-generated novel language) doesn't fit extraction's input-mirroring output shape."],
      "Classification": ["Optional", "Banned Keywords flags responses containing terms from a deny-list.\n\nClassification produces a category label drawn from a fixed taxonomy — there's no novel language to police.\n\nOptional because the output shape (predefined labels) makes deny-list filtering trivially unnecessary."],
      "Autonomous Agents": ["Consider", "Banned Keywords flags responses containing terms from a deny-list.\n\nFor Autonomous Agents that produce user-facing responses or send messages via tools, this can prevent off-brand or restricted language from reaching customers.\n\nConsider for customer-facing agents; for internal agents (data processing, system tasks), the deny-list framing is rarely relevant."]
    }
  }, {
    // Catalog entry for the legacy "Toxicity" evaluator. Per `notes` it is
    // deprecated (25.21) with Fast Safety named as the replacement; the entry
    // is still present in the catalog and carries no `hidden` flag.
    name: "Toxicity",
    docsUrl: "/observability/llm/enrichments",
    objective: "Model Safety & Trust",
    q: "Is the model generating toxic content?",
    models: ["LLM"],
    type: "Evaluator",
    category: "Safety",
    domain: "Enrichment",
    evaluatorType: "Open-Source Model",
    provider: "Fiddler Centor",
    how: "BERT-scale toxicity classifier",
    notes: "Deprecated 25.21 — TBD deletion date for Traditional Monitoring. Use Fast Safety instead.",
    // Per-use-case guidance as [rating, rationale] pairs keyed by use-case
    // label. Every use case is rated "Optional". These rationales are single
    // paragraphs with no "\n\n" breaks.
    uc: {
      "Summarization": ["Optional", "Toxicity scored responses for hateful, harassing, or otherwise toxic content. For Summarization, it has been deprecated in favor of Fast Safety, which covers toxicity as one of 11 unsafe-content dimensions in a single faster evaluator. Optional because it's a legacy evaluator scheduled for removal; use Fast Safety to get the same signal in modern deployments."],
      "Code Generation": ["Optional", "Toxicity scored responses for hateful, harassing, or otherwise toxic content. For Code Generation, it has been deprecated in favor of Fast Safety, which covers toxicity as one of 11 unsafe-content dimensions in a single faster evaluator. Optional because it's a legacy evaluator scheduled for removal; use Fast Safety instead."],
      "Content Generation": ["Optional", "Toxicity scored responses for hateful, harassing, or otherwise toxic content. For Content Generation, it has been deprecated in favor of Fast Safety, which covers toxicity as one of 11 unsafe-content dimensions in a single faster evaluator. Optional because it's a legacy evaluator scheduled for removal; use Fast Safety to get the same signal at higher quality."],
      "Q&A (RAG)": ["Optional", "Toxicity scored responses for hateful, harassing, or otherwise toxic content. For Q&A (RAG), it has been deprecated in favor of Fast Safety, which covers toxicity as one of 11 unsafe-content dimensions in a single faster evaluator. Optional because it's a legacy evaluator scheduled for removal; use Fast Safety to get the same signal in Agentic and modern LLM Monitoring deployments."],
      "Chatbot": ["Optional", "Toxicity scored responses for hateful, harassing, or otherwise toxic content. For Chatbots, it has been deprecated in favor of Fast Safety, which covers toxicity as one of 11 unsafe-content dimensions in a single faster evaluator. Optional because it's a legacy evaluator scheduled for removal; use Fast Safety instead."],
      "Info Extraction": ["Optional", "Toxicity scored responses for hateful, harassing, or otherwise toxic content. For Info Extraction, it has been deprecated in favor of Fast Safety; structured-extraction outputs rarely contain natural-language toxicity in the first place. Optional because both the metric and the use-case fit are weak; use Fast Safety on the source content if toxicity matters."],
      "Classification": ["Optional", "Toxicity scored responses for hateful, harassing, or otherwise toxic content. For Classification, it has been deprecated; label outputs don't contain toxicity that needs scoring. Optional because both the metric and the use-case fit don't apply; use Fast Safety on classification inputs upstream if needed."],
      "Autonomous Agents": ["Optional", "Toxicity scored responses for hateful, harassing, or otherwise toxic content. For Autonomous Agents, it has been deprecated in favor of Fast Safety, which covers toxicity as one of 11 unsafe-content dimensions in a single faster evaluator. Optional because it's a legacy evaluator scheduled for removal; use Fast Safety instead."]
    }
  }, {
    // Catalog entry: bias detection, implemented either as a custom LLM judge
    // or as segment analysis (see `notes`).
    name: "Bias (Custom Judge or Segment Analysis)",
    // NOTE(review): presumably keeps this entry out of the rendered guide —
    // the code that reads `hidden` is not visible here; confirm.
    hidden: true,
    docsUrl: "/observability/llm/llm-based-metrics",
    objective: "Model Safety & Trust",
    q: "Is my model exhibiting bias across protected groups?",
    models: ["LLM", "Agentic"],
    type: "Evaluator",
    category: "Safety",
    domain: "Platform",
    evaluatorType: "LLM (Custom Judge)",
    provider: "Fiddler Centor or External",
    how: "User prompt + user-selected LLM",
    notes: "Fiddler supports both LLM Judge for individual bias flagging and Segment Analysis for cross-cohort comparison",
    // Per-use-case guidance. Each key is a use-case label; each value is a
    // [rating, rationale] pair where rating is "Recommended", "Consider", or
    // "Optional" and rationale paragraphs are separated by "\n\n".
    uc: {
      "Summarization": ["Consider", "Bias detection in Summarization uses either a Custom Judge scoped to fairness, or segment analysis — comparing summary-quality scores (Faithfulness, Conciseness) across content topics or audience demographics to surface systemic gaps.\n\nFor Summarization, this catches cases where the model produces lower-quality summaries for certain demographic groups or content topics (e.g. medical research about women's health summarized less faithfully than men's).\n\nConsider for summarization deployments serving diverse content or audience groups; promote to Recommended for healthcare, legal, or other high-stakes domains."],
      "Code Generation": ["Consider", "Bias detection in Code Generation uses either a Custom Judge scoped to fairness, or segment analysis — comparing code-quality scores (correctness, security) across user segments or language/framework categories.\n\nFor Code Generation, this surfaces cases where the model produces lower-quality code for less-common languages, frameworks, or user demographics (e.g. junior vs senior developer prompt patterns).\n\nConsider when your code-gen serves diverse user populations; for narrow internal tools, segment analysis offers less leverage."],
      "Content Generation": ["Recommended", "Bias detection in Content Generation uses either a Custom Judge scoped to fairness, or segment analysis — flagging stereotypical language, exclusionary phrasing, and comparing content quality across audience segments.\n\nThe Bias & Accuracy Cookbook treats Content Generation as a primary bias deep-dive use case because generated content reaches users directly and biased phrasing has immediate brand and ethical impact.\n\nRecommended because content-gen is the highest-stakes surface for representational bias; the cookbook's primary deep-dive lives here."],
      "Q&A (RAG)": ["Recommended", "Bias detection in Q&A (RAG) uses either a Custom Judge scoped to fairness, or segment analysis — comparing core quality scores (Answer Relevance, RAG Faithfulness) across user-demographic or topic segments.\n\nThe Bias & Accuracy Cookbook's healthcare example shows answer quality dropping for certain gender segments in medical Q&A.\n\nRecommended because Q&A systems are typically customer-facing and high-stakes; bias detection is the only way to surface quality gaps that aggregate metrics mask."],
      "Chatbot": ["Recommended", "Bias detection in Chatbots uses either a Custom Judge scoped to fairness, or segment analysis — comparing response quality, sentiment, and length across user demographic groups.\n\nFor Chatbots, biased responses (e.g. shorter or less-helpful answers to certain user segments, more negative sentiment toward some demographic groups) erode trust and create regulatory exposure.\n\nRecommended because conversational AI is customer-facing and bias issues compound rapidly; segment analysis is the only way to catch quality gaps that aggregate metrics mask."],
      "Info Extraction": ["Consider", "Bias detection in Info Extraction uses either a Custom Judge scoped to fairness, or segment analysis — comparing per-field accuracy across document types and demographic contexts.\n\nThe Bias & Accuracy Cookbook's name-extraction example shows extraction systems producing lower accuracy on non-Western names, with cascading downstream consequences.\n\nConsider for extraction over documents containing demographic-sensitive fields (names, addresses, ID numbers); promote to Recommended for HR, healthcare, or government workflows."],
      "Classification": ["Recommended", "Bias detection in Classification uses either a Custom Judge scoped to fairness, or segment analysis — comparing precision, recall, and F1 across demographic segments.\n\nFor Classification, segment-aware bias detection is the canonical fairness framework — historically called 'algorithmic fairness' in ML — and is mandatory in regulated domains (lending, hiring, healthcare).\n\nRecommended because classification bias has explicit legal and ethical accountability frameworks; the Bias & Accuracy Cookbook treats it as a core classification practice."],
      "Autonomous Agents": ["Recommended", "Bias detection in Autonomous Agents uses either a Custom Judge scoped to fairness, or segment analysis — comparing tool-call accuracy, task completion rates, and response quality across user demographic groups.\n\nFor Autonomous Agents, biased behavior compounds across the multi-step workflow — a small per-step bias becomes a large divergence in end outcomes by demographic group.\n\nRecommended because agents take real-world actions; biased actions have direct downstream impact on the user groups they affect, often before the bias is detectable in single-step metrics."]
    }
  }, {
    // Catalog entry: Text Embedding enrichment, listed under the data-drift
    // objective for tracking shifts in prompt/input topics.
    name: "Text Embedding",
    docsUrl: "/observability/llm/embedding-visualization-with-umap",
    objective: "Performance Risk (Data Drift)",
    q: "Are prompt/input topics shifting over time?",
    // NOTE(review): only "LLM" is listed, yet `uc` below rates
    // "Autonomous Agents" as Recommended — confirm whether "Agentic" belongs here.
    models: ["LLM"],
    type: "Evaluator",
    category: "Embedding",
    domain: "Enrichment",
    evaluatorType: "Open-Source Model",
    provider: "Fiddler Centor",
    how: "Sentence Transformer (OSS)",
    notes: "UMAP visualization, clustering",
    // Per-use-case guidance. Each key is a use-case label; each value is a
    // [rating, rationale] pair where rating is "Recommended", "Consider", or
    // "Optional" and rationale paragraphs are separated by "\n\n".
    uc: {
      "Summarization": ["Consider", "Text Embedding stores vector representations of inputs/outputs so you can analyze semantic drift over time.\n\nFor Summarization, tracking embeddings of source documents tells you whether the corpus you're summarizing has shifted — new document types, new topics, new tones — that the original prompt wasn't designed for.\n\nConsider for production summarization where the source corpus evolves; less critical when summarizing a fixed, curated content stream."],
      "Code Generation": ["Consider", "Text Embedding stores vector representations of inputs/outputs so you can analyze semantic drift over time.\n\nFor Code Generation, tracking embeddings of user prompts surfaces shifts in what users are asking for — new languages, frameworks, problem domains — which often correlates with prompt-design or fine-tuning needing updates.\n\nConsider for production code-gen with an evolving user base; less critical for narrow-scope tools with stable prompt patterns."],
      "Content Generation": ["Consider", "Text Embedding stores vector representations of inputs/outputs so you can analyze semantic drift over time.\n\nFor Content Generation, embeddings of incoming briefs reveal whether the content team is asking for different things than your prompts were tuned for — new audiences, new tone, new content types.\n\nConsider for content-gen tools with evolving editorial direction; less critical for narrow templated workflows."],
      "Q&A (RAG)": ["Recommended", "Text Embedding stores a vector representation of each input/output so you can analyze semantic drift over time.\n\nFor Q&A (RAG), tracking embeddings of incoming questions tells you whether users are asking new kinds of things your knowledge base wasn't designed for — a leading indicator of retrieval-relevance degradation.\n\nRecommended because question topics evolve faster than knowledge bases get updated; embedding drift is the earliest signal that the gap between user need and corpus coverage is widening."],
      "Chatbot": ["Recommended", "Text Embedding stores vector representations of inputs/outputs so you can analyze semantic drift over time.\n\nFor Chatbots, tracking embeddings of user questions tells you whether users are asking new kinds of things your knowledge base wasn't designed for — a leading indicator of retrieval-relevance degradation and conversational quality decline.\n\nRecommended because conversation topics evolve faster than knowledge bases get updated; embedding drift is the earliest signal that the gap between user need and corpus coverage is widening."],
      "Info Extraction": ["Consider", "Text Embedding stores vector representations of inputs/outputs so you can analyze semantic drift over time.\n\nFor Info Extraction, embeddings of source documents reveal shifts in document type or layout — new invoice formats, new contract templates, new languages — which often correlate with extraction-quality regressions before they show up in per-field accuracy.\n\nConsider for extraction over a heterogeneous or evolving document corpus; less critical for narrow extraction on a stable template."],
      "Classification": ["Recommended", "Text Embedding stores vector representations of inputs/outputs so you can analyze semantic drift over time.\n\nFor Classification, input-text drift is the leading indicator of accuracy degradation — when the inputs shift, the classifier's training distribution no longer matches reality and accuracy drops follow.\n\nRecommended because classifier performance degradation tracks input drift more tightly than in any other use case; catching the embedding shift early lets you retrain before accuracy collapses."],
      "Autonomous Agents": ["Recommended", "Text Embedding stores vector representations of inputs/outputs so you can analyze semantic drift over time.\n\nFor Autonomous Agents, tracking embeddings of incoming task requests reveals shifts in what users are asking the agent to do — new task types, new tools required, new domains — that the agent's prompt or tool set may not handle well.\n\nRecommended because agent failure modes often correlate with novel task types; embedding drift is the earliest signal that the agent's capability envelope is being stress-tested."]
    }
  }, {
    // Catalog entry: Centroid Distance, a per-example outlier score derived from
    // embeddings (per `notes`, generated alongside TextEmbedding).
    name: "Centroid Distance",
    // Same docs page and guiding question as the "Text Embedding" entry.
    docsUrl: "/observability/llm/embedding-visualization-with-umap",
    objective: "Performance Risk (Data Drift)",
    q: "Are prompt/input topics shifting over time?",
    // NOTE(review): only "LLM" is listed, yet `uc` below rates
    // "Autonomous Agents" as Recommended — confirm whether "Agentic" belongs here.
    models: ["LLM"],
    type: "Evaluator",
    category: "Embedding",
    domain: "Enrichment",
    evaluatorType: "Non-AI",
    provider: "N/A",
    how: "Vector distance calc",
    notes: "Auto-generated with TextEmbedding",
    // Per-use-case guidance. Each key is a use-case label; each value is a
    // [rating, rationale] pair where rating is "Recommended", "Consider", or
    // "Optional" and rationale paragraphs are separated by "\n\n".
    uc: {
      "Summarization": ["Consider", "Centroid Distance measures how far each input's embedding sits from the centroid of historical inputs — a per-example outlier score.\n\nFor Summarization, it flags individual source documents that are unusual compared to your historical corpus, which often correlates with lower-quality summaries (the model's prompt wasn't tuned for that document shape).\n\nConsider as a triage signal — combined with Text Embedding (population drift), you get both macro and micro views of summarization-corpus health."],
      "Code Generation": ["Consider", "Centroid Distance measures how far each request's embedding sits from the centroid of historical requests — a per-example outlier score.\n\nFor Code Generation, it flags unusual user prompts (novel problem types, unusual languages, off-pattern requests) that often correlate with lower-quality generations.\n\nConsider as a per-request triage signal; combined with Text Embedding for population drift, you get both macro and micro views of request-distribution health."],
      "Content Generation": ["Consider", "Centroid Distance measures how far each input's embedding sits from the centroid of historical inputs — a per-example outlier score.\n\nFor Content Generation, it flags briefs that are unusual compared to your historical content workload (different audience, different content type, different brief structure), which often correlates with off-tone or brief-missing output.\n\nConsider as a triage signal for which generations to QA most carefully; combined with Text Embedding for population drift, you get both views."],
      "Q&A (RAG)": ["Recommended", "Centroid Distance measures how far each question's embedding sits from the centroid of historical questions — a per-example outlier score.\n\nFor Q&A (RAG), it flags individual user queries that are unusual or off-distribution, which often correlates with retrieval failures or low-quality answers.\n\nRecommended because it lets you triage which specific user interactions to investigate first; combined with Text Embedding (population-level drift), you get both the macro and micro view of question-distribution health."],
      "Chatbot": ["Recommended", "Centroid Distance measures how far each question's embedding sits from the centroid of historical questions — a per-example outlier score.\n\nFor Chatbots, it flags individual user queries that are unusual or off-distribution, which often correlates with retrieval failures or low-quality answers.\n\nRecommended because it lets you triage which specific user interactions to investigate first; combined with Text Embedding (population-level drift), you get both the macro and micro view of conversation-distribution health."],
      "Info Extraction": ["Consider", "Centroid Distance measures how far each document's embedding sits from the centroid of historical documents — a per-example outlier score.\n\nFor Info Extraction, it flags documents that look unusual compared to your processing history (new formats, languages, layouts) — which is often where per-field accuracy quietly degrades.\n\nConsider as a triage signal — combined with Text Embedding for population drift, it tells you which specific documents to QA."],
      "Classification": ["Recommended", "Centroid Distance measures how far each input's embedding sits from the centroid of historical inputs — a per-example outlier score.\n\nFor Classification, out-of-distribution inputs are the highest-risk category for misclassification — the model is most likely to be wrong on examples that don't look like anything it was trained on.\n\nRecommended because per-prediction outlier scores let you route high-risk classifications to human review or trigger retraining; combined with Text Embedding for population drift, it gives you both signals."],
      "Autonomous Agents": ["Recommended", "Centroid Distance measures how far each request's embedding sits from the centroid of historical requests — a per-example outlier score.\n\nFor Autonomous Agents, anomalous requests often signal either novel legitimate use cases (a capability gap) or adversarial probing (an attack); both deserve attention.\n\nRecommended because the per-request signal is critical for agent safety and capability planning; combined with Text Embedding for population drift, you get both views."]
    }
  }, {
    // Catalog entry: statistical data-drift metric (Jensen-Shannon Divergence /
    // Population Stability Index) over feature or prediction distributions.
    name: "Data Drift (JSD / PSI)",
    docsUrl: "/observability/platform/data-drift-platform",
    objective: "Performance Risk (Data Drift)",
    q: "Are feature or prediction distributions shifting from baseline?",
    // NOTE(review): "Agentic" is not listed, yet `uc` below includes an
    // "Autonomous Agents" rating — confirm this is intentional.
    models: ["ML", "LLM"],
    // chipTypeTip in this file returns the platform-metric tooltip for type
    // "Metric"; the evaluator-specific fields below are "N/A" for this entry.
    type: "Metric",
    category: "Drift",
    domain: "Platform",
    evaluatorType: "N/A",
    provider: "N/A",
    how: "N/A",
    notes: "Feature distribution shift detection",
    // Per-use-case guidance. Each key is a use-case label; each value is a
    // [rating, rationale] pair where rating is "Recommended", "Consider", or
    // "Optional" and rationale paragraphs are separated by "\n\n".
    uc: {
      "Summarization": ["Consider", "Jensen-Shannon Divergence and Population Stability Index quantify how much a feature's distribution has shifted between two time windows.\n\nFor Summarization, they apply to scalar metadata about source documents and summaries (length, sentiment, evaluator scores) rather than the text itself.\n\nConsider for tracking stability of derived signals; for source-content drift, Text Embedding / Centroid Distance are more direct and informative."],
      "Code Generation": ["Consider", "Jensen-Shannon Divergence and Population Stability Index quantify how much a feature's distribution has shifted between two time windows.\n\nFor Code Generation, they apply to scalar metadata — programming language, problem domain category, prompt length — rather than the code text itself.\n\nConsider for tracking categorical/scalar feature distributions; for content drift, Text Embedding is the more direct signal."],
      "Content Generation": ["Consider", "Jensen-Shannon Divergence and Population Stability Index quantify how much a feature's distribution has shifted between two time windows.\n\nFor Content Generation, they apply to scalar metadata about briefs and outputs (length, audience tags, evaluator scores) rather than the content text itself.\n\nConsider for monitoring stability of derived signals; for brief-content drift, Text Embedding / Centroid Distance are more informative."],
      "Q&A (RAG)": ["Consider", "Jensen-Shannon Divergence and Population Stability Index quantify how much a feature's distribution has shifted between two time windows.\n\nFor Q&A (RAG), they apply to scalar metadata about questions/answers (length, sentiment, evaluator scores) rather than the text itself.\n\nConsider for monitoring the stability of derived signals; for text-content drift, Text Embedding / Centroid Distance are more direct and informative."],
      "Chatbot": ["Consider", "Jensen-Shannon Divergence and Population Stability Index quantify how much a feature's distribution has shifted between two time windows.\n\nFor Chatbots, they apply to scalar metadata — conversation length, sentiment, latency, evaluator scores — not to the message text.\n\nConsider for monitoring derived-signal stability over time; for conversation-content drift, Text Embedding / Centroid Distance are more direct."],
      "Info Extraction": ["Consider", "Jensen-Shannon Divergence and Population Stability Index quantify how much a feature's distribution has shifted between two time windows.\n\nFor Info Extraction, they apply to scalar metadata about documents (page count, field counts, OCR confidence) rather than the document text.\n\nConsider for monitoring document-pipeline stability; for content drift, Text Embedding / Centroid Distance handle the text side better."],
      "Classification": ["Recommended", "Jensen-Shannon Divergence and Population Stability Index quantify how much a feature's distribution has shifted between two time windows.\n\nFor Classification, input drift is the single highest-correlation predictor of accuracy degradation — distribution shifts directly cause the trained boundaries to mismatch reality.\n\nRecommended because classification is the use case where statistical drift analysis (per-feature) most directly maps to model-quality decay; the Bias & Accuracy Cookbook treats drift as a primary classification signal."],
      "Autonomous Agents": ["Consider", "Jensen-Shannon Divergence and Population Stability Index quantify how much a feature's distribution has shifted between two time windows.\n\nFor Autonomous Agents, they apply to scalar metadata about agent runs — iteration counts, tool-call counts, latency, evaluator scores — rather than the prompts or reasoning text.\n\nConsider for tracking operational-signal stability; for task-content drift, Text Embedding / Centroid Distance are more direct."]
    }
  }, {
    // Catalog entry: Topic Classification enrichment, which labels inputs with
    // a topic for domain-insight reporting.
    name: "Topic Classification",
    docsUrl: "/sdk-api/evals/topic-classification",
    objective: "Performance Risk (Domain Insight)",
    q: "What topics are users asking about?",
    models: ["LLM", "Agentic"],
    type: "Evaluator",
    category: "Text Stats",
    domain: "Enrichment",
    evaluatorType: "Open-Source Model",
    provider: "Fiddler Centor",
    how: "Zero-shot BERT classifier (OSS)",
    notes: "Zero-shot classifier",
    // Per-use-case guidance. Each key is a use-case label; each value is a
    // [rating, rationale] pair where rating is "Recommended", "Consider", or
    // "Optional" and rationale paragraphs are separated by "\n\n".
    // NOTE(review): several rationales say topics may be auto-discovered by the
    // system; confirm that matches the zero-shot classifier described in `how`.
    uc: {
      "Summarization": ["Consider", "Topic Classification labels each input with one of a set of topics you define (or that the system auto-discovers).\n\nFor Summarization, this gives you visibility into what kinds of documents are being summarized — which often drives prompt updates and audience-targeting decisions.\n\nConsider for domain insight on summarization workload; less critical when the document type is already known and tracked upstream."],
      "Code Generation": ["Optional", "Topic Classification labels each input with one of a set of topics.\n\nFor Code Generation, the user's intent is usually explicit in the prompt — language, framework, problem area — and doesn't need topic inference layered on top.\n\nOptional because the categorization signal is typically available from explicit user input or context; if you want to roll up reporting by problem domain, FQL Custom Metrics over prompt tags is more direct."],
      "Content Generation": ["Consider", "Topic Classification labels each input with one of a set of topics you define (or that the system auto-discovers).\n\nFor Content Generation, this tells you what topics your content team is generating about — useful for editorial-calendar reporting and content-mix analysis.\n\nConsider for content-strategy insight; less critical for narrow-scope tools where content type is already tagged on input."],
      "Q&A (RAG)": ["Recommended", "Topic Classification labels each question with one of a set of topics you define (or that the system auto-discovers).\n\nFor Q&A (RAG), this tells you what users are actually asking about — which directly drives knowledge base prioritization, FAQ creation, and retrieval index tuning.\n\nRecommended because it converts the firehose of Q&A traffic into actionable category data; without it, you're flying blind on which areas of your corpus need the most investment."],
      "Chatbot": ["Recommended", "Topic Classification labels each question with one of a set of topics you define (or that the system auto-discovers).\n\nFor Chatbots, this tells you what users are actually asking about — which directly drives knowledge base prioritization, FAQ creation, and conversational design improvements.\n\nRecommended because it converts the firehose of chatbot traffic into actionable category data; without it, you're flying blind on which conversation topics need the most product investment."],
      "Info Extraction": ["Optional", "Topic Classification labels each input with one of a set of topics.\n\nFor Info Extraction, the document type is typically known upstream (which template is being processed, which schema applies) — so topic inference is redundant with information already available.\n\nOptional because the categorization signal is usually upstream; if document categorization is genuinely unknown, consider it as a triage signal."],
      "Classification": ["Consider", "Topic Classification labels each input with one of a set of topics.\n\nFor (predictive) Classification, this can give you a secondary view of the input distribution — useful for segmenting classification accuracy by topic and surfacing class/topic interactions.\n\nConsider as a meta-analysis tool; the classification labels themselves are the primary signal."],
      "Autonomous Agents": ["Consider", "Topic Classification labels each input with one of a set of topics.\n\nFor Autonomous Agents, this tells you what kinds of tasks users are asking the agent to perform — driving capability planning and tool-suite design.\n\nConsider for agent-operations insight (which tasks are most common, which are growing); less critical for narrow single-purpose agents."]
    }
  }, {
    // Catalog entry: Language Detection enrichment, which identifies the
    // natural language of inputs/outputs.
    name: "Language Detection",
    docsUrl: "/observability/llm/enrichments",
    objective: "Performance Risk (Domain Insight)",
    q: "What languages are users interacting in?",
    // NOTE(review): only "LLM" is listed, yet `uc` below includes an
    // "Autonomous Agents" rating — confirm whether "Agentic" belongs here.
    models: ["LLM"],
    type: "Evaluator",
    category: "Text Stats",
    domain: "Enrichment",
    // NOTE(review): labelled "Non-AI" while `how` is "fastText"; fastText
    // language identification is typically a trained classifier — confirm the
    // intended evaluatorType/provider before relying on these for filtering.
    evaluatorType: "Non-AI",
    provider: "N/A",
    how: "fastText",
    notes: "fastText-based",
    // Per-use-case guidance. Each key is a use-case label; each value is a
    // [rating, rationale] pair where rating is "Recommended", "Consider", or
    // "Optional" and rationale paragraphs are separated by "\n\n".
    uc: {
      "Summarization": ["Optional", "Language Detection identifies the natural language of each input/output.\n\nFor Summarization, this matters only when you serve a multilingual user base — a model trained primarily on English summarizing a French document into English (instead of French) is a quality failure.\n\nOptional for single-language deployments; promote to Consider for any multilingual summarization pipeline."],
      "Code Generation": ["Optional", "Language Detection identifies the natural language of each input/output.\n\nFor Code Generation, programming language is specified explicitly (in prompt, file extension, or framework context), and the natural language of comments is usually English by convention.\n\nOptional because the metric's premise (detect language of natural text) rarely adds signal to code-gen; if multilingual comments matter for your team, it becomes Consider."],
      "Content Generation": ["Consider", "Language Detection identifies the natural language of each input/output.\n\nFor Content Generation, it surfaces multilingual content patterns and catches outputs that drift into the wrong language — generating English copy when a French brief was provided, for example.\n\nConsider for any content-gen tool with multilingual briefs; promote to Recommended if you've made explicit per-market language commitments."],
      "Q&A (RAG)": ["Consider", "Language Detection identifies the natural language of each input/output.\n\nFor Q&A (RAG), it surfaces multilingual usage patterns and catches responses that drift into the wrong language (a model trained primarily on English answering a French question in English, for example).\n\nConsider for any Q&A with a non-trivial multilingual user base; promote to Recommended if you've made explicit per-market language commitments."],
      "Chatbot": ["Recommended", "Language Detection identifies the natural language of each input/output.\n\nFor Chatbots, multilingual user bases are common (especially in consumer-facing or international deployments), and responses drifting into the wrong language (English answer to a Spanish question) is a visible quality failure.\n\nRecommended because language-match is a baseline conversational quality requirement for multilingual chat; without this signal, language drift slips through other monitoring."],
      "Info Extraction": ["Consider", "Language Detection identifies the natural language of each input/output.\n\nFor Info Extraction, it surfaces multilingual document patterns and catches extractions that mishandle non-English content — the Bias & Accuracy Cookbook covers bias-across-languages as a key extraction concern.\n\nConsider for any extraction pipeline serving multilingual document corpora; less critical for English-only document workflows."],
      "Classification": ["Optional", "Language Detection identifies the natural language of each input/output.\n\nFor Classification, the input language is usually controlled (single-language model, single-language corpus) and the output is a label that doesn't have a language.\n\nOptional because the metric's premise doesn't add signal to typical classification pipelines; consider only for multilingual classification deployments."],
      "Autonomous Agents": ["Consider", "Language Detection identifies the natural language of each input/output.\n\nFor Autonomous Agents serving multilingual users, it catches responses that mismatch the user's language and surfaces multilingual usage patterns.\n\nConsider for agents with multilingual user bases; less critical for single-language deployments."]
    }
  }, {
    // Catalog entry: Sentiment Analysis enrichment, which scores text as
    // positive, neutral, or negative in tone.
    name: "Sentiment Analysis",
    docsUrl: "/sdk-api/evals/sentiment",
    objective: "Performance Risk (Domain Insight)",
    q: "What is the sentiment of inputs/outputs?",
    models: ["LLM", "Agentic"],
    type: "Evaluator",
    category: "Text Stats",
    domain: "Enrichment",
    // NOTE(review): labelled "Open-Source Model" while `how` is "VADER (OSS)";
    // VADER is generally described as a lexicon/rule-based analyzer — confirm
    // this is the intended evaluatorType.
    evaluatorType: "Open-Source Model",
    provider: "Fiddler Centor",
    how: "VADER (OSS)",
    notes: "VADER-based",
    // Per-use-case guidance. Each key is a use-case label; each value is a
    // [rating, rationale] pair where rating is "Recommended", "Consider", or
    // "Optional" and rationale paragraphs are separated by "\n\n".
    uc: {
      "Summarization": ["Optional", "Sentiment Analysis scores responses as positive, neutral, or negative in tone.\n\nFor Summarization, the summary's sentiment usually reflects the source content's sentiment — and scoring 'is this summary appropriately negative?' isn't a useful quality signal because Faithfulness already handles 'does the summary match the source?'.\n\nOptional because the metric measures a derived property of the source that adds little to summarization-specific quality monitoring."],
      "Code Generation": ["Optional", "Sentiment Analysis scores text as positive, neutral, or negative in tone.\n\nCode Generation produces code, not natural-language text where sentiment is a meaningful dimension.\n\nOptional because the metric's premise (natural-language tone) doesn't apply to code; if you want to score comment tone specifically, a Custom Judge would be more direct."],
      "Content Generation": ["Recommended", "Sentiment Analysis scores responses as positive, neutral, or negative in tone.\n\nFor Content Generation, brand-safety standards typically require tone control — overly negative copy, inappropriately upbeat language for serious topics, or sentiment misaligned with brand voice are direct quality failures.\n\nRecommended because the Bias & Accuracy Cookbook treats sentiment as a primary brand-safety evaluator for content; pair with Custom Judge for nuanced brand-voice criteria."],
      "Q&A (RAG)": ["Consider", "Sentiment Analysis scores each response as positive, neutral, or negative in tone.\n\nFor Q&A (RAG), it's most useful as a bias-detection signal — comparing answer sentiment across user demographic segments to catch systemically harsher responses for some groups (per the Bias & Accuracy Cookbook).\n\nConsider as a supporting metric for bias workflows; less critical as a standalone quality measure since Q&A is information delivery, not relationship building."],
      "Chatbot": ["Recommended", "Sentiment Analysis scores responses as positive, neutral, or negative in tone.\n\nFor Chatbots, tone matching matters — a chatbot responding to a frustrated user with relentlessly upbeat language reads as tone-deaf; a chatbot turning negative when users escalate damages relationships.\n\nRecommended because the Bias & Accuracy Cookbook treats response sentiment as a primary chatbot evaluator; emotional appropriateness is a core conversational quality dimension."],
      "Info Extraction": ["Optional", "Sentiment Analysis scores text as positive, neutral, or negative in tone.\n\nInfo Extraction produces structured fields, not natural-language responses — sentiment of extracted fields rarely makes sense.\n\nOptional because the output shape doesn't fit the metric's input shape; if sentiment of the source document is meaningful for downstream analysis, score it on the source separately."],
      "Classification": ["Optional", "Sentiment Analysis scores text as positive, neutral, or negative in tone.\n\nFor Classification, sentiment is often what's being classified (sentiment classifiers are a common application) — so the Sentiment Analysis evaluator becomes circular if applied to a sentiment classifier's outputs.\n\nOptional because the metric is either redundant (if your classifier handles sentiment) or off-topic (if it classifies something else)."],
      "Autonomous Agents": ["Consider", "Sentiment Analysis scores responses as positive, neutral, or negative in tone.\n\nFor Autonomous Agents that produce user-facing responses, tracking response sentiment surfaces cases where the agent's tone is misaligned with the situation (overly cheerful when reporting failure, overly negative when delivering routine updates).\n\nConsider for customer-facing agents; for internal/system agents, response tone is rarely a quality-affecting dimension."]
    }
  }, {
    name: "Statistics (user-defined fields)",
    docsUrl: "/observability/platform/statistics",
    objective: "Performance Risk (Domain Insight)",
    q: "What are the descriptive statistics on my data?",
    models: ["ML", "LLM", "Agentic"],
    type: "Metric",
    category: "Statistics",
    domain: "Platform",
    evaluatorType: "N/A",
    provider: "N/A",
    how: "N/A",
    notes: "Descriptive stats on any column",
    uc: {
      "Summarization": ["Consider", "The Statistics metric computes descriptive aggregates (mean, percentiles, min/max) on numeric fields you tag in your spans.\n\nFor Summarization, this is useful for tracking signals like average source-document length, summarization ratio (summary length / source length), or evaluator score distributions over time.\n\nConsider when you instrument your summarization pipeline with custom numeric fields you want to track; the value scales with the depth of your instrumentation."],
      "Code Generation": ["Consider", "The Statistics metric computes descriptive aggregates (mean, percentiles, min/max) on numeric fields you tag in your spans.\n\nFor Code Generation, this is useful for tracking generation length, lines-of-code metrics, test-pass rates, or other numeric signals you tag per generation.\n\nConsider when you've instrumented numeric quality signals you want trended over time; for evaluator-output aggregation, FQL Custom Metrics offers more flexibility."],
      "Content Generation": ["Consider", "The Statistics metric computes descriptive aggregates (mean, percentiles, min/max) on numeric fields you tag in your spans.\n\nFor Content Generation, this can track per-content-type numeric signals — average word count, average judge score, generations per editor per day — that aggregate into content-ops dashboards.\n\nConsider when your content pipeline has explicit numeric signals to monitor; the value scales with instrumentation depth."],
      "Q&A (RAG)": ["Consider", "The Statistics metric computes descriptive aggregates (mean, percentiles, min/max) on numeric fields you tag in your spans.\n\nFor Q&A (RAG), this is useful for tracking things like average retrieval count per question, average judge-evaluator score, or business-specific signals you log alongside the question.\n\nConsider as a flexible companion to the dedicated metrics; the value scales with how much custom instrumentation you do beyond out-of-the-box fields."],
      "Chatbot": ["Consider", "The Statistics metric computes descriptive aggregates (mean, percentiles, min/max) on numeric fields you tag in your spans.\n\nFor Chatbots, this tracks conversational signals — average turns per conversation, average response length, average evaluator scores by user segment — useful for ops dashboards and reporting.\n\nConsider when your chatbot instrumentation includes custom numeric signals you want aggregated over time."],
      "Info Extraction": ["Consider", "The Statistics metric computes descriptive aggregates (mean, percentiles, min/max) on numeric fields you tag in your spans.\n\nFor Info Extraction, this surfaces patterns in extracted-field values — average invoice amounts, percentile distributions of contract durations, anomalies in numeric field distributions — useful both for extraction-quality monitoring and downstream business analytics.\n\nConsider for any extraction pipeline whose outputs include numeric fields you want monitored."],
      "Classification": ["Recommended", "The Statistics metric computes descriptive aggregates (mean, percentiles, min/max) on numeric fields you tag in your spans.\n\nFor Classification, predicted-class distribution monitoring is critical — sudden shifts in the rate of class A vs class B predictions often indicate input drift, model drift, or upstream data-pipeline issues, and statistical aggregates over predictions are the simplest way to surface that.\n\nRecommended because class-distribution monitoring is a standard classification operations practice; the metric automates it."],
      "Autonomous Agents": ["Consider", "The Statistics metric computes descriptive aggregates (mean, percentiles, min/max) on numeric fields you tag in your spans.\n\nFor Autonomous Agents, this tracks agent-run signals — average tool-calls per run, percentile distributions of step counts, average end-to-end durations — that feed agent-ops dashboards.\n\nConsider when your agent instrumentation includes numeric signals you want trended; for derived KPIs combining signals, FQL Custom Metrics offers more flexibility."]
    }
  }, {
    // Catalog entry for user-defined metrics written in Fiddler Query Language.
    // Typed as a platform "Metric", so the evaluator-specific fields
    // (evaluatorType / provider / how) are all "N/A".
    name: "Custom Metrics (FQL)",
    docsUrl: "/observability/platform/custom-metrics",
    objective: "Performance Risk (Domain Insight)",
    q: "How do I track custom business KPIs?",
    models: ["ML", "LLM", "Agentic"],
    type: "Metric",
    category: "Custom",
    domain: "Platform",
    evaluatorType: "N/A",
    provider: "N/A",
    how: "N/A",
    notes: "User-defined via Fiddler Query Language",
    // Per-use-case guidance: use-case name -> [rating, rationale].
    // Ratings used here are "Recommended" and "Consider"; the rationale text
    // uses "\n\n" as a paragraph separator.
    uc: {
      "Summarization": ["Consider", "Custom Metrics use Fiddler Query Language (FQL) to compute deterministic metrics over span fields and evaluator outputs.\n\nFor Summarization, this lets you encode hard quality rules — 'percentage of summaries longer than 500 tokens', 'count of summaries failing both Faithfulness and Conciseness', 'average evaluator score by document type'.\n\nConsider when you have rule-based summarization quality definitions (length caps, required fields) or want to combine multiple evaluator signals into a composite quality KPI."],
      "Code Generation": ["Recommended", "Custom Metrics use Fiddler Query Language (FQL) to compute deterministic metrics over span fields and evaluator outputs.\n\nFor Code Generation, FQL lets you build code-quality KPIs that combine multiple signals — 'percentage of generations failing test execution', 'average judge score by language', 'count of generations with security warnings'.\n\nRecommended because code-quality reporting typically needs composite KPIs spanning multiple evaluator outputs; FQL is the canonical way to define them in Fiddler."],
      "Content Generation": ["Recommended", "Custom Metrics use Fiddler Query Language (FQL) to compute deterministic metrics over span fields and evaluator outputs.\n\nFor Content Generation, FQL encodes content-specific KPIs — 'rate of off-brand content per editor', 'average brand-voice judge score by content type', 'composite quality score combining multiple judges'.\n\nRecommended because content-team reporting usually needs custom aggregations across multiple quality signals; FQL is the rule-engine for that aggregation."],
      "Q&A (RAG)": ["Consider", "Custom Metrics use Fiddler Query Language (FQL) to define computed metrics over span fields and evaluator outputs.\n\nFor Q&A (RAG), this lets you encode domain-specific quality rules — e.g. 'answer score by topic-x retrieval mode' or 'percentage of answers without citations'.\n\nConsider when your team has nuanced quality definitions that don't map cleanly to the built-in evaluators; the lift is in the FQL authoring, not the metric itself."],
      "Chatbot": ["Consider", "Custom Metrics use Fiddler Query Language (FQL) to compute deterministic metrics over span fields and evaluator outputs.\n\nFor Chatbots, FQL can encode conversational KPIs — 'rate of unresolved escalations', 'percentage of turns failing scope policy', 'composite chatbot quality score'.\n\nConsider when you have rule-based chatbot policies or composite KPI requirements; for core quality monitoring, the built-in evaluators are usually sufficient."],
      "Info Extraction": ["Recommended", "Custom Metrics use Fiddler Query Language (FQL) to compute deterministic metrics over span fields and evaluator outputs.\n\nFor Info Extraction, FQL is the standard way to define per-field accuracy rules and aggregate them into extraction-quality KPIs — 'invoice total extraction accuracy', 'contract party-name accuracy', 'composite extraction score'.\n\nRecommended because the cookbook treats FQL custom metrics as a primary evaluation method for extraction; aggregate KPIs across fields are how teams actually report extraction quality."],
      "Classification": ["Recommended", "Custom Metrics use Fiddler Query Language (FQL) to compute deterministic metrics over span fields and evaluator outputs.\n\nFor Classification, FQL is the natural place to define per-class precision/recall/F1, confusion matrices, and class-imbalance signals — and to aggregate them into business-meaningful KPIs ('false-positive rate on high-stakes class X', 'macro-F1 across customer-tier segments').\n\nRecommended because classification quality is reported at the class-level granularity FQL handles natively."],
      "Autonomous Agents": ["Recommended", "Custom Metrics use Fiddler Query Language (FQL) to compute deterministic metrics over span fields and evaluator outputs.\n\nFor Autonomous Agents, FQL encodes agent-specific KPIs — 'tool-call success rate', 'percentage of agent runs hitting iteration cap', 'sub-goal completion rate by task type', or composite scores combining tool-call accuracy + reasoning quality + answer relevance.\n\nRecommended because agent reporting typically aggregates many signals into single KPIs; FQL is the rule-engine that produces those KPIs."]
    }
  }, {
    // Catalog entry for the request-volume platform metric.
    // Typed as a platform "Metric", so evaluatorType / provider / how are "N/A".
    name: "Traffic",
    docsUrl: "/observability/platform/traffic-platform",
    objective: "Operational Health & Efficiency",
    q: "How much traffic is my model receiving?",
    models: ["ML", "LLM", "Agentic"],
    type: "Metric",
    category: "Traffic",
    domain: "Platform",
    evaluatorType: "N/A",
    provider: "N/A",
    how: "N/A",
    notes: "Event volume monitoring",
    // Per-use-case guidance: use-case name -> [rating, rationale].
    // Only "Chatbot" and "Autonomous Agents" are rated "Recommended" here;
    // every other use case is "Consider".
    uc: {
      "Summarization": ["Consider", "Traffic counts the volume of Summarization requests over time, sliced by whatever dimensions you care about.\n\nFor production summarization pipelines, this is basic observability — knowing usage growth, peak times, and document-type mix — useful for capacity planning and contextualizing other metrics.\n\nConsider as a foundational signal; standard for any production deployment, but rarely the metric that surfaces quality issues."],
      "Code Generation": ["Consider", "Traffic counts the volume of code-gen requests over time.\n\nFor production code-gen services (developer copilots, automated PR creation), it informs capacity planning and helps contextualize quality dips (a Faithfulness drop during a low-traffic window is less significant than during peak).\n\nConsider as a foundational signal; standard for production but not a quality alert in itself."],
      "Content Generation": ["Consider", "Traffic counts the volume of content-gen requests over time.\n\nFor production content-gen pipelines, it informs cost planning and capacity decisions, and contextualizes other metrics (a quality dip in a low-traffic window is less critical than one during peak content production).\n\nConsider as basic ops signal; not a quality indicator on its own."],
      "Q&A (RAG)": ["Consider", "Traffic counts the volume of Q&A interactions over time, sliced by whatever dimensions you care about.\n\nFor Q&A (RAG), it's basic observability — knowing whether usage is growing, which times of day are peak, and which user segments are driving load.\n\nConsider as a foundational signal that doesn't directly indicate quality but helps you size cost/capacity decisions and contextualize the other metrics (a Faithfulness dip during a low-traffic window means less than the same dip during peak hours)."],
      "Chatbot": ["Recommended", "Traffic counts the volume of chatbot interactions over time, sliced by whatever dimensions you care about.\n\nFor Chatbots, sudden traffic spikes often indicate real-world events — a marketing campaign drove customers in, a product outage caused a flood of support questions, or an incident is unfolding — that you need to know about quickly.\n\nRecommended because chatbot traffic correlates tightly with operational events; the signal is more actionable here than for other use cases."],
      "Info Extraction": ["Consider", "Traffic counts the volume of extraction requests over time.\n\nFor production extraction pipelines, it informs capacity planning and helps contextualize per-field accuracy dips.\n\nConsider as a foundational ops signal; standard for production but not a primary quality indicator."],
      "Classification": ["Consider", "Traffic counts the volume of classification requests over time.\n\nFor production classifiers, it informs capacity decisions and helps contextualize accuracy regressions (a precision dip on low-traffic class X means less than the same dip during high-volume class Y).\n\nConsider as basic ops signal; combined with Statistics on class distribution, it gives you the workload-shape view."],
      "Autonomous Agents": ["Recommended", "Traffic counts the volume of agent runs over time, sliced by whatever dimensions you care about.\n\nFor Autonomous Agents, traffic spikes are early-warning signals — cascading agent failures can cause runaway invocation patterns, and unexpected traffic surges often signal users (or other agents) probing capabilities at scale.\n\nRecommended because agent ops requires tight loops on usage patterns; spikes are often the first signal of behavioral issues."]
    }
  }, {
    // Catalog entry for the data-validity platform metric (null / type / range
    // violations, per `notes`). Typed as a platform "Metric", so
    // evaluatorType / provider / how are "N/A".
    name: "Data Integrity",
    docsUrl: "/observability/platform/data-integrity-platform",
    objective: "Operational Health & Efficiency",
    q: "Is my data pipeline producing valid data?",
    models: ["ML", "LLM", "Agentic"],
    type: "Metric",
    category: "Integrity",
    domain: "Platform",
    evaluatorType: "N/A",
    provider: "N/A",
    how: "N/A",
    notes: "Null/type/range violations",
    // Per-use-case guidance: use-case name -> [rating, rationale].
    // "Info Extraction" and "Classification" are rated "Recommended";
    // every other use case is "Consider".
    uc: {
      "Summarization": ["Consider", "Data Integrity detects null, empty, or malformed values in expected fields.\n\nFor Summarization, this catches operational issues — a pipeline step failing silently and returning an empty summary, a logging bug dropping the response field, etc.\n\nConsider for production reliability — it's not a quality signal per se, but it's often the first thing to check when other metrics suddenly go strange (you may be measuring missing data, not bad data)."],
      "Code Generation": ["Consider", "Data Integrity detects null, empty, or malformed values in expected fields.\n\nFor Code Generation, this catches operational issues like empty code outputs, dropped completion fields, or truncated generations that didn't get logged correctly.\n\nConsider for production reliability monitoring; surfaces silent failures before they show up in quality metrics."],
      "Content Generation": ["Consider", "Data Integrity detects null, empty, or malformed values in expected fields.\n\nFor Content Generation, it surfaces operational issues — generation failures returning empty content, logging bugs dropping the output field, content that wasn't formatted correctly.\n\nConsider for production reliability; first signal to check when quality metrics suddenly look weird."],
      "Q&A (RAG)": ["Consider", "Data Integrity detects null, empty, or malformed values in expected fields.\n\nFor Q&A (RAG), this catches operational bugs — a retrieval pipeline failing silently and returning empty context, or a response field that didn't get logged correctly.\n\nConsider for production reliability — it's not a quality signal per se, but it's the first thing to check when other metrics suddenly go weird (you might be measuring missing data, not bad data)."],
      "Chatbot": ["Consider", "Data Integrity detects null, empty, or malformed values in expected fields.\n\nFor Chatbots, it catches silent failures — empty responses, dropped fields, truncated answers — that may not be obvious from quality evaluators alone.\n\nConsider for production reliability; especially valuable when other metrics inexplicably regress (you may be looking at missing rather than bad data)."],
      "Info Extraction": ["Recommended", "Data Integrity detects null, empty, or malformed values in expected fields.\n\nFor Info Extraction, this is core — missing required fields, type violations (a string where a number was expected), or schema violations are direct extraction-quality failures, not just operational issues; the cookbook treats schema completeness as the 'overall accuracy' signal for extraction.\n\nRecommended because extraction outputs feed downstream systems that expect specific shapes; integrity failures break those pipelines immediately."],
      "Classification": ["Recommended", "Data Integrity detects null, empty, or malformed values in expected fields.\n\nFor Classification, null predictions, type violations, and missing labels are direct quality failures — a classifier returning null where a label is required indicates either a degenerate model state or an upstream input issue, both of which matter immediately.\n\nRecommended because classifier outputs are typically routed into downstream systems that expect well-formed labels; integrity failures break those routings."],
      "Autonomous Agents": ["Consider", "Data Integrity detects null, empty, or malformed values in expected fields.\n\nFor Autonomous Agents, this catches malformed agent outputs and failed tool calls — but for agents, tool-call success is usually monitored separately through tool-specific evaluators and FQL metrics.\n\nConsider as a fallback signal for general output integrity; for tool-call-specific issues, instrument those directly."]
    }
  }, {
    // Catalog entry for per-interaction token counting.
    // Unlike the neighboring platform metrics, this entry is typed as an
    // "Evaluator" in the "Enrichment" domain, fills in evaluatorType /
    // provider / how, and lists only "LLM" and "Agentic" under `models`.
    name: "Token Count",
    docsUrl: "/observability/llm/enrichments",
    objective: "Operational Health & Efficiency",
    q: "How many tokens are being consumed?",
    models: ["LLM", "Agentic"],
    type: "Evaluator",
    category: "Text Stats",
    domain: "Enrichment",
    evaluatorType: "Open-Source Model",
    provider: "Fiddler Centor",
    how: "TikToken (OSS)",
    notes: "tiktoken-based",
    // Per-use-case guidance: use-case name -> [rating, rationale].
    // All three rating levels appear here: "Recommended" for most use cases,
    // "Consider" for Info Extraction, "Optional" for Classification.
    uc: {
      "Summarization": ["Recommended", "Token Count records the input and output token counts for each Summarization interaction.\n\nFor Summarization, token counts feed bias detection (the Bias & Accuracy Cookbook compares summary length across content-source segments) and directly measure summarization efficiency (summary tokens / source tokens).\n\nRecommended because the signal is free, supports the cookbook's bias workflow, and lets you track whether the model is genuinely summarizing vs producing roughly source-length text."],
      "Code Generation": ["Recommended", "Token Count records the input and output token counts for each Code Generation interaction.\n\nFor Code Generation, this is critical for cost control — code generation is typically token-heavy compared to other LLM use cases — and feeds bias-detection workflows where output length differences across user/language segments may indicate quality gaps.\n\nRecommended because the cookbook treats token count as a primary bias-detection signal and cost matters at code-gen scale."],
      "Content Generation": ["Recommended", "Token Count records the input and output token counts for each Content Generation interaction.\n\nFor Content Generation, token counts feed bias detection (the cookbook compares content length across audience segments) and provide foundational data for cost / efficiency reporting.\n\nRecommended because the signal is free, supports the cookbook's bias workflow, and content-length monitoring is a basic content-ops requirement."],
      "Q&A (RAG)": ["Recommended", "Token Count records the input and output token counts for each Q&A interaction.\n\nFor Q&A (RAG), it directly feeds bias detection (comparing answer length across user-demographic segments per the Bias & Accuracy Cookbook) as well as cost forecasting and prompt-length analysis.\n\nRecommended because it's a free signal that supports multiple high-value downstream analyses; there's no good reason not to track it on every Q&A deployment."],
      "Chatbot": ["Recommended", "Token Count records the input and output token counts for each Chatbot interaction.\n\nFor Chatbots, the cookbook compares response length across user-demographic segments to surface bias (systemically shorter responses for certain user groups), and token costs add up rapidly across high-volume conversations.\n\nRecommended because token count directly enables both the bias workflow and cost monitoring at conversation scale."],
      "Info Extraction": ["Consider", "Token Count records the input and output token counts for each Info Extraction interaction.\n\nFor Info Extraction, outputs are typically structured fields rather than long generated text — so token counts are lower and less varied than in generation use cases.\n\nConsider for cost monitoring on high-volume extraction; for bias detection, per-field accuracy and Custom Judge signals are more direct than token-count comparisons."],
      "Classification": ["Optional", "Token Count records the input and output token counts for each Classification interaction.\n\nFor Classification, inputs are typically short (a sentence, a document fragment) and outputs are single labels — token counts are uniformly low and don't carry meaningful variation.\n\nOptional because the metric's signal-to-noise is low for classification; use Statistics on class distribution and Data Integrity for the operational signals you actually need."],
      "Autonomous Agents": ["Recommended", "Token Count records the input and output token counts for each agent interaction.\n\nFor Autonomous Agents, every step makes one or more LLM calls — multi-step workflows multiply token spend rapidly, and bias detection (comparing token totals across user segments) catches cases where some users get longer agent runs than others.\n\nRecommended because both cost control and bias detection require this signal; without it, agent operations are flying blind on both fronts."]
    }
  }, {
    // Catalog entry for the response-time platform metric (end-to-end and
    // per-step, per `notes`). Typed as a platform "Metric", so
    // evaluatorType / provider / how are "N/A".
    // NOTE(review): `docsUrl` is identical to the "Traffic" entry's URL, and
    // `q` mentions only LLMs / agents although `models` also lists "ML" —
    // confirm both are intentional.
    name: "Latency",
    docsUrl: "/observability/platform/traffic-platform",
    objective: "Operational Health & Efficiency",
    q: "Are my LLMs / agents slow to respond?",
    models: ["ML", "LLM", "Agentic"],
    type: "Metric",
    category: "Performance",
    domain: "Platform",
    evaluatorType: "N/A",
    provider: "N/A",
    how: "N/A",
    notes: "End-to-end and per-step response time",
    // Per-use-case guidance: use-case name -> [rating, rationale].
    // All three rating levels appear here; several "Optional" rationales
    // describe when the reader should treat the metric as "Consider" instead.
    uc: {
      "Summarization": ["Optional", "Latency measures the end-to-end response time for each Summarization interaction.\n\nFor Summarization, most production deployments are batch (overnight document processing, scheduled report generation) where latency is dominated by orchestration rather than per-summary time.\n\nOptional for batch-style summarization; promote to Consider for interactive summarization tools where the user is waiting."],
      "Code Generation": ["Consider", "Latency measures the end-to-end response time for each Code Generation interaction.\n\nFor Code Generation, latency matters for interactive developer tools (copilots, in-IDE assistants) where users wait for completions in real time; for batch use cases (automated PR creation, bulk migration), it's less critical.\n\nConsider for interactive code-gen; for batch, focus on throughput rather than latency."],
      "Content Generation": ["Optional", "Latency measures the end-to-end response time for each Content Generation interaction.\n\nFor Content Generation, generation is typically not latency-sensitive — content is generated, reviewed, and published in workflows measured in minutes-to-hours, not the seconds where latency matters to UX.\n\nOptional because the metric rarely drives quality decisions for content; promote to Consider for high-volume content pipelines with explicit SLAs."],
      "Q&A (RAG)": ["Consider", "Latency measures the end-to-end response time for each Q&A interaction (retrieval + generation + evaluation).\n\nFor Q&A (RAG), interactive UX is sensitive to latency — users tolerate a few seconds but lose engagement past that, especially in chat-style Q&A.\n\nConsider for any user-facing Q&A; the threshold for promotion to Recommended depends on whether your deployment is interactive (chat, support tools) vs batch (overnight document QA)."],
      "Chatbot": ["Recommended", "Latency measures the end-to-end response time for each Chatbot interaction (retrieval + generation + evaluation).\n\nFor Chatbots, interactive UX is highly latency-sensitive — users tolerate a few seconds but disengage past that, especially in chat-style flows where typing indicators set expectations.\n\nRecommended because latency directly impacts conversational engagement; the threshold for 'too slow' is tighter for chat than any other use case."],
      "Info Extraction": ["Optional", "Latency measures the end-to-end response time for each extraction interaction.\n\nFor Info Extraction, most production pipelines are batch (overnight document processing, post-OCR pipelines) where per-extraction latency is less important than batch throughput.\n\nOptional for batch extraction; promote to Consider for real-time extraction (e.g. live document processing during a phone call)."],
      "Classification": ["Optional", "Latency measures the end-to-end response time for each classification interaction.\n\nFor Classification, inference is typically fast (especially with native classifier models vs LLM-based judges) and most use cases are batch-oriented — latency rarely drives quality or UX decisions.\n\nOptional because per-classification latency is rarely a quality concern; throughput matters more for batch classification."],
      "Autonomous Agents": ["Recommended", "Latency measures the end-to-end response time for each agent interaction.\n\nFor Autonomous Agents, both end-to-end latency (total time to complete a task) and per-step latency (how long each tool call or reasoning step takes) matter — agents amplify per-step latency across many steps, and total latency directly affects UX for interactive agent flows.\n\nRecommended because agents are the use case where latency compounds most dramatically; the cookbook treats end-to-end task time as a primary monitoring signal."]
    }
  }, {
    // Catalog entry for token-spend / cost tracking; the last element of the
    // catalog array. `type` is "Either" rather than "Metric" or "Evaluator",
    // and `models` lists only "LLM" and "Agentic".
    name: "Token Usage & Cost",
    docsUrl: "/observability/llm/llm-based-metrics",
    objective: "Operational Health & Efficiency",
    q: "How much is my model costing?",
    models: ["LLM", "Agentic"],
    type: "Either",
    category: "Cost",
    domain: "Platform",
    evaluatorType: "N/A",
    provider: "N/A",
    how: "N/A",
    notes: "Track token consumption and associated costs; inform model selection and optimization",
    // Per-use-case guidance: use-case name -> [rating, rationale].
    // All three rating levels appear here; "Classification" is the only
    // use case rated "Optional".
    uc: {
      "Summarization": ["Consider", "Token Usage & Cost tracks the cumulative token spend (and downstream $ cost) of running your Summarization pipeline, including any LLM-judge evaluator calls.\n\nFor Summarization at moderate volume, cost is usually predictable and dominated by input tokens (long source documents).\n\nConsider for high-volume summarization pipelines or cost-sensitive deployments; for low-volume or research workflows, cost is rarely the binding constraint."],
      "Code Generation": ["Recommended", "Token Usage & Cost tracks the cumulative token spend (and downstream $ cost) of running your Code Generation pipeline.\n\nFor Code Generation, costs can be substantial — long context windows, expensive code-specialist models, repeated generations during iterative coding — and cost tracking is essential for model-selection and prompt-optimization decisions.\n\nRecommended because code-gen is one of the higher-cost LLM use cases per request; without tracking, the bill compounds before you notice."],
      "Content Generation": ["Consider", "Token Usage & Cost tracks the cumulative token spend (and downstream $ cost) of running your Content Generation pipeline.\n\nFor Content Generation, costs scale with content volume — at moderate volume, content costs are usually modest; at high volume (e.g. SEO content factories, automated email campaigns), cost monitoring becomes essential.\n\nConsider based on your scale; for high-volume content operations, promote to Recommended."],
      "Q&A (RAG)": ["Consider", "Token Usage & Cost tracks the cumulative token spend (and downstream $ cost) of running your Q&A (RAG) pipeline, including any LLM-judge evaluator calls.\n\nFor Q&A specifically, evaluator spend can rival generation spend if you score every response with LLM judges — cost monitoring tells you when to switch some evaluators to Fast variants or sampled scoring.\n\nConsider for any Q&A deployment with budget pressure or at moderate-to-high scale."],
      "Chatbot": ["Recommended", "Token Usage & Cost tracks the cumulative token spend (and downstream $ cost) of running your Chatbot pipeline, including any LLM-judge evaluator calls.\n\nFor Chatbots, costs add up rapidly at scale — each conversation makes multiple LLM calls, and evaluator-spend can rival generation-spend if you score every response with LLM judges.\n\nRecommended because cost monitoring is essential for any chatbot at moderate-or-higher scale; it informs when to switch some evaluators to Fast variants or sampled scoring."],
      "Info Extraction": ["Consider", "Token Usage & Cost tracks the cumulative token spend (and downstream $ cost) of running your Info Extraction pipeline.\n\nFor Info Extraction, costs scale with document volume — at moderate volume, costs are predictable; at high volume (e.g. document processing platforms), cost tracking informs model-selection trade-offs (cheaper extraction model + custom judge vs expensive single-pass extraction).\n\nConsider based on your extraction volume."],
      "Classification": ["Optional", "Token Usage & Cost tracks the cumulative token spend (and downstream $ cost) of running your Classification pipeline.\n\nFor Classification, inference is typically cheap — small inputs, single-token outputs, often using small specialized classifier models rather than expensive LLMs.\n\nOptional because classification cost rarely warrants dedicated monitoring; for LLM-based classifiers at scale, consider it."],
      "Autonomous Agents": ["Recommended", "Token Usage & Cost tracks the cumulative token spend (and downstream $ cost) of running your agent pipeline.\n\nFor Autonomous Agents, costs are typically the highest of any LLM use case — every step makes one or more LLM calls, agents often iterate to convergence (which can blow past iteration caps under failure modes), and tool-call results often re-enter the prompt context, compounding costs.\n\nRecommended because agents are the most expensive AI pattern at scale; without cost tracking, agent operations have no early-warning signal for runaway spend."]
    }
  }];
  const _COOKBOOK = {
    meta: {
      title: "Bias and Accuracy Tracking for GenAI Applications",
      subtitle: "Fiddler AI Observability Cookbook"
    },
    byUseCase: {
      "Summarization": {
        evaluators: [{
          name: "Faithfulness",
          docsUrl: "/observability/llm/llm-based-metrics",
          aka: "Faithfulness (optimized for summarization)",
          measures: "Assesses whether the summary accurately represents the source material without introducing hallucinations or distortions.",
          value: "Accuracy Baseline: Ensures the summary doesn't fabricate or misrepresent information from the original text."
        }, {
          name: "Conciseness",
          docsUrl: "/sdk-api/evals/conciseness",
          measures: "Evaluates whether the summary is appropriately brief while retaining essential information.",
          value: "Efficiency: Confirms the model is truly summarizing, not just excerpting or paraphrasing at length."
        }],
        biasStrategy: {
          bullets: ["Tag summaries with metadata about source content (author demographics, topic categories, document type)", "Compare Faithfulness and Conciseness scores across segments", "Monitor token count differences between summaries of similar-length documents from different segments"],
          example: "Do summaries of technical papers by women researchers average 20% fewer tokens than those by men, suggesting less thorough coverage?"
        }
      },
      "Info Extraction": {
        evaluators: [{
          name: "Per-Field Accuracy",
          aka: "Per-Field Accuracy LLM Judge",
          cookbookOnly: true,
          measures: "Evaluates extraction accuracy for each specific field (e.g., 'name,' 'date,' 'amount').",
          value: "Granular Quality Control: Pinpoints which fields the model extracts reliably vs. which require improvement."
        }, {
          name: "Overall Accuracy",
          aka: "Overall Accuracy LLM Judge",
          cookbookOnly: true,
          measures: "Assesses whether all required information was extracted correctly in aggregate.",
          value: "Completeness Check: Ensures no critical data is missed during extraction."
        }],
        biasStrategy: {
          bullets: ["Tag extractions with metadata about the source (document format, language, content domain, demographic context)", "Compare per-field accuracy rates across segments"],
          example: "Does the model correctly extract names from resumes with non-Western names at the same rate as Western names?"
        }
      },
      "Q&A (RAG)": {
        evaluators: [{
          name: "Answer Relevance",
          docsUrl: "/sdk-api/evals/answer-relevance",
          measures: "Assesses whether the response accurately addresses the user's question.",
          value: "Intent Matching: Especially critical for open-ended questions — confirms the response addresses the spirit of the question, not just surface-level keywords."
        }, {
          name: "RAG Faithfulness",
          docsUrl: "/sdk-api/evals/rag-faithfulness",
          measures: "Evaluates whether the answer is grounded in the retrieved context without hallucination.",
          value: "Factual Accuracy: For closed-ended questions, ensures the model said the 'right thing' based on retrieved documents. For open-ended questions, confirms claims are supported by context."
        }, {
          name: "Context Relevance",
          docsUrl: "/sdk-api/evals/context-relevance",
          measures: "Measures whether the retrieved documents are actually relevant to the question.",
          value: "Retrieval Quality: Identifies when the retrieval subsystem is returning poor-quality or off-topic context."
        }, {
          name: "Conciseness",
          docsUrl: "/sdk-api/evals/conciseness",
          measures: "Evaluates whether answers are appropriately brief.",
          value: "User Experience: Prevents over-explaining simple questions or burying key information in verbose responses."
        }],
        biasStrategy: {
          bullets: ["Tag questions with metadata about topic category, user demographics (if available), or question complexity", "Compare Answer Relevance and RAG Faithfulness scores across segments", "Monitor response token count and sentiment across different question types"],
          example: "Do questions about minority health issues receive lower Answer Relevance scores or shorter responses than general health questions?"
        },
        workedExample: {
          title: "Detecting gender bias in a healthcare Q&A system",
          scenario: "Ensure a healthcare Q&A system provides equally helpful answers regardless of the topic's demographic context.",
          evaluatorsApplied: ["Answer Relevance", "RAG Faithfulness"],
          tags: ["women's health", "men's health", "general health", "pediatric care"],
          beforeTable: {
            caption: "Before remediation",
            headers: ["Segment", "Answer Relevance", "RAG Faithfulness", "Response length (tokens)", "Sample size"],
            rows: [["Women's health", "0.78", "0.82", "145", "1,247"], ["Men's health", "0.91", "0.89", "203", "1,156"], ["General health", "0.88", "0.87", "195", "3,892"], ["Pediatric care", "0.85", "0.86", "188", "891"]]
          },
          findings: ["14% lower Answer Relevance (0.78 vs 0.91)", "8% lower RAG Faithfulness (0.82 vs 0.89)", "29% shorter responses (145 vs 203 tokens)"],
          rootCause: ["Knowledge base has 40% fewer articles on women's health topics", "Model often retrieves general health content instead of women's-health-specific sources", "Context Relevance is materially lower for women's health (0.72 vs 0.85)"],
          remediation: [{
            when: "Immediate",
            action: "Adjust retrieval parameters to prioritize topic-specific matches"
          }, {
            when: "Short-term",
            action: "Expand knowledge base with high-quality women's health content"
          }, {
            when: "Medium-term",
            action: "Fine-tune retrieval model on balanced health topic dataset"
          }, {
            when: "Ongoing",
            action: "Monitor disparity metrics weekly to ensure improvement"
          }],
          afterTable: {
            caption: "After remediation",
            headers: ["Segment", "Answer Relevance", "RAG Faithfulness", "Response length (tokens)"],
            rows: [["Women's health", "0.86 ↑", "0.87 ↑", "192 ↑"], ["Men's health", "0.91", "0.89", "203"]]
          },
          result: "Gap reduced from 14% to 5% for Answer Relevance"
        }
      },
      "Chatbot": {
        evaluators: [{
          name: "Answer Relevance",
          docsUrl: "/sdk-api/evals/answer-relevance",
          measures: "Assesses whether responses stay on-topic throughout the conversation.",
          value: "Conversational Coherence: Prevents the chatbot from drifting off-topic or ignoring user intent."
        }, {
          name: "RAG Faithfulness",
          docsUrl: "/sdk-api/evals/rag-faithfulness",
          measures: "Ensures responses are grounded in retrieved knowledge (if using RAG).",
          value: "Trust Building: Users trust chatbots that cite or reference real information rather than making things up."
        }, {
          name: "Sentiment Analysis",
          docsUrl: "/sdk-api/evals/sentiment",
          measures: "Tracks the emotional tone of chatbot responses.",
          value: "Tone Management: Ensures the bot maintains an appropriate, helpful tone even when users are frustrated."
        }],
        note: "Apply the same Context Relevance and Conciseness evaluators recommended for Q&A systems if your chatbot uses retrieval.",
        biasStrategy: {
          bullets: ["Tag conversations with user demographics, conversation topic, or user satisfaction ratings", "Compare Answer Relevance, RAG Faithfulness, and Sentiment Analysis scores across segments", "Monitor average response length and response time across different user groups"],
          example: "Do users from certain demographic groups receive responses with consistently more negative sentiment or fewer tokens?"
        }
      },
      "Classification": {
        evaluators: [],
        strategyNote: "For GenAI classification tasks, traditional ML performance metrics (precision, recall, F1) are highly effective — treat it the same way as a predictive ML task.",
        biasStrategy: {
          bullets: ["Tag classifications with metadata about input characteristics (writing style, topic, demographic context)", "Compare precision, recall, and F1 scores across segments in Fiddler"],
          example: "Does sentiment classification achieve 85% accuracy for product reviews written by younger users but only 70% for older users?"
        }
      },
      "Autonomous Agents": {
        evaluators: [{
          name: "Tool Call Accuracy",
          aka: "Tool Call Accuracy LLM Judge",
          cookbookOnly: true,
          measures: "Evaluates whether the agent selected and invoked the correct tool with proper parameters.",
          value: "Execution Reliability: Prevents the agent from calling the wrong APIs or passing invalid arguments."
        }, {
          name: "Context Relevance",
          docsUrl: "/sdk-api/evals/context-relevance",
          measures: "Assesses whether the agent retrieved or used appropriate information before taking action.",
          value: "Decision Quality: Ensures the agent's actions are informed by relevant context."
        }],
        biasStrategy: {
          bullets: ["Tag agent interactions with task type, user demographics, or workflow complexity", "Compare Context Relevance and Tool Call Accuracy across segments", "Monitor number of tool calls and task completion time across different user groups"],
          example: "Does the agent require more steps to complete identical tasks for certain user populations?"
        }
      },
      "Code Generation": {
        evaluators: [{
          name: "LLM (Custom Judge)",
          cookbookOnly: true,
          measures: "Evaluates code quality, correctness, security, and adherence to coding standards.",
          value: "Code Review Automation: Acts as a first-pass reviewer to catch obvious errors, security vulnerabilities, or style violations."
        }],
        biasStrategy: {
          bullets: ["Tag code generation requests with programming language, user experience level, or problem domain", "Compare correctness and security scores across segments", "Monitor generated code token count for similar requests across different segments"],
          example: "Does the model generate less secure code for web development tasks compared to data science tasks?"
        }
      },
      "Content Generation": {
        evaluators: [{
          name: "Answer Relevance",
          docsUrl: "/sdk-api/evals/answer-relevance",
          measures: "Assesses whether the content addresses the original prompt or brief.",
          value: "Instruction Adherence: Ensures the model delivers what was requested, not something tangentially related."
        }, {
          name: "Audience Alignment",
          aka: "LLM (Custom Judge): Audience Alignment",
          cookbookOnly: true,
          measures: "Evaluates whether the content is appropriate for the target audience (tone, complexity, terminology).",
          value: "Targeted Communication: Confirms the content speaks to the intended reader (e.g., executives vs. technical users vs. general public)."
        }, {
          name: "Coherence",
          docsUrl: "/sdk-api/evals/coherence",
          measures: "Assesses logical flow and narrative quality.",
          value: "Readability: Ensures content is easy to follow and professionally written."
        }, {
          name: "Sentiment Analysis",
          docsUrl: "/sdk-api/evals/sentiment",
          measures: "Tracks the emotional tone of generated content.",
          value: "Brand Safety: Prevents overly negative or inappropriate messaging from reaching audiences."
        }],
        biasStrategy: {
          bullets: ["Tag content with target audience demographics, topic category, or content type", "Compare Answer Relevance, Coherence, and audience alignment scores across segments", "Monitor content token count and sentiment scores for similar prompts targeting different audiences"],
          example: "Does content generated for female audiences consistently receive lower Coherence scores or different sentiment patterns than content for male audiences?"
        }
      }
    }
  };
  // Catalog of cookbook documents; exposed to consumers as `cookbookDocs` in the object returned below.
  // Each descriptor carries:
  //   id, name, teaser, description — identifiers and display copy
  //   url         — docs path, or null when there is no standalone page
  //   useCases    — either the string "all" or an array of use-case names
  //   models      — array of model-type labels ("LLM", "Agentic")
  //   embedded    — optional flag; only set on the entry whose url is null
  //   content     — optional detail payload: intro, evaluators[], strategy, workedExample,
  //                 apiSurface[], prerequisites[], completionTimeMin
  // NOTE(review): how these fields are filtered/rendered is not visible in this section — confirm
  // against the component before renaming or removing any key.
  const _COOKBOOK_DOCS = [{
    // No external page (url: null) and flagged `embedded` — presumably rendered from the
    // in-file cookbook data rather than linked out; TODO confirm.
    id: "bias-accuracy",
    name: "Bias & Accuracy Cookbook",
    teaser: "Fairness, accuracy & remediation playbook for GenAI apps.",
    description: "A full reference of Fiddler-recommended evaluators for detecting bias and accuracy issues in GenAI apps. Includes per-use-case evaluator tables, worked customer scenarios (e.g. the healthcare gender-bias deep dive), and remediation playbooks.",
    url: null,
    useCases: "all",
    models: ["LLM", "Agentic"],
    embedded: true
  }, {
    // Linked cookbook that also ships an inline `content` summary.
    id: "rag-eval-fundamentals",
    name: "RAG Evaluation Fundamentals",
    teaser: "Assess quality in RAG apps using faithfulness and relevance.",
    description: "Demonstrates how to assess quality in RAG applications using faithfulness and relevance metrics. The starting point for any RAG observability program.",
    url: "/developers/cookbooks/rag-evaluation-fundamentals",
    useCases: ["Q&A (RAG)", "Chatbot", "Autonomous Agents"],
    models: ["LLM", "Agentic"],
    content: {
      intro: "Quick-start: assess RAG retrieval and generation quality using Fiddler's built-in evaluators via the .score() API. Designed for rapid iteration before scaling to full experiments.",
      evaluators: [{
        name: "RAGFaithfulness",
        measures: "Whether the response is grounded in and supported by the retrieved documents.",
        value: "Detects hallucinations where the LLM generates plausible but unsupported claims.",
        scoring: "Binary — Yes (1.0) / No (0.0)"
      }, {
        name: "AnswerRelevance",
        measures: "How well the response addresses the user's query.",
        value: "Identifies off-topic responses where the LLM answers a different question.",
        scoring: "Ordinal — High (1.0), Medium (0.5), Low (0.0)"
      }],
      strategy: {
        title: "Recommended approach",
        bullets: ["Define representative test cases covering both successful and failing RAG scenarios", "Initialize evaluators with your LLM credentials and preferred Judge model", "Use the .score() method to evaluate individual test cases for fast iteration", "Check both faithfulness (grounding in documents) and relevance (addressing the query)", "Use both signals together — a response can be faithful but irrelevant, or relevant but hallucinated"]
      },
      workedExample: {
        title: "Three test cases — healthy, hallucination, irrelevant",
        scenario: "Score three RAG outputs against the same evaluator set to see how faithfulness and relevance diverge across failure modes.",
        // These names match the `evaluators[].name` values above exactly.
        evaluatorsApplied: ["RAGFaithfulness", "AnswerRelevance"],
        table: {
          // Every row supplies one cell per header, in header order (7 columns).
          headers: ["Scenario", "User query", "Retrieved context", "Model response", "Faithfulness", "Relevance", "Status"],
          rows: [["Perfect match", "What is the capital of France?", "Paris is the capital of France.", "The capital of France is Paris.", "yes", "high", "HEALTHY"], ["Hallucination", "What are the office hours?", "We are closed on weekends.", "We are open 9 AM to 5 PM every day.", "no", "high", "ISSUE DETECTED"], ["Irrelevant", "How do I reset my password?", "To reset, click 'Forgot Password'.", "Our system uses 256-bit encryption.", "yes", "low", "ISSUE DETECTED"]]
        },
        takeaway: "Faithfulness and relevance measure different dimensions — use both. High relevance + low faithfulness = hallucination. High faithfulness + low relevance = off-topic answer."
      },
      apiSurface: ["RAGFaithfulness().score()", "AnswerRelevance().score()"],
      prerequisites: ["Fiddler account with API access", "LLM credential configured in Settings > LLM Gateway", "pip install fiddler-evals pandas"],
      completionTimeMin: 15
    }
  }, {
    // Link-only entry (no `content` payload).
    id: "rag-experiments-scale",
    name: "Running RAG Experiments at Scale",
    teaser: "Compare RAG pipeline setups via datasets & experiments.",
    description: "Shows methods for comparing different RAG pipeline setups using datasets and experiments — useful when iterating on retrieval, chunking, or prompt strategies.",
    url: "/developers/cookbooks/rag-experiments-at-scale",
    useCases: ["Q&A (RAG)", "Chatbot", "Autonomous Agents"],
    models: ["LLM", "Agentic"]
  }, {
    // Link-only entry. Note the id ("rag-hallucination-detection") and the url slug
    // ("hallucination-detection-pipeline") are intentionally independent strings here.
    id: "rag-hallucination-detection",
    name: "Detecting Hallucinations in RAG",
    teaser: "Monitor RAG systems for hallucinations via evaluator rules.",
    description: "Covers monitoring RAG systems for hallucinations via health diagnostics and evaluator rules. Operational playbook for production RAG.",
    url: "/developers/cookbooks/hallucination-detection-pipeline",
    useCases: ["Q&A (RAG)", "Chatbot", "Autonomous Agents"],
    models: ["LLM", "Agentic"]
  }, {
    // Link-only entry that applies to "all" use cases.
    id: "custom-judge-evaluators",
    name: "Building Custom Judge Evaluators",
    teaser: "Domain-specific LLM-Judge criteria, end to end.",
    description: "Explains creating domain-specific evaluation criteria with custom judge implementations. Use when out-of-the-box evaluators don't capture your quality bar.",
    url: "/developers/cookbooks/custom-judge-evaluators",
    useCases: "all",
    models: ["LLM", "Agentic"]
  }, {
    // Link-only entry scoped to the "Agentic" model type only.
    id: "agentic-content-generation",
    name: "Monitoring Agentic Content Generation",
    teaser: "Quality assurance and brand alignment for content systems.",
    description: "Addresses quality assurance and brand alignment in AI-powered content creation systems.",
    url: "/developers/cookbooks/agentic-content-generation",
    useCases: ["Content Generation"],
    models: ["Agentic"]
  }, {
    // Linked cookbook with an inline `content` summary, scoped to the "Agentic" model type only.
    id: "agentic-document-extraction",
    name: "Agentic Document Extraction",
    teaser: "Measurable document extraction with tracing & monitoring.",
    description: "Demonstrates building measurable document extraction pipelines with tracing and monitoring.",
    url: "/developers/cookbooks/agentic-document-extraction",
    useCases: ["Info Extraction"],
    models: ["Agentic"],
    content: {
      intro: "Build reliable, measurable document extraction pipelines with tracing, custom evaluators, and experiments — catch hallucinated fields, schema drift, and silent accuracy degradation before they hit production.",
      evaluators: [{
        name: "Field Accuracy",
        measures: "Fraction of extracted scalar fields matching ground truth.",
        value: "Granular control — pinpoints which fields need prompt tuning vs. model changes.",
        scoring: "Continuous 0.0–1.0 (e.g., 0.83 = 5 of 6 fields matched)"
      }, {
        name: "Schema Completeness",
        measures: "Fraction of required fields present and non-null in extraction output.",
        value: "Detects schema drift when the model silently omits previously extracted fields.",
        scoring: "Continuous 0.0–1.0"
      }, {
        name: "Per-Field Accuracy (Custom Judge)",
        measures: "LLM-based per-field accuracy assessment against source text.",
        value: "Human-like review at scale without manual comparison.",
        scoring: "Per-field boolean + overall (All Correct / Partially Correct / Mostly Incorrect)"
      }, {
        name: "Math Consistency (Custom Judge)",
        measures: "Whether extracted numeric fields satisfy arithmetic constraints (e.g., total = subtotal + tax).",
        value: "Catches hallucinated dollar amounts that pass schema but fail the math.",
        scoring: "Per-field boolean + overall (Fully Consistent / Minor Discrepancy / Major Error)"
      }, {
        name: "Coherence (Built-in)",
        measures: "Logical flow and clarity of extraction output.",
        value: "Catches garbled or malformed responses before downstream processing.",
        scoring: "Continuous 0.0–1.0"
      }, {
        name: "Conciseness (Built-in)",
        measures: "Whether the output is focused and free of extraneous commentary.",
        value: "Ensures structured JSON output, not explanations mixed in.",
        scoring: "Continuous 0.0–1.0"
      }],
      strategy: {
        title: "Recommended approach",
        bullets: ["Instrument extraction pipelines with OpenTelemetry spans (parse → extract → validate) for full traceability", "Apply domain-specific custom evaluators to compare extracted fields against ground truth", "Use Fiddler Experiments to benchmark prompt or model changes against a test dataset before production deployment", "Monitor aggregate production signals (success rate, field completeness, math accuracy) with rolling windows and alerts", "Segment monitoring by document type or vendor to surface type-specific weaknesses for targeted prompt tuning", "Route low-confidence extractions for human review based on evaluator signals — not by reviewing everything"]
      },
      workedExample: {
        title: "Invoice Extraction Benchmarking",
        scenario: "Compare extraction accuracy across prompt versions using a ground-truth invoice dataset.",
        // NOTE(review): "Per-Field Accuracy" and "Math Consistency" omit the "(Custom Judge)" suffix used by
        // the matching `evaluators[].name` values above — confirm the renderer does not match by exact name.
        evaluatorsApplied: ["Field Accuracy", "Schema Completeness", "Per-Field Accuracy", "Math Consistency"],
        table: {
          // Every row supplies one cell per header, in header order (4 columns).
          headers: ["Field", "Source value", "Extracted", "Status"],
          rows: [["Vendor", "Acme Corp", "Acme Corp", "✓"], ["Invoice #", "12345", "12345", "✓"], ["Date", "2025-01-15", "2025-01-15", "✓ (fails on unusual formats)"], ["Subtotal", "$100.00", "$100.00", "✓"], ["Tax", "$8.00", "$8.00", "✓"], ["Total", "$108.00", "$108.00", "✓ math consistent"]]
        },
        takeaway: "Across 8 test invoices, 5 scored 100% field accuracy. Drilling into the lowest-scoring case revealed unusual date formatting as the weakness — guiding targeted prompt improvements."
      },
      apiSurface: ["Project.get_or_create() / Application.get_or_create()", "Dataset.create() / dataset.insert(NewDatasetItem)", "evaluate(dataset, task, evaluators, score_fn_kwargs_mapping)", "CustomJudge(prompt_template, output_fields, model, credential)", "EvalFn(function, score_name)", "Evaluator subclass with score() method"],
      prerequisites: ["Fiddler account with API access", "LLM credential configured in Settings > LLM Gateway", "pip install fiddler-evals pandas"],
      completionTimeMin: 30
    }
  }];
  return {
    metrics: _METRICS.map(m => ({
      ...m,
      implementationType: implementationTypeOf(m.evaluatorType),
      providerOptions: providerOptionsOf(m.provider)
    })),
    cookbook: _COOKBOOK,
    cookbookDocs: _COOKBOOK_DOCS,
    objectives: _OBJECTIVES,
    useCases: _AGENTIC_USE_CASES,
    useCaseDescriptions: _USE_CASE_DESCRIPTIONS,
    mlTasks: _ML_TASKS,
    modelTypes: Object.keys(_MODEL_TYPE_DISPLAY),
    modelTypeDisplay: _MODEL_TYPE_DISPLAY,
    mlMetrics: _ML_METRICS,
    mlCrosscut: _ML_CROSSCUT,
    implementations: _IMPLEMENTATIONS,
    hostings: _HOSTINGS,
    evaluatorTypeTips: _EVALUATOR_TYPE_TIPS,
    providerTips: _PROVIDER_TIPS
  };
})();

## How to use this guide

Use this guide to choose the right Fiddler evaluators and metrics for what you're building. Start by picking your **observability type** — a Gen AI application, a single LLM, a predictive (ML) model, or real-time Guardrails — then select a use case to see which metrics Fiddler rates **Recommended**, **Consider**, or **Optional** for that scenario.

Open any metric card for its per-use-case rationale, evaluator type and model provider, and a link to the documentation. Use **Show more filters** to narrow by evaluator type, model provider, or observability objective, or search by name to jump straight to a specific metric.

Below the guide, **Cookbooks & Resources** pairs proven, copy-pasteable recipes with your selection — worked examples for your chosen use case — and **Use Your Own Agent** exports your current filters as context for Claude or any agent.

<EvaluatorsGuide data={EVAL_GUIDE_DATA} />
