Tutorial 26: Evaluation Framework — Systematic Agent Quality Testing¶
This tutorial covers:
- EvalCase: defining test cases with expected behaviors
- EvalRunner: running agents against test suites
- EvalReport: analyzing results and scoring
Prerequisites:
- Configure model via environment variables
Difficulty: Intermediate
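The source below imports a get_model() helper from a local config module that reads those environment variables; the module itself is not shown in this tutorial. The sketch below is a hypothetical stand-in only: the locus.models.OpenAIModel import path, class name, and environment variable names are all assumptions, not confirmed locus API.

# config.py: hypothetical sketch only; the import path, class name, and
# environment variable names are assumptions, not confirmed locus API.
import os

def get_model():
    """Build a model client from environment variables."""
    from locus.models import OpenAIModel  # assumed import path
    return OpenAIModel(
        model=os.environ.get("MODEL_NAME", "gpt-4o-mini"),
        api_key=os.environ["OPENAI_API_KEY"],
    )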
Source¶
# Copyright (c) 2025, 2026 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v1.0 as shown at
# https://oss.oracle.com/licenses/upl/
"""
Tutorial 26: Evaluation Framework — Systematic Agent Quality Testing
This tutorial covers:
- EvalCase: defining test cases with expected behaviors
- EvalRunner: running agents against test suites
- EvalReport: analyzing results and scoring
Prerequisites:
- Configure model via environment variables
Difficulty: Intermediate
"""
from config import get_model
from locus.agent import Agent, AgentConfig
from locus.evaluation import EvalCase, EvalRunner
# =============================================================================
# Part 1: Define evaluation cases
# =============================================================================
def example_evaluation():
    """Run a systematic evaluation of an agent."""
    print("=== Agent Evaluation ===\n")

    # Build the agent under test
    model = get_model()
    agent = Agent(
        config=AgentConfig(
            system_prompt="You are a helpful assistant. Answer concisely.",
            max_iterations=3,
            model=model,
        )
    )
    # Define test cases
    cases = [
        # Simple factual recall
        EvalCase(
            name="basic_knowledge",
            prompt="What is the capital of France?",
            expected_output_contains=["paris"],
            max_iterations=3,
        ),
        # Basic arithmetic
        EvalCase(
            name="math",
            prompt="What is 15 * 7?",
            expected_output_contains=["105"],
        ),
        # The answer must not mention incorrect capitals
        EvalCase(
            name="no_hallucination",
            prompt="What is the capital of France?",
            expected_output_not_contains=["berlin", "london"],
        ),
    ]
    # Run evaluation
    runner = EvalRunner(agent=agent)
    report = runner.run(cases)

    # Print results
    print(report.summary())
    print(f"\nTotal: {report.total_cases}, Passed: {report.passed}, Failed: {report.failed}")
    print(f"Average score: {report.avg_score:.2f}")
if __name__ == "__main__":
    example_evaluation()
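Beyond the aggregate summary, you will often want per-case detail, for example to see which test regressed. The sketch below assumes EvalReport exposes a results collection whose items carry name, passed, and score fields; those names are assumptions about the report object, not confirmed locus API.

# Hypothetical per-case breakdown; `report.results` and the `name`,
# `passed`, and `score` fields are assumptions, not confirmed locus API.
for result in report.results:
    status = "PASS" if result.passed else "FAIL"
    print(f"[{status}] {result.name}: score={result.score:.2f}")

In CI, the counters the report does expose are enough to gate a build: exit non-zero when any case failed so a regression fails the pipeline.

import sys
sys.exit(0 if report.failed == 0 else 1)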