Guardrails and Security¶
Block dangerous calls before the model sees them. Each part wires a guardrail into a real agent run and prints the model round-trip cost, so the safety policy is exercised live, not described in the abstract.
- `GuardrailsHook` with a typed `GuardrailConfig` (block list, length caps, default action).
- PII detection and redaction on user input.
- Content pattern blocking (SQL injection, path traversal, shell escapes).
- Tool allowlist vs denylist.
- Stacked hooks via `HookRegistry` plus a separate `ContentFilterHook`.
Run it (OCI Generative AI is the default; auto-detected from ~/.oci/config):
python examples/notebook_52_guardrails_security.py
Offline / no credentials:
LOCUS_MODEL_PROVIDER=mock python examples/notebook_52_guardrails_security.py
Pin a specific OCI model:
LOCUS_MODEL_ID=openai.gpt-4.1 python examples/notebook_52_guardrails_security.py
Source¶
# Copyright (c) 2025, 2026 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v1.0 as shown at
# https://oss.oracle.com/licenses/upl/
"""Notebook 47: Guardrails and security — block dangerous calls before the model sees them.
Each part wires a guardrail into a real agent run and prints the model
round-trip cost, so the safety policy is exercised live, not described
in the abstract.
- GuardrailsHook with a typed GuardrailConfig (block list, length caps,
default action).
- PII detection and redaction on user input.
- Content pattern blocking (SQL injection, path traversal, shell escapes).
- Tool allowlist vs denylist.
- Stacked hooks via HookRegistry plus a separate ContentFilterHook.
Run it
# Default: OCI Generative AI auto-detected from ~/.oci/config
python examples/notebook_52_guardrails_security.py
# Offline / no credentials:
LOCUS_MODEL_PROVIDER=mock python examples/notebook_52_guardrails_security.py
# Pin a specific OCI model:
LOCUS_MODEL_ID=openai.gpt-4.1 python examples/notebook_52_guardrails_security.py
"""
import asyncio
import time
from config import get_model, print_config
from locus.agent import Agent
from locus.core.events import BeforeToolCallEvent
from locus.core.state import AgentState
from locus.hooks import HookRegistry
from locus.hooks.builtin.guardrails import (
ContentFilterHook,
GuardrailAction,
GuardrailConfig,
GuardrailsHook,
GuardrailViolation,
)
# Helper used by every Part: one model call with a timing/token banner so
# you can see the guardrail running against a real round-trip.
def _llm_call(
    prompt: str,
    *,
    system: str = "Reply in one short sentence.",
    max_tokens: int = 100,
    hooks: list | None = None,
) -> str:
    """Send *prompt* through a freshly built ``Agent`` and return the reply.

    A one-line banner is printed with the wall-clock duration of the call
    and the prompt→completion token counts read from the result's metrics.

    Args:
        prompt: Text handed to ``Agent.run_sync``.
        system: System prompt given to the agent.
        max_tokens: Forwarded to ``get_model``.
        hooks: Passed straight to the ``Agent`` constructor (``None`` included).

    Returns:
        The result's ``message`` with surrounding whitespace stripped.
    """
    model = get_model(max_tokens=max_tokens)
    runner = Agent(model=model, system_prompt=system, hooks=hooks)

    # Time only the model round-trip, not the agent construction above.
    started = time.perf_counter()
    outcome = runner.run_sync(prompt)
    elapsed = time.perf_counter() - started

    banner = (
        f" [model call: {elapsed:.2f}s · "
        f"{outcome.metrics.prompt_tokens}→{outcome.metrics.completion_tokens} tokens]"
    )
    print(banner)
    return outcome.message.strip()
async def main():
print("=" * 60)
print("Notebook 47: Guardrails and security")
print("=" * 60)
print()
print_config()
# Part 1: declare a GuardrailConfig, then ask the model to summarise it.
print("\n=== Part 1: Basic guardrail configuration ===\n")
config = GuardrailConfig(
block_dangerous_tools=frozenset(
{"eval", "exec", "system", "shell", "rm", "delete", "drop", "truncate"}
),
max_prompt_length=100000,
max_tool_result_length=50000,
default_action=GuardrailAction.BLOCK,
)
print(f" block_dangerous_tools: {sorted(config.block_dangerous_tools)[:5]}…")
print(f" max_prompt_length: {config.max_prompt_length:,}")
print(f" default_action: {config.default_action.value}")
summary = _llm_call(
"In one sentence, summarise what a security policy that blocks "
"{eval, exec, system, shell, rm, delete, drop, truncate} protects "
"an LLM agent against.",
max_tokens=80,
)
print(f"AI policy summary: {summary}")
# Part 2: wire the config into a GuardrailsHook and run an agent through it.
print("\n=== Part 2: GuardrailsHook on a live agent ===\n")
violations_log: list[GuardrailViolation] = []
def on_violation(v: GuardrailViolation):
violations_log.append(v)
print(f" VIOLATION: {v.rule_name} - {v.description}")
guardrails = GuardrailsHook(config=config, on_violation=on_violation)
print(f" Hook: {guardrails.name}, priority={guardrails.priority}")
answer = _llm_call(
"What's a sensible default password policy length?",
system="Reply in one short sentence.",
hooks=[guardrails],
)
print(f"Guarded answer: {answer}")
# Part 3: PII detection. The hook scans each input against the
# configured patterns before the model ever sees it.
print("\n=== Part 3: PII detection ===\n")
print("Built-in PII patterns:")
for name in list(config.pii_patterns)[:5]:
print(f" - {name}")
test_inputs = [
"Contact me at john@example.com for details",
"Call 555-123-4567 for support",
"SSN: 123-45-6789",
"No sensitive data here",
]
state = AgentState(agent_id="test")
print("\nSDK-side PII detection:")
for text in test_inputs:
guardrails.clear_violations()
try:
await guardrails.on_before_invocation(text, state)
seen = guardrails.violations
label = ", ".join(v.rule_name for v in seen) if seen else "Clean"
print(f" '{text[:40]}…' -> {label}")
except ValueError as e:
print(f" '{text[:40]}…' -> BLOCKED: {e}")
pii_advice = _llm_call(
"Give one concrete piece of advice for an SRE on what to do when an "
"LLM application logs PII like emails or SSNs.",
max_tokens=80,
)
print(f"AI advice: {pii_advice}")
# Part 4: block known-malicious input shapes (SQL injection, path
# traversal, shell escapes) by pattern.
print("\n=== Part 4: Content pattern blocking ===\n")
dangerous_inputs = [
"DROP TABLE users;",
"../../etc/passwd",
"ls -la; rm -rf /",
"Normal query SELECT * FROM users",
]
for text in dangerous_inputs:
guardrails.clear_violations()
try:
await guardrails.on_before_invocation(text, state)
print(f" '{text[:40]}…' -> Allowed")
except ValueError:
print(f" '{text[:40]}…' -> BLOCKED")
risk_summary = _llm_call(
"List the top three classes of malicious input an LLM service should "
"filter at the gateway. Three short bullets.",
max_tokens=120,
)
print(f"AI risk summary:\n{risk_summary}")
# Part 5: tool denylist. block_dangerous_tools rejects calls before
# they reach the tool runner.
print("\n=== Part 5: Tool restrictions ===\n")
tool_tests = [
("read_file", {"path": "/app/data.txt"}),
("exec", {"code": "print('hello')"}),
("shell", {"command": "ls"}),
("search", {"query": "test"}),
]
for name, args in tool_tests:
guardrails.clear_violations()
try:
await guardrails.on_before_tool_call(
BeforeToolCallEvent(tool_name=name, arguments=args)
)
print(f" {name} -> Allowed")
except ValueError:
print(f" {name} -> BLOCKED")
rationale = _llm_call(
"Why is it dangerous to expose `exec` or `shell` tools to an LLM agent?",
max_tokens=80,
)
print(f"AI rationale: {rationale}")
# Part 6: allowlist mode — safer default for production because new
# tools added later need explicit listing.
print("\n=== Part 6: Tool allowlist mode ===\n")
allowlist_config = GuardrailConfig(
allow_only_tools=frozenset({"read_file", "search", "analyze"})
)
allowlist_guardrails = GuardrailsHook(config=allowlist_config)
for name in ["read_file", "write_file", "search", "delete"]:
try:
await allowlist_guardrails.on_before_tool_call(
BeforeToolCallEvent(tool_name=name, arguments={})
)
print(f" {name} -> Allowed")
except ValueError:
print(f" {name} -> BLOCKED")
contrast = _llm_call(
"In one sentence, compare allowlist vs denylist for tool access in an "
"LLM agent — which is safer and why?",
max_tokens=80,
)
print(f"AI contrast: {contrast}")
# Part 7: per-rule actions. REDACT replaces the match in-place,
# WARN logs but allows, BLOCK rejects the call.
print("\n=== Part 7: Action types ===\n")
for action in GuardrailAction:
print(f" {action.value}")
custom_config = GuardrailConfig(
default_action=GuardrailAction.BLOCK,
action_overrides={
"pii_email": GuardrailAction.REDACT,
"pii_phone_us": GuardrailAction.WARN,
"blocked_sql_injection": GuardrailAction.BLOCK,
},
)
print("\naction_overrides:")
for rule, act in custom_config.action_overrides.items():
print(f" {rule} -> {act.value}")
explainer = _llm_call(
"Briefly explain when an LLM service should REDACT vs BLOCK vs WARN "
"on policy violations. One sentence per action.",
max_tokens=140,
)
print(f"AI explainer:\n{explainer}")
# Part 8: a second hook type — ContentFilterHook scans plain text
# for blocked words and credential patterns.
print("\n=== Part 8: ContentFilterHook on a live agent ===\n")
content_filter = ContentFilterHook(
blocked_words=["password", "secret", "api_key"],
blocked_patterns=[r"sk-[a-zA-Z0-9]+", r"ghp_[a-zA-Z0-9]+"],
max_input_length=10000,
case_sensitive=False,
)
benign = _llm_call(
"Suggest one good practice for handling developer credentials in CI.",
hooks=[content_filter],
)
print(f"Filtered answer: {benign}")
try:
_llm_call("What's my password?", hooks=[content_filter])
except Exception as e: # noqa: BLE001
print(f" (filter blocked the input as expected: {type(e).__name__})")
# Part 9: stack multiple hooks. HookRegistry runs them in priority
# order; the first BLOCK wins.
print("\n=== Part 9: Stacking guardrail hooks ===\n")
registry = HookRegistry()
registry.add_provider(
GuardrailsHook(config=GuardrailConfig(block_dangerous_tools=frozenset({"exec", "eval"})))
)
registry.add_provider(ContentFilterHook(blocked_words=["forbidden"]))
print("Registered hook providers:")
for prov in registry.providers:
print(f" - {prov.name} (priority={prov.priority})")
stacked = _llm_call(
"Name two security risks of giving an LLM agent unrestricted shell "
"access. One bullet each.",
hooks=[
GuardrailsHook(
config=GuardrailConfig(block_dangerous_tools=frozenset({"exec", "eval"}))
),
ContentFilterHook(blocked_words=["forbidden"]),
],
)
print(f"Stacked-hooks answer: {stacked}")
# Part 10: prod vs dev policy presets. Dev is permissive (WARN);
# prod blocks irreversible operations and redacts PII.
print("\n=== Part 10: Custom security policies ===\n")
def production_config() -> GuardrailConfig:
return GuardrailConfig(
block_dangerous_tools=frozenset(
{"exec", "eval", "system", "shell", "delete", "drop", "truncate", "rm", "sudo"}
),
max_prompt_length=50000,
max_tool_result_length=25000,
default_action=GuardrailAction.BLOCK,
action_overrides={
"pii_email": GuardrailAction.REDACT,
"pii_ssn": GuardrailAction.BLOCK,
"pii_credit_card": GuardrailAction.BLOCK,
},
)
def development_config() -> GuardrailConfig:
return GuardrailConfig(
block_dangerous_tools=frozenset({"exec", "eval"}),
max_prompt_length=200000,
max_tool_result_length=100000,
default_action=GuardrailAction.WARN,
)
prod = production_config()
dev = development_config()
print(
f"prod blocks {len(prod.block_dangerous_tools)} tools, "
f"dev blocks {len(dev.block_dangerous_tools)}; "
f"prod default={prod.default_action.value}, dev default={dev.default_action.value}"
)
suggestion = _llm_call(
"List one extra guardrail rule a fintech company should add on top of "
"blocking shell tools. One short sentence.",
max_tokens=80,
)
print(f"AI suggestion: {suggestion}")
# Part 11: ask the model to write a guardrail cheat sheet.
print("\n=== Part 11: Best practices ===\n")
best = _llm_call(
"Write a six-line cheat sheet of best practices for guarding LLM "
"agents in production. Six bullets, terse.",
max_tokens=240,
)
print(best)
# Part 12: an end-to-end Agent run with a guardrail attached.
print("\n=== Part 12: Live Agent + Guardrails ===\n")
safe_guardrails = GuardrailsHook(
config=GuardrailConfig(
block_dangerous_tools=frozenset({"exec", "eval", "shell"}),
default_action=GuardrailAction.WARN,
),
)
safe_agent = Agent(
model=get_model(max_tokens=200),
system_prompt=(
"You are a friendly assistant. Refuse to share secrets or "
"anything the guardrails would block."
),
hooks=[safe_guardrails],
)
t0 = time.perf_counter()
safe_result = safe_agent.run_sync("How can I improve the security posture of a small SaaS app?")
dt = time.perf_counter() - t0
print(
f" [model call: {dt:.2f}s · "
f"{safe_result.metrics.prompt_tokens}→{safe_result.metrics.completion_tokens} tokens]"
)
print(f"Guarded answer: {safe_result.message[:300]}")
print(f"\nTotal violations logged in this notebook: {len(violations_log)}")
print("\n" + "=" * 60)
# Script entry point: drive the async demo to completion when run directly
# (importing this module does not start it).
if __name__ == "__main__":
    asyncio.run(main())