fix: move pre_llm_call plugin context to user message, preserve prompt cache (#5146)

Plugin context from pre_llm_call hooks was injected into the system
prompt, breaking the prompt cache prefix every turn when content
changed (typical for memory plugins). Now all plugin context goes
into the current turn's user message — the system prompt stays
identical across turns, preserving cached tokens.

The system prompt is reserved for Hermes internals. Plugins
contribute context alongside the user's input.

Also adds comprehensive documentation for all 6 plugin hooks:
pre_tool_call, post_tool_call, pre_llm_call, post_llm_call,
on_session_start, on_session_end — each with full callback
signatures, parameter tables, firing conditions, and examples.

Supersedes #5138 which identified the same cache-busting bug
and proposed an uncached system suffix approach. This fix goes
further by removing system prompt injection entirely.

Co-identified-by: OutThisLife (PR #5138)
This commit is contained in:
Teknium 2026-04-04 16:55:44 -07:00 committed by GitHub
parent 96e96a79ad
commit 5879b3ef82
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 653 additions and 57 deletions

View file

@ -441,8 +441,18 @@ class PluginManager:
plugin cannot break the core agent loop.
Returns a list of non-``None`` return values from callbacks.
This allows hooks like ``pre_llm_call`` to contribute context
that the agent core can collect and inject.
For ``pre_llm_call``, callbacks may return a dict describing
context to inject into the current turn's user message::
{"context": "recalled text..."}
"recalled text..." # plain string, equivalent
Context is ALWAYS injected into the user message, never the
system prompt. This preserves the prompt cache prefix — the
system prompt stays identical across turns so cached tokens
are reused. All injected context is ephemeral — never
persisted to session DB.
"""
callbacks = self._hooks.get(hook_name, [])
results: List[Any] = []

View file

@ -6648,10 +6648,17 @@ class AIAgent:
# Plugin hook: pre_llm_call
# Fired once per turn before the tool-calling loop. Plugins can
# return a dict with a ``context`` key whose value is a string
# that will be appended to the ephemeral system prompt for every
# API call in this turn (not persisted to session DB or cache).
_plugin_turn_context = ""
# return a dict with a ``context`` key (or a plain string) whose
# value is appended to the current turn's user message.
#
# Context is ALWAYS injected into the user message, never the
# system prompt. This preserves the prompt cache prefix — the
# system prompt stays identical across turns so cached tokens
# are reused. The system prompt is Hermes's territory; plugins
# contribute context alongside the user's input.
#
# All injected context is ephemeral (not persisted to session DB).
_plugin_user_context = ""
try:
from hermes_cli.plugins import invoke_hook as _invoke_hook
_pre_results = _invoke_hook(
@ -6663,14 +6670,14 @@ class AIAgent:
model=self.model,
platform=getattr(self, "platform", None) or "",
)
_ctx_parts = []
_ctx_parts: list[str] = []
for r in _pre_results:
if isinstance(r, dict) and r.get("context"):
_ctx_parts.append(str(r["context"]))
elif isinstance(r, str) and r.strip():
_ctx_parts.append(r)
if _ctx_parts:
_plugin_turn_context = "\n\n".join(_ctx_parts)
_plugin_user_context = "\n\n".join(_ctx_parts)
except Exception as exc:
logger.warning("pre_llm_call hook failed: %s", exc)
@ -6758,11 +6765,21 @@ class AIAgent:
for idx, msg in enumerate(messages):
api_msg = msg.copy()
# External memory provider prefetch: inject cached recalled context
if idx == current_turn_user_idx and msg.get("role") == "user" and _ext_prefetch_cache:
_base = api_msg.get("content", "")
if isinstance(_base, str):
api_msg["content"] = _base + "\n\n" + _ext_prefetch_cache
# Inject ephemeral context into the current turn's user message.
# Sources: memory manager prefetch + plugin pre_llm_call hooks
# with target="user_message" (the default). Both are
# API-call-time only — the original message in `messages` is
# never mutated, so nothing leaks into session persistence.
if idx == current_turn_user_idx and msg.get("role") == "user":
_injections = []
if _ext_prefetch_cache:
_injections.append(_ext_prefetch_cache)
if _plugin_user_context:
_injections.append(_plugin_user_context)
if _injections:
_base = api_msg.get("content", "")
if isinstance(_base, str):
api_msg["content"] = _base + "\n\n" + "\n\n".join(_injections)
# For ALL assistant messages, pass reasoning back to the API
# This ensures multi-turn reasoning context is preserved
@ -6796,9 +6813,10 @@ class AIAgent:
effective_system = active_system_prompt or ""
if self.ephemeral_system_prompt:
effective_system = (effective_system + "\n\n" + self.ephemeral_system_prompt).strip()
# Plugin context from pre_llm_call hooks — ephemeral, not cached.
if _plugin_turn_context:
effective_system = (effective_system + "\n\n" + _plugin_turn_context).strip()
# NOTE: Plugin context from pre_llm_call hooks is injected into the
# user message (see injection block above), NOT the system prompt.
# This is intentional — system prompt modifications break the prompt
# cache prefix. The system prompt is reserved for Hermes internals.
if effective_system:
api_messages = [{"role": "system", "content": effective_system}] + api_messages

View file

@ -403,6 +403,131 @@ class TestPluginManagerList:
class TestPreLlmCallTargetRouting:
    """Tests for pre_llm_call hook return format with target-aware routing.

    The routing logic lives in run_agent.py, but the return format is collected
    by invoke_hook(). These tests verify the return format works correctly and
    that downstream code can route based on the 'target' key.
    """

    def _make_pre_llm_plugin(self, plugins_dir, name, return_expr):
        """Create a plugin whose pre_llm_call hook returns ``return_expr``."""
        _make_plugin_dir(
            plugins_dir,
            name,
            register_body=(
                f'ctx.register_hook("pre_llm_call", lambda **kw: {return_expr})'
            ),
        )

    @staticmethod
    def _load_and_invoke(tmp_path, monkeypatch):
        """Point HERMES_HOME at the temp tree, load plugins, fire the hook."""
        monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes_test"))
        manager = PluginManager()
        manager.discover_and_load()
        return manager.invoke_hook(
            "pre_llm_call",
            session_id="s1",
            user_message="hi",
            conversation_history=[],
            is_first_turn=True,
            model="test",
        )

    def test_context_dict_returned(self, tmp_path, monkeypatch):
        """Plugin returning a context dict is collected by invoke_hook."""
        plugins_dir = tmp_path / "hermes_test" / "plugins"
        self._make_pre_llm_plugin(
            plugins_dir,
            "basic_plugin",
            '{"context": "basic context"}',
        )
        results = self._load_and_invoke(tmp_path, monkeypatch)
        assert len(results) == 1
        assert results[0]["context"] == "basic context"
        assert "target" not in results[0]

    def test_plain_string_return(self, tmp_path, monkeypatch):
        """Plain string returns are collected as-is (routing treats them as user_message)."""
        plugins_dir = tmp_path / "hermes_test" / "plugins"
        self._make_pre_llm_plugin(
            plugins_dir,
            "str_plugin",
            '"plain string context"',
        )
        results = self._load_and_invoke(tmp_path, monkeypatch)
        assert len(results) == 1
        assert results[0] == "plain string context"

    def test_multiple_plugins_context_collected(self, tmp_path, monkeypatch):
        """Multiple plugins returning context are all collected."""
        plugins_dir = tmp_path / "hermes_test" / "plugins"
        self._make_pre_llm_plugin(
            plugins_dir,
            "aaa_memory",
            '{"context": "memory context"}',
        )
        self._make_pre_llm_plugin(
            plugins_dir,
            "bbb_guardrail",
            '{"context": "guardrail text"}',
        )
        results = self._load_and_invoke(tmp_path, monkeypatch)
        assert len(results) == 2
        collected = [r["context"] for r in results]
        assert "memory context" in collected
        assert "guardrail text" in collected

    def test_routing_logic_all_to_user_message(self, tmp_path, monkeypatch):
        """Simulate the routing logic from run_agent.py.

        All plugin context dicts and plain strings end up in a single
        user message context string. There is no system_prompt target.
        """
        plugins_dir = tmp_path / "hermes_test" / "plugins"
        self._make_pre_llm_plugin(
            plugins_dir,
            "aaa_mem",
            '{"context": "memory A"}',
        )
        self._make_pre_llm_plugin(
            plugins_dir,
            "bbb_guard",
            '{"context": "rule B"}',
        )
        self._make_pre_llm_plugin(
            plugins_dir,
            "ccc_plain",
            '"plain text C"',
        )
        results = self._load_and_invoke(tmp_path, monkeypatch)
        # Replicate run_agent.py routing logic — everything goes to user msg
        parts = []
        for item in results:
            if isinstance(item, dict) and item.get("context"):
                parts.append(str(item["context"]))
            elif isinstance(item, str) and item.strip():
                parts.append(item)
        assert parts == ["memory A", "rule B", "plain text C"]
        joined_user_context = "\n\n".join(parts)
        assert "memory A" in joined_user_context
        assert "rule B" in joined_user_context
        assert "plain text C" in joined_user_context
# NOTE: TestPluginCommands removed — register_command() was never implemented
# in PluginContext (hermes_cli/plugins.py). The tests referenced _plugin_commands,
# commands_registered, get_plugin_command_handler, and GATEWAY_KNOWN_COMMANDS.

View file

@ -362,24 +362,124 @@ ctx.register_tool(
def register(ctx):
ctx.register_hook("pre_tool_call", before_any_tool)
ctx.register_hook("post_tool_call", after_any_tool)
ctx.register_hook("pre_llm_call", inject_memory)
ctx.register_hook("on_session_start", on_new_session)
ctx.register_hook("on_session_end", on_session_end)
```
Available hooks:
### Hook reference
| Hook | When | Arguments | Return |
|------|------|-----------|--------|
| `pre_tool_call` | Before any tool runs | `tool_name`, `args`, `task_id` | — |
| `post_tool_call` | After any tool returns | `tool_name`, `args`, `result`, `task_id` | — |
| `pre_llm_call` | Once per turn, before the LLM loop | `session_id`, `user_message`, `conversation_history`, `is_first_turn`, `model`, `platform` | `{"context": "..."}` |
| `post_llm_call` | Once per turn, after the LLM loop | `session_id`, `user_message`, `assistant_response`, `conversation_history`, `model`, `platform` | — |
| `on_session_start` | New session created (first turn only) | `session_id`, `model`, `platform` | — |
| `on_session_end` | End of every `run_conversation` call | `session_id`, `completed`, `interrupted`, `model`, `platform` | — |
Each hook is documented in full on the **[Event Hooks reference](/docs/user-guide/features/hooks#plugin-hooks)** — callback signatures, parameter tables, exactly when each fires, and examples. Here's the summary:
Most hooks are fire-and-forget observers. The exception is `pre_llm_call`: if a callback returns a dict with a `"context"` key (or a plain string), the value is appended to the ephemeral system prompt for the current turn. This allows memory plugins to inject recalled context without touching core code.
| Hook | Fires when | Callback signature | Returns |
|------|-----------|-------------------|---------|
| [`pre_tool_call`](/docs/user-guide/features/hooks#pre_tool_call) | Before any tool executes | `tool_name: str, args: dict, task_id: str` | ignored |
| [`post_tool_call`](/docs/user-guide/features/hooks#post_tool_call) | After any tool returns | `tool_name: str, args: dict, result: str, task_id: str` | ignored |
| [`pre_llm_call`](/docs/user-guide/features/hooks#pre_llm_call) | Once per turn, before the tool-calling loop | `session_id: str, user_message: str, conversation_history: list, is_first_turn: bool, model: str, platform: str` | [context injection](#pre_llm_call-context-injection) |
| [`post_llm_call`](/docs/user-guide/features/hooks#post_llm_call) | Once per turn, after the tool-calling loop (successful turns only) | `session_id: str, user_message: str, assistant_response: str, conversation_history: list, model: str, platform: str` | ignored |
| [`on_session_start`](/docs/user-guide/features/hooks#on_session_start) | New session created (first turn only) | `session_id: str, model: str, platform: str` | ignored |
| [`on_session_end`](/docs/user-guide/features/hooks#on_session_end) | End of every `run_conversation` call + CLI exit | `session_id: str, completed: bool, interrupted: bool, model: str, platform: str` | ignored |
If a hook crashes, it's logged and skipped; other hooks and the agent continue normally.
Most hooks are fire-and-forget observers — their return values are ignored. The exception is `pre_llm_call`, which can inject context into the conversation.
All callbacks should accept `**kwargs` for forward compatibility. If a hook callback crashes, it's logged and skipped. Other hooks and the agent continue normally.
### `pre_llm_call` context injection
This is the only hook whose return value matters. When a `pre_llm_call` callback returns a dict with a `"context"` key (or a plain string), Hermes injects that text into the **current turn's user message**. This is the mechanism for memory plugins, RAG integrations, guardrails, and any plugin that needs to provide the model with additional context.
#### Return format
```python
# Dict with context key
return {"context": "Recalled memories:\n- User prefers dark mode\n- Last project: hermes-agent"}
# Plain string (equivalent to the dict form above)
return "Recalled memories:\n- User prefers dark mode"
# Return None or don't return → no injection (observer-only)
return None
```
Any non-None, non-empty return with a `"context"` key (or a plain non-empty string) is collected and appended to the user message for the current turn.
#### How injection works
Injected context is appended to the **user message**, not the system prompt. This is a deliberate design choice:
- **Prompt cache preservation** — the system prompt stays identical across turns. Anthropic and OpenRouter cache the system prompt prefix, so keeping it stable saves 75%+ on input tokens in multi-turn conversations. If plugins modified the system prompt, every turn would be a cache miss.
- **Ephemeral** — the injection happens at API call time only. The original user message in the conversation history is never mutated, and nothing is persisted to the session database.
- **The system prompt is Hermes's territory** — it contains model-specific guidance, tool enforcement rules, personality instructions, and cached skill content. Plugins contribute context alongside the user's input, not by altering the agent's core instructions.
#### Example: Memory recall plugin
```python
"""Memory plugin — recalls relevant context from a vector store."""
import httpx
MEMORY_API = "https://your-memory-api.example.com"
def recall_context(session_id, user_message, is_first_turn, **kwargs):
"""Called before each LLM turn. Returns recalled memories."""
try:
resp = httpx.post(f"{MEMORY_API}/recall", json={
"session_id": session_id,
"query": user_message,
}, timeout=3)
memories = resp.json().get("results", [])
if not memories:
return None # nothing to inject
text = "Recalled context from previous sessions:\n"
text += "\n".join(f"- {m['text']}" for m in memories)
return {"context": text}
except Exception:
return None # fail silently, don't break the agent
def register(ctx):
ctx.register_hook("pre_llm_call", recall_context)
```
#### Example: Guardrails plugin
```python
"""Guardrails plugin — enforces content policies."""
POLICY = """You MUST follow these content policies for this session:
- Never generate code that accesses the filesystem outside the working directory
- Always warn before executing destructive operations
- Refuse requests involving personal data extraction"""
def inject_guardrails(**kwargs):
"""Injects policy text into every turn."""
return {"context": POLICY}
def register(ctx):
ctx.register_hook("pre_llm_call", inject_guardrails)
```
#### Example: Observer-only hook (no injection)
```python
"""Analytics plugin — tracks turn metadata without injecting context."""
import logging
logger = logging.getLogger(__name__)
def log_turn(session_id, user_message, model, is_first_turn, **kwargs):
"""Fires before each LLM call. Returns None — no context injected."""
logger.info("Turn: session=%s model=%s first=%s msg_len=%d",
session_id, model, is_first_turn, len(user_message or ""))
# No return → no injection
def register(ctx):
ctx.register_hook("pre_llm_call", log_turn)
```
#### Multiple plugins returning context
When multiple plugins return context from `pre_llm_call`, their outputs are joined with double newlines and appended to the user message together. The order follows plugin discovery order (alphabetical by plugin directory name).
### Distribute via pip

View file

@ -219,42 +219,385 @@ Gateway hooks only fire in the **gateway** (Telegram, Discord, Slack, WhatsApp).
```python
def register(ctx):
ctx.register_hook("pre_tool_call", my_callback)
ctx.register_hook("post_tool_call", my_callback)
ctx.register_hook("pre_tool_call", my_tool_observer)
ctx.register_hook("post_tool_call", my_tool_logger)
ctx.register_hook("pre_llm_call", my_memory_callback)
ctx.register_hook("post_llm_call", my_sync_callback)
ctx.register_hook("on_session_start", my_init_callback)
ctx.register_hook("on_session_end", my_cleanup_callback)
```
### Available Plugin Hooks
**General rules for all hooks:**
| Hook | Fires when | Callback receives |
|------|-----------|-------------------|
| `pre_tool_call` | Before any tool executes | `tool_name`, `args`, `task_id` |
| `post_tool_call` | After any tool returns | `tool_name`, `args`, `result`, `task_id` |
| `pre_llm_call` | Before LLM API request | `session_id`, `user_message`, `conversation_history`, `is_first_turn`, `model`, `platform` |
| `post_llm_call` | After LLM API response | `session_id`, `user_message`, `assistant_response`, `conversation_history`, `model`, `platform` |
| `on_session_start` | Session begins | `session_id`, `model`, `platform` |
| `on_session_end` | Session ends | `session_id`, `completed`, `interrupted`, `model`, `platform` |
- Callbacks receive **keyword arguments**. Always accept `**kwargs` for forward compatibility — new parameters may be added in future versions without breaking your plugin.
- If a callback **crashes**, it's logged and skipped. Other hooks and the agent continue normally. A misbehaving plugin can never break the agent.
- All hooks are **fire-and-forget observers** whose return values are ignored — except `pre_llm_call`, which can [inject context](#pre_llm_call).
Callbacks receive keyword arguments matching the columns above:
### Quick reference
| Hook | Fires when | Returns |
|------|-----------|---------|
| [`pre_tool_call`](#pre_tool_call) | Before any tool executes | ignored |
| [`post_tool_call`](#post_tool_call) | After any tool returns | ignored |
| [`pre_llm_call`](#pre_llm_call) | Once per turn, before the tool-calling loop | context injection |
| [`post_llm_call`](#post_llm_call) | Once per turn, after the tool-calling loop | ignored |
| [`on_session_start`](#on_session_start) | New session created (first turn only) | ignored |
| [`on_session_end`](#on_session_end) | Session ends | ignored |
---
### `pre_tool_call`
Fires **immediately before** every tool execution — built-in tools and plugin tools alike.
**Callback signature:**
```python
def my_callback(**kwargs):
tool = kwargs["tool_name"]
args = kwargs["args"]
# ...
def my_callback(tool_name: str, args: dict, task_id: str, **kwargs):
```
### Example: Block Dangerous Tools
| Parameter | Type | Description |
|-----------|------|-------------|
| `tool_name` | `str` | Name of the tool about to execute (e.g. `"terminal"`, `"web_search"`, `"read_file"`) |
| `args` | `dict` | The arguments the model passed to the tool |
| `task_id` | `str` | Session/task identifier. Empty string if not set. |
**Fires:** In `model_tools.py`, inside `handle_function_call()`, before the tool's handler runs. Fires once per tool call — if the model calls 3 tools in parallel, this fires 3 times.
**Return value:** Ignored.
**Use cases:** Logging, audit trails, tool call counters, blocking dangerous operations (print a warning), rate limiting.
**Example — tool call audit log:**
```python
# ~/.hermes/plugins/tool-guard/__init__.py
BLOCKED = {"terminal", "write_file"}
import json, logging
from datetime import datetime
def guard(**kwargs):
if kwargs["tool_name"] in BLOCKED:
print(f"⚠ Blocked tool call: {kwargs['tool_name']}")
logger = logging.getLogger(__name__)
def audit_tool_call(tool_name, args, task_id, **kwargs):
logger.info("TOOL_CALL session=%s tool=%s args=%s",
task_id, tool_name, json.dumps(args)[:200])
def register(ctx):
ctx.register_hook("pre_tool_call", guard)
ctx.register_hook("pre_tool_call", audit_tool_call)
```
See the **[Plugins guide](/docs/user-guide/features/plugins)** for full details on creating plugins.
**Example — warn on dangerous tools:**
```python
DANGEROUS = {"terminal", "write_file", "patch"}
def warn_dangerous(tool_name, **kwargs):
if tool_name in DANGEROUS:
print(f"⚠ Executing potentially dangerous tool: {tool_name}")
def register(ctx):
ctx.register_hook("pre_tool_call", warn_dangerous)
```
---
### `post_tool_call`
Fires **immediately after** every tool execution returns.
**Callback signature:**
```python
def my_callback(tool_name: str, args: dict, result: str, task_id: str, **kwargs):
```
| Parameter | Type | Description |
|-----------|------|-------------|
| `tool_name` | `str` | Name of the tool that just executed |
| `args` | `dict` | The arguments the model passed to the tool |
| `result` | `str` | The tool's return value (always a JSON string) |
| `task_id` | `str` | Session/task identifier. Empty string if not set. |
**Fires:** In `model_tools.py`, inside `handle_function_call()`, after the tool's handler returns. Fires once per tool call. Does **not** fire if the tool raised an unhandled exception (the error is caught and returned as an error JSON string instead, and `post_tool_call` fires with that error string as `result`).
**Return value:** Ignored.
**Use cases:** Logging tool results, metrics collection, tracking tool success/failure rates, sending notifications when specific tools complete.
**Example — track tool usage metrics:**
```python
from collections import Counter
import json
_tool_counts = Counter()
_error_counts = Counter()
def track_metrics(tool_name, result, **kwargs):
_tool_counts[tool_name] += 1
try:
parsed = json.loads(result)
if "error" in parsed:
_error_counts[tool_name] += 1
except (json.JSONDecodeError, TypeError):
pass
def register(ctx):
ctx.register_hook("post_tool_call", track_metrics)
```
---
### `pre_llm_call`
Fires **once per turn**, before the tool-calling loop begins. This is the **only hook whose return value is used** — it can inject context into the current turn's user message.
**Callback signature:**
```python
def my_callback(session_id: str, user_message: str, conversation_history: list,
is_first_turn: bool, model: str, platform: str, **kwargs):
```
| Parameter | Type | Description |
|-----------|------|-------------|
| `session_id` | `str` | Unique identifier for the current session |
| `user_message` | `str` | The user's original message for this turn (before any skill injection) |
| `conversation_history` | `list` | Copy of the full message list (OpenAI format: `[{"role": "user", "content": "..."}]`) |
| `is_first_turn` | `bool` | `True` if this is the first turn of a new session, `False` on subsequent turns |
| `model` | `str` | The model identifier (e.g. `"anthropic/claude-sonnet-4.6"`) |
| `platform` | `str` | Where the session is running: `"cli"`, `"telegram"`, `"discord"`, etc. |
**Fires:** In `run_agent.py`, inside `run_conversation()`, after context compression but before the main `while` loop. Fires once per `run_conversation()` call (i.e. once per user turn), not once per API call within the tool loop.
**Return value:** If the callback returns a dict with a `"context"` key, or a plain non-empty string, the text is appended to the current turn's user message. Return `None` for no injection.
```python
# Inject context
return {"context": "Recalled memories:\n- User likes Python\n- Working on hermes-agent"}
# Plain string (equivalent)
return "Recalled memories:\n- User likes Python"
# No injection
return None
```
**Where context is injected:** Always the **user message**, never the system prompt. This preserves the prompt cache — the system prompt stays identical across turns, so cached tokens are reused. The system prompt is Hermes's territory (model guidance, tool enforcement, personality, skills). Plugins contribute context alongside the user's input.
All injected context is **ephemeral** — added at API call time only. The original user message in the conversation history is never mutated, and nothing is persisted to the session database.
When **multiple plugins** return context, their outputs are joined with double newlines in plugin discovery order (alphabetical by directory name).
**Use cases:** Memory recall, RAG context injection, guardrails, per-turn analytics.
**Example — memory recall:**
```python
import httpx
MEMORY_API = "https://your-memory-api.example.com"
def recall(session_id, user_message, is_first_turn, **kwargs):
try:
resp = httpx.post(f"{MEMORY_API}/recall", json={
"session_id": session_id,
"query": user_message,
}, timeout=3)
memories = resp.json().get("results", [])
if not memories:
return None
text = "Recalled context:\n" + "\n".join(f"- {m['text']}" for m in memories)
return {"context": text}
except Exception:
return None
def register(ctx):
ctx.register_hook("pre_llm_call", recall)
```
**Example — guardrails:**
```python
POLICY = "Never execute commands that delete files without explicit user confirmation."
def guardrails(**kwargs):
return {"context": POLICY}
def register(ctx):
ctx.register_hook("pre_llm_call", guardrails)
```
---
### `post_llm_call`
Fires **once per turn**, after the tool-calling loop completes and the agent has produced a final response. Only fires on **successful** turns — does not fire if the turn was interrupted.
**Callback signature:**
```python
def my_callback(session_id: str, user_message: str, assistant_response: str,
conversation_history: list, model: str, platform: str, **kwargs):
```
| Parameter | Type | Description |
|-----------|------|-------------|
| `session_id` | `str` | Unique identifier for the current session |
| `user_message` | `str` | The user's original message for this turn |
| `assistant_response` | `str` | The agent's final text response for this turn |
| `conversation_history` | `list` | Copy of the full message list after the turn completed |
| `model` | `str` | The model identifier |
| `platform` | `str` | Where the session is running |
**Fires:** In `run_agent.py`, inside `run_conversation()`, after the tool loop exits with a final response. Guarded by `if final_response and not interrupted` — so it does **not** fire when the user interrupts mid-turn or the agent hits the iteration limit without producing a response.
**Return value:** Ignored.
**Use cases:** Syncing conversation data to an external memory system, computing response quality metrics, logging turn summaries, triggering follow-up actions.
**Example — sync to external memory:**
```python
import httpx
MEMORY_API = "https://your-memory-api.example.com"
def sync_memory(session_id, user_message, assistant_response, **kwargs):
try:
httpx.post(f"{MEMORY_API}/store", json={
"session_id": session_id,
"user": user_message,
"assistant": assistant_response,
}, timeout=5)
except Exception:
pass # best-effort
def register(ctx):
ctx.register_hook("post_llm_call", sync_memory)
```
**Example — track response lengths:**
```python
import logging
logger = logging.getLogger(__name__)
def log_response_length(session_id, assistant_response, model, **kwargs):
logger.info("RESPONSE session=%s model=%s chars=%d",
session_id, model, len(assistant_response or ""))
def register(ctx):
ctx.register_hook("post_llm_call", log_response_length)
```
---
### `on_session_start`
Fires **once** when a brand-new session is created. Does **not** fire on session continuation (when the user sends a second message in an existing session).
**Callback signature:**
```python
def my_callback(session_id: str, model: str, platform: str, **kwargs):
```
| Parameter | Type | Description |
|-----------|------|-------------|
| `session_id` | `str` | Unique identifier for the new session |
| `model` | `str` | The model identifier |
| `platform` | `str` | Where the session is running |
**Fires:** In `run_agent.py`, inside `run_conversation()`, during the first turn of a new session — specifically after the system prompt is built but before the tool loop starts. The check is `if not conversation_history` (no prior messages = new session).
**Return value:** Ignored.
**Use cases:** Initializing session-scoped state, warming caches, registering the session with an external service, logging session starts.
**Example — initialize a session cache:**
```python
_session_caches = {}
def init_session(session_id, model, platform, **kwargs):
_session_caches[session_id] = {
"model": model,
"platform": platform,
"tool_calls": 0,
"started": __import__("datetime").datetime.now().isoformat(),
}
def register(ctx):
ctx.register_hook("on_session_start", init_session)
```
---
### `on_session_end`
Fires at the **very end** of every `run_conversation()` call, regardless of outcome. Also fires from the CLI's exit handler if the agent was mid-turn when the user quit.
**Callback signature:**
```python
def my_callback(session_id: str, completed: bool, interrupted: bool,
model: str, platform: str, **kwargs):
```
| Parameter | Type | Description |
|-----------|------|-------------|
| `session_id` | `str` | Unique identifier for the session |
| `completed` | `bool` | `True` if the agent produced a final response, `False` otherwise |
| `interrupted` | `bool` | `True` if the turn was interrupted (user sent new message, `/stop`, or quit) |
| `model` | `str` | The model identifier |
| `platform` | `str` | Where the session is running |
**Fires:** In two places:
1. **`run_agent.py`** — at the end of every `run_conversation()` call, after all cleanup. Always fires, even if the turn errored.
2. **`cli.py`** — in the CLI's atexit handler, but **only** if the agent was mid-turn (`_agent_running=True`) when the exit occurred. This catches Ctrl+C and `/exit` during processing. In this case, `completed=False` and `interrupted=True`.
**Return value:** Ignored.
**Use cases:** Flushing buffers, closing connections, persisting session state, logging session duration, cleanup of resources initialized in `on_session_start`.
**Example — flush and cleanup:**
```python
_session_caches = {}
def cleanup_session(session_id, completed, interrupted, **kwargs):
cache = _session_caches.pop(session_id, None)
if cache:
# Flush accumulated data to disk or external service
status = "completed" if completed else ("interrupted" if interrupted else "failed")
print(f"Session {session_id} ended: {status}, {cache['tool_calls']} tool calls")
def register(ctx):
ctx.register_hook("on_session_end", cleanup_session)
```
**Example — session duration tracking:**
```python
import time, logging
logger = logging.getLogger(__name__)
_start_times = {}
def on_start(session_id, **kwargs):
_start_times[session_id] = time.time()
def on_end(session_id, completed, interrupted, **kwargs):
start = _start_times.pop(session_id, None)
if start:
duration = time.time() - start
logger.info("SESSION_DURATION session=%s seconds=%.1f completed=%s interrupted=%s",
session_id, duration, completed, interrupted)
def register(ctx):
ctx.register_hook("on_session_start", on_start)
ctx.register_hook("on_session_end", on_end)
```
---
See the **[Build a Plugin guide](/docs/guides/build-a-hermes-plugin)** for the full walkthrough including tool schemas, handlers, and advanced hook patterns.

View file

@ -103,12 +103,12 @@ Plugins can register callbacks for these lifecycle events. See the **[Event Hook
| Hook | Fires when |
|------|-----------|
| `pre_tool_call` | Before any tool executes |
| `post_tool_call` | After any tool returns |
| `pre_llm_call` | Once per turn, before the LLM loop — can return `{"context": "..."}` to inject into the system prompt |
| `post_llm_call` | Once per turn, after the LLM loop completes |
| `on_session_start` | New session created (first turn only) |
| `on_session_end` | End of every `run_conversation` call |
| [`pre_tool_call`](/docs/user-guide/features/hooks#pre_tool_call) | Before any tool executes |
| [`post_tool_call`](/docs/user-guide/features/hooks#post_tool_call) | After any tool returns |
| [`pre_llm_call`](/docs/user-guide/features/hooks#pre_llm_call) | Once per turn, before the LLM loop — can return `{"context": "..."}` to [inject context into the user message](/docs/user-guide/features/hooks#pre_llm_call) |
| [`post_llm_call`](/docs/user-guide/features/hooks#post_llm_call) | Once per turn, after the LLM loop (successful turns only) |
| [`on_session_start`](/docs/user-guide/features/hooks#on_session_start) | New session created (first turn only) |
| [`on_session_end`](/docs/user-guide/features/hooks#on_session_end) | End of every `run_conversation` call + CLI exit handler |
## Managing plugins