fix: move pre_llm_call plugin context to user message, preserve prompt cache (#5146)

Plugin context from pre_llm_call hooks was injected into the system
prompt, breaking the prompt cache prefix every turn when content
changed (typical for memory plugins). Now all plugin context goes
into the current turn's user message — the system prompt stays
identical across turns, preserving cached tokens.

The system prompt is reserved for Hermes internals. Plugins
contribute context alongside the user's input.

Also adds comprehensive documentation for all 6 plugin hooks:
pre_tool_call, post_tool_call, pre_llm_call, post_llm_call,
on_session_start, on_session_end — each with full callback
signatures, parameter tables, firing conditions, and examples.

Supersedes #5138 which identified the same cache-busting bug
and proposed an uncached system suffix approach. This fix goes
further by removing system prompt injection entirely.

Co-identified-by: OutThisLife (PR #5138)
This commit is contained in:
Teknium 2026-04-04 16:55:44 -07:00 committed by GitHub
parent 96e96a79ad
commit 5879b3ef82
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 653 additions and 57 deletions

View file

@ -441,8 +441,18 @@ class PluginManager:
plugin cannot break the core agent loop.
Returns a list of non-``None`` return values from callbacks.
This allows hooks like ``pre_llm_call`` to contribute context
that the agent core can collect and inject.
For ``pre_llm_call``, callbacks may return a dict describing
context to inject into the current turn's user message::
{"context": "recalled text..."}
"recalled text..." # plain string, equivalent
Context is ALWAYS injected into the user message, never the
system prompt. This preserves the prompt cache prefix — the
system prompt stays identical across turns so cached tokens
are reused. All injected context is ephemeral — never
persisted to session DB.
"""
callbacks = self._hooks.get(hook_name, [])
results: List[Any] = []

View file

@ -6648,10 +6648,17 @@ class AIAgent:
# Plugin hook: pre_llm_call
# Fired once per turn before the tool-calling loop. Plugins can
# return a dict with a ``context`` key whose value is a string
# that will be appended to the ephemeral system prompt for every
# API call in this turn (not persisted to session DB or cache).
_plugin_turn_context = ""
# return a dict with a ``context`` key (or a plain string) whose
# value is appended to the current turn's user message.
#
# Context is ALWAYS injected into the user message, never the
# system prompt. This preserves the prompt cache prefix — the
# system prompt stays identical across turns so cached tokens
# are reused. The system prompt is Hermes's territory; plugins
# contribute context alongside the user's input.
#
# All injected context is ephemeral (not persisted to session DB).
_plugin_user_context = ""
try:
from hermes_cli.plugins import invoke_hook as _invoke_hook
_pre_results = _invoke_hook(
@ -6663,14 +6670,14 @@ class AIAgent:
model=self.model,
platform=getattr(self, "platform", None) or "",
)
_ctx_parts = []
_ctx_parts: list[str] = []
for r in _pre_results:
if isinstance(r, dict) and r.get("context"):
_ctx_parts.append(str(r["context"]))
elif isinstance(r, str) and r.strip():
_ctx_parts.append(r)
if _ctx_parts:
_plugin_turn_context = "\n\n".join(_ctx_parts)
_plugin_user_context = "\n\n".join(_ctx_parts)
except Exception as exc:
logger.warning("pre_llm_call hook failed: %s", exc)
@ -6758,11 +6765,21 @@ class AIAgent:
for idx, msg in enumerate(messages):
api_msg = msg.copy()
# External memory provider prefetch: inject cached recalled context
if idx == current_turn_user_idx and msg.get("role") == "user" and _ext_prefetch_cache:
_base = api_msg.get("content", "")
if isinstance(_base, str):
api_msg["content"] = _base + "\n\n" + _ext_prefetch_cache
# Inject ephemeral context into the current turn's user message.
# Sources: memory manager prefetch + plugin pre_llm_call hooks
# with target="user_message" (the default). Both are
# API-call-time only — the original message in `messages` is
# never mutated, so nothing leaks into session persistence.
if idx == current_turn_user_idx and msg.get("role") == "user":
_injections = []
if _ext_prefetch_cache:
_injections.append(_ext_prefetch_cache)
if _plugin_user_context:
_injections.append(_plugin_user_context)
if _injections:
_base = api_msg.get("content", "")
if isinstance(_base, str):
api_msg["content"] = _base + "\n\n" + "\n\n".join(_injections)
# For ALL assistant messages, pass reasoning back to the API
# This ensures multi-turn reasoning context is preserved
@ -6796,9 +6813,10 @@ class AIAgent:
effective_system = active_system_prompt or ""
if self.ephemeral_system_prompt:
effective_system = (effective_system + "\n\n" + self.ephemeral_system_prompt).strip()
# Plugin context from pre_llm_call hooks — ephemeral, not cached.
if _plugin_turn_context:
effective_system = (effective_system + "\n\n" + _plugin_turn_context).strip()
# NOTE: Plugin context from pre_llm_call hooks is injected into the
# user message (see injection block above), NOT the system prompt.
# This is intentional — system prompt modifications break the prompt
# cache prefix. The system prompt is reserved for Hermes internals.
if effective_system:
api_messages = [{"role": "system", "content": effective_system}] + api_messages

View file

@ -403,6 +403,131 @@ class TestPluginManagerList:
class TestPreLlmCallTargetRouting:
    """Tests for pre_llm_call hook return format with target-aware routing.

    The routing logic lives in run_agent.py, but the return format is collected
    by invoke_hook(). These tests verify the return format works correctly and
    that downstream code can route based on the 'target' key.
    """

    def _make_pre_llm_plugin(self, plugins_dir, name, return_expr):
        """Create a plugin whose pre_llm_call hook returns ``return_expr``."""
        _make_plugin_dir(
            plugins_dir,
            name,
            register_body=(
                f'ctx.register_hook("pre_llm_call", lambda **kw: {return_expr})'
            ),
        )

    @staticmethod
    def _load_and_invoke(tmp_path, monkeypatch):
        """Point HERMES_HOME at the temp tree, load plugins, fire the hook."""
        monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes_test"))
        manager = PluginManager()
        manager.discover_and_load()
        return manager.invoke_hook(
            "pre_llm_call",
            session_id="s1",
            user_message="hi",
            conversation_history=[],
            is_first_turn=True,
            model="test",
        )

    def test_context_dict_returned(self, tmp_path, monkeypatch):
        """Plugin returning a context dict is collected by invoke_hook."""
        plugins_dir = tmp_path / "hermes_test" / "plugins"
        self._make_pre_llm_plugin(
            plugins_dir,
            "basic_plugin",
            '{"context": "basic context"}',
        )
        results = self._load_and_invoke(tmp_path, monkeypatch)
        assert len(results) == 1
        assert results[0]["context"] == "basic context"
        assert "target" not in results[0]

    def test_plain_string_return(self, tmp_path, monkeypatch):
        """Plain string returns are collected as-is (routing treats them as user_message)."""
        plugins_dir = tmp_path / "hermes_test" / "plugins"
        self._make_pre_llm_plugin(
            plugins_dir,
            "str_plugin",
            '"plain string context"',
        )
        results = self._load_and_invoke(tmp_path, monkeypatch)
        assert len(results) == 1
        assert results[0] == "plain string context"

    def test_multiple_plugins_context_collected(self, tmp_path, monkeypatch):
        """Multiple plugins returning context are all collected."""
        plugins_dir = tmp_path / "hermes_test" / "plugins"
        self._make_pre_llm_plugin(
            plugins_dir,
            "aaa_memory",
            '{"context": "memory context"}',
        )
        self._make_pre_llm_plugin(
            plugins_dir,
            "bbb_guardrail",
            '{"context": "guardrail text"}',
        )
        results = self._load_and_invoke(tmp_path, monkeypatch)
        assert len(results) == 2
        collected = [r["context"] for r in results]
        assert "memory context" in collected
        assert "guardrail text" in collected

    def test_routing_logic_all_to_user_message(self, tmp_path, monkeypatch):
        """Simulate the routing logic from run_agent.py.

        All plugin context dicts and plain strings end up in a single
        user message context string. There is no system_prompt target.
        """
        plugins_dir = tmp_path / "hermes_test" / "plugins"
        self._make_pre_llm_plugin(
            plugins_dir,
            "aaa_mem",
            '{"context": "memory A"}',
        )
        self._make_pre_llm_plugin(
            plugins_dir,
            "bbb_guard",
            '{"context": "rule B"}',
        )
        self._make_pre_llm_plugin(
            plugins_dir,
            "ccc_plain",
            '"plain text C"',
        )
        results = self._load_and_invoke(tmp_path, monkeypatch)
        # Replicate run_agent.py routing logic — everything goes to user msg
        parts = []
        for item in results:
            if isinstance(item, dict) and item.get("context"):
                parts.append(str(item["context"]))
            elif isinstance(item, str) and item.strip():
                parts.append(item)
        assert parts == ["memory A", "rule B", "plain text C"]
        joined_user_context = "\n\n".join(parts)
        assert "memory A" in joined_user_context
        assert "rule B" in joined_user_context
        assert "plain text C" in joined_user_context
# NOTE: TestPluginCommands removed — register_command() was never implemented
# in PluginContext (hermes_cli/plugins.py). The tests referenced _plugin_commands,
# commands_registered, get_plugin_command_handler, and GATEWAY_KNOWN_COMMANDS.

View file

@ -362,24 +362,124 @@ ctx.register_tool(
def register(ctx):
ctx.register_hook("pre_tool_call", before_any_tool)
ctx.register_hook("post_tool_call", after_any_tool)
ctx.register_hook("pre_llm_call", inject_memory)
ctx.register_hook("on_session_start", on_new_session)
ctx.register_hook("on_session_end", on_session_end)
```
Available hooks:
### Hook reference
| Hook | When | Arguments | Return |
|------|------|-----------|--------|
| `pre_tool_call` | Before any tool runs | `tool_name`, `args`, `task_id` | — |
| `post_tool_call` | After any tool returns | `tool_name`, `args`, `result`, `task_id` | — |
| `pre_llm_call` | Once per turn, before the LLM loop | `session_id`, `user_message`, `conversation_history`, `is_first_turn`, `model`, `platform` | `{"context": "..."}` |
| `post_llm_call` | Once per turn, after the LLM loop | `session_id`, `user_message`, `assistant_response`, `conversation_history`, `model`, `platform` | — |
| `on_session_start` | New session created (first turn only) | `session_id`, `model`, `platform` | — |
| `on_session_end` | End of every `run_conversation` call | `session_id`, `completed`, `interrupted`, `model`, `platform` | — |
Each hook is documented in full on the **[Event Hooks reference](/docs/user-guide/features/hooks#plugin-hooks)** — callback signatures, parameter tables, exactly when each fires, and examples. Here's the summary:
Most hooks are fire-and-forget observers. The exception is `pre_llm_call`: if a callback returns a dict with a `"context"` key (or a plain string), the value is appended to the ephemeral system prompt for the current turn. This allows memory plugins to inject recalled context without touching core code.
| Hook | Fires when | Callback signature | Returns |
|------|-----------|-------------------|---------|
| [`pre_tool_call`](/docs/user-guide/features/hooks#pre_tool_call) | Before any tool executes | `tool_name: str, args: dict, task_id: str` | ignored |
| [`post_tool_call`](/docs/user-guide/features/hooks#post_tool_call) | After any tool returns | `tool_name: str, args: dict, result: str, task_id: str` | ignored |
| [`pre_llm_call`](/docs/user-guide/features/hooks#pre_llm_call) | Once per turn, before the tool-calling loop | `session_id: str, user_message: str, conversation_history: list, is_first_turn: bool, model: str, platform: str` | [context injection](#pre_llm_call-context-injection) |
| [`post_llm_call`](/docs/user-guide/features/hooks#post_llm_call) | Once per turn, after the tool-calling loop (successful turns only) | `session_id: str, user_message: str, assistant_response: str, conversation_history: list, model: str, platform: str` | ignored |
| [`on_session_start`](/docs/user-guide/features/hooks#on_session_start) | New session created (first turn only) | `session_id: str, model: str, platform: str` | ignored |
| [`on_session_end`](/docs/user-guide/features/hooks#on_session_end) | End of every `run_conversation` call + CLI exit | `session_id: str, completed: bool, interrupted: bool, model: str, platform: str` | ignored |
If a hook crashes, it's logged and skipped; other hooks and the agent continue normally.
Most hooks are fire-and-forget observers — their return values are ignored. The exception is `pre_llm_call`, which can inject context into the conversation.
All callbacks should accept `**kwargs` for forward compatibility. If a hook callback crashes, it's logged and skipped. Other hooks and the agent continue normally.
### `pre_llm_call` context injection
This is the only hook whose return value matters. When a `pre_llm_call` callback returns a dict with a `"context"` key (or a plain string), Hermes injects that text into the **current turn's user message**. This is the mechanism for memory plugins, RAG integrations, guardrails, and any plugin that needs to provide the model with additional context.
#### Return format
```python
# Dict with context key
return {"context": "Recalled memories:\n- User prefers dark mode\n- Last project: hermes-agent"}
# Plain string (equivalent to the dict form above)
return "Recalled memories:\n- User prefers dark mode"
# Return None or don't return → no injection (observer-only)
return None
```
Any non-None, non-empty return with a `"context"` key (or a plain non-empty string) is collected and appended to the user message for the current turn.
#### How injection works
Injected context is appended to the **user message**, not the system prompt. This is a deliberate design choice:
- **Prompt cache preservation** — the system prompt stays identical across turns. Anthropic and OpenRouter cache the system prompt prefix, so keeping it stable saves 75%+ on input tokens in multi-turn conversations. If plugins modified the system prompt, every turn would be a cache miss.
- **Ephemeral** — the injection happens at API call time only. The original user message in the conversation history is never mutated, and nothing is persisted to the session database.
- **The system prompt is Hermes's territory** — it contains model-specific guidance, tool enforcement rules, personality instructions, and cached skill content. Plugins contribute context alongside the user's input, not by altering the agent's core instructions.
#### Example: Memory recall plugin
```python
"""Memory plugin — recalls relevant context from a vector store."""
import httpx
MEMORY_API = "https://your-memory-api.example.com"
def recall_context(session_id, user_message, is_first_turn, **kwargs):
"""Called before each LLM turn. Returns recalled memories."""
try:
resp = httpx.post(f"{MEMORY_API}/recall", json={
"session_id": session_id,
"query": user_message,
}, timeout=3)
memories = resp.json().get("results", [])
if not memories:
return None # nothing to inject
text = "Recalled context from previous sessions:\n"
text += "\n".join(f"- {m['text']}" for m in memories)
return {"context": text}
except Exception:
return None # fail silently, don't break the agent
def register(ctx):
ctx.register_hook("pre_llm_call", recall_context)
```
#### Example: Guardrails plugin
```python
"""Guardrails plugin — enforces content policies."""
POLICY = """You MUST follow these content policies for this session:
- Never generate code that accesses the filesystem outside the working directory
- Always warn before executing destructive operations
- Refuse requests involving personal data extraction"""
def inject_guardrails(**kwargs):
"""Injects policy text into every turn."""
return {"context": POLICY}
def register(ctx):
ctx.register_hook("pre_llm_call", inject_guardrails)
```
#### Example: Observer-only hook (no injection)
```python
"""Analytics plugin — tracks turn metadata without injecting context."""
import logging
logger = logging.getLogger(__name__)
def log_turn(session_id, user_message, model, is_first_turn, **kwargs):
"""Fires before each LLM call. Returns None — no context injected."""
logger.info("Turn: session=%s model=%s first=%s msg_len=%d",
session_id, model, is_first_turn, len(user_message or ""))
# No return → no injection
def register(ctx):
ctx.register_hook("pre_llm_call", log_turn)
```
#### Multiple plugins returning context
When multiple plugins return context from `pre_llm_call`, their outputs are joined with double newlines and appended to the user message together. The order follows plugin discovery order (alphabetical by plugin directory name).
### Distribute via pip

View file

@ -219,42 +219,385 @@ Gateway hooks only fire in the **gateway** (Telegram, Discord, Slack, WhatsApp).
```python
def register(ctx):
ctx.register_hook("pre_tool_call", my_callback)
ctx.register_hook("post_tool_call", my_callback)
ctx.register_hook("pre_tool_call", my_tool_observer)
ctx.register_hook("post_tool_call", my_tool_logger)
ctx.register_hook("pre_llm_call", my_memory_callback)
ctx.register_hook("post_llm_call", my_sync_callback)
ctx.register_hook("on_session_start", my_init_callback)
ctx.register_hook("on_session_end", my_cleanup_callback)
```
### Available Plugin Hooks
**General rules for all hooks:**
| Hook | Fires when | Callback receives |
|------|-----------|-------------------|
| `pre_tool_call` | Before any tool executes | `tool_name`, `args`, `task_id` |
| `post_tool_call` | After any tool returns | `tool_name`, `args`, `result`, `task_id` |
| `pre_llm_call` | Before LLM API request | `session_id`, `user_message`, `conversation_history`, `is_first_turn`, `model`, `platform` |
| `post_llm_call` | After LLM API response | `session_id`, `user_message`, `assistant_response`, `conversation_history`, `model`, `platform` |
| `on_session_start` | Session begins | `session_id`, `model`, `platform` |
| `on_session_end` | Session ends | `session_id`, `completed`, `interrupted`, `model`, `platform` |
- Callbacks receive **keyword arguments**. Always accept `**kwargs` for forward compatibility — new parameters may be added in future versions without breaking your plugin.
- If a callback **crashes**, it's logged and skipped. Other hooks and the agent continue normally. A misbehaving plugin can never break the agent.
- All hooks are **fire-and-forget observers** whose return values are ignored — except `pre_llm_call`, which can [inject context](#pre_llm_call).
Callbacks receive keyword arguments matching the columns above:
### Quick reference
| Hook | Fires when | Returns |
|------|-----------|---------|
| [`pre_tool_call`](#pre_tool_call) | Before any tool executes | ignored |
| [`post_tool_call`](#post_tool_call) | After any tool returns | ignored |
| [`pre_llm_call`](#pre_llm_call) | Once per turn, before the tool-calling loop | context injection |
| [`post_llm_call`](#post_llm_call) | Once per turn, after the tool-calling loop | ignored |
| [`on_session_start`](#on_session_start) | New session created (first turn only) | ignored |
| [`on_session_end`](#on_session_end) | Session ends | ignored |
---
### `pre_tool_call`
Fires **immediately before** every tool execution — built-in tools and plugin tools alike.
**Callback signature:**
```python
def my_callback(**kwargs):
tool = kwargs["tool_name"]
args = kwargs["args"]
# ...
def my_callback(tool_name: str, args: dict, task_id: str, **kwargs):
```
### Example: Block Dangerous Tools
| Parameter | Type | Description |
|-----------|------|-------------|
| `tool_name` | `str` | Name of the tool about to execute (e.g. `"terminal"`, `"web_search"`, `"read_file"`) |
| `args` | `dict` | The arguments the model passed to the tool |
| `task_id` | `str` | Session/task identifier. Empty string if not set. |
**Fires:** In `model_tools.py`, inside `handle_function_call()`, before the tool's handler runs. Fires once per tool call — if the model calls 3 tools in parallel, this fires 3 times.
**Return value:** Ignored.
**Use cases:** Logging, audit trails, tool call counters, blocking dangerous operations (print a warning), rate limiting.
**Example — tool call audit log:**
```python
# ~/.hermes/plugins/tool-guard/__init__.py
BLOCKED = {"terminal", "write_file"}
import json, logging
from datetime import datetime
def guard(**kwargs):
if kwargs["tool_name"] in BLOCKED:
print(f"⚠ Blocked tool call: {kwargs['tool_name']}")
logger = logging.getLogger(__name__)
def audit_tool_call(tool_name, args, task_id, **kwargs):
logger.info("TOOL_CALL session=%s tool=%s args=%s",
task_id, tool_name, json.dumps(args)[:200])
def register(ctx):
ctx.register_hook("pre_tool_call", guard)
ctx.register_hook("pre_tool_call", audit_tool_call)
```
See the **[Plugins guide](/docs/user-guide/features/plugins)** for full details on creating plugins.
**Example — warn on dangerous tools:**
```python
DANGEROUS = {"terminal", "write_file", "patch"}
def warn_dangerous(tool_name, **kwargs):
if tool_name in DANGEROUS:
print(f"⚠ Executing potentially dangerous tool: {tool_name}")
def register(ctx):
ctx.register_hook("pre_tool_call", warn_dangerous)
```
---
### `post_tool_call`
Fires **immediately after** every tool execution returns.
**Callback signature:**
```python
def my_callback(tool_name: str, args: dict, result: str, task_id: str, **kwargs):
```
| Parameter | Type | Description |
|-----------|------|-------------|
| `tool_name` | `str` | Name of the tool that just executed |
| `args` | `dict` | The arguments the model passed to the tool |
| `result` | `str` | The tool's return value (always a JSON string) |
| `task_id` | `str` | Session/task identifier. Empty string if not set. |
**Fires:** In `model_tools.py`, inside `handle_function_call()`, after the tool's handler returns. Fires once per tool call. Does **not** fire if the tool raised an unhandled exception (the error is caught and returned as an error JSON string instead, and `post_tool_call` fires with that error string as `result`).
**Return value:** Ignored.
**Use cases:** Logging tool results, metrics collection, tracking tool success/failure rates, sending notifications when specific tools complete.
**Example — track tool usage metrics:**
```python
from collections import Counter
import json
_tool_counts = Counter()
_error_counts = Counter()
def track_metrics(tool_name, result, **kwargs):
_tool_counts[tool_name] += 1
try:
parsed = json.loads(result)
if "error" in parsed:
_error_counts[tool_name] += 1
except (json.JSONDecodeError, TypeError):
pass
def register(ctx):
ctx.register_hook("post_tool_call", track_metrics)
```
---
### `pre_llm_call`
Fires **once per turn**, before the tool-calling loop begins. This is the **only hook whose return value is used** — it can inject context into the current turn's user message.
**Callback signature:**
```python
def my_callback(session_id: str, user_message: str, conversation_history: list,
is_first_turn: bool, model: str, platform: str, **kwargs):
```
| Parameter | Type | Description |
|-----------|------|-------------|
| `session_id` | `str` | Unique identifier for the current session |
| `user_message` | `str` | The user's original message for this turn (before any skill injection) |
| `conversation_history` | `list` | Copy of the full message list (OpenAI format: `[{"role": "user", "content": "..."}]`) |
| `is_first_turn` | `bool` | `True` if this is the first turn of a new session, `False` on subsequent turns |
| `model` | `str` | The model identifier (e.g. `"anthropic/claude-sonnet-4.6"`) |
| `platform` | `str` | Where the session is running: `"cli"`, `"telegram"`, `"discord"`, etc. |
**Fires:** In `run_agent.py`, inside `run_conversation()`, after context compression but before the main `while` loop. Fires once per `run_conversation()` call (i.e. once per user turn), not once per API call within the tool loop.
**Return value:** If the callback returns a dict with a `"context"` key, or a plain non-empty string, the text is appended to the current turn's user message. Return `None` for no injection.
```python
# Inject context
return {"context": "Recalled memories:\n- User likes Python\n- Working on hermes-agent"}
# Plain string (equivalent)
return "Recalled memories:\n- User likes Python"
# No injection
return None
```
**Where context is injected:** Always the **user message**, never the system prompt. This preserves the prompt cache — the system prompt stays identical across turns, so cached tokens are reused. The system prompt is Hermes's territory (model guidance, tool enforcement, personality, skills). Plugins contribute context alongside the user's input.
All injected context is **ephemeral** — added at API call time only. The original user message in the conversation history is never mutated, and nothing is persisted to the session database.
When **multiple plugins** return context, their outputs are joined with double newlines in plugin discovery order (alphabetical by directory name).
**Use cases:** Memory recall, RAG context injection, guardrails, per-turn analytics.
**Example — memory recall:**
```python
import httpx
MEMORY_API = "https://your-memory-api.example.com"
def recall(session_id, user_message, is_first_turn, **kwargs):
try:
resp = httpx.post(f"{MEMORY_API}/recall", json={
"session_id": session_id,
"query": user_message,
}, timeout=3)
memories = resp.json().get("results", [])
if not memories:
return None
text = "Recalled context:\n" + "\n".join(f"- {m['text']}" for m in memories)
return {"context": text}
except Exception:
return None
def register(ctx):
ctx.register_hook("pre_llm_call", recall)
```
**Example — guardrails:**
```python
POLICY = "Never execute commands that delete files without explicit user confirmation."
def guardrails(**kwargs):
return {"context": POLICY}
def register(ctx):
ctx.register_hook("pre_llm_call", guardrails)
```
---
### `post_llm_call`
Fires **once per turn**, after the tool-calling loop completes and the agent has produced a final response. Only fires on **successful** turns — does not fire if the turn was interrupted.
**Callback signature:**
```python
def my_callback(session_id: str, user_message: str, assistant_response: str,
conversation_history: list, model: str, platform: str, **kwargs):
```
| Parameter | Type | Description |
|-----------|------|-------------|
| `session_id` | `str` | Unique identifier for the current session |
| `user_message` | `str` | The user's original message for this turn |
| `assistant_response` | `str` | The agent's final text response for this turn |
| `conversation_history` | `list` | Copy of the full message list after the turn completed |
| `model` | `str` | The model identifier |
| `platform` | `str` | Where the session is running |
**Fires:** In `run_agent.py`, inside `run_conversation()`, after the tool loop exits with a final response. Guarded by `if final_response and not interrupted` — so it does **not** fire when the user interrupts mid-turn or the agent hits the iteration limit without producing a response.
**Return value:** Ignored.
**Use cases:** Syncing conversation data to an external memory system, computing response quality metrics, logging turn summaries, triggering follow-up actions.
**Example — sync to external memory:**
```python
import httpx
MEMORY_API = "https://your-memory-api.example.com"
def sync_memory(session_id, user_message, assistant_response, **kwargs):
try:
httpx.post(f"{MEMORY_API}/store", json={
"session_id": session_id,
"user": user_message,
"assistant": assistant_response,
}, timeout=5)
except Exception:
pass # best-effort
def register(ctx):
ctx.register_hook("post_llm_call", sync_memory)
```
**Example — track response lengths:**
```python
import logging
logger = logging.getLogger(__name__)
def log_response_length(session_id, assistant_response, model, **kwargs):
logger.info("RESPONSE session=%s model=%s chars=%d",
session_id, model, len(assistant_response or ""))
def register(ctx):
ctx.register_hook("post_llm_call", log_response_length)
```
---
### `on_session_start`
Fires **once** when a brand-new session is created. Does **not** fire on session continuation (when the user sends a second message in an existing session).
**Callback signature:**
```python
def my_callback(session_id: str, model: str, platform: str, **kwargs):
```
| Parameter | Type | Description |
|-----------|------|-------------|
| `session_id` | `str` | Unique identifier for the new session |
| `model` | `str` | The model identifier |
| `platform` | `str` | Where the session is running |
**Fires:** In `run_agent.py`, inside `run_conversation()`, during the first turn of a new session — specifically after the system prompt is built but before the tool loop starts. The check is `if not conversation_history` (no prior messages = new session).
**Return value:** Ignored.
**Use cases:** Initializing session-scoped state, warming caches, registering the session with an external service, logging session starts.
**Example — initialize a session cache:**
```python
_session_caches = {}
def init_session(session_id, model, platform, **kwargs):
_session_caches[session_id] = {
"model": model,
"platform": platform,
"tool_calls": 0,
"started": __import__("datetime").datetime.now().isoformat(),
}
def register(ctx):
ctx.register_hook("on_session_start", init_session)
```
---
### `on_session_end`
Fires at the **very end** of every `run_conversation()` call, regardless of outcome. Also fires from the CLI's exit handler if the agent was mid-turn when the user quit.
**Callback signature:**
```python
def my_callback(session_id: str, completed: bool, interrupted: bool,
model: str, platform: str, **kwargs):
```
| Parameter | Type | Description |
|-----------|------|-------------|
| `session_id` | `str` | Unique identifier for the session |
| `completed` | `bool` | `True` if the agent produced a final response, `False` otherwise |
| `interrupted` | `bool` | `True` if the turn was interrupted (user sent new message, `/stop`, or quit) |
| `model` | `str` | The model identifier |
| `platform` | `str` | Where the session is running |
**Fires:** In two places:
1. **`run_agent.py`** — at the end of every `run_conversation()` call, after all cleanup. Always fires, even if the turn errored.
2. **`cli.py`** — in the CLI's atexit handler, but **only** if the agent was mid-turn (`_agent_running=True`) when the exit occurred. This catches Ctrl+C and `/exit` during processing. In this case, `completed=False` and `interrupted=True`.
**Return value:** Ignored.
**Use cases:** Flushing buffers, closing connections, persisting session state, logging session duration, cleanup of resources initialized in `on_session_start`.
**Example — flush and cleanup:**
```python
_session_caches = {}
def cleanup_session(session_id, completed, interrupted, **kwargs):
cache = _session_caches.pop(session_id, None)
if cache:
# Flush accumulated data to disk or external service
status = "completed" if completed else ("interrupted" if interrupted else "failed")
print(f"Session {session_id} ended: {status}, {cache['tool_calls']} tool calls")
def register(ctx):
ctx.register_hook("on_session_end", cleanup_session)
```
**Example — session duration tracking:**
```python
import time, logging
logger = logging.getLogger(__name__)
_start_times = {}
def on_start(session_id, **kwargs):
_start_times[session_id] = time.time()
def on_end(session_id, completed, interrupted, **kwargs):
start = _start_times.pop(session_id, None)
if start:
duration = time.time() - start
logger.info("SESSION_DURATION session=%s seconds=%.1f completed=%s interrupted=%s",
session_id, duration, completed, interrupted)
def register(ctx):
ctx.register_hook("on_session_start", on_start)
ctx.register_hook("on_session_end", on_end)
```
---
See the **[Build a Plugin guide](/docs/guides/build-a-hermes-plugin)** for the full walkthrough including tool schemas, handlers, and advanced hook patterns.

View file

@ -103,12 +103,12 @@ Plugins can register callbacks for these lifecycle events. See the **[Event Hook
| Hook | Fires when |
|------|-----------|
| `pre_tool_call` | Before any tool executes |
| `post_tool_call` | After any tool returns |
| `pre_llm_call` | Once per turn, before the LLM loop — can return `{"context": "..."}` to inject into the system prompt |
| `post_llm_call` | Once per turn, after the LLM loop completes |
| `on_session_start` | New session created (first turn only) |
| `on_session_end` | End of every `run_conversation` call |
| [`pre_tool_call`](/docs/user-guide/features/hooks#pre_tool_call) | Before any tool executes |
| [`post_tool_call`](/docs/user-guide/features/hooks#post_tool_call) | After any tool returns |
| [`pre_llm_call`](/docs/user-guide/features/hooks#pre_llm_call) | Once per turn, before the LLM loop — can return `{"context": "..."}` to [inject context into the user message](/docs/user-guide/features/hooks#pre_llm_call) |
| [`post_llm_call`](/docs/user-guide/features/hooks#post_llm_call) | Once per turn, after the LLM loop (successful turns only) |
| [`on_session_start`](/docs/user-guide/features/hooks#on_session_start) | New session created (first turn only) |
| [`on_session_end`](/docs/user-guide/features/hooks#on_session_end) | End of every `run_conversation` call + CLI exit handler |
## Managing plugins