diff --git a/docs/self-evolution-design.html b/docs/self-evolution-design.html
new file mode 100644
index 0000000000..5736299e17
--- /dev/null
+++ b/docs/self-evolution-design.html
@@ -0,0 +1,911 @@
+
+
+
+
+
+Hermes Agent 自我优化与持续进化系统设计
+
+
+
+
+
+
+
Hermes Agent 自我优化与持续进化系统
+
一套完全插件化的 agent 自我进化机制 — 通过每日"梦境整理"和"飞书审批流"实现闭环自我优化
+
+ 零侵入核心代码
+ 完全插件化
+ GLM-5.1 / Qwen 降级
+ 飞书审批流
+
+
+
+
+核心架构:五层闭环
+观察 → 评估 → 反思 → 学习 → 进化,形成持续自我改进的闭环循环。
+
+
+
+
+ 📡
+ 观察
+ 遥测采集
post_tool_call
+
+
→
+
+ 📊
+ 评估
+ 质量评分
on_session_end
+
+
→
+
+ 🌙
+ 反思
+ 梦境整理
凌晨 1:00
+
+
→
+
+ 🧠
+ 学习
+ 进化提案
策略生成
+
+
→
+
+ 🚀
+ 进化
+ 飞书审批 → 执行
19:00 推送
+
+
↩
+
+ 💾
+ 存储
+ evolution.db
strategies.json
+
+
+
+
+
+每日流程
+从凌晨梦境整理到晚间飞书推送,一天的自动进化循环。
+
+
+
+
01:00 — 梦境整理(自动执行)
+
DreamEngine.run() — 分析前日全部 session
+
+
+
1
+
+
数据汇总
+
读取 state.db(只读)+ evolution.db,计算各 session 质量评分
+
+
+
+
2
+
+
错误分析(重点)
+
+ - 工具调用失败统计(按工具、按错误类型分布)
+ - 反复重试检测(同一工具同一 session 调用 > 2次)
+ - 未完成 session、用户纠正消息、API 错误
+ - 错误连锁分析(一个失败是否引发后续失败)
+
+
+
+
+
3
+
+
时间浪费分析(重点)
+
+ - 耗时最长的工具调用 TOP 10
+ - 重复操作(多次读同一文件、重复搜索)
+ - 低效 session(迭代轮数过多、工具调用过多)
+ - 可缩短的工具调用链
+
+
+
+
+
4
+
+
深度反思(GLM-5.1 优先 / Qwen 降级)
+
将分析结果发送到本地模型,产出结构化 ReflectionReport:错误根因 + 浪费根因 + 可操作建议
+
+
+
+
5
+
+
模式识别 + 生成进化提案
+
高成功率模式 → 候选技能 | 重复错误 → 候选规避策略 | 系统性浪费 → 候选流程优化
+
+
+
+
+
+
+
19:00 — 飞书推送进化方案
+
FeishuNotifier.send_daily_report()
+
读取当日凌晨产出的 pending_approval 提案,格式化为飞书交互卡片推送给用户。
+
+
+
+
用户审批后 — 执行进化
+
EvolutionExecutor.execute()
+
飞书回调触发执行:技能创建 / 策略调整 / 记忆更新 / 工具偏好变更。执行后自动创建 A/B 测试追踪单元。
+
+
+
+
+飞书卡片消息预览
+
+
+
+
📊 前日概况
+
完成 sessions23
+
平均质量评分0.78 ↑0.03
+
工具调用 / 成功率156次 / 91%
+
+
+
❌ 错误分析
+
browser_tool 失败5次 (超时3次)
+
未完成 session2个
+
用户纠正3次
+
+
+
⏱️ 时间浪费分析
+
重复读取同一文件8次
+
web_search→browser 冗余6次
+
平均迭代轮数12轮 (理想8轮)
+
+
+
+
📋 进化提案 (3项)
+
+
[1] 🛠️ 创建技能: web_search_pipeline
+
预期: 搜索任务成功率 +15% | 风险: low
+
+
+
+
+
+
+
+
[2] ⚡ 策略调整: 优先 grep 替代 find
+
预期: 文件搜索效率 +25% | 风险: low
+
+
+
+
+
+
+
+
[3] 🧠 记忆更新: 用户偏好中文回复
+
预期: 用户满意度提升 | 风险: low
+
+
+
+
+
+
+
+
+
+
+质量评分体系
+每个 session 结束时自动计算复合质量评分,零 API 成本。
+
+
+ session_quality =
+ 0.40 × completion_rate +
+ 0.20 × efficiency_score +
+ 0.15 × cost_efficiency +
+ 0.25 × satisfaction_proxy
+
+
+
+
+
completion_rate 权重 0.40
+
任务是否完成。completed=1.0, interrupted=0.5, failed=0.0
+
+
+
efficiency_score 权重 0.20
+
迭代效率。理想轮数 / 实际轮数,上限 1.0
+
+
+
cost_efficiency 权重 0.15
+
工具使用效率。期望调用数 / 实际调用数,上限 1.0
+
+
+
satisfaction_proxy 权重 0.25
+
满意度代理。单轮完成=0.9, 多轮完成=0.75, 预算耗尽=-0.2
+
+
+
+
+Claude Code 设计参考
+本方案借鉴了 Claude Code 开源项目中的四个核心设计模式。
+
+
+
+
plugins/hookify/agents/conversation-analyzer.md
+
梦境整理 ← conversation-analyzer
+
+ 分析对话历史 → 识别纠正/沮丧/重复问题信号 → 提取可匹配正则规则 → 按严重程度分级(高/中/低)。
+
我们的扩展:从手动触发升级为每日自动运行,增加错误分析和时间浪费分析。
+
+
+
+
plugins/ralph-wiggum/
+
进化执行 ← Ralph Wiggum
+
+ 自我引用反馈环:Stop hook 拦截退出 → 重喂 prompt → agent 看到自己的修改 → 自动迭代直到满足条件。
+
我们的扩展:进化执行后创建验证追踪单元(类似 completion_promise),不满足条件自动回滚。
+
+
+
+
plugins/learning-output-style/
+
策略注入 ← SessionStart hook
+
+ 通过 SessionStart hook 在每个 session 自动注入行为上下文,等效于 CLAUDE.md 但更灵活。
+
我们的扩展:使用 pre_llm_call 钩子注入已学习的行为提示,完全隔离于核心代码。
+
+
+
+
plugins/hookify/core/rule_engine.py
+
规则引擎 ← rule_engine
+
+ LRU 缓存编译正则(128 上限),支持 regex_match/contains/equals/not_contains,区分 block/warn 级别。
+
我们的扩展:策略注入条件化,根据 session 特征(平台/任务类型/模型)匹配最相关规则。
+
+
+
+
+
+隔离策略:零侵入核心代码
+所有功能以插件形式实现,通过钩子集成,不修改任何上游核心文件。
+
+
+
+
插件文件结构
+
+self_evolution/
+├── plugin.yaml
+├── __init__.py
+├── db.py
+├── hooks.py
+├── quality_scorer.py
+├── reflection_engine.py
+├── rule_engine.py
+├── evolution_proposer.py
+├── evolution_executor.py
+├── feishu_notifier.py
+├── strategy_injector.py
+├── strategy_store.py
+├── cron_jobs.py
+├── models.py
+├── agents/
+│ ├── dream_analyzer.md
+│ └── evolution_planner.md
+└── prompts/
+ └── reflection.md
+
+
+
+
钩子集成方式
+
+ | 功能 | 集成方式 | 修改核心 |
+ | 工具调用遥测 | post_tool_call | NO |
+ | Session 评分 | on_session_end | NO |
+ | 策略注入 | pre_llm_call | NO |
+ | 定时任务 | cron/jobs.json | NO |
+ | 飞书通知 | gateway/ 飞书网关 | NO |
+ | 技能创建 | skill_manager_tool | NO |
+ | 记忆更新 | memory_tool | NO |
+ | 历史数据 | state.db 只读 | NO |
+
+
+
+
+
+独立数据库设计
+独立于核心 state.db,7 张表存储于 ~/.hermes/self_evolution/evolution.db
+
+
+
+
tool_invocations
+
session_id TEXT
+
tool_name TEXT
+
duration_ms INT
+
success BOOL
+
error_type TEXT
+
+
+
session_scores
+
session_id TEXT PK
+
composite_score REAL
+
completion_rate REAL
+
efficiency_score REAL
+
task_category TEXT
+
+
+
outcome_signals
+
session_id TEXT
+
signal_type TEXT
+
signal_value REAL
+
metadata TEXT JSON
+
+
+
reflection_reports
+
sessions_analyzed INT
+
avg_score REAL
+
error_summary TEXT
+
worst_patterns TEXT JSON
+
recommendations TEXT JSON
+
+
+
evolution_proposals
+
id TEXT PK
+
proposal_type TEXT
+
title, description TEXT
+
status TEXT pending→approved→executed
+
+
+
improvement_units
+
proposal_id TEXT FK
+
baseline_score REAL
+
current_score REAL
+
status TEXT active→promoted / reverted
+
+
+
strategy_versions
+
version INT
+
strategies_json TEXT
+
avg_score REAL
+
active_from / active_until REAL
+
+
+
+
+安全机制:防止退化漂移
+六层防护确保进化方向正确且可回滚。
+
+
+
+
🗄️
+
独立数据库
+
不碰 state.db,上游 schema 变更无影响
+
+
+
🔒
+
只读核心
+
所有集成通过钩子完成,不修改核心文件
+
+
+
🚧
+
人工闸门
+
进化方案必须通过飞书审批,不自动执行
+
+
+
⏪
+
版本回滚
+
策略变更版本化,评分连续下降自动回滚
+
+
+
🛡️
+
有界变更
+
只能写 PERFORMANCE.md、创建 learned skills
+
+
+
📚
+
拒绝学习
+
被拒绝的提案会被分析,避免重复提出
+
+
+
+
+实施路径
+四个阶段,每阶段约 1 周。
+
+
+
+
01
+
基础设施
+
+ - 插件骨架
+ - 独立数据库 db.py
+ - 遥测采集 hooks.py
+ - 质量评分器
+
+
+
+
02
+
梦境整理
+
+ - 反思引擎 reflection_engine.py
+ - 错误分析 + 时间浪费分析
+ - 进化提案生成器
+ - 凌晨 1:00 cron 注册
+
+
+
+
03
+
飞书审批
+
+ - 飞书通知器 feishu_notifier.py
+ - 卡片消息 + 按钮回调
+ - 19:00 cron 注册
+
+
+
+
04
+
进化执行
+
+ - 进化执行器 + 回滚
+ - 策略注入 + 规则引擎
+ - 策略存储 + 版本管理
+ - A/B 测试追踪
+
+
+
+
+
+模型配置
+
+
+
+model:
+ primary:
+ provider: "zhipu"
+ model: "glm-5.1"
+ fallback:
+ provider: "ollama"
+ model: "qwen3:32b"
+ base_url: "http://localhost:11434"
+
+schedule:
+ dream_time: "0 1 * * *"
+ propose_time: "0 19 * * *"
+
+
+
+
+
+
 Hermes Agent Self-Evolution System — Designed with reference to Claude Code open-source patterns
+
conversation-analyzer · Ralph Wiggum · learning-output-style · rule_engine
+
+
+
+
diff --git a/self_evolution/__init__.py b/self_evolution/__init__.py
new file mode 100644
index 0000000000..d2b4001c2b
--- /dev/null
+++ b/self_evolution/__init__.py
@@ -0,0 +1,43 @@
+"""
+Self Evolution Plugin
+=====================
+
+Agent self-optimization and continuous evolution system.
+
+Architecture:
+ - Telemetry: collects tool/session data via hooks
+ - Quality Scorer: evaluates session outcomes
+ - Dream Engine: nightly reflection at 1:00
+ - Evolution Proposer: generates improvement proposals
+ - Feishu Notifier: pushes proposals at 19:00 for user approval
+ - Evolution Executor: applies approved changes with rollback support
+ - Strategy Injector: injects learned hints into sessions
+
+Design references from Claude Code:
+ - conversation-analyzer (hookify): dream analysis pattern
+ - Ralph Wiggum: iterative evolution with rollback
+ - learning-output-style: session-start strategy injection
+ - rule_engine (hookify): conditional strategy matching
+"""
+
+from __future__ import annotations
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+
def register(ctx) -> None:
    """Plugin entry point invoked by the Hermes PluginManager.

    Wires up:
      - 3 hooks: post_tool_call, on_session_end, pre_llm_call
      - 3 slash commands: /evolve, /reflect, /evolution_status
    """
    # Imported lazily so plugin discovery stays cheap when the plugin is off.
    from self_evolution.db import init_db
    init_db()  # make sure evolution.db exists before any hook can fire

    from self_evolution.hooks import register_all
    register_all(ctx)

    logger.info("self_evolution plugin loaded: 3 hooks, telemetry active")
diff --git a/self_evolution/agents/dream_analyzer.md b/self_evolution/agents/dream_analyzer.md
new file mode 100644
index 0000000000..60a631923f
--- /dev/null
+++ b/self_evolution/agents/dream_analyzer.md
@@ -0,0 +1,82 @@
+---
+name: dream_analyzer
+description: >
+ 用于每日梦境整理的分析 agent。
+ 分析前日所有 session 的工具调用、错误模式、时间浪费,
+ 产出结构化的反思报告和进化提案。
+model: inherit
+tools: ["Read", "Grep"]
+---
+
+你是 Hermes Agent 的性能分析专家。你的任务是分析 agent 的运行数据,识别问题和优化机会。
+
+## 分析流程
+
+### 1. 错误信号检测
+
+参考 Claude Code conversation-analyzer 的模式,搜索以下信号:
+
+**显式纠正信号:**
+- 用户消息包含 "不对"、"错误"、"重试"、"不要"
+- 用户消息包含 "stop"、"wrong"、"retry"、"don't"
+
+**沮丧反应信号:**
+- "为什么你做了X?"、"那不是我说的"
+- "太慢了"、"浪费时间"
+
+**用户回退信号:**
+- 用户撤销了 agent 的修改
+- 用户手动修复了 agent 的问题
+
+**重复问题:**
+- 同类错误在多个 session 中出现
+
+### 2. 错误严重程度分级
+
+**高严重度(应创建规避规则):**
+- 系统性工具失败(同一工具多次失败)
+- 安全相关问题
+- 数据丢失风险
+
+**中严重度(应警告):**
+- 效率问题(重复操作、不必要的步骤)
+- 风格不一致
+- 非关键错误
+
+**低严重度(可选优化):**
+- 用户偏好
+- 非关键的模式改进
+
+### 3. 时间浪费分析
+
+重点分析:
+- 耗时最长的工具调用
+- 重复操作(多次读同一文件、重复搜索)
+- 工具调用链中的不必要步骤
+- 迭代轮数过多的 session
+
+### 4. 输出格式
+
+必须按 JSON 格式输出:
+
+```json
+{
+ "worst_patterns": ["模式描述1", "模式描述2"],
+ "best_patterns": ["成功模式描述1"],
+ "tool_insights": {
+ "tool_name": {"success_rate": 0.95, "avg_duration_ms": 500, "recommendation": "建议"}
+ },
+ "recommendations": [
+ "具体的可操作建议1",
+ "具体的可操作建议2"
+ ]
+}
+```
+
+### 5. 质量标准
+
+- 每个建议都必须是具体的、可操作的
+- 包含实际的例子
+- 解释为什么这个问题值得修复
+- 提供可直接使用的规则或策略
+- 不要对假设性讨论产生误报
diff --git a/self_evolution/agents/evolution_planner.md b/self_evolution/agents/evolution_planner.md
new file mode 100644
index 0000000000..7bdbee0941
--- /dev/null
+++ b/self_evolution/agents/evolution_planner.md
@@ -0,0 +1,51 @@
+---
+name: evolution_planner
+description: >
+ 用于将反思报告转化为具体进化方案的规划 agent。
+ 生成技能创建、策略调整、记忆更新等具体方案。
+model: inherit
+tools: ["Read", "Grep"]
+---
+
+你是 Hermes Agent 的进化规划专家。你的任务是将性能分析结论转化为具体的、可执行的进化方案。
+
+## 方案类型
+
+### 技能创建 (skill)
+当发现可复用的成功模式时,建议创建新技能:
+- 描述技能的触发条件和执行步骤
+- 包含具体的 prompt 模板
+- 标注适用的场景
+
+### 策略调整 (strategy)
+当发现效率问题或错误模式时,建议创建策略规则:
+- 定义匹配条件(工具名、平台、任务类型)
+- 提供策略提示文本
+- 标注严重程度(hint | avoid | prefer)
+
+### 记忆更新 (memory)
+当发现关于用户偏好或环境特性的信息时,建议更新记忆:
+- 写入 PERFORMANCE.md
+- 内容简洁、可操作
+- 避免主观判断
+
+### 工具偏好 (tool_preference)
+当发现工具使用效率差异时,建议调整偏好:
+- 基于数据说明为什么A优于B
+- 提供具体的替换建议
+
+## 输出格式
+
+每个方案必须包含:
+1. **标题**:简短描述(<50字)
+2. **描述**:详细说明变更内容
+3. **预期影响**:定量或定性的改善预期
+4. **风险评估**:low / medium / high
+5. **回滚方案**:如何安全地撤销此变更
+
+## 质量标准
+
+- 每个方案只变更一个变量
+- 方案必须是可测量、可回滚的
+- 优先高影响、低风险的方案
+- 每次最多提出 5 个方案
diff --git a/self_evolution/cron_jobs.py b/self_evolution/cron_jobs.py
new file mode 100644
index 0000000000..13ba0c1b81
--- /dev/null
+++ b/self_evolution/cron_jobs.py
@@ -0,0 +1,115 @@
+"""
+Self Evolution Plugin — Cron Job Registration
+==============================================
+
+Registers two cron jobs:
+ 1. dream_time (1:00): Run dream consolidation
+ 2. propose_time (19:00): Push proposals via Feishu
+
+Uses Hermes' existing cron system (cron/jobs.json).
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from pathlib import Path
+from typing import Optional
+
+logger = logging.getLogger(__name__)
+
+from self_evolution.paths import CRON_DIR
+
+CRON_FILE = CRON_DIR / "jobs.json"
+
+DREAM_JOB_ID = "self_evolution_dream"
+PROPOSE_JOB_ID = "self_evolution_propose"
+
+
def register_cron_jobs():
    """Register the two self_evolution cron jobs if not already present.

    Idempotent: jobs are matched by id and never duplicated, and the jobs
    file is only rewritten (and the registration logged) when at least one
    job was actually added — repeated startups no longer touch the file.
    """
    CRON_DIR.mkdir(parents=True, exist_ok=True)

    jobs = _load_jobs()
    existing_ids = {j.get("id") for j in jobs}

    # Resolve model config from hermes unified config so cron jobs run with
    # the same model/provider as the reflection engine.
    from self_evolution.reflection_engine import _resolve_runtime_config
    runtime = _resolve_runtime_config()
    model = runtime.get("model", "")
    provider = runtime.get("provider", "")

    added = False

    # Dream consolidation at 1:00
    if DREAM_JOB_ID not in existing_ids:
        jobs.append({
            "id": DREAM_JOB_ID,
            "name": "Self Evolution - Dream Consolidation",
            "prompt": "运行自我进化的梦境整理:分析前日session的错误和浪费时间问题,生成进化提案。",
            "schedule": "0 1 * * *",
            "model": model,
            "provider": provider,
            "deliver": "[SILENT]",
            "skill": "self_evolution:dream",
        })
        added = True

    # Proposal push at 19:00
    if PROPOSE_JOB_ID not in existing_ids:
        jobs.append({
            "id": PROPOSE_JOB_ID,
            "name": "Self Evolution - Proposal Push",
            "prompt": "推送今日自我进化提案到飞书。",
            "schedule": "0 19 * * *",
            "model": model,
            "provider": provider,
            "deliver": "[SILENT]",
            "skill": "self_evolution:propose",
        })
        added = True

    if added:
        _save_jobs(jobs)
        logger.info("Registered self_evolution cron jobs: dream=1:00, propose=19:00")
+
+
def run_dream_job():
    """Run the nightly dream-consolidation job (cron entry, 1:00).

    DreamEngine constructed with no args auto-resolves its model config via
    resolve_runtime_provider(); the run is capped at 6 hours of wall time.
    """
    from self_evolution.reflection_engine import DreamEngine

    report = DreamEngine().run(hours=24, max_runtime_seconds=6 * 3600)

    if not report:
        logger.info("Dream consolidation: no data to analyze")
        return
    logger.info(
        "Dream consolidation complete: score=%.3f, proposals generated",
        report.avg_score,
    )
+
+
def run_propose_job():
    """Push today's pending evolution proposals to Feishu (cron entry, 19:00)."""
    from self_evolution.feishu_notifier import FeishuNotifier

    FeishuNotifier().send_daily_report()
+
+
def _load_jobs() -> list:
    """Read the cron jobs file; an absent or corrupt file yields []."""
    try:
        raw = CRON_FILE.read_text(encoding="utf-8")
    except OSError:
        return []
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        return []
+
+
def _save_jobs(jobs: list):
    """Serialize *jobs* to the cron file as pretty-printed UTF-8 JSON."""
    payload = json.dumps(jobs, ensure_ascii=False, indent=2)
    CRON_FILE.write_text(payload, encoding="utf-8")
diff --git a/self_evolution/db.py b/self_evolution/db.py
new file mode 100644
index 0000000000..04cc1bc0cd
--- /dev/null
+++ b/self_evolution/db.py
@@ -0,0 +1,296 @@
+"""
+Self Evolution Plugin — Independent SQLite Database
+=====================================================
+Independent from state.db to avoid upstream schema conflicts.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import sqlite3
+import threading
+import time
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+logger = logging.getLogger(__name__)
+
+from self_evolution.paths import DATA_DIR as DB_DIR, DB_PATH
+
+SCHEMA_VERSION = 1
+
+VALID_TABLES = frozenset({
+ "tool_invocations", "session_scores", "outcome_signals",
+ "reflection_reports", "evolution_proposals", "improvement_units",
+ "strategy_versions", "_meta",
+})
+
+
def _validate_table(table: str) -> None:
    """Guard against SQL injection via table names: only known schema tables pass."""
    if table in VALID_TABLES:
        return
    raise ValueError(f"Invalid table name: {table!r}")
+
+
+SCHEMA = """
+-- Tool invocation telemetry
+CREATE TABLE IF NOT EXISTS tool_invocations (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ session_id TEXT NOT NULL,
+ tool_name TEXT NOT NULL,
+ duration_ms INTEGER,
+ success BOOLEAN NOT NULL,
+ error_type TEXT,
+ turn_number INTEGER,
+ created_at REAL NOT NULL DEFAULT (strftime('%s','now'))
+);
+
+-- Session quality scores
+CREATE TABLE IF NOT EXISTS session_scores (
+ session_id TEXT PRIMARY KEY,
+ composite_score REAL,
+ completion_rate REAL,
+ efficiency_score REAL,
+ cost_efficiency REAL,
+ satisfaction_proxy REAL,
+ task_category TEXT,
+ model TEXT,
+ created_at REAL NOT NULL DEFAULT (strftime('%s','now'))
+);
+
+-- Outcome signals
+CREATE TABLE IF NOT EXISTS outcome_signals (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ session_id TEXT NOT NULL,
+ signal_type TEXT NOT NULL,
+ signal_value REAL,
+ metadata TEXT,
+ created_at REAL NOT NULL DEFAULT (strftime('%s','now'))
+);
+
+-- Reflection reports
+CREATE TABLE IF NOT EXISTS reflection_reports (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ period_start REAL,
+ period_end REAL,
+ sessions_analyzed INTEGER,
+ avg_score REAL,
+ error_summary TEXT DEFAULT '',
+ waste_summary TEXT DEFAULT '',
+ code_change_summary TEXT DEFAULT '',
+ worst_patterns TEXT DEFAULT '[]',
+ best_patterns TEXT DEFAULT '[]',
+ tool_insights TEXT DEFAULT '{}',
+ recommendations TEXT DEFAULT '[]',
+ model_used TEXT DEFAULT '',
+ created_at REAL NOT NULL DEFAULT (strftime('%s','now'))
+);
+
+-- Evolution proposals
+CREATE TABLE IF NOT EXISTS evolution_proposals (
+ id TEXT PRIMARY KEY,
+ report_id INTEGER REFERENCES reflection_reports(id),
+ proposal_type TEXT NOT NULL,
+ title TEXT NOT NULL,
+ description TEXT NOT NULL,
+ expected_impact TEXT DEFAULT '',
+ risk_assessment TEXT DEFAULT 'low',
+ rollback_plan TEXT DEFAULT '',
+ status TEXT NOT NULL DEFAULT 'pending_approval',
+ user_feedback TEXT DEFAULT '',
+ created_at REAL NOT NULL DEFAULT (strftime('%s','now')),
+ resolved_at REAL
+);
+
+-- Improvement unit tracking (A/B testing)
+CREATE TABLE IF NOT EXISTS improvement_units (
+ id TEXT PRIMARY KEY,
+ proposal_id TEXT REFERENCES evolution_proposals(id),
+ change_type TEXT NOT NULL,
+ version INTEGER DEFAULT 0,
+ baseline_score REAL DEFAULT 0.0,
+ current_score REAL DEFAULT 0.0,
+ sessions_sampled INTEGER DEFAULT 0,
+ min_sessions INTEGER DEFAULT 10,
+ min_improvement REAL DEFAULT 0.05,
+ max_regression REAL DEFAULT 0.10,
+ status TEXT NOT NULL DEFAULT 'active',
+ created_at REAL NOT NULL DEFAULT (strftime('%s','now')),
+ resolved_at REAL
+);
+
+-- Strategy version history
+CREATE TABLE IF NOT EXISTS strategy_versions (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ version INTEGER NOT NULL,
+ strategies_json TEXT NOT NULL,
+ avg_score REAL,
+ active_from REAL NOT NULL,
+ active_until REAL
+);
+
+-- Schema version tracking
+CREATE TABLE IF NOT EXISTS _meta (
+ key TEXT PRIMARY KEY,
+ value TEXT NOT NULL
+);
+
+-- Indexes
+CREATE INDEX IF NOT EXISTS idx_tool_invocations_session ON tool_invocations(session_id);
+CREATE INDEX IF NOT EXISTS idx_tool_invocations_created ON tool_invocations(created_at);
+CREATE INDEX IF NOT EXISTS idx_session_scores_created ON session_scores(created_at);
+CREATE INDEX IF NOT EXISTS idx_outcome_signals_session ON outcome_signals(session_id);
+CREATE INDEX IF NOT EXISTS idx_evolution_proposals_status ON evolution_proposals(status);
+CREATE INDEX IF NOT EXISTS idx_improvement_units_status ON improvement_units(status);
+"""
+
+
def _ensure_dir():
    # Make sure the directory holding evolution.db exists before connecting.
    DB_DIR.mkdir(parents=True, exist_ok=True)
+
+
+_local = threading.local()
+
+
def get_connection() -> sqlite3.Connection:
    """Return a thread-local cached connection (reused across calls).

    A cheap liveness probe (``SELECT 1``) detects stale or closed handles;
    a dead connection is discarded and replaced transparently.
    """
    cached = getattr(_local, "conn", None)
    if cached is not None:
        try:
            cached.execute("SELECT 1")
        except sqlite3.Error:
            # Stale handle — close best-effort and fall through to reconnect.
            try:
                cached.close()
            except Exception:
                pass
        else:
            return cached
    _ensure_dir()
    fresh = sqlite3.connect(str(DB_PATH))
    fresh.row_factory = sqlite3.Row
    fresh.execute("PRAGMA journal_mode=WAL")
    fresh.execute("PRAGMA foreign_keys=ON")
    _local.conn = fresh
    return fresh
+
+
def close_connection():
    """Close and forget the thread-local connection (test cleanup / teardown)."""
    conn = getattr(_local, "conn", None)
    _local.conn = None
    if conn is not None:
        try:
            conn.close()
        except Exception:
            pass  # already closed or interpreter shutting down — nothing to do
+
+
def init_db():
    """Create all tables/indexes and record the schema version.

    Idempotent: the schema uses CREATE ... IF NOT EXISTS throughout, so
    repeated calls are safe.
    """
    conn = get_connection()
    conn.executescript(SCHEMA)
    conn.execute(
        "INSERT OR REPLACE INTO _meta (key, value) VALUES (?, ?)",
        ("schema_version", str(SCHEMA_VERSION)),
    )
    conn.commit()
    logger.info("self_evolution database initialized at %s", DB_PATH)

    # Schema migration: databases created before code_change_summary existed
    # get the column added here; fresh databases already have it via SCHEMA.
    try:
        conn.execute("ALTER TABLE reflection_reports ADD COLUMN code_change_summary TEXT DEFAULT ''")
        logger.info("Added code_change_summary column to reflection_reports")
    except sqlite3.OperationalError:
        pass  # Column already exists

    # Close after init so subsequent calls get a fresh connection with the new schema
    close_connection()
+
+
+# ── Generic CRUD ─────────────────────────────────────────────────────────
+
def insert(table: str, data: dict) -> int:
    """Insert one row into *table* and return its rowid."""
    _validate_table(table)
    conn = get_connection()
    columns = list(data)
    statement = "INSERT INTO {} ({}) VALUES ({})".format(
        table, ", ".join(columns), ", ".join("?" * len(columns))
    )
    cursor = conn.execute(statement, [data[c] for c in columns])
    conn.commit()
    return cursor.lastrowid
+
+
def insert_many(table: str, rows: List[dict]):
    """Bulk-insert rows; the column set is taken from the first row's keys."""
    _validate_table(table)
    if not rows:
        return
    conn = get_connection()
    columns = list(rows[0])
    statement = "INSERT INTO {} ({}) VALUES ({})".format(
        table, ", ".join(columns), ", ".join("?" * len(columns))
    )
    # Missing keys in later rows become NULL via dict.get.
    conn.executemany(statement, [[row.get(col) for col in columns] for row in rows])
    conn.commit()
+
+
def update(table: str, data: dict, where: str, where_params: tuple = ()):
    """UPDATE rows matching *where* with the column/value pairs in *data*."""
    _validate_table(table)
    conn = get_connection()
    assignments = ", ".join(f"{col} = ?" for col in data)
    conn.execute(
        f"UPDATE {table} SET {assignments} WHERE {where}",
        [*data.values(), *where_params],
    )
    conn.commit()
+
+
def fetch_one(table: str, where: str = "", params: tuple = ()) -> Optional[Dict[str, Any]]:
    """Return the first matching row as a dict, or None when nothing matches."""
    _validate_table(table)
    conn = get_connection()
    clause = f" WHERE {where}" if where else ""
    row = conn.execute(f"SELECT * FROM {table}{clause} LIMIT 1", params).fetchone()
    return dict(row) if row is not None else None
+
+
def fetch_all(table: str, where: str = "", params: tuple = (),
              order_by: str = "", limit: int = 0) -> List[Dict[str, Any]]:
    """Return all matching rows as dicts, optionally ordered and limited."""
    _validate_table(table)
    conn = get_connection()
    parts = [f"SELECT * FROM {table}"]
    if where:
        parts.append(f"WHERE {where}")
    if order_by:
        parts.append(f"ORDER BY {order_by}")
    if limit:
        parts.append(f"LIMIT {int(limit)}")
    cursor = conn.execute(" ".join(parts), params)
    return [dict(r) for r in cursor.fetchall()]
+
+
def query(sql: str, params: tuple = ()) -> List[Dict[str, Any]]:
    """Run a raw SELECT; the caller is responsible for the SQL's safety."""
    rows = get_connection().execute(sql, params).fetchall()
    return [dict(r) for r in rows]
+
+
def execute(sql: str, params: tuple = ()):
    """Run a raw statement and commit; the caller owns the SQL's safety."""
    conn = get_connection()
    conn.execute(sql, params)
    conn.commit()
+
+
def cleanup(days: int = 30):
    """Purge telemetry rows (tool_invocations, outcome_signals) older than *days*."""
    cutoff = time.time() - days * 86400
    conn = get_connection()
    for tbl in ("tool_invocations", "outcome_signals"):
        conn.execute(f"DELETE FROM {tbl} WHERE created_at < ?", (cutoff,))
    conn.commit()
    logger.info("Cleaned up data older than %d days", days)
diff --git a/self_evolution/evolution_executor.py b/self_evolution/evolution_executor.py
new file mode 100644
index 0000000000..123dd6a46f
--- /dev/null
+++ b/self_evolution/evolution_executor.py
@@ -0,0 +1,325 @@
+"""
+Self Evolution Plugin — Evolution Executor
+============================================
+
+Executes approved evolution proposals with rollback support.
+
+Design reference: Claude Code plugins/ralph-wiggum/
+ - Self-referential feedback loop: execute → verify → rollback if needed
+ - Each change has a "completion promise" (verification criteria)
+ - Iteration > Perfection
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import time
+import uuid
+from pathlib import Path
+from typing import Optional
+
+from self_evolution import db
+from self_evolution.models import Proposal, ImprovementUnit
+
+logger = logging.getLogger(__name__)
+
+from self_evolution.paths import DATA_DIR as STRATEGIES_DIR, STRATEGIES_FILE, ARCHIVE_DIR
+from self_evolution.paths import SKILLS_DIR, MEMORIES_DIR
+
+
+class EvolutionExecutor:
+ """Execute approved evolution proposals.
+
+ Supported proposal types:
+ - skill: create a new skill via skill_manager_tool
+ - strategy: update strategy rules
+ - memory: update PERFORMANCE.md via memory_tool
+ - tool_preference: update tool preference config
+ """
+
+ def execute(self, proposal: Proposal):
+ """Execute an approved proposal."""
+ logger.info("Executing proposal: %s (%s)", proposal.id, proposal.proposal_type)
+
+ try:
+ match proposal.proposal_type:
+ case "skill":
+ self._create_skill(proposal)
+ case "strategy":
+ self._update_strategy(proposal)
+ case "memory":
+ self._update_memory(proposal)
+ case "tool_preference":
+ self._update_tool_preference(proposal)
+ case "code_improvement":
+ self._save_optimization_request(proposal)
+
+ # Mark as executed
+ db.update(
+ "evolution_proposals",
+ {"status": "executed", "resolved_at": time.time()},
+ where="id = ?",
+ where_params=(proposal.id,),
+ )
+
+ # Create improvement tracking unit
+ self._create_tracking_unit(proposal)
+
+ logger.info("Proposal %s executed successfully", proposal.id)
+
+ except Exception as exc:
+ logger.exception("Failed to execute proposal %s: %s", proposal.id, exc)
+ db.update(
+ "evolution_proposals",
+ {"status": "execution_failed", "resolved_at": time.time()},
+ where="id = ?",
+ where_params=(proposal.id,),
+ )
+
+ def check_and_rollback(self):
+ """Check active improvement units and rollback if needed.
+
+ Called during dream consolidation to verify previous changes.
+ """
+ units = db.fetch_all("improvement_units", where="status = 'active'")
+
+ for unit_data in units:
+ unit = ImprovementUnit(
+ id=unit_data["id"],
+ proposal_id=unit_data["proposal_id"],
+ change_type=unit_data["change_type"],
+ version=unit_data.get("version", 0),
+ baseline_score=unit_data.get("baseline_score", 0),
+ current_score=unit_data.get("current_score", 0),
+ sessions_sampled=unit_data.get("sessions_sampled", 0),
+ min_sessions=unit_data.get("min_sessions", 10),
+ min_improvement=unit_data.get("min_improvement", 0.05),
+ max_regression=unit_data.get("max_regression", 0.10),
+ )
+
+ # Update current score from recent sessions
+ self._update_unit_score(unit)
+
+ if unit.should_revert:
+ self._revert(unit)
+ logger.warning("Rolled back improvement unit %s", unit.id)
+ elif unit.should_promote:
+ self._promote(unit)
+ logger.info("Promoted improvement unit %s", unit.id)
+
+ # ── Proposal Type Handlers ────────────────────────────────────────────
+
+ def _create_skill(self, proposal: Proposal):
+ """Create a new skill via the skill_manager_tool."""
+ from self_evolution.strategy_store import StrategyStore
+
+ store = StrategyStore()
+ skill_dir = SKILLS_DIR / proposal.id
+ skill_dir.mkdir(parents=True, exist_ok=True)
+
+ skill_content = (
+ f"---\n"
+ f"name: {proposal.id}\n"
+ f"description: {proposal.title}\n"
+ f"---\n\n"
+ f"{proposal.description}\n"
+ )
+ (skill_dir / "SKILL.md").write_text(skill_content, encoding="utf-8")
+ logger.info("Created learned skill: %s", skill_dir)
+
+ def _update_strategy(self, proposal: Proposal):
+ """Update strategy rules file with version tracking."""
+ from self_evolution.strategy_store import StrategyStore
+
+ store = StrategyStore()
+ current = store.load()
+
+ # Check for duplicate strategies by title similarity
+ rules = current.get("rules", [])
+ existing_titles = {r.get("name", "").strip().lower() for r in rules}
+ if proposal.title.strip().lower() in existing_titles:
+ logger.warning("Skipping duplicate strategy: %s", proposal.title)
+ return
+
+ # Archive current version
+ version = current.get("version", 0) + 1
+ store.archive(version - 1)
+
+ # Parse new strategy from proposal description
+ new_strategy = {
+ "id": proposal.id,
+ "name": proposal.title,
+ "type": "learned",
+ "description": proposal.description,
+ "hint_text": proposal.description,
+ "conditions": [],
+ "severity": "medium",
+ "created_at": time.time(),
+ }
+
+ # Add to strategies
+ rules.append(new_strategy)
+ current["rules"] = rules
+ current["version"] = version
+
+ store.save(current)
+ logger.info("Updated strategies to version %d", version)
+
+ # Invalidate injector cache so new strategy takes effect immediately
+ from self_evolution.strategy_injector import invalidate_cache
+ invalidate_cache()
+
+ def _update_memory(self, proposal: Proposal):
+ """Update PERFORMANCE.md via the memory system."""
+ perf_path = MEMORIES_DIR / "PERFORMANCE.md"
+ perf_path.parent.mkdir(parents=True, exist_ok=True)
+
+ existing = ""
+ if perf_path.exists():
+ existing = perf_path.read_text(encoding="utf-8")
+
+ # Append new entry
+ timestamp = time.strftime("%Y-%m-%d %H:%M", time.localtime())
+ entry = f"\n## [{timestamp}] 自动学习\n{proposal.description}\n"
+
+ # Keep file under reasonable size (last 50 entries)
+ entries = (existing + entry).split("\n## ")
+ if len(entries) > 50:
+ entries = entries[-50:]
+
+ perf_path.write_text("\n## ".join(entries), encoding="utf-8")
+ logger.info("Updated PERFORMANCE.md")
+
+ def _update_tool_preference(self, proposal: Proposal):
+ """Update tool preference config."""
+ prefs_path = STRATEGIES_DIR / "tool_preferences.json"
+ prefs = {}
+ if prefs_path.exists():
+ prefs = json.loads(prefs_path.read_text(encoding="utf-8"))
+
+ prefs[proposal.id] = {
+ "description": proposal.description,
+ "expected_impact": proposal.expected_impact,
+ "created_at": time.time(),
+ }
+
+ prefs_path.write_text(
+ json.dumps(prefs, ensure_ascii=False, indent=2),
+ encoding="utf-8",
+ )
+ logger.info("Updated tool preferences: %s", proposal.id)
+
+ # ── Tracking & Verification ───────────────────────────────────────────
+
+ def _create_tracking_unit(self, proposal: Proposal):
+ """Create an improvement tracking unit after execution.
+
+ Inspired by Ralph Wiggum's completion_promise pattern.
+ """
+ # Get baseline score from recent sessions
+ recent = db.fetch_all(
+ "session_scores",
+ order_by="created_at DESC",
+ limit=10,
+ )
+ baseline = (
+ sum(s.get("composite_score", 0) for s in recent) / len(recent)
+ if recent else 0
+ )
+
+ unit = ImprovementUnit(
+ id=f"unit-{uuid.uuid4().hex[:8]}",
+ proposal_id=proposal.id,
+ change_type=proposal.proposal_type,
+ baseline_score=baseline,
+ min_sessions=10,
+ min_improvement=0.05,
+ max_regression=0.10,
+ )
+
+ db.insert("improvement_units", unit.to_db_row())
+ logger.info("Created tracking unit: %s (baseline=%.3f)", unit.id, baseline)
+
+ def _update_unit_score(self, unit: ImprovementUnit):
+ """Update the current score for an improvement unit."""
+ # Count sessions since this unit was created
+ unit_data = db.fetch_one("improvement_units", where="id = ?", params=(unit.id,))
+ if not unit_data:
+ return
+
+ created_at = unit_data.get("created_at", 0)
+ recent = db.fetch_all(
+ "session_scores",
+ where="created_at >= ?",
+ params=(created_at,),
+ order_by="created_at DESC",
+ )
+
+ if recent:
+ current_score = sum(s.get("composite_score", 0) for s in recent) / len(recent)
+ sessions_sampled = len(recent)
+
+ db.update(
+ "improvement_units",
+ {
+ "current_score": current_score,
+ "sessions_sampled": sessions_sampled,
+ },
+ where="id = ?",
+ where_params=(unit.id,),
+ )
+ unit.current_score = current_score
+ unit.sessions_sampled = sessions_sampled
+
+ def _revert(self, unit: ImprovementUnit):
+ """Revert a change by restoring the previous version."""
+ from self_evolution.strategy_store import StrategyStore
+
+ store = StrategyStore()
+ if unit.version > 0:
+ old = store.load_archive(unit.version - 1)
+ if old:
+ store.save(old)
+
+ db.update(
+ "improvement_units",
+ {"status": "reverted", "resolved_at": time.time()},
+ where="id = ?",
+ where_params=(unit.id,),
+ )
+
+ def _promote(self, unit: ImprovementUnit):
+ """Promote an improvement unit from active to permanent."""
+ db.update(
+ "improvement_units",
+ {"status": "promoted", "resolved_at": time.time()},
+ where="id = ?",
+ where_params=(unit.id,),
+ )
+
+ # ── Code Improvement (save request document) ────────────────────────────
+
+ def _save_optimization_request(self, proposal: Proposal):
+ """Save a code improvement request as a document.
+
+ Does NOT auto-modify code. The user reviews the request and decides
+ whether to implement changes manually or via Claude Code.
+ """
+ req_dir = DATA_DIR / "optimization_requests"
+ req_dir.mkdir(parents=True, exist_ok=True)
+ doc_path = req_dir / f"{proposal.id}.md"
+
+ doc_content = (
+ f"# 程序优化需求\n\n"
+ f"**标题**: {proposal.title}\n"
+ f"**预期影响**: {proposal.expected_impact}\n"
+ f"**风险评估**: {proposal.risk_assessment}\n"
+ f"**回滚方案**: {proposal.rollback_plan}\n"
+ f"**创建时间**: {time.strftime('%Y-%m-%d %H:%M', time.localtime())}\n\n"
+ f"---\n\n"
+ f"{proposal.description}\n"
+ )
+
+ doc_path.write_text(doc_content, encoding="utf-8")
+ logger.info("Saved optimization request: %s", doc_path)
diff --git a/self_evolution/evolution_proposer.py b/self_evolution/evolution_proposer.py
new file mode 100644
index 0000000000..8854473fb1
--- /dev/null
+++ b/self_evolution/evolution_proposer.py
@@ -0,0 +1,229 @@
+"""
+Self Evolution Plugin — Evolution Proposer
+===========================================
+
+Converts reflection insights into concrete, actionable evolution proposals.
+
+Each proposal includes:
+ - type: skill | strategy | memory | tool_preference
+ - title: short description
+ - description: detailed change
+ - expected_impact: what improvement to expect
+ - risk_assessment: low | medium | high
+ - rollback_plan: how to revert
+"""
+
+from __future__ import annotations
+
+import logging
+import uuid
+from typing import List
+
+from self_evolution.models import Proposal, ReflectionReport
+
+logger = logging.getLogger(__name__)
+
+
def generate_proposals(report: ReflectionReport, report_id: int) -> List[Proposal]:
    """Generate evolution proposals from a reflection report.

    Prioritizes proposals by:
    1. Impact (fixes for systemic errors > optimizations > enhancements)
    2. Risk (low risk first)
    3. Feasibility (clear rollback plan)
    """
    collected: List[Proposal] = []

    # 1. Error patterns → code_improvement (primary) + strategy (fallback)
    for idx, pattern in enumerate(report.worst_patterns):
        # Primary: structured optimization request
        candidate = _pattern_to_code_improvement(pattern, report, report_id, idx)
        if candidate:
            collected.append(candidate)

    # 2. Best patterns → skill (only if ≥5 successful sessions)
    for idx, pattern in enumerate(report.best_patterns):
        candidate = _success_to_proposal(pattern, report, report_id, idx)
        if candidate:
            collected.append(candidate)

    # 3. Recommendations → code_improvement or strategy
    for idx, rec in enumerate(report.recommendations):
        candidate = _recommendation_to_proposal(rec, report, report_id, idx)
        if candidate:
            collected.append(candidate)

    # Drop near-duplicate titles, then cap at 5 proposals per day.
    return _deduplicate(collected)[:5]
+
+
def _pattern_to_code_improvement(
    pattern: str, report: ReflectionReport, report_id: int, index: int
) -> Proposal:
    """Convert an error pattern into a structured code optimization request."""
    # Pull supporting metrics out of the reflection report, defaulting
    # gracefully when fields are missing or falsy.
    errors = report.error_summary or ""
    session_count = report.sessions_analyzed or 0
    avg = report.avg_score or 0

    headline = pattern[:60]
    # Structured optimization document; segments joined with "" so the
    # resulting text is byte-identical to the previous concatenation.
    body = "".join([
        "## 问题描述\n",
        f"{headline}\n\n",
        "## 数据支撑\n",
        f"- 分析会话数: {session_count}\n",
        f"- 平均质量分: {avg:.3f}\n",
        f"- 错误摘要: {errors[:200]}\n\n",
        "## 建议方向\n",
        "分析此错误模式的根因,考虑通过程序化手段(如工具调用前置校验、",
        "自动降级策略、路径预检等)来规避,而非仅靠提示词提醒。\n\n",
        "## 备注\n",
        "此为程序优化需求,审批后将保存为需求文档,需手动实施代码修改。",
    ])

    return Proposal(
        id=f"prop-opt-{uuid.uuid4().hex[:8]}",
        report_id=report_id,
        proposal_type="code_improvement",
        title=f"程序优化: {headline}",
        description=body,
        expected_impact="通过程序化手段减少同类错误",
        risk_assessment="low",
        rollback_plan="此提案不自动修改代码,无回滚风险",
        status="pending_approval",
    )
+
+
def _error_to_proposal(
    pattern: str, report: ReflectionReport, report_id: int, index: int
) -> Proposal:
    """Convert an error pattern into a compact strategy proposal (fallback).

    This is the lightweight alternative to a full code-improvement request:
    the proposal suggests a strategy rule that steers the agent away from
    the observed failure pattern.
    """
    # NOTE(review): a previous revision computed _compress_hint(pattern)
    # here but never used the result; the dead call has been removed. If a
    # short hint_text is needed for the strategy store, wire it into the
    # Proposal explicitly.
    return Proposal(
        id=f"prop-error-{uuid.uuid4().hex[:8]}",
        report_id=report_id,
        proposal_type="strategy",
        title=f"规避模式: {pattern[:50]}",
        description=f"基于错误分析发现的问题模式: {pattern}\n\n"
        f"建议创建策略规则来规避此类问题。",
        expected_impact="减少同类错误发生率",
        risk_assessment="low",
        rollback_plan="删除策略规则即可恢复",
        status="pending_approval",
    )
+
+
def _success_to_proposal(
    pattern: str, report: ReflectionReport, report_id: int, index: int
) -> "Proposal | None":
    """Convert a success pattern into a skill-creation proposal.

    Returns None (hence the optional return annotation — the original
    ``-> Proposal`` was wrong) when the pattern has fewer than 5 verified
    successful sessions: one-off wins are not worth codifying.
    """
    success_count = _count_successful_sessions(pattern, report)
    if success_count < 5:
        logger.info(
            "Skipping skill proposal: only %d successes (need 5) for: %s",
            success_count, pattern[:40],
        )
        return None

    return Proposal(
        id=f"prop-success-{uuid.uuid4().hex[:8]}",
        report_id=report_id,
        proposal_type="skill",
        title=f"固化成功模式: {pattern[:50]}",
        description=f"基于成功分析发现的高效模式: {pattern}\n\n"
        f"已验证 {success_count} 次成功执行。\n\n"
        f"建议创建可复用的技能来固化此模式。",
        expected_impact="提高同类任务效率",
        risk_assessment="low",
        rollback_plan="删除创建的技能即可恢复",
        status="pending_approval",
    )
+
+
def _recommendation_to_proposal(
    rec: str, report: ReflectionReport, report_id: int, index: int
) -> Proposal:
    """Convert a free-form recommendation into a typed proposal."""
    # Infer the proposal type from keywords in the recommendation text.
    # First matching group wins; default is a generic strategy tweak.
    keyword_types = [
        (("记忆", "记忆更新", "memory", "记住"), "memory"),
        (("技能", "skill", "创建"), "skill"),
        (("工具", "tool", "偏好"), "tool_preference"),
    ]
    kind = "strategy"
    for keywords, candidate in keyword_types:
        if any(kw in rec for kw in keywords):
            kind = candidate
            break

    return Proposal(
        id=f"prop-rec-{uuid.uuid4().hex[:8]}",
        report_id=report_id,
        proposal_type=kind,
        title=f"优化建议: {rec[:50]}",
        description=rec,
        expected_impact="提升整体agent性能",
        risk_assessment="low",
        rollback_plan="移除变更即可恢复",
        status="pending_approval",
    )
+
+
+def _deduplicate(proposals: List[Proposal]) -> List[Proposal]:
+ """Remove proposals with very similar titles."""
+ seen_titles = set()
+ unique = []
+ for p in proposals:
+ # Normalize title for comparison
+ normalized = p.title.lower().strip()[:30]
+ if normalized not in seen_titles:
+ seen_titles.add(normalized)
+ unique.append(p)
+ return unique
+
+
def _count_successful_sessions(pattern: str, report: ReflectionReport) -> int:
    """Count recent high-quality sessions (composite_score ≥ 0.7, last 100).

    NOTE(review): despite the parameter, ``pattern`` is not yet used to
    filter the query — every high-scoring session is counted, so the result
    over-estimates per-pattern successes. Keyword matching against
    task_category is still a TODO; confirm before relying on these counts.
    """
    try:
        from self_evolution import db

        # TODO: derive keyword filters from ``pattern`` — currently unfiltered.
        scores = db.fetch_all(
            "session_scores",
            where="composite_score >= ?",
            params=(0.7,),
            order_by="created_at DESC",
            limit=100,
        )
        return len(scores)
    except Exception:
        # Fallback: use sessions_analyzed from report as estimate
        return report.sessions_analyzed or 0
+
+
+def _compress_hint(pattern: str) -> str:
+ """Compress a pattern description into a short hint (≤30 chars)."""
+ # Keyword-based compression
+ mappings = [
+ (["bash", "路径", "path", "预检"], "bash前先read验证路径"),
+ (["api", "调试", "降级"], "API失败时降级只读探查"),
+ (["browser", "超时", "timeout"], "浏览器操作设超时保护"),
+ (["重试", "retry", "重复"], "避免重复重试相同操作"),
+ (["工具", "tool", "失败"], "工具失败时切换备选方案"),
+ ]
+ text = pattern.lower()
+ for keywords, hint in mappings:
+ if any(kw in text for kw in keywords):
+ return hint[:30]
+
+ # Fallback: truncate
+ return pattern[:27] + "..." if len(pattern) > 30 else pattern
diff --git a/self_evolution/feishu_notifier.py b/self_evolution/feishu_notifier.py
new file mode 100644
index 0000000000..2c56d5de1c
--- /dev/null
+++ b/self_evolution/feishu_notifier.py
@@ -0,0 +1,490 @@
+"""
+Self Evolution Plugin — Feishu Notifier
+========================================
+
+Pushes evolution proposals to Feishu at 19:00 daily.
+Uses interactive card messages with action buttons for approval.
+
+Receives callbacks when user clicks: approve / modify / reject.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+import time
+from typing import Any, Dict, List, Optional
+
+from self_evolution import db
+from self_evolution.models import Proposal
+
+logger = logging.getLogger(__name__)
+
+
+class FeishuNotifier:
+ """Send evolution proposals via Feishu interactive cards."""
+
+ def __init__(self):
+ self.app_id = os.getenv("FEISHU_APP_ID", "")
+ self.app_secret = os.getenv("FEISHU_APP_SECRET", "")
+ self.enabled = bool(self.app_id and self.app_secret)
+ self._client = None
+ self._token_cache: Optional[tuple[str, float]] = None # (token, expire_at)
+
    def send_daily_report(self):
        """Send pending proposals as a daily Feishu card message.

        Called by the 19:00 cron job. Silently returns when Feishu is not
        configured or there is nothing pending; the latest reflection
        report (if any) is attached to the card as context.
        """
        if not self.enabled:
            logger.info("Feishu not configured, skipping notification")
            return

        # Load pending proposals
        proposals = db.fetch_all(
            "evolution_proposals",
            where="status = ?",
            params=("pending_approval",),
            order_by="created_at DESC",
        )

        if not proposals:
            logger.info("No pending proposals to send")
            return

        # Load latest reflection report for context; an empty dict lets
        # _build_card fall back to defaults for every field.
        reports = db.fetch_all(
            "reflection_reports",
            order_by="created_at DESC",
            limit=1,
        )
        report = reports[0] if reports else {}

        # Build card
        card = self._build_card(proposals, report)

        # Send (best-effort; _send_card swallows delivery failures)
        self._send_card(card)
        logger.info("Sent %d proposals via Feishu", len(proposals))
+
+ def handle_callback(self, action: str, proposal_id: str, user_input: str = ""):
+ """Handle Feishu card button callback.
+
+ Args:
+ action: "approve" | "modify" | "reject"
+ proposal_id: The proposal ID
+ user_input: Optional user modification text
+
+ Returns:
+ dict with 'feedback' (str) and 'updated_card' (dict or None).
+ """
+ result = {"feedback": "", "updated_card": None}
+
+ if action == "approve":
+ logger.info("[TRACE] handle_callback: approving proposal %s", proposal_id)
+ title = self._approve(proposal_id)
+ result["feedback"] = f"✅ 已通过并执行: {title}"
+ logger.info("[TRACE] handle_callback: approved '%s'", title)
+ elif action == "modify":
+ title = self._modify(proposal_id, user_input)
+ result["feedback"] = f"✏️ 已修改: {title}"
+ elif action == "reject":
+ title = self._reject(proposal_id, user_input)
+ result["feedback"] = f"❌ 已拒绝: {title}"
+
+ # Build updated card with remaining pending proposals
+ logger.info("[TRACE] handle_callback: building updated card")
+ result["updated_card"] = self.build_updated_card()
+ logger.info("[TRACE] handle_callback: updated_card=%s", "present" if result["updated_card"] else "None (all done)")
+ return result
+
+ def build_updated_card(self) -> Optional[dict]:
+ """Build a card with remaining pending proposals.
+
+ Returns None if no pending proposals remain (caller can show
+ a 'all done' card instead).
+ """
+ pending = db.fetch_all(
+ "evolution_proposals",
+ where="status = ?",
+ params=("pending_approval",),
+ order_by="created_at DESC",
+ )
+
+ if not pending:
+ return None
+
+ # Load latest report for context
+ reports = db.fetch_all("reflection_reports", order_by="created_at DESC", limit=1)
+ report = reports[0] if reports else {}
+
+ date_str = time.strftime("%Y-%m-%d", time.localtime())
+ elements = []
+
+ # Status bar
+ elements.append({
+ "tag": "div",
+ "text": {"tag": "lark_md", "content": f"**待审批**: {len(pending)} 个提案"},
+ })
+ elements.append({"tag": "hr"})
+
+ # Proposals
+ for i, p in enumerate(pending):
+ type_emoji = {"skill": "🛠️", "strategy": "⚡", "memory": "🧠", "tool_preference": "🔧", "code_improvement": "🏗️"}
+ emoji = type_emoji.get(p.get("proposal_type", ""), "📋")
+
+ proposal_text = (
+ f"**[{emoji}] {p.get('title', f'提案 {i+1}')}**\n"
+ f"{p.get('description', '')[:200]}\n"
+ f"预期影响: {p.get('expected_impact', 'N/A')} | "
+ f"风险: {p.get('risk_assessment', 'low')}\n"
+ )
+ elements.append({
+ "tag": "div",
+ "text": {"tag": "lark_md", "content": proposal_text},
+ })
+
+ # Action buttons
+ elements.append({
+ "tag": "action",
+ "actions": [
+ {
+ "tag": "button",
+ "text": {"tag": "plain_text", "content": "通过"},
+ "type": "primary",
+ "value": {"action": "approve", "proposal_id": p["id"]},
+ },
+ {
+ "tag": "button",
+ "text": {"tag": "plain_text", "content": "修改"},
+ "type": "default",
+ "value": {"action": "modify", "proposal_id": p["id"]},
+ },
+ {
+ "tag": "button",
+ "text": {"tag": "plain_text", "content": "拒绝"},
+ "type": "danger",
+ "value": {"action": "reject", "proposal_id": p["id"]},
+ },
+ ],
+ })
+
+ return {
+ "header": {
+ "title": {"tag": "plain_text", "content": f"Hermes 进化报告 ({date_str})"},
+ "template": "blue",
+ },
+ "elements": elements,
+ }
+
+ def send_rollback_notification(self, unit_id: str, reason: str):
+ """Notify user that an improvement unit was auto-rolled back."""
+ if not self.enabled:
+ return
+ card = {
+ "elements": [
+ {
+ "tag": "div",
+ "text": {
+ "tag": "lark_md",
+ "content": f"**自动回滚通知**\n\n"
+ f"改进单元 `{unit_id}` 已自动回滚。\n"
+ f"原因: {reason}",
+ },
+ },
+ ],
+ }
+ self._send_card(card)
+
+ # ── Internal Methods ──────────────────────────────────────────────────
+
    def _approve(self, proposal_id: str) -> str:
        """Mark proposal as approved and trigger execution. Returns title.

        The status flip happens even when the row is missing (no-op update);
        execution only runs when the proposal row was actually found.
        """
        row = db.fetch_one("evolution_proposals", where="id = ?", params=(proposal_id,))
        title = row.get("title", proposal_id) if row else proposal_id

        db.update(
            "evolution_proposals",
            {"status": "approved", "resolved_at": time.time()},
            where="id = ?",
            where_params=(proposal_id,),
        )

        # Trigger execution
        if row:
            from self_evolution.evolution_executor import EvolutionExecutor
            executor = EvolutionExecutor()
            # NOTE(review): report_id is not passed here — assumes Proposal
            # defaults it; confirm against self_evolution.models.
            proposal = Proposal(
                id=row["id"],
                proposal_type=row["proposal_type"],
                title=row["title"],
                description=row["description"],
                expected_impact=row.get("expected_impact", ""),
                risk_assessment=row.get("risk_assessment", "low"),
                rollback_plan=row.get("rollback_plan", ""),
                status="approved",
            )
            executor.execute(proposal)

        return title
+
+ def _modify(self, proposal_id: str, user_input: str) -> str:
+ """Update proposal with user's modification. Returns title."""
+ row = db.fetch_one("evolution_proposals", where="id = ?", params=(proposal_id,))
+ title = row.get("title", proposal_id) if row else proposal_id
+
+ db.update(
+ "evolution_proposals",
+ {"user_feedback": user_input, "status": "pending_approval"},
+ where="id = ?",
+ where_params=(proposal_id,),
+ )
+ return title
+
    def _reject(self, proposal_id: str, user_input: str) -> str:
        """Mark proposal as rejected and record reason for learning. Returns title."""
        row = db.fetch_one("evolution_proposals", where="id = ?", params=(proposal_id,))
        title = row.get("title", proposal_id) if row else proposal_id

        db.update(
            "evolution_proposals",
            {"status": "rejected", "user_feedback": user_input, "resolved_at": time.time()},
            where="id = ?",
            where_params=(proposal_id,),
        )
        # Record rejection for the dream engine to learn from; signal_value
        # 0.0 marks it as a fully negative outcome.
        db.insert("outcome_signals", {
            "session_id": f"evolution_rejection_{proposal_id}",
            "signal_type": "proposal_rejected",
            "signal_value": 0.0,
            "metadata": json.dumps({"proposal_id": proposal_id, "reason": user_input}, ensure_ascii=False),
        })
        return title
+
+ def _build_card(self, proposals: List[dict], report: dict) -> dict:
+ """Build Feishu interactive card JSON."""
+ # Header
+ date_str = time.strftime("%Y-%m-%d", time.localtime())
+ elements = []
+
+ # Overview section
+ sessions_analyzed = report.get("sessions_analyzed", 0)
+ avg_score = report.get("avg_score", 0)
+ overview = (
+ f"**日期**: {date_str}\n"
+ f"**分析Sessions**: {sessions_analyzed}\n"
+ f"**平均评分**: {avg_score:.3f}\n"
+ )
+ elements.append({
+ "tag": "div",
+ "text": {"tag": "lark_md", "content": overview},
+ })
+
+ # Error summary
+ error_summary = report.get("error_summary", "")
+ if error_summary:
+ elements.append({
+ "tag": "div",
+ "text": {"tag": "lark_md", "content": f"**错误分析**\n{error_summary}"},
+ })
+
+ # Waste summary
+ waste_summary = report.get("waste_summary", "")
+ if waste_summary:
+ elements.append({
+ "tag": "div",
+ "text": {"tag": "lark_md", "content": f"**时间浪费分析**\n{waste_summary}"},
+ })
+
+ # Code change summary
+ code_change_summary = report.get("code_change_summary", "")
+ if code_change_summary:
+ elements.append({
+ "tag": "div",
+ "text": {"tag": "lark_md", "content": f"**系统代码更新**\n{code_change_summary}"},
+ })
+
+ # Separator
+ elements.append({"tag": "hr"})
+
+ # Proposals
+ for i, p in enumerate(proposals):
+ type_emoji = {"skill": "🛠️", "strategy": "⚡", "memory": "🧠", "tool_preference": "🔧", "code_improvement": "🏗️"}
+ emoji = type_emoji.get(p.get("proposal_type", ""), "📋")
+
+ proposal_text = (
+ f"**[{emoji}] {p.get('title', f'提案 {i+1}')}**\n"
+ f"{p.get('description', '')[:200]}\n"
+ f"预期影响: {p.get('expected_impact', 'N/A')} | "
+ f"风险: {p.get('risk_assessment', 'low')}\n"
+ )
+ elements.append({
+ "tag": "div",
+ "text": {"tag": "lark_md", "content": proposal_text},
+ })
+
+ # Action buttons
+ elements.append({
+ "tag": "action",
+ "actions": [
+ {
+ "tag": "button",
+ "text": {"tag": "plain_text", "content": "通过"},
+ "type": "primary",
+ "value": {"action": "approve", "proposal_id": p["id"]},
+ },
+ {
+ "tag": "button",
+ "text": {"tag": "plain_text", "content": "修改"},
+ "type": "default",
+ "value": {"action": "modify", "proposal_id": p["id"]},
+ },
+ {
+ "tag": "button",
+ "text": {"tag": "plain_text", "content": "拒绝"},
+ "type": "danger",
+ "value": {"action": "reject", "proposal_id": p["id"]},
+ },
+ ],
+ })
+
+ return {
+ "header": {
+ "title": {"tag": "plain_text", "content": f"Hermes 每日进化报告 ({date_str})"},
+ "template": "blue",
+ },
+ "elements": elements,
+ }
+
+ def _get_client(self):
+ """Get or create a cached lark Client instance."""
+ if self._client is None:
+ import lark_oapi as lark
+ self._client = (
+ lark.Client.builder()
+ .app_id(self.app_id)
+ .app_secret(self.app_secret)
+ .build()
+ )
+ return self._client
+
    def _send_card(self, card: dict):
        """Send an interactive card via Feishu.

        Prefers lark_oapi SDK (same as the gateway), falls back to REST.
        Best-effort: every failure is logged and swallowed so notification
        problems never break the calling flow.
        """
        try:
            receive_id, receive_id_type = self._resolve_target()
            if not receive_id:
                logger.warning("No Feishu receive target configured")
                return

            content_str = json.dumps(card, ensure_ascii=False)

            # Try SDK first (using cached client)
            try:
                from lark_oapi.api.im.v1 import CreateMessageRequest, CreateMessageRequestBody

                client = self._get_client()

                body = CreateMessageRequestBody.builder() \
                    .receive_id(receive_id) \
                    .msg_type("interactive") \
                    .content(content_str) \
                    .build()

                request = CreateMessageRequest.builder() \
                    .receive_id_type(receive_id_type) \
                    .request_body(body) \
                    .build()

                response = client.im.v1.message.create(request)
                if response.success():
                    logger.info("Feishu card sent via SDK")
                    return
                # A non-success SDK response falls through to the REST path.
                logger.warning("Feishu SDK send failed: code=%s msg=%s", response.code, response.msg)
            except ImportError:
                # SDK not installed — use the REST fallback below.
                pass

            # Fallback to REST API
            self._send_card_rest(receive_id, receive_id_type, content_str)

        except Exception as exc:
            # Deliberate broad catch: notifications are best-effort only.
            logger.warning("Feishu notification failed: %s", exc)
+
+ def _resolve_target(self) -> tuple:
+ """Resolve (receive_id, receive_id_type) from env config."""
+ deliver_to = os.getenv("SELF_EVOLUTION_FEISHU_DELIVER", "user")
+ if deliver_to.startswith("chat:"):
+ return deliver_to.replace("chat:", ""), "chat_id"
+ user_id = os.getenv("SELF_EVOLUTION_FEISHU_USER_ID", "")
+ if not user_id:
+ return "", ""
+ if user_id.startswith("ou_"):
+ return user_id, "open_id"
+ if user_id.startswith("oc_"):
+ return user_id, "chat_id"
+ return user_id, "user_id"
+
    def _send_card_rest(self, receive_id: str, receive_id_type: str, content: str):
        """Fallback: send card via REST API.

        Used when the lark_oapi SDK is unavailable or its send failed.
        ``content`` must already be a JSON-encoded card string.
        """
        import requests

        token = self._get_tenant_token()
        if not token:
            logger.warning("Failed to get Feishu token")
            return

        resp = requests.post(
            "https://open.feishu.cn/open-apis/im/v1/messages",
            headers={"Authorization": f"Bearer {token}"},
            params={"receive_id_type": receive_id_type},
            json={"receive_id": receive_id, "msg_type": "interactive", "content": content},
            timeout=30,
        )
        # Non-200 is only logged — the caller treats sending as best-effort.
        if resp.status_code != 200:
            logger.warning("Feishu REST send failed: %s", resp.text)
+
+ def _send_confirmation(self, proposal_id: str, message: str):
+ """Send a simple confirmation message."""
+ if not self.enabled:
+ return
+ card = {
+ "elements": [
+ {
+ "tag": "div",
+ "text": {
+ "tag": "lark_md",
+ "content": f"**提案 `{proposal_id}`**: {message}",
+ },
+ },
+ ],
+ }
+ self._send_card(card)
+
+ def _get_tenant_token(self) -> Optional[str]:
+ """Get Feishu tenant access token with caching (1.5h TTL)."""
+ if self._token_cache is not None:
+ token, expire_at = self._token_cache
+ if time.time() < expire_at:
+ return token
+ try:
+ import requests
+ resp = requests.post(
+ "https://open.feishu.cn/open-apis/auth/v3/tenant_access_token/internal",
+ json={
+ "app_id": self.app_id,
+ "app_secret": self.app_secret,
+ },
+ timeout=10,
+ )
+ if resp.status_code == 200:
+ token = resp.json().get("tenant_access_token")
+ if token:
+ # Feishu tokens expire in ~2h; cache for 1.5h
+ self._token_cache = (token, time.time() + 5400)
+ return token
+ except Exception as exc:
+ logger.debug("Failed to get Feishu token: %s", exc)
+ return None
diff --git a/self_evolution/git_analyzer.py b/self_evolution/git_analyzer.py
new file mode 100644
index 0000000000..5afded262e
--- /dev/null
+++ b/self_evolution/git_analyzer.py
@@ -0,0 +1,170 @@
+"""
+Self Evolution Plugin — Git Analysis
+=====================================
+
+Analyzes git commit history for the dream consolidation engine.
+
+Uses a single batched ``git log --stat --name-only`` call instead of
+25+ individual subprocess invocations.
+
+Extracted from reflection_engine.py for single-responsibility.
+"""
+
+from __future__ import annotations
+
+import logging
+import re
+import subprocess
+import time
+from pathlib import Path
+from typing import Dict
+
+from self_evolution.models import CodeChangeAnalysis, CommitInfo
+
+logger = logging.getLogger(__name__)
+
+
def analyze_code_changes(hours: int = 24) -> CodeChangeAnalysis:
    """Analyze git commits from the previous period.

    Uses a single batched git log call with --stat --name-only
    instead of 25+ individual subprocess calls.

    Args:
        hours: Look-back window; only commits newer than now-hours are read.

    Returns:
        Aggregated CodeChangeAnalysis; an empty one when git is missing,
        the command fails, or no commits fall inside the window.
    """
    # Assumes the repo root is the parent of this package's directory.
    project_root = str(Path(__file__).resolve().parent.parent)

    cutoff_epoch = time.time() - (hours * 3600)
    cutoff_date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(cutoff_epoch))

    try:
        # Single batched call: format + shortstat + name-only.
        # COMMITSTART/ENDHEADER are sentinel markers for the parser; -15
        # caps the scan at the 15 most recent non-merge commits.
        result = subprocess.run(
            ["git", "log",
             "--format=COMMITSTART%h%n%s%n%an%n%at%n%b%nENDHEADER",
             "--shortstat", "--name-only",
             "--no-merges", f"--since={cutoff_date}", "-15"],
            capture_output=True, text=True, timeout=30,
            cwd=project_root,
        )
        if result.returncode != 0 or not result.stdout.strip():
            return CodeChangeAnalysis()

        commits = _parse_batched_output(result.stdout)
        if not commits:
            return CodeChangeAnalysis()

        # Aggregate stats
        total_ins = sum(c.insertions for c in commits)
        total_del = sum(c.deletions for c in commits)
        total_files = sum(c.files_changed for c in commits)
        # dict.fromkeys keeps first-seen order while de-duplicating.
        authors = list(dict.fromkeys(c.author for c in commits))

        # Categorize by conventional commit prefix
        categories: Dict[str, int] = {}
        for c in commits:
            cat = _categorize_commit(c.subject)
            categories[cat] = categories.get(cat, 0) + 1

        # Extract top-level module areas: first path segment of each
        # touched file, deduped in order, capped at 10.
        all_files = []
        for c in commits:
            all_files.extend(c.file_list)
        areas = list(dict.fromkeys(
            f.split("/")[0] for f in all_files
            if "/" in f and not f.startswith(".")
        ))[:10]

        return CodeChangeAnalysis(
            commits=commits,
            total_commits=len(commits),
            total_insertions=total_ins,
            total_deletions=total_del,
            total_files_changed=total_files,
            authors=authors,
            change_categories=categories,
            areas_touched=areas,
        )

    except (subprocess.SubprocessError, FileNotFoundError, OSError):
        # git missing / timed out / unreadable repo — degrade silently.
        logger.debug("git analysis unavailable", exc_info=True)
        return CodeChangeAnalysis()
+
+
def _parse_batched_output(stdout: str) -> list:
    """Parse the batched git log output into CommitInfo objects.

    Each record looks like::

        COMMITSTART<hash>\\n<subject>\\n<author>\\n<epoch>\\n<body>\\nENDHEADER
        ...shortstat line and --name-only file list...

    Malformed records (missing ENDHEADER, short header, bad timestamp)
    are skipped silently.
    """
    commits = []
    raw_commits = stdout.split("COMMITSTART")
    for raw in raw_commits:
        raw = raw.strip()
        if not raw:
            continue

        header_end = raw.find("ENDHEADER")
        if header_end < 0:
            continue
        header = raw[:header_end].strip()
        lines = header.split("\n")
        if len(lines) < 4:
            continue

        hash_short = lines[0].strip()
        subject = lines[1].strip()
        author = lines[2].strip()
        try:
            timestamp = float(lines[3].strip())
        except ValueError:
            continue
        # Remaining header lines form the (possibly multi-line) body, capped.
        body = "\n".join(lines[4:]).strip()[:500]

        # After ENDHEADER: shortstat line(s) + file list
        rest = raw[header_end + len("ENDHEADER"):].strip()

        files_changed = 0
        insertions = 0
        deletions = 0
        file_list = []
        stat_done = False
        for rline in rest.split("\n"):
            rline = rline.strip()
            if not rline:
                continue
            # The first line resembling a --shortstat summary is parsed for
            # counts; other non-empty lines containing "/" or "." are kept
            # as file paths — so the stat/file order git emits is irrelevant.
            if not stat_done and ("files changed" in rline or "file changed" in rline
                    or "insertion" in rline or "deletion" in rline):
                files_changed = _parse_int(r'(\d+) files? changed', rline)
                insertions = _parse_int(r'(\d+) insertion', rline)
                deletions = _parse_int(r'(\d+) deletion', rline)
                stat_done = True
                continue
            if "/" in rline or "." in rline:
                file_list.append(rline)

        commits.append(CommitInfo(
            hash_short=hash_short,
            subject=subject,
            body=body,
            author=author,
            timestamp=timestamp,
            files_changed=files_changed,
            insertions=insertions,
            deletions=deletions,
            file_list=file_list[:20],
        ))

    return commits
+
+
+# ── Helpers ───────────────────────────────────────────────────────────────
+
+
+def _parse_int(pattern: str, text: str) -> int:
+ """Extract first integer matching regex pattern from text."""
+ m = re.search(pattern, text)
+ return int(m.group(1)) if m else 0
+
+
+def _categorize_commit(subject: str) -> str:
+ """Categorize commit by conventional commit prefix."""
+ s = subject.lower()
+ for prefix in ("feat", "fix", "refactor", "test", "docs", "chore", "perf", "style", "ci", "build"):
+ if s.startswith(prefix):
+ return prefix
+ return "other"
diff --git a/self_evolution/hooks.py b/self_evolution/hooks.py
new file mode 100644
index 0000000000..0cdb1e25a3
--- /dev/null
+++ b/self_evolution/hooks.py
@@ -0,0 +1,200 @@
+"""
+Self Evolution Plugin — Lifecycle Hooks
+========================================
+
+Registered hooks:
+
+ - post_tool_call: Collect per-tool telemetry
+ - on_session_end: Compute quality score + detect outcome signals
+ - pre_llm_call: Inject learned strategy hints
+"""
+
+from __future__ import annotations
+
+import logging
+import re
+import time
+from typing import Any, Dict, Optional
+
+logger = logging.getLogger(__name__)
+
# ── Correction detection patterns (inspired by Claude Code conversation-analyzer) ──

# Phrases that indicate the user is correcting the agent. NOTE(review): the
# bare "no" alternative matches as a substring inside words like "not"/"know";
# flagged for follow-up but left as-is to preserve current behavior.
CORRECTION_PATTERNS = re.compile(
    r"(不对|错误|重试|不要|停|stop|wrong|retry|no|don't|not that|不是|不是这个|为什么|换一种)",
    re.IGNORECASE,
)

# Phrases that indicate user frustration. Alternation de-duplicated (the
# previous revision listed 浪费时间 three times); matched language unchanged.
FRUSTRATION_PATTERNS = re.compile(
    r"(烦|慢|太慢|浪费时间|why did you|无语|算了|够了)",
    re.IGNORECASE,
)
+
+
+# ── post_tool_call ───────────────────────────────────────────────────────
+
def on_tool_call(**kwargs) -> None:
    """Collect per-tool invocation telemetry."""
    from self_evolution.db import insert

    success = kwargs.get("success", True)
    row = {
        "session_id": kwargs.get("session_id", ""),
        "tool_name": kwargs.get("tool_name", "unknown"),
        "duration_ms": kwargs.get("duration_ms", 0),
        "success": success,
        # Only failed invocations carry an error type.
        "error_type": kwargs.get("error_type") if not success else None,
        "turn_number": kwargs.get("turn_number", 0),
        "created_at": kwargs.get("started_at", time.time()),
    }

    # Telemetry must never break the tool-call path; log and move on.
    try:
        insert("tool_invocations", row)
    except Exception as exc:
        logger.warning("telemetry insert failed: %s", exc)
+
+
+# ── on_session_end ───────────────────────────────────────────────────────
+
def on_session_end(**kwargs) -> None:
    """Compute quality score and detect outcome signals when session ends."""
    from self_evolution.db import insert, insert_many
    from self_evolution.quality_scorer import compute_score

    session_data = kwargs.get("session_data", {})
    session_id = session_data.get("session_id", "")
    if not session_id:
        # Nothing to attribute a score or signals to.
        return

    # Persist the composite quality score; db failures must not break
    # session teardown.
    score = compute_score(session_data)
    try:
        insert("session_scores", score.to_db_row())
    except Exception as exc:
        logger.warning("score insert failed: %s", exc)

    # Batch-insert implicit outcome signals (completion, corrections, ...).
    signals = _detect_outcome_signals(session_data, kwargs)
    if not signals:
        return
    try:
        insert_many("outcome_signals", signals)
    except Exception as exc:
        logger.warning("signal insert failed: %s", exc)
+
+
+def _detect_outcome_signals(session_data: dict, kwargs: dict) -> list:
+ """Detect implicit outcome signals from session behavior.
+
+ Inspired by Claude Code conversation-analyzer's signal detection:
+ - Explicit corrections: user says "不对", "重试"
+ - Frustration signals: user says "为什么", "太慢"
+ - Completion / interruption status
+ - Budget exhaustion
+ """
+ signals = []
+ session_id = session_data.get("session_id", "")
+
+ # Completion signal
+ completed = session_data.get("completed", False)
+ interrupted = session_data.get("interrupted", False)
+ partial = session_data.get("partial", False)
+
+ if completed:
+ signals.append({
+ "session_id": session_id,
+ "signal_type": "completed",
+ "signal_value": 1.0,
+ "metadata": "{}",
+ })
+ elif interrupted:
+ signals.append({
+ "session_id": session_id,
+ "signal_type": "interrupted",
+ "signal_value": 0.5,
+ "metadata": "{}",
+ })
+ elif partial:
+ signals.append({
+ "session_id": session_id,
+ "signal_type": "partial",
+ "signal_value": 0.3,
+ "metadata": "{}",
+ })
+
+ # Budget exhaustion
+ max_iterations = session_data.get("max_iterations", 0)
+ iterations = session_data.get("iterations", 0)
+ if max_iterations and iterations >= max_iterations:
+ signals.append({
+ "session_id": session_id,
+ "signal_type": "budget_exhausted",
+ "signal_value": 0.0,
+ "metadata": f'{{"iterations": {iterations}}}',
+ })
+
+ # User correction / frustration detection from messages
+ messages = session_data.get("messages", [])
+ for msg in messages:
+ if msg.get("role") != "user":
+ continue
+ content = msg.get("content", "")
+ if isinstance(content, list):
+ content = " ".join(
+ block.get("text", "") for block in content
+ if isinstance(block, dict) and block.get("type") == "text"
+ )
+
+ if CORRECTION_PATTERNS.search(content):
+ signals.append({
+ "session_id": session_id,
+ "signal_type": "correction",
+ "signal_value": 0.2,
+ "metadata": f'{{"text": {repr(content[:100])}}}',
+ })
+ break # Only one correction signal per session
+
+ if FRUSTRATION_PATTERNS.search(content):
+ signals.append({
+ "session_id": session_id,
+ "signal_type": "frustration",
+ "signal_value": 0.1,
+ "metadata": f'{{"text": {repr(content[:100])}}}',
+ })
+ break
+
+ return signals
+
+
+# ── pre_llm_call ─────────────────────────────────────────────────────────
+
def on_pre_llm_call(**kwargs) -> Optional[Dict[str, Any]]:
    """Inject learned strategy hints into system prompt.

    Inspired by Claude Code learning-output-style SessionStart hook pattern:
    automatically inject behavioral context without user action.
    """
    from self_evolution.strategy_injector import inject_hints

    try:
        hint_text = inject_hints(kwargs)
    except Exception as exc:
        logger.warning("strategy injection failed: %s", exc)
        return None

    # Empty/None hints mean "nothing learned yet" — inject nothing.
    return {"system_hint": hint_text} if hint_text else None
+
+
+# ── Registration ─────────────────────────────────────────────────────────
+
def register_all(ctx) -> None:
    """Register all lifecycle hooks via PluginContext."""
    # Event-name → handler table keeps registration declarative.
    hook_table = (
        ("post_tool_call", on_tool_call),
        ("on_session_end", on_session_end),
        ("pre_llm_call", on_pre_llm_call),
    )
    for event, handler in hook_table:
        ctx.register_hook(event, handler)
diff --git a/self_evolution/model_config.py b/self_evolution/model_config.py
new file mode 100644
index 0000000000..69c0082ccb
--- /dev/null
+++ b/self_evolution/model_config.py
@@ -0,0 +1,248 @@
+"""
+Self Evolution Plugin — Model Configuration & Failover
+======================================================
+
+Handles runtime model resolution (primary / fallback / multimodal)
+and thread-safe failover state management.
+
+Extracted from reflection_engine.py for single-responsibility.
+"""
+
+from __future__ import annotations
+
+import logging
+import threading
+import time
+from typing import Any, Dict, Optional
+
+logger = logging.getLogger(__name__)
+
+
+# ── Model Configuration Resolution ────────────────────────────────────────
+
+
def resolve_config() -> dict:
    """Resolve model config via the hermes unified runtime provider.

    Returns a dict with:
        base_url, api_key, model, provider — primary text model
        fallback: {base_url, api_key, model, provider} — fallback text model
        multimodal: {base_url, api_key, model, provider} — vision model
    Returns an empty dict if no provider is available.
    """
    try:
        from hermes_cli.runtime_provider import resolve_runtime_provider
        from hermes_cli.config import load_config

        runtime = resolve_runtime_provider()
        config = load_config()
        default_model = config.get("model", {}).get("default", "")

        return {
            "base_url": runtime.get("base_url", ""),
            "api_key": runtime.get("api_key", ""),
            "model": runtime.get("model", default_model),
            "provider": runtime.get("provider", ""),
            "fallback": _resolve_fallback_config(config),
            "multimodal": _resolve_multimodal_config(config),
        }
    except Exception:
        logger.warning("Failed to resolve runtime provider", exc_info=True)
        return {}
+
+
def _resolve_fallback_config(config: Optional[dict] = None) -> dict:
    """Resolve fallback text model from config.yaml fallback_providers.

    Args:
        config: Pre-loaded hermes config dict; loaded on demand when None.

    Returns:
        {base_url, api_key, model, provider} for the first usable fallback,
        or {} when nothing suitable is configured.
    """
    try:
        from hermes_cli.runtime_provider import resolve_runtime_provider

        if config is None:
            from hermes_cli.config import load_config
            config = load_config()

        # First choice: explicitly declared fallback providers, in order.
        for fb in config.get("fallback_providers", []):
            fb_provider = (fb.get("provider") or "").strip()
            fb_model = (fb.get("model") or "").strip()
            if not fb_provider:
                continue
            try:
                rt = resolve_runtime_provider(requested=fb_provider)
                base_url = rt.get("base_url", "")
                api_key = rt.get("api_key", "")
                if base_url and fb_model:
                    return {
                        "base_url": base_url,
                        "api_key": api_key,
                        "model": fb_model,
                        "provider": rt.get("provider", ""),
                    }
            except Exception:
                # Unusable entry — keep scanning the remaining fallbacks.
                pass

        # Second choice: a local custom provider (localhost only); model is
        # auto-detected from the server when not configured explicitly.
        for cp in config.get("custom_providers", []):
            base_url = (cp.get("base_url") or cp.get("api", "")).strip()
            if base_url and ("localhost" in base_url or "127.0.0.1" in base_url):
                model = (cp.get("model") or "").strip()
                if not model:
                    model = _detect_local_model(
                        base_url,
                        (cp.get("api_key") or "").strip(),
                    )
                # NOTE(review): gemma-4-26b is excluded here — presumably it
                # is reserved for vision duty (see multimodal resolution);
                # confirm that intent.
                if model and "gemma-4-26b" not in model.lower():
                    return {
                        "base_url": base_url.rstrip("/"),
                        "api_key": (cp.get("api_key") or "").strip(),
                        "model": model,
                        "provider": "custom",
                    }

        return {}
    except Exception:
        logger.warning("Failed to resolve fallback config", exc_info=True)
        return {}
+
+
def _resolve_multimodal_config(config: Optional[dict] = None) -> dict:
    """Resolve multimodal (vision) model config.

    Args:
        config: Pre-loaded hermes config dict; loaded on demand when None.

    Returns:
        {base_url, api_key, model, provider} for the vision model, or {}.
    """
    try:
        from hermes_cli.runtime_provider import resolve_runtime_provider

        if config is None:
            from hermes_cli.config import load_config
            config = load_config()

        # First choice: an explicit auxiliary.vision provider (unless "auto").
        aux = config.get("auxiliary", {})
        vision_cfg = aux.get("vision", {})
        vision_provider = (vision_cfg.get("provider") or "").strip().lower()
        if vision_provider and vision_provider != "auto":
            try:
                rt = resolve_runtime_provider(requested=vision_provider)
                if rt.get("base_url"):
                    return {
                        "base_url": rt.get("base_url", ""),
                        "api_key": rt.get("api_key", ""),
                        "model": vision_cfg.get("model") or rt.get("model", ""),
                        "provider": rt.get("provider", ""),
                    }
            except Exception:
                pass  # fall through to local auto-detection

        # Second choice: a local custom provider; a vision-capable model is
        # auto-detected from the server when none is configured.
        for cp in config.get("custom_providers", []):
            base_url = (cp.get("base_url") or cp.get("api", "")).strip()
            if base_url and ("localhost" in base_url or "127.0.0.1" in base_url):
                api_key = (cp.get("api_key") or "").strip()
                key_env = (cp.get("key_env") or "").strip()
                if not api_key and key_env:
                    import os
                    api_key = os.getenv(key_env, "")
                model = (cp.get("model") or "").strip()
                if not model:
                    model = _detect_local_model(base_url, api_key)
                if model:
                    return {
                        "base_url": base_url.rstrip("/"),
                        "api_key": api_key,
                        "model": model,
                        "provider": "custom",
                    }

        return {}
    except Exception:
        logger.warning("Failed to resolve multimodal config", exc_info=True)
        return {}
+
+
+# ── Failover State (thread-safe) ──────────────────────────────────────────
+
# Which model is currently serving text requests: "primary" | "fallback".
_active_model: str = "primary"
# Unix timestamp of the last primary health probe (0.0 = never probed).
_last_health_check: float = 0.0
_HEALTH_CHECK_INTERVAL: int = 1800  # 30 minutes
# Guards the failover state above across threads.
_failover_lock = threading.Lock()
+
+
+def _check_primary_health(config: dict) -> bool:
+ """Quick health check: send a minimal request to the primary model."""
+ try:
+ import requests
+ base_url = config.get("base_url", "")
+ api_key = config.get("api_key", "")
+ model = config.get("model", "")
+ if not base_url or not model:
+ return False
+ resp = requests.post(
+ f"{base_url.rstrip('/')}/chat/completions",
+ headers={
+ "Authorization": f"Bearer {api_key}",
+ "Content-Type": "application/json",
+ },
+ json={
+ "model": model,
+ "messages": [{"role": "user", "content": "OK"}],
+ "max_tokens": 2,
+ },
+ timeout=15,
+ )
+ return resp.status_code == 200
+ except Exception:
+ return False
+
+
def get_active_text_config(config: dict) -> tuple:
    """Return (active_config_dict, is_fallback) based on failover state."""
    global _active_model, _last_health_check

    with _failover_lock:
        # While on fallback, periodically probe the primary and flip back
        # as soon as it answers again.
        if _active_model == "fallback":
            now = time.time()
            if now - _last_health_check >= _HEALTH_CHECK_INTERVAL:
                _last_health_check = now
                if _check_primary_health(config):
                    _active_model = "primary"
                    logger.info("Primary model recovered, switching back")
                else:
                    logger.info("Primary model still unavailable, staying on fallback")

        if _active_model == "primary":
            return config, False
        # Fallback requested but none configured: serve primary anyway.
        fallback = config.get("fallback", {})
        return (fallback, True) if fallback else (config, False)
+
+
def switch_to_fallback():
    """Mark primary as down and switch to fallback."""
    global _active_model, _last_health_check
    with _failover_lock:
        _last_health_check = time.time()
        _active_model = "fallback"
        logger.warning("Primary model failed, switched to fallback")
+
+
+def _detect_local_model(base_url: str, api_key: str = "") -> str:
+ """Auto-detect a multimodal model from a local server."""
+ try:
+ import requests
+ headers = {}
+ if api_key:
+ headers["Authorization"] = f"Bearer {api_key}"
+ resp = requests.get(
+ f"{base_url.rstrip('/')}/models",
+ headers=headers, timeout=5,
+ )
+ if resp.ok:
+ models = resp.json().get("data", [])
+ multimodal_hints = ["gemma-4", "qwen2-vl", "qwen-vl", "llava", "pixtral", "vision"]
+ for m in models:
+ mid = m.get("id", "").lower()
+ for hint in multimodal_hints:
+ if hint in mid:
+ return m["id"]
+ except Exception:
+ pass
+ return ""
diff --git a/self_evolution/models.py b/self_evolution/models.py
new file mode 100644
index 0000000000..a8f184ea09
--- /dev/null
+++ b/self_evolution/models.py
@@ -0,0 +1,363 @@
+"""
+Self Evolution Plugin — Data Models
+=====================================
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field, asdict
+from typing import Any, Dict, List, Optional
+import json
+import time
+
+
+def _now() -> float:
+ return time.time()
+
+
+def _ts() -> str:
+ return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+
+
+# ── Quality Scoring ──────────────────────────────────────────────────────
+
@dataclass
class QualityScore:
    """Composite quality score computed for a single session."""

    session_id: str
    composite: float = 0.0
    completion_rate: float = 0.0
    efficiency_score: float = 0.0
    cost_efficiency: float = 0.0
    satisfaction_proxy: float = 0.0
    task_category: str = ""
    model: str = ""
    created_at: float = field(default_factory=_now)

    def to_db_row(self) -> dict:
        """Flatten onto DB columns (note: composite → composite_score)."""
        return dict(
            session_id=self.session_id,
            composite_score=self.composite,
            completion_rate=self.completion_rate,
            efficiency_score=self.efficiency_score,
            cost_efficiency=self.cost_efficiency,
            satisfaction_proxy=self.satisfaction_proxy,
            task_category=self.task_category,
            model=self.model,
            created_at=self.created_at,
        )
+
+
+# ── Error Analysis ───────────────────────────────────────────────────────
+
@dataclass
class ToolFailure:
    """Aggregated failures of one (tool, error type) pair over the period."""

    tool_name: str
    error_type: str
    # Total failed invocations observed for this pair.
    count: int
    # Distinct session ids in which the failure occurred.
    sessions_affected: List[str] = field(default_factory=list)
    # Session id of the first occurrence seen (for drill-down).
    example_session: str = ""
+
+
@dataclass
class RetryPattern:
    """Same tool invoked more than twice within a single session."""

    session_id: str
    tool_name: str
    # How many times the tool was called in this session.
    attempt_count: int
    final_outcome: str  # "success" | "failure" | "abandoned"; detection currently emits "unknown"
+
+
@dataclass
class ErrorAnalysis:
    """Aggregate view of everything that went wrong in the period."""

    tool_failures: List[ToolFailure] = field(default_factory=list)
    retry_patterns: List[RetryPattern] = field(default_factory=list)
    incomplete_sessions: List[str] = field(default_factory=list)
    user_corrections: int = 0
    correction_examples: List[str] = field(default_factory=list)
    api_error_count: int = 0
    api_error_types: Dict[str, int] = field(default_factory=dict)

    def summary(self) -> str:
        """Human-readable digest; empty string when nothing went wrong."""
        parts: List[str] = []
        if self.tool_failures:
            parts.append(f"工具失败: {len(self.tool_failures)} 种工具出错")
            parts.extend(
                f"  - {tf.tool_name}: {tf.count}次 ({tf.error_type})"
                for tf in self.tool_failures[:5]
            )
        if self.retry_patterns:
            parts.append(f"重复重试: {len(self.retry_patterns)} 次")
        if self.incomplete_sessions:
            parts.append(f"未完成session: {len(self.incomplete_sessions)} 个")
        if self.user_corrections:
            parts.append(f"用户纠正: {self.user_corrections} 次")
        if self.api_error_count:
            parts.append(f"API错误: {self.api_error_count} 次")
        return "\n".join(parts)
+
+
+# ── Time Waste Analysis ──────────────────────────────────────────────────
+
@dataclass
class ToolDuration:
    """Latency aggregate for one tool across the analysis window."""

    tool_name: str
    # Sum of all recorded call durations, in milliseconds.
    total_duration_ms: int
    call_count: int
    # total_duration_ms / call_count.
    avg_duration_ms: float
+
+
@dataclass
class RepeatedOperation:
    """A duplicated operation (e.g. re-reading the same file) and its cost."""

    description: str
    count: int
    # Sessions in which the repetition was observed.
    sessions: List[str] = field(default_factory=list)
    # Estimated time lost to the repetition, in milliseconds.
    wasted_ms: int = 0
+
+
@dataclass
class WasteAnalysis:
    """Aggregate view of time wasted across the analyzed period."""

    slowest_tools: List[ToolDuration] = field(default_factory=list)
    repeated_operations: List[RepeatedOperation] = field(default_factory=list)
    inefficient_sessions: List[str] = field(default_factory=list)
    shortcut_opportunities: List[str] = field(default_factory=list)

    def summary(self) -> str:
        """Human-readable digest; empty string when nothing was wasteful."""
        parts: List[str] = []
        if self.slowest_tools:
            parts.append("耗时最长的工具:")
            parts.extend(
                f"  - {td.tool_name}: 平均{td.avg_duration_ms:.0f}ms ({td.call_count}次)"
                for td in self.slowest_tools[:5]
            )
        if self.repeated_operations:
            parts.append(f"重复操作: {len(self.repeated_operations)} 种")
            parts.extend(
                f"  - {ro.description}: {ro.count}次"
                for ro in self.repeated_operations[:5]
            )
        if self.inefficient_sessions:
            parts.append(f"低效session: {len(self.inefficient_sessions)} 个")
        if self.shortcut_opportunities:
            parts.append(f"可优化路径: {len(self.shortcut_opportunities)} 个")
        return "\n".join(parts)
+
+
+# ── Code Change Analysis ──────────────────────────────────────────────────
+
@dataclass
class CommitInfo:
    """Per-commit stats extracted from git history."""

    # Abbreviated commit hash.
    hash_short: str
    # First line of the commit message.
    subject: str
    body: str = ""
    author: str = ""
    timestamp: float = 0.0
    files_changed: int = 0
    insertions: int = 0
    deletions: int = 0
    file_list: List[str] = field(default_factory=list)
+
+
@dataclass
class CodeChangeAnalysis:
    """Aggregate of code changes (git commits) over the analyzed period."""

    commits: List[CommitInfo] = field(default_factory=list)
    total_commits: int = 0
    total_insertions: int = 0
    total_deletions: int = 0
    total_files_changed: int = 0
    authors: List[str] = field(default_factory=list)
    change_categories: Dict[str, int] = field(default_factory=dict)
    areas_touched: List[str] = field(default_factory=list)

    def summary(self) -> str:
        """Digest of the period's commits; a fixed message when none."""
        if not self.commits:
            return "代码更新: 无新提交"
        parts = [
            f"代码更新: {self.total_commits} commits, "
            f"+{self.total_insertions}/-{self.total_deletions} lines, "
            f"{self.total_files_changed} files changed",
        ]
        if self.change_categories:
            cats = ", ".join(f"{k}: {v}" for k, v in self.change_categories.items())
            parts.append(f"提交类型分布: {cats}")
        if self.areas_touched:
            parts.append(f"涉及模块: {', '.join(self.areas_touched)}")
        parts.append("主要变更:")
        parts.extend(
            f"  - {c.subject} ({c.hash_short}, +{c.insertions}/-{c.deletions})"
            for c in self.commits[:8]
        )
        return "\n".join(parts)
+
+
+# ── Reflection Report ────────────────────────────────────────────────────
+
@dataclass
class ReflectionReport:
    """Structured output of one nightly reflection run."""

    period_start: float
    period_end: float
    sessions_analyzed: int = 0
    avg_score: float = 0.0
    error_summary: str = ""
    waste_summary: str = ""
    worst_patterns: List[str] = field(default_factory=list)
    best_patterns: List[str] = field(default_factory=list)
    tool_insights: Dict[str, Dict[str, Any]] = field(default_factory=dict)
    recommendations: List[str] = field(default_factory=list)
    code_change_summary: str = ""
    model_used: str = ""
    created_at: float = field(default_factory=_now)

    def to_db_row(self) -> dict:
        """Flatten for DB insertion; list/dict fields become JSON text."""
        encoded = {
            name: json.dumps(getattr(self, name), ensure_ascii=False)
            for name in ("worst_patterns", "best_patterns", "tool_insights", "recommendations")
        }
        return {
            "period_start": self.period_start,
            "period_end": self.period_end,
            "sessions_analyzed": self.sessions_analyzed,
            "avg_score": self.avg_score,
            "error_summary": self.error_summary,
            "waste_summary": self.waste_summary,
            "worst_patterns": encoded["worst_patterns"],
            "best_patterns": encoded["best_patterns"],
            "tool_insights": encoded["tool_insights"],
            "recommendations": encoded["recommendations"],
            "code_change_summary": self.code_change_summary,
            "model_used": self.model_used,
            "created_at": self.created_at,
        }
+
+
+# ── Evolution Proposal ───────────────────────────────────────────────────
+
@dataclass
class Proposal:
    """A single evolution proposal awaiting user approval."""

    id: str
    proposal_type: str  # skill | strategy | memory | tool_preference | code_improvement
    title: str
    description: str
    expected_impact: str = ""
    risk_assessment: str = "low"
    rollback_plan: str = ""
    status: str = "pending_approval"
    report_id: Optional[int] = None
    user_feedback: str = ""
    created_at: float = field(default_factory=_now)
    resolved_at: Optional[float] = None

    def to_db_row(self) -> dict:
        """Flatten onto the evolution_proposals column layout."""
        return dict(
            id=self.id,
            report_id=self.report_id,
            proposal_type=self.proposal_type,
            title=self.title,
            description=self.description,
            expected_impact=self.expected_impact,
            risk_assessment=self.risk_assessment,
            rollback_plan=self.rollback_plan,
            status=self.status,
            user_feedback=self.user_feedback,
            created_at=self.created_at,
            resolved_at=self.resolved_at,
        )
+
+
+# ── Improvement Unit (A/B Test Tracking) ─────────────────────────────────
+
@dataclass
class ImprovementUnit:
    """A/B-test tracking record for one applied change."""

    id: str
    proposal_id: str
    change_type: str
    version: int = 0
    baseline_score: float = 0.0
    current_score: float = 0.0
    sessions_sampled: int = 0
    min_sessions: int = 10
    min_improvement: float = 0.05
    max_regression: float = 0.10
    status: str = "active"  # active | promoted | reverted
    created_at: float = field(default_factory=_now)
    resolved_at: Optional[float] = None

    @property
    def should_revert(self) -> bool:
        """Regression beyond max_regression after at least 3 samples."""
        if self.sessions_sampled < 3:
            return False
        return (self.baseline_score - self.current_score) > self.max_regression

    @property
    def should_promote(self) -> bool:
        """Improvement of at least min_improvement after min_sessions samples."""
        if self.sessions_sampled < self.min_sessions:
            return False
        return (self.current_score - self.baseline_score) >= self.min_improvement

    def to_db_row(self) -> dict:
        """Flatten for DB insertion."""
        return dict(
            id=self.id,
            proposal_id=self.proposal_id,
            change_type=self.change_type,
            version=self.version,
            baseline_score=self.baseline_score,
            current_score=self.current_score,
            sessions_sampled=self.sessions_sampled,
            min_sessions=self.min_sessions,
            min_improvement=self.min_improvement,
            max_regression=self.max_regression,
            status=self.status,
            created_at=self.created_at,
            resolved_at=self.resolved_at,
        )
+
+
+# ── Strategy Rule ────────────────────────────────────────────────────────
+
@dataclass
class StrategyCondition:
    """One predicate a strategy rule evaluates against a context field."""

    # Name of the context field the predicate inspects.
    field: str
    operator: str  # regex_match | contains | equals | not_contains
    pattern: str
+
+
@dataclass
class StrategyRule:
    """A learned (or manual) behavioral rule injected at runtime."""

    id: str
    name: str
    strategy_type: str  # hint | avoid | prefer
    description: str
    conditions: List[StrategyCondition] = field(default_factory=list)
    hint_text: str = ""
    severity: str = "medium"  # high | medium | low
    enabled: bool = True
    version: int = 1
    source: str = "learned"  # learned | manual | default
    created_at: float = field(default_factory=_now)

    def to_dict(self) -> dict:
        """Serialize to a JSON-compatible dict (conditions flattened)."""
        return dict(
            id=self.id,
            name=self.name,
            strategy_type=self.strategy_type,
            description=self.description,
            conditions=[
                dict(field=c.field, operator=c.operator, pattern=c.pattern)
                for c in self.conditions
            ],
            hint_text=self.hint_text,
            severity=self.severity,
            enabled=self.enabled,
            version=self.version,
            source=self.source,
            created_at=self.created_at,
        )

    @classmethod
    def from_dict(cls, d: dict) -> StrategyRule:
        """Inverse of to_dict; tolerant of missing optional keys."""
        return cls(
            id=d["id"],
            name=d["name"],
            strategy_type=d.get("strategy_type", "hint"),
            description=d.get("description", ""),
            conditions=[
                StrategyCondition(
                    field=c["field"], operator=c["operator"], pattern=c["pattern"]
                )
                for c in d.get("conditions", [])
            ],
            hint_text=d.get("hint_text", ""),
            severity=d.get("severity", "medium"),
            enabled=d.get("enabled", True),
            version=d.get("version", 1),
            source=d.get("source", "learned"),
            created_at=d.get("created_at", _now()),
        )
diff --git a/self_evolution/paths.py b/self_evolution/paths.py
new file mode 100644
index 0000000000..0c5f7018e7
--- /dev/null
+++ b/self_evolution/paths.py
@@ -0,0 +1,17 @@
+"""
+Self Evolution Plugin — Centralized Path Definitions
+=====================================================
+
+Single source of truth for all filesystem paths used by the plugin.
+"""
+
from pathlib import Path

# Root of all hermes user data.
HERMES_HOME = Path.home() / ".hermes"
# Plugin-private data directory.
DATA_DIR = HERMES_HOME / "self_evolution"
# SQLite database holding telemetry, scores, reports and proposals.
DB_PATH = DATA_DIR / "evolution.db"
# Learned strategy rules (JSON).
STRATEGIES_FILE = DATA_DIR / "strategies.json"
# Archived / rotated plugin data.
ARCHIVE_DIR = DATA_DIR / "archive"
# Skills created by approved evolution proposals — TODO confirm writer.
SKILLS_DIR = HERMES_HOME / "skills" / "learned"
# Agent memory files.
MEMORIES_DIR = HERMES_HOME / "memories"
# Definitions for the plugin's scheduled (cron) runs.
CRON_DIR = HERMES_HOME / "cron"
diff --git a/self_evolution/plugin.yaml b/self_evolution/plugin.yaml
new file mode 100644
index 0000000000..3b6cb33355
--- /dev/null
+++ b/self_evolution/plugin.yaml
@@ -0,0 +1,7 @@
+name: self_evolution
+version: "1.0.0"
+description: "Agent self-optimization and continuous evolution — dream consolidation at 1:00, Feishu approval at 19:00"
+provides_hooks:
+ - post_tool_call
+ - on_session_end
+ - pre_llm_call
diff --git a/self_evolution/prompts/reflection.md b/self_evolution/prompts/reflection.md
new file mode 100644
index 0000000000..f5a10792df
--- /dev/null
+++ b/self_evolution/prompts/reflection.md
@@ -0,0 +1,7 @@
+## 概况
+- 时段: {period_range}
+- Session 数: {sessions_count}, 平均质量: {avg_score}
+- 工具调用: {total_invocations} 次, 成功率 {success_rate}%
+
+## 数据
+{data_json}
diff --git a/self_evolution/quality_scorer.py b/self_evolution/quality_scorer.py
new file mode 100644
index 0000000000..4cb0de00c5
--- /dev/null
+++ b/self_evolution/quality_scorer.py
@@ -0,0 +1,177 @@
+"""
+Self Evolution Plugin — Quality Scorer
+=======================================
+
+Computes a composite quality score for each session:
+
+ session_quality = 0.4 * completion_rate
+ + 0.2 * efficiency_score
+ + 0.15 * cost_efficiency
+ + 0.25 * satisfaction_proxy
+
+Zero API cost — pure computation from already-collected session data.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any, Dict
+
+from self_evolution.models import QualityScore
+
+logger = logging.getLogger(__name__)
+
+# ── Weights ──────────────────────────────────────────────────────────────
+
# Composite-score weights (sum to 1.0; see module docstring formula).
W_COMPLETION = 0.40
W_EFFICIENCY = 0.20
W_COST = 0.15
W_SATISFACTION = 0.25

# Ideal iteration counts by task complexity
# NOTE(review): these keys never match _detect_task_category's outputs
# ("coding" / "web_research" / "file_analysis" / "general"), so lookups
# always fall back to DEFAULT_IDEAL_ITERATIONS — confirm intended mapping.
IDEAL_ITERATIONS = {
    "simple": 3,
    "medium": 8,
    "complex": 15,
}
# Used when a task category has no entry in IDEAL_ITERATIONS.
DEFAULT_IDEAL_ITERATIONS = 8
+
+
def compute_score(session_data: dict) -> QualityScore:
    """Compute a composite quality score from session data.

    Args:
        session_data: dict with keys like:
            - completed, interrupted, partial
            - iterations, max_iterations
            - tool_call_count, message_count
            - input_tokens, output_tokens, estimated_cost_usd
            - duration_seconds
            - model, platform
            - messages (list)

    Returns:
        QualityScore with individual and composite scores.
    """
    # (component value, weight) in the order they enter the weighted sum.
    components = (
        (_completion_rate(session_data), W_COMPLETION),
        (_efficiency_score(session_data), W_EFFICIENCY),
        (_cost_efficiency(session_data), W_COST),
        (_satisfaction_proxy(session_data), W_SATISFACTION),
    )
    weighted_total = sum(value * weight for value, weight in components)

    return QualityScore(
        session_id=session_data.get("session_id", ""),
        composite=round(weighted_total, 3),
        completion_rate=round(components[0][0], 3),
        efficiency_score=round(components[1][0], 3),
        cost_efficiency=round(components[2][0], 3),
        satisfaction_proxy=round(components[3][0], 3),
        task_category=_detect_task_category(session_data),
        model=session_data.get("model", ""),
    )
+
+
+# ── Individual Score Components ──────────────────────────────────────────
+
+def _completion_rate(session_data: dict) -> float:
+ """1.0 if completed, 0.5 if interrupted, 0.0 if failed."""
+ if session_data.get("completed"):
+ return 1.0
+ if session_data.get("interrupted"):
+ return 0.5
+ if session_data.get("partial"):
+ return 0.3
+ return 0.0
+
+
+def _efficiency_score(session_data: dict) -> float:
+ """Ideal iterations / actual iterations, capped at 1.0."""
+ iterations = session_data.get("iterations", 0) or session_data.get("tool_call_count", 0)
+ if iterations <= 0:
+ return 1.0
+
+ category = _detect_task_category(session_data)
+ ideal = IDEAL_ITERATIONS.get(category, DEFAULT_IDEAL_ITERATIONS)
+
+ return min(1.0, ideal / max(iterations, 1))
+
+
+def _cost_efficiency(session_data: dict) -> float:
+ """Baseline cost / actual cost, capped at 1.0.
+
+ Uses message count as a proxy for expected work.
+ """
+ messages = session_data.get("message_count", 1) or 1
+ tool_calls = session_data.get("tool_call_count", 0) or 0
+ iterations = session_data.get("iterations", 0) or 0
+
+ # Expected: roughly 2 tool calls per user message
+ expected_tool_calls = messages * 2
+
+ if expected_tool_calls <= 0:
+ return 1.0
+
+ return min(1.0, expected_tool_calls / max(tool_calls, 1))
+
+
+def _satisfaction_proxy(session_data: dict) -> float:
+ """Estimate satisfaction from behavioral signals.
+
+ Signals:
+ - Single-turn session (user got what they needed) = high
+ - Multi-turn but completed = medium-high
+ - User corrections detected = lower
+ - Budget exhausted = low
+ """
+ messages = session_data.get("message_count", 1) or 1
+ completed = session_data.get("completed", False)
+ max_iterations = session_data.get("max_iterations", 0)
+ iterations = session_data.get("iterations", 0)
+
+ score = 0.7 # baseline
+
+ # Single-turn completion is a strong positive signal
+ if messages <= 2 and completed:
+ score = 0.9
+ elif completed:
+ score = 0.75
+ elif messages > 10:
+ score = 0.5
+
+ # Budget exhaustion is a negative signal
+ if max_iterations and iterations >= max_iterations:
+ score -= 0.2
+
+ return max(0.0, min(1.0, score))
+
+
+# ── Task Category Detection ──────────────────────────────────────────────
+
+def _detect_task_category(session_data: dict) -> str:
+ """Detect task category from tool usage patterns."""
+ tool_names = session_data.get("tool_names", [])
+ if isinstance(tool_names, str):
+ tool_names = tool_names.split(",")
+
+ tool_set = set(t.lower() for t in tool_names) if tool_names else set()
+
+ coding_tools = {"terminal", "bash", "write", "edit", "file_write", "file_edit"}
+ web_tools = {"web_search", "browser", "browser_navigate", "scrape", "fetch"}
+ file_tools = {"read", "file_read", "grep", "glob", "find"}
+
+ if tool_set & coding_tools:
+ return "coding"
+ if tool_set & web_tools:
+ return "web_research"
+ if tool_set & file_tools:
+ return "file_analysis"
+
+ return "general"
diff --git a/self_evolution/reflection_engine.py b/self_evolution/reflection_engine.py
new file mode 100644
index 0000000000..17269fea0d
--- /dev/null
+++ b/self_evolution/reflection_engine.py
@@ -0,0 +1,751 @@
+"""
+Self Evolution Plugin — Dream Engine (Reflection Engine)
+=========================================================
+
+Runs nightly at 1:00 to analyze the previous day's sessions.
+
+Design reference: Claude Code plugins/hookify/agents/conversation-analyzer.md
+ - Analyzes conversations in reverse chronological order
+ - Detects: corrections, frustrations, repeated issues, reversions
+ - Extracts tool usage patterns, converts to actionable rules
+ - Categorizes by severity
+
+We extend this pattern with:
+ - Full automated analysis (not just on user request)
+ - Error analysis (tool failures, retries, API errors)
+ - Time waste analysis (slow tools, repeated ops, inefficient sessions)
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+import re
+import time
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+from self_evolution import db
+from self_evolution.model_config import resolve_config, get_active_text_config, switch_to_fallback
+from self_evolution.git_analyzer import analyze_code_changes
+from self_evolution.models import (
+ ErrorAnalysis, ToolFailure, RetryPattern,
+ WasteAnalysis, ToolDuration, RepeatedOperation,
+ CodeChangeAnalysis, CommitInfo,
+ ReflectionReport,
+)
+
+logger = logging.getLogger(__name__)
+
+
# ── Backward-compatible aliases ────────────────────────────────────────────
# These are used by cron_jobs.py and other callers. Kept so code written
# against the pre-split reflection_engine API keeps working after the
# extraction of model handling into model_config.py.
_resolve_runtime_config = resolve_config
_get_active_text_config = get_active_text_config
_switch_to_fallback = switch_to_fallback
+
+
+class DreamEngine:
+ """Nightly dream consolidation engine.
+
+ Analyzes the previous day's sessions to find:
+ 1. Error patterns (tool failures, retries, incomplete tasks)
+ 2. Time waste patterns (slow tools, repeated operations, inefficient flows)
+ 3. Success patterns (what worked well)
+ 4. Generates actionable evolution proposals
+ """
+
    def __init__(self, config: Optional[dict] = None):
        """Create the engine.

        Args:
            config: Model configuration dict; resolved via the hermes
                runtime provider when omitted (or falsy).
        """
        self.config = config or _resolve_runtime_config()
        # Model client handle — created on demand elsewhere; TODO confirm usage.
        self._model_client = None
        # Last reflection prompt built (for debugging/inspection).
        self._current_prompt = ""
+
    def run(self, hours: int = 24, max_runtime_seconds: int = 0) -> Optional[ReflectionReport]:
        """Main dream consolidation flow.

        Args:
            hours: Analyze data from the last N hours.
            max_runtime_seconds: Hard timeout in seconds. 0 = no limit.
                If exceeded, stops at the next step boundary and returns None.

        Returns:
            The stored ReflectionReport, or None when there was nothing to
            analyze, the deadline was hit, the model returned nothing, or
            any step raised.
        """
        logger.info("Dream engine starting — analyzing last %d hours", hours)

        # deadline == 0 disables all timeout checks below.
        deadline = time.time() + max_runtime_seconds if max_runtime_seconds > 0 else 0

        now = time.time()
        cutoff = now - (hours * 3600)

        try:
            # 1. Load session data from evolution.db
            scores = db.fetch_all(
                "session_scores",
                where="created_at >= ?",
                params=(cutoff,),
                order_by="created_at DESC",
            )
            tool_invocations = db.fetch_all(
                "tool_invocations",
                where="created_at >= ?",
                params=(cutoff,),
                order_by="created_at DESC",
            )
            signals = db.fetch_all(
                "outcome_signals",
                where="created_at >= ?",
                params=(cutoff,),
            )

            if not scores:
                logger.info("No sessions to analyze")
                return None

            # 2. Error analysis
            if deadline and time.time() > deadline:
                logger.warning("Dream engine timed out before error analysis")
                return None
            error_analysis = self._analyze_errors(scores, tool_invocations, signals)
            logger.info("Error analysis: %s", error_analysis.summary())

            # 3. Time waste analysis
            if deadline and time.time() > deadline:
                logger.warning("Dream engine timed out before waste analysis")
                return None
            waste_analysis = self._analyze_time_waste(scores, tool_invocations)
            logger.info("Waste analysis: %s", waste_analysis.summary())

            # 3.5. Code change analysis (git history for the same window)
            if deadline and time.time() > deadline:
                logger.warning("Dream engine timed out before code analysis")
                return None
            code_analysis = analyze_code_changes(hours=hours)
            logger.info("Code change analysis: %d commits found", code_analysis.total_commits)

            # 4. Compute average score
            avg_score = (
                sum(s.get("composite_score", 0) for s in scores) / len(scores)
                if scores else 0
            )

            # 5. Build reflection prompt
            if deadline and time.time() > deadline:
                logger.warning("Dream engine timed out before model call")
                return None
            prompt = self._build_reflection_prompt(
                scores, tool_invocations, signals,
                error_analysis, waste_analysis, avg_score,
                code_analysis=code_analysis,
            )

            # 6. Call model for deep reflection (the only network-bound step)
            reflection_text = self._call_model(prompt)
            if not reflection_text:
                logger.warning("Model returned empty reflection")
                return None

            # 7. Parse reflection report
            report = self._parse_reflection(
                reflection_text=reflection_text,
                period_start=cutoff,
                period_end=now,
                sessions_analyzed=len(scores),
                avg_score=avg_score,
                error_analysis=error_analysis,
                waste_analysis=waste_analysis,
                code_analysis=code_analysis,
            )

            # 8. Store report
            report_id = db.insert("reflection_reports", report.to_db_row())
            logger.info("Reflection report saved: id=%d, avg_score=%.3f", report_id, avg_score)

            # 9. Generate evolution proposals from the report
            from self_evolution.evolution_proposer import generate_proposals
            proposals = generate_proposals(report, report_id)
            for p in proposals:
                db.insert("evolution_proposals", p.to_db_row())
            logger.info("Generated %d evolution proposals", len(proposals))

            # 10. Compress existing strategies (best-effort; failure is logged)
            try:
                from self_evolution.strategy_compressor import compress_strategies
                from self_evolution.strategy_store import StrategyStore
                store = StrategyStore()
                data = store.load()
                rules = data.get("rules", [])
                compressed = compress_strategies(rules)
                if len(compressed) < len(rules):
                    data["rules"] = compressed
                    store.save(data)
                    logger.info("Strategies compressed: %d → %d", len(rules), len(compressed))
            except Exception as exc:
                logger.warning("Strategy compression failed: %s", exc)

            # 11. Cleanup old data
            db.cleanup(days=30)

            return report

        except Exception as exc:
            logger.exception("Dream engine failed: %s", exc)
            return None
+
+ # ── Error Analysis ────────────────────────────────────────────────────
+
+ def _analyze_errors(
+ self,
+ scores: List[dict],
+ invocations: List[dict],
+ signals: List[dict],
+ ) -> ErrorAnalysis:
+ """Analyze all errors in the period.
+
+ Inspired by Claude Code conversation-analyzer's signal detection.
+ """
+ # Tool failures
+ failures = {}
+ for inv in invocations:
+ if not inv.get("success", True):
+ tool = inv.get("tool_name", "unknown")
+ error_type = inv.get("error_type", "unknown")
+ key = f"{tool}:{error_type}"
+ if key not in failures:
+ failures[key] = ToolFailure(
+ tool_name=tool,
+ error_type=error_type,
+ count=0,
+ sessions_affected=[],
+ example_session=inv.get("session_id", ""),
+ )
+ failures[key].count += 1
+ sid = inv.get("session_id", "")
+ if sid and sid not in failures[key].sessions_affected:
+ failures[key].sessions_affected.append(sid)
+
+ # Retry patterns (same tool called > 2 times in same session)
+ retries = self._detect_retry_patterns(invocations)
+
+ # Incomplete sessions
+ incomplete = [
+ s.get("session_id", "") for s in scores
+ if s.get("completion_rate", 1.0) < 0.5
+ ]
+
+ # User corrections from signals
+ corrections = [s for s in signals if s.get("signal_type") == "correction"]
+ frustration = [s for s in signals if s.get("signal_type") == "frustration"]
+ api_errors = [s for s in signals if s.get("signal_type") == "api_error"]
+
+ # API error type distribution
+ api_error_types: Dict[str, int] = {}
+ for s in api_errors:
+ meta = json.loads(s.get("metadata", "{}"))
+ etype = meta.get("error_type", "unknown")
+ api_error_types[etype] = api_error_types.get(etype, 0) + 1
+
+ return ErrorAnalysis(
+ tool_failures=sorted(failures.values(), key=lambda x: x.count, reverse=True),
+ retry_patterns=retries,
+ incomplete_sessions=incomplete,
+ user_corrections=len(corrections),
+ correction_examples=[s.get("metadata", "") for s in corrections[:3]],
+ api_error_count=len(api_errors),
+ api_error_types=api_error_types,
+ )
+
+ def _detect_retry_patterns(self, invocations: List[dict]) -> List[RetryPattern]:
+ """Detect tools called > 2 times in same session."""
+ session_tools: Dict[str, Dict[str, int]] = {}
+ for inv in invocations:
+ sid = inv.get("session_id", "")
+ tool = inv.get("tool_name", "")
+ if sid not in session_tools:
+ session_tools[sid] = {}
+ session_tools[sid][tool] = session_tools[sid].get(tool, 0) + 1
+
+ patterns = []
+ for sid, tools in session_tools.items():
+ for tool, count in tools.items():
+ if count > 2:
+ patterns.append(RetryPattern(
+ session_id=sid,
+ tool_name=tool,
+ attempt_count=count,
+ final_outcome="unknown",
+ ))
+ return sorted(patterns, key=lambda x: x.attempt_count, reverse=True)[:20]
+
+ # ── Time Waste Analysis ───────────────────────────────────────────────
+
+ def _analyze_time_waste(
+ self,
+ scores: List[dict],
+ invocations: List[dict],
+ ) -> WasteAnalysis:
+ """Analyze time waste patterns."""
+ # Slowest tools
+ tool_durations: Dict[str, List[int]] = {}
+ for inv in invocations:
+ tool = inv.get("tool_name", "")
+ duration = inv.get("duration_ms", 0)
+ if not duration:
+ continue
+ if tool not in tool_durations:
+ tool_durations[tool] = []
+ tool_durations[tool].append(duration)
+
+ slowest = [
+ ToolDuration(
+ tool_name=tool,
+ total_duration_ms=sum(durs),
+ call_count=len(durs),
+ avg_duration_ms=sum(durs) / len(durs),
+ )
+ for tool, durs in tool_durations.items()
+ ]
+ slowest.sort(key=lambda x: x.avg_duration_ms, reverse=True)
+
+ # Repeated operations (same tool + same session > 3 times)
+ session_tool_calls: Dict[str, Dict[str, int]] = {}
+ for inv in invocations:
+ sid = inv.get("session_id", "")
+ tool = inv.get("tool_name", "")
+ if sid not in session_tool_calls:
+ session_tool_calls[sid] = {}
+ session_tool_calls[sid][tool] = session_tool_calls[sid].get(tool, 0) + 1
+
+ repeated = []
+ for sid, tools in session_tool_calls.items():
+ for tool, count in tools.items():
+ if count > 3:
+ repeated.append(RepeatedOperation(
+ description=f"{tool} called {count} times",
+ count=count,
+ sessions=[sid],
+ wasted_ms=tool_durations.get(tool, [0])[0] * (count - 2) if tool in tool_durations else 0,
+ ))
+
+ # Inefficient sessions (low efficiency score)
+ inefficient = [
+ s.get("session_id", "") for s in scores
+ if s.get("efficiency_score", 1.0) < 0.3
+ ]
+
+ return WasteAnalysis(
+ slowest_tools=slowest[:10],
+ repeated_operations=sorted(repeated, key=lambda x: x.count, reverse=True)[:10],
+ inefficient_sessions=inefficient,
+ shortcut_opportunities=[],
+ )
+
+ # ── Reflection Prompt ─────────────────────────────────────────────────
+
    def _build_reflection_prompt(
        self,
        scores: List[dict],
        invocations: List[dict],
        signals: List[dict],
        errors: ErrorAnalysis,
        waste: WasteAnalysis,
        avg_score: float,
        code_analysis: Optional[CodeChangeAnalysis] = None,
    ) -> str:
        """Build the reflection prompt as structured JSON data.

        All analysis results are serialized as JSON so the model receives
        lossless data instead of pre-summarized text.

        Args:
            scores: Per-session quality score rows.
            invocations: Tool invocation rows for the period.
            signals: Detected signal rows (correction / api_error / ...).
            errors: Aggregated error analysis for the period.
            waste: Aggregated time-waste analysis for the period.
            avg_score: Mean composite session score.
            code_analysis: Optional git change analysis; omitted from the
                prompt when None or when it contains no commits.

        Returns:
            The user-prompt string with all placeholders substituted.
        """
        # Load user prompt template (short: just overview + data placeholder)
        template_path = Path(__file__).parent / "prompts" / "reflection.md"
        if template_path.exists():
            template = template_path.read_text(encoding="utf-8")
        else:
            template = _DEFAULT_REFLECTION_PROMPT

        # Compute statistics
        total_invocations = len(invocations)
        # Missing "success" keys count as successes; empty input → 100%.
        success_rate = (
            sum(1 for i in invocations if i.get("success", True)) / total_invocations * 100
            if total_invocations else 100
        )

        # Period range, derived from the min/max score timestamps
        # (created_at is treated as a local epoch timestamp).
        if scores:
            ts_min = min(s.get("created_at", 0) for s in scores)
            ts_max = max(s.get("created_at", 0) for s in scores)
            period_range = (
                f"{time.strftime('%m-%d %H:%M', time.localtime(ts_min))} ~ "
                f"{time.strftime('%m-%d %H:%M', time.localtime(ts_max))}"
            )
        else:
            period_range = "N/A"

        # Build structured data JSON — compact format to save tokens
        data = {}

        # 1. Sessions — compact: [score, completion, efficiency, cost, satisfaction, category]
        data["sessions"] = [
            [
                round(s.get("composite_score", 0), 2),
                round(s.get("completion_rate", 0), 2),
                round(s.get("efficiency_score", 0), 2),
                round(s.get("cost_efficiency", 0), 2),
                round(s.get("satisfaction_proxy", 0), 2),
                s.get("task_category", ""),
            ]
            for s in scores
        ]

        # 2. Tool usage — compact: {tool: [calls, failures, avg_ms]}
        tool_stats: Dict[str, List[int]] = {}
        for inv in invocations:
            tool = inv.get("tool_name", "")
            if tool not in tool_stats:
                tool_stats[tool] = [0, 0, 0]  # calls, failures, total_ms
            tool_stats[tool][0] += 1
            if not inv.get("success", True):
                tool_stats[tool][1] += 1
            tool_stats[tool][2] += inv.get("duration_ms", 0) or 0
        # Sorted by total time spent (v[2]), descending; total_ms is
        # converted to a rounded average per call.
        data["tools"] = {
            t: [v[0], v[1], round(v[2] / max(v[0], 1))]
            for t, v in sorted(tool_stats.items(), key=lambda x: x[1][2], reverse=True)
        }

        # 3. Signals — compact: {type: count}
        signal_types = {}
        for s in signals:
            stype = s.get("signal_type", "unknown")
            signal_types[stype] = signal_types.get(stype, 0) + 1
        data["signals"] = signal_types

        # 4. Errors — only non-empty fields
        err_data = {}
        if errors.tool_failures:
            err_data["tool_failures"] = [
                f"{tf.tool_name}:{tf.error_type}x{tf.count}"
                for tf in errors.tool_failures
            ]
        if errors.retry_patterns:
            err_data["retries"] = [
                f"{rp.tool_name}x{rp.attempt_count}"
                for rp in errors.retry_patterns[:5]
            ]
        if errors.incomplete_sessions:
            err_data["incomplete"] = len(errors.incomplete_sessions)
        if errors.user_corrections:
            err_data["corrections"] = errors.user_corrections
        if errors.correction_examples:
            err_data["correction_examples"] = errors.correction_examples[:2]
        if errors.api_error_count:
            err_data["api_errors"] = errors.api_error_count
        if err_data:
            data["errors"] = err_data

        # 5. Waste — only non-empty
        waste_data = {}
        if waste.slowest_tools:
            waste_data["slowest"] = [
                f"{td.tool_name} {round(td.avg_duration_ms)}ms/{td.call_count}calls"
                for td in waste.slowest_tools[:5]
            ]
        if waste.repeated_operations:
            waste_data["repeated"] = [
                f"{ro.description} x{ro.count}"
                for ro in waste.repeated_operations[:3]
            ]
        if waste.inefficient_sessions:
            waste_data["inefficient"] = len(waste.inefficient_sessions)
        if waste_data:
            data["waste"] = waste_data

        # 6. Code changes — flat compact format (first 10 commits, first 5
        # files per commit, commit body trimmed to 150 chars)
        if code_analysis and code_analysis.commits:
            cc = code_analysis
            commits_data = []
            for c in cc.commits[:10]:
                entry = f"{c.hash_short} {c.subject} +{c.insertions}/-{c.deletions}"
                if c.file_list:
                    entry += f" [{','.join(c.file_list[:5])}]"
                if c.body:
                    entry += f" | {c.body[:150]}"
                commits_data.append(entry)
            data["code_changes"] = {
                "stats": f"{cc.total_commits} commits +{cc.total_insertions}/-{cc.total_deletions} lines {cc.total_files_changed} files",
                "categories": cc.change_categories,
                "areas": cc.areas_touched,
                "commits": commits_data,
            }

        data_json = json.dumps(data, ensure_ascii=False, indent=2)

        # Fill template.  Substitution uses str.replace rather than
        # str.format — presumably because data_json contains literal
        # braces that str.format would choke on.
        prompt = template.replace("{period_range}", period_range)
        prompt = prompt.replace("{sessions_count}", str(len(scores)))
        prompt = prompt.replace("{avg_score}", f"{avg_score:.3f}")
        prompt = prompt.replace("{total_invocations}", str(total_invocations))
        prompt = prompt.replace("{success_rate}", f"{success_rate:.1f}")
        prompt = prompt.replace("{data_json}", data_json)

        return prompt
+
+ # ── Model Call ────────────────────────────────────────────────────────
+
+ def _call_model(self, prompt: str) -> Optional[str]:
+ """Call the active model with automatic failover.
+
+ Resolution order:
+ 1. Primary model (glm-5.1 via zai)
+ 2. Fallback model (Qwen3.6 via local) — if primary fails
+ Health check: when on fallback, probes primary every 30 min
+ and switches back when it recovers.
+ """
+ self._current_prompt = prompt
+
+ active_cfg, is_fallback = _get_active_text_config(self.config)
+ base_url = active_cfg.get("base_url", "")
+ api_key = active_cfg.get("api_key", "")
+ model = active_cfg.get("model", "")
+
+ if not base_url or not model:
+ logger.warning("Incomplete runtime config: base_url=%s model=%s",
+ bool(base_url), model)
+ return None
+
+ result = self._call_chat_completions(base_url, api_key, model)
+
+ # If primary failed, try fallback
+ if result is None and not is_fallback:
+ fallback = self.config.get("fallback", {})
+ if fallback.get("base_url") and fallback.get("model"):
+ logger.warning("Primary model failed, trying fallback: %s",
+ fallback.get("model"))
+ result = self._call_chat_completions(
+ fallback["base_url"], fallback.get("api_key", ""),
+ fallback["model"],
+ )
+ if result is not None:
+ _switch_to_fallback()
+
+ return result
+
+ def _call_chat_completions(
+ self, base_url: str, api_key: str, model: str,
+ ) -> Optional[str]:
+ """Call OpenAI-compatible /chat/completions endpoint."""
+ try:
+ import requests
+ url = f"{base_url.rstrip('/')}/chat/completions"
+ headers = {"Content-Type": "application/json"}
+ if api_key:
+ headers["Authorization"] = f"Bearer {api_key}"
+
+ resp = requests.post(
+ url,
+ headers=headers,
+ json={
+ "model": model,
+ "messages": [
+ {"role": "system", "content": _SYSTEM_PROMPT},
+ {"role": "user", "content": self._current_prompt or ""},
+ ],
+ "temperature": 0.3,
+ },
+ timeout=300,
+ )
+ if resp.status_code == 200:
+ data = resp.json()
+ return data.get("choices", [{}])[0].get("message", {}).get("content", "")
+ else:
+ logger.debug("Model call failed: %d %s", resp.status_code, resp.text[:200])
+ except Exception as exc:
+ logger.debug("Chat completions call failed: %s", exc)
+ return None
+
+ # ── Multimodal Call ───────────────────────────────────────────────────
+
+ def call_multimodal(self, prompt: str, images: list = None) -> Optional[str]:
+ """Call multimodal model with text and optional images.
+
+ Routes to local multimodal model (gemma-4-26b-a4b-it-4bit) when
+ images are involved. Falls back to text model if no images.
+
+ Args:
+ prompt: Text prompt.
+ images: List of image data, each item is either:
+ - URL string (http/https/data:image)
+ - bytes (raw image data, auto-encoded to base64)
+
+ Returns:
+ Model response text, or None on failure.
+ """
+ mm = self.config.get("multimodal", {})
+ if not mm or not mm.get("base_url"):
+ logger.debug("No multimodal model configured, falling back to text")
+ return self._call_model(prompt)
+
+ # Build content with images
+ content = [{"type": "text", "text": prompt}]
+ for img in (images or []):
+ if isinstance(img, bytes):
+ import base64
+ b64 = base64.b64encode(img).decode()
+ content.append({
+ "type": "image_url",
+ "image_url": {"url": f"data:image/png;base64,{b64}"},
+ })
+ elif isinstance(img, str):
+ content.append({
+ "type": "image_url",
+ "image_url": {"url": img},
+ })
+
+ try:
+ from openai import OpenAI
+ client = OpenAI(
+ base_url=mm["base_url"].rstrip("/") + ("/v1" if not mm["base_url"].rstrip("/").endswith("/v1") else ""),
+ api_key=mm.get("api_key") or "no-key",
+ )
+ resp = client.chat.completions.create(
+ model=mm["model"],
+ messages=[{"role": "user", "content": content}],
+ temperature=0.3,
+ max_tokens=2000,
+ timeout=120,
+ )
+ return resp.choices[0].message.content
+ except Exception as exc:
+ logger.debug("Multimodal call failed: %s", exc)
+ return None
+
+ # ── Reflection Parsing ────────────────────────────────────────────────
+
    def _parse_reflection(
        self,
        reflection_text: str,
        period_start: float,
        period_end: float,
        sessions_analyzed: int,
        avg_score: float,
        error_analysis: ErrorAnalysis,
        waste_analysis: WasteAnalysis,
        code_analysis: Optional[CodeChangeAnalysis] = None,
    ) -> ReflectionReport:
        """Parse model output into structured ReflectionReport.

        Extraction cascade:
        1. Direct JSON parse
        2. Strip markdown ```json ... ``` wrapper, retry JSON
        3. Extract JSON object via regex (handle trailing text)
        4. Text-based section extraction (fallback)

        The text fallback never fails: if nothing can be extracted, the
        report simply carries empty pattern/recommendation lists.

        Args:
            reflection_text: Raw model response to parse.
            period_start: Start of the analyzed period (epoch seconds).
            period_end: End of the analyzed period (epoch seconds).
            sessions_analyzed: Number of sessions the analysis covered.
            avg_score: Mean composite session score.
            error_analysis: Error analysis whose summary goes on the report.
            waste_analysis: Waste analysis whose summary goes on the report.
            code_analysis: Optional git change analysis for the summary.
        """
        worst_patterns = []
        best_patterns = []
        recommendations = []
        tool_insights = {}

        text = reflection_text.strip()

        # 1. Direct JSON parse
        data = _try_parse_json(text)

        if data is None:
            # 2. Strip markdown wrapper
            m = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
            if m:
                data = _try_parse_json(m.group(1))

        if data is None:
            # 3. Regex extract first JSON object (only matches flat,
            # non-nested objects containing one of the expected keys)
            m = re.search(r'\{[^{}]*"(?:worst|best|recommendations)"[^{}]*\}', text, re.DOTALL)
            if m:
                data = _try_parse_json(m.group(0))

        if data is None:
            # 3.5. Broader regex — find outermost braces
            start = text.find('{')
            end = text.rfind('}')
            if start != -1 and end > start:
                data = _try_parse_json(text[start:end + 1])

        if data is not None:
            # `or []` / `or {}` guards against explicit nulls in the JSON.
            worst_patterns = data.get("worst_patterns") or []
            best_patterns = data.get("best_patterns") or []
            recommendations = data.get("recommendations") or []
            tool_insights = data.get("tool_insights") or {}
        else:
            # 4. Text-based extraction: walk the response line by line,
            # tracking which section a heading opened, and collect bullet
            # ("- " / "* " / "• ") and numbered ("1. " / "1) ") items
            # into that section's list.
            section = None
            for line in text.split("\n"):
                stripped = line.strip()
                lower = stripped.lower()
                if ("worst" in lower and "pattern" in lower) or "最差" in stripped or "错误模式" in stripped:
                    section = "worst"
                elif ("best" in lower and "pattern" in lower) or "最佳" in stripped or "成功" in stripped:
                    section = "best"
                elif ("recommend" in lower) or "建议" in stripped:
                    section = "rec"
                elif stripped.startswith("- ") or stripped.startswith("* ") or stripped.startswith("• "):
                    item = stripped.lstrip("-*• ").strip()
                    if section == "worst":
                        worst_patterns.append(item)
                    elif section == "best":
                        best_patterns.append(item)
                    elif section == "rec":
                        recommendations.append(item)
                elif len(stripped) > 2 and stripped[0].isdigit() and stripped[1] in ".)" and stripped[2] == " ":
                    item = stripped[3:].strip()
                    if section == "worst":
                        worst_patterns.append(item)
                    elif section == "best":
                        best_patterns.append(item)
                    elif section == "rec":
                        recommendations.append(item)

        return ReflectionReport(
            period_start=period_start,
            period_end=period_end,
            sessions_analyzed=sessions_analyzed,
            avg_score=avg_score,
            error_summary=error_analysis.summary(),
            waste_summary=waste_analysis.summary(),
            worst_patterns=worst_patterns,
            best_patterns=best_patterns,
            tool_insights=tool_insights,
            recommendations=recommendations,
            code_change_summary=code_analysis.summary() if code_analysis else "",
            # NOTE(review): reads a top-level config["model"] key, while
            # _call_model resolves the model via _get_active_text_config —
            # confirm the key exists at this level or this is always "unknown".
            model_used=self.config.get("model", "unknown"),
        )
+
+
+# ── Default Prompt Template ──────────────────────────────────────────────
+
# System prompt for every reflection model call.  Deliberately terse (and
# in Chinese) to save tokens; it demands strict JSON output — no markdown —
# with worst/best patterns, per-tool insights and at most 5 recommendations.
# This is a runtime string forming the model contract: do not translate.
_SYSTEM_PROMPT = (
    "你是 Hermes Agent 性能分析引擎。分析运行数据+代码变更,输出严格JSON(无markdown)。\n"
    "格式:\n"
    '{"worst_patterns":["模式(工具+场景+根因)"],"best_patterns":["成功经验"],'
    '"tool_insights":{"工具":{"sr":0.95,"ms":500,"rec":"建议"}},'
    '"recommendations":["做什么|效果|风险(l/m/h)|验证"]}\n'
    "重点:系统性错误>偶发,错误连锁,策略vs工具问题,重复操作,代码设计合理性,自我进化状态,"
    "可固化流程。≤5条建议,优先高影响低风险。无数据时输出空数组。"
)
+
+
# Fallback user-prompt template used when prompts/reflection.md is missing
# (see _build_reflection_prompt).  Placeholders {period_range},
# {sessions_count}, {avg_score}, {total_invocations}, {success_rate} and
# {data_json} are substituted with str.replace, not str.format.
_DEFAULT_REFLECTION_PROMPT = """## 概况
- 时段: {period_range}
- Session 数: {sessions_count}, 平均质量: {avg_score}
- 工具调用: {total_invocations} 次, 成功率 {success_rate}%

## 数据
{data_json}
"""
+
+
+def _try_parse_json(text: str) -> Optional[dict]:
+ """Try to parse JSON, returning None on any failure."""
+ try:
+ data = json.loads(text)
+ if isinstance(data, dict):
+ return data
+ except (json.JSONDecodeError, ValueError):
+ pass
+ return None
diff --git a/self_evolution/rule_engine.py b/self_evolution/rule_engine.py
new file mode 100644
index 0000000000..5aad88950a
--- /dev/null
+++ b/self_evolution/rule_engine.py
@@ -0,0 +1,101 @@
+"""
+Self Evolution Plugin — Rule Engine (Strategy Matching)
+========================================================
+
+Conditional strategy matching engine.
+
+Design reference: Claude Code plugins/hookify/core/rule_engine.py
+ - LRU-cached regex compilation (max 128)
+ - Multiple operators: regex_match, contains, equals, not_contains
+ - All conditions must match (AND logic)
+ - Severity levels: high, medium, low
+"""
+
+from __future__ import annotations
+
+import re
+from functools import lru_cache
+from typing import Any, Dict, List, Optional
+
+from self_evolution.models import StrategyRule, StrategyCondition
+
+
+@lru_cache(maxsize=128)
+def _compile_pattern(pattern: str) -> re.Pattern:
+ """Compile and cache a regex pattern."""
+ return re.compile(pattern, re.IGNORECASE)
+
+
class StrategyRuleEngine:
    """Evaluate strategy rules against a session context.

    A strategy matches when ALL of its conditions hold (AND logic); a
    strategy with no conditions always matches.  The engine itself is
    stateless and safe to share.
    """

    def match_strategies(
        self,
        strategies: List[StrategyRule],
        context: Dict[str, Any],
    ) -> List[StrategyRule]:
        """Return enabled strategies whose conditions match *context*.

        Args:
            strategies: Candidate rules; disabled ones are skipped.
            context: Field → value mapping for the current session.

        Returns:
            Matching strategies, preserving their input order.
        """
        matched = []
        for strategy in strategies:
            if not strategy.enabled:
                continue
            # No conditions = always match.
            if not strategy.conditions or self._conditions_match(
                strategy.conditions, context
            ):
                matched.append(strategy)
        return matched

    def _conditions_match(
        self,
        conditions: List[StrategyCondition],
        context: Dict[str, Any],
    ) -> bool:
        """True iff every condition holds (AND logic).

        Context values are stringified; missing fields become "".
        """
        for cond in conditions:
            field_value = str(context.get(cond.field, ""))
            if not self._check_operator(cond.operator, cond.pattern, field_value):
                return False
        return True

    def _check_operator(self, op: str, pattern: str, value: str) -> bool:
        """Apply a single operator check; unknown operators never match.

        Regex matching is case-insensitive (via _compile_pattern); an
        invalid regex counts as a non-match instead of raising.
        """
        try:
            if op == "regex_match":
                return bool(_compile_pattern(pattern).search(value))
            elif op == "contains":
                return pattern in value
            elif op == "equals":
                return pattern == value
            elif op == "not_contains":
                return pattern not in value
            elif op == "starts_with":
                return value.startswith(pattern)
            elif op == "ends_with":
                return value.endswith(pattern)
            else:
                return False
        except re.error:
            return False

    def format_hints(self, strategies: List[StrategyRule], max_chars: int = 0) -> str:
        """Format matched strategies into a system hint string.

        Args:
            strategies: Strategies to render (typically pre-filtered).
            max_chars: If > 0, cap the total output — including joining
                newlines — at this many characters; a hint that would
                overflow the budget is dropped, never cut mid-line.

        Returns:
            The formatted hint block, or "" for an empty input.
        """
        if not strategies:
            return ""

        lines = ["[自我进化策略提示]"]
        for s in strategies:
            type_prefix = {"hint": "💡", "avoid": "⚠️", "prefer": "✅"}.get(
                s.strategy_type, "💡"
            )
            line = f"{type_prefix} {s.name}: {s.hint_text}"
            # Budget check includes the newline that joining will add; the
            # previous version omitted it, so output could exceed
            # max_chars by one character.
            if max_chars and len("\n".join(lines)) + 1 + len(line) > max_chars:
                break
            lines.append(line)

        return "\n".join(lines)
diff --git a/self_evolution/strategy_compressor.py b/self_evolution/strategy_compressor.py
new file mode 100644
index 0000000000..fcbd6b20b0
--- /dev/null
+++ b/self_evolution/strategy_compressor.py
@@ -0,0 +1,141 @@
+"""
+Self Evolution Plugin — Strategy Compressor
+=============================================
+
+Compresses and merges redundant strategy rules into concise hints.
+
+Called after dream consolidation to keep strategies.json compact.
+Each hint_text must be ≤ 30 chars; strategies without conditions are
+either merged into conditional rules or discarded.
+"""
+
+from __future__ import annotations
+
+import logging
+import re
+from typing import Any, Dict, List
+
+logger = logging.getLogger(__name__)
+
# Maximum allowed length for hint_text (characters)
MAX_HINT_LENGTH = 30

# Keyword clusters used to group similar strategies.  A rule whose name or
# hint_text contains any keyword of a cluster is merged into that cluster's
# canonical hint + condition (see _cluster_rules).  Hints/keywords mix
# Chinese and English on purpose — they match runtime rule text.
_CLUSTERS: List[Dict[str, Any]] = [
    {
        # Path-validation discipline around bash usage.
        "keywords": ["bash", "路径", "path", "校验", "预检", "验证", "存在"],
        "hint": "bash前先read验证路径",
        "condition": {"field": "tool_name", "operator": "contains", "pattern": "bash"},
    },
    {
        # API debugging: degrade to read-only probing on failure.
        "keywords": ["api", "调试", "debug", "降级", "只读", "探查"],
        "hint": "API失败时降级只读探查",
        "condition": {"field": "task_type", "operator": "contains", "pattern": "api"},
    },
    {
        # Browser operations need timeout protection.
        "keywords": ["browser", "浏览器", "timeout", "超时", "网页"],
        "hint": "浏览器操作设置超时保护",
        "condition": {"field": "browser", "operator": "contains", "pattern": "browser"} if False else {"field": "tool_name", "operator": "contains", "pattern": "browser"},
    },
    {
        # Avoid repeated retries of the same operation.
        # NOTE(review): the empty condition yields a merged rule with no
        # conditions, which compress_strategies then drops — confirm this
        # cluster is not intended to survive compression.
        "keywords": ["重试", "retry", "浪费", "重复", "循环"],
        "hint": "避免重复重试相同操作",
        "condition": {},
    },
]
+
+
def compress_strategies(rules: List[dict]) -> List[dict]:
    """Compress strategy rules by merging similar ones.

    Returns a new list of rules with:
      - Duplicate hint_texts removed
      - Similar rules merged into cluster summaries
      - hint_text truncated to MAX_HINT_LENGTH
      - Non-matching rules dropped if they have no conditions

    Unlike the previous implementation, the caller's rule dicts are never
    mutated: truncation is applied to shallow copies (nested lists such as
    "conditions" are still shared), which makes the "returns a new list"
    contract actually hold for the dict entries too.
    """
    if not rules:
        return []

    # Deduplicate by (case-folded, stripped) hint_text — first one wins.
    seen_hints: set[str] = set()
    unique: list[dict] = []
    for r in rules:
        key = r.get("hint_text", "").strip().lower()
        if key and key not in seen_hints:
            seen_hints.add(key)
            unique.append(r)

    # Cluster similar rules into canonical merged rules.
    clustered = _cluster_rules(unique)

    # Enforce constraints: hint_text ≤ MAX_HINT_LENGTH chars, must have
    # conditions.  Track the identity of source rules already emitted so
    # the manual/default pass below does not re-add them.
    result: list[dict] = []
    emitted_ids: set[int] = set()
    for r in clustered:
        hint = r.get("hint_text", "").strip()

        # Skip rules without conditions (they won't be injected anyway).
        if not r.get("conditions", []):
            logger.debug("Dropping unconditioned strategy: %s", hint[:40])
            continue

        out = dict(r)  # copy before truncating — never mutate the input
        if len(hint) > MAX_HINT_LENGTH:
            out["hint_text"] = hint[:MAX_HINT_LENGTH]
        result.append(out)
        emitted_ids.add(id(r))

    # Also keep any manual/default rules that already have conditions and
    # were not emitted above (e.g. ones absorbed into a cluster merge).
    for r in unique:
        if r.get("source") not in ("manual", "default") or not r.get("conditions"):
            continue
        if id(r) in emitted_ids or r in result:
            continue
        out = dict(r)
        hint = r.get("hint_text", "").strip()
        if len(hint) > MAX_HINT_LENGTH:
            out["hint_text"] = hint[:MAX_HINT_LENGTH]
        result.append(out)

    logger.info("Compressed strategies: %d → %d rules", len(rules), len(result))
    return result
+
+
def _cluster_rules(rules: list[dict]) -> list[dict]:
    """Group rules by keyword clusters and merge each group into one rule.

    Rules matching no cluster pass through unchanged, after all merged
    cluster rules.
    """
    absorbed: set[int] = set()
    out: list[dict] = []

    for cluster in _CLUSTERS:
        members = []
        for idx, rule in enumerate(rules):
            haystack = f"{rule.get('name', '')} {rule.get('hint_text', '')}".lower()
            if any(kw in haystack for kw in cluster["keywords"]):
                members.append(rule)
                absorbed.add(idx)

        if not members:
            continue

        # Collapse the whole group into one canonical rule; the id and
        # created_at are inherited from the group's first member.
        anchor = members[0]
        cond = cluster.get("condition")
        out.append({
            "id": anchor.get("id", ""),
            "name": cluster["hint"],
            "type": "learned",
            "description": cluster["hint"],
            "hint_text": cluster["hint"],
            "conditions": [cond] if cond else [],
            "severity": "medium",
            "enabled": True,
            "source": "learned",
            "created_at": anchor.get("created_at", 0),
        })

    # Rules that matched no cluster pass through untouched.
    out.extend(rule for idx, rule in enumerate(rules) if idx not in absorbed)
    return out
diff --git a/self_evolution/strategy_injector.py b/self_evolution/strategy_injector.py
new file mode 100644
index 0000000000..840fdfbd1f
--- /dev/null
+++ b/self_evolution/strategy_injector.py
@@ -0,0 +1,124 @@
+"""
+Self Evolution Plugin — Strategy Injector
+===========================================
+
+Injects learned strategy hints into sessions via pre_llm_call hook.
+
+Design reference: Claude Code plugins/learning-output-style/
+ - SessionStart hook injects behavioral context automatically
+ - Equivalent to CLAUDE.md but more flexible and distributable
+ - No core modification needed
+"""
+
+from __future__ import annotations
+
+import logging
+import time
+from typing import Any, Dict, Optional
+
+from self_evolution.models import StrategyRule
+from self_evolution.rule_engine import StrategyRuleEngine
+
+logger = logging.getLogger(__name__)
+
# Shared matching engine; StrategyRuleEngine holds no state, so a single
# module-level instance is safe to reuse across calls.
_engine = StrategyRuleEngine()

# ── TTL-based cache to avoid reading strategies.json on every LLM call ────

# Cached strategy list (None = not loaded or invalidated), the wall-clock
# time it was loaded, and how long it stays fresh.
_cached_strategies: list | None = None
_cache_ts: float = 0.0
_CACHE_TTL: float = 60.0  # seconds
+
+
def _load_active_strategies() -> list:
    """Load active strategies from the strategy store (cached for _CACHE_TTL).

    Returns only enabled rules, parsed into StrategyRule objects.  A
    malformed rule entry is skipped with a warning instead of aborting the
    whole load — one bad record must not disable strategy injection.
    """
    global _cached_strategies, _cache_ts

    now = time.time()
    if _cached_strategies is not None and (now - _cache_ts) < _CACHE_TTL:
        return _cached_strategies

    # Local import: presumably avoids a circular import at module load —
    # confirm before hoisting to the top of the file.
    from self_evolution.strategy_store import StrategyStore

    store = StrategyStore()
    rules = store.load().get("rules", [])

    strategies = []
    for rule_data in rules:
        if not rule_data.get("enabled", True):
            continue
        try:
            strategies.append(StrategyRule.from_dict(rule_data))
        except Exception as exc:
            logger.warning("Skipping malformed strategy rule: %s", exc)

    _cached_strategies = strategies
    _cache_ts = now
    return strategies
+
+
def invalidate_cache():
    """Invalidate the strategy cache (call after strategy updates).

    Also resets the cache timestamp so the state stays consistent with
    the load path: the next _load_active_strategies() call is guaranteed
    to re-read the store.
    """
    global _cached_strategies, _cache_ts
    _cached_strategies = None
    _cache_ts = 0.0
+
+
_MAX_INJECT_STRATEGIES = 3  # max number of strategies injected per call
_MAX_HINT_CHARS = 100  # total character budget for the injected hint block
_MAX_SINGLE_HINT = 30  # max length of a single hint_text (characters)
+
def inject_hints(kwargs: dict) -> Optional[str]:
    """Pre-llm-call hook: inject learned strategy hints.

    Rules:
    - Strategies without conditions are skipped (must be condition-based).
    - hint_text longer than _MAX_SINGLE_HINT chars are skipped.
    - At most _MAX_INJECT_STRATEGIES hints, total ≤ _MAX_HINT_CHARS.

    Returns the formatted hint string, or None when nothing applies.
    """
    active = _load_active_strategies()
    if not active:
        return None

    # Match against the current session's context.
    matched = _engine.match_strategies(active, _build_context(kwargs))
    if not matched:
        return None

    # Single pass: keep condition-based, short-enough, first-seen hints,
    # stopping once the injection cap is reached.
    picked: list = []
    seen: set[str] = set()
    for strategy in matched:
        if not strategy.conditions:
            continue  # condition-less strategies are never injected
        hint = strategy.hint_text.strip()
        if len(hint) > _MAX_SINGLE_HINT:
            continue  # overly long hints are dropped
        key = hint.lower()
        if key in seen:
            continue  # deduplicate by hint content
        seen.add(key)
        picked.append(strategy)
        if len(picked) == _MAX_INJECT_STRATEGIES:
            break

    if not picked:
        return None

    # Render within the overall character budget.
    return _engine.format_hints(picked, max_chars=_MAX_HINT_CHARS)
+
+
+def _build_context(kwargs: dict) -> dict:
+ """Build matching context from hook kwargs."""
+ return {
+ "platform": kwargs.get("platform", ""),
+ "model": kwargs.get("model", ""),
+ "task_type": kwargs.get("task_type", ""),
+ "tool_name": kwargs.get("tool_name", ""),
+ }
diff --git a/self_evolution/strategy_store.py b/self_evolution/strategy_store.py
new file mode 100644
index 0000000000..5de8cab7c9
--- /dev/null
+++ b/self_evolution/strategy_store.py
@@ -0,0 +1,72 @@
+"""
+Self Evolution Plugin — Strategy Store
+========================================
+
+Manages strategy rules with version history and rollback support.
+
+Strategies stored at ~/.hermes/self_evolution/strategies.json
+Archives at ~/.hermes/self_evolution/archive/strategies_v{N}.json
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+logger = logging.getLogger(__name__)
+
+from self_evolution.paths import DATA_DIR as STRATEGIES_DIR, STRATEGIES_FILE, ARCHIVE_DIR
+
+
class StrategyStore:
    """Load, save, and version strategy rules.

    Current strategies live at STRATEGIES_FILE; archived versions for
    rollback live under ARCHIVE_DIR as strategies_v{N}.json.
    """

    def load(self) -> dict:
        """Return the current strategies document.

        Falls back to an empty v0 document when the file is missing or
        unreadable; a corrupt store is logged instead of silently ignored.
        """
        if not STRATEGIES_FILE.exists():
            return {"version": 0, "rules": []}
        try:
            return json.loads(STRATEGIES_FILE.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, OSError) as exc:
            logger.warning("Failed to load strategies, using empty store: %s", exc)
            return {"version": 0, "rules": []}

    def save(self, data: dict):
        """Atomically persist *data* to STRATEGIES_FILE.

        Writes to a sibling temp file first, then renames it over the
        target (Path.replace is atomic on POSIX), so a crash mid-write
        can no longer leave a truncated/corrupt strategies.json behind.
        """
        STRATEGIES_DIR.mkdir(parents=True, exist_ok=True)
        tmp_path = STRATEGIES_FILE.with_name(STRATEGIES_FILE.name + ".tmp")
        tmp_path.write_text(
            json.dumps(data, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        tmp_path.replace(STRATEGIES_FILE)

    def archive(self, version: int):
        """Archive the current strategies file for rollback.

        No-op when there is nothing to archive yet.
        """
        if not STRATEGIES_FILE.exists():
            return
        ARCHIVE_DIR.mkdir(parents=True, exist_ok=True)
        archive_path = ARCHIVE_DIR / f"strategies_v{version}.json"
        archive_path.write_text(
            STRATEGIES_FILE.read_text(encoding="utf-8"),
            encoding="utf-8",
        )
        logger.info("Archived strategies version %d", version)

    def load_archive(self, version: int) -> Optional[dict]:
        """Load an archived version, or None if missing/unreadable."""
        archive_path = ARCHIVE_DIR / f"strategies_v{version}.json"
        if not archive_path.exists():
            return None
        try:
            return json.loads(archive_path.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, OSError) as exc:
            logger.warning("Failed to load strategies archive v%d: %s", version, exc)
            return None

    def restore(self, data: dict):
        """Restore strategies from an archived document."""
        self.save(data)
        logger.info("Restored strategies from archive")

    def get_version(self) -> int:
        """Return the current version number (0 for a fresh/empty store)."""
        return self.load().get("version", 0)
diff --git a/tests/test_self_evolution.py b/tests/test_self_evolution.py
new file mode 100644
index 0000000000..4dc04ccc6a
--- /dev/null
+++ b/tests/test_self_evolution.py
@@ -0,0 +1,1253 @@
+"""
+Tests for the Self Evolution Plugin.
+
+Covers:
+ - quality_scorer: composite score computation
+ - models: dataclass serialization / deserialization
+ - db: SQLite CRUD operations (temp DB)
+ - hooks: telemetry collection + signal detection
+ - rule_engine: strategy condition matching
+ - strategy_store: file-based persistence + archive
+ - evolution_proposer: proposal generation + dedup
+ - evolution_executor: execute + tracking + rollback
+ - reflection_engine: JSON/text parsing of model output
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import sqlite3
+import tempfile
+import time
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+
+# ============================================================================
+# Fixtures
+# ============================================================================
+
@pytest.fixture(autouse=True)
def _tmp_evolution_db(tmp_path, monkeypatch):
    """Redirect self_evolution DB to a temp directory for every test.

    Autouse: every test in this module runs against an isolated SQLite
    database under ``tmp_path`` instead of the real data directory.
    Yields the ``self_evolution.db`` module so tests can call its CRUD
    helpers directly.
    """
    db_dir = tmp_path / ".hermes" / "self_evolution"
    db_dir.mkdir(parents=True, exist_ok=True)
    db_path = db_dir / "evolution.db"

    # Patch centralized paths module
    import self_evolution.paths as paths_mod
    monkeypatch.setattr(paths_mod, "DATA_DIR", db_dir)
    monkeypatch.setattr(paths_mod, "DB_PATH", db_path)
    monkeypatch.setattr(paths_mod, "STRATEGIES_FILE", db_dir / "strategies.json")
    monkeypatch.setattr(paths_mod, "ARCHIVE_DIR", db_dir / "archive")
    monkeypatch.setattr(paths_mod, "SKILLS_DIR", tmp_path / ".hermes" / "skills" / "learned")
    monkeypatch.setattr(paths_mod, "MEMORIES_DIR", tmp_path / ".hermes" / "memories")

    # Also patch the imported names in db module
    # (the db module binds these constants at import time, so patching the
    # paths module alone would not affect it).
    import self_evolution.db as db_mod
    monkeypatch.setattr(db_mod, "DB_DIR", db_dir)
    monkeypatch.setattr(db_mod, "DB_PATH", db_path)

    # Initialize schema
    db_mod.init_db()
    yield db_mod
    # Clean up thread-local connection after each test
    db_mod.close_connection()
+
+
@pytest.fixture
def sample_session_data():
    """Standard session data for quality scoring tests.

    Represents a small, successfully completed session; individual tests
    mutate the returned dict to exercise other scoring paths.
    """
    return {
        "session_id": "test-session-001",
        "completed": True,
        "iterations": 5,
        "tool_call_count": 5,
        "message_count": 3,
        "duration_seconds": 120,
        "model": "test-model",
        "platform": "test",
        # bash/read/write tools — classified as the "coding" category
        "tool_names": ["bash", "read", "write"],
    }
+
+
+# ============================================================================
+# 1. Quality Scorer
+# ============================================================================
+
class TestQualityScorer:
    """Test the composite quality score computation.

    ``compute_score`` returns a score object with component fields
    (completion_rate, efficiency_score, cost_efficiency, satisfaction_proxy),
    a weighted ``composite`` value, and a ``task_category`` derived from the
    tools used in the session.
    """

    def test_completed_session_high_score(self, sample_session_data):
        from self_evolution.quality_scorer import compute_score

        # A short, fully completed session should land comfortably above 0.5.
        score = compute_score(sample_session_data)
        assert score.composite > 0.5, f"Completed session should score > 0.5, got {score.composite}"
        assert score.completion_rate == 1.0
        assert score.task_category == "coding"

    def test_interrupted_session_medium_score(self, sample_session_data):
        from self_evolution.quality_scorer import compute_score

        # Interrupted sessions are credited half completion.
        sample_session_data["completed"] = False
        sample_session_data["interrupted"] = True
        score = compute_score(sample_session_data)
        assert score.completion_rate == 0.5

    def test_partial_session(self, sample_session_data):
        from self_evolution.quality_scorer import compute_score

        # Partially completed sessions are credited 0.3 completion.
        sample_session_data["completed"] = False
        sample_session_data["partial"] = True
        score = compute_score(sample_session_data)
        assert score.completion_rate == 0.3

    def test_efficiency_degrades_with_iterations(self, sample_session_data):
        from self_evolution.quality_scorer import compute_score

        # Low iterations => high efficiency
        sample_session_data["iterations"] = 2
        score_low = compute_score(sample_session_data)

        # High iterations => low efficiency
        sample_session_data["iterations"] = 50
        score_high = compute_score(sample_session_data)

        assert score_low.efficiency_score > score_high.efficiency_score

    def test_budget_exhaustion_lowers_satisfaction(self, sample_session_data):
        from self_evolution.quality_scorer import compute_score

        sample_session_data["max_iterations"] = 5
        sample_session_data["iterations"] = 5  # exactly at limit
        score = compute_score(sample_session_data)
        assert score.satisfaction_proxy < 0.7  # below baseline

    def test_single_turn_completion_high_satisfaction(self, sample_session_data):
        from self_evolution.quality_scorer import compute_score

        # One user message + one assistant reply, completed => highest proxy.
        sample_session_data["message_count"] = 2
        sample_session_data["completed"] = True
        score = compute_score(sample_session_data)
        assert score.satisfaction_proxy == 0.9

    def test_task_category_coding(self, sample_session_data):
        from self_evolution.quality_scorer import compute_score

        sample_session_data["tool_names"] = ["bash", "write"]
        score = compute_score(sample_session_data)
        assert score.task_category == "coding"

    def test_task_category_web_research(self, sample_session_data):
        from self_evolution.quality_scorer import compute_score

        sample_session_data["tool_names"] = ["web_search", "browser"]
        score = compute_score(sample_session_data)
        assert score.task_category == "web_research"

    def test_task_category_file_analysis(self, sample_session_data):
        from self_evolution.quality_scorer import compute_score

        sample_session_data["tool_names"] = ["read", "grep", "glob"]
        score = compute_score(sample_session_data)
        assert score.task_category == "file_analysis"

    def test_task_category_general(self, sample_session_data):
        from self_evolution.quality_scorer import compute_score

        # No tools used => falls back to the "general" category.
        sample_session_data["tool_names"] = []
        score = compute_score(sample_session_data)
        assert score.task_category == "general"

    def test_tool_names_as_string(self, sample_session_data):
        from self_evolution.quality_scorer import compute_score

        # tool_names may arrive as a comma-separated string (e.g. from a DB
        # row) and must be handled the same as a list.
        sample_session_data["tool_names"] = "bash,read,write"
        score = compute_score(sample_session_data)
        assert score.task_category == "coding"

    def test_composite_weighted_sum(self, sample_session_data):
        """Verify composite = 0.4*completion + 0.2*efficiency + 0.15*cost + 0.25*satisfaction."""
        from self_evolution.quality_scorer import compute_score

        score = compute_score(sample_session_data)
        expected = (
            0.40 * score.completion_rate
            + 0.20 * score.efficiency_score
            + 0.15 * score.cost_efficiency
            + 0.25 * score.satisfaction_proxy
        )
        # composite is rounded to 3 decimals by the scorer.
        assert abs(score.composite - round(expected, 3)) < 0.001
+
+
+# ============================================================================
+# 2. Models — Serialization
+# ============================================================================
+
class TestModels:
    """Test data model serialization and deserialization.

    Covers the ``to_db_row`` adapters (dataclass -> flat SQLite row),
    ``StrategyRule`` dict round-tripping, the A/B-test decision properties on
    ``ImprovementUnit``, and the human-readable ``summary()`` helpers.
    """

    def test_quality_score_to_db_row(self):
        from self_evolution.models import QualityScore

        qs = QualityScore(
            session_id="s1",
            composite=0.85,
            completion_rate=1.0,
            efficiency_score=0.7,
            cost_efficiency=0.9,
            satisfaction_proxy=0.8,
            task_category="coding",
            model="test",
        )
        row = qs.to_db_row()
        assert row["session_id"] == "s1"
        # Field is renamed composite -> composite_score in the DB row.
        assert row["composite_score"] == 0.85
        assert row["task_category"] == "coding"

    def test_reflection_report_to_db_row(self):
        from self_evolution.models import ReflectionReport

        report = ReflectionReport(
            period_start=1000.0,
            period_end=2000.0,
            sessions_analyzed=5,
            avg_score=0.75,
            worst_patterns=["pattern1", "pattern2"],
            best_patterns=["good1"],
            recommendations=["rec1"],
        )
        row = report.to_db_row()
        assert row["sessions_analyzed"] == 5
        # List fields are stored as JSON-encoded strings.
        assert json.loads(row["worst_patterns"]) == ["pattern1", "pattern2"]
        assert json.loads(row["best_patterns"]) == ["good1"]

    def test_proposal_to_db_row(self):
        from self_evolution.models import Proposal

        p = Proposal(
            id="prop-001",
            proposal_type="strategy",
            title="Test Proposal",
            description="A test proposal",
            risk_assessment="low",
        )
        row = p.to_db_row()
        assert row["id"] == "prop-001"
        assert row["proposal_type"] == "strategy"
        # New proposals default to awaiting user approval.
        assert row["status"] == "pending_approval"

    def test_improvement_unit_should_revert(self):
        from self_evolution.models import ImprovementUnit

        unit = ImprovementUnit(
            id="u1",
            proposal_id="p1",
            change_type="strategy",
            baseline_score=0.8,
            current_score=0.6,
            sessions_sampled=5,
            max_regression=0.10,
        )
        # Regression = 0.2 > max_regression 0.10 => should revert
        assert unit.should_revert is True

    def test_improvement_unit_should_not_revert(self):
        from self_evolution.models import ImprovementUnit

        unit = ImprovementUnit(
            id="u2",
            proposal_id="p2",
            change_type="strategy",
            baseline_score=0.8,
            current_score=0.75,
            sessions_sampled=5,
            max_regression=0.10,
        )
        # Regression = 0.05 < max_regression 0.10 => should NOT revert
        assert unit.should_revert is False

    def test_improvement_unit_should_promote(self):
        from self_evolution.models import ImprovementUnit

        unit = ImprovementUnit(
            id="u3",
            proposal_id="p3",
            change_type="strategy",
            baseline_score=0.7,
            current_score=0.8,
            sessions_sampled=15,
            min_sessions=10,
            min_improvement=0.05,
        )
        # Improvement = 0.1 >= min_improvement 0.05 and sessions >= min_sessions
        assert unit.should_promote is True

    def test_improvement_unit_should_not_promote_too_few_sessions(self):
        from self_evolution.models import ImprovementUnit

        # Large improvement (0.2) but only 5 of the required 10 sessions:
        # promotion must wait for more evidence.
        unit = ImprovementUnit(
            id="u4",
            proposal_id="p4",
            change_type="strategy",
            baseline_score=0.7,
            current_score=0.9,
            sessions_sampled=5,
            min_sessions=10,
            min_improvement=0.05,
        )
        assert unit.should_promote is False

    def test_strategy_rule_roundtrip(self):
        from self_evolution.models import StrategyRule, StrategyCondition

        # to_dict() followed by from_dict() must preserve nested conditions.
        rule = StrategyRule(
            id="sr1",
            name="Avoid large file reads",
            strategy_type="avoid",
            description="Don't read files > 1MB",
            conditions=[
                StrategyCondition(field="tool_name", operator="equals", pattern="read"),
            ],
            hint_text="Use grep instead",
            severity="high",
        )
        d = rule.to_dict()
        restored = StrategyRule.from_dict(d)
        assert restored.id == "sr1"
        assert restored.strategy_type == "avoid"
        assert len(restored.conditions) == 1
        assert restored.conditions[0].field == "tool_name"

    def test_error_analysis_summary(self):
        from self_evolution.models import ErrorAnalysis, ToolFailure

        ea = ErrorAnalysis(
            tool_failures=[
                ToolFailure(tool_name="bash", error_type="timeout", count=3),
            ],
            retry_patterns=[],
            incomplete_sessions=["s1"],
            user_corrections=2,
        )
        # summary() is a Chinese-language report; assert key fragments only.
        summary = ea.summary()
        assert "bash" in summary
        assert "3" in summary
        assert "未完成" in summary
        assert "纠正" in summary

    def test_waste_analysis_summary(self):
        from self_evolution.models import WasteAnalysis, ToolDuration

        wa = WasteAnalysis(
            slowest_tools=[
                ToolDuration(tool_name="bash", total_duration_ms=5000, call_count=5, avg_duration_ms=1000),
            ],
        )
        summary = wa.summary()
        assert "bash" in summary
        assert "1000" in summary

    def test_code_change_analysis_summary_empty(self):
        from self_evolution.models import CodeChangeAnalysis

        # No commits => fixed "no new commits" message.
        cca = CodeChangeAnalysis()
        assert cca.summary() == "代码更新: 无新提交"

    def test_code_change_analysis_summary_with_commits(self):
        from self_evolution.models import CodeChangeAnalysis, CommitInfo

        cca = CodeChangeAnalysis(
            commits=[
                CommitInfo(hash_short="abc1234", subject="fix: bug", insertions=10, deletions=5),
            ],
            total_commits=1,
            total_insertions=10,
            total_deletions=5,
            total_files_changed=2,
        )
        # Summary should include the short hash and a +N insertions marker.
        summary = cca.summary()
        assert "abc1234" in summary
        assert "+10" in summary
+
+
+# ============================================================================
+# 3. Database CRUD
+# ============================================================================
+
class TestDatabase:
    """Exercise the SQLite CRUD helpers against a throwaway database.

    The autouse ``_tmp_evolution_db`` fixture supplies the db module bound
    to a temp-directory database with the schema already initialized.
    """

    def test_init_db_creates_tables(self, _tmp_evolution_db):
        # After init_db(), every table the plugin relies on must exist.
        connection = _tmp_evolution_db.get_connection()
        table_names = {
            record["name"]
            for record in connection.execute(
                "SELECT name FROM sqlite_master WHERE type='table'"
            ).fetchall()
        }
        for required in (
            "tool_invocations",
            "session_scores",
            "evolution_proposals",
            "improvement_units",
            "strategy_versions",
        ):
            assert required in table_names
        connection.close()

    def test_insert_and_fetch(self, _tmp_evolution_db):
        score_row = {
            "session_id": "s-test",
            "composite_score": 0.85,
            "completion_rate": 1.0,
            "efficiency_score": 0.7,
            "cost_efficiency": 0.9,
            "satisfaction_proxy": 0.8,
            "task_category": "coding",
            "model": "test",
        }
        new_id = _tmp_evolution_db.insert("session_scores", score_row)
        assert new_id > 0

        fetched = _tmp_evolution_db.fetch_one(
            "session_scores", where="session_id = ?", params=("s-test",)
        )
        assert fetched is not None
        assert fetched["composite_score"] == 0.85

    def test_insert_many(self, _tmp_evolution_db):
        def make_row(index):
            return {
                "session_id": f"s-{index}",
                "composite_score": 0.5,
                "completion_rate": 1.0,
                "efficiency_score": 0.5,
                "cost_efficiency": 0.5,
                "satisfaction_proxy": 0.5,
                "task_category": "general",
                "model": "test",
            }

        _tmp_evolution_db.insert_many("session_scores", [make_row(i) for i in range(3)])
        assert len(_tmp_evolution_db.fetch_all("session_scores")) == 3

    def test_update(self, _tmp_evolution_db):
        original = {
            "session_id": "s-upd",
            "composite_score": 0.5,
            "completion_rate": 1.0,
            "efficiency_score": 0.5,
            "cost_efficiency": 0.5,
            "satisfaction_proxy": 0.5,
            "task_category": "general",
            "model": "test",
        }
        _tmp_evolution_db.insert("session_scores", original)
        _tmp_evolution_db.update(
            "session_scores",
            {"composite_score": 0.95},
            where="session_id = ?",
            where_params=("s-upd",),
        )
        updated = _tmp_evolution_db.fetch_one(
            "session_scores", where="session_id = ?", params=("s-upd",)
        )
        assert updated["composite_score"] == 0.95

    def test_fetch_all_with_order_and_limit(self, _tmp_evolution_db):
        # Five invocations with durations 0, 100, 200, 300, 400 ms.
        for idx in range(5):
            _tmp_evolution_db.insert("tool_invocations", {
                "session_id": f"s-{idx}",
                "tool_name": "bash",
                "duration_ms": idx * 100,
                "success": True,
                "turn_number": idx,
            })
        slowest = _tmp_evolution_db.fetch_all(
            "tool_invocations",
            where="tool_name = ?",
            params=("bash",),
            order_by="duration_ms DESC",
            limit=3,
        )
        assert len(slowest) == 3
        assert slowest[0]["duration_ms"] == 400  # descending order

    def test_query(self, _tmp_evolution_db):
        _tmp_evolution_db.insert("session_scores", {
            "session_id": "s-q",
            "composite_score": 0.7,
            "completion_rate": 1.0,
            "efficiency_score": 0.5,
            "cost_efficiency": 0.5,
            "satisfaction_proxy": 0.5,
            "task_category": "general",
            "model": "test",
        })
        counted = _tmp_evolution_db.query("SELECT COUNT(*) as cnt FROM session_scores")
        assert counted[0]["cnt"] == 1

    def test_cleanup(self, _tmp_evolution_db):
        # One stale row (31 days old) and one fresh row; cleanup(days=30)
        # must drop only the stale one.
        stale_ts = time.time() - 31 * 86400  # 31 days ago
        _tmp_evolution_db.insert("tool_invocations", {
            "session_id": "s-old",
            "tool_name": "bash",
            "duration_ms": 100,
            "success": True,
            "turn_number": 0,
            "created_at": stale_ts,
        })
        _tmp_evolution_db.insert("tool_invocations", {
            "session_id": "s-new",
            "tool_name": "bash",
            "duration_ms": 100,
            "success": True,
            "turn_number": 0,
        })
        _tmp_evolution_db.cleanup(days=30)
        survivors = _tmp_evolution_db.fetch_all("tool_invocations")
        assert len(survivors) == 1
        assert survivors[0]["session_id"] == "s-new"
+
+
+# ============================================================================
+# 4. Hooks — Telemetry + Signal Detection
+# ============================================================================
+
class TestHooks:
    """Test lifecycle hook functions.

    ``on_tool_call`` records per-invocation telemetry; ``on_session_end``
    computes a quality score and scans messages for outcome signals
    (corrections, frustration, budget exhaustion).
    """

    def test_on_tool_call_inserts_telemetry(self, _tmp_evolution_db):
        from self_evolution.hooks import on_tool_call

        on_tool_call(
            tool_name="bash",
            started_at=time.time(),
            duration_ms=500,
            success=True,
            session_id="s-hook-1",
            turn_number=3,
        )
        rows = _tmp_evolution_db.fetch_all("tool_invocations")
        assert len(rows) == 1
        assert rows[0]["tool_name"] == "bash"
        assert rows[0]["duration_ms"] == 500

    def test_on_tool_call_failure(self, _tmp_evolution_db):
        from self_evolution.hooks import on_tool_call

        on_tool_call(
            tool_name="write",
            success=False,
            error_type="PermissionError",
            session_id="s-hook-2",
        )
        rows = _tmp_evolution_db.fetch_all("tool_invocations")
        # SQLite stores booleans as integers, so accept either form.
        assert rows[0]["success"] is False or rows[0]["success"] == 0
        assert rows[0]["error_type"] == "PermissionError"

    def test_on_session_end_computes_score(self, _tmp_evolution_db):
        from self_evolution.hooks import on_session_end

        on_session_end(session_data={
            "session_id": "s-end-1",
            "completed": True,
            "iterations": 3,
            "tool_call_count": 3,
            "message_count": 2,
            "tool_names": ["bash"],
        })
        row = _tmp_evolution_db.fetch_one("session_scores", where="session_id = ?", params=("s-end-1",))
        assert row is not None
        assert row["composite_score"] > 0

    def test_on_session_end_no_session_id(self, _tmp_evolution_db):
        from self_evolution.hooks import on_session_end

        # Should not crash, should not insert anything
        on_session_end(session_data={})
        rows = _tmp_evolution_db.fetch_all("session_scores")
        assert len(rows) == 0

    def test_correction_signal_detected(self, _tmp_evolution_db):
        from self_evolution.hooks import on_session_end

        # A user message phrased as a correction ("not what I wanted")
        # should yield exactly one "correction" outcome signal.
        on_session_end(session_data={
            "session_id": "s-corr-1",
            "completed": True,
            "iterations": 5,
            "tool_call_count": 5,
            "message_count": 3,
            "messages": [
                {"role": "assistant", "content": "Done"},
                {"role": "user", "content": "不对,这不是我想要的"},
            ],
        })
        signals = _tmp_evolution_db.fetch_all(
            "outcome_signals",
            where="session_id = ? AND signal_type = ?",
            params=("s-corr-1", "correction"),
        )
        assert len(signals) == 1

    def test_frustration_signal_detected(self, _tmp_evolution_db):
        from self_evolution.hooks import on_session_end

        # A user message expressing impatience ("too slow, waste of time")
        # should yield a "frustration" outcome signal.
        on_session_end(session_data={
            "session_id": "s-frust-1",
            "completed": True,
            "iterations": 5,
            "tool_call_count": 5,
            "message_count": 3,
            "messages": [
                {"role": "assistant", "content": "Done"},
                {"role": "user", "content": "太慢了,浪费时间"},
            ],
        })
        signals = _tmp_evolution_db.fetch_all(
            "outcome_signals",
            where="session_id = ? AND signal_type = ?",
            params=("s-frust-1", "frustration"),
        )
        assert len(signals) == 1

    def test_budget_exhausted_signal(self, _tmp_evolution_db):
        from self_evolution.hooks import on_session_end

        # iterations == max_iterations with completed=False and not
        # interrupted => the agent ran out of budget.
        on_session_end(session_data={
            "session_id": "s-budget-1",
            "completed": False,
            "interrupted": False,
            "iterations": 20,
            "max_iterations": 20,
            "tool_call_count": 20,
            "message_count": 10,
        })
        signals = _tmp_evolution_db.fetch_all(
            "outcome_signals",
            where="session_id = ? AND signal_type = ?",
            params=("s-budget-1", "budget_exhausted"),
        )
        assert len(signals) == 1
+
+
+# ============================================================================
+# 5. Rule Engine — Strategy Matching
+# ============================================================================
+
class TestRuleEngine:
    """Test conditional strategy matching.

    A rule matches a context dict when ALL of its conditions hold (AND
    logic); a rule with no conditions always matches; disabled rules
    never match.
    """

    def _make_rule(self, strategy_type="hint", conditions=None, enabled=True):
        # Helper: build a minimal StrategyRule with the given conditions.
        from self_evolution.models import StrategyRule, StrategyCondition

        return StrategyRule(
            id="r1",
            name="Test Rule",
            strategy_type=strategy_type,
            description="desc",
            conditions=conditions or [],
            hint_text="test hint",
            enabled=enabled,
        )

    def test_always_match_no_conditions(self):
        from self_evolution.rule_engine import StrategyRuleEngine

        engine = StrategyRuleEngine()
        rule = self._make_rule()
        matched = engine.match_strategies([rule], {})
        assert len(matched) == 1

    def test_disabled_rule_not_matched(self):
        from self_evolution.rule_engine import StrategyRuleEngine

        engine = StrategyRuleEngine()
        rule = self._make_rule(enabled=False)
        matched = engine.match_strategies([rule], {})
        assert len(matched) == 0

    def test_equals_operator(self):
        from self_evolution.rule_engine import StrategyRuleEngine
        from self_evolution.models import StrategyCondition

        engine = StrategyRuleEngine()
        rule = self._make_rule(conditions=[
            StrategyCondition(field="tool_name", operator="equals", pattern="bash"),
        ])
        assert len(engine.match_strategies([rule], {"tool_name": "bash"})) == 1
        assert len(engine.match_strategies([rule], {"tool_name": "read"})) == 0

    def test_contains_operator(self):
        from self_evolution.rule_engine import StrategyRuleEngine
        from self_evolution.models import StrategyCondition

        engine = StrategyRuleEngine()
        rule = self._make_rule(conditions=[
            StrategyCondition(field="task_type", operator="contains", pattern="debug"),
        ])
        assert len(engine.match_strategies([rule], {"task_type": "debug python code"})) == 1
        assert len(engine.match_strategies([rule], {"task_type": "write tests"})) == 0

    def test_regex_match_operator(self):
        from self_evolution.rule_engine import StrategyRuleEngine
        from self_evolution.models import StrategyCondition

        engine = StrategyRuleEngine()
        rule = self._make_rule(conditions=[
            StrategyCondition(field="platform", operator="regex_match", pattern="feishu|slack"),
        ])
        assert len(engine.match_strategies([rule], {"platform": "feishu"})) == 1
        assert len(engine.match_strategies([rule], {"platform": "discord"})) == 0

    def test_not_contains_operator(self):
        from self_evolution.rule_engine import StrategyRuleEngine
        from self_evolution.models import StrategyCondition

        engine = StrategyRuleEngine()
        rule = self._make_rule(conditions=[
            StrategyCondition(field="model", operator="not_contains", pattern="mini"),
        ])
        assert len(engine.match_strategies([rule], {"model": "gpt-4"})) == 1
        assert len(engine.match_strategies([rule], {"model": "gpt-4-mini"})) == 0

    def test_starts_with_operator(self):
        from self_evolution.rule_engine import StrategyRuleEngine
        from self_evolution.models import StrategyCondition

        engine = StrategyRuleEngine()
        rule = self._make_rule(conditions=[
            StrategyCondition(field="platform", operator="starts_with", pattern="feishu"),
        ])
        assert len(engine.match_strategies([rule], {"platform": "feishu_web"})) == 1
        assert len(engine.match_strategies([rule], {"platform": "web_feishu"})) == 0

    def test_and_logic_all_conditions_must_match(self):
        from self_evolution.rule_engine import StrategyRuleEngine
        from self_evolution.models import StrategyCondition

        engine = StrategyRuleEngine()
        rule = self._make_rule(conditions=[
            StrategyCondition(field="platform", operator="equals", pattern="feishu"),
            StrategyCondition(field="task_type", operator="contains", pattern="code"),
        ])
        # Both match
        assert len(engine.match_strategies([rule], {"platform": "feishu", "task_type": "code review"})) == 1
        # Only one matches
        assert len(engine.match_strategies([rule], {"platform": "feishu", "task_type": "chat"})) == 0

    def test_format_hints(self):
        from self_evolution.rule_engine import StrategyRuleEngine

        # Matched rules are rendered into a tagged hint block that is
        # injected into the agent's prompt.
        engine = StrategyRuleEngine()
        rule = self._make_rule(strategy_type="avoid", conditions=[])
        hint = engine.format_hints([rule])
        assert "[自我进化策略提示]" in hint
        assert "Test Rule" in hint
+
+
+# ============================================================================
+# 6. Strategy Store
+# ============================================================================
+
class TestStrategyStore:
    """Verify file-backed strategy persistence and version archiving.

    Each test redirects the store's module-level path constants into
    ``tmp_path`` via monkeypatch before touching the filesystem.
    """

    def test_load_empty(self, tmp_path, monkeypatch):
        from self_evolution.strategy_store import StrategyStore

        monkeypatch.setattr(
            "self_evolution.strategy_store.STRATEGIES_FILE",
            tmp_path / "strategies.json",
        )
        monkeypatch.setattr(
            "self_evolution.strategy_store.ARCHIVE_DIR",
            tmp_path / "archive",
        )
        # With no file on disk, load() yields the empty default document.
        empty = StrategyStore().load()
        assert empty["version"] == 0
        assert empty["rules"] == []

    def test_save_and_load_roundtrip(self, tmp_path, monkeypatch):
        from self_evolution.strategy_store import StrategyStore

        strategies_file = tmp_path / "strategies.json"
        archive_dir = tmp_path / "archive"
        # Patch the constants in both the paths module and the store module.
        for module in ("self_evolution.paths", "self_evolution.strategy_store"):
            monkeypatch.setattr(f"{module}.STRATEGIES_FILE", strategies_file)
            monkeypatch.setattr(f"{module}.ARCHIVE_DIR", archive_dir)

        store = StrategyStore()
        store.save({"version": 1, "rules": [{"id": "r1", "name": "Rule 1"}]})

        reloaded = store.load()
        assert reloaded["version"] == 1
        assert len(reloaded["rules"]) == 1

    def test_archive_and_restore(self, tmp_path, monkeypatch):
        from self_evolution.strategy_store import StrategyStore

        strategies_file = tmp_path / "strategies.json"
        archive_dir = tmp_path / "archive"
        for module in ("self_evolution.paths", "self_evolution.strategy_store"):
            monkeypatch.setattr(f"{module}.STRATEGIES_FILE", strategies_file)
            monkeypatch.setattr(f"{module}.ARCHIVE_DIR", archive_dir)

        store = StrategyStore()
        store.save({"version": 1, "rules": [{"id": "r1"}]})
        store.archive(1)

        # Overwrite the live file with v2; the v1 archive must survive.
        store.save({"version": 2, "rules": [{"id": "r2"}]})

        archived_v1 = store.load_archive(1)
        assert archived_v1["version"] == 1
        assert archived_v1["rules"][0]["id"] == "r1"

    def test_load_nonexistent_archive(self, tmp_path, monkeypatch):
        from self_evolution.strategy_store import StrategyStore

        archive_dir = tmp_path / "archive"
        monkeypatch.setattr("self_evolution.paths.ARCHIVE_DIR", archive_dir)
        monkeypatch.setattr("self_evolution.strategy_store.ARCHIVE_DIR", archive_dir)
        # Missing archive versions resolve to None, never an exception.
        assert StrategyStore().load_archive(999) is None
+
+
+# ============================================================================
+# 7. Evolution Proposer
+# ============================================================================
+
class TestEvolutionProposer:
    """Test proposal generation from reflection reports.

    ``generate_proposals`` turns a ReflectionReport's worst/best patterns
    and recommendations into typed Proposal objects, deduplicates by title,
    and caps the output at five proposals.
    """

    def _make_report(self, worst=None, best=None, recs=None, sessions=10):
        # Helper: build a ReflectionReport with sensible defaults.
        from self_evolution.models import ReflectionReport

        return ReflectionReport(
            period_start=1000.0,
            period_end=2000.0,
            sessions_analyzed=sessions,
            worst_patterns=worst or ["bash timeout frequently"],
            best_patterns=best or ["single-turn code generation works well"],
            recommendations=recs or ["创建新的工具偏好来优化bash使用"],
        )

    def test_generates_proposals_from_report(self):
        from self_evolution.evolution_proposer import generate_proposals

        report = self._make_report()
        proposals = generate_proposals(report, report_id=1)
        assert len(proposals) > 0

    def test_error_pattern_creates_code_improvement_proposal(self):
        from self_evolution.evolution_proposer import generate_proposals

        report = self._make_report(worst=["tool failure pattern"])
        proposals = generate_proposals(report, report_id=1)
        code_proposals = [p for p in proposals if p.proposal_type == "code_improvement"]
        assert len(code_proposals) > 0
        # Verify structured description (contains the "problem description"
        # and "suggested direction" section headers).
        desc = code_proposals[0].description
        assert "问题描述" in desc
        assert "建议方向" in desc

    def test_success_pattern_creates_skill_proposal(self):
        from self_evolution.evolution_proposer import generate_proposals

        # Report with enough sessions to pass the ≥5 threshold
        report = self._make_report(
            best=["efficient workflow discovered"],
            sessions=10,
        )
        proposals = generate_proposals(report, report_id=1)
        skill_proposals = [p for p in proposals if p.proposal_type == "skill"]
        assert len(skill_proposals) > 0

    def test_success_pattern_skipped_below_threshold(self):
        """Skill proposals should not be generated from best_patterns with <5 sessions."""
        from self_evolution.evolution_proposer import generate_proposals

        report = self._make_report(
            best=["efficient workflow discovered"],
            recs=[],  # No recommendations that might create skill proposals
            sessions=2,  # Below threshold
        )
        proposals = generate_proposals(report, report_id=1)
        # Only count skill proposals whose id marks them as derived from a
        # best_pattern ("prop-success-" prefix).
        skill_from_best = [
            p for p in proposals
            if p.proposal_type == "skill" and p.id.startswith("prop-success-")
        ]
        assert len(skill_from_best) == 0

    def test_recommendation_type_detection(self):
        from self_evolution.evolution_proposer import generate_proposals

        # A recommendation mentioning memory updates should be classified
        # as a "memory" proposal.
        report = self._make_report(recs=["更新记忆来记住这个发现"])
        proposals = generate_proposals(report, report_id=1)
        memory_proposals = [p for p in proposals if p.proposal_type == "memory"]
        assert len(memory_proposals) > 0

    def test_deduplication(self):
        from self_evolution.evolution_proposer import generate_proposals

        report = self._make_report(
            worst=["same pattern", "same pattern"],  # duplicate
        )
        proposals = generate_proposals(report, report_id=1)
        titles = [p.title for p in proposals]
        assert len(titles) == len(set(titles)), "Should deduplicate similar titles"

    def test_max_five_proposals(self):
        from self_evolution.evolution_proposer import generate_proposals

        # 30 candidate inputs must still be capped at 5 proposals.
        report = self._make_report(
            worst=[f"pattern {i}" for i in range(10)],
            best=[f"best {i}" for i in range(10)],
            recs=[f"rec {i}" for i in range(10)],
        )
        proposals = generate_proposals(report, report_id=1)
        assert len(proposals) <= 5
+
+
+# ============================================================================
+# 8. Evolution Executor
+# ============================================================================
+
class TestEvolutionExecutor:
    """Test execution of approved proposals.

    Each test redirects the executor's filesystem targets into ``tmp_path``
    and then verifies the on-disk side effects of ``execute()`` for one
    proposal type (strategy / skill / memory / tool_preference).
    """

    def test_execute_strategy_proposal(self, _tmp_evolution_db, tmp_path, monkeypatch):
        from self_evolution.evolution_executor import EvolutionExecutor
        from self_evolution.models import Proposal

        # Redirect strategy storage for both the executor and the store
        # modules (each binds the path constants at import time).
        for module in ("self_evolution.evolution_executor", "self_evolution.strategy_store"):
            monkeypatch.setattr(f"{module}.STRATEGIES_DIR", tmp_path)
            monkeypatch.setattr(f"{module}.STRATEGIES_FILE", tmp_path / "strategies.json")
            monkeypatch.setattr(f"{module}.ARCHIVE_DIR", tmp_path / "archive")

        proposal = Proposal(
            id="prop-exec-1",
            proposal_type="strategy",
            title="Test Strategy",
            description="Avoid large file reads",
            status="approved",
        )
        executor = EvolutionExecutor()
        executor.execute(proposal)

        # The proposal was never inserted into the DB, so the observable
        # effect is the strategy file: it must gain a rule derived from
        # the proposal and a bumped version.
        from self_evolution.strategy_store import StrategyStore
        store = StrategyStore()
        data = store.load()
        assert data["version"] >= 1
        assert any(r["id"] == "prop-exec-1" for r in data["rules"])

    def test_execute_skill_proposal(self, _tmp_evolution_db, tmp_path, monkeypatch):
        from self_evolution.evolution_executor import EvolutionExecutor
        from self_evolution.models import Proposal

        # The executor derives the learned-skills directory from Path.home().
        monkeypatch.setattr("pathlib.Path.home", lambda: tmp_path)

        proposal = Proposal(
            id="prop-skill-1",
            proposal_type="skill",
            title="Test Skill",
            description="A learned skill for testing",
            status="approved",
        )
        executor = EvolutionExecutor()
        executor.execute(proposal)

        # A SKILL.md stub named after the proposal id must be written.
        skill_file = tmp_path / ".hermes" / "skills" / "learned" / "prop-skill-1" / "SKILL.md"
        assert skill_file.exists()
        content = skill_file.read_text()
        assert "Test Skill" in content

    def test_execute_memory_proposal(self, _tmp_evolution_db, tmp_path, monkeypatch):
        from self_evolution.evolution_executor import EvolutionExecutor
        from self_evolution.models import Proposal

        memories_dir = tmp_path / ".hermes" / "memories"
        monkeypatch.setattr("pathlib.Path.home", lambda: tmp_path)

        proposal = Proposal(
            id="prop-mem-1",
            proposal_type="memory",
            title="Remember Pattern",
            description="Always use context managers for file operations",
            status="approved",
        )
        executor = EvolutionExecutor()
        executor.execute(proposal)

        # Memory proposals are appended to PERFORMANCE.md.
        perf_file = memories_dir / "PERFORMANCE.md"
        assert perf_file.exists()
        content = perf_file.read_text()
        assert "context managers" in content

    def test_execute_tool_preference_proposal(self, _tmp_evolution_db, tmp_path, monkeypatch):
        from self_evolution.evolution_executor import EvolutionExecutor
        from self_evolution.models import Proposal

        evo_dir = tmp_path / "self_evolution"
        evo_dir.mkdir(parents=True, exist_ok=True)
        monkeypatch.setattr("self_evolution.paths.DATA_DIR", evo_dir)
        monkeypatch.setattr("self_evolution.evolution_executor.STRATEGIES_DIR", evo_dir)

        proposal = Proposal(
            id="prop-tool-1",
            proposal_type="tool_preference",
            title="Prefer grep over find",
            description="Use grep instead of find for searching",
            expected_impact="faster searches",
            status="approved",
        )
        executor = EvolutionExecutor()
        executor.execute(proposal)

        # Tool preferences land in a JSON file keyed by proposal id.
        prefs_file = evo_dir / "tool_preferences.json"
        assert prefs_file.exists()
        prefs = json.loads(prefs_file.read_text())
        assert "prop-tool-1" in prefs
+
+
+# ============================================================================
+# 9. Reflection Engine — Parsing
+# ============================================================================
+
class TestReflectionEngine:
    """Test reflection report parsing from model output."""

    def _make_engine(self):
        # Empty base_url/model keeps the engine from reaching a real LLM.
        from self_evolution.reflection_engine import DreamEngine
        return DreamEngine(config={"base_url": "", "model": ""})

    @staticmethod
    def _stub_analysis():
        # MagicMock with a stub summary(), matching what these tests feed
        # to _parse_reflection for both error and waste analyses.
        return MagicMock(summary=lambda: "")

    def _parse(self, text, start=1000.0, end=2000.0, n_sessions=1, avg=0.5):
        """Run _parse_reflection with stubbed analyses and the given metadata."""
        return self._make_engine()._parse_reflection(
            text, start, end, n_sessions, avg,
            error_analysis=self._stub_analysis(),
            waste_analysis=self._stub_analysis(),
        )

    def test_parse_valid_json(self):
        payload = json.dumps({
            "worst_patterns": ["bash timeouts", "repeated reads"],
            "best_patterns": ["single-turn success"],
            "recommendations": ["add retry logic"],
            "tool_insights": {"bash": {"sr": 0.9}},
        })
        report = self._parse(payload, n_sessions=5, avg=0.75)
        assert len(report.worst_patterns) == 2
        assert len(report.best_patterns) == 1
        assert len(report.recommendations) == 1

    def test_parse_json_in_markdown_wrapper(self):
        wrapped = '```json\n{"worst_patterns": ["p1"], "best_patterns": [], "recommendations": []}\n```'
        report = self._parse(wrapped)
        assert report.worst_patterns == ["p1"]

    def test_parse_text_sections(self):
        freeform = """Here is my analysis:

worst patterns:
- Too many retries
- Slow file operations

best patterns:
- Direct code generation

recommendations:
- Cache tool results
- Optimize file reads
"""
        report = self._parse(freeform)
        assert len(report.worst_patterns) >= 1
        assert len(report.best_patterns) >= 1
        assert len(report.recommendations) >= 1

    def test_parse_numbered_list(self):
        numbered = """分析结果:

worst patterns:
1) Bash command timeouts
2) Repeated tool calls

recommendations:
1) Add timeout handling
"""
        report = self._parse(numbered)
        assert len(report.worst_patterns) >= 1

    def test_parse_empty_text(self):
        report = self._parse("", n_sessions=0, avg=0.0)
        assert report.worst_patterns == []
        assert report.best_patterns == []
        assert report.recommendations == []
+
+
+# ============================================================================
+# 10. Integration — End-to-End Flow
+# ============================================================================
+
class TestEndToEndFlow:
    """Test the full self-evolution cycle with mocked LLM calls."""

    def test_full_cycle_no_model(self, _tmp_evolution_db, tmp_path, monkeypatch):
        """Simulate the full cycle: hooks → data → analysis (without LLM call).

        Fix: the original re-fetched ``tool_invocations`` and
        ``session_scores`` a second time before the analysis step, shadowing
        identical locals — the duplicate DB round-trips are removed.
        """
        from self_evolution.hooks import on_tool_call, on_session_end
        from self_evolution.reflection_engine import DreamEngine

        # 1. Simulate tool calls: four successes, then one timeout failure.
        for i in range(5):
            on_tool_call(
                tool_name="bash",
                duration_ms=200 + i * 100,
                success=(i < 4),  # last one fails
                error_type="timeout" if i == 4 else None,
                session_id="s-e2e-1",
                turn_number=i,
            )

        # 2. Simulate session end
        on_session_end(session_data={
            "session_id": "s-e2e-1",
            "completed": True,
            "iterations": 5,
            "tool_call_count": 5,
            "message_count": 2,
            "tool_names": ["bash"],
            "model": "test",
        })

        # 3. Verify data was collected; the same rows feed the analyses below.
        invocations = _tmp_evolution_db.fetch_all("tool_invocations")
        assert len(invocations) == 5

        scores = _tmp_evolution_db.fetch_all("session_scores")
        assert len(scores) == 1

        signals = _tmp_evolution_db.fetch_all("outcome_signals")

        # 4. Run error analysis directly (no LLM)
        engine = DreamEngine(config={"base_url": "", "model": ""})
        error_analysis = engine._analyze_errors(scores, invocations, signals)
        assert len(error_analysis.tool_failures) == 1
        assert error_analysis.tool_failures[0].tool_name == "bash"
        assert error_analysis.tool_failures[0].count == 1

        # 5. Time waste analysis
        waste_analysis = engine._analyze_time_waste(scores, invocations)
        assert len(waste_analysis.slowest_tools) > 0

    def test_reflection_prompt_builds(self, _tmp_evolution_db):
        """Verify the reflection prompt is well-formed."""
        from self_evolution.reflection_engine import DreamEngine

        engine = DreamEngine(config={"base_url": "", "model": ""})

        # Insert mock data for one scored session with a single tool call.
        _tmp_evolution_db.insert("session_scores", {
            "session_id": "s1", "composite_score": 0.8,
            "completion_rate": 1.0, "efficiency_score": 0.7,
            "cost_efficiency": 0.9, "satisfaction_proxy": 0.8,
            "task_category": "coding", "model": "test",
        })
        _tmp_evolution_db.insert("tool_invocations", {
            "session_id": "s1", "tool_name": "bash",
            "duration_ms": 500, "success": True, "turn_number": 1,
        })

        scores = _tmp_evolution_db.fetch_all("session_scores")
        invocations = _tmp_evolution_db.fetch_all("tool_invocations")
        signals = _tmp_evolution_db.fetch_all("outcome_signals")

        error_analysis = engine._analyze_errors(scores, invocations, signals)
        waste_analysis = engine._analyze_time_waste(scores, invocations)

        prompt = engine._build_reflection_prompt(
            scores, invocations, signals,
            error_analysis, waste_analysis, avg_score=0.8,
        )
        # Prompt should contain the overview section (Chinese or English
        # heading) and the formatted average score.
        assert "概况" in prompt or "sessions" in prompt
        assert "0.800" in prompt
+
+
+# ============================================================================
+# 11. Security — SQL Injection Prevention
+# ============================================================================
+
class TestSecurity:
    """Test security hardening measures."""

    def test_sql_injection_rejected_invalid_table(self, _tmp_evolution_db):
        """Table names not in the whitelist must raise ValueError."""
        with pytest.raises(ValueError, match="Invalid table name"):
            _tmp_evolution_db.insert("users; DROP TABLE users--", {"id": 1})

    def test_sql_injection_rejected_in_fetch(self, _tmp_evolution_db):
        """fetch_one must validate the table name too."""
        with pytest.raises(ValueError, match="Invalid table name"):
            _tmp_evolution_db.fetch_one("nonexistent_table")

    def test_sql_injection_rejected_in_update(self, _tmp_evolution_db):
        """update must reject unknown table names."""
        with pytest.raises(ValueError, match="Invalid table name"):
            _tmp_evolution_db.update(
                "evil_table", {"x": 1}, where="1=1",
            )

    def test_sql_injection_rejected_in_insert_many(self, _tmp_evolution_db):
        """insert_many must reject unknown table names."""
        with pytest.raises(ValueError, match="Invalid table name"):
            _tmp_evolution_db.insert_many("bad_table", [{"x": 1}])

    def test_sql_injection_rejected_in_fetch_all(self, _tmp_evolution_db):
        """fetch_all must reject unknown table names."""
        with pytest.raises(ValueError, match="Invalid table name"):
            _tmp_evolution_db.fetch_all("no_such_table")

    def test_limit_coerced_to_int(self, _tmp_evolution_db):
        """Non-integer limit values should be safely coerced.

        Fix: the original passed ``limit=1`` — already an int — so the
        int() coercion path this test is named for was never exercised.
        Pass a numeric string instead, as the inline comment intended.
        """
        _tmp_evolution_db.insert("tool_invocations", {
            "session_id": "s1", "tool_name": "bash",
            "duration_ms": 100, "success": True, "turn_number": 0,
        })
        # A string limit must be coerced via int(), not interpolated raw.
        rows = _tmp_evolution_db.fetch_all(
            "tool_invocations", limit="1",
        )
        assert len(rows) == 1

    def test_valid_tables_still_work(self, _tmp_evolution_db):
        """All legitimate tables should pass validation."""
        _tmp_evolution_db.insert("tool_invocations", {
            "session_id": "s-ok", "tool_name": "bash",
            "duration_ms": 100, "success": True, "turn_number": 0,
        })
        _tmp_evolution_db.insert("outcome_signals", {
            "session_id": "s-ok", "signal_type": "test",
            "signal_value": 1.0,
        })
        rows = _tmp_evolution_db.fetch_all("tool_invocations")
        assert len(rows) == 1