feat: add self-evolution plugin — agent self-optimization system

Add a comprehensive self-evolution system that enables Hermes Agent
to continuously improve through automated analysis and optimization:

Core components:
- reflection_engine: Nightly session analysis (1:00 AM)
- evolution_proposer: Generate improvement proposals from insights
- quality_scorer: Multi-dimensional session quality evaluation
- strategy_injector: Inject learned strategies into new sessions
- strategy_compressor: Strategy optimization and deduplication
- git_analyzer: Code change pattern analysis
- rule_engine: Pattern-based rule generation
- feishu_notifier: Feishu card notifications for evolution events

Storage:
- db.py: SQLite telemetry storage
- strategy_store: Persistent strategy storage
- models.py: Data models

Plugin integration:
- plugin.yaml, hooks.py, __init__.py for plugin system
- cron_jobs.py for scheduled tasks

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
玉冰 2026-04-25 00:40:13 +08:00
parent e5d41f05d4
commit 3cd384dc43
23 changed files with 6173 additions and 0 deletions

View file

@ -0,0 +1,751 @@
"""
Self Evolution Plugin Dream Engine (Reflection Engine)
=========================================================
Runs nightly at 1:00 to analyze the previous day's sessions.
Design reference: Claude Code plugins/hookify/agents/conversation-analyzer.md
- Analyzes conversations in reverse chronological order
- Detects: corrections, frustrations, repeated issues, reversions
- Extracts tool usage patterns, converts to actionable rules
- Categorizes by severity
We extend this pattern with:
- Full automated analysis (not just on user request)
- Error analysis (tool failures, retries, API errors)
- Time waste analysis (slow tools, repeated ops, inefficient sessions)
"""
from __future__ import annotations
import json
import logging
import os
import re
import time
from pathlib import Path
from typing import Any, Dict, List, Optional
from self_evolution import db
from self_evolution.model_config import resolve_config, get_active_text_config, switch_to_fallback
from self_evolution.git_analyzer import analyze_code_changes
from self_evolution.models import (
ErrorAnalysis, ToolFailure, RetryPattern,
WasteAnalysis, ToolDuration, RepeatedOperation,
CodeChangeAnalysis, CommitInfo,
ReflectionReport,
)
logger = logging.getLogger(__name__)
# ── Backward-compatible aliases ────────────────────────────────────────────
# These are used by cron_jobs.py and other callers. They re-export the
# model_config helpers under the legacy names this module historically
# exposed; do not remove without updating those call sites.
_resolve_runtime_config = resolve_config
_get_active_text_config = get_active_text_config
_switch_to_fallback = switch_to_fallback
class DreamEngine:
    """Nightly dream consolidation engine.

    Analyzes the previous day's sessions to find:
    1. Error patterns (tool failures, retries, incomplete tasks)
    2. Time waste patterns (slow tools, repeated operations, inefficient flows)
    3. Success patterns (what worked well)
    4. Generates actionable evolution proposals
    """
    def __init__(self, config: Optional[dict] = None):
        """Initialize the engine.

        Args:
            config: Runtime model configuration dict. When None, it is
                resolved via ``_resolve_runtime_config()`` (alias of
                ``model_config.resolve_config``).
        """
        self.config = config or _resolve_runtime_config()
        # NOTE(review): _model_client is never read in this module's visible
        # code — presumably reserved for a cached client; confirm before removal.
        self._model_client = None
        # Last prompt passed to run()'s model call; read back by
        # _call_chat_completions when building the request body.
        self._current_prompt = ""
def run(self, hours: int = 24, max_runtime_seconds: int = 0) -> Optional[ReflectionReport]:
"""Main dream consolidation flow.
Args:
hours: Analyze data from the last N hours.
max_runtime_seconds: Hard timeout in seconds. 0 = no limit.
If exceeded, stops at the next step boundary and returns None.
"""
logger.info("Dream engine starting — analyzing last %d hours", hours)
deadline = time.time() + max_runtime_seconds if max_runtime_seconds > 0 else 0
now = time.time()
cutoff = now - (hours * 3600)
try:
# 1. Load session data
scores = db.fetch_all(
"session_scores",
where="created_at >= ?",
params=(cutoff,),
order_by="created_at DESC",
)
tool_invocations = db.fetch_all(
"tool_invocations",
where="created_at >= ?",
params=(cutoff,),
order_by="created_at DESC",
)
signals = db.fetch_all(
"outcome_signals",
where="created_at >= ?",
params=(cutoff,),
)
if not scores:
logger.info("No sessions to analyze")
return None
# 2. Error analysis
if deadline and time.time() > deadline:
logger.warning("Dream engine timed out before error analysis")
return None
error_analysis = self._analyze_errors(scores, tool_invocations, signals)
logger.info("Error analysis: %s", error_analysis.summary())
# 3. Time waste analysis
if deadline and time.time() > deadline:
logger.warning("Dream engine timed out before waste analysis")
return None
waste_analysis = self._analyze_time_waste(scores, tool_invocations)
logger.info("Waste analysis: %s", waste_analysis.summary())
# 3.5. Code change analysis
if deadline and time.time() > deadline:
logger.warning("Dream engine timed out before code analysis")
return None
code_analysis = analyze_code_changes(hours=hours)
logger.info("Code change analysis: %d commits found", code_analysis.total_commits)
# 4. Compute average score
avg_score = (
sum(s.get("composite_score", 0) for s in scores) / len(scores)
if scores else 0
)
# 5. Build reflection prompt
if deadline and time.time() > deadline:
logger.warning("Dream engine timed out before model call")
return None
prompt = self._build_reflection_prompt(
scores, tool_invocations, signals,
error_analysis, waste_analysis, avg_score,
code_analysis=code_analysis,
)
# 6. Call model for deep reflection
reflection_text = self._call_model(prompt)
if not reflection_text:
logger.warning("Model returned empty reflection")
return None
# 7. Parse reflection report
report = self._parse_reflection(
reflection_text=reflection_text,
period_start=cutoff,
period_end=now,
sessions_analyzed=len(scores),
avg_score=avg_score,
error_analysis=error_analysis,
waste_analysis=waste_analysis,
code_analysis=code_analysis,
)
# 8. Store report
report_id = db.insert("reflection_reports", report.to_db_row())
logger.info("Reflection report saved: id=%d, avg_score=%.3f", report_id, avg_score)
# 9. Generate evolution proposals
from self_evolution.evolution_proposer import generate_proposals
proposals = generate_proposals(report, report_id)
for p in proposals:
db.insert("evolution_proposals", p.to_db_row())
logger.info("Generated %d evolution proposals", len(proposals))
# 10. Compress existing strategies
try:
from self_evolution.strategy_compressor import compress_strategies
from self_evolution.strategy_store import StrategyStore
store = StrategyStore()
data = store.load()
rules = data.get("rules", [])
compressed = compress_strategies(rules)
if len(compressed) < len(rules):
data["rules"] = compressed
store.save(data)
logger.info("Strategies compressed: %d%d", len(rules), len(compressed))
except Exception as exc:
logger.warning("Strategy compression failed: %s", exc)
# 11. Cleanup old data
db.cleanup(days=30)
return report
except Exception as exc:
logger.exception("Dream engine failed: %s", exc)
return None
# ── Error Analysis ────────────────────────────────────────────────────
def _analyze_errors(
self,
scores: List[dict],
invocations: List[dict],
signals: List[dict],
) -> ErrorAnalysis:
"""Analyze all errors in the period.
Inspired by Claude Code conversation-analyzer's signal detection.
"""
# Tool failures
failures = {}
for inv in invocations:
if not inv.get("success", True):
tool = inv.get("tool_name", "unknown")
error_type = inv.get("error_type", "unknown")
key = f"{tool}:{error_type}"
if key not in failures:
failures[key] = ToolFailure(
tool_name=tool,
error_type=error_type,
count=0,
sessions_affected=[],
example_session=inv.get("session_id", ""),
)
failures[key].count += 1
sid = inv.get("session_id", "")
if sid and sid not in failures[key].sessions_affected:
failures[key].sessions_affected.append(sid)
# Retry patterns (same tool called > 2 times in same session)
retries = self._detect_retry_patterns(invocations)
# Incomplete sessions
incomplete = [
s.get("session_id", "") for s in scores
if s.get("completion_rate", 1.0) < 0.5
]
# User corrections from signals
corrections = [s for s in signals if s.get("signal_type") == "correction"]
frustration = [s for s in signals if s.get("signal_type") == "frustration"]
api_errors = [s for s in signals if s.get("signal_type") == "api_error"]
# API error type distribution
api_error_types: Dict[str, int] = {}
for s in api_errors:
meta = json.loads(s.get("metadata", "{}"))
etype = meta.get("error_type", "unknown")
api_error_types[etype] = api_error_types.get(etype, 0) + 1
return ErrorAnalysis(
tool_failures=sorted(failures.values(), key=lambda x: x.count, reverse=True),
retry_patterns=retries,
incomplete_sessions=incomplete,
user_corrections=len(corrections),
correction_examples=[s.get("metadata", "") for s in corrections[:3]],
api_error_count=len(api_errors),
api_error_types=api_error_types,
)
def _detect_retry_patterns(self, invocations: List[dict]) -> List[RetryPattern]:
"""Detect tools called > 2 times in same session."""
session_tools: Dict[str, Dict[str, int]] = {}
for inv in invocations:
sid = inv.get("session_id", "")
tool = inv.get("tool_name", "")
if sid not in session_tools:
session_tools[sid] = {}
session_tools[sid][tool] = session_tools[sid].get(tool, 0) + 1
patterns = []
for sid, tools in session_tools.items():
for tool, count in tools.items():
if count > 2:
patterns.append(RetryPattern(
session_id=sid,
tool_name=tool,
attempt_count=count,
final_outcome="unknown",
))
return sorted(patterns, key=lambda x: x.attempt_count, reverse=True)[:20]
# ── Time Waste Analysis ───────────────────────────────────────────────
    def _analyze_time_waste(
        self,
        scores: List[dict],
        invocations: List[dict],
    ) -> WasteAnalysis:
        """Analyze time waste patterns.

        Produces: the 10 slowest tools by average duration, the 10 most
        repeated (tool, session) operations, and sessions whose efficiency
        score fell below 0.3. ``shortcut_opportunities`` is always empty here.

        Args:
            scores: Rows from the ``session_scores`` table.
            invocations: Rows from the ``tool_invocations`` table.
        """
        # Slowest tools — collect per-tool duration samples, skipping
        # invocations with no (or zero) recorded duration
        tool_durations: Dict[str, List[int]] = {}
        for inv in invocations:
            tool = inv.get("tool_name", "")
            duration = inv.get("duration_ms", 0)
            if not duration:
                continue
            if tool not in tool_durations:
                tool_durations[tool] = []
            tool_durations[tool].append(duration)
        slowest = [
            ToolDuration(
                tool_name=tool,
                total_duration_ms=sum(durs),
                call_count=len(durs),
                avg_duration_ms=sum(durs) / len(durs),
            )
            for tool, durs in tool_durations.items()
        ]
        slowest.sort(key=lambda x: x.avg_duration_ms, reverse=True)
        # Repeated operations (same tool + same session > 3 times)
        session_tool_calls: Dict[str, Dict[str, int]] = {}
        for inv in invocations:
            sid = inv.get("session_id", "")
            tool = inv.get("tool_name", "")
            if sid not in session_tool_calls:
                session_tool_calls[sid] = {}
            session_tool_calls[sid][tool] = session_tool_calls[sid].get(tool, 0) + 1
        repeated = []
        for sid, tools in session_tool_calls.items():
            for tool, count in tools.items():
                if count > 3:
                    # NOTE(review): wasted_ms estimates using only the FIRST
                    # recorded duration sample for the tool, times (count - 2)
                    # "excess" calls — a rough heuristic; an average would be
                    # more representative. Confirm intent before changing.
                    repeated.append(RepeatedOperation(
                        description=f"{tool} called {count} times",
                        count=count,
                        sessions=[sid],
                        wasted_ms=tool_durations.get(tool, [0])[0] * (count - 2) if tool in tool_durations else 0,
                    ))
        # Inefficient sessions (low efficiency score)
        inefficient = [
            s.get("session_id", "") for s in scores
            if s.get("efficiency_score", 1.0) < 0.3
        ]
        return WasteAnalysis(
            slowest_tools=slowest[:10],
            repeated_operations=sorted(repeated, key=lambda x: x.count, reverse=True)[:10],
            inefficient_sessions=inefficient,
            shortcut_opportunities=[],
        )
# ── Reflection Prompt ─────────────────────────────────────────────────
    def _build_reflection_prompt(
        self,
        scores: List[dict],
        invocations: List[dict],
        signals: List[dict],
        errors: ErrorAnalysis,
        waste: WasteAnalysis,
        avg_score: float,
        code_analysis: Optional[CodeChangeAnalysis] = None,
    ) -> str:
        """Build the reflection prompt as structured JSON data.

        All analysis results are serialized as JSON so the model receives
        lossless data instead of pre-summarized text.

        Args:
            scores: Session score rows for the period.
            invocations: Tool invocation rows for the period.
            signals: Outcome signal rows for the period.
            errors: Result of :meth:`_analyze_errors`.
            waste: Result of :meth:`_analyze_time_waste`.
            avg_score: Mean composite score over *scores*.
            code_analysis: Optional git change analysis; section 6 is
                omitted when None or when it has no commits.

        Returns:
            The prompt string, with template placeholders substituted.
        """
        # Load user prompt template (short: just overview + data placeholder);
        # falls back to the in-module default when prompts/reflection.md is absent
        template_path = Path(__file__).parent / "prompts" / "reflection.md"
        if template_path.exists():
            template = template_path.read_text(encoding="utf-8")
        else:
            template = _DEFAULT_REFLECTION_PROMPT
        # Compute statistics (success rate defaults to 100 when no invocations)
        total_invocations = len(invocations)
        success_rate = (
            sum(1 for i in invocations if i.get("success", True)) / total_invocations * 100
            if total_invocations else 100
        )
        # Period range, formatted in local time as "MM-DD HH:MM ~ MM-DD HH:MM"
        if scores:
            ts_min = min(s.get("created_at", 0) for s in scores)
            ts_max = max(s.get("created_at", 0) for s in scores)
            period_range = (
                f"{time.strftime('%m-%d %H:%M', time.localtime(ts_min))} ~ "
                f"{time.strftime('%m-%d %H:%M', time.localtime(ts_max))}"
            )
        else:
            period_range = "N/A"
        # Build structured data JSON — compact format to save tokens
        data = {}
        # 1. Sessions — compact: [score, completion, efficiency, cost, satisfaction, category]
        data["sessions"] = [
            [
                round(s.get("composite_score", 0), 2),
                round(s.get("completion_rate", 0), 2),
                round(s.get("efficiency_score", 0), 2),
                round(s.get("cost_efficiency", 0), 2),
                round(s.get("satisfaction_proxy", 0), 2),
                s.get("task_category", ""),
            ]
            for s in scores
        ]
        # 2. Tool usage — compact: {tool: [calls, failures, avg_ms]},
        # ordered by total time spent (descending)
        tool_stats: Dict[str, List[int]] = {}
        for inv in invocations:
            tool = inv.get("tool_name", "")
            if tool not in tool_stats:
                tool_stats[tool] = [0, 0, 0]  # calls, failures, total_ms
            tool_stats[tool][0] += 1
            if not inv.get("success", True):
                tool_stats[tool][1] += 1
            tool_stats[tool][2] += inv.get("duration_ms", 0) or 0
        data["tools"] = {
            t: [v[0], v[1], round(v[2] / max(v[0], 1))]
            for t, v in sorted(tool_stats.items(), key=lambda x: x[1][2], reverse=True)
        }
        # 3. Signals — compact: {type: count}
        signal_types = {}
        for s in signals:
            stype = s.get("signal_type", "unknown")
            signal_types[stype] = signal_types.get(stype, 0) + 1
        data["signals"] = signal_types
        # 4. Errors — only non-empty fields are serialized
        err_data = {}
        if errors.tool_failures:
            err_data["tool_failures"] = [
                f"{tf.tool_name}:{tf.error_type}x{tf.count}"
                for tf in errors.tool_failures
            ]
        if errors.retry_patterns:
            err_data["retries"] = [
                f"{rp.tool_name}x{rp.attempt_count}"
                for rp in errors.retry_patterns[:5]
            ]
        if errors.incomplete_sessions:
            err_data["incomplete"] = len(errors.incomplete_sessions)
        if errors.user_corrections:
            err_data["corrections"] = errors.user_corrections
        if errors.correction_examples:
            err_data["correction_examples"] = errors.correction_examples[:2]
        if errors.api_error_count:
            err_data["api_errors"] = errors.api_error_count
        if err_data:
            data["errors"] = err_data
        # 5. Waste — only non-empty fields are serialized
        waste_data = {}
        if waste.slowest_tools:
            waste_data["slowest"] = [
                f"{td.tool_name} {round(td.avg_duration_ms)}ms/{td.call_count}calls"
                for td in waste.slowest_tools[:5]
            ]
        if waste.repeated_operations:
            waste_data["repeated"] = [
                f"{ro.description} x{ro.count}"
                for ro in waste.repeated_operations[:3]
            ]
        if waste.inefficient_sessions:
            waste_data["inefficient"] = len(waste.inefficient_sessions)
        if waste_data:
            data["waste"] = waste_data
        # 6. Code changes — flat compact format, capped at 10 commits
        if code_analysis and code_analysis.commits:
            cc = code_analysis
            commits_data = []
            for c in cc.commits[:10]:
                entry = f"{c.hash_short} {c.subject} +{c.insertions}/-{c.deletions}"
                if c.file_list:
                    entry += f" [{','.join(c.file_list[:5])}]"
                if c.body:
                    entry += f" | {c.body[:150]}"
                commits_data.append(entry)
            data["code_changes"] = {
                "stats": f"{cc.total_commits} commits +{cc.total_insertions}/-{cc.total_deletions} lines {cc.total_files_changed} files",
                "categories": cc.change_categories,
                "areas": cc.areas_touched,
            "commits": commits_data,
            }
        data_json = json.dumps(data, ensure_ascii=False, indent=2)
        # Fill template — plain str.replace, not str.format, so literal
        # braces elsewhere in the template are left untouched
        prompt = template.replace("{period_range}", period_range)
        prompt = prompt.replace("{sessions_count}", str(len(scores)))
        prompt = prompt.replace("{avg_score}", f"{avg_score:.3f}")
        prompt = prompt.replace("{total_invocations}", str(total_invocations))
        prompt = prompt.replace("{success_rate}", f"{success_rate:.1f}")
        prompt = prompt.replace("{data_json}", data_json)
        return prompt
# ── Model Call ────────────────────────────────────────────────────────
    def _call_model(self, prompt: str) -> Optional[str]:
        """Call the active model with automatic failover.

        Resolution order:
        1. Primary model via the active text config
        2. Fallback model from ``config["fallback"]`` if the primary fails

        NOTE(review): the "probes primary every 30 min and switches back"
        behavior described by the original docs lives in the model_config
        helpers (``_get_active_text_config`` / ``_switch_to_fallback``),
        not in this method — confirm against model_config.py.

        Args:
            prompt: User prompt; stored on ``self._current_prompt`` so
                ``_call_chat_completions`` can read it.

        Returns:
            Model response text, or None when config is incomplete or both
            primary and fallback calls fail.
        """
        self._current_prompt = prompt
        active_cfg, is_fallback = _get_active_text_config(self.config)
        base_url = active_cfg.get("base_url", "")
        api_key = active_cfg.get("api_key", "")
        model = active_cfg.get("model", "")
        if not base_url or not model:
            logger.warning("Incomplete runtime config: base_url=%s model=%s",
                           bool(base_url), model)
            return None
        result = self._call_chat_completions(base_url, api_key, model)
        # If primary failed, try fallback (only when we are not already on it)
        if result is None and not is_fallback:
            fallback = self.config.get("fallback", {})
            if fallback.get("base_url") and fallback.get("model"):
                logger.warning("Primary model failed, trying fallback: %s",
                               fallback.get("model"))
                result = self._call_chat_completions(
                    fallback["base_url"], fallback.get("api_key", ""),
                    fallback["model"],
                )
                # Record the switch only when the fallback actually answered
                if result is not None:
                    _switch_to_fallback()
        return result
def _call_chat_completions(
self, base_url: str, api_key: str, model: str,
) -> Optional[str]:
"""Call OpenAI-compatible /chat/completions endpoint."""
try:
import requests
url = f"{base_url.rstrip('/')}/chat/completions"
headers = {"Content-Type": "application/json"}
if api_key:
headers["Authorization"] = f"Bearer {api_key}"
resp = requests.post(
url,
headers=headers,
json={
"model": model,
"messages": [
{"role": "system", "content": _SYSTEM_PROMPT},
{"role": "user", "content": self._current_prompt or ""},
],
"temperature": 0.3,
},
timeout=300,
)
if resp.status_code == 200:
data = resp.json()
return data.get("choices", [{}])[0].get("message", {}).get("content", "")
else:
logger.debug("Model call failed: %d %s", resp.status_code, resp.text[:200])
except Exception as exc:
logger.debug("Chat completions call failed: %s", exc)
return None
# ── Multimodal Call ───────────────────────────────────────────────────
    def call_multimodal(self, prompt: str, images: Optional[list] = None) -> Optional[str]:
        """Call multimodal model with text and optional images.

        Routes to the model configured under ``config["multimodal"]`` when
        one is set; otherwise falls back to the text model via _call_model.

        Args:
            prompt: Text prompt.
            images: List of image data, each item is either:
                - URL string (http/https/data:image)
                - bytes (raw image data, auto-encoded to base64 PNG data URL)
        Returns:
            Model response text, or None on failure.
        """
        mm = self.config.get("multimodal", {})
        if not mm or not mm.get("base_url"):
            logger.debug("No multimodal model configured, falling back to text")
            return self._call_model(prompt)
        # Build OpenAI-style multi-part content: text first, then images
        content = [{"type": "text", "text": prompt}]
        for img in (images or []):
            if isinstance(img, bytes):
                import base64
                b64 = base64.b64encode(img).decode()
                content.append({
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{b64}"},
                })
            elif isinstance(img, str):
                content.append({
                    "type": "image_url",
                    "image_url": {"url": img},
                })
        try:
            # Third-party client, imported lazily so the module loads without it
            from openai import OpenAI
            client = OpenAI(
                # Append "/v1" only when the configured base_url lacks it
                base_url=mm["base_url"].rstrip("/") + ("/v1" if not mm["base_url"].rstrip("/").endswith("/v1") else ""),
                api_key=mm.get("api_key") or "no-key",
            )
            resp = client.chat.completions.create(
                model=mm["model"],
                messages=[{"role": "user", "content": content}],
                temperature=0.3,
                max_tokens=2000,
                timeout=120,
            )
            return resp.choices[0].message.content
        except Exception as exc:
            logger.debug("Multimodal call failed: %s", exc)
            return None
# ── Reflection Parsing ────────────────────────────────────────────────
def _parse_reflection(
self,
reflection_text: str,
period_start: float,
period_end: float,
sessions_analyzed: int,
avg_score: float,
error_analysis: ErrorAnalysis,
waste_analysis: WasteAnalysis,
code_analysis: CodeChangeAnalysis = None,
) -> ReflectionReport:
"""Parse model output into structured ReflectionReport.
Extraction cascade:
1. Direct JSON parse
2. Strip markdown ```json ... ``` wrapper, retry JSON
3. Extract JSON object via regex (handle trailing text)
4. Text-based section extraction (fallback)
"""
worst_patterns = []
best_patterns = []
recommendations = []
tool_insights = {}
text = reflection_text.strip()
# 1. Direct JSON parse
data = _try_parse_json(text)
if data is None:
# 2. Strip markdown wrapper
m = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
if m:
data = _try_parse_json(m.group(1))
if data is None:
# 3. Regex extract first JSON object
m = re.search(r'\{[^{}]*"(?:worst|best|recommendations)"[^{}]*\}', text, re.DOTALL)
if m:
data = _try_parse_json(m.group(0))
if data is None:
# 3.5. Broader regex — find outermost braces
start = text.find('{')
end = text.rfind('}')
if start != -1 and end > start:
data = _try_parse_json(text[start:end + 1])
if data is not None:
worst_patterns = data.get("worst_patterns") or []
best_patterns = data.get("best_patterns") or []
recommendations = data.get("recommendations") or []
tool_insights = data.get("tool_insights") or {}
else:
# 4. Text-based extraction
section = None
for line in text.split("\n"):
stripped = line.strip()
lower = stripped.lower()
if ("worst" in lower and "pattern" in lower) or "最差" in stripped or "错误模式" in stripped:
section = "worst"
elif ("best" in lower and "pattern" in lower) or "最佳" in stripped or "成功" in stripped:
section = "best"
elif ("recommend" in lower) or "建议" in stripped:
section = "rec"
elif stripped.startswith("- ") or stripped.startswith("* ") or stripped.startswith(""):
item = stripped.lstrip("-*• ").strip()
if section == "worst":
worst_patterns.append(item)
elif section == "best":
best_patterns.append(item)
elif section == "rec":
recommendations.append(item)
elif len(stripped) > 2 and stripped[0].isdigit() and stripped[1] in ".)" and stripped[2] == " ":
item = stripped[3:].strip()
if section == "worst":
worst_patterns.append(item)
elif section == "best":
best_patterns.append(item)
elif section == "rec":
recommendations.append(item)
return ReflectionReport(
period_start=period_start,
period_end=period_end,
sessions_analyzed=sessions_analyzed,
avg_score=avg_score,
error_summary=error_analysis.summary(),
waste_summary=waste_analysis.summary(),
worst_patterns=worst_patterns,
best_patterns=best_patterns,
tool_insights=tool_insights,
recommendations=recommendations,
code_change_summary=code_analysis.summary() if code_analysis else "",
model_used=self.config.get("model", "unknown"),
)
# ── Default Prompt Template ──────────────────────────────────────────────
# System prompt (Chinese): instructs the model to act as the Hermes Agent
# performance-analysis engine and emit strict JSON (no markdown) with keys
# worst_patterns / best_patterns / tool_insights / recommendations, at most
# 5 recommendations, empty arrays when there is no data.
_SYSTEM_PROMPT = (
    "你是 Hermes Agent 性能分析引擎。分析运行数据+代码变更输出严格JSON无markdown\n"
    "格式:\n"
    '{"worst_patterns":["模式(工具+场景+根因)"],"best_patterns":["成功经验"],'
    '"tool_insights":{"工具":{"sr":0.95,"ms":500,"rec":"建议"}},'
    '"recommendations":["做什么|效果|风险(l/m/h)|验证"]}\n'
    "重点:系统性错误>偶发,错误连锁,策略vs工具问题,重复操作,代码设计合理性,自我进化状态,"
    "可固化流程。≤5条建议,优先高影响低风险。无数据时输出空数组。"
)
# Fallback user-prompt template (Chinese) used when prompts/reflection.md is
# absent. The {placeholders} are filled by _build_reflection_prompt via
# str.replace, not str.format.
_DEFAULT_REFLECTION_PROMPT = """## 概况
- 时段: {period_range}
- Session : {sessions_count}, 平均质量: {avg_score}
- 工具调用: {total_invocations} , 成功率 {success_rate}%
## 数据
{data_json}
"""
def _try_parse_json(text: str) -> Optional[dict]:
"""Try to parse JSON, returning None on any failure."""
try:
data = json.loads(text)
if isinstance(data, dict):
return data
except (json.JSONDecodeError, ValueError):
pass
return None