mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
Merge branch 'main' of github.com:NousResearch/hermes-agent into feat/ink-refactor
This commit is contained in:
commit
7e4dd6ea02
220 changed files with 23482 additions and 1959 deletions
270
gateway/run.py
270
gateway/run.py
|
|
@ -186,6 +186,8 @@ if _config_path.exists():
|
|||
os.environ["HERMES_AGENT_TIMEOUT"] = str(_agent_cfg["gateway_timeout"])
|
||||
if "gateway_timeout_warning" in _agent_cfg and "HERMES_AGENT_TIMEOUT_WARNING" not in os.environ:
|
||||
os.environ["HERMES_AGENT_TIMEOUT_WARNING"] = str(_agent_cfg["gateway_timeout_warning"])
|
||||
if "gateway_notify_interval" in _agent_cfg and "HERMES_AGENT_NOTIFY_INTERVAL" not in os.environ:
|
||||
os.environ["HERMES_AGENT_NOTIFY_INTERVAL"] = str(_agent_cfg["gateway_notify_interval"])
|
||||
if "restart_drain_timeout" in _agent_cfg and "HERMES_RESTART_DRAIN_TIMEOUT" not in os.environ:
|
||||
os.environ["HERMES_RESTART_DRAIN_TIMEOUT"] = str(_agent_cfg["restart_drain_timeout"])
|
||||
_display_cfg = _cfg.get("display", {})
|
||||
|
|
@ -1715,6 +1717,9 @@ class GatewayRunner:
|
|||
):
|
||||
self._schedule_update_notification_watch()
|
||||
|
||||
# Notify the chat that initiated /restart that the gateway is back.
|
||||
await self._send_restart_notification()
|
||||
|
||||
# Drain any recovered process watchers (from crash recovery checkpoint)
|
||||
try:
|
||||
from tools.process_registry import process_registry
|
||||
|
|
@ -2541,11 +2546,8 @@ class GatewayRunner:
|
|||
self._pending_messages.pop(_quick_key, None)
|
||||
if _quick_key in self._running_agents:
|
||||
del self._running_agents[_quick_key]
|
||||
# Mark session suspended so the next message starts fresh
|
||||
# instead of resuming the stuck context (#7536).
|
||||
self.session_store.suspend_session(_quick_key)
|
||||
logger.info("HARD STOP for session %s — suspended, session lock released", _quick_key[:20])
|
||||
return "⚡ Force-stopped. The session is suspended — your next message will start fresh."
|
||||
logger.info("STOP for session %s — agent interrupted, session lock released", _quick_key[:20])
|
||||
return "⚡ Stopped. You can continue this session."
|
||||
|
||||
# /reset and /new must bypass the running-agent guard so they
|
||||
# actually dispatch as commands instead of being queued as user
|
||||
|
|
@ -2762,6 +2764,9 @@ class GatewayRunner:
|
|||
if canonical == "update":
|
||||
return await self._handle_update_command(event)
|
||||
|
||||
if canonical == "debug":
|
||||
return await self._handle_debug_command(event)
|
||||
|
||||
if canonical == "title":
|
||||
return await self._handle_title_command(event)
|
||||
|
||||
|
|
@ -3329,21 +3334,26 @@ class GatewayRunner:
|
|||
# Must run after runtime resolution so _hyg_base_url is set.
|
||||
if _hyg_config_context_length is None and _hyg_base_url:
|
||||
try:
|
||||
_hyg_custom_providers = _hyg_data.get("custom_providers")
|
||||
if isinstance(_hyg_custom_providers, list):
|
||||
for _cp in _hyg_custom_providers:
|
||||
if not isinstance(_cp, dict):
|
||||
continue
|
||||
_cp_url = (_cp.get("base_url") or "").rstrip("/")
|
||||
if _cp_url and _cp_url == _hyg_base_url.rstrip("/"):
|
||||
_cp_models = _cp.get("models", {})
|
||||
if isinstance(_cp_models, dict):
|
||||
_cp_model_cfg = _cp_models.get(_hyg_model, {})
|
||||
if isinstance(_cp_model_cfg, dict):
|
||||
_cp_ctx = _cp_model_cfg.get("context_length")
|
||||
if _cp_ctx is not None:
|
||||
_hyg_config_context_length = int(_cp_ctx)
|
||||
break
|
||||
try:
|
||||
from hermes_cli.config import get_compatible_custom_providers as _gw_gcp
|
||||
_hyg_custom_providers = _gw_gcp(_hyg_data)
|
||||
except Exception:
|
||||
_hyg_custom_providers = _hyg_data.get("custom_providers")
|
||||
if not isinstance(_hyg_custom_providers, list):
|
||||
_hyg_custom_providers = []
|
||||
for _cp in _hyg_custom_providers:
|
||||
if not isinstance(_cp, dict):
|
||||
continue
|
||||
_cp_url = (_cp.get("base_url") or "").rstrip("/")
|
||||
if _cp_url and _cp_url == _hyg_base_url.rstrip("/"):
|
||||
_cp_models = _cp.get("models", {})
|
||||
if isinstance(_cp_models, dict):
|
||||
_cp_model_cfg = _cp_models.get(_hyg_model, {})
|
||||
if isinstance(_cp_model_cfg, dict):
|
||||
_cp_ctx = _cp_model_cfg.get("context_length")
|
||||
if _cp_ctx is not None:
|
||||
_hyg_config_context_length = int(_cp_ctx)
|
||||
break
|
||||
except (TypeError, ValueError):
|
||||
pass
|
||||
except Exception:
|
||||
|
|
@ -4204,9 +4214,7 @@ class GatewayRunner:
|
|||
only through normal command dispatch (no running agent) or as a
|
||||
fallback. Force-clean the session lock in all cases for safety.
|
||||
|
||||
When there IS a running/pending agent, the session is also marked
|
||||
as *suspended* so the next message starts a fresh session instead
|
||||
of resuming the stuck context (#7536).
|
||||
The session is preserved so the user can continue the conversation.
|
||||
"""
|
||||
source = event.source
|
||||
session_entry = self.session_store.get_or_create_session(source)
|
||||
|
|
@ -4217,17 +4225,15 @@ class GatewayRunner:
|
|||
# Force-clean the sentinel so the session is unlocked.
|
||||
if session_key in self._running_agents:
|
||||
del self._running_agents[session_key]
|
||||
self.session_store.suspend_session(session_key)
|
||||
logger.info("HARD STOP (pending) for session %s — suspended, sentinel cleared", session_key[:20])
|
||||
return "⚡ Force-stopped. The agent was still starting — your next message will start fresh."
|
||||
logger.info("STOP (pending) for session %s — sentinel cleared", session_key[:20])
|
||||
return "⚡ Stopped. The agent hadn't started yet — you can continue this session."
|
||||
if agent:
|
||||
agent.interrupt("Stop requested")
|
||||
# Force-clean the session lock so a truly hung agent doesn't
|
||||
# keep it locked forever.
|
||||
if session_key in self._running_agents:
|
||||
del self._running_agents[session_key]
|
||||
self.session_store.suspend_session(session_key)
|
||||
return "⚡ Force-stopped. Your next message will start a fresh session."
|
||||
return "⚡ Stopped. You can continue this session."
|
||||
else:
|
||||
return "No active task to stop."
|
||||
|
||||
|
|
@ -4239,11 +4245,36 @@ class GatewayRunner:
|
|||
return f"⏳ Draining {count} active agent(s) before restart..."
|
||||
return "⏳ Gateway restart already in progress..."
|
||||
|
||||
# Save the requester's routing info so the new gateway process can
|
||||
# notify them once it comes back online.
|
||||
try:
|
||||
import json as _json
|
||||
notify_data = {
|
||||
"platform": event.source.platform.value if event.source.platform else None,
|
||||
"chat_id": event.source.chat_id,
|
||||
}
|
||||
if event.source.thread_id:
|
||||
notify_data["thread_id"] = event.source.thread_id
|
||||
(_hermes_home / ".restart_notify.json").write_text(
|
||||
_json.dumps(notify_data)
|
||||
)
|
||||
except Exception as e:
|
||||
logger.debug("Failed to write restart notify file: %s", e)
|
||||
|
||||
active_agents = self._running_agent_count()
|
||||
self.request_restart(detached=True, via_service=False)
|
||||
# When running under a service manager (systemd/launchd), use the
|
||||
# service restart path: exit with code 75 so the service manager
|
||||
# restarts us. The detached subprocess approach (setsid + bash)
|
||||
# doesn't work under systemd because KillMode=mixed kills all
|
||||
# processes in the cgroup, including the detached helper.
|
||||
_under_service = bool(os.environ.get("INVOCATION_ID")) # systemd sets this
|
||||
if _under_service:
|
||||
self.request_restart(detached=False, via_service=True)
|
||||
else:
|
||||
self.request_restart(detached=True, via_service=False)
|
||||
if active_agents:
|
||||
return f"⏳ Draining {active_agents} active agent(s) before restart..."
|
||||
return "♻ Restarting gateway..."
|
||||
return "♻ Restarting gateway. If you aren't notified within 60 seconds, restart from the console with `hermes gateway restart`."
|
||||
|
||||
async def _handle_help_command(self, event: MessageEvent) -> str:
|
||||
"""Handle /help command - list available commands."""
|
||||
|
|
@ -4360,7 +4391,11 @@ class GatewayRunner:
|
|||
current_provider = model_cfg.get("provider", current_provider)
|
||||
current_base_url = model_cfg.get("base_url", "")
|
||||
user_provs = cfg.get("providers")
|
||||
custom_provs = cfg.get("custom_providers")
|
||||
try:
|
||||
from hermes_cli.config import get_compatible_custom_providers
|
||||
custom_provs = get_compatible_custom_providers(cfg)
|
||||
except Exception:
|
||||
custom_provs = cfg.get("custom_providers")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
|
@ -4991,6 +5026,8 @@ class GatewayRunner:
|
|||
|
||||
if success:
|
||||
adapter._voice_text_channels[guild_id] = int(event.source.chat_id)
|
||||
if hasattr(adapter, "_voice_sources"):
|
||||
adapter._voice_sources[guild_id] = event.source.to_dict()
|
||||
self._voice_mode[event.source.chat_id] = "all"
|
||||
self._save_voice_modes()
|
||||
self._set_adapter_auto_tts_disabled(adapter, event.source.chat_id, disabled=False)
|
||||
|
|
@ -5051,14 +5088,23 @@ class GatewayRunner:
|
|||
if not text_ch_id:
|
||||
return
|
||||
|
||||
# Build source — reuse the linked text channel's metadata when available
|
||||
# so voice input shares the same session as the bound text conversation.
|
||||
source_data = getattr(adapter, "_voice_sources", {}).get(guild_id)
|
||||
if source_data:
|
||||
source = SessionSource.from_dict(source_data)
|
||||
source.user_id = str(user_id)
|
||||
source.user_name = str(user_id)
|
||||
else:
|
||||
source = SessionSource(
|
||||
platform=Platform.DISCORD,
|
||||
chat_id=str(text_ch_id),
|
||||
user_id=str(user_id),
|
||||
user_name=str(user_id),
|
||||
chat_type="channel",
|
||||
)
|
||||
|
||||
# Check authorization before processing voice input
|
||||
source = SessionSource(
|
||||
platform=Platform.DISCORD,
|
||||
chat_id=str(text_ch_id),
|
||||
user_id=str(user_id),
|
||||
user_name=str(user_id),
|
||||
chat_type="channel",
|
||||
)
|
||||
if not self._is_user_authorized(source):
|
||||
logger.debug("Unauthorized voice input from user %d, ignoring", user_id)
|
||||
return
|
||||
|
|
@ -6523,6 +6569,61 @@ class GatewayRunner:
|
|||
Platform.FEISHU, Platform.WECOM, Platform.WECOM_CALLBACK, Platform.WEIXIN, Platform.BLUEBUBBLES, Platform.LOCAL,
|
||||
})
|
||||
|
||||
async def _handle_debug_command(self, event: MessageEvent) -> str:
|
||||
"""Handle /debug — upload debug report + logs and return paste URLs."""
|
||||
import asyncio
|
||||
from hermes_cli.debug import (
|
||||
_capture_dump, collect_debug_report, _read_full_log,
|
||||
upload_to_pastebin,
|
||||
)
|
||||
|
||||
loop = asyncio.get_running_loop()
|
||||
|
||||
# Run blocking I/O (dump capture, log reads, uploads) in a thread.
|
||||
def _collect_and_upload():
|
||||
dump_text = _capture_dump()
|
||||
report = collect_debug_report(log_lines=200, dump_text=dump_text)
|
||||
agent_log = _read_full_log("agent")
|
||||
gateway_log = _read_full_log("gateway")
|
||||
|
||||
if agent_log:
|
||||
agent_log = dump_text + "\n\n--- full agent.log ---\n" + agent_log
|
||||
if gateway_log:
|
||||
gateway_log = dump_text + "\n\n--- full gateway.log ---\n" + gateway_log
|
||||
|
||||
urls = {}
|
||||
failures = []
|
||||
|
||||
try:
|
||||
urls["Report"] = upload_to_pastebin(report)
|
||||
except Exception as exc:
|
||||
return f"✗ Failed to upload debug report: {exc}"
|
||||
|
||||
if agent_log:
|
||||
try:
|
||||
urls["agent.log"] = upload_to_pastebin(agent_log)
|
||||
except Exception:
|
||||
failures.append("agent.log")
|
||||
|
||||
if gateway_log:
|
||||
try:
|
||||
urls["gateway.log"] = upload_to_pastebin(gateway_log)
|
||||
except Exception:
|
||||
failures.append("gateway.log")
|
||||
|
||||
lines = ["**Debug report uploaded:**", ""]
|
||||
label_width = max(len(k) for k in urls)
|
||||
for label, url in urls.items():
|
||||
lines.append(f"`{label:<{label_width}}` {url}")
|
||||
|
||||
if failures:
|
||||
lines.append(f"\n_(failed to upload: {', '.join(failures)})_")
|
||||
|
||||
lines.append("\nShare these links with the Hermes team for support.")
|
||||
return "\n".join(lines)
|
||||
|
||||
return await loop.run_in_executor(None, _collect_and_upload)
|
||||
|
||||
async def _handle_update_command(self, event: MessageEvent) -> str:
|
||||
"""Handle /update command — update Hermes Agent to the latest version.
|
||||
|
||||
|
|
@ -6917,6 +7018,48 @@ class GatewayRunner:
|
|||
|
||||
return True
|
||||
|
||||
async def _send_restart_notification(self) -> None:
|
||||
"""Notify the chat that initiated /restart that the gateway is back."""
|
||||
import json as _json
|
||||
|
||||
notify_path = _hermes_home / ".restart_notify.json"
|
||||
if not notify_path.exists():
|
||||
return
|
||||
|
||||
try:
|
||||
data = _json.loads(notify_path.read_text())
|
||||
platform_str = data.get("platform")
|
||||
chat_id = data.get("chat_id")
|
||||
thread_id = data.get("thread_id")
|
||||
|
||||
if not platform_str or not chat_id:
|
||||
return
|
||||
|
||||
platform = Platform(platform_str)
|
||||
adapter = self.adapters.get(platform)
|
||||
if not adapter:
|
||||
logger.debug(
|
||||
"Restart notification skipped: %s adapter not connected",
|
||||
platform_str,
|
||||
)
|
||||
return
|
||||
|
||||
metadata = {"thread_id": thread_id} if thread_id else None
|
||||
await adapter.send(
|
||||
chat_id,
|
||||
"♻ Gateway restarted successfully. Your session continues.",
|
||||
metadata=metadata,
|
||||
)
|
||||
logger.info(
|
||||
"Sent restart notification to %s:%s",
|
||||
platform_str,
|
||||
chat_id,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning("Restart notification failed: %s", e)
|
||||
finally:
|
||||
notify_path.unlink(missing_ok=True)
|
||||
|
||||
def _set_session_env(self, context: SessionContext) -> list:
|
||||
"""Set session context variables for the current async task.
|
||||
|
||||
|
|
@ -7448,9 +7591,11 @@ class GatewayRunner:
|
|||
_pl = get_tool_preview_max_len()
|
||||
import json as _json
|
||||
args_str = _json.dumps(args, ensure_ascii=False, default=str)
|
||||
_cap = _pl if _pl > 0 else 200
|
||||
if len(args_str) > _cap:
|
||||
args_str = args_str[:_cap - 3] + "..."
|
||||
# When tool_preview_length is 0 (default), don't truncate
|
||||
# in verbose mode — the user explicitly asked for full
|
||||
# detail. Platform message-length limits handle the rest.
|
||||
if _pl > 0 and len(args_str) > _pl:
|
||||
args_str = args_str[:_pl - 3] + "..."
|
||||
msg = f"{emoji} {tool_name}({list(args.keys())})\n{args_str}"
|
||||
elif preview:
|
||||
msg = f"{emoji} {tool_name}: \"{preview}\""
|
||||
|
|
@ -7760,10 +7905,18 @@ class GatewayRunner:
|
|||
from gateway.stream_consumer import GatewayStreamConsumer, StreamConsumerConfig
|
||||
_adapter = self.adapters.get(source.platform)
|
||||
if _adapter:
|
||||
# Platforms that don't support editing sent messages
|
||||
# (e.g. WeChat) must not show a cursor in intermediate
|
||||
# sends — the cursor would be permanently visible because
|
||||
# it can never be edited away. Use an empty cursor for
|
||||
# such platforms so streaming still delivers the final
|
||||
# response, just without the typing indicator.
|
||||
_adapter_supports_edit = getattr(_adapter, "SUPPORTS_MESSAGE_EDITING", True)
|
||||
_effective_cursor = _scfg.cursor if _adapter_supports_edit else ""
|
||||
_consumer_cfg = StreamConsumerConfig(
|
||||
edit_interval=_scfg.edit_interval,
|
||||
buffer_threshold=_scfg.buffer_threshold,
|
||||
cursor=_scfg.cursor,
|
||||
cursor=_effective_cursor,
|
||||
)
|
||||
_stream_consumer = GatewayStreamConsumer(
|
||||
adapter=_adapter,
|
||||
|
|
@ -8243,11 +8396,17 @@ class GatewayRunner:
|
|||
interrupt_monitor = asyncio.create_task(monitor_for_interrupt())
|
||||
|
||||
# Periodic "still working" notifications for long-running tasks.
|
||||
# Fires every 10 minutes so the user knows the agent hasn't died.
|
||||
_NOTIFY_INTERVAL = 600 # 10 minutes
|
||||
# Fires every N seconds so the user knows the agent hasn't died.
|
||||
# Config: agent.gateway_notify_interval in config.yaml, or
|
||||
# HERMES_AGENT_NOTIFY_INTERVAL env var. Default 600s (10 min).
|
||||
# 0 = disable notifications.
|
||||
_NOTIFY_INTERVAL_RAW = float(os.getenv("HERMES_AGENT_NOTIFY_INTERVAL", 600))
|
||||
_NOTIFY_INTERVAL = _NOTIFY_INTERVAL_RAW if _NOTIFY_INTERVAL_RAW > 0 else None
|
||||
_notify_start = time.time()
|
||||
|
||||
async def _notify_long_running():
|
||||
if _NOTIFY_INTERVAL is None:
|
||||
return # Notifications disabled (gateway_notify_interval: 0)
|
||||
_notify_adapter = self.adapters.get(source.platform)
|
||||
if not _notify_adapter:
|
||||
return
|
||||
|
|
@ -8842,16 +9001,19 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool =
|
|||
runner.request_restart(detached=False, via_service=True)
|
||||
|
||||
loop = asyncio.get_event_loop()
|
||||
for sig in (signal.SIGINT, signal.SIGTERM):
|
||||
try:
|
||||
loop.add_signal_handler(sig, shutdown_signal_handler)
|
||||
except NotImplementedError:
|
||||
pass
|
||||
if hasattr(signal, "SIGUSR1"):
|
||||
try:
|
||||
loop.add_signal_handler(signal.SIGUSR1, restart_signal_handler)
|
||||
except NotImplementedError:
|
||||
pass
|
||||
if threading.current_thread() is threading.main_thread():
|
||||
for sig in (signal.SIGINT, signal.SIGTERM):
|
||||
try:
|
||||
loop.add_signal_handler(sig, shutdown_signal_handler)
|
||||
except NotImplementedError:
|
||||
pass
|
||||
if hasattr(signal, "SIGUSR1"):
|
||||
try:
|
||||
loop.add_signal_handler(signal.SIGUSR1, restart_signal_handler)
|
||||
except NotImplementedError:
|
||||
pass
|
||||
else:
|
||||
logger.info("Skipping signal handlers (not running in main thread).")
|
||||
|
||||
# Start the gateway
|
||||
success = await runner.start()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue