mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
feat(cron): track delivery failures in job status (#6042)
_deliver_result() now returns Optional[str]: None on success, or an error message on failure. All failure paths (unknown platform, platform disabled, config load error, send failure, unresolvable target) return descriptive error strings. mark_job_run() gains a delivery_error parameter, which is stored on the job as last_delivery_error and tracked separately from agent execution errors. A job where the agent succeeded but delivery failed shows last_status='ok' together with last_delivery_error='...'. The cronjob list tool now surfaces last_delivery_error so agents and users can see when cron outputs aren't arriving. Inspired by PR #5863 (oxngon) — reimplemented with proper wiring. Tests: 3 new mark_job_run tests + 6 new _deliver_result return-value tests.
This commit is contained in:
parent
598c25d43e
commit
fff237e111
6 changed files with 167 additions and 22 deletions
|
|
@ -574,12 +574,16 @@ def remove_job(job_id: str) -> bool:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
def mark_job_run(job_id: str, success: bool, error: Optional[str] = None):
|
def mark_job_run(job_id: str, success: bool, error: Optional[str] = None,
|
||||||
|
delivery_error: Optional[str] = None):
|
||||||
"""
|
"""
|
||||||
Mark a job as having been run.
|
Mark a job as having been run.
|
||||||
|
|
||||||
Updates last_run_at, last_status, increments completed count,
|
Updates last_run_at, last_status, increments completed count,
|
||||||
computes next_run_at, and auto-deletes if repeat limit reached.
|
computes next_run_at, and auto-deletes if repeat limit reached.
|
||||||
|
|
||||||
|
``delivery_error`` is tracked separately from the agent error — a job
|
||||||
|
can succeed (agent produced output) but fail delivery (platform down).
|
||||||
"""
|
"""
|
||||||
jobs = load_jobs()
|
jobs = load_jobs()
|
||||||
for i, job in enumerate(jobs):
|
for i, job in enumerate(jobs):
|
||||||
|
|
@ -588,6 +592,8 @@ def mark_job_run(job_id: str, success: bool, error: Optional[str] = None):
|
||||||
job["last_run_at"] = now
|
job["last_run_at"] = now
|
||||||
job["last_status"] = "ok" if success else "error"
|
job["last_status"] = "ok" if success else "error"
|
||||||
job["last_error"] = error if not success else None
|
job["last_error"] = error if not success else None
|
||||||
|
# Track delivery failures separately — cleared on successful delivery
|
||||||
|
job["last_delivery_error"] = delivery_error
|
||||||
|
|
||||||
# Increment completed count
|
# Increment completed count
|
||||||
if job.get("repeat"):
|
if job.get("repeat"):
|
||||||
|
|
|
||||||
|
|
@ -196,7 +196,7 @@ def _send_media_via_adapter(adapter, chat_id: str, media_files: list, metadata:
|
||||||
logger.warning("Job '%s': failed to send media %s: %s", job.get("id", "?"), media_path, e)
|
logger.warning("Job '%s': failed to send media %s: %s", job.get("id", "?"), media_path, e)
|
||||||
|
|
||||||
|
|
||||||
def _deliver_result(job: dict, content: str, adapters=None, loop=None) -> None:
|
def _deliver_result(job: dict, content: str, adapters=None, loop=None) -> Optional[str]:
|
||||||
"""
|
"""
|
||||||
Deliver job output to the configured target (origin chat, specific platform, etc.).
|
Deliver job output to the configured target (origin chat, specific platform, etc.).
|
||||||
|
|
||||||
|
|
@ -204,16 +204,16 @@ def _deliver_result(job: dict, content: str, adapters=None, loop=None) -> None:
|
||||||
use the live adapter first — this supports E2EE rooms (e.g. Matrix) where
|
use the live adapter first — this supports E2EE rooms (e.g. Matrix) where
|
||||||
the standalone HTTP path cannot encrypt. Falls back to standalone send if
|
the standalone HTTP path cannot encrypt. Falls back to standalone send if
|
||||||
the adapter path fails or is unavailable.
|
the adapter path fails or is unavailable.
|
||||||
|
|
||||||
|
Returns None on success, or an error string on failure.
|
||||||
"""
|
"""
|
||||||
target = _resolve_delivery_target(job)
|
target = _resolve_delivery_target(job)
|
||||||
if not target:
|
if not target:
|
||||||
if job.get("deliver", "local") != "local":
|
if job.get("deliver", "local") != "local":
|
||||||
logger.warning(
|
msg = f"no delivery target resolved for deliver={job.get('deliver', 'local')}"
|
||||||
"Job '%s' deliver=%s but no concrete delivery target could be resolved",
|
logger.warning("Job '%s': %s", job["id"], msg)
|
||||||
job["id"],
|
return msg
|
||||||
job.get("deliver", "local"),
|
return None # local-only jobs don't deliver — not a failure
|
||||||
)
|
|
||||||
return
|
|
||||||
|
|
||||||
platform_name = target["platform"]
|
platform_name = target["platform"]
|
||||||
chat_id = target["chat_id"]
|
chat_id = target["chat_id"]
|
||||||
|
|
@ -239,19 +239,22 @@ def _deliver_result(job: dict, content: str, adapters=None, loop=None) -> None:
|
||||||
}
|
}
|
||||||
platform = platform_map.get(platform_name.lower())
|
platform = platform_map.get(platform_name.lower())
|
||||||
if not platform:
|
if not platform:
|
||||||
logger.warning("Job '%s': unknown platform '%s' for delivery", job["id"], platform_name)
|
msg = f"unknown platform '{platform_name}'"
|
||||||
return
|
logger.warning("Job '%s': %s", job["id"], msg)
|
||||||
|
return msg
|
||||||
|
|
||||||
try:
|
try:
|
||||||
config = load_gateway_config()
|
config = load_gateway_config()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error("Job '%s': failed to load gateway config for delivery: %s", job["id"], e)
|
msg = f"failed to load gateway config: {e}"
|
||||||
return
|
logger.error("Job '%s': %s", job["id"], msg)
|
||||||
|
return msg
|
||||||
|
|
||||||
pconfig = config.platforms.get(platform)
|
pconfig = config.platforms.get(platform)
|
||||||
if not pconfig or not pconfig.enabled:
|
if not pconfig or not pconfig.enabled:
|
||||||
logger.warning("Job '%s': platform '%s' not configured/enabled", job["id"], platform_name)
|
msg = f"platform '{platform_name}' not configured/enabled"
|
||||||
return
|
logger.warning("Job '%s': %s", job["id"], msg)
|
||||||
|
return msg
|
||||||
|
|
||||||
# Optionally wrap the content with a header/footer so the user knows this
|
# Optionally wrap the content with a header/footer so the user knows this
|
||||||
# is a cron delivery. Wrapping is on by default; set cron.wrap_response: false
|
# is a cron delivery. Wrapping is on by default; set cron.wrap_response: false
|
||||||
|
|
@ -307,7 +310,7 @@ def _deliver_result(job: dict, content: str, adapters=None, loop=None) -> None:
|
||||||
|
|
||||||
if adapter_ok:
|
if adapter_ok:
|
||||||
logger.info("Job '%s': delivered to %s:%s via live adapter", job["id"], platform_name, chat_id)
|
logger.info("Job '%s': delivered to %s:%s via live adapter", job["id"], platform_name, chat_id)
|
||||||
return
|
return None
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"Job '%s': live adapter delivery to %s:%s failed (%s), falling back to standalone",
|
"Job '%s': live adapter delivery to %s:%s failed (%s), falling back to standalone",
|
||||||
|
|
@ -329,13 +332,17 @@ def _deliver_result(job: dict, content: str, adapters=None, loop=None) -> None:
|
||||||
future = pool.submit(asyncio.run, _send_to_platform(platform, pconfig, chat_id, cleaned_delivery_content, thread_id=thread_id, media_files=media_files))
|
future = pool.submit(asyncio.run, _send_to_platform(platform, pconfig, chat_id, cleaned_delivery_content, thread_id=thread_id, media_files=media_files))
|
||||||
result = future.result(timeout=30)
|
result = future.result(timeout=30)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error("Job '%s': delivery to %s:%s failed: %s", job["id"], platform_name, chat_id, e)
|
msg = f"delivery to {platform_name}:{chat_id} failed: {e}"
|
||||||
return
|
logger.error("Job '%s': %s", job["id"], msg)
|
||||||
|
return msg
|
||||||
|
|
||||||
if result and result.get("error"):
|
if result and result.get("error"):
|
||||||
logger.error("Job '%s': delivery error: %s", job["id"], result["error"])
|
msg = f"delivery error: {result['error']}"
|
||||||
else:
|
logger.error("Job '%s': %s", job["id"], msg)
|
||||||
logger.info("Job '%s': delivered to %s:%s", job["id"], platform_name, chat_id)
|
return msg
|
||||||
|
|
||||||
|
logger.info("Job '%s': delivered to %s:%s", job["id"], platform_name, chat_id)
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
_SCRIPT_TIMEOUT = 120 # seconds
|
_SCRIPT_TIMEOUT = 120 # seconds
|
||||||
|
|
@ -868,13 +875,15 @@ def tick(verbose: bool = True, adapters=None, loop=None) -> int:
|
||||||
logger.info("Job '%s': agent returned %s — skipping delivery", job["id"], SILENT_MARKER)
|
logger.info("Job '%s': agent returned %s — skipping delivery", job["id"], SILENT_MARKER)
|
||||||
should_deliver = False
|
should_deliver = False
|
||||||
|
|
||||||
|
delivery_error = None
|
||||||
if should_deliver:
|
if should_deliver:
|
||||||
try:
|
try:
|
||||||
_deliver_result(job, deliver_content, adapters=adapters, loop=loop)
|
delivery_error = _deliver_result(job, deliver_content, adapters=adapters, loop=loop)
|
||||||
except Exception as de:
|
except Exception as de:
|
||||||
|
delivery_error = str(de)
|
||||||
logger.error("Delivery failed for job %s: %s", job["id"], de)
|
logger.error("Delivery failed for job %s: %s", job["id"], de)
|
||||||
|
|
||||||
mark_job_run(job["id"], success, error)
|
mark_job_run(job["id"], success, error, delivery_error=delivery_error)
|
||||||
executed += 1
|
executed += 1
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
|
||||||
|
|
@ -93,6 +93,21 @@ def cron_list(show_all: bool = False):
|
||||||
script = job.get("script")
|
script = job.get("script")
|
||||||
if script:
|
if script:
|
||||||
print(f" Script: {script}")
|
print(f" Script: {script}")
|
||||||
|
|
||||||
|
# Execution history
|
||||||
|
last_status = job.get("last_status")
|
||||||
|
if last_status:
|
||||||
|
last_run = job.get("last_run_at", "?")
|
||||||
|
if last_status == "ok":
|
||||||
|
status_display = color("ok", Colors.GREEN)
|
||||||
|
else:
|
||||||
|
status_display = color(f"{last_status}: {job.get('last_error', '?')}", Colors.RED)
|
||||||
|
print(f" Last run: {last_run} {status_display}")
|
||||||
|
|
||||||
|
delivery_err = job.get("last_delivery_error")
|
||||||
|
if delivery_err:
|
||||||
|
print(f" {color('⚠ Delivery failed:', Colors.YELLOW)} {delivery_err}")
|
||||||
|
|
||||||
print()
|
print()
|
||||||
|
|
||||||
from hermes_cli.gateway import find_gateway_pids
|
from hermes_cli.gateway import find_gateway_pids
|
||||||
|
|
|
||||||
|
|
@ -339,6 +339,36 @@ class TestMarkJobRun:
|
||||||
assert updated["last_status"] == "error"
|
assert updated["last_status"] == "error"
|
||||||
assert updated["last_error"] == "timeout"
|
assert updated["last_error"] == "timeout"
|
||||||
|
|
||||||
|
def test_delivery_error_tracked_separately(self, tmp_cron_dir):
|
||||||
|
"""Agent succeeds but delivery fails — both tracked independently."""
|
||||||
|
job = create_job(prompt="Report", schedule="every 1h")
|
||||||
|
mark_job_run(job["id"], success=True, delivery_error="platform 'telegram' not configured")
|
||||||
|
updated = get_job(job["id"])
|
||||||
|
assert updated["last_status"] == "ok"
|
||||||
|
assert updated["last_error"] is None
|
||||||
|
assert updated["last_delivery_error"] == "platform 'telegram' not configured"
|
||||||
|
|
||||||
|
def test_delivery_error_cleared_on_success(self, tmp_cron_dir):
|
||||||
|
"""Successful delivery clears the previous delivery error."""
|
||||||
|
job = create_job(prompt="Report", schedule="every 1h")
|
||||||
|
mark_job_run(job["id"], success=True, delivery_error="network timeout")
|
||||||
|
updated = get_job(job["id"])
|
||||||
|
assert updated["last_delivery_error"] == "network timeout"
|
||||||
|
# Next run delivers successfully
|
||||||
|
mark_job_run(job["id"], success=True, delivery_error=None)
|
||||||
|
updated = get_job(job["id"])
|
||||||
|
assert updated["last_delivery_error"] is None
|
||||||
|
|
||||||
|
def test_both_agent_and_delivery_error(self, tmp_cron_dir):
|
||||||
|
"""Agent fails AND delivery fails — both errors recorded."""
|
||||||
|
job = create_job(prompt="Report", schedule="every 1h")
|
||||||
|
mark_job_run(job["id"], success=False, error="model timeout",
|
||||||
|
delivery_error="platform 'discord' not enabled")
|
||||||
|
updated = get_job(job["id"])
|
||||||
|
assert updated["last_status"] == "error"
|
||||||
|
assert updated["last_error"] == "model timeout"
|
||||||
|
assert updated["last_delivery_error"] == "platform 'discord' not enabled"
|
||||||
|
|
||||||
|
|
||||||
class TestAdvanceNextRun:
|
class TestAdvanceNextRun:
|
||||||
"""Tests for advance_next_run() — crash-safety for recurring jobs."""
|
"""Tests for advance_next_run() — crash-safety for recurring jobs."""
|
||||||
|
|
|
||||||
|
|
@ -508,6 +508,90 @@ class TestDeliverResultWrapping:
|
||||||
assert send_mock.call_args.kwargs["thread_id"] == "17585"
|
assert send_mock.call_args.kwargs["thread_id"] == "17585"
|
||||||
|
|
||||||
|
|
||||||
|
class TestDeliverResultErrorReturns:
|
||||||
|
"""Verify _deliver_result returns error strings on failure, None on success."""
|
||||||
|
|
||||||
|
def test_returns_none_on_successful_delivery(self):
|
||||||
|
from gateway.config import Platform
|
||||||
|
|
||||||
|
pconfig = MagicMock()
|
||||||
|
pconfig.enabled = True
|
||||||
|
mock_cfg = MagicMock()
|
||||||
|
mock_cfg.platforms = {Platform.TELEGRAM: pconfig}
|
||||||
|
|
||||||
|
with patch("gateway.config.load_gateway_config", return_value=mock_cfg), \
|
||||||
|
patch("tools.send_message_tool._send_to_platform", new=AsyncMock(return_value={"success": True})):
|
||||||
|
job = {
|
||||||
|
"id": "ok-job",
|
||||||
|
"deliver": "origin",
|
||||||
|
"origin": {"platform": "telegram", "chat_id": "123"},
|
||||||
|
}
|
||||||
|
result = _deliver_result(job, "Output.")
|
||||||
|
assert result is None
|
||||||
|
|
||||||
|
def test_returns_none_for_local_delivery(self):
|
||||||
|
"""local-only jobs don't deliver — not a failure."""
|
||||||
|
job = {"id": "local-job", "deliver": "local"}
|
||||||
|
result = _deliver_result(job, "Output.")
|
||||||
|
assert result is None
|
||||||
|
|
||||||
|
def test_returns_error_for_unknown_platform(self):
|
||||||
|
job = {
|
||||||
|
"id": "bad-platform",
|
||||||
|
"deliver": "origin",
|
||||||
|
"origin": {"platform": "fax", "chat_id": "123"},
|
||||||
|
}
|
||||||
|
with patch("gateway.config.load_gateway_config"):
|
||||||
|
result = _deliver_result(job, "Output.")
|
||||||
|
assert result is not None
|
||||||
|
assert "unknown platform" in result
|
||||||
|
|
||||||
|
def test_returns_error_when_platform_disabled(self):
|
||||||
|
from gateway.config import Platform
|
||||||
|
|
||||||
|
pconfig = MagicMock()
|
||||||
|
pconfig.enabled = False
|
||||||
|
mock_cfg = MagicMock()
|
||||||
|
mock_cfg.platforms = {Platform.TELEGRAM: pconfig}
|
||||||
|
|
||||||
|
with patch("gateway.config.load_gateway_config", return_value=mock_cfg):
|
||||||
|
job = {
|
||||||
|
"id": "disabled",
|
||||||
|
"deliver": "origin",
|
||||||
|
"origin": {"platform": "telegram", "chat_id": "123"},
|
||||||
|
}
|
||||||
|
result = _deliver_result(job, "Output.")
|
||||||
|
assert result is not None
|
||||||
|
assert "not configured" in result
|
||||||
|
|
||||||
|
def test_returns_error_on_send_failure(self):
|
||||||
|
from gateway.config import Platform
|
||||||
|
|
||||||
|
pconfig = MagicMock()
|
||||||
|
pconfig.enabled = True
|
||||||
|
mock_cfg = MagicMock()
|
||||||
|
mock_cfg.platforms = {Platform.TELEGRAM: pconfig}
|
||||||
|
|
||||||
|
with patch("gateway.config.load_gateway_config", return_value=mock_cfg), \
|
||||||
|
patch("tools.send_message_tool._send_to_platform", new=AsyncMock(return_value={"error": "rate limited"})):
|
||||||
|
job = {
|
||||||
|
"id": "rate-limited",
|
||||||
|
"deliver": "origin",
|
||||||
|
"origin": {"platform": "telegram", "chat_id": "123"},
|
||||||
|
}
|
||||||
|
result = _deliver_result(job, "Output.")
|
||||||
|
assert result is not None
|
||||||
|
assert "rate limited" in result
|
||||||
|
|
||||||
|
def test_returns_error_for_unresolved_target(self, monkeypatch):
|
||||||
|
"""Non-local delivery with no resolvable target should return an error."""
|
||||||
|
monkeypatch.delenv("TELEGRAM_HOME_CHANNEL", raising=False)
|
||||||
|
job = {"id": "no-target", "deliver": "telegram"}
|
||||||
|
result = _deliver_result(job, "Output.")
|
||||||
|
assert result is not None
|
||||||
|
assert "no delivery target" in result
|
||||||
|
|
||||||
|
|
||||||
class TestRunJobSessionPersistence:
|
class TestRunJobSessionPersistence:
|
||||||
def test_run_job_passes_session_db_and_cron_platform(self, tmp_path):
|
def test_run_job_passes_session_db_and_cron_platform(self, tmp_path):
|
||||||
job = {
|
job = {
|
||||||
|
|
|
||||||
|
|
@ -195,6 +195,7 @@ def _format_job(job: Dict[str, Any]) -> Dict[str, Any]:
|
||||||
"next_run_at": job.get("next_run_at"),
|
"next_run_at": job.get("next_run_at"),
|
||||||
"last_run_at": job.get("last_run_at"),
|
"last_run_at": job.get("last_run_at"),
|
||||||
"last_status": job.get("last_status"),
|
"last_status": job.get("last_status"),
|
||||||
|
"last_delivery_error": job.get("last_delivery_error"),
|
||||||
"enabled": job.get("enabled", True),
|
"enabled": job.get("enabled", True),
|
||||||
"state": job.get("state", "scheduled" if job.get("enabled", True) else "paused"),
|
"state": job.get("state", "scheduled" if job.get("enabled", True) else "paused"),
|
||||||
"paused_at": job.get("paused_at"),
|
"paused_at": job.get("paused_at"),
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue