feat(cron): track delivery failures in job status (#6042)

_deliver_result() now returns Optional[str] — None on success, error
message on failure. All failure paths (unknown platform, platform
disabled, config load error, send failure, unresolvable target)
return descriptive error strings.

mark_job_run() gains delivery_error param, tracked as
last_delivery_error on the job — separate from agent execution errors.
A job where the agent succeeded but delivery failed shows
last_status='ok' + last_delivery_error='...'.

The cronjob list tool now surfaces last_delivery_error so agents and
users can see when cron outputs aren't arriving.

Inspired by PR #5863 (oxngon) — reimplemented with proper wiring.

Tests: 3 new mark_job_run tests + 6 new _deliver_result return tests.
This commit is contained in:
Teknium 2026-04-07 22:49:01 -07:00 committed by GitHub
parent 598c25d43e
commit fff237e111
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 167 additions and 22 deletions

View file

@ -574,12 +574,16 @@ def remove_job(job_id: str) -> bool:
return False return False
def mark_job_run(job_id: str, success: bool, error: Optional[str] = None): def mark_job_run(job_id: str, success: bool, error: Optional[str] = None,
delivery_error: Optional[str] = None):
""" """
Mark a job as having been run. Mark a job as having been run.
Updates last_run_at, last_status, increments completed count, Updates last_run_at, last_status, increments completed count,
computes next_run_at, and auto-deletes if repeat limit reached. computes next_run_at, and auto-deletes if repeat limit reached.
``delivery_error`` is tracked separately from the agent error a job
can succeed (agent produced output) but fail delivery (platform down).
""" """
jobs = load_jobs() jobs = load_jobs()
for i, job in enumerate(jobs): for i, job in enumerate(jobs):
@ -588,6 +592,8 @@ def mark_job_run(job_id: str, success: bool, error: Optional[str] = None):
job["last_run_at"] = now job["last_run_at"] = now
job["last_status"] = "ok" if success else "error" job["last_status"] = "ok" if success else "error"
job["last_error"] = error if not success else None job["last_error"] = error if not success else None
# Track delivery failures separately — cleared on successful delivery
job["last_delivery_error"] = delivery_error
# Increment completed count # Increment completed count
if job.get("repeat"): if job.get("repeat"):

View file

@ -196,7 +196,7 @@ def _send_media_via_adapter(adapter, chat_id: str, media_files: list, metadata:
logger.warning("Job '%s': failed to send media %s: %s", job.get("id", "?"), media_path, e) logger.warning("Job '%s': failed to send media %s: %s", job.get("id", "?"), media_path, e)
def _deliver_result(job: dict, content: str, adapters=None, loop=None) -> None: def _deliver_result(job: dict, content: str, adapters=None, loop=None) -> Optional[str]:
""" """
Deliver job output to the configured target (origin chat, specific platform, etc.). Deliver job output to the configured target (origin chat, specific platform, etc.).
@ -204,16 +204,16 @@ def _deliver_result(job: dict, content: str, adapters=None, loop=None) -> None:
use the live adapter first this supports E2EE rooms (e.g. Matrix) where use the live adapter first this supports E2EE rooms (e.g. Matrix) where
the standalone HTTP path cannot encrypt. Falls back to standalone send if the standalone HTTP path cannot encrypt. Falls back to standalone send if
the adapter path fails or is unavailable. the adapter path fails or is unavailable.
Returns None on success, or an error string on failure.
""" """
target = _resolve_delivery_target(job) target = _resolve_delivery_target(job)
if not target: if not target:
if job.get("deliver", "local") != "local": if job.get("deliver", "local") != "local":
logger.warning( msg = f"no delivery target resolved for deliver={job.get('deliver', 'local')}"
"Job '%s' deliver=%s but no concrete delivery target could be resolved", logger.warning("Job '%s': %s", job["id"], msg)
job["id"], return msg
job.get("deliver", "local"), return None # local-only jobs don't deliver — not a failure
)
return
platform_name = target["platform"] platform_name = target["platform"]
chat_id = target["chat_id"] chat_id = target["chat_id"]
@ -239,19 +239,22 @@ def _deliver_result(job: dict, content: str, adapters=None, loop=None) -> None:
} }
platform = platform_map.get(platform_name.lower()) platform = platform_map.get(platform_name.lower())
if not platform: if not platform:
logger.warning("Job '%s': unknown platform '%s' for delivery", job["id"], platform_name) msg = f"unknown platform '{platform_name}'"
return logger.warning("Job '%s': %s", job["id"], msg)
return msg
try: try:
config = load_gateway_config() config = load_gateway_config()
except Exception as e: except Exception as e:
logger.error("Job '%s': failed to load gateway config for delivery: %s", job["id"], e) msg = f"failed to load gateway config: {e}"
return logger.error("Job '%s': %s", job["id"], msg)
return msg
pconfig = config.platforms.get(platform) pconfig = config.platforms.get(platform)
if not pconfig or not pconfig.enabled: if not pconfig or not pconfig.enabled:
logger.warning("Job '%s': platform '%s' not configured/enabled", job["id"], platform_name) msg = f"platform '{platform_name}' not configured/enabled"
return logger.warning("Job '%s': %s", job["id"], msg)
return msg
# Optionally wrap the content with a header/footer so the user knows this # Optionally wrap the content with a header/footer so the user knows this
# is a cron delivery. Wrapping is on by default; set cron.wrap_response: false # is a cron delivery. Wrapping is on by default; set cron.wrap_response: false
@ -307,7 +310,7 @@ def _deliver_result(job: dict, content: str, adapters=None, loop=None) -> None:
if adapter_ok: if adapter_ok:
logger.info("Job '%s': delivered to %s:%s via live adapter", job["id"], platform_name, chat_id) logger.info("Job '%s': delivered to %s:%s via live adapter", job["id"], platform_name, chat_id)
return return None
except Exception as e: except Exception as e:
logger.warning( logger.warning(
"Job '%s': live adapter delivery to %s:%s failed (%s), falling back to standalone", "Job '%s': live adapter delivery to %s:%s failed (%s), falling back to standalone",
@ -329,13 +332,17 @@ def _deliver_result(job: dict, content: str, adapters=None, loop=None) -> None:
future = pool.submit(asyncio.run, _send_to_platform(platform, pconfig, chat_id, cleaned_delivery_content, thread_id=thread_id, media_files=media_files)) future = pool.submit(asyncio.run, _send_to_platform(platform, pconfig, chat_id, cleaned_delivery_content, thread_id=thread_id, media_files=media_files))
result = future.result(timeout=30) result = future.result(timeout=30)
except Exception as e: except Exception as e:
logger.error("Job '%s': delivery to %s:%s failed: %s", job["id"], platform_name, chat_id, e) msg = f"delivery to {platform_name}:{chat_id} failed: {e}"
return logger.error("Job '%s': %s", job["id"], msg)
return msg
if result and result.get("error"): if result and result.get("error"):
logger.error("Job '%s': delivery error: %s", job["id"], result["error"]) msg = f"delivery error: {result['error']}"
else: logger.error("Job '%s': %s", job["id"], msg)
return msg
logger.info("Job '%s': delivered to %s:%s", job["id"], platform_name, chat_id) logger.info("Job '%s': delivered to %s:%s", job["id"], platform_name, chat_id)
return None
_SCRIPT_TIMEOUT = 120 # seconds _SCRIPT_TIMEOUT = 120 # seconds
@ -868,13 +875,15 @@ def tick(verbose: bool = True, adapters=None, loop=None) -> int:
logger.info("Job '%s': agent returned %s — skipping delivery", job["id"], SILENT_MARKER) logger.info("Job '%s': agent returned %s — skipping delivery", job["id"], SILENT_MARKER)
should_deliver = False should_deliver = False
delivery_error = None
if should_deliver: if should_deliver:
try: try:
_deliver_result(job, deliver_content, adapters=adapters, loop=loop) delivery_error = _deliver_result(job, deliver_content, adapters=adapters, loop=loop)
except Exception as de: except Exception as de:
delivery_error = str(de)
logger.error("Delivery failed for job %s: %s", job["id"], de) logger.error("Delivery failed for job %s: %s", job["id"], de)
mark_job_run(job["id"], success, error) mark_job_run(job["id"], success, error, delivery_error=delivery_error)
executed += 1 executed += 1
except Exception as e: except Exception as e:

View file

@ -93,6 +93,21 @@ def cron_list(show_all: bool = False):
script = job.get("script") script = job.get("script")
if script: if script:
print(f" Script: {script}") print(f" Script: {script}")
# Execution history
last_status = job.get("last_status")
if last_status:
last_run = job.get("last_run_at", "?")
if last_status == "ok":
status_display = color("ok", Colors.GREEN)
else:
status_display = color(f"{last_status}: {job.get('last_error', '?')}", Colors.RED)
print(f" Last run: {last_run} {status_display}")
delivery_err = job.get("last_delivery_error")
if delivery_err:
print(f" {color('⚠ Delivery failed:', Colors.YELLOW)} {delivery_err}")
print() print()
from hermes_cli.gateway import find_gateway_pids from hermes_cli.gateway import find_gateway_pids

View file

@ -339,6 +339,36 @@ class TestMarkJobRun:
assert updated["last_status"] == "error" assert updated["last_status"] == "error"
assert updated["last_error"] == "timeout" assert updated["last_error"] == "timeout"
def test_delivery_error_tracked_separately(self, tmp_cron_dir):
"""Agent succeeds but delivery fails — both tracked independently."""
job = create_job(prompt="Report", schedule="every 1h")
mark_job_run(job["id"], success=True, delivery_error="platform 'telegram' not configured")
updated = get_job(job["id"])
assert updated["last_status"] == "ok"
assert updated["last_error"] is None
assert updated["last_delivery_error"] == "platform 'telegram' not configured"
def test_delivery_error_cleared_on_success(self, tmp_cron_dir):
"""Successful delivery clears the previous delivery error."""
job = create_job(prompt="Report", schedule="every 1h")
mark_job_run(job["id"], success=True, delivery_error="network timeout")
updated = get_job(job["id"])
assert updated["last_delivery_error"] == "network timeout"
# Next run delivers successfully
mark_job_run(job["id"], success=True, delivery_error=None)
updated = get_job(job["id"])
assert updated["last_delivery_error"] is None
def test_both_agent_and_delivery_error(self, tmp_cron_dir):
"""Agent fails AND delivery fails — both errors recorded."""
job = create_job(prompt="Report", schedule="every 1h")
mark_job_run(job["id"], success=False, error="model timeout",
delivery_error="platform 'discord' not enabled")
updated = get_job(job["id"])
assert updated["last_status"] == "error"
assert updated["last_error"] == "model timeout"
assert updated["last_delivery_error"] == "platform 'discord' not enabled"
class TestAdvanceNextRun: class TestAdvanceNextRun:
"""Tests for advance_next_run() — crash-safety for recurring jobs.""" """Tests for advance_next_run() — crash-safety for recurring jobs."""

View file

@ -508,6 +508,90 @@ class TestDeliverResultWrapping:
assert send_mock.call_args.kwargs["thread_id"] == "17585" assert send_mock.call_args.kwargs["thread_id"] == "17585"
class TestDeliverResultErrorReturns:
"""Verify _deliver_result returns error strings on failure, None on success."""
def test_returns_none_on_successful_delivery(self):
from gateway.config import Platform
pconfig = MagicMock()
pconfig.enabled = True
mock_cfg = MagicMock()
mock_cfg.platforms = {Platform.TELEGRAM: pconfig}
with patch("gateway.config.load_gateway_config", return_value=mock_cfg), \
patch("tools.send_message_tool._send_to_platform", new=AsyncMock(return_value={"success": True})):
job = {
"id": "ok-job",
"deliver": "origin",
"origin": {"platform": "telegram", "chat_id": "123"},
}
result = _deliver_result(job, "Output.")
assert result is None
def test_returns_none_for_local_delivery(self):
"""local-only jobs don't deliver — not a failure."""
job = {"id": "local-job", "deliver": "local"}
result = _deliver_result(job, "Output.")
assert result is None
def test_returns_error_for_unknown_platform(self):
job = {
"id": "bad-platform",
"deliver": "origin",
"origin": {"platform": "fax", "chat_id": "123"},
}
with patch("gateway.config.load_gateway_config"):
result = _deliver_result(job, "Output.")
assert result is not None
assert "unknown platform" in result
def test_returns_error_when_platform_disabled(self):
from gateway.config import Platform
pconfig = MagicMock()
pconfig.enabled = False
mock_cfg = MagicMock()
mock_cfg.platforms = {Platform.TELEGRAM: pconfig}
with patch("gateway.config.load_gateway_config", return_value=mock_cfg):
job = {
"id": "disabled",
"deliver": "origin",
"origin": {"platform": "telegram", "chat_id": "123"},
}
result = _deliver_result(job, "Output.")
assert result is not None
assert "not configured" in result
def test_returns_error_on_send_failure(self):
from gateway.config import Platform
pconfig = MagicMock()
pconfig.enabled = True
mock_cfg = MagicMock()
mock_cfg.platforms = {Platform.TELEGRAM: pconfig}
with patch("gateway.config.load_gateway_config", return_value=mock_cfg), \
patch("tools.send_message_tool._send_to_platform", new=AsyncMock(return_value={"error": "rate limited"})):
job = {
"id": "rate-limited",
"deliver": "origin",
"origin": {"platform": "telegram", "chat_id": "123"},
}
result = _deliver_result(job, "Output.")
assert result is not None
assert "rate limited" in result
def test_returns_error_for_unresolved_target(self, monkeypatch):
"""Non-local delivery with no resolvable target should return an error."""
monkeypatch.delenv("TELEGRAM_HOME_CHANNEL", raising=False)
job = {"id": "no-target", "deliver": "telegram"}
result = _deliver_result(job, "Output.")
assert result is not None
assert "no delivery target" in result
class TestRunJobSessionPersistence: class TestRunJobSessionPersistence:
def test_run_job_passes_session_db_and_cron_platform(self, tmp_path): def test_run_job_passes_session_db_and_cron_platform(self, tmp_path):
job = { job = {

View file

@ -195,6 +195,7 @@ def _format_job(job: Dict[str, Any]) -> Dict[str, Any]:
"next_run_at": job.get("next_run_at"), "next_run_at": job.get("next_run_at"),
"last_run_at": job.get("last_run_at"), "last_run_at": job.get("last_run_at"),
"last_status": job.get("last_status"), "last_status": job.get("last_status"),
"last_delivery_error": job.get("last_delivery_error"),
"enabled": job.get("enabled", True), "enabled": job.get("enabled", True),
"state": job.get("state", "scheduled" if job.get("enabled", True) else "paused"), "state": job.get("state", "scheduled" if job.get("enabled", True) else "paused"),
"paused_at": job.get("paused_at"), "paused_at": job.get("paused_at"),