mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(cron): run due jobs in parallel to prevent serial tick starvation (#13021)
Replaces the serial for-loop in tick() with ThreadPoolExecutor so all jobs due in a single tick run concurrently. A slow job no longer blocks others from executing, fixing silent job skipping (issue #9086). Thread safety: - Session/delivery env vars migrated from os.environ to ContextVars (gateway/session_context.py) so parallel jobs can't clobber each other's delivery targets. Each thread gets its own copied context. - jobs.json read-modify-write cycles (advance_next_run, mark_job_run) protected by threading.Lock to prevent concurrent save clobber. - send_message_tool reads delivery vars via get_session_env() for ContextVar-aware resolution with os.environ fallback. Configuration: - cron.max_parallel_jobs in config.yaml (null = unbounded, 1 = serial) - HERMES_CRON_MAX_PARALLEL env var override Based on PR #9169 by @VenomMoth1. Fixes #9086
This commit is contained in:
parent
d587d62eba
commit
c86915024e
6 changed files with 259 additions and 81 deletions
|
|
@ -747,14 +747,17 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
|
|||
# scheduler process — every job this process runs is a cron job.
|
||||
os.environ["HERMES_CRON_SESSION"] = "1"
|
||||
|
||||
# Use ContextVars for per-job session/delivery state so parallel jobs
|
||||
# don't clobber each other's targets (os.environ is process-global).
|
||||
from gateway.session_context import set_session_vars, clear_session_vars, _VAR_MAP
|
||||
|
||||
_ctx_tokens = set_session_vars(
|
||||
platform=origin["platform"] if origin else "",
|
||||
chat_id=str(origin["chat_id"]) if origin else "",
|
||||
chat_name=origin.get("chat_name", "") if origin else "",
|
||||
)
|
||||
|
||||
try:
|
||||
# Inject origin context so the agent's send_message tool knows the chat.
|
||||
# Must be INSIDE the try block so the finally cleanup always runs.
|
||||
if origin:
|
||||
os.environ["HERMES_SESSION_PLATFORM"] = origin["platform"]
|
||||
os.environ["HERMES_SESSION_CHAT_ID"] = str(origin["chat_id"])
|
||||
if origin.get("chat_name"):
|
||||
os.environ["HERMES_SESSION_CHAT_NAME"] = origin["chat_name"]
|
||||
# Re-read .env and config.yaml fresh every run so provider/key
|
||||
# changes take effect without a gateway restart.
|
||||
from dotenv import load_dotenv
|
||||
|
|
@ -765,10 +768,10 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
|
|||
|
||||
delivery_target = _resolve_delivery_target(job)
|
||||
if delivery_target:
|
||||
os.environ["HERMES_CRON_AUTO_DELIVER_PLATFORM"] = delivery_target["platform"]
|
||||
os.environ["HERMES_CRON_AUTO_DELIVER_CHAT_ID"] = str(delivery_target["chat_id"])
|
||||
_VAR_MAP["HERMES_CRON_AUTO_DELIVER_PLATFORM"].set(delivery_target["platform"])
|
||||
_VAR_MAP["HERMES_CRON_AUTO_DELIVER_CHAT_ID"].set(str(delivery_target["chat_id"]))
|
||||
if delivery_target.get("thread_id") is not None:
|
||||
os.environ["HERMES_CRON_AUTO_DELIVER_THREAD_ID"] = str(delivery_target["thread_id"])
|
||||
_VAR_MAP["HERMES_CRON_AUTO_DELIVER_THREAD_ID"].set(str(delivery_target["thread_id"]))
|
||||
|
||||
model = job.get("model") or os.getenv("HERMES_MODEL") or ""
|
||||
|
||||
|
|
@ -1012,16 +1015,8 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
|
|||
return False, output, "", error_msg
|
||||
|
||||
finally:
|
||||
# Clean up injected env vars so they don't leak to other jobs
|
||||
for key in (
|
||||
"HERMES_SESSION_PLATFORM",
|
||||
"HERMES_SESSION_CHAT_ID",
|
||||
"HERMES_SESSION_CHAT_NAME",
|
||||
"HERMES_CRON_AUTO_DELIVER_PLATFORM",
|
||||
"HERMES_CRON_AUTO_DELIVER_CHAT_ID",
|
||||
"HERMES_CRON_AUTO_DELIVER_THREAD_ID",
|
||||
):
|
||||
os.environ.pop(key, None)
|
||||
# Clean up ContextVar session/delivery state for this job.
|
||||
clear_session_vars(_ctx_tokens)
|
||||
if _session_db:
|
||||
try:
|
||||
_session_db.end_session(_cron_session_id, "cron_complete")
|
||||
|
|
@ -1074,15 +1069,42 @@ def tick(verbose: bool = True, adapters=None, loop=None) -> int:
|
|||
if verbose:
|
||||
logger.info("%s - %s job(s) due", _hermes_now().strftime('%H:%M:%S'), len(due_jobs))
|
||||
|
||||
executed = 0
|
||||
# Advance next_run_at for all recurring jobs FIRST, under the file lock,
|
||||
# before any execution begins. This preserves at-most-once semantics.
|
||||
for job in due_jobs:
|
||||
try:
|
||||
# For recurring jobs (cron/interval), advance next_run_at to the
|
||||
# next future occurrence BEFORE execution. This way, if the
|
||||
# process crashes mid-run, the job won't re-fire on restart.
|
||||
# One-shot jobs are left alone so they can retry on restart.
|
||||
advance_next_run(job["id"])
|
||||
advance_next_run(job["id"])
|
||||
|
||||
# Resolve max parallel workers: env var > config.yaml > unbounded.
|
||||
# Set HERMES_CRON_MAX_PARALLEL=1 to restore old serial behaviour.
|
||||
_max_workers: Optional[int] = None
|
||||
try:
|
||||
_env_par = os.getenv("HERMES_CRON_MAX_PARALLEL", "").strip()
|
||||
if _env_par:
|
||||
_max_workers = int(_env_par) or None
|
||||
except (ValueError, TypeError):
|
||||
logger.warning("Invalid HERMES_CRON_MAX_PARALLEL value; defaulting to unbounded")
|
||||
if _max_workers is None:
|
||||
try:
|
||||
from hermes_cli.config import load_config
|
||||
_ucfg = load_config() or {}
|
||||
_cfg_par = (
|
||||
_ucfg.get("cron", {}) if isinstance(_ucfg, dict) else {}
|
||||
).get("max_parallel_jobs")
|
||||
if _cfg_par is not None:
|
||||
_max_workers = int(_cfg_par) or None
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if verbose:
|
||||
logger.info(
|
||||
"Running %d job(s) in parallel (max_workers=%s)",
|
||||
len(due_jobs),
|
||||
_max_workers if _max_workers else "unbounded",
|
||||
)
|
||||
|
||||
def _process_job(job: dict) -> bool:
|
||||
"""Run one due job end-to-end: execute, save, deliver, mark."""
|
||||
try:
|
||||
success, output, final_response, error = run_job(job)
|
||||
|
||||
output_file = save_job_output(job["id"], output)
|
||||
|
|
@ -1114,13 +1136,23 @@ def tick(verbose: bool = True, adapters=None, loop=None) -> int:
|
|||
error = "Agent completed but produced empty response (model error, timeout, or misconfiguration)"
|
||||
|
||||
mark_job_run(job["id"], success, error, delivery_error=delivery_error)
|
||||
executed += 1
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Error processing job %s: %s", job['id'], e)
|
||||
mark_job_run(job["id"], False, str(e))
|
||||
return False
|
||||
|
||||
return executed
|
||||
# Run all due jobs concurrently, each in its own ContextVar copy
|
||||
# so session/delivery state stays isolated per-thread.
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=_max_workers) as _tick_pool:
|
||||
_futures = []
|
||||
for job in due_jobs:
|
||||
_ctx = contextvars.copy_context()
|
||||
_futures.append(_tick_pool.submit(_ctx.run, _process_job, job))
|
||||
_results = [f.result() for f in _futures]
|
||||
|
||||
return sum(_results)
|
||||
finally:
|
||||
if fcntl:
|
||||
fcntl.flock(lock_fd, fcntl.LOCK_UN)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue