feat(agent): add jittered retry backoff

Adds agent/retry_utils.py with jittered_backoff() — exponential backoff
with additive jitter to prevent thundering-herd retry spikes when
multiple gateway sessions hit the same rate-limited provider.

Replaces fixed exponential backoff at 4 call sites:
- run_agent.py: None-choices retry path (5s base, 120s cap)
- run_agent.py: API error retry path (2s base, 60s cap)
- trajectory_compressor.py: sync + async summarization retries

Thread-safe jitter counter with overflow guards ensures unique seeds
across concurrent retries.

Trimmed from original PR to keep only wired-in functionality.

Co-authored-by: martinp09 <martinp09@users.noreply.github.com>
This commit is contained in:
zocomputer 2026-04-07 22:49:31 -07:00 committed by Teknium
parent fff237e111
commit e1befe5077
4 changed files with 181 additions and 4 deletions

View file

@ -44,6 +44,7 @@ import fire
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn, TimeElapsedColumn, TimeRemainingColumn
from rich.console import Console
from hermes_constants import OPENROUTER_BASE_URL
from agent.retry_utils import jittered_backoff
# Load environment variables
from dotenv import load_dotenv
@ -585,7 +586,7 @@ Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix."""
self.logger.warning(f"Summarization attempt {attempt + 1} failed: {e}")
if attempt < self.config.max_retries - 1:
time.sleep(self.config.retry_delay * (attempt + 1))
time.sleep(jittered_backoff(attempt + 1, base_delay=self.config.retry_delay, max_delay=30.0))
else:
# Fallback: create a basic summary
return "[CONTEXT SUMMARY]: [Summary generation failed - previous turns contained tool calls and responses that have been compressed to save context space.]"
@ -647,7 +648,7 @@ Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix."""
self.logger.warning(f"Summarization attempt {attempt + 1} failed: {e}")
if attempt < self.config.max_retries - 1:
await asyncio.sleep(self.config.retry_delay * (attempt + 1))
await asyncio.sleep(jittered_backoff(attempt + 1, base_delay=self.config.retry_delay, max_delay=30.0))
else:
# Fallback: create a basic summary
return "[CONTEXT SUMMARY]: [Summary generation failed - previous turns contained tool calls and responses that have been compressed to save context space.]"