feat: call_llm/async_call_llm + config slots + migrate all consumers

Add centralized call_llm() and async_call_llm() functions that own the
full LLM request lifecycle:
  1. Resolve provider + model from task config or explicit args
  2. Get or create a cached client for that provider
  3. Format request args (max_tokens handling, provider extra_body)
  4. Make the API call with max_tokens/max_completion_tokens retry
  5. Return the response

Config: expanded auxiliary section with provider:model slots for all
tasks (compression, vision, web_extract, session_search, skills_hub,
mcp, flush_memories). Config version bumped to 7.

Migrated all auxiliary consumers:
- context_compressor.py: uses call_llm(task='compression')
- vision_tools.py: uses async_call_llm(task='vision')
- web_tools.py: uses async_call_llm(task='web_extract')
- session_search_tool.py: uses async_call_llm(task='session_search')
- browser_tool.py: uses call_llm(task='vision'/'web_extract')
- mcp_tool.py: uses call_llm(task='mcp')
- skills_guard.py: uses call_llm(provider='openrouter')
- run_agent.py flush_memories: uses call_llm(task='flush_memories')

Tests updated for context_compressor and MCP tool. Some test mocks
still need updating: 15 failures remain due to the mock pattern
changes, and 2 failures are pre-existing.
This commit is contained in:
This commit is contained in:
teknium1 2026-03-11 20:52:19 -07:00
parent 013cc4d2fc
commit 0aa31cd3cb
13 changed files with 552 additions and 375 deletions

View file

@@ -344,28 +344,32 @@ class TrajectoryCompressor:
raise RuntimeError(f"Failed to load tokenizer '{self.config.tokenizer_name}': {e}")
def _init_summarizer(self):
"""Initialize LLM client for summarization (sync and async).
"""Initialize LLM routing for summarization (sync and async).
Routes through the centralized provider router for known providers
(OpenRouter, Nous, Codex, etc.) so auth and headers are handled
consistently. Falls back to raw construction for custom endpoints.
Uses call_llm/async_call_llm from the centralized provider router
which handles auth, headers, and provider detection internally.
For custom endpoints, falls back to raw client construction.
"""
from agent.auxiliary_client import resolve_provider_client
from agent.auxiliary_client import call_llm, async_call_llm
provider = self._detect_provider()
if provider:
# Use centralized router — handles auth, headers, Codex adapter
self.client, _ = resolve_provider_client(
# Store provider for use in _generate_summary calls
self._llm_provider = provider
self._use_call_llm = True
# Verify the provider is available
from agent.auxiliary_client import resolve_provider_client
client, _ = resolve_provider_client(
provider, model=self.config.summarization_model)
self.async_client, _ = resolve_provider_client(
provider, model=self.config.summarization_model,
async_mode=True)
if self.client is None:
if client is None:
raise RuntimeError(
f"Provider '{provider}' is not configured. "
f"Check your API key or run: hermes setup")
self.client = None # Not used directly
self.async_client = None # Not used directly
else:
# Custom endpoint — use config's raw base_url + api_key_env
self._use_call_llm = False
api_key = os.getenv(self.config.api_key_env)
if not api_key:
raise RuntimeError(
@@ -524,12 +528,22 @@ Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix."""
try:
metrics.summarization_api_calls += 1
response = self.client.chat.completions.create(
model=self.config.summarization_model,
messages=[{"role": "user", "content": prompt}],
temperature=self.config.temperature,
max_tokens=self.config.summary_target_tokens * 2,
)
if getattr(self, '_use_call_llm', False):
from agent.auxiliary_client import call_llm
response = call_llm(
provider=self._llm_provider,
model=self.config.summarization_model,
messages=[{"role": "user", "content": prompt}],
temperature=self.config.temperature,
max_tokens=self.config.summary_target_tokens * 2,
)
else:
response = self.client.chat.completions.create(
model=self.config.summarization_model,
messages=[{"role": "user", "content": prompt}],
temperature=self.config.temperature,
max_tokens=self.config.summary_target_tokens * 2,
)
summary = response.choices[0].message.content.strip()
@@ -581,12 +595,22 @@ Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix."""
try:
metrics.summarization_api_calls += 1
response = await self.async_client.chat.completions.create(
model=self.config.summarization_model,
messages=[{"role": "user", "content": prompt}],
temperature=self.config.temperature,
max_tokens=self.config.summary_target_tokens * 2,
)
if getattr(self, '_use_call_llm', False):
from agent.auxiliary_client import async_call_llm
response = await async_call_llm(
provider=self._llm_provider,
model=self.config.summarization_model,
messages=[{"role": "user", "content": prompt}],
temperature=self.config.temperature,
max_tokens=self.config.summary_target_tokens * 2,
)
else:
response = await self.async_client.chat.completions.create(
model=self.config.summarization_model,
messages=[{"role": "user", "content": prompt}],
temperature=self.config.temperature,
max_tokens=self.config.summary_target_tokens * 2,
)
summary = response.choices[0].message.content.strip()