Track tool-call validity vs attempts; shape reward accordingly

- AgentResult now includes tool-call metrics: attempted, schema_valid, executed_ok, exec_error - HermesAgentLoop normalizes args robustly without crashing, but distinguishes schema-valid args (dict) from coerced formats (stringified JSON, plain strings) - SweSmithOracleEnv reward shaping now prefers schema-valid tool calls while still giving small credit for attempted tool use
2026-05-03 02:11:48 +00:00 · 2026-02-14 09:17:05 +10:00 · 2026-02-14 09:17:05 +10:00 · 499490d06a
commit 499490d06a
parent 35b2250b36
2 changed files with 133 additions and 56 deletions
--- a/environments/swe_smith_oracle_env.py
+++ b/environments/swe_smith_oracle_env.py
@ -488,19 +488,27 @@ class SweSmithOracleEnv(HermesAgentBaseEnv):
        """
        repo_dir = self._repo_name(item)

-        # Count valid tool calls (assistant messages that have tool_calls)
-        tool_call_count = sum(
+        # Count tool calls. Prefer the agent-loop metrics if present:
+        # - attempted: model called a known tool name
+        # - schema_valid: args were a dict (no coercion/double-decoding)
+        fallback_count = sum(
            len(msg.get("tool_calls", []))
            for msg in result.messages
            if msg.get("role") == "assistant"
        )

-        if tool_call_count == 0:
+        attempted = getattr(result, "tool_calls_attempted", fallback_count)
+        schema_valid = getattr(result, "tool_calls_schema_valid", fallback_count)
+
+        if attempted == 0:
            print(f"[SweSmithOracleEnv] No tool calls made; score=0.0", flush=True)
            return 0.0

-        # Partial reward: 0.05 per tool call, capped at 0.3
-        tool_call_reward = min(tool_call_count * 0.05, 0.3)
+        # Shaping: reward attempting tool use a little, but reward schema-valid calls more.
+        # Full credit per call is still 0.05 when schema_valid.
+        attempt_reward = min(attempted * 0.02, 0.10)
+        schema_reward = min(schema_valid * 0.03, 0.20)
+        tool_call_reward = min(attempt_reward + schema_reward, 0.30)

        nodeids = self._tests_for_item(item)
        if not nodeids: