mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-03 02:11:48 +00:00
Track tool-call validity vs attempts; shape reward accordingly
- AgentResult now includes tool-call metrics: attempted, schema_valid, executed_ok, exec_error - HermesAgentLoop normalizes args robustly without crashing, but distinguishes schema-valid args (dict) from coerced formats (stringified JSON, plain strings) - SweSmithOracleEnv reward shaping now prefers schema-valid tool calls while still giving small credit for attempted tool use
This commit is contained in:
parent
35b2250b36
commit
499490d06a
2 changed files with 133 additions and 56 deletions
|
|
@ -488,19 +488,27 @@ class SweSmithOracleEnv(HermesAgentBaseEnv):
|
|||
"""
|
||||
repo_dir = self._repo_name(item)
|
||||
|
||||
# Count valid tool calls (assistant messages that have tool_calls)
|
||||
tool_call_count = sum(
|
||||
# Count tool calls. Prefer the agent-loop metrics if present:
|
||||
# - attempted: model called a known tool name
|
||||
# - schema_valid: args were a dict (no coercion/double-decoding)
|
||||
fallback_count = sum(
|
||||
len(msg.get("tool_calls", []))
|
||||
for msg in result.messages
|
||||
if msg.get("role") == "assistant"
|
||||
)
|
||||
|
||||
if tool_call_count == 0:
|
||||
attempted = getattr(result, "tool_calls_attempted", fallback_count)
|
||||
schema_valid = getattr(result, "tool_calls_schema_valid", fallback_count)
|
||||
|
||||
if attempted == 0:
|
||||
print(f"[SweSmithOracleEnv] No tool calls made; score=0.0", flush=True)
|
||||
return 0.0
|
||||
|
||||
# Partial reward: 0.05 per tool call, capped at 0.3
|
||||
tool_call_reward = min(tool_call_count * 0.05, 0.3)
|
||||
# Shaping: reward attempting tool use a little, but reward schema-valid calls more.
|
||||
# Full credit per call is still 0.05 when schema_valid.
|
||||
attempt_reward = min(attempted * 0.02, 0.10)
|
||||
schema_reward = min(schema_valid * 0.03, 0.20)
|
||||
tool_call_reward = min(attempt_reward + schema_reward, 0.30)
|
||||
|
||||
nodeids = self._tests_for_item(item)
|
||||
if not nodeids:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue