chore: remove Atropos RL environments and tinker-atropos integration (#26106)

* chore: remove Atropos RL environments, tools, tests, skill, and tinker-atropos submodule

Delete:
- environments/ (43 files — base env, agent loop, tool call parsers, benchmarks)
- rl_cli.py (standalone RL training CLI)
- tools/rl_training_tool.py (all 10 rl_* tools)
- tests: test_rl_training_tool, test_tool_call_parsers, test_managed_server_tool_support,
  test_agent_loop, test_agent_loop_vllm, test_agent_loop_tool_calling,
  test_terminalbench2_env_security
- optional-skills/mlops/hermes-atropos-environments/
- tinker-atropos git submodule + .gitmodules

* chore: remove RL/Atropos references from Python source

- toolsets.py: remove rl toolset block + update comment
- model_tools.py: remove rl_tools group + update async bridging comment
- hermes_cli/tools_config.py: remove RL display entry, _DEFAULT_OFF_TOOLSETS,
  setup block, and rl_training post-setup handler
- tools/budget_config.py: remove RL environment reference in docstring
- tests/test_model_tools.py: remove rl_tools from expected groups
- tests/run_agent/test_streaming_tool_call_repair.py: fix stale cross-reference

* chore: remove rl/yc-bench extras and tinker-atropos refs from pyproject.toml

- Remove rl extra (atroposlib, tinker, fastapi, uvicorn, wandb)
- Remove yc-bench extra
- Remove rl_cli from py-modules
- Remove [tool.ty.src] exclude for tinker-atropos
- Remove [tool.ruff] exclude for tinker-atropos
- Regenerate uv.lock

* chore: remove tinker-atropos from install/setup scripts

- setup-hermes.sh: remove entire tinker-atropos submodule install block
- scripts/install.sh: remove both tinker-atropos blocks (Termux + standard)
- scripts/install.ps1: remove tinker-atropos block
- nix/hermes-agent.nix: remove tinker-atropos pip install line

* chore: remove RL references from cli-config.yaml.example

* docs: remove Atropos/RL references from README, CONTRIBUTING, AGENTS.md

* docs: remove RL/Atropos references from website

- Delete: environments.md, rl-training.md, mlops-hermes-atropos-environments.md
- sidebars.ts: remove rl-training and environments sidebar entries
- optional-skills-catalog.md: remove hermes-atropos-environments row
- tools-reference.md: remove entire rl toolset section
- toolsets-reference.md: remove rl row + update example
- integrations/index.md: remove RL Training bullet
- architecture.md: remove environments/ from tree + RL section
- contributing.md: remove tinker-atropos setup
- updating.md: remove tinker-atropos install + stale submodule update

* chore: remove remaining RL/Atropos stragglers

- hermes_cli/config.py: remove TINKER_API_KEY + WANDB_API_KEY env var defs
- hermes_cli/doctor.py: remove Submodules check section (tinker-atropos)
- hermes_cli/setup.py: remove RL Training status check
- hermes_cli/status.py: remove Tinker + WandB from API key status display
- agent/display.py: remove both rl_* tool preview/activity blocks
- website/docs: remove RL references from providers.md + env-variables.md
- tests: remove TINKER_API_KEY from conftest, set_config_value, setup_script

* chore: remove RL training section from .env.example
This commit is contained in:
Siddharth Balyan 2026-05-15 10:36:38 +05:30 committed by GitHub
parent d364132114
commit 5af672c753
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
97 changed files with 18 additions and 15690 deletions

View file

@ -1,178 +0,0 @@
"""
Tests for ManagedServer / tool-parser integration.
Validates that:
1. The installed atroposlib API still matches Hermes's expectations
2. Hermes's parser registry remains compatible with ManagedServer parsing
3. HermesAgentBaseEnv wires the selected parser into ServerManager correctly
These tests verify the contract between hermes-agent's environments/ code
and atroposlib's ManagedServer. They detect API incompatibilities early.
"""
import inspect
import sys
from pathlib import Path
import pytest
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
try:
import atroposlib # noqa: F401
except ImportError:
pytest.skip("atroposlib not installed", allow_module_level=True)
class TestManagedServerAPI:
"""Test that ManagedServer's API matches what hermes-agent expects."""
def test_managed_server_init_signature(self):
"""ManagedServer should accept tool_call_parser parameter."""
from atroposlib.envs.server_handling.managed_server import ManagedServer
sig = inspect.signature(ManagedServer.__init__)
params = list(sig.parameters.keys())
# Core params that must exist
assert "self" in params
assert "server" in params
assert "tokenizer" in params
assert "track_tree" in params
# tool_call_parser — required for tool_call_support branch
# If this fails, atroposlib hasn't been updated to tool_call_support
has_tool_parser = "tool_call_parser" in params
if not has_tool_parser:
pytest.skip(
"ManagedServer does not have tool_call_parser param — "
"baseline atroposlib (pre tool_call_support branch)"
)
def test_server_manager_managed_server_signature(self):
"""ServerManager.managed_server() should accept tool_call_parser."""
from atroposlib.envs.server_handling.server_manager import ServerManager
sig = inspect.signature(ServerManager.managed_server)
params = list(sig.parameters.keys())
assert "self" in params
assert "tokenizer" in params
has_tool_parser = "tool_call_parser" in params
if not has_tool_parser:
pytest.skip(
"ServerManager.managed_server() does not have tool_call_parser param — "
"baseline atroposlib (pre tool_call_support branch)"
)
def test_managed_server_chat_template_kwargs(self):
"""ManagedServer should have CHAT_TEMPLATE_KWARGS for forwarding tools/thinking."""
from atroposlib.envs.server_handling.managed_server import ManagedServer
if not hasattr(ManagedServer, "CHAT_TEMPLATE_KWARGS"):
pytest.skip(
"ManagedServer does not have CHAT_TEMPLATE_KWARGS — "
"baseline atroposlib (pre tool_call_support branch)"
)
kwargs = ManagedServer.CHAT_TEMPLATE_KWARGS
assert "tools" in kwargs, "tools must be in CHAT_TEMPLATE_KWARGS"
def test_no_get_logprobs_method(self):
"""get_logprobs should be removed in tool_call_support branch."""
from atroposlib.envs.server_handling.managed_server import ManagedServer
# In baseline, get_logprobs exists. In tool_call_support, it's removed.
# We just note the state — not a hard fail either way.
has_get_logprobs = hasattr(ManagedServer, "get_logprobs")
if has_get_logprobs:
pytest.skip(
"ManagedServer still has get_logprobs — baseline atroposlib"
)
class TestParserCompatibility:
"""Test that hermes-agent's parsers match ManagedServer's expectations."""
def test_parser_parse_returns_correct_format(self):
"""
ManagedServer expects parser.parse(text) -> (content, tool_calls)
where tool_calls is a list of objects with .id, .function.name, .function.arguments
"""
from environments.tool_call_parsers import get_parser
parser = get_parser("hermes")
text = '<tool_call>{"name": "terminal", "arguments": {"command": "ls"}}</tool_call>'
content, tool_calls = parser.parse(text)
assert tool_calls is not None
assert len(tool_calls) == 1
tc = tool_calls[0]
# ManagedServer accesses these attrs directly
assert hasattr(tc, "id")
assert hasattr(tc, "function")
assert hasattr(tc.function, "name")
assert hasattr(tc.function, "arguments")
def test_parser_no_tools_returns_none(self):
"""ManagedServer checks `if parsed_tool_calls:` — None should be falsy."""
from environments.tool_call_parsers import get_parser
parser = get_parser("hermes")
content, tool_calls = parser.parse("Just text, no tools")
assert tool_calls is None
def test_parser_content_is_string_or_none(self):
"""ManagedServer uses `parsed_content or ""` — must be str or None."""
from environments.tool_call_parsers import get_parser
parser = get_parser("hermes")
# With tool calls
text = '<tool_call>{"name": "terminal", "arguments": {"command": "ls"}}</tool_call>'
content, _ = parser.parse(text)
assert content is None or isinstance(content, str)
# Without tool calls
content2, _ = parser.parse("Just text")
assert isinstance(content2, str)
class TestBaseEnvCompatibility:
"""Test that hermes_base_env.py's tool-parser wiring matches the current API."""
def test_hermes_base_env_sets_server_manager_tool_parser(self):
"""Hermes wires parser selection through ServerManager.tool_parser."""
import ast
base_env_path = Path(__file__).parent.parent.parent / "environments" / "hermes_base_env.py"
source = base_env_path.read_text()
tree = ast.parse(source)
found_assignment = False
for node in ast.walk(tree):
if isinstance(node, ast.Assign):
for target in node.targets:
if isinstance(target, ast.Attribute) and target.attr == "tool_parser":
parent = target.value
if (
isinstance(parent, ast.Attribute)
and parent.attr == "server"
and isinstance(parent.value, ast.Name)
and parent.value.id == "self"
):
found_assignment = True
assert found_assignment, (
"hermes_base_env.py should set self.server.tool_parser from config.tool_call_parser"
)
def test_hermes_base_env_uses_config_tool_call_parser(self):
"""Verify hermes_base_env uses the config field rather than a local parser instance."""
base_env_path = Path(__file__).parent.parent.parent / "environments" / "hermes_base_env.py"
source = base_env_path.read_text()
assert 'tool_call_parser: str = Field(' in source
assert 'self.server.tool_parser = config.tool_call_parser' in source

View file

@ -1,142 +0,0 @@
"""Tests for rl_training_tool.py — file handle lifecycle and cleanup.
Verifies that _stop_training_run properly closes log file handles,
terminates processes, and handles edge cases on failure paths.
Inspired by PR #715 (0xbyt4).
"""
from unittest.mock import MagicMock
import pytest
from tools.rl_training_tool import RunState, _stop_training_run
def _make_run_state(**overrides) -> RunState:
"""Create a minimal RunState for testing."""
defaults = {
"run_id": "test-run-001",
"environment": "test_env",
"config": {},
}
defaults.update(overrides)
return RunState(**defaults)
class TestStopTrainingRunFileHandles:
"""Verify that _stop_training_run closes log file handles stored as attributes."""
def test_closes_all_log_file_handles(self):
state = _make_run_state()
files = {}
for attr in ("api_log_file", "trainer_log_file", "env_log_file"):
fh = MagicMock()
setattr(state, attr, fh)
files[attr] = fh
_stop_training_run(state)
for attr, fh in files.items():
fh.close.assert_called_once()
assert getattr(state, attr) is None
def test_clears_file_attrs_to_none(self):
state = _make_run_state()
state.api_log_file = MagicMock()
_stop_training_run(state)
assert state.api_log_file is None
def test_close_exception_does_not_propagate(self):
"""If a file handle .close() raises, it must not crash."""
state = _make_run_state()
bad_fh = MagicMock()
bad_fh.close.side_effect = OSError("already closed")
good_fh = MagicMock()
state.api_log_file = bad_fh
state.trainer_log_file = good_fh
_stop_training_run(state) # should not raise
bad_fh.close.assert_called_once()
good_fh.close.assert_called_once()
def test_handles_missing_file_attrs(self):
"""RunState without log file attrs should not crash."""
state = _make_run_state()
# No log file attrs set at all — getattr(..., None) should handle it
_stop_training_run(state) # should not raise
class TestStopTrainingRunProcesses:
"""Verify that _stop_training_run terminates processes correctly."""
def test_terminates_running_processes(self):
state = _make_run_state()
for attr in ("api_process", "trainer_process", "env_process"):
proc = MagicMock()
proc.poll.return_value = None # still running
setattr(state, attr, proc)
_stop_training_run(state)
for attr in ("api_process", "trainer_process", "env_process"):
getattr(state, attr).terminate.assert_called_once()
def test_does_not_terminate_exited_processes(self):
state = _make_run_state()
proc = MagicMock()
proc.poll.return_value = 0 # already exited
state.api_process = proc
_stop_training_run(state)
proc.terminate.assert_not_called()
def test_handles_none_processes(self):
state = _make_run_state()
# All process attrs are None by default
_stop_training_run(state) # should not raise
def test_handles_mixed_running_and_exited_processes(self):
state = _make_run_state()
# api still running
api = MagicMock()
api.poll.return_value = None
state.api_process = api
# trainer already exited
trainer = MagicMock()
trainer.poll.return_value = 0
state.trainer_process = trainer
# env is None
state.env_process = None
_stop_training_run(state)
api.terminate.assert_called_once()
trainer.terminate.assert_not_called()
class TestStopTrainingRunStatus:
"""Verify status transitions in _stop_training_run."""
def test_sets_status_to_stopped_when_running(self):
state = _make_run_state(status="running")
_stop_training_run(state)
assert state.status == "stopped"
def test_does_not_change_status_when_failed(self):
state = _make_run_state(status="failed")
_stop_training_run(state)
assert state.status == "failed"
def test_does_not_change_status_when_pending(self):
state = _make_run_state(status="pending")
_stop_training_run(state)
assert state.status == "pending"
def test_no_crash_with_no_processes_and_no_files(self):
state = _make_run_state()
_stop_training_run(state) # should not raise
assert state.status == "pending"

View file

@ -1,274 +0,0 @@
"""
Tests for environments/tool_call_parsers/ client-side tool call parsers.
These parsers extract structured tool_calls from raw model output text.
Used in Phase 2 (VLLM/generate) where the server returns raw tokens.
"""
import json
import sys
from pathlib import Path
import pytest
# Ensure repo root is importable
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
try:
from environments.tool_call_parsers import (
ParseResult,
ToolCallParser,
get_parser,
list_parsers,
)
except ImportError:
pytest.skip("atroposlib not installed", allow_module_level=True)
# ─── Registry tests ─────────────────────────────────────────────────────
class TestParserRegistry:
def test_list_parsers_returns_nonempty(self):
parsers = list_parsers()
assert len(parsers) > 0
def test_hermes_parser_registered(self):
parsers = list_parsers()
assert "hermes" in parsers
def test_get_parser_returns_instance(self):
parser = get_parser("hermes")
assert isinstance(parser, ToolCallParser)
def test_get_parser_unknown_raises(self):
with pytest.raises(KeyError):
get_parser("nonexistent_parser_xyz")
def test_all_registered_parsers_instantiate(self):
"""Every registered parser should be importable and instantiable."""
for name in list_parsers():
parser = get_parser(name)
assert isinstance(parser, ToolCallParser)
assert hasattr(parser, "parse")
# ─── Hermes parser tests ────────────────────────────────────────────────
class TestHermesParser:
@pytest.fixture
def parser(self):
return get_parser("hermes")
def test_no_tool_call(self, parser):
text = "Hello, I can help you with that."
content, tool_calls = parser.parse(text)
assert content == text
assert tool_calls is None
def test_single_tool_call(self, parser):
text = '<tool_call>{"name": "terminal", "arguments": {"command": "ls -la"}}</tool_call>'
content, tool_calls = parser.parse(text)
assert tool_calls is not None
assert len(tool_calls) == 1
assert tool_calls[0].function.name == "terminal"
args = json.loads(tool_calls[0].function.arguments)
assert args["command"] == "ls -la"
def test_tool_call_with_surrounding_text(self, parser):
text = 'Let me check that for you.\n<tool_call>{"name": "terminal", "arguments": {"command": "pwd"}}</tool_call>'
content, tool_calls = parser.parse(text)
assert tool_calls is not None
assert len(tool_calls) == 1
assert tool_calls[0].function.name == "terminal"
# Content should have the surrounding text
if content is not None:
assert "check that" in content or content.strip() != ""
def test_multiple_tool_calls(self, parser):
text = (
'<tool_call>{"name": "terminal", "arguments": {"command": "ls"}}</tool_call>\n'
'<tool_call>{"name": "read_file", "arguments": {"path": "test.py"}}</tool_call>'
)
content, tool_calls = parser.parse(text)
assert tool_calls is not None
assert len(tool_calls) == 2
names = {tc.function.name for tc in tool_calls}
assert "terminal" in names
assert "read_file" in names
def test_tool_call_ids_are_unique(self, parser):
text = (
'<tool_call>{"name": "terminal", "arguments": {"command": "ls"}}</tool_call>\n'
'<tool_call>{"name": "terminal", "arguments": {"command": "pwd"}}</tool_call>'
)
_, tool_calls = parser.parse(text)
assert tool_calls is not None
ids = [tc.id for tc in tool_calls]
assert len(ids) == len(set(ids)), "Tool call IDs must be unique"
def test_empty_string(self, parser):
content, tool_calls = parser.parse("")
assert tool_calls is None
def test_malformed_json_in_tool_call(self, parser):
text = '<tool_call>not valid json</tool_call>'
content, tool_calls = parser.parse(text)
# Should either return None tool_calls or handle gracefully
# (implementation may vary — some parsers return error tool calls)
def test_truncated_tool_call(self, parser):
"""Test handling of unclosed tool_call tag (model truncated mid-generation)."""
text = '<tool_call>{"name": "terminal", "arguments": {"command": "ls -la"}'
content, tool_calls = parser.parse(text)
# Parser should handle truncated output gracefully
# Either parse it successfully or return None
# ─── Parse result contract tests (applies to ALL parsers) ───────────────
class TestParseResultContract:
"""Ensure all parsers conform to the ParseResult contract."""
@pytest.fixture(params=["hermes"]) # Add more as needed
def parser(self, request):
return get_parser(request.param)
def test_returns_tuple_of_two(self, parser):
result = parser.parse("hello world")
assert isinstance(result, tuple)
assert len(result) == 2
def test_no_tools_returns_none_tool_calls(self, parser):
content, tool_calls = parser.parse("Just plain text, no tools.")
assert tool_calls is None
assert content is not None
def test_tool_calls_are_proper_objects(self, parser):
"""When tool calls are found, they should be ChatCompletionMessageToolCall objects."""
# Use hermes format since that's universal
text = '<tool_call>{"name": "terminal", "arguments": {"command": "echo hi"}}</tool_call>'
content, tool_calls = parser.parse(text)
if tool_calls is not None:
for tc in tool_calls:
assert hasattr(tc, "id")
assert hasattr(tc, "function")
assert hasattr(tc.function, "name")
assert hasattr(tc.function, "arguments")
assert tc.id is not None
assert isinstance(tc.function.name, str)
assert isinstance(tc.function.arguments, str)
# ─── DeepSeek V3 parser tests ───────────────────────────────────────────
class TestDeepSeekV3Parser:
@pytest.fixture
def parser(self):
return get_parser("deepseek_v3")
def test_no_tool_call(self, parser):
text = "Hello, how can I help you?"
content, tool_calls = parser.parse(text)
assert content == text
assert tool_calls is None
def test_single_tool_call(self, parser):
text = (
'<tool▁calls▁begin><tool▁call▁begin>function<tool▁sep>get_weather\n'
'```json\n{"city": "London"}\n```<tool▁call▁end><tool▁calls▁end>'
)
content, tool_calls = parser.parse(text)
assert tool_calls is not None
assert len(tool_calls) == 1
assert tool_calls[0].function.name == "get_weather"
args = json.loads(tool_calls[0].function.arguments)
assert args["city"] == "London"
def test_multiple_tool_calls(self, parser):
text = (
'<tool▁calls▁begin>'
'<tool▁call▁begin>function<tool▁sep>get_weather\n'
'```json\n{"city": "London"}\n```<tool▁call▁end>'
'<tool▁call▁begin>function<tool▁sep>get_time\n'
'```json\n{"timezone": "UTC"}\n```<tool▁call▁end>'
'<tool▁calls▁end>'
)
content, tool_calls = parser.parse(text)
assert tool_calls is not None
assert len(tool_calls) == 2, f"Expected 2 tool calls, got {len(tool_calls)}"
names = [tc.function.name for tc in tool_calls]
assert "get_weather" in names
assert "get_time" in names
def test_tool_call_with_preceding_text(self, parser):
text = (
'Let me check that for you.\n'
'<tool▁calls▁begin><tool▁call▁begin>function<tool▁sep>terminal\n'
'```json\n{"command": "ls"}\n```<tool▁call▁end><tool▁calls▁end>'
)
content, tool_calls = parser.parse(text)
assert tool_calls is not None
assert len(tool_calls) == 1
# ─── Mistral parser tests ───────────────────────────────────────────────
class TestMistralParser:
@pytest.fixture
def parser(self):
return get_parser("mistral")
def test_no_tool_call(self, parser):
text = "Hello, how can I help you?"
content, tool_calls = parser.parse(text)
assert content == text
assert tool_calls is None
def test_pre_v11_single_tool_call(self, parser):
text = '[TOOL_CALLS] [{"name": "func", "arguments": {"key": "val"}}]'
content, tool_calls = parser.parse(text)
assert tool_calls is not None
assert len(tool_calls) == 1
assert tool_calls[0].function.name == "func"
args = json.loads(tool_calls[0].function.arguments)
assert args["key"] == "val"
def test_pre_v11_nested_json(self, parser):
text = '[TOOL_CALLS] [{"name": "func", "arguments": {"nested": {"deep": true}}}]'
content, tool_calls = parser.parse(text)
assert tool_calls is not None
assert len(tool_calls) == 1
assert tool_calls[0].function.name == "func"
args = json.loads(tool_calls[0].function.arguments)
assert args["nested"]["deep"] is True
def test_v11_single_tool_call(self, parser):
text = '[TOOL_CALLS]get_weather{"city": "London"}'
content, tool_calls = parser.parse(text)
assert tool_calls is not None
assert len(tool_calls) == 1
assert tool_calls[0].function.name == "get_weather"
args = json.loads(tool_calls[0].function.arguments)
assert args["city"] == "London"
def test_v11_multiple_tool_calls(self, parser):
text = '[TOOL_CALLS]func1{"a": 1}[TOOL_CALLS]func2{"b": 2}'
content, tool_calls = parser.parse(text)
assert tool_calls is not None
assert len(tool_calls) == 2
names = [tc.function.name for tc in tool_calls]
assert "func1" in names
assert "func2" in names
def test_preceding_text_preserved(self, parser):
text = 'Hello[TOOL_CALLS]func{"a": 1}'
content, tool_calls = parser.parse(text)
assert content == "Hello"
assert tool_calls is not None
assert len(tool_calls) == 1
assert tool_calls[0].function.name == "func"
def test_malformed_json_fallback(self, parser):
text = "[TOOL_CALLS] not valid json"
content, tool_calls = parser.parse(text)
assert tool_calls is None