From 3b9c53e6dbef3f2518e23b8fd755dceec8b09d29 Mon Sep 17 00:00:00 2001
From: Shannon Sands <shannon@nousresearch.com>
Date: Mon, 9 Feb 2026 01:36:20 +0000
Subject: [PATCH] Add Tinker RL training integration and documentation

- pyproject.toml: Added tinker SDK, torch, wandb, math-verify to [atropos] extras
- README.md: Added comprehensive RL Training with Tinker section including:
  - Architecture diagram (3-process pipeline)
  - Quick start guide for GSM8k agent training
  - Configuration documentation
  - RL CLI usage
  - Sandbox backend options (Nomad, Singularity, Modal)

New files in tinker-atropos submodule (committed there):
- tinker_atropos/environments/gsm8k_agent.py: Agent GSM8k env with Python REPL tool
- configs/gsm8k_agent.yaml: Config for Qwen3-4B training
---
 README.md      | 131 +++++++++++++++++++++++++++++++++++++++++++++++++
 pyproject.toml |   6 ++-
 2 files changed, 136 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 18b016c4b8..16e34a0da4 100644
--- a/README.md
+++ b/README.md
@@ -995,6 +995,137 @@ All variables go in `~/.hermes/.env`. Run `hermes config set VAR value` to set t
 
 ---
 
+## RL Training with Tinker
+
+Hermes-Agent includes an RL training integration with [Tinker](https://thinkingmachines.ai/tinker/) (Thinking Machines) and [Atropos](https://github.com/NousResearch/atropos) for training language models with reinforcement learning from agent trajectories.
+
+### Prerequisites
+
+1. **Install with Atropos extras** (includes Tinker SDK, atroposlib, torch, wandb, math-verify):
+```bash
+pip install -e ".[atropos]"
+```
+
+2. **Initialize the tinker-atropos submodule**:
+```bash
+git submodule update --init
+pip install -e ./tinker-atropos
+```
+
+3. **Get API keys**:
+   - `TINKER_API_KEY` from [Tinker Console](https://tinker-console.thinkingmachines.ai/keys) (requires billing setup)
+   - `WANDB_API_KEY` from [Weights & Biases](https://wandb.ai/settings) (for metrics tracking)
+
+4. **Add keys to your `.env` file**:
+```bash
+# Add to the repo-root .env (sourced as ../.env in the Quick Start below) or ~/.hermes/.env
+TINKER_API_KEY=your_tinker_key
+WANDB_API_KEY=your_wandb_key
+```
+
+### Architecture
+
+The RL training pipeline uses three processes that communicate over HTTP:
+
+```
+┌──────────────────────┐   ┌──────────────────────┐   ┌────────────────────────┐
+│ Atropos Rollout API  │   │ Tinker Trainer       │   │ Environment            │
+│ (port 8000)          │◄──│ (port 8001)          │◄──│ (worker)               │
+│                      │   │                      │   │                        │
+│ • Collects batches   │   │ • LoRA training      │   │ • Generates prompts    │
+│ • Coordinates env    │   │ • Inference server   │   │ • Calls inference API  │
+│   and trainer        │   │ • Weight updates     │   │ • Scores responses     │
+│                      │   │ • WandB logging      │   │ • Sends scored batches │
+└──────────────────────┘   └──────────────────────┘   └────────────────────────┘
+```
+
+### Quick Start: GSM8k Agent Training
+
+This example trains a model on math problems using a Python REPL tool — the model learns to write and execute Python code to solve them:
+
+```bash
+# Terminal 1: Start Atropos Rollout API
+cd tinker-atropos
+source ../.venv/bin/activate
+set -a && source ../.env && set +a
+run-api
+
+# Terminal 2: Start Tinker Trainer + Inference Server
+cd tinker-atropos
+source ../.venv/bin/activate
+set -a && source ../.env && set +a
+python launch_training.py --config configs/gsm8k_agent.yaml
+
+# Terminal 3: Start GSM8k Agent Environment
+cd tinker-atropos
+source ../.venv/bin/activate
+set -a && source ../.env && set +a
+python tinker_atropos/environments/gsm8k_agent.py serve --config configs/gsm8k_agent.yaml
+```
+
+### Available Environments
+
+| Environment | File | Description |
+|------------|------|-------------|
+| `gsm8k` | `gsm8k_tinker.py` | Standard GSM8k math (no tools) |
+| `gsm8k_agent` | `gsm8k_agent.py` | GSM8k with Python REPL tool calling |
+
+### Configuration
+
+Configs are YAML files in `tinker-atropos/configs/` with three sections:
+
+```yaml
+env:                              # Atropos environment settings
+  group_size: 4                   # Parallel rollouts per problem
+  batch_size: 16                  # Training batch size
+  tokenizer_name: "Qwen/Qwen3-4B-Instruct-2507"
+  max_token_length: 2048          # Max generation length
+  total_steps: 20                 # Training steps
+
+openai:                           # Inference server (served by Tinker trainer)
+  - model_name: "Qwen/Qwen3-4B-Instruct-2507"
+    base_url: "http://localhost:8001/v1"
+
+tinker:                           # Tinker training parameters
+  lora_rank: 16                   # LoRA rank (lower = faster, less capacity)
+  learning_rate: 0.00005          # Learning rate
+  max_token_trainer_length: 4096  # Max tokens for training
+  wandb_project: "hermes-agent-rl"
+```
+
+### RL CLI (Agent-Driven Training)
+
+For interactive training management via the Hermes agent:
+
+```bash
+# Interactive mode - let the agent manage training
+python rl_cli.py --interactive
+
+# List available environments
+python rl_cli.py --list-environments
+
+# Direct task
+python rl_cli.py "Train a model on GSM8k with tool use"
+```
+
+### Sandbox Backends for Agent Environments
+
+For agent environments that need isolated tool execution (e.g., SWE tasks), Hermes-Agent supports multiple sandbox backends:
+
+| Backend | Use Case | Command |
+|---------|----------|---------|
+| **Nomad + Docker** | Default, local development | `--env.tool_pool_mode nomad` |
+| **Nomad + Singularity** | HPC clusters without Docker | `--env.tool_pool_mode nomad --env.driver singularity` |
+| **Modal** | Cloud-based, auto-scaling | `--env.tool_pool_mode modal` |
+
+See [docs/MODAL_BACKEND.md](docs/MODAL_BACKEND.md) for Modal backend details.
+
+### Cost
+
+Check the [Tinker Rate Card](https://tinker-console.thinkingmachines.ai/rate-card) for available models and pricing.
+
+---
+
 ## Troubleshooting
 
 ```bash
diff --git a/pyproject.toml b/pyproject.toml
index 58fd934649..5b9c9612bc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -40,14 +40,18 @@ dev = ["pytest", "pytest-asyncio"]
 messaging = ["python-telegram-bot>=20.0", "discord.py>=2.0", "aiohttp>=3.9.0"]
 cron = ["croniter"]
 cli = ["simple-term-menu"]
-# Install Atropos from source (PyPI is often stale for this internal dependency).
+# Install Atropos + Tinker training integration from source (PyPI is often stale for Atropos).
 atropos = [
   "atroposlib @ git+https://github.com/NousResearch/atropos.git",
+  "tinker @ git+https://github.com/thinking-machines-lab/tinker.git",
   # Atropos integration runtime deps (kept optional for Hermes-only users)
   "aiohttp",
   "fastapi",
   "uvicorn",
   "pyte",
+  "torch",
+  "wandb",
+  "math-verify",
 ]
 all = [
   "hermes-agent[modal]",