# Mirror of https://github.com/NousResearch/hermes-agent.git
# Synced 2026-05-03 02:11:48 +00:00
"""
|
|
Nomad API Client for atropos-agent.
|
|
|
|
Provides a simple async client for interacting with the Nomad HTTP API:
|
|
- Submit/stop jobs
|
|
- Query allocations
|
|
- Get allocation addresses
|
|
- Scale jobs up/down
|
|
"""
|
|
|
|
import asyncio
|
|
import json
|
|
import os
|
|
from dataclasses import dataclass, field
|
|
from enum import Enum
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
import aiohttp
|
|
|
|
|
|
class AllocationStatus(Enum):
    """Client-side status values Nomad reports for an allocation."""

    PENDING = "pending"    # scheduled, not yet running
    RUNNING = "running"    # task(s) currently executing
    COMPLETE = "complete"  # finished successfully
    FAILED = "failed"      # terminated with an error
    LOST = "lost"          # node hosting the allocation became unreachable
|
|
|
|
|
|
@dataclass
class Allocation:
    """A single Nomad allocation, plus its network endpoint when known."""

    id: str
    job_id: str
    task_group: str
    node_id: str
    status: AllocationStatus
    # Network info for reaching the allocation; both may be absent when the
    # allocation has no published network (or it hasn't been resolved yet).
    address: Optional[str] = None
    port: Optional[int] = None

    @property
    def http_address(self) -> Optional[str]:
        """Base HTTP URL for this allocation, or None if address/port unknown."""
        if not (self.address and self.port):
            return None
        return f"http://{self.address}:{self.port}"
|
|
|
|
|
|
@dataclass
class JobStatus:
    """Status of a Nomad job: identity, state string, and its allocations."""

    id: str
    name: str
    status: str
    # All allocations currently known for this job (see get_job_allocations).
    allocations: List[Allocation] = field(default_factory=list)
    count: int = 0  # Number of task groups
|
|
|
|
|
|
class NomadClient:
    """
    Async client for Nomad HTTP API.

    Methods never raise on HTTP or transport failures; errors are reported
    as dicts containing an "error" key (see _request for the exact shapes).

    Usage:
        client = NomadClient(address="http://localhost:4646")

        # Submit a job
        await client.submit_job(job_spec)

        # Get allocations
        allocs = await client.get_job_allocations("sandbox-python")

        # Scale job
        await client.scale_job("sandbox-python", count=5)
    """

    def __init__(
        self,
        address: str = "http://localhost:4646",
        token: Optional[str] = None,
        timeout: float = 30.0,
    ):
        # Trailing slash stripped so path concatenation in _request is clean.
        self.address = address.rstrip("/")
        # Explicit token wins; otherwise fall back to the NOMAD_TOKEN env var.
        self.token = token or os.environ.get("NOMAD_TOKEN")
        # Total per-request timeout (seconds) applied to the shared session.
        self.timeout = aiohttp.ClientTimeout(total=timeout)
        # Created lazily on first request; see _get_session().
        self._session: Optional[aiohttp.ClientSession] = None

    async def _get_session(self) -> aiohttp.ClientSession:
        """Get or create HTTP session."""
        # Recreate the session if it was never made or has been closed
        # (e.g. after close() or exiting the async context manager).
        if self._session is None or self._session.closed:
            headers = {}
            if self.token:
                # Nomad ACL authentication header.
                headers["X-Nomad-Token"] = self.token
            self._session = aiohttp.ClientSession(
                timeout=self.timeout,
                headers=headers,
            )
        return self._session

    async def close(self) -> None:
        """Close the HTTP session."""
        if self._session and not self._session.closed:
            await self._session.close()

    async def __aenter__(self) -> "NomadClient":
        # Session creation is deferred to the first request, so entering the
        # context manager does no I/O.
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
        await self.close()

    async def _request(
        self,
        method: str,
        path: str,
        data: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """
        Make an HTTP request to Nomad API.

        Always returns a dict; never raises for HTTP/transport errors:
          - 404             -> {"error": "not_found", "status": 404}
          - empty body      -> {"status": <code>}
          - non-JSON body   -> {"text": <body>, "status": <code>}
          - status >= 400   -> {"error": <parsed body>, "status": <code>}
          - JSON dict body  -> the parsed dict as-is
          - JSON non-dict   -> {"data": <parsed>, "status": <code>}
          - transport error -> {"error": <message>, "status": 0}
        """
        session = await self._get_session()
        url = f"{self.address}{path}"

        try:
            async with session.request(method, url, json=data) as response:
                if response.status == 404:
                    return {"error": "not_found", "status": 404}

                text = await response.text()
                if not text:
                    # Some endpoints (e.g. DELETE) may return an empty body.
                    return {"status": response.status}

                try:
                    result = json.loads(text)
                except json.JSONDecodeError:
                    # Pass the raw body through rather than failing.
                    return {"text": text, "status": response.status}

                if response.status >= 400:
                    return {"error": result, "status": response.status}

                # Wrap non-dict JSON (lists, strings) so callers always get a dict.
                return result if isinstance(result, dict) else {"data": result, "status": response.status}

        except aiohttp.ClientError as e:
            # Connection refused, DNS failure, timeout, etc.
            return {"error": str(e), "status": 0}

    # Job Operations

    async def submit_job(self, job_spec: Dict[str, Any]) -> Dict[str, Any]:
        """
        Submit a job to Nomad.

        Args:
            job_spec: Job specification dict (HCL converted to JSON)

        Returns:
            Response with EvalID if successful
        """
        return await self._request("POST", "/v1/jobs", {"Job": job_spec})

    async def stop_job(self, job_id: str, purge: bool = False) -> Dict[str, Any]:
        """
        Stop (and optionally purge) a job.

        Args:
            job_id: Job identifier
            purge: If True, completely remove the job
        """
        path = f"/v1/job/{job_id}"
        if purge:
            path += "?purge=true"
        return await self._request("DELETE", path)

    async def get_job(self, job_id: str) -> Optional[Dict[str, Any]]:
        """Get job details, or None when the job does not exist (404)."""
        result = await self._request("GET", f"/v1/job/{job_id}")
        if "error" in result and result.get("status") == 404:
            return None
        # NOTE(review): non-404 error dicts are returned as-is here; callers
        # that only truth-test the result will treat them as a live job.
        return result

    async def get_job_status(self, job_id: str) -> Optional[JobStatus]:
        """Get job status with allocations; None when the job doesn't exist."""
        job = await self.get_job(job_id)
        if not job:
            return None

        allocs = await self.get_job_allocations(job_id)

        # Get count from task groups
        count = 0
        task_groups = job.get("TaskGroups", [])
        for tg in task_groups:
            count += tg.get("Count", 1)

        return JobStatus(
            id=job_id,
            name=job.get("Name", job_id),
            status=job.get("Status", "unknown"),
            allocations=allocs,
            count=count,
        )

    # Allocation Operations

    async def get_job_allocations(self, job_id: str) -> List[Allocation]:
        """
        Get all allocations for a job.

        Returns an empty list on any API error. May issue one extra request
        per allocation when the list response lacks network info (N+1).
        """
        result = await self._request("GET", f"/v1/job/{job_id}/allocations")

        if "error" in result:
            return []

        # _request wraps list responses under "data"; unwrap either shape.
        allocs_data = result.get("data", result) if isinstance(result, dict) else result
        if not isinstance(allocs_data, list):
            return []

        allocations = []
        for alloc_data in allocs_data:
            # Parse allocation info
            alloc_id = alloc_data.get("ID", "")
            status_str = alloc_data.get("ClientStatus", "unknown")

            try:
                status = AllocationStatus(status_str)
            except ValueError:
                # Unknown status strings are treated conservatively as pending.
                status = AllocationStatus.PENDING

            # Get network info - need to fetch detailed allocation for this
            address: Optional[str] = None
            port: Optional[int] = None

            # First try the summary data
            resources = alloc_data.get("AllocatedResources") or {}
            shared = resources.get("Shared") or {}
            networks = shared.get("Networks") or []

            # If no networks in summary, fetch detailed allocation
            if not networks and alloc_id:
                detailed = await self.get_allocation(alloc_id)
                if detailed:
                    resources = detailed.get("AllocatedResources") or {}
                    shared = resources.get("Shared") or {}
                    networks = shared.get("Networks") or []

            if networks:
                # Only the first network is inspected.
                network = networks[0]
                address = network.get("IP")
                # Look for dynamic ports OR reserved ports (Singularity/raw_exec uses reserved)
                dyn_ports = network.get("DynamicPorts") or []
                reserved_ports = network.get("ReservedPorts") or []
                for dp in dyn_ports + reserved_ports:
                    if dp.get("Label") == "http":
                        port = dp.get("Value")
                        break

            allocations.append(Allocation(
                id=alloc_id,
                job_id=job_id,
                task_group=alloc_data.get("TaskGroup", ""),
                node_id=alloc_data.get("NodeID", ""),
                status=status,
                address=address,
                port=port,
            ))

        return allocations

    async def get_allocation(self, alloc_id: str) -> Optional[Dict[str, Any]]:
        """Get detailed allocation info, or None when it does not exist (404)."""
        result = await self._request("GET", f"/v1/allocation/{alloc_id}")
        if "error" in result and result.get("status") == 404:
            return None
        return result

    # Scaling Operations

    async def scale_job(self, job_id: str, count: int, task_group: str = "sandbox") -> Dict[str, Any]:
        """
        Scale a job's task group to specified count.

        Args:
            job_id: Job identifier
            count: Desired number of allocations
            task_group: Name of task group to scale
        """
        payload = {
            "Count": count,
            "Target": {
                "Group": task_group,
            },
        }
        return await self._request("POST", f"/v1/job/{job_id}/scale", payload)

    async def get_job_scale_status(self, job_id: str) -> Dict[str, int]:
        """
        Get current scale status for a job.

        Returns:
            Dict mapping task group name to its "Running" count
            (empty dict on any API error)
        """
        result = await self._request("GET", f"/v1/job/{job_id}/scale")

        if "error" in result:
            return {}

        task_groups = result.get("TaskGroups", {})
        return {
            name: info.get("Running", 0)
            for name, info in task_groups.items()
        }

    # Health Check

    async def is_healthy(self) -> bool:
        """Check if Nomad is reachable and healthy."""
        try:
            result = await self._request("GET", "/v1/status/leader")
            return "error" not in result
        except Exception:
            # Defensive boundary: _request already catches transport errors,
            # but any unexpected failure still reports "unhealthy".
            return False

    async def get_leader(self) -> Optional[str]:
        """Get current Nomad leader address."""
        result = await self._request("GET", "/v1/status/leader")
        # The leader endpoint appears to return a bare JSON string, which
        # _request wraps under "data" (non-dict JSON handling).
        if isinstance(result, dict) and "data" in result:
            return result["data"]
        return None
|
|
|
|
|
|
def load_job_template(
    template_name: str = "sandbox",
    **kwargs,
) -> Dict[str, Any]:
    """
    Load and configure a job template.

    Args:
        template_name: Name of template (e.g., "sandbox")
        **kwargs: Template variables to substitute

    Returns:
        Job specification dict ready for Nomad API

    Raises:
        ValueError: If the template name is not recognized.
    """
    # Guard clause: "sandbox" is currently the only supported template.
    if template_name != "sandbox":
        raise ValueError(f"Unknown template: {template_name}")
    return create_sandbox_job(**kwargs)
|
|
|
|
|
|
def create_sandbox_job(
    job_id: str = "atropos-sandbox",
    image: str = "atropos-sandbox:local",  # Use :local tag to avoid registry pull
    count: int = 1,
    slots_per_container: int = 10,
    privileged: bool = False,
    cpu: int = 500,
    memory: int = 512,
    port: int = 8080,
    datacenter: str = "dc1",
    driver: str = "docker",  # "docker" or "singularity"
    singularity_image: Optional[str] = None,  # Path to .sif file for singularity driver
) -> Dict[str, Any]:
    """
    Create a sandbox job specification.

    This job runs the sandbox_server.py inside a container,
    with the specified number of slots for agent workspaces.

    Args:
        job_id: Unique job identifier
        image: Docker image to use (for docker driver)
        count: Number of container instances
        slots_per_container: Number of slots per container
        privileged: Run container in privileged mode (recommended for bubblewrap)
        cpu: CPU allocation in MHz
        memory: Memory allocation in MB
        port: HTTP port for sandbox server
        datacenter: Nomad datacenter
        driver: Container driver - "docker" or "singularity"; any value other
            than "singularity" falls through to the docker configuration
        singularity_image: Path to .sif file (required if driver="singularity")

    Returns:
        Job specification dict

    Raises:
        ValueError: If driver is "singularity" but singularity_image is not given.
    """
    # Build task config based on driver
    if driver == "singularity":
        if not singularity_image:
            raise ValueError("singularity_image path required when driver='singularity'")

        # Use raw_exec driver to run apptainer via shell for variable expansion
        # The container binds the allocation directory for workspace persistence
        # For raw_exec, we use static port since Nomad's dynamic port mapping doesn't
        # work the same as Docker - the process runs directly on the host.
        # NOTE(review): singularity_image is interpolated unquoted into a shell
        # command; it is assumed to come from trusted operator config.
        shell_cmd = (
            f'apptainer run '
            f'--bind "$NOMAD_ALLOC_DIR/data:/data" '
            f'--pwd /app '
            f'--env PYTHONUNBUFFERED=1 '
            f'{singularity_image} '
            f'python sandbox_server.py '
            f'--port {port} '
            f'--slots {slots_per_container} '
            f'--data-dir /data'
        )
        task_config = {
            "command": "/bin/sh",
            "args": ["-c", shell_cmd],
        }
        task_driver = "raw_exec"
    else:
        # Docker driver (default)
        task_config = {
            "image": image,
            "force_pull": False,  # Use local image, don't try to pull
            "ports": ["http"],
            "privileged": privileged,
            "command": "python",
            "args": [
                "sandbox_server.py",
                "--port", str(port),
                "--slots", str(slots_per_container),
                "--data-dir", "/data",
            ],
            # Note: On Linux, you can mount persistent storage:
            # "volumes": ["${NOMAD_ALLOC_DIR}/data:/data"],
            # On macOS/Docker Desktop, skip volumes for PoC
            # (container /data is ephemeral but works for testing)
        }
        task_driver = "docker"

    # For Singularity/raw_exec, use static ports since the process runs directly on host.
    # For Docker, use dynamic ports with port mapping.
    if driver == "singularity":
        network_config = {
            "Mode": "host",
            "ReservedPorts": [
                {
                    "Label": "http",
                    "Value": port,
                }
            ],
        }
    else:
        network_config = {
            "Mode": "host",
            "DynamicPorts": [
                {
                    "Label": "http",
                    "To": port,
                }
            ],
        }

    return {
        "ID": job_id,
        "Name": job_id,
        "Type": "service",
        "Datacenters": [datacenter],
        "TaskGroups": [
            {
                "Name": "sandbox",
                "Count": count,
                # Speed up deployments and avoid Consul checks. Without this, Nomad may
                # keep an "active deployment" around for the default MinHealthyTime,
                # which blocks immediate scaling under load.
                "Update": {
                    "HealthCheck": "task_states",
                    "MinHealthyTime": 0,
                },
                "Networks": [network_config],
                "Tasks": [
                    {
                        "Name": "sandbox-server",
                        "Driver": task_driver,
                        "Config": task_config,
                        "Env": {
                            "PYTHONUNBUFFERED": "1",
                            "NOMAD_ALLOC_DIR": "${NOMAD_ALLOC_DIR}",
                        },
                        "Resources": {
                            "CPU": cpu,
                            "MemoryMB": memory,
                        },
                        # Note: Services with Checks require Consul, which we skip for the PoC
                    }
                ],
                # Nomad expresses durations in nanoseconds.
                "RestartPolicy": {
                    "Attempts": 3,
                    "Interval": 300_000_000_000,  # 5 minutes
                    "Delay": 10_000_000_000,  # 10 seconds
                    "Mode": "delay",
                },
                "ReschedulePolicy": {
                    "Attempts": 5,
                    "Interval": 3600_000_000_000,  # 1 hour
                    "Delay": 30_000_000_000,  # 30 seconds
                    "DelayFunction": "exponential",
                    "MaxDelay": 300_000_000_000,  # 5 minutes
                    "Unlimited": False,
                },
            }
        ],
    }
|