# hermes-agent/atropos/nomad/client.py
# 2026-02-06 01:03:59 +00:00
#
# 500 lines
# 16 KiB
# Python

"""
Nomad API Client for atropos-agent.
Provides a simple async client for interacting with the Nomad HTTP API:
- Submit/stop jobs
- Query allocations
- Get allocation addresses
- Scale jobs up/down
"""
import asyncio
import json
import os
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional
import aiohttp
class AllocationStatus(Enum):
    """Client status values of a Nomad allocation that this module models.

    The member values are the lowercase status strings, so a member can be
    looked up directly from a string, e.g. ``AllocationStatus("running")``.
    Strings that are not listed here raise ``ValueError`` on lookup.
    """

    # Not running yet.
    PENDING = "pending"
    # Currently running.
    RUNNING = "running"
    # Finished.
    COMPLETE = "complete"
    # Ended in failure.
    FAILED = "failed"
    # Reported as lost.
    LOST = "lost"
@dataclass
class Allocation:
    """Summary of a single Nomad allocation and where it can be reached."""

    id: str
    job_id: str
    task_group: str
    node_id: str
    status: AllocationStatus
    # Network endpoint used to reach the allocation; None when not known.
    address: Optional[str] = None
    port: Optional[int] = None

    @property
    def http_address(self) -> Optional[str]:
        """Return ``http://<address>:<port>``, or None if either part is unset."""
        if not (self.address and self.port):
            return None
        return f"http://{self.address}:{self.port}"
@dataclass
class JobStatus:
    """Aggregated view of a Nomad job: identity, status and its allocations."""

    id: str
    name: str
    status: str
    # Allocations that belong to the job (a fresh list per instance).
    allocations: List[Allocation] = field(default_factory=list)
    # Sum of the "Count" values across the job's task groups.
    count: int = 0
class NomadClient:
    """
    Async client for the Nomad HTTP API.

    Request methods do not raise on HTTP or transport failures. A failure is
    reported as a dict containing an ``"error"`` key plus a ``"status"`` key
    (the HTTP status code, or ``0`` when no response was received).

    Usage:
        client = NomadClient(address="http://localhost:4646")
        # Submit a job
        await client.submit_job(job_spec)
        # Get allocations
        allocs = await client.get_job_allocations("sandbox-python")
        # Scale job
        await client.scale_job("sandbox-python", count=5)
    """

    def __init__(
        self,
        address: str = "http://localhost:4646",
        token: Optional[str] = None,
        timeout: float = 30.0,
    ):
        """
        Configure the client; no connection is opened until the first request.

        Args:
            address: Base URL of the Nomad API (a trailing "/" is removed).
            token: ACL token; falls back to the NOMAD_TOKEN environment variable.
            timeout: Total per-request timeout in seconds.
        """
        self.address = address.rstrip("/")
        self.token = token or os.environ.get("NOMAD_TOKEN")
        self.timeout = aiohttp.ClientTimeout(total=timeout)
        self._session: Optional[aiohttp.ClientSession] = None

    async def _get_session(self) -> "aiohttp.ClientSession":
        """Return the shared HTTP session, creating it if missing or closed."""
        if self._session is None or self._session.closed:
            headers = {}
            if self.token:
                headers["X-Nomad-Token"] = self.token
            self._session = aiohttp.ClientSession(
                timeout=self.timeout,
                headers=headers,
            )
        return self._session

    async def close(self):
        """Close the HTTP session if one is open."""
        if self._session and not self._session.closed:
            await self._session.close()

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.close()

    @staticmethod
    def _parse_response(status: int, text: str) -> Dict[str, Any]:
        """
        Turn an HTTP status and response body into the client's result dict.

        Successful responses:
            - empty body        -> {"status": status}
            - JSON object       -> the object itself
            - other JSON value  -> {"data": value, "status": status}
            - non-JSON body     -> {"text": body, "status": status}

        Failed responses (status >= 400) always carry an "error" key so that
        callers checking ``"error" in result`` never mistake them for success:
            - empty body        -> {"error": "HTTP <status>", "status": status}
            - JSON body         -> {"error": parsed, "status": status}
            - non-JSON body     -> {"error": body, "text": body, "status": status}
        """
        failed = status >= 400
        if not text:
            if failed:
                return {"error": f"HTTP {status}", "status": status}
            return {"status": status}
        try:
            result = json.loads(text)
        except json.JSONDecodeError:
            payload: Dict[str, Any] = {"text": text, "status": status}
            if failed:
                # Plain-text error body: surface it as an error as well.
                payload["error"] = text
            return payload
        if failed:
            return {"error": result, "status": status}
        return result if isinstance(result, dict) else {"data": result, "status": status}

    async def _request(
        self,
        method: str,
        path: str,
        data: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """
        Make an HTTP request to the Nomad API.

        Args:
            method: HTTP method name.
            path: API path appended to the base address (e.g. "/v1/jobs").
            data: Optional JSON request body.

        Returns:
            The result dict described in ``_parse_response``. A 404 yields
            {"error": "not_found", "status": 404}; transport failures and
            timeouts yield {"error": <message>, "status": 0}.
        """
        session = await self._get_session()
        url = f"{self.address}{path}"
        try:
            async with session.request(method, url, json=data) as response:
                if response.status == 404:
                    return {"error": "not_found", "status": 404}
                text = await response.text()
                return self._parse_response(response.status, text)
        except (aiohttp.ClientError, asyncio.TimeoutError) as e:
            # str() of a timeout is empty, so fall back to the exception name.
            return {"error": str(e) or type(e).__name__, "status": 0}

    # Job Operations

    async def submit_job(self, job_spec: Dict[str, Any]) -> Dict[str, Any]:
        """
        Submit a job to Nomad.

        Args:
            job_spec: Job specification dict (HCL converted to JSON)

        Returns:
            Response with EvalID if successful
        """
        return await self._request("POST", "/v1/jobs", {"Job": job_spec})

    async def stop_job(self, job_id: str, purge: bool = False) -> Dict[str, Any]:
        """
        Stop (and optionally purge) a job.

        Args:
            job_id: Job identifier
            purge: If True, completely remove the job
        """
        path = f"/v1/job/{job_id}"
        if purge:
            path += "?purge=true"
        return await self._request("DELETE", path)

    async def get_job(self, job_id: str) -> Optional[Dict[str, Any]]:
        """
        Get job details.

        Returns:
            None when the job does not exist (404); otherwise the response
            dict, which is an error dict for failures other than 404.
        """
        result = await self._request("GET", f"/v1/job/{job_id}")
        if "error" in result and result.get("status") == 404:
            return None
        return result

    async def get_job_status(self, job_id: str) -> Optional["JobStatus"]:
        """
        Get job status with allocations.

        Returns:
            None if ``get_job`` returns nothing; otherwise a JobStatus whose
            ``count`` is the sum of the task groups' "Count" values (a group
            without "Count" contributes 1).
        """
        job = await self.get_job(job_id)
        if not job:
            return None
        allocs = await self.get_job_allocations(job_id)
        # "TaskGroups" may be present but null, hence ``or []``.
        count = sum(tg.get("Count", 1) for tg in job.get("TaskGroups") or [])
        return JobStatus(
            id=job_id,
            name=job.get("Name", job_id),
            status=job.get("Status", "unknown"),
            allocations=allocs,
            count=count,
        )

    # Allocation Operations

    @staticmethod
    def _shared_networks(alloc: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Return AllocatedResources.Shared.Networks of an allocation, or []."""
        resources = alloc.get("AllocatedResources") or {}
        shared = resources.get("Shared") or {}
        return shared.get("Networks") or []

    @staticmethod
    def _find_http_endpoint(networks: List[Dict[str, Any]]) -> tuple:
        """
        Extract ``(address, port)`` from the first network entry.

        The port is the "Value" of the first port labelled "http", searching
        dynamic ports first and then reserved ports (Singularity/raw_exec
        uses reserved ports). Missing parts are returned as None.
        """
        if not networks:
            return None, None
        network = networks[0]
        address = network.get("IP")
        candidates = (network.get("DynamicPorts") or []) + (network.get("ReservedPorts") or [])
        for entry in candidates:
            if entry.get("Label") == "http":
                return address, entry.get("Value")
        return address, None

    async def get_job_allocations(self, job_id: str) -> List["Allocation"]:
        """
        Get all allocations for a job.

        Returns an empty list on any error or unexpected response shape.
        A "ClientStatus" that is not an AllocationStatus value is reported
        as PENDING.
        """
        result = await self._request("GET", f"/v1/job/{job_id}/allocations")
        if "error" in result:
            return []
        # _request wraps a JSON list as {"data": [...], "status": ...}.
        allocs_data = result.get("data", result)
        if not isinstance(allocs_data, list):
            return []
        allocations = []
        for alloc_data in allocs_data:
            alloc_id = alloc_data.get("ID", "")
            try:
                status = AllocationStatus(alloc_data.get("ClientStatus", "unknown"))
            except ValueError:
                status = AllocationStatus.PENDING
            # First try the summary data; if it has no networks, fetch the
            # detailed allocation to obtain them.
            networks = self._shared_networks(alloc_data)
            if not networks and alloc_id:
                detailed = await self.get_allocation(alloc_id)
                if detailed:
                    networks = self._shared_networks(detailed)
            address, port = self._find_http_endpoint(networks)
            allocations.append(Allocation(
                id=alloc_id,
                job_id=job_id,
                task_group=alloc_data.get("TaskGroup", ""),
                node_id=alloc_data.get("NodeID", ""),
                status=status,
                address=address,
                port=port,
            ))
        return allocations

    async def get_allocation(self, alloc_id: str) -> Optional[Dict[str, Any]]:
        """
        Get detailed allocation info.

        Returns:
            None when the allocation does not exist (404); otherwise the
            response dict, which is an error dict for other failures.
        """
        result = await self._request("GET", f"/v1/allocation/{alloc_id}")
        if "error" in result and result.get("status") == 404:
            return None
        return result

    # Scaling Operations

    async def scale_job(self, job_id: str, count: int, task_group: str = "sandbox") -> Dict[str, Any]:
        """
        Scale a job's task group to specified count.

        Args:
            job_id: Job identifier
            count: Desired number of allocations
            task_group: Name of task group to scale
        """
        payload = {
            "Count": count,
            "Target": {
                "Group": task_group,
            },
        }
        return await self._request("POST", f"/v1/job/{job_id}/scale", payload)

    async def get_job_scale_status(self, job_id: str) -> Dict[str, int]:
        """
        Get current scale status for a job.

        Returns:
            Dict mapping task group name to its "Running" count; empty on
            error or when the response has no task groups.
        """
        result = await self._request("GET", f"/v1/job/{job_id}/scale")
        if "error" in result:
            return {}
        # "TaskGroups" may be present but null, hence ``or {}``.
        task_groups = result.get("TaskGroups") or {}
        return {
            name: info.get("Running", 0)
            for name, info in task_groups.items()
        }

    # Health Check

    async def is_healthy(self) -> bool:
        """Check if Nomad is reachable and the leader query returns no error."""
        try:
            result = await self._request("GET", "/v1/status/leader")
            return "error" not in result
        except Exception:
            # Best-effort probe: any unexpected failure means "not healthy".
            return False

    async def get_leader(self) -> Optional[str]:
        """Get current Nomad leader address, or None if it cannot be read."""
        result = await self._request("GET", "/v1/status/leader")
        if isinstance(result, dict) and "data" in result:
            return result["data"]
        return None
def load_job_template(
    template_name: str = "sandbox",
    **kwargs,
) -> Dict[str, Any]:
    """
    Build a job specification from a named template.

    Args:
        template_name: Template to use; only "sandbox" is recognised.
        **kwargs: Forwarded unchanged to the template's builder function.

    Returns:
        Job specification dict ready for the Nomad API.

    Raises:
        ValueError: If ``template_name`` is not a known template.
    """
    if template_name != "sandbox":
        raise ValueError(f"Unknown template: {template_name}")
    # The sandbox template is the default sandbox container job.
    return create_sandbox_job(**kwargs)
def create_sandbox_job(
    job_id: str = "atropos-sandbox",
    image: str = "atropos-sandbox:local",  # Use :local tag to avoid registry pull
    count: int = 1,
    slots_per_container: int = 10,
    privileged: bool = False,
    cpu: int = 500,
    memory: int = 512,
    port: int = 8080,
    datacenter: str = "dc1",
    driver: str = "docker",  # "docker" or "singularity"
    singularity_image: Optional[str] = None,  # Path to .sif file for singularity driver
) -> Dict[str, Any]:
    """
    Create a sandbox job specification.

    This job runs the sandbox_server.py inside a container,
    with the specified number of slots for agent workspaces.

    Args:
        job_id: Unique job identifier
        image: Docker image to use (for docker driver)
        count: Number of container instances
        slots_per_container: Number of slots per container
        privileged: Run container in privileged mode (recommended for bubblewrap)
        cpu: CPU allocation in MHz
        memory: Memory allocation in MB
        port: HTTP port for sandbox server
        datacenter: Nomad datacenter
        driver: Container driver - "docker" or "singularity". Any value other
            than "singularity" produces the docker configuration.
        singularity_image: Path to .sif file (required if driver="singularity").
            It is shell-quoted, so it is passed to apptainer literally.

    Returns:
        Job specification dict

    Raises:
        ValueError: If driver is "singularity" and no singularity_image is given.
    """
    import shlex  # local import: only needed to quote the image path below

    if driver == "singularity":
        if not singularity_image:
            raise ValueError("singularity_image path required when driver='singularity'")
        # Use raw_exec driver to run apptainer via shell for variable expansion
        # ($NOMAD_ALLOC_DIR). The container binds the allocation directory for
        # workspace persistence. The image path is quoted so that spaces or
        # shell metacharacters in it cannot split or alter the command.
        shell_cmd = (
            'apptainer run '
            '--bind "$NOMAD_ALLOC_DIR/data:/data" '
            '--pwd /app '
            '--env PYTHONUNBUFFERED=1 '
            f'{shlex.quote(str(singularity_image))} '
            'python sandbox_server.py '
            f'--port {port} '
            f'--slots {slots_per_container} '
            '--data-dir /data'
        )
        task_driver = "raw_exec"
        task_config = {
            "command": "/bin/sh",
            "args": ["-c", shell_cmd],
        }
        # For raw_exec the process runs directly on the host, so Nomad's
        # dynamic port mapping doesn't work the same as Docker: reserve the
        # port statically instead.
        network_config = {
            "Mode": "host",
            "ReservedPorts": [
                {
                    "Label": "http",
                    "Value": port,
                }
            ],
        }
    else:
        # Docker driver (default)
        task_driver = "docker"
        task_config = {
            "image": image,
            "force_pull": False,  # Use local image, don't try to pull
            "ports": ["http"],
            "privileged": privileged,
            "command": "python",
            "args": [
                "sandbox_server.py",
                "--port", str(port),
                "--slots", str(slots_per_container),
                "--data-dir", "/data",
            ],
            # Note: On Linux, you can mount persistent storage:
            #   "volumes": ["${NOMAD_ALLOC_DIR}/data:/data"],
            # On macOS/Docker Desktop, skip volumes for PoC
            # (container /data is ephemeral but works for testing)
        }
        # For Docker, use dynamic ports with port mapping.
        network_config = {
            "Mode": "host",
            "DynamicPorts": [
                {
                    "Label": "http",
                    "To": port,
                }
            ],
        }

    return {
        "ID": job_id,
        "Name": job_id,
        "Type": "service",
        "Datacenters": [datacenter],
        "TaskGroups": [
            {
                "Name": "sandbox",
                "Count": count,
                # Speed up deployments and avoid Consul checks. Without this, Nomad may
                # keep an "active deployment" around for the default MinHealthyTime,
                # which blocks immediate scaling under load.
                "Update": {
                    "HealthCheck": "task_states",
                    "MinHealthyTime": 0,
                },
                "Networks": [network_config],
                "Tasks": [
                    {
                        "Name": "sandbox-server",
                        "Driver": task_driver,
                        "Config": task_config,
                        "Env": {
                            "PYTHONUNBUFFERED": "1",
                            "NOMAD_ALLOC_DIR": "${NOMAD_ALLOC_DIR}",
                        },
                        "Resources": {
                            "CPU": cpu,
                            "MemoryMB": memory,
                        },
                        # Note: Services with Checks require Consul, which we skip for the PoC
                    }
                ],
                "RestartPolicy": {
                    "Attempts": 3,
                    "Interval": 300_000_000_000,  # 5 minutes
                    "Delay": 10_000_000_000,  # 10 seconds
                    "Mode": "delay",
                },
                "ReschedulePolicy": {
                    "Attempts": 5,
                    "Interval": 3600_000_000_000,  # 1 hour
                    "Delay": 30_000_000_000,  # 30 seconds
                    "DelayFunction": "exponential",
                    "MaxDelay": 300_000_000_000,  # 5 minutes
                    "Unlimited": False,
                },
            }
        ],
    }