""" Nomad API Client for atropos-agent. Provides a simple async client for interacting with the Nomad HTTP API: - Submit/stop jobs - Query allocations - Get allocation addresses - Scale jobs up/down """ import asyncio import json import os from dataclasses import dataclass, field from enum import Enum from pathlib import Path from typing import Any, Dict, List, Optional import aiohttp class AllocationStatus(Enum): """Nomad allocation status.""" PENDING = "pending" RUNNING = "running" COMPLETE = "complete" FAILED = "failed" LOST = "lost" @dataclass class Allocation: """Information about a Nomad allocation.""" id: str job_id: str task_group: str node_id: str status: AllocationStatus # Network info for reaching the allocation address: Optional[str] = None port: Optional[int] = None @property def http_address(self) -> Optional[str]: """Get full HTTP address for the allocation.""" if self.address and self.port: return f"http://{self.address}:{self.port}" return None @dataclass class JobStatus: """Status of a Nomad job.""" id: str name: str status: str allocations: List[Allocation] = field(default_factory=list) count: int = 0 # Number of task groups class NomadClient: """ Async client for Nomad HTTP API. Usage: client = NomadClient(address="http://localhost:4646") # Submit a job await client.submit_job(job_spec) # Get allocations allocs = await client.get_job_allocations("sandbox-python") # Scale job await client.scale_job("sandbox-python", count=5) """ def __init__( self, address: str = "http://localhost:4646", token: Optional[str] = None, timeout: float = 30.0, ): self.address = address.rstrip("/") self.token = token or os.environ.get("NOMAD_TOKEN") self.timeout = aiohttp.ClientTimeout(total=timeout) self._session: Optional[aiohttp.ClientSession] = None async def _get_session(self) -> aiohttp.ClientSession: """Get or create HTTP session.""" if self._session is None or self._session.closed: headers = {} if self.token: headers["X-Nomad-Token"] = self.token self._session = aiohttp.ClientSession( timeout=self.timeout, headers=headers, ) return self._session async def close(self): """Close the HTTP session.""" if self._session and not self._session.closed: await self._session.close() async def __aenter__(self): return self async def __aexit__(self, exc_type, exc_val, exc_tb): await self.close() async def _request( self, method: str, path: str, data: Optional[Dict[str, Any]] = None, ) -> Dict[str, Any]: """Make an HTTP request to Nomad API.""" session = await self._get_session() url = f"{self.address}{path}" try: async with session.request(method, url, json=data) as response: if response.status == 404: return {"error": "not_found", "status": 404} text = await response.text() if not text: return {"status": response.status} try: result = json.loads(text) except json.JSONDecodeError: return {"text": text, "status": response.status} if response.status >= 400: return {"error": result, "status": response.status} return result if isinstance(result, dict) else {"data": result, "status": response.status} except aiohttp.ClientError as e: return {"error": str(e), "status": 0} # Job Operations async def submit_job(self, job_spec: Dict[str, Any]) -> Dict[str, Any]: """ Submit a job to Nomad. Args: job_spec: Job specification dict (HCL converted to JSON) Returns: Response with EvalID if successful """ return await self._request("POST", "/v1/jobs", {"Job": job_spec}) async def stop_job(self, job_id: str, purge: bool = False) -> Dict[str, Any]: """ Stop (and optionally purge) a job. Args: job_id: Job identifier purge: If True, completely remove the job """ path = f"/v1/job/{job_id}" if purge: path += "?purge=true" return await self._request("DELETE", path) async def get_job(self, job_id: str) -> Optional[Dict[str, Any]]: """Get job details.""" result = await self._request("GET", f"/v1/job/{job_id}") if "error" in result and result.get("status") == 404: return None return result async def get_job_status(self, job_id: str) -> Optional[JobStatus]: """Get job status with allocations.""" job = await self.get_job(job_id) if not job: return None allocs = await self.get_job_allocations(job_id) # Get count from task groups count = 0 task_groups = job.get("TaskGroups", []) for tg in task_groups: count += tg.get("Count", 1) return JobStatus( id=job_id, name=job.get("Name", job_id), status=job.get("Status", "unknown"), allocations=allocs, count=count, ) # Allocation Operations async def get_job_allocations(self, job_id: str) -> List[Allocation]: """Get all allocations for a job.""" result = await self._request("GET", f"/v1/job/{job_id}/allocations") if "error" in result: return [] allocs_data = result.get("data", result) if isinstance(result, dict) else result if not isinstance(allocs_data, list): return [] allocations = [] for alloc_data in allocs_data: # Parse allocation info alloc_id = alloc_data.get("ID", "") status_str = alloc_data.get("ClientStatus", "unknown") try: status = AllocationStatus(status_str) except ValueError: status = AllocationStatus.PENDING # Get network info - need to fetch detailed allocation for this address = None port = None # First try the summary data resources = alloc_data.get("AllocatedResources") or {} shared = resources.get("Shared") or {} networks = shared.get("Networks") or [] # If no networks in summary, fetch detailed allocation if not networks and alloc_id: detailed = await self.get_allocation(alloc_id) if detailed: resources = detailed.get("AllocatedResources") or {} shared = resources.get("Shared") or {} networks = shared.get("Networks") or [] if networks: network = networks[0] address = network.get("IP") # Look for dynamic ports OR reserved ports (Singularity/raw_exec uses reserved) dyn_ports = network.get("DynamicPorts") or [] reserved_ports = network.get("ReservedPorts") or [] for dp in dyn_ports + reserved_ports: if dp.get("Label") == "http": port = dp.get("Value") break allocations.append(Allocation( id=alloc_id, job_id=job_id, task_group=alloc_data.get("TaskGroup", ""), node_id=alloc_data.get("NodeID", ""), status=status, address=address, port=port, )) return allocations async def get_allocation(self, alloc_id: str) -> Optional[Dict[str, Any]]: """Get detailed allocation info.""" result = await self._request("GET", f"/v1/allocation/{alloc_id}") if "error" in result and result.get("status") == 404: return None return result # Scaling Operations async def scale_job(self, job_id: str, count: int, task_group: str = "sandbox") -> Dict[str, Any]: """ Scale a job's task group to specified count. Args: job_id: Job identifier count: Desired number of allocations task_group: Name of task group to scale """ payload = { "Count": count, "Target": { "Group": task_group, }, } return await self._request("POST", f"/v1/job/{job_id}/scale", payload) async def get_job_scale_status(self, job_id: str) -> Dict[str, int]: """ Get current scale status for a job. Returns: Dict mapping task group name to count """ result = await self._request("GET", f"/v1/job/{job_id}/scale") if "error" in result: return {} task_groups = result.get("TaskGroups", {}) return { name: info.get("Running", 0) for name, info in task_groups.items() } # Health Check async def is_healthy(self) -> bool: """Check if Nomad is reachable and healthy.""" try: result = await self._request("GET", "/v1/status/leader") return "error" not in result except Exception: return False async def get_leader(self) -> Optional[str]: """Get current Nomad leader address.""" result = await self._request("GET", "/v1/status/leader") if isinstance(result, dict) and "data" in result: return result["data"] return None def load_job_template( template_name: str = "sandbox", **kwargs, ) -> Dict[str, Any]: """ Load and configure a job template. Args: template_name: Name of template (e.g., "sandbox") **kwargs: Template variables to substitute Returns: Job specification dict ready for Nomad API """ # Default job template for sandbox container if template_name == "sandbox": return create_sandbox_job(**kwargs) else: raise ValueError(f"Unknown template: {template_name}") def create_sandbox_job( job_id: str = "atropos-sandbox", image: str = "atropos-sandbox:local", # Use :local tag to avoid registry pull count: int = 1, slots_per_container: int = 10, privileged: bool = False, cpu: int = 500, memory: int = 512, port: int = 8080, datacenter: str = "dc1", driver: str = "docker", # "docker" or "singularity" singularity_image: str = None, # Path to .sif file for singularity driver ) -> Dict[str, Any]: """ Create a sandbox job specification. This job runs the sandbox_server.py inside a container, with the specified number of slots for agent workspaces. Args: job_id: Unique job identifier image: Docker image to use (for docker driver) count: Number of container instances slots_per_container: Number of slots per container privileged: Run container in privileged mode (recommended for bubblewrap) cpu: CPU allocation in MHz memory: Memory allocation in MB port: HTTP port for sandbox server datacenter: Nomad datacenter driver: Container driver - "docker" or "singularity" singularity_image: Path to .sif file (required if driver="singularity") Returns: Job specification dict """ # Build task config based on driver if driver == "singularity": if not singularity_image: raise ValueError("singularity_image path required when driver='singularity'") # Use raw_exec driver to run apptainer via shell for variable expansion # The container binds the allocation directory for workspace persistence # For raw_exec, we use static port since Nomad's dynamic port mapping doesn't # work the same as Docker - the process runs directly on the host. shell_cmd = ( f'apptainer run ' f'--bind "$NOMAD_ALLOC_DIR/data:/data" ' f'--pwd /app ' f'--env PYTHONUNBUFFERED=1 ' f'{singularity_image} ' f'python sandbox_server.py ' f'--port {port} ' f'--slots {slots_per_container} ' f'--data-dir /data' ) task_config = { "command": "/bin/sh", "args": ["-c", shell_cmd], } task_driver = "raw_exec" else: # Docker driver (default) task_config = { "image": image, "force_pull": False, # Use local image, don't try to pull "ports": ["http"], "privileged": privileged, "command": "python", "args": [ "sandbox_server.py", "--port", str(port), "--slots", str(slots_per_container), "--data-dir", "/data", ], # Note: On Linux, you can mount persistent storage: # "volumes": ["${NOMAD_ALLOC_DIR}/data:/data"], # On macOS/Docker Desktop, skip volumes for PoC # (container /data is ephemeral but works for testing) } task_driver = "docker" # For Singularity/raw_exec, use static ports since the process runs directly on host. # For Docker, use dynamic ports with port mapping. if driver == "singularity": network_config = { "Mode": "host", "ReservedPorts": [ { "Label": "http", "Value": port, } ], } else: network_config = { "Mode": "host", "DynamicPorts": [ { "Label": "http", "To": port, } ], } return { "ID": job_id, "Name": job_id, "Type": "service", "Datacenters": [datacenter], "TaskGroups": [ { "Name": "sandbox", "Count": count, # Speed up deployments and avoid Consul checks. Without this, Nomad may # keep an "active deployment" around for the default MinHealthyTime, # which blocks immediate scaling under load. "Update": { "HealthCheck": "task_states", "MinHealthyTime": 0, }, "Networks": [network_config], "Tasks": [ { "Name": "sandbox-server", "Driver": task_driver, "Config": task_config, "Env": { "PYTHONUNBUFFERED": "1", "NOMAD_ALLOC_DIR": "${NOMAD_ALLOC_DIR}", }, "Resources": { "CPU": cpu, "MemoryMB": memory, }, # Note: Services with Checks require Consul, which we skip for the PoC } ], "RestartPolicy": { "Attempts": 3, "Interval": 300_000_000_000, # 5 minutes "Delay": 10_000_000_000, # 10 seconds "Mode": "delay", }, "ReschedulePolicy": { "Attempts": 5, "Interval": 3600_000_000_000, # 1 hour "Delay": 30_000_000_000, # 30 seconds "DelayFunction": "exponential", "MaxDelay": 300_000_000_000, # 5 minutes "Unlimited": False, }, } ], }