feat: devex help, add Makefile, ruff, pre-commit, and modernize CI

Brooklyn Nicholson 2026-03-09 20:36:51 -05:00
parent 172a38c344
commit f4d7e6a29e
111 changed files with 11655 additions and 10200 deletions


@@ -38,21 +38,27 @@ Configuration:
Usage:
from mixture_of_agents_tool import mixture_of_agents_tool
import asyncio
# Process a complex query
result = await mixture_of_agents_tool(
user_prompt="Solve this complex mathematical proof..."
)
"""
+import asyncio
+import datetime
import json
import logging
import os
-import asyncio
-import datetime
-from typing import Dict, Any, List, Optional
-from tools.openrouter_client import get_async_client as _get_openrouter_client, check_api_key as check_openrouter_api_key
+from typing import Any
from tools.debug_helpers import DebugSession
+from tools.openrouter_client import (
+    check_api_key as check_openrouter_api_key,
+)
+from tools.openrouter_client import (
+    get_async_client as _get_openrouter_client,
+)
logger = logging.getLogger(__name__)
@@ -60,9 +66,9 @@ logger = logging.getLogger(__name__)
# Reference models - these generate diverse initial responses in parallel (OpenRouter slugs)
REFERENCE_MODELS = [
"anthropic/claude-opus-4.5",
"google/gemini-3-pro-preview",
"google/gemini-3-pro-preview",
"openai/gpt-5.2-pro",
"deepseek/deepseek-v3.2"
"deepseek/deepseek-v3.2",
]
# Aggregator model - synthesizes reference responses into final output
@@ -83,18 +89,18 @@ Responses from models:"""
_debug = DebugSession("moa_tools", env_var="MOA_TOOLS_DEBUG")
-def _construct_aggregator_prompt(system_prompt: str, responses: List[str]) -> str:
+def _construct_aggregator_prompt(system_prompt: str, responses: list[str]) -> str:
"""
Construct the final system prompt for the aggregator including all model responses.
Args:
system_prompt (str): Base system prompt for aggregation
responses (List[str]): List of responses from reference models
Returns:
str: Complete system prompt with enumerated responses
"""
-response_text = "\n".join([f"{i+1}. {response}" for i, response in enumerate(responses)])
+response_text = "\n".join([f"{i + 1}. {response}" for i, response in enumerate(responses)])
return f"{system_prompt}\n\n{response_text}"
@@ -103,48 +109,43 @@ async def _run_reference_model_safe(
user_prompt: str,
temperature: float = REFERENCE_TEMPERATURE,
max_tokens: int = 32000,
-max_retries: int = 6
+max_retries: int = 6,
) -> tuple[str, str, bool]:
"""
Run a single reference model with retry logic and graceful failure handling.
Args:
model (str): Model identifier to use
user_prompt (str): The user's query
temperature (float): Sampling temperature for response generation
max_tokens (int): Maximum tokens in response
max_retries (int): Maximum number of retry attempts
Returns:
tuple[str, str, bool]: (model_name, response_content_or_error, success_flag)
"""
for attempt in range(max_retries):
try:
logger.info("Querying %s (attempt %s/%s)", model, attempt + 1, max_retries)
# Build parameters for the API call
api_params = {
"model": model,
"messages": [{"role": "user", "content": user_prompt}],
"extra_body": {
"reasoning": {
"enabled": True,
"effort": "xhigh"
}
}
"extra_body": {"reasoning": {"enabled": True, "effort": "xhigh"}},
}
# GPT models (especially gpt-4o-mini) don't support custom temperature values
# Only include temperature for non-GPT models
-if not model.lower().startswith('gpt-'):
+if not model.lower().startswith("gpt-"):
api_params["temperature"] = temperature
response = await _get_openrouter_client().chat.completions.create(**api_params)
content = response.choices[0].message.content.strip()
logger.info("%s responded (%s characters)", model, len(content))
return model, content, True
except Exception as e:
error_str = str(e)
# Log more detailed error information for debugging
@@ -154,7 +155,7 @@ async def _run_reference_model_safe(
logger.warning("%s rate limit error (attempt %s): %s", model, attempt + 1, error_str)
else:
logger.warning("%s unknown error (attempt %s): %s", model, attempt + 1, error_str)
if attempt < max_retries - 1:
# Exponential backoff for rate limiting: 2s, 4s, 8s, 16s, 32s, 60s
sleep_time = min(2 ** (attempt + 1), 60)
@@ -167,60 +168,47 @@ async def _run_reference_model_safe(
async def _run_aggregator_model(
-    system_prompt: str,
-    user_prompt: str,
-    temperature: float = AGGREGATOR_TEMPERATURE,
-    max_tokens: int = None
+    system_prompt: str, user_prompt: str, temperature: float = AGGREGATOR_TEMPERATURE, max_tokens: int = None
) -> str:
"""
Run the aggregator model to synthesize the final response.
Args:
system_prompt (str): System prompt with all reference responses
user_prompt (str): Original user query
temperature (float): Focused temperature for consistent aggregation
max_tokens (int): Maximum tokens in final response
Returns:
str: Synthesized final response
"""
logger.info("Running aggregator model: %s", AGGREGATOR_MODEL)
# Build parameters for the API call
api_params = {
"model": AGGREGATOR_MODEL,
"messages": [
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt}
],
"extra_body": {
"reasoning": {
"enabled": True,
"effort": "xhigh"
}
}
"messages": [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}],
"extra_body": {"reasoning": {"enabled": True, "effort": "xhigh"}},
}
# GPT models (especially gpt-4o-mini) don't support custom temperature values
# Only include temperature for non-GPT models
-if not AGGREGATOR_MODEL.lower().startswith('gpt-'):
+if not AGGREGATOR_MODEL.lower().startswith("gpt-"):
api_params["temperature"] = temperature
response = await _get_openrouter_client().chat.completions.create(**api_params)
content = response.choices[0].message.content.strip()
logger.info("Aggregation complete (%s characters)", len(content))
return content
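# A minimal sketch (hypothetical responses) of how the two helpers compose: the
# aggregator receives every surviving reference answer enumerated in its system prompt.
#
#   sys_prompt = _construct_aggregator_prompt(AGGREGATOR_SYSTEM_PROMPT, ["Answer A", "Answer B"])
#   final = await _run_aggregator_model(sys_prompt, "Original user question")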
async def mixture_of_agents_tool(
-    user_prompt: str,
-    reference_models: Optional[List[str]] = None,
-    aggregator_model: Optional[str] = None
+    user_prompt: str, reference_models: list[str] | None = None, aggregator_model: str | None = None
) -> str:
"""
Process a complex query using the Mixture-of-Agents methodology.
This tool leverages multiple frontier language models to collaboratively solve
extremely difficult problems requiring intense reasoning. It's particularly
effective for:
@@ -229,16 +217,16 @@ async def mixture_of_agents_tool(
- Multi-step analytical reasoning tasks
- Problems requiring diverse domain expertise
- Tasks where single models show limitations
The MoA approach uses a fixed 2-layer architecture:
1. Layer 1: Multiple reference models generate diverse responses in parallel (temp=0.6)
2. Layer 2: Aggregator model synthesizes the best elements into final response (temp=0.4)
Args:
user_prompt (str): The complex query or problem to solve
reference_models (Optional[List[str]]): Custom reference models to use
aggregator_model (Optional[str]): Custom aggregator model to use
Returns:
str: JSON string containing the MoA results with the following structure:
{
@@ -250,12 +238,12 @@ async def mixture_of_agents_tool(
},
"processing_time": float
}
Raises:
Exception: If MoA processing fails or API key is not set
"""
start_time = datetime.datetime.now()
debug_call_data = {
"parameters": {
"user_prompt": user_prompt[:200] + "..." if len(user_prompt) > 200 else user_prompt,
@@ -263,7 +251,7 @@ async def mixture_of_agents_tool(
"aggregator_model": aggregator_model or AGGREGATOR_MODEL,
"reference_temperature": REFERENCE_TEMPERATURE,
"aggregator_temperature": AGGREGATOR_TEMPERATURE,
"min_successful_references": MIN_SUCCESSFUL_REFERENCES
"min_successful_references": MIN_SUCCESSFUL_REFERENCES,
},
"error": None,
"success": False,
@@ -272,161 +260,152 @@ async def mixture_of_agents_tool(
"failed_models": [],
"final_response_length": 0,
"processing_time_seconds": 0,
"models_used": {}
"models_used": {},
}
try:
logger.info("Starting Mixture-of-Agents processing...")
logger.info("Query: %s", user_prompt[:100])
# Validate API key availability
if not os.getenv("OPENROUTER_API_KEY"):
raise ValueError("OPENROUTER_API_KEY environment variable not set")
# Use provided models or defaults
ref_models = reference_models or REFERENCE_MODELS
agg_model = aggregator_model or AGGREGATOR_MODEL
logger.info("Using %s reference models in 2-layer MoA architecture", len(ref_models))
# Layer 1: Generate diverse responses from reference models (with failure handling)
logger.info("Layer 1: Generating reference responses...")
-model_results = await asyncio.gather(*[
-    _run_reference_model_safe(model, user_prompt, REFERENCE_TEMPERATURE)
-    for model in ref_models
-])
+model_results = await asyncio.gather(
+    *[_run_reference_model_safe(model, user_prompt, REFERENCE_TEMPERATURE) for model in ref_models]
+)
# Separate successful and failed responses
successful_responses = []
failed_models = []
for model_name, content, success in model_results:
if success:
successful_responses.append(content)
else:
failed_models.append(model_name)
successful_count = len(successful_responses)
failed_count = len(failed_models)
logger.info("Reference model results: %s successful, %s failed", successful_count, failed_count)
if failed_models:
logger.warning("Failed models: %s", ', '.join(failed_models))
logger.warning("Failed models: %s", ", ".join(failed_models))
# Check if we have enough successful responses to proceed
if successful_count < MIN_SUCCESSFUL_REFERENCES:
raise ValueError(f"Insufficient successful reference models ({successful_count}/{len(ref_models)}). Need at least {MIN_SUCCESSFUL_REFERENCES} successful responses.")
raise ValueError(
f"Insufficient successful reference models ({successful_count}/{len(ref_models)}). Need at least {MIN_SUCCESSFUL_REFERENCES} successful responses."
)
debug_call_data["reference_responses_count"] = successful_count
debug_call_data["failed_models_count"] = failed_count
debug_call_data["failed_models"] = failed_models
# Layer 2: Aggregate responses using the aggregator model
logger.info("Layer 2: Synthesizing final response...")
-aggregator_system_prompt = _construct_aggregator_prompt(
-    AGGREGATOR_SYSTEM_PROMPT,
-    successful_responses
-)
-final_response = await _run_aggregator_model(
-    aggregator_system_prompt,
-    user_prompt,
-    AGGREGATOR_TEMPERATURE
-)
+aggregator_system_prompt = _construct_aggregator_prompt(AGGREGATOR_SYSTEM_PROMPT, successful_responses)
+final_response = await _run_aggregator_model(aggregator_system_prompt, user_prompt, AGGREGATOR_TEMPERATURE)
# Calculate processing time
end_time = datetime.datetime.now()
processing_time = (end_time - start_time).total_seconds()
logger.info("MoA processing completed in %.2f seconds", processing_time)
# Prepare successful response (only final aggregated result, minimal fields)
result = {
"success": True,
"response": final_response,
"models_used": {
"reference_models": ref_models,
"aggregator_model": agg_model
}
"models_used": {"reference_models": ref_models, "aggregator_model": agg_model},
}
debug_call_data["success"] = True
debug_call_data["final_response_length"] = len(final_response)
debug_call_data["processing_time_seconds"] = processing_time
debug_call_data["models_used"] = result["models_used"]
# Log debug information
_debug.log_call("mixture_of_agents_tool", debug_call_data)
_debug.save()
return json.dumps(result, indent=2, ensure_ascii=False)
except Exception as e:
error_msg = f"Error in MoA processing: {str(e)}"
logger.error("%s", error_msg)
# Calculate processing time even for errors
end_time = datetime.datetime.now()
processing_time = (end_time - start_time).total_seconds()
# Prepare error response (minimal fields)
result = {
"success": False,
"response": "MoA processing failed. Please try again or use a single model for this query.",
"models_used": {
"reference_models": reference_models or REFERENCE_MODELS,
"aggregator_model": aggregator_model or AGGREGATOR_MODEL
"aggregator_model": aggregator_model or AGGREGATOR_MODEL,
},
"error": error_msg
"error": error_msg,
}
debug_call_data["error"] = error_msg
debug_call_data["processing_time_seconds"] = processing_time
_debug.log_call("mixture_of_agents_tool", debug_call_data)
_debug.save()
return json.dumps(result, indent=2, ensure_ascii=False)
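# A minimal end-to-end usage sketch (hypothetical prompt; assumes OPENROUTER_API_KEY is set):
#
#   import asyncio
#   import json
#
#   async def main():
#       raw = await mixture_of_agents_tool(user_prompt="Prove that sqrt(2) is irrational.")
#       result = json.loads(raw)
#       print(result["response"] if result["success"] else result["error"])
#
#   asyncio.run(main())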
def check_moa_requirements() -> bool:
"""
Check if all requirements for MoA tools are met.
Returns:
bool: True if requirements are met, False otherwise
"""
return check_openrouter_api_key()
-def get_debug_session_info() -> Dict[str, Any]:
+def get_debug_session_info() -> dict[str, Any]:
"""
Get information about the current debug session.
Returns:
Dict[str, Any]: Dictionary containing debug session information
"""
return _debug.get_session_info()
-def get_available_models() -> Dict[str, List[str]]:
+def get_available_models() -> dict[str, list[str]]:
"""
Get information about available models for MoA processing.
Returns:
Dict[str, List[str]]: Dictionary with reference and aggregator models
"""
return {
"reference_models": REFERENCE_MODELS,
"aggregator_models": [AGGREGATOR_MODEL],
"supported_models": REFERENCE_MODELS + [AGGREGATOR_MODEL]
"supported_models": REFERENCE_MODELS + [AGGREGATOR_MODEL],
}
-def get_moa_configuration() -> Dict[str, Any]:
+def get_moa_configuration() -> dict[str, Any]:
"""
Get the current MoA configuration settings.
Returns:
Dict[str, Any]: Dictionary containing all configuration parameters
"""
@@ -437,7 +416,7 @@ def get_moa_configuration() -> dict[str, Any]:
"aggregator_temperature": AGGREGATOR_TEMPERATURE,
"min_successful_references": MIN_SUCCESSFUL_REFERENCES,
"total_reference_models": len(REFERENCE_MODELS),
"failure_tolerance": f"{len(REFERENCE_MODELS) - MIN_SUCCESSFUL_REFERENCES}/{len(REFERENCE_MODELS)} models can fail"
"failure_tolerance": f"{len(REFERENCE_MODELS) - MIN_SUCCESSFUL_REFERENCES}/{len(REFERENCE_MODELS)} models can fail",
}
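# Worked example of the failure_tolerance string above, with hypothetical values:
# if len(REFERENCE_MODELS) == 4 and MIN_SUCCESSFUL_REFERENCES == 2, it renders as
# "2/4 models can fail".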
@@ -447,10 +426,10 @@ if __name__ == "__main__":
"""
print("🤖 Mixture-of-Agents Tool Module")
print("=" * 50)
# Check if API key is available
api_available = check_openrouter_api_key()
if not api_available:
print("❌ OPENROUTER_API_KEY environment variable not set")
print("Please set your API key: export OPENROUTER_API_KEY='your-key-here'")
@@ -458,26 +437,26 @@ if __name__ == "__main__":
exit(1)
else:
print("✅ OpenRouter API key found")
print("🛠️ MoA tools ready for use!")
# Show current configuration
config = get_moa_configuration()
print(f"\n⚙️ Current Configuration:")
print("\n⚙️ Current Configuration:")
print(f" 🤖 Reference models ({len(config['reference_models'])}): {', '.join(config['reference_models'])}")
print(f" 🧠 Aggregator model: {config['aggregator_model']}")
print(f" 🌡️ Reference temperature: {config['reference_temperature']}")
print(f" 🌡️ Aggregator temperature: {config['aggregator_temperature']}")
print(f" 🛡️ Failure tolerance: {config['failure_tolerance']}")
print(f" 📊 Minimum successful models: {config['min_successful_references']}")
# Show debug mode status
if _debug.active:
print(f"\n🐛 Debug mode ENABLED - Session ID: {_debug.session_id}")
print(f" Debug logs will be saved to: ./logs/moa_tools_debug_{_debug.session_id}.json")
else:
print("\n🐛 Debug mode disabled (set MOA_TOOLS_DEBUG=true to enable)")
print("\nBasic usage:")
print(" from mixture_of_agents_tool import mixture_of_agents_tool")
print(" import asyncio")
@@ -488,24 +467,26 @@ if __name__ == "__main__":
print(" )")
print(" print(result)")
print(" asyncio.run(main())")
print("\nBest use cases:")
print(" - Complex mathematical proofs and calculations")
print(" - Advanced coding problems and algorithm design")
print(" - Multi-step analytical reasoning tasks")
print(" - Problems requiring diverse domain expertise")
print(" - Tasks where single models show limitations")
print("\nPerformance characteristics:")
print(" - Higher latency due to multiple model calls")
print(" - Significantly improved quality for complex tasks")
print(" - Parallel processing for efficiency")
print(f" - Optimized temperatures: {REFERENCE_TEMPERATURE} for reference models, {AGGREGATOR_TEMPERATURE} for aggregation")
print(
f" - Optimized temperatures: {REFERENCE_TEMPERATURE} for reference models, {AGGREGATOR_TEMPERATURE} for aggregation"
)
print(" - Token-efficient: only returns final aggregated response")
print(" - Resilient: continues with partial model failures")
print(f" - Configurable: easy to modify models and settings at top of file")
print(" - Configurable: easy to modify models and settings at top of file")
print(" - State-of-the-art results on challenging benchmarks")
print("\nDebug mode:")
print(" # Enable debug logging")
print(" export MOA_TOOLS_DEBUG=true")
@@ -526,11 +507,11 @@ MOA_SCHEMA = {
"properties": {
"user_prompt": {
"type": "string",
"description": "The complex query or problem to solve using multiple AI models. Should be a challenging problem that benefits from diverse perspectives and collaborative reasoning."
"description": "The complex query or problem to solve using multiple AI models. Should be a challenging problem that benefits from diverse perspectives and collaborative reasoning.",
}
},
"required": ["user_prompt"]
}
"required": ["user_prompt"],
},
}
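# A minimal sketch of a tool-call payload satisfying the schema above
# ("user_prompt" is the only required property; the prompt itself is hypothetical):
#
#   args = {"user_prompt": "Find a closed form for the sum of the first n cubes."}
#   raw = await mixture_of_agents_tool(**args)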
registry.register(