feat: devex help, add Makefile, ruff, pre-commit, and modernize CI

Brooklyn Nicholson 2026-03-09 20:36:51 -05:00
parent 172a38c344
commit f4d7e6a29e
111 changed files with 11655 additions and 10200 deletions


@@ -38,21 +38,27 @@ Configuration:
Usage:
from mixture_of_agents_tool import mixture_of_agents_tool
import asyncio
# Process a complex query
result = await mixture_of_agents_tool(
user_prompt="Solve this complex mathematical proof..."
)
"""
+import asyncio
+import datetime
import json
import logging
import os
-import asyncio
-import datetime
-from typing import Dict, Any, List, Optional
-from tools.openrouter_client import get_async_client as _get_openrouter_client, check_api_key as check_openrouter_api_key
+from typing import Any
from tools.debug_helpers import DebugSession
+from tools.openrouter_client import (
+    check_api_key as check_openrouter_api_key,
+)
+from tools.openrouter_client import (
+    get_async_client as _get_openrouter_client,
+)
logger = logging.getLogger(__name__)
@@ -60,9 +66,9 @@ logger = logging.getLogger(__name__)
# Reference models - these generate diverse initial responses in parallel (OpenRouter slugs)
REFERENCE_MODELS = [
"anthropic/claude-opus-4.5",
"google/gemini-3-pro-preview",
"google/gemini-3-pro-preview",
"openai/gpt-5.2-pro",
"deepseek/deepseek-v3.2"
"deepseek/deepseek-v3.2",
]
# Aggregator model - synthesizes reference responses into final output
@@ -83,18 +89,18 @@ Responses from models:"""
_debug = DebugSession("moa_tools", env_var="MOA_TOOLS_DEBUG")
-def _construct_aggregator_prompt(system_prompt: str, responses: List[str]) -> str:
+def _construct_aggregator_prompt(system_prompt: str, responses: list[str]) -> str:
"""
Construct the final system prompt for the aggregator including all model responses.
Args:
system_prompt (str): Base system prompt for aggregation
responses (List[str]): List of responses from reference models
Returns:
str: Complete system prompt with enumerated responses
"""
-response_text = "\n".join([f"{i+1}. {response}" for i, response in enumerate(responses)])
+response_text = "\n".join([f"{i + 1}. {response}" for i, response in enumerate(responses)])
return f"{system_prompt}\n\n{response_text}"
@@ -103,48 +109,43 @@ async def _run_reference_model_safe(
user_prompt: str,
temperature: float = REFERENCE_TEMPERATURE,
max_tokens: int = 32000,
-max_retries: int = 6
+max_retries: int = 6,
) -> tuple[str, str, bool]:
"""
Run a single reference model with retry logic and graceful failure handling.
Args:
model (str): Model identifier to use
user_prompt (str): The user's query
temperature (float): Sampling temperature for response generation
max_tokens (int): Maximum tokens in response
max_retries (int): Maximum number of retry attempts
Returns:
tuple[str, str, bool]: (model_name, response_content_or_error, success_flag)
"""
for attempt in range(max_retries):
try:
logger.info("Querying %s (attempt %s/%s)", model, attempt + 1, max_retries)
# Build parameters for the API call
api_params = {
"model": model,
"messages": [{"role": "user", "content": user_prompt}],
"extra_body": {
"reasoning": {
"enabled": True,
"effort": "xhigh"
}
}
"extra_body": {"reasoning": {"enabled": True, "effort": "xhigh"}},
}
# GPT models (especially gpt-4o-mini) don't support custom temperature values
# Only include temperature for non-GPT models
-if not model.lower().startswith('gpt-'):
+if not model.lower().startswith("gpt-"):
api_params["temperature"] = temperature
response = await _get_openrouter_client().chat.completions.create(**api_params)
content = response.choices[0].message.content.strip()
logger.info("%s responded (%s characters)", model, len(content))
return model, content, True
except Exception as e:
error_str = str(e)
# Log more detailed error information for debugging
@@ -154,7 +155,7 @@ async def _run_reference_model_safe(
logger.warning("%s rate limit error (attempt %s): %s", model, attempt + 1, error_str)
else:
logger.warning("%s unknown error (attempt %s): %s", model, attempt + 1, error_str)
if attempt < max_retries - 1:
# Exponential backoff for rate limiting: 2s, 4s, 8s, 16s, 32s, 60s
sleep_time = min(2 ** (attempt + 1), 60)
@@ -167,60 +168,47 @@ async def _run_reference_model_safe(
async def _run_aggregator_model(
-    system_prompt: str,
-    user_prompt: str,
-    temperature: float = AGGREGATOR_TEMPERATURE,
-    max_tokens: int = None
+    system_prompt: str, user_prompt: str, temperature: float = AGGREGATOR_TEMPERATURE, max_tokens: int = None
) -> str:
"""
Run the aggregator model to synthesize the final response.
Args:
system_prompt (str): System prompt with all reference responses
user_prompt (str): Original user query
temperature (float): Focused temperature for consistent aggregation
max_tokens (int): Maximum tokens in final response
Returns:
str: Synthesized final response
"""
logger.info("Running aggregator model: %s", AGGREGATOR_MODEL)
# Build parameters for the API call
api_params = {
"model": AGGREGATOR_MODEL,
"messages": [
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt}
],
"extra_body": {
"reasoning": {
"enabled": True,
"effort": "xhigh"
}
}
"messages": [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}],
"extra_body": {"reasoning": {"enabled": True, "effort": "xhigh"}},
}
# GPT models (especially gpt-4o-mini) don't support custom temperature values
# Only include temperature for non-GPT models
-if not AGGREGATOR_MODEL.lower().startswith('gpt-'):
+if not AGGREGATOR_MODEL.lower().startswith("gpt-"):
api_params["temperature"] = temperature
response = await _get_openrouter_client().chat.completions.create(**api_params)
content = response.choices[0].message.content.strip()
logger.info("Aggregation complete (%s characters)", len(content))
return content
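# A minimal sketch (hypothetical responses) of how the two helpers compose: the
# aggregator receives every surviving reference answer enumerated in its system prompt.
#
#   sys_prompt = _construct_aggregator_prompt(AGGREGATOR_SYSTEM_PROMPT, ["Answer A", "Answer B"])
#   final = await _run_aggregator_model(sys_prompt, "Original user question")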
async def mixture_of_agents_tool(
-    user_prompt: str,
-    reference_models: Optional[List[str]] = None,
-    aggregator_model: Optional[str] = None
+    user_prompt: str, reference_models: list[str] | None = None, aggregator_model: str | None = None
) -> str:
"""
Process a complex query using the Mixture-of-Agents methodology.
This tool leverages multiple frontier language models to collaboratively solve
extremely difficult problems requiring intense reasoning. It's particularly
effective for:
@@ -229,16 +217,16 @@ async def mixture_of_agents_tool(
- Multi-step analytical reasoning tasks
- Problems requiring diverse domain expertise
- Tasks where single models show limitations
The MoA approach uses a fixed 2-layer architecture:
1. Layer 1: Multiple reference models generate diverse responses in parallel (temp=0.6)
2. Layer 2: Aggregator model synthesizes the best elements into final response (temp=0.4)
Args:
user_prompt (str): The complex query or problem to solve
reference_models (Optional[List[str]]): Custom reference models to use
aggregator_model (Optional[str]): Custom aggregator model to use
Returns:
str: JSON string containing the MoA results with the following structure:
{
@@ -250,12 +238,12 @@ async def mixture_of_agents_tool(
},
"processing_time": float
}
Raises:
Exception: If MoA processing fails or API key is not set
"""
start_time = datetime.datetime.now()
debug_call_data = {
"parameters": {
"user_prompt": user_prompt[:200] + "..." if len(user_prompt) > 200 else user_prompt,
@@ -263,7 +251,7 @@ async def mixture_of_agents_tool(
"aggregator_model": aggregator_model or AGGREGATOR_MODEL,
"reference_temperature": REFERENCE_TEMPERATURE,
"aggregator_temperature": AGGREGATOR_TEMPERATURE,
"min_successful_references": MIN_SUCCESSFUL_REFERENCES
"min_successful_references": MIN_SUCCESSFUL_REFERENCES,
},
"error": None,
"success": False,
@@ -272,161 +260,152 @@ async def mixture_of_agents_tool(
"failed_models": [],
"final_response_length": 0,
"processing_time_seconds": 0,
"models_used": {}
"models_used": {},
}
try:
logger.info("Starting Mixture-of-Agents processing...")
logger.info("Query: %s", user_prompt[:100])
# Validate API key availability
if not os.getenv("OPENROUTER_API_KEY"):
raise ValueError("OPENROUTER_API_KEY environment variable not set")
# Use provided models or defaults
ref_models = reference_models or REFERENCE_MODELS
agg_model = aggregator_model or AGGREGATOR_MODEL
logger.info("Using %s reference models in 2-layer MoA architecture", len(ref_models))
# Layer 1: Generate diverse responses from reference models (with failure handling)
logger.info("Layer 1: Generating reference responses...")
-model_results = await asyncio.gather(*[
-    _run_reference_model_safe(model, user_prompt, REFERENCE_TEMPERATURE)
-    for model in ref_models
-])
+model_results = await asyncio.gather(
+    *[_run_reference_model_safe(model, user_prompt, REFERENCE_TEMPERATURE) for model in ref_models]
+)
# Separate successful and failed responses
successful_responses = []
failed_models = []
for model_name, content, success in model_results:
if success:
successful_responses.append(content)
else:
failed_models.append(model_name)
successful_count = len(successful_responses)
failed_count = len(failed_models)
logger.info("Reference model results: %s successful, %s failed", successful_count, failed_count)
if failed_models:
logger.warning("Failed models: %s", ', '.join(failed_models))
logger.warning("Failed models: %s", ", ".join(failed_models))
# Check if we have enough successful responses to proceed
if successful_count < MIN_SUCCESSFUL_REFERENCES:
raise ValueError(f"Insufficient successful reference models ({successful_count}/{len(ref_models)}). Need at least {MIN_SUCCESSFUL_REFERENCES} successful responses.")
raise ValueError(
f"Insufficient successful reference models ({successful_count}/{len(ref_models)}). Need at least {MIN_SUCCESSFUL_REFERENCES} successful responses."
)
debug_call_data["reference_responses_count"] = successful_count
debug_call_data["failed_models_count"] = failed_count
debug_call_data["failed_models"] = failed_models
# Layer 2: Aggregate responses using the aggregator model
logger.info("Layer 2: Synthesizing final response...")
-aggregator_system_prompt = _construct_aggregator_prompt(
-    AGGREGATOR_SYSTEM_PROMPT,
-    successful_responses
-)
-final_response = await _run_aggregator_model(
-    aggregator_system_prompt,
-    user_prompt,
-    AGGREGATOR_TEMPERATURE
-)
+aggregator_system_prompt = _construct_aggregator_prompt(AGGREGATOR_SYSTEM_PROMPT, successful_responses)
+final_response = await _run_aggregator_model(aggregator_system_prompt, user_prompt, AGGREGATOR_TEMPERATURE)
# Calculate processing time
end_time = datetime.datetime.now()
processing_time = (end_time - start_time).total_seconds()
logger.info("MoA processing completed in %.2f seconds", processing_time)
# Prepare successful response (only final aggregated result, minimal fields)
result = {
"success": True,
"response": final_response,
"models_used": {
"reference_models": ref_models,
"aggregator_model": agg_model
}
"models_used": {"reference_models": ref_models, "aggregator_model": agg_model},
}
debug_call_data["success"] = True
debug_call_data["final_response_length"] = len(final_response)
debug_call_data["processing_time_seconds"] = processing_time
debug_call_data["models_used"] = result["models_used"]
# Log debug information
_debug.log_call("mixture_of_agents_tool", debug_call_data)
_debug.save()
return json.dumps(result, indent=2, ensure_ascii=False)
except Exception as e:
error_msg = f"Error in MoA processing: {str(e)}"
logger.error("%s", error_msg)
# Calculate processing time even for errors
end_time = datetime.datetime.now()
processing_time = (end_time - start_time).total_seconds()
# Prepare error response (minimal fields)
result = {
"success": False,
"response": "MoA processing failed. Please try again or use a single model for this query.",
"models_used": {
"reference_models": reference_models or REFERENCE_MODELS,
"aggregator_model": aggregator_model or AGGREGATOR_MODEL
"aggregator_model": aggregator_model or AGGREGATOR_MODEL,
},
"error": error_msg
"error": error_msg,
}
debug_call_data["error"] = error_msg
debug_call_data["processing_time_seconds"] = processing_time
_debug.log_call("mixture_of_agents_tool", debug_call_data)
_debug.save()
return json.dumps(result, indent=2, ensure_ascii=False)
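# A minimal end-to-end usage sketch (hypothetical prompt; assumes OPENROUTER_API_KEY is set):
#
#   import asyncio
#   import json
#
#   async def main():
#       raw = await mixture_of_agents_tool(user_prompt="Prove that sqrt(2) is irrational.")
#       result = json.loads(raw)
#       print(result["response"] if result["success"] else result["error"])
#
#   asyncio.run(main())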
def check_moa_requirements() -> bool:
"""
Check if all requirements for MoA tools are met.
Returns:
bool: True if requirements are met, False otherwise
"""
return check_openrouter_api_key()
-def get_debug_session_info() -> Dict[str, Any]:
+def get_debug_session_info() -> dict[str, Any]:
"""
Get information about the current debug session.
Returns:
Dict[str, Any]: Dictionary containing debug session information
"""
return _debug.get_session_info()
-def get_available_models() -> Dict[str, List[str]]:
+def get_available_models() -> dict[str, list[str]]:
"""
Get information about available models for MoA processing.
Returns:
Dict[str, List[str]]: Dictionary with reference and aggregator models
"""
return {
"reference_models": REFERENCE_MODELS,
"aggregator_models": [AGGREGATOR_MODEL],
"supported_models": REFERENCE_MODELS + [AGGREGATOR_MODEL]
"supported_models": REFERENCE_MODELS + [AGGREGATOR_MODEL],
}
-def get_moa_configuration() -> Dict[str, Any]:
+def get_moa_configuration() -> dict[str, Any]:
"""
Get the current MoA configuration settings.
Returns:
Dict[str, Any]: Dictionary containing all configuration parameters
"""
@@ -437,7 +416,7 @@ def get_moa_configuration() -> dict[str, Any]:
"aggregator_temperature": AGGREGATOR_TEMPERATURE,
"min_successful_references": MIN_SUCCESSFUL_REFERENCES,
"total_reference_models": len(REFERENCE_MODELS),
"failure_tolerance": f"{len(REFERENCE_MODELS) - MIN_SUCCESSFUL_REFERENCES}/{len(REFERENCE_MODELS)} models can fail"
"failure_tolerance": f"{len(REFERENCE_MODELS) - MIN_SUCCESSFUL_REFERENCES}/{len(REFERENCE_MODELS)} models can fail",
}
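# Worked example of the failure_tolerance string above, with hypothetical values:
# if len(REFERENCE_MODELS) == 4 and MIN_SUCCESSFUL_REFERENCES == 2, it renders as
# "2/4 models can fail".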
@@ -447,10 +426,10 @@ if __name__ == "__main__":
"""
print("🤖 Mixture-of-Agents Tool Module")
print("=" * 50)
# Check if API key is available
api_available = check_openrouter_api_key()
if not api_available:
print("❌ OPENROUTER_API_KEY environment variable not set")
print("Please set your API key: export OPENROUTER_API_KEY='your-key-here'")
@@ -458,26 +437,26 @@ if __name__ == "__main__":
exit(1)
else:
print("✅ OpenRouter API key found")
print("🛠️ MoA tools ready for use!")
# Show current configuration
config = get_moa_configuration()
print(f"\n⚙️ Current Configuration:")
print("\n⚙️ Current Configuration:")
print(f" 🤖 Reference models ({len(config['reference_models'])}): {', '.join(config['reference_models'])}")
print(f" 🧠 Aggregator model: {config['aggregator_model']}")
print(f" 🌡️ Reference temperature: {config['reference_temperature']}")
print(f" 🌡️ Aggregator temperature: {config['aggregator_temperature']}")
print(f" 🛡️ Failure tolerance: {config['failure_tolerance']}")
print(f" 📊 Minimum successful models: {config['min_successful_references']}")
# Show debug mode status
if _debug.active:
print(f"\n🐛 Debug mode ENABLED - Session ID: {_debug.session_id}")
print(f" Debug logs will be saved to: ./logs/moa_tools_debug_{_debug.session_id}.json")
else:
print("\n🐛 Debug mode disabled (set MOA_TOOLS_DEBUG=true to enable)")
print("\nBasic usage:")
print(" from mixture_of_agents_tool import mixture_of_agents_tool")
print(" import asyncio")
@@ -488,24 +467,26 @@ if __name__ == "__main__":
print(" )")
print(" print(result)")
print(" asyncio.run(main())")
print("\nBest use cases:")
print(" - Complex mathematical proofs and calculations")
print(" - Advanced coding problems and algorithm design")
print(" - Multi-step analytical reasoning tasks")
print(" - Problems requiring diverse domain expertise")
print(" - Tasks where single models show limitations")
print("\nPerformance characteristics:")
print(" - Higher latency due to multiple model calls")
print(" - Significantly improved quality for complex tasks")
print(" - Parallel processing for efficiency")
print(f" - Optimized temperatures: {REFERENCE_TEMPERATURE} for reference models, {AGGREGATOR_TEMPERATURE} for aggregation")
print(
f" - Optimized temperatures: {REFERENCE_TEMPERATURE} for reference models, {AGGREGATOR_TEMPERATURE} for aggregation"
)
print(" - Token-efficient: only returns final aggregated response")
print(" - Resilient: continues with partial model failures")
print(f" - Configurable: easy to modify models and settings at top of file")
print(" - Configurable: easy to modify models and settings at top of file")
print(" - State-of-the-art results on challenging benchmarks")
print("\nDebug mode:")
print(" # Enable debug logging")
print(" export MOA_TOOLS_DEBUG=true")
@@ -526,11 +507,11 @@ MOA_SCHEMA = {
"properties": {
"user_prompt": {
"type": "string",
"description": "The complex query or problem to solve using multiple AI models. Should be a challenging problem that benefits from diverse perspectives and collaborative reasoning."
"description": "The complex query or problem to solve using multiple AI models. Should be a challenging problem that benefits from diverse perspectives and collaborative reasoning.",
}
},
"required": ["user_prompt"]
}
"required": ["user_prompt"],
},
}
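# A minimal sketch of a tool-call payload satisfying the schema above
# ("user_prompt" is the only required property; the prompt itself is hypothetical):
#
#   args = {"user_prompt": "Find a closed form for the sum of the first n cubes."}
#   raw = await mixture_of_agents_tool(**args)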
registry.register(