hermes-agent/tests/integration/test_web_tools.py

#!/usr/bin/env python3
"""
Comprehensive Test Suite for Web Tools Module

This script tests all web tools functionality to ensure they work correctly.
Run this after any updates to the web_tools.py module or backend libraries.

Usage:
    python test_web_tools.py              # Run all tests
    python test_web_tools.py --no-llm     # Skip LLM processing tests
    python test_web_tools.py --verbose    # Show detailed output (short form: -v)
    python test_web_tools.py --debug      # Enable debug mode for web tools

Requirements:
    - PARALLEL_API_KEY or FIRECRAWL_API_KEY environment variable must be set
    - An auxiliary LLM provider (OPENROUTER_API_KEY or Nous Portal auth) is no
      longer needed: web_extract does not use an LLM, so the LLM tests are
      always skipped
"""

import pytest
pytestmark = pytest.mark.integration

import argparse
import asyncio
import json
import os
import sys
from datetime import datetime
from typing import List, Optional

# Import the web tools to test (updated path after moving tools/)
from tools.web_tools import (
    web_search_tool,
    web_extract_tool,
    check_firecrawl_api_key,
    check_web_api_key,
    _get_backend,
)


class Colors:
    """Namespace of ANSI escape sequences used to style terminal output."""

    # Foreground colours
    HEADER = "\033[95m"
    BLUE = "\033[94m"
    CYAN = "\033[96m"
    GREEN = "\033[92m"
    WARNING = "\033[93m"
    FAIL = "\033[91m"

    # Reset sequence; the print helpers in this file append it after styled text
    ENDC = "\033[0m"

    # Text attributes
    BOLD = "\033[1m"
    UNDERLINE = "\033[4m"


def print_header(text: str):
    """Print ``text`` as a bold banner framed by two 60-character ``=`` rules.

    A blank line is emitted before the top rule.
    """
    style = f"{Colors.HEADER}{Colors.BOLD}"
    rule = f"{style}{'=' * 60}{Colors.ENDC}"
    print(f"\n{rule}")
    print(f"{style}{text}{Colors.ENDC}")
    print(rule)


def print_section(text: str):
    """Print ``text`` as a bold cyan section title with a 50-dash underline.

    A blank line is emitted before the title.
    """
    title_line = f"\n{Colors.CYAN}{Colors.BOLD}📌 {text}{Colors.ENDC}"
    underline = f"{Colors.CYAN}{'-' * 50}{Colors.ENDC}"
    print(title_line)
    print(underline)


def print_success(text: str):
    """Print ``text`` in green, prefixed with a check-mark emoji."""
    message = f"{Colors.GREEN}✅ {text}{Colors.ENDC}"
    print(message)


def print_error(text: str):
    """Print ``text`` in the failure colour, prefixed with a cross emoji."""
    message = f"{Colors.FAIL}❌ {text}{Colors.ENDC}"
    print(message)


def print_warning(text: str):
    """Print ``text`` in the warning colour, prefixed with a warning emoji."""
    message = f"{Colors.WARNING}⚠️  {text}{Colors.ENDC}"
    print(message)


def print_info(text: str, indent: int = 0):
    """Print ``text`` in blue with an info emoji.

    Args:
        text: Message to display.
        indent: Indentation level; each level adds two leading spaces.
    """
    padding = "  " * indent
    message = f"{Colors.BLUE}ℹ️  {text}{Colors.ENDC}"
    print(f"{padding}{message}")


class WebToolsTester:
    """Test suite for web tools.

    Each check reports its outcome through :meth:`log_result`, which stores
    it in ``self.test_results`` under ``"passed"``, ``"failed"`` or
    ``"skipped"`` and echoes it to the terminal.
    """

    # Statuses accepted by log_result; each one is also a key of test_results.
    _VALID_STATUSES = ("passed", "failed", "skipped")

    def __init__(self, verbose: bool = False, test_llm: bool = True):
        """Create a tester.

        Args:
            verbose: When true, the checks print extra diagnostic output.
            test_llm: Whether the LLM extraction check may run. Note that
                ``test_environment`` resets this to ``False``.
        """
        self.verbose = verbose
        self.test_llm = test_llm
        self.test_results = {status: [] for status in self._VALID_STATUSES}
        self.start_time = None
        self.end_time = None

    def log_result(self, test_name: str, status: str, details: str = ""):
        """Record one test outcome and print it.

        Args:
            test_name: Label of the check being reported.
            status: One of ``"passed"``, ``"failed"`` or ``"skipped"``.
            details: Optional free-form description of the outcome.

        Raises:
            ValueError: If ``status`` is not one of the accepted values.
                Dropping such a result silently could let the summary
                report success for a run that recorded a problem.
        """
        if status not in self._VALID_STATUSES:
            raise ValueError(
                f"Unknown status {status!r}; expected one of {self._VALID_STATUSES}"
            )

        result = {
            "test": test_name,
            "status": status,
            "details": details,
            "timestamp": datetime.now().isoformat()
        }
        self.test_results[status].append(result)

        if status == "passed":
            print_success(f"{test_name}: {details}" if details else test_name)
        elif status == "failed":
            print_error(f"{test_name}: {details}" if details else test_name)
        else:
            print_warning(f"{test_name} skipped: {details}" if details else f"{test_name} skipped")

    def test_environment(self) -> bool:
        """Check that a web backend API key is configured.

        Also logs the auxiliary LLM check as skipped and sets
        ``self.test_llm`` to ``False``.

        Returns:
            ``False`` when ``check_web_api_key()`` is falsy, ``True`` otherwise.
        """
        print_section("Environment Check")

        # Check web backend API key (Parallel or Firecrawl)
        if not check_web_api_key():
            self.log_result("Web Backend API Key", "failed", "PARALLEL_API_KEY or FIRECRAWL_API_KEY not set")
            return False

        backend = _get_backend()
        self.log_result("Web Backend API Key", "passed", f"Using {backend} backend")

        # Auxiliary LLM summarization was removed — web_extract is now
        # truncate-and-store (no LLM). Keep the flag off so any residual
        # LLM-path assertions stay skipped.
        self.log_result("Auxiliary LLM", "skipped", "web_extract no longer uses an LLM (truncate-and-store)")
        self.test_llm = False

        return True

    def test_web_search(self) -> List[str]:
        """Run a few searches and validate the shape of each response.

        A query passes when the response parses as JSON, has no ``error``
        key, contains ``success`` and ``data`` keys, and every entry of
        ``data.web`` has ``url``, ``title`` and ``description``.

        Returns:
            Up to three URLs taken from valid results, for the extraction
            checks to reuse.
        """
        print_section("Test 1: Web Search")

        test_queries = [
            ("Python web scraping tutorial", 5),
            ("Firecrawl API documentation", 3),
            ("inflammatory arthritis symptoms treatment", 8),  # medical-style query
        ]
        required_fields = ("url", "title", "description")
        extracted_urls: List[str] = []

        for query, limit in test_queries:
            label = f"Search: {query[:30]}..."
            try:
                print(f"\n  Testing search: '{query}' (limit={limit})")

                if self.verbose:
                    print(f"  Calling web_search_tool(query='{query}', limit={limit})")

                # Perform search
                raw = web_search_tool(query, limit)

                # Parse result
                try:
                    data = json.loads(raw)
                except json.JSONDecodeError as e:
                    self.log_result(label, "failed", f"Invalid JSON: {e}")
                    if self.verbose:
                        print(f"    Raw response (first 500 chars): {raw[:500]}...")
                    continue

                if "error" in data:
                    self.log_result(label, "failed", f"API error: {data['error']}")
                    continue

                # Check structure
                if "success" not in data or "data" not in data:
                    self.log_result(label, "failed", "Missing success or data fields")
                    if self.verbose:
                        print(f"    Response keys: {list(data.keys())}")
                    continue

                # "data" may be present but null; treat that like an empty payload
                web_results = (data.get("data") or {}).get("web")

                if not web_results:
                    self.log_result(label, "failed", "Empty web results array")
                    if self.verbose:
                        print(f"    data.web content: {web_results}")
                    continue

                # Validate each result
                valid_results = 0
                missing_fields = []

                for position, item in enumerate(web_results, start=1):
                    missing = [f for f in required_fields if f not in item]

                    if missing:
                        missing_fields.append(f"Result {position} missing: {missing}")
                        if self.verbose:
                            print(f"    Result {position}: ✗ Missing fields: {missing}")
                        continue

                    valid_results += 1
                    # Collect URLs for extraction test
                    if len(extracted_urls) < 3:
                        extracted_urls.append(item["url"])

                    if self.verbose:
                        # str() keeps a null title/url from aborting the whole query
                        print(f"    Result {position}: ✓ {str(item['title'])[:50]}...")
                        print(f"      URL: {str(item['url'])[:60]}...")

                # Log results
                if valid_results == len(web_results):
                    self.log_result(label, "passed", f"All {valid_results} results valid")
                else:
                    self.log_result(
                        label,
                        "failed",
                        f"Only {valid_results}/{len(web_results)} valid. Issues: {'; '.join(missing_fields[:3])}"
                    )

            except Exception as e:
                # Boundary of one query: record the failure and move on to the next
                self.log_result(label, "failed", f"Exception: {type(e).__name__}: {str(e)}")
                if self.verbose:
                    import traceback
                    print(f"    Traceback: {traceback.format_exc()}")

        if self.verbose and extracted_urls:
            print(f"\n  URLs collected for extraction test: {len(extracted_urls)}")
            for url in extracted_urls:
                print(f"    - {url}")

        return extracted_urls

    async def test_web_extract(self, urls: Optional[List[str]] = None):
        """Extract content from up to two URLs and validate the response.

        Args:
            urls: URLs to extract. When empty or ``None``, two built-in
                default URLs are used.

        The check passes when at least one returned page has non-empty
        ``content`` and no ``error``.
        """
        print_section("Test 2: Web Extract (without LLM)")

        # Use provided URLs or defaults
        if not urls:
            urls = [
                "https://docs.firecrawl.dev/introduction",
                "https://www.python.org/about/"
            ]
            print("  Using default URLs for testing")
        else:
            print(f"  Using {len(urls)} URLs from search results")

        try:
            test_urls = urls[:2]  # Test with max 2 URLs
            print(f"\n  Extracting content from {len(test_urls)} URL(s)...")
            for url in test_urls:
                print(f"    - {url}")

            if self.verbose:
                print(f"  Calling web_extract_tool(urls={test_urls}, format='markdown')")

            raw = await web_extract_tool(
                test_urls,
                format="markdown",
            )

            # Parse result
            try:
                data = json.loads(raw)
            except json.JSONDecodeError as e:
                self.log_result("Extract (no LLM)", "failed", f"Invalid JSON: {e}")
                if self.verbose:
                    print(f"    Raw response (first 500 chars): {raw[:500]}...")
                return

            if "error" in data:
                self.log_result("Extract (no LLM)", "failed", f"API error: {data['error']}")
                return

            results = data.get("results", [])

            if not results:
                self.log_result("Extract (no LLM)", "failed", "No results in response")
                if self.verbose:
                    print(f"    Response keys: {list(data.keys())}")
                return

            # Validate each result
            valid_results = 0
            failed_results = 0
            total_content_length = 0
            extraction_details = []

            for position, page in enumerate(results, start=1):
                # "or" also covers a title that is present but null/empty
                title = page.get("title") or "No title"
                content = page.get("content", "")
                error = page.get("error")

                if error:
                    failed_results += 1
                    extraction_details.append(f"Page {position}: ERROR - {error}")
                    if self.verbose:
                        print(f"    Page {position}: ✗ Error - {error}")
                elif content:
                    content_len = len(content)
                    total_content_length += content_len
                    valid_results += 1
                    extraction_details.append(f"Page {position}: {title[:40]}... ({content_len} chars)")
                    if self.verbose:
                        print(f"    Page {position}: ✓ {title[:50]}... - {content_len} characters")
                        print(f"      First 100 chars: {content[:100]}...")
                else:
                    extraction_details.append(f"Page {position}: {title[:40]}... (EMPTY)")
                    if self.verbose:
                        print(f"    Page {position}: ⚠ {title[:50]}... - Empty content")

            # Log results
            if valid_results > 0:
                self.log_result(
                    "Extract (no LLM)",
                    "passed",
                    f"{valid_results}/{len(results)} pages extracted, {total_content_length} total chars"
                )
            else:
                self.log_result(
                    "Extract (no LLM)",
                    "failed",
                    f"No valid content. {failed_results} errors, {len(results) - failed_results} empty"
                )
                if self.verbose:
                    print("\n  Extraction details:")
                    for detail in extraction_details:
                        print(f"    {detail}")

        except Exception as e:
            # Boundary of this check: record the failure instead of aborting the run
            self.log_result("Extract (no LLM)", "failed", f"Exception: {type(e).__name__}: {str(e)}")
            if self.verbose:
                import traceback
                print(f"    Traceback: {traceback.format_exc()}")

    async def test_web_extract_with_llm(self, urls: Optional[List[str]] = None):
        """Extract a single URL with a small ``char_limit`` and check for content.

        Logged as skipped when ``self.test_llm`` is false.

        Args:
            urls: Candidate URLs; the first one is used. When empty or
                ``None``, a built-in default URL is used.
        """
        print_section("Test 3: Web Extract (with Gemini LLM)")

        if not self.test_llm:
            self.log_result("Extract (with LLM)", "skipped", "LLM testing disabled")
            return

        # Use a URL likely to have substantial content
        test_url = urls[0] if urls else "https://docs.firecrawl.dev/features/scrape"

        try:
            print(f"\n  Extracting and processing: {test_url}")

            raw = await web_extract_tool(
                [test_url],
                format="markdown",
                char_limit=1000,  # small budget to force truncation in the test
            )

            data = json.loads(raw)

            if "error" in data:
                self.log_result("Extract (with LLM)", "failed", data["error"])
                return

            results = data.get("results", [])

            if not results:
                self.log_result("Extract (with LLM)", "failed", "No results returned")
                return

            content = results[0].get("content", "")

            if not content:
                self.log_result("Extract (with LLM)", "failed", "No content field in result")
                return

            self.log_result(
                "Extract (with LLM)",
                "passed",
                f"Content processed: {len(content)} chars"
            )

            if self.verbose:
                print("\n    First 300 chars of processed content:")
                print(f"    {content[:300]}...")

        except json.JSONDecodeError as e:
            self.log_result("Extract (with LLM)", "failed", f"Invalid JSON: {e}")
        except Exception as e:
            self.log_result("Extract (with LLM)", "failed", str(e))

    async def run_all_tests(self):
        """Run the environment, search and extraction checks, then summarize.

        Returns:
            ``True`` when no check was logged as failed. ``False`` when the
            environment check fails (nothing else runs and no results file
            is written) or when any check failed.
        """
        self.start_time = datetime.now()

        print_header("WEB TOOLS TEST SUITE")
        print(f"Started at: {self.start_time.strftime('%Y-%m-%d %H:%M:%S')}")

        # Test environment
        if not self.test_environment():
            print_error("\nCannot proceed without required API keys!")
            return False

        # Test search and collect URLs
        urls = self.test_web_search()

        # Test extraction
        await self.test_web_extract(urls or None)

        # Test extraction with LLM
        if self.test_llm:
            await self.test_web_extract_with_llm(urls or None)

        # Print summary
        self.end_time = datetime.now()
        duration = (self.end_time - self.start_time).total_seconds()

        print_header("TEST SUMMARY")
        print(f"Duration: {duration:.2f} seconds")
        print(f"\n{Colors.GREEN}Passed: {len(self.test_results['passed'])}{Colors.ENDC}")
        print(f"{Colors.FAIL}Failed: {len(self.test_results['failed'])}{Colors.ENDC}")
        print(f"{Colors.WARNING}Skipped: {len(self.test_results['skipped'])}{Colors.ENDC}")

        # List failed tests
        if self.test_results["failed"]:
            print(f"\n{Colors.FAIL}{Colors.BOLD}Failed Tests:{Colors.ENDC}")
            for test in self.test_results["failed"]:
                print(f"  - {test['test']}: {test['details']}")

        # Save results to file
        self.save_results()

        return len(self.test_results["failed"]) == 0

    def save_results(self):
        """Write the collected results to a timestamped JSON file.

        The file is named ``test_results_web_tools_<YYYYmmdd_HHMMSS>.json``
        and is created relative to the current working directory. Any
        failure while writing is reported as a warning rather than raised.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"test_results_web_tools_{timestamp}.json"

        have_both_times = bool(self.start_time and self.end_time)
        duration = (self.end_time - self.start_time).total_seconds() if have_both_times else None

        results = {
            "test_suite": "Web Tools",
            "start_time": self.start_time.isoformat() if self.start_time else None,
            "end_time": self.end_time.isoformat() if self.end_time else None,
            "duration_seconds": duration,
            "summary": {
                "passed": len(self.test_results["passed"]),
                "failed": len(self.test_results["failed"]),
                "skipped": len(self.test_results["skipped"])
            },
            "results": self.test_results,
            "environment": {
                "web_backend": _get_backend() if check_web_api_key() else None,
                "firecrawl_api_key": check_firecrawl_api_key(),
                "parallel_api_key": bool(os.getenv("PARALLEL_API_KEY")),
                "auxiliary_model": False,
            }
        }

        try:
            # ensure_ascii=False emits non-ASCII characters verbatim, so the
            # file must be opened as UTF-8 instead of the locale default.
            with open(filename, 'w', encoding="utf-8") as f:
                json.dump(results, f, indent=2, ensure_ascii=False)
            print_info(f"Test results saved to: {filename}")
        except Exception as e:
            # Deliberately best-effort: a failed save must not fail the run
            print_warning(f"Failed to save results: {e}")


async def main():
    """Parse the command-line flags, run the suite, and exit.

    The process exits with status 0 when ``run_all_tests`` returns a truthy
    value and with status 1 otherwise.
    """
    cli = argparse.ArgumentParser(description="Test Web Tools Module")
    flag_specs = (
        (("--no-llm",), "Skip LLM processing tests"),
        (("--verbose", "-v"), "Show detailed output"),
        (("--debug",), "Enable debug mode for web tools"),
    )
    for flag_names, help_text in flag_specs:
        cli.add_argument(*flag_names, action="store_true", help=help_text)

    options = cli.parse_args()

    # Debug mode is communicated through an environment variable
    if options.debug:
        os.environ["WEB_TOOLS_DEBUG"] = "true"
        print_info("Debug mode enabled for web tools")

    run_llm_tests = not options.no_llm
    tester = WebToolsTester(verbose=options.verbose, test_llm=run_llm_tests)

    all_passed = await tester.run_all_tests()

    exit_code = 0 if all_passed else 1
    sys.exit(exit_code)


# Script entry point: when executed directly, run the async main() to completion.
if __name__ == "__main__":
    asyncio.run(main())