# hermes-agent/tests/tools/test_web_tools_truncate.py

"""Unit tests for the truncate-and-store web_extract path (no LLM).

Covers convert_base64_images_to_links, _truncate_with_footer, _store_full_text,
_get_extract_char_limit, and the end-to-end web_extract_tool truncation behavior.
"""
import asyncio
import json
import os
from unittest.mock import patch

import pytest

import tools.web_tools as wt


class TestImageConversion:
    """Checks for ``wt.convert_base64_images_to_links``.

    Inline base64 image data is expected to be replaced by an ``[IMAGE]``
    placeholder, while ordinary http(s) image links are left untouched.
    """

    def test_markdown_base64_image_keeps_alt_drops_blob(self):
        """A markdown data-URI image keeps its alt text but loses the payload."""
        payload = "A" * 5000
        source = f"before ![a cat]( data:image/png;base64,{payload}) after"

        converted = wt.convert_base64_images_to_links(source)

        assert "[IMAGE: a cat]" in converted
        # Neither the scheme marker nor the raw payload may leak through.
        for forbidden in ("base64", payload):
            assert forbidden not in converted
        # Text on either side of the image is untouched.
        for kept in ("before", "after"):
            assert kept in converted

    def test_markdown_base64_image_no_alt(self):
        """An empty alt text yields the bare ``[IMAGE]`` placeholder."""
        converted = wt.convert_base64_images_to_links(
            "x ![](data:image/jpeg;base64,QQ==) y"
        )
        assert "[IMAGE]" in converted
        assert "base64" not in converted

    def test_real_http_image_links_preserved(self):
        """Images that point at a real URL are passed through verbatim."""
        markdown = "see ![logo](https://example.com/logo.png) here"
        converted = wt.convert_base64_images_to_links(markdown)
        # Real image URLs must survive so the agent can inspect them.
        assert "![logo](https://example.com/logo.png)" in converted

    def test_bare_and_parenthesised_base64_become_placeholder(self):
        """A data URI outside markdown image syntax collapses to ``[IMAGE]``."""
        payload = "Z" * 3000
        data_uri = f"data:image/gif;base64,{payload}"
        # First the bare URI, then the same URI wrapped in parentheses.
        for candidate in (data_uri, f"({data_uri})"):
            assert wt.convert_base64_images_to_links(candidate) == "[IMAGE]"


class TestTruncation:
    """Tests for ``wt._truncate_with_footer``.

    The helper is called as ``(content, url, char_limit)`` and returns a
    ``(text, truncated)`` pair. The long-content tests point ``HERMES_HOME``
    at a pytest temp directory first (presumably this is where the full text
    gets stored -- confirm against ``tools.web_tools``).
    """

    def test_short_content_returned_whole(self):
        """Content under the limit comes back unchanged and not flagged."""
        content = "# Title\n\nshort body\n"
        out, truncated = wt._truncate_with_footer(content, "https://e.com", 15000)
        assert out == content
        assert truncated is False

    def test_long_content_truncated_with_footer(self, tmp_path, monkeypatch):
        """Oversized content keeps head and tail, drops the middle, adds a footer."""
        monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
        body = "\n".join(f"line {i} " + "x" * 50 for i in range(2000))
        out, truncated = wt._truncate_with_footer(body, "https://example.com/page", 4000)
        assert truncated is True
        assert "[TRUNCATED]" in out
        assert "Full text saved to:" in out
        assert "read_file" in out
        # Head and tail are both present (first and last lines survive).
        assert "line 0 " in out
        assert "line 1999 " in out
        # The omitted middle is gone.
        assert "line 1000 " not in out
        # Sent text is bounded near the budget (+ footer overhead).
        assert len(out) < 4000 + 2000

    def test_truncation_stores_full_text_readable(self, tmp_path, monkeypatch):
        """The path named in the footer exists and holds the untruncated text."""
        monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
        body = "UNIQUE_MIDDLE_MARKER\n" + ("\n".join(f"row {i}" for i in range(5000)))
        out, truncated = wt._truncate_with_footer(body, "https://example.com/doc", 3000)
        assert truncated is True
        # Extract the stored path from the footer and confirm full text is there.
        # An explicit assertion gives a readable failure instead of a bare
        # StopIteration when the footer line is missing.
        path_lines = [ln for ln in out.splitlines() if "Full text saved to:" in ln]
        assert path_lines, "footer is missing the 'Full text saved to:' line"
        stored_path = path_lines[0].split("Full text saved to:", 1)[1].strip()
        assert os.path.exists(stored_path)
        # Close the handle deterministically. The body built above is pure
        # ASCII, so decoding it as UTF-8 is safe for any ASCII-compatible
        # encoding the writer used.
        with open(stored_path, encoding="utf-8") as fh:
            full = fh.read()
        assert "UNIQUE_MIDDLE_MARKER" in full
        assert "row 2500" in full  # the omitted-middle row is in the stored file


class TestCharLimitConfig:
    """Tests for ``wt._get_extract_char_limit`` under different web configs.

    Each case replaces ``tools.web_tools._load_web_config`` with a stub that
    returns a fixed dict, so no real configuration is read.
    """

    @staticmethod
    def _limit_for(web_config):
        """Return the char limit resolved while the config loader is stubbed."""
        with patch("tools.web_tools._load_web_config", return_value=web_config):
            return wt._get_extract_char_limit()

    def test_default_when_unset(self):
        """An empty config yields the module default."""
        resolved = self._limit_for({})
        assert resolved == wt.DEFAULT_EXTRACT_CHAR_LIMIT

    def test_config_override(self):
        """A configured integer is used as-is."""
        resolved = self._limit_for({"extract_char_limit": 40000})
        assert resolved == 40000

    def test_clamps_floor(self):
        """A configured value of 100 is raised to 2000."""
        resolved = self._limit_for({"extract_char_limit": 100})
        assert resolved == 2000

    def test_bad_value_falls_back(self):
        """A non-numeric string falls back to the module default."""
        resolved = self._limit_for({"extract_char_limit": "nope"})
        assert resolved == wt.DEFAULT_EXTRACT_CHAR_LIMIT


class TestEndToEnd:
    """End-to-end check of ``wt.web_extract_tool`` with a stubbed provider."""

    def test_web_extract_truncates_large_page_no_llm(self, tmp_path, monkeypatch):
        """A large page from a fake provider comes back truncated with a footer.

        The plugin loader, backend selection, URL safety check and provider
        lookup are all patched, so the only content source is ``FakeProvider``.
        """
        monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
        big = "\n".join(f"para {i} " + "y" * 80 for i in range(3000))

        class FakeProvider:
            """Minimal extract-capable provider returning one oversized page."""

            name = "fake"
            display_name = "Fake"

            def supports_extract(self):
                return True

            async def extract(self, urls, **kwargs):
                return [{"url": urls[0], "title": "Big Page", "content": big,
                         "raw_content": big, "metadata": {}}]

        # Use a private loop and close it afterwards. The original never closed
        # the loop it created, which leaks it and triggers a ResourceWarning.
        # The thread's current event loop is left alone so other tests are
        # unaffected.
        loop = asyncio.new_event_loop()
        try:
            with patch("tools.web_tools._ensure_web_plugins_loaded"), \
                 patch("tools.web_tools._get_extract_backend", return_value="fake"), \
                 patch("tools.web_tools.async_is_safe_url", new=_AsyncTrue()), \
                 patch("agent.web_search_registry.get_provider", return_value=FakeProvider()):
                result = json.loads(loop.run_until_complete(
                    wt.web_extract_tool(["https://example.com/big"], char_limit=5000)
                ))
        finally:
            loop.close()

        assert "results" in result
        content = result["results"][0]["content"]
        assert "[TRUNCATED]" in content
        assert "Full text saved to:" in content
        # No LLM was involved: para 0 (head) and the last para (tail) are verbatim.
        assert "para 0 " in content
        assert "para 2999 " in content


def _make_awaitable(value):
    """Return a fresh coroutine object that resolves to ``value``.

    The result can be awaited only once; call this again for another
    awaitable.
    """
    async def _resolve(*_args, **_kwargs):
        return value

    pending = _resolve()
    return pending


class _AsyncTrue:
    """Callable stub whose call returns a new coroutine resolving to ``True``.

    Arguments are accepted and ignored. Each call produces a fresh coroutine,
    so one instance can be called and awaited repeatedly.
    """

    async def __call__(self, *_args, **_kwargs):
        return True