diff --git a/agent/anthropic_adapter.py b/agent/anthropic_adapter.py index 76761e262..59e7622fb 100644 --- a/agent/anthropic_adapter.py +++ b/agent/anthropic_adapter.py @@ -74,8 +74,11 @@ def _get_anthropic_max_output(model: str) -> int: model IDs (claude-sonnet-4-5-20250929) and variant suffixes (:1m, :fast) resolve correctly. Longest-prefix match wins to avoid e.g. "claude-3-5" matching before "claude-3-5-sonnet". + + Normalizes dots to hyphens so that model names like + ``anthropic/claude-opus-4.6`` match the ``claude-opus-4-6`` table key. """ - m = model.lower() + m = model.lower().replace(".", "-") best_key = "" best_val = _ANTHROPIC_DEFAULT_OUTPUT_LIMIT for key, val in _ANTHROPIC_OUTPUT_LIMITS.items(): diff --git a/cli.py b/cli.py index 659fa9741..221976ad2 100644 --- a/cli.py +++ b/cli.py @@ -2308,17 +2308,59 @@ class HermesCLI: # Append to a pre-filter buffer first self._stream_prefilt = getattr(self, "_stream_prefilt", "") + text - # Check if we're entering a reasoning block + # Check if we're entering a reasoning block. + # Only match tags that appear at a "block boundary": start of the + # stream, after a newline (with optional whitespace), or when nothing + # but whitespace has been emitted on the current line. + # This prevents false positives when models *mention* tags in prose + # like "(/think not producing tags)". + # + # _stream_last_was_newline tracks whether the last character emitted + # (or the start of the stream) is a line boundary. It's True at + # stream start and set True whenever emitted text ends with '\n'. 
+ if not hasattr(self, "_stream_last_was_newline"): + self._stream_last_was_newline = True # start of stream = boundary + if not getattr(self, "_in_reasoning_block", False): for tag in _OPEN_TAGS: - idx = self._stream_prefilt.find(tag) - if idx != -1: - # Emit everything before the tag - before = self._stream_prefilt[:idx] - if before: - self._emit_stream_text(before) - self._in_reasoning_block = True - self._stream_prefilt = self._stream_prefilt[idx + len(tag):] + search_start = 0 + while True: + idx = self._stream_prefilt.find(tag, search_start) + if idx == -1: + break + # Check if this is a block boundary position + preceding = self._stream_prefilt[:idx] + if idx == 0: + # At buffer start — only a boundary if we're at + # a line start (stream start or last emit ended + # with newline) + is_block_boundary = getattr(self, "_stream_last_was_newline", True) + else: + # Find last newline in the buffer before the tag + last_nl = preceding.rfind("\n") + if last_nl == -1: + # No newline in buffer — boundary only if + # last emit was a newline AND only whitespace + # has accumulated before the tag + is_block_boundary = ( + getattr(self, "_stream_last_was_newline", True) + and preceding.strip() == "" + ) + else: + # Text between last newline and tag must be + # whitespace-only + is_block_boundary = preceding[last_nl + 1:].strip() == "" + if is_block_boundary: + # Emit everything before the tag + if preceding: + self._emit_stream_text(preceding) + self._stream_last_was_newline = preceding.endswith("\n") + self._in_reasoning_block = True + self._stream_prefilt = self._stream_prefilt[idx + len(tag):] + break + # Not a block boundary — keep searching after this occurrence + search_start = idx + 1 + if getattr(self, "_in_reasoning_block", False): break # Could also be a partial open tag at the end — hold it back @@ -2332,6 +2374,7 @@ class HermesCLI: break if safe: self._emit_stream_text(safe) + self._stream_last_was_newline = safe.endswith("\n") self._stream_prefilt = 
self._stream_prefilt[len(safe):] return @@ -2421,6 +2464,14 @@ class HermesCLI: def _flush_stream(self) -> None: """Emit any remaining partial line from the stream buffer and close the box.""" + # If we're still inside a "reasoning block" at end-of-stream, it was + # a false positive — the model mentioned a tag like in prose + # but never closed it. Recover the buffered content as regular text. + if getattr(self, "_in_reasoning_block", False) and getattr(self, "_stream_prefilt", ""): + self._in_reasoning_block = False + self._emit_stream_text(self._stream_prefilt) + self._stream_prefilt = "" + # Close reasoning box if still open (in case no content tokens arrived) self._close_reasoning_box() @@ -2443,6 +2494,7 @@ class HermesCLI: self._stream_text_ansi = "" self._stream_prefilt = "" self._in_reasoning_block = False + self._stream_last_was_newline = True self._reasoning_box_opened = False self._reasoning_buf = "" self._reasoning_preview_buf = "" diff --git a/run_agent.py b/run_agent.py index 448b0004b..9a684d17f 100644 --- a/run_agent.py +++ b/run_agent.py @@ -5610,20 +5610,20 @@ class AIAgent: if self.max_tokens is not None: if not self._is_qwen_portal(): api_kwargs.update(self._max_tokens_param(self.max_tokens)) - elif self._is_openrouter_url() and "claude" in (self.model or "").lower(): - # OpenRouter translates requests to Anthropic's Messages API, - # which requires max_tokens as a mandatory field. When we omit - # it, OpenRouter picks a default that can be too low — the model - # spends its output budget on thinking and has almost nothing - # left for the actual response (especially large tool calls like - # write_file). Sending the model's real output limit ensures - # full capacity. Other providers handle the default fine. 
+ elif (self._is_openrouter_url() or "nousresearch" in self._base_url_lower) and "claude" in (self.model or "").lower(): + # OpenRouter and Nous Portal translate requests to Anthropic's + # Messages API, which requires max_tokens as a mandatory field. + # When we omit it, the proxy picks a default that can be too + # low — the model spends its output budget on thinking and has + # almost nothing left for the actual response (especially large + # tool calls like write_file). Sending the model's real output + # limit ensures full capacity. try: from agent.anthropic_adapter import _get_anthropic_max_output _model_output_limit = _get_anthropic_max_output(self.model) api_kwargs["max_tokens"] = _model_output_limit except Exception: - pass # fail open — let OpenRouter pick its default + pass # fail open — let the proxy pick its default extra_body = {} @@ -9116,6 +9116,11 @@ class AIAgent: self._execute_tool_calls(assistant_message, messages, effective_task_id, api_call_count) + # Reset per-turn retry counters after successful tool + # execution so a single truncation doesn't poison the + # entire conversation. + truncated_tool_call_retries = 0 + # Signal that a paragraph break is needed before the next # streamed text. 
"""Tests for _stream_delta's handling of <think> tags in prose vs real reasoning blocks."""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

import pytest


def _make_cli_stub():
    """Create a minimal HermesCLI-like object with stream state."""
    from cli import HermesCLI

    # __new__ bypasses __init__ (and any terminal/IO setup it performs);
    # we then hand-populate only the stream-state attributes _stream_delta reads.
    cli = HermesCLI.__new__(HermesCLI)
    cli.show_reasoning = False
    cli._stream_buf = ""
    cli._stream_started = False
    cli._stream_box_opened = False
    cli._stream_prefilt = ""
    cli._in_reasoning_block = False
    cli._reasoning_stream_started = False
    cli._reasoning_box_opened = False
    cli._reasoning_buf = ""
    cli._reasoning_preview_buf = ""
    cli._deferred_content = ""
    cli._stream_text_ansi = ""
    cli._stream_needs_break = False
    cli._emitted = []

    # Mock _emit_stream_text to capture output
    def mock_emit(text):
        cli._emitted.append(text)
    cli._emit_stream_text = mock_emit

    # Mock _stream_reasoning_delta
    cli._reasoning_emitted = []

    def mock_reasoning(text):
        cli._reasoning_emitted.append(text)
    cli._stream_reasoning_delta = mock_reasoning

    return cli


class TestThinkTagInProse:
    """<think> mentioned in prose should NOT trigger reasoning suppression."""

    def test_think_tag_mid_sentence(self):
        """'(<think>/think not producing <think> tags)' should pass through."""
        cli = _make_cli_stub()
        # Tag arrives split across tokens, mid-sentence — never at a line start.
        tokens = [
            " 1. Fix reasoning mode in eval ",
            "(<think>/think not producing ",
            "<think>",
            " tags — ~2% gap)",
            "\n 2. Launch production",
        ]
        for t in tokens:
            cli._stream_delta(t)
        assert not cli._in_reasoning_block, "<think> in prose should not enter reasoning block"
        full = "".join(cli._emitted)
        assert "<think>" in full, "The literal tag should be in the emitted text"
        assert "Launch production" in full

    def test_think_tag_after_text_on_same_line(self):
        """'some text <think>' should NOT trigger reasoning."""
        cli = _make_cli_stub()
        cli._stream_delta("Here is the tag <think> explanation")
        assert not cli._in_reasoning_block
        full = "".join(cli._emitted)
        assert "<think>" in full

    def test_think_tag_in_backticks(self):
        """'`<think>`' should NOT trigger reasoning."""
        cli = _make_cli_stub()
        cli._stream_delta("Use the `<think>` tag for reasoning")
        assert not cli._in_reasoning_block


class TestRealReasoningBlock:
    """Real <think> tags at block boundaries should still be caught."""

    def test_think_at_start_of_stream(self):
        """'<think>reasoning</think>answer' should suppress reasoning."""
        cli = _make_cli_stub()
        cli._stream_delta("<think>")
        assert cli._in_reasoning_block
        cli._stream_delta("I need to analyze this")
        cli._stream_delta("</think>")
        assert not cli._in_reasoning_block
        cli._stream_delta("Here is my answer")
        full = "".join(cli._emitted)
        assert "Here is my answer" in full
        assert "I need to analyze" not in full  # reasoning was suppressed

    def test_think_after_newline(self):
        """'text\\n<think>' should trigger reasoning block."""
        cli = _make_cli_stub()
        cli._stream_delta("Some preamble\n<think>")
        assert cli._in_reasoning_block
        full = "".join(cli._emitted)
        assert "Some preamble" in full

    def test_think_after_newline_with_whitespace(self):
        """'text\\n <think>' should trigger reasoning block."""
        cli = _make_cli_stub()
        cli._stream_delta("Some preamble\n <think>")
        assert cli._in_reasoning_block

    def test_think_with_only_whitespace_before(self):
        """' <think>' (whitespace only prefix) should trigger."""
        cli = _make_cli_stub()
        cli._stream_delta(" <think>")
        assert cli._in_reasoning_block


class TestFlushRecovery:
    """_flush_stream should recover content from false-positive reasoning blocks."""

    def test_flush_recovers_buffered_content(self):
        """If somehow in reasoning block at flush, content is recovered."""
        cli = _make_cli_stub()
        # Manually set up a false-positive state: the tag was consumed but the
        # stream ended without a closing tag, leaving content in the prefilter.
        cli._in_reasoning_block = True
        cli._stream_prefilt = " tags — ~2% gap)\n 2. Launch production"
        cli._stream_box_opened = True

        # Mock _close_reasoning_box and box closing
        cli._close_reasoning_box = lambda: None

        # Call flush with terminal size and printing patched out
        from unittest.mock import patch
        import shutil
        with patch.object(shutil, "get_terminal_size", return_value=os.terminal_size((80, 24))):
            with patch("cli._cprint"):
                cli._flush_stream()

        assert not cli._in_reasoning_block
        full = "".join(cli._emitted)
        assert "Launch production" in full