diff --git a/hermes_cli/config.py b/hermes_cli/config.py
index 1871bc9166..1be21ff0fa 100644
--- a/hermes_cli/config.py
+++ b/hermes_cli/config.py
@@ -315,7 +315,7 @@ DEFAULT_CONFIG = {
         "model": "",
         "base_url": "",
         "api_key": "",
-        "timeout": 30,  # seconds — increase for slow local models
+        "timeout": 360,  # seconds (6min) — per-attempt LLM summarization timeout; increase for slow local models
     },
     "compression": {
         "provider": "auto",
diff --git a/tools/web_tools.py b/tools/web_tools.py
index 69ab16e863..8571c2a26f 100644
--- a/tools/web_tools.py
+++ b/tools/web_tools.py
@@ -554,8 +554,24 @@ async def process_content_with_llm(
         return processed_content
 
     except Exception as e:
-        logger.debug("Error processing content with LLM: %s", e)
-        return f"[Failed to process content: {str(e)[:100]}. Content size: {len(content):,} chars]"
+        logger.warning(
+            "web_extract LLM summarization failed (%s). "
+            "Tip: increase auxiliary.web_extract.timeout in config.yaml "
+            "or switch to a faster auxiliary model.",
+            str(e)[:120],
+        )
+        # Fall back to truncated raw content instead of returning a useless
+        # error message. The first ~5000 chars are almost always more useful
+        # to the model than "[Failed to process content: ...]".
+        truncated = content[:MAX_OUTPUT_SIZE]
+        if len(content) > MAX_OUTPUT_SIZE:
+            truncated += (
+                f"\n\n[Content truncated — showing first {MAX_OUTPUT_SIZE:,} of "
+                f"{len(content):,} chars. LLM summarization timed out. "
+                f"To fix: increase auxiliary.web_extract.timeout in config.yaml, "
+                f"or use a faster auxiliary model. Use browser_navigate for the full page.]"
+            )
+        return truncated
 
 
 async def _call_summarizer_llm(
@@ -620,8 +636,9 @@ Your goal is to preserve ALL important information while reducing length. Never
 
 Create a markdown summary that captures all key information in a well-organized, scannable format. Include important quotes and code snippets in their original formatting. Focus on actionable information, specific details, and unique insights."""
 
-    # Call the LLM with retry logic
-    max_retries = 6
+    # Call the LLM with retry logic — keep retries low since summarization
+    # is a nice-to-have; the caller falls back to truncated content on failure.
+    max_retries = 2
     retry_delay = 2
     last_error = None
 
@@ -640,6 +657,9 @@ Create a markdown summary that captures all key information in a well-organized,
             ],
             "temperature": 0.1,
             "max_tokens": max_tokens,
+            # No explicit timeout — async_call_llm reads auxiliary.web_extract.timeout
+            # from config (default 360s / 6min). Users with slow local models can
+            # increase it in config.yaml.
         }
         if extra_body:
             call_kwargs["extra_body"] = extra_body
@@ -1264,10 +1284,24 @@ async def web_extract_tool(
 
         try:
             logger.info("Scraping: %s", url)
-            scrape_result = _get_firecrawl_client().scrape(
-                url=url,
-                formats=formats
-            )
+            # Run synchronous Firecrawl scrape in a thread with a
+            # 60s timeout so a hung fetch doesn't block the session.
+            try:
+                scrape_result = await asyncio.wait_for(
+                    asyncio.to_thread(
+                        _get_firecrawl_client().scrape,
+                        url=url,
+                        formats=formats,
+                    ),
+                    timeout=60,
+                )
+            except asyncio.TimeoutError:
+                logger.warning("Firecrawl scrape timed out for %s", url)
+                results.append({
+                    "url": url, "title": "", "content": "",
+                    "error": "Scrape timed out after 60s — page may be too large or unresponsive. Try browser_navigate instead.",
+                })
+                continue
 
             scrape_payload = _extract_scrape_payload(scrape_result)
             metadata = scrape_payload.get("metadata", {})
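For users who still hit the new per-attempt timeout, the fix suggested by the log messages above is a config.yaml override rather than a code change. A minimal sketch, assuming the YAML layout mirrors the auxiliary.web_extract.timeout path referenced in the diff (the surrounding key names are illustrative):

```yaml
# config.yaml (sketch): raise the per-attempt web_extract summarization timeout
auxiliary:
  web_extract:
    timeout: 600   # seconds; the default is 360 (6 min) after this change
```

Anything much slower than that is usually better served by pointing web_extract at a faster auxiliary model, since summarization now retries only twice before falling back to truncated raw content.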