mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-30 01:41:43 +00:00
Merge upstream/main into feat/add-brave-search-backend
Resolved conflicts in tools/web_tools.py: - Merged backend compatibility list to include Brave Search - Updated _get_backend() fallback to use _is_tool_gateway_ready() while keeping Brave - Added Brave support to _is_backend_available() - Added BRAVE_API_KEY to _web_requires_env()
This commit is contained in:
commit
d2ec3b5a29
874 changed files with 143136 additions and 17364 deletions
|
|
@ -4,17 +4,19 @@ Standalone Web Tools Module
|
|||
|
||||
This module provides generic web tools that work with multiple backend providers.
|
||||
Backend is selected during ``hermes tools`` setup (web.backend in config.yaml).
|
||||
When available, Hermes can route Firecrawl calls through a Nous-hosted tool-gateway
|
||||
for Nous Subscribers only.
|
||||
|
||||
Available tools:
|
||||
- web_search_tool: Search the web for information
|
||||
- web_extract_tool: Extract content from specific web pages
|
||||
- web_crawl_tool: Crawl websites with specific instructions (Firecrawl only)
|
||||
- web_crawl_tool: Crawl websites with specific instructions
|
||||
|
||||
Backend compatibility:
|
||||
- Firecrawl: https://docs.firecrawl.dev/introduction (search, extract, crawl)
|
||||
- Brave Search: https://brave.com/search/api/ (search only - extract falls back to Firecrawl)
|
||||
- Parallel: https://docs.parallel.ai (search, extract)
|
||||
- Exa: https://exa.ai (search, extract)
|
||||
- Brave Search: https://brave.com/search/api/ (search only - extract falls back to Firecrawl)
|
||||
- Firecrawl: https://docs.firecrawl.dev/introduction (search, extract, crawl; direct or derived firecrawl-gateway.<domain> for Nous Subscribers)
|
||||
- Parallel: https://docs.parallel.ai (search, extract)
|
||||
- Tavily: https://tavily.com (search, extract, crawl)
|
||||
|
||||
LLM Processing:
|
||||
|
|
@ -47,8 +49,18 @@ import asyncio
|
|||
from typing import List, Dict, Any, Optional
|
||||
import httpx
|
||||
from firecrawl import Firecrawl
|
||||
from agent.auxiliary_client import async_call_llm, extract_content_or_reasoning
|
||||
from agent.auxiliary_client import (
|
||||
async_call_llm,
|
||||
extract_content_or_reasoning,
|
||||
get_async_text_auxiliary_client,
|
||||
)
|
||||
from tools.debug_helpers import DebugSession
|
||||
from tools.managed_tool_gateway import (
|
||||
build_vendor_gateway_url,
|
||||
read_nous_access_token as _read_nous_access_token,
|
||||
resolve_managed_tool_gateway,
|
||||
)
|
||||
from tools.tool_backend_helpers import managed_nous_tools_enabled
|
||||
from tools.url_safety import is_safe_url
|
||||
from tools.website_policy import check_website_access
|
||||
|
||||
|
|
@ -80,49 +92,156 @@ def _get_backend() -> str:
|
|||
if configured in ("parallel", "firecrawl", "tavily", "exa", "brave"):
|
||||
return configured
|
||||
|
||||
# Fallback for manual / legacy config — pick highest-priority backend
|
||||
# that has a key configured. Order: firecrawl > parallel > tavily > brave > exa.
|
||||
for backend, keys in [
|
||||
("firecrawl", ("FIRECRAWL_API_KEY", "FIRECRAWL_API_URL")),
|
||||
("parallel", ("PARALLEL_API_KEY",)),
|
||||
("tavily", ("TAVILY_API_KEY",)),
|
||||
("brave", ("BRAVE_API_KEY",)),
|
||||
("exa", ("EXA_API_KEY",)),
|
||||
]:
|
||||
if any(_has_env(k) for k in keys):
|
||||
# Fallback for manual / legacy config — pick the highest-priority
|
||||
# available backend. Firecrawl also counts as available when the managed
|
||||
# tool gateway is configured for Nous subscribers.
|
||||
backend_candidates = (
|
||||
("firecrawl", _has_env("FIRECRAWL_API_KEY") or _has_env("FIRECRAWL_API_URL") or _is_tool_gateway_ready()),
|
||||
("parallel", _has_env("PARALLEL_API_KEY")),
|
||||
("tavily", _has_env("TAVILY_API_KEY")),
|
||||
("brave", _has_env("BRAVE_API_KEY")),
|
||||
("exa", _has_env("EXA_API_KEY")),
|
||||
)
|
||||
for backend, available in backend_candidates:
|
||||
if available:
|
||||
return backend
|
||||
|
||||
return "firecrawl" # default (backward compat)
|
||||
|
||||
|
||||
def _is_backend_available(backend: str) -> bool:
|
||||
"""Return True when the selected backend is currently usable."""
|
||||
if backend == "exa":
|
||||
return _has_env("EXA_API_KEY")
|
||||
if backend == "parallel":
|
||||
return _has_env("PARALLEL_API_KEY")
|
||||
if backend == "firecrawl":
|
||||
return check_firecrawl_api_key()
|
||||
if backend == "tavily":
|
||||
return _has_env("TAVILY_API_KEY")
|
||||
if backend == "brave":
|
||||
return _has_env("BRAVE_API_KEY")
|
||||
return False
|
||||
|
||||
# ─── Firecrawl Client ────────────────────────────────────────────────────────
|
||||
|
||||
_firecrawl_client = None
|
||||
_firecrawl_client_config = None
|
||||
|
||||
|
||||
def _get_direct_firecrawl_config() -> Optional[tuple[Dict[str, str], tuple[str, Optional[str], Optional[str]]]]:
|
||||
"""Return explicit direct Firecrawl kwargs + cache key, or None when unset."""
|
||||
api_key = os.getenv("FIRECRAWL_API_KEY", "").strip()
|
||||
api_url = os.getenv("FIRECRAWL_API_URL", "").strip().rstrip("/")
|
||||
|
||||
if not api_key and not api_url:
|
||||
return None
|
||||
|
||||
kwargs: Dict[str, str] = {}
|
||||
if api_key:
|
||||
kwargs["api_key"] = api_key
|
||||
if api_url:
|
||||
kwargs["api_url"] = api_url
|
||||
|
||||
return kwargs, ("direct", api_url or None, api_key or None)
|
||||
|
||||
|
||||
def _get_firecrawl_gateway_url() -> str:
    """Build and return the Firecrawl vendor tool-gateway URL."""
    gateway_url = build_vendor_gateway_url("firecrawl")
    return gateway_url
|
||||
|
||||
|
||||
def _is_tool_gateway_ready() -> bool:
    """Report whether the managed Firecrawl tool gateway is usable.

    True only when both a gateway URL and a Nous Subscriber token resolve.
    """
    gateway = resolve_managed_tool_gateway(
        "firecrawl",
        token_reader=_read_nous_access_token,
    )
    return gateway is not None
|
||||
|
||||
|
||||
def _has_direct_firecrawl_config() -> bool:
    """True when an explicit direct Firecrawl configuration is present."""
    direct = _get_direct_firecrawl_config()
    return direct is not None
|
||||
|
||||
|
||||
def _raise_web_backend_configuration_error() -> None:
    """Always raise ValueError explaining how to configure the web tools.

    The managed-gateway hint is appended only for installs that have the
    Nous-managed tools flag enabled.
    """
    parts = [
        "Web tools are not configured. "
        "Set FIRECRAWL_API_KEY for cloud Firecrawl or set FIRECRAWL_API_URL for a self-hosted Firecrawl instance."
    ]
    if managed_nous_tools_enabled():
        parts.append(
            " If you have the hidden Nous-managed tools flag enabled, you can also login to Nous "
            "(`hermes model`) and provide FIRECRAWL_GATEWAY_URL or TOOL_GATEWAY_DOMAIN."
        )
    raise ValueError("".join(parts))
|
||||
|
||||
|
||||
def _firecrawl_backend_help_suffix() -> str:
    """Extra help-text clause about the managed gateway (empty when disabled)."""
    suffix = (
        ", or, if you have the hidden Nous-managed tools flag enabled, login to Nous and use "
        "FIRECRAWL_GATEWAY_URL or TOOL_GATEWAY_DOMAIN"
    )
    return suffix if managed_nous_tools_enabled() else ""
|
||||
|
||||
|
||||
def _web_requires_env() -> list[str]:
    """List the env vars advertised in tool metadata for the web backends.

    Gateway-related variables are included only when Nous-managed tools are
    enabled for this install.
    """
    base_vars = [
        "EXA_API_KEY",
        "PARALLEL_API_KEY",
        "TAVILY_API_KEY",
        "BRAVE_API_KEY",
        "FIRECRAWL_API_KEY",
        "FIRECRAWL_API_URL",
    ]
    if not managed_nous_tools_enabled():
        return base_vars

    gateway_vars = [
        "FIRECRAWL_GATEWAY_URL",
        "TOOL_GATEWAY_DOMAIN",
        "TOOL_GATEWAY_SCHEME",
        "TOOL_GATEWAY_USER_TOKEN",
    ]
    return base_vars + gateway_vars
|
||||
|
||||
|
||||
def _get_firecrawl_client():
|
||||
"""Get or create the Firecrawl client (lazy initialization).
|
||||
"""Get or create Firecrawl client.
|
||||
|
||||
Uses the cloud API by default (requires FIRECRAWL_API_KEY).
|
||||
Set FIRECRAWL_API_URL to point at a self-hosted instance instead —
|
||||
in that case the API key is optional (set USE_DB_AUTHENTICATION=false
|
||||
on your Firecrawl server to disable auth entirely).
|
||||
Direct Firecrawl takes precedence when explicitly configured. Otherwise
|
||||
Hermes falls back to the Firecrawl tool-gateway for logged-in Nous Subscribers.
|
||||
"""
|
||||
global _firecrawl_client
|
||||
if _firecrawl_client is None:
|
||||
api_key = os.getenv("FIRECRAWL_API_KEY")
|
||||
api_url = os.getenv("FIRECRAWL_API_URL")
|
||||
if not api_key and not api_url:
|
||||
logger.error("Firecrawl client initialization failed: missing configuration.")
|
||||
raise ValueError(
|
||||
"Firecrawl client not configured. "
|
||||
"Set FIRECRAWL_API_KEY (cloud) or FIRECRAWL_API_URL (self-hosted). "
|
||||
"This tool requires Firecrawl to be available."
|
||||
)
|
||||
kwargs = {}
|
||||
if api_key:
|
||||
kwargs["api_key"] = api_key
|
||||
if api_url:
|
||||
kwargs["api_url"] = api_url
|
||||
_firecrawl_client = Firecrawl(**kwargs)
|
||||
global _firecrawl_client, _firecrawl_client_config
|
||||
|
||||
direct_config = _get_direct_firecrawl_config()
|
||||
if direct_config is not None:
|
||||
kwargs, client_config = direct_config
|
||||
else:
|
||||
managed_gateway = resolve_managed_tool_gateway(
|
||||
"firecrawl",
|
||||
token_reader=_read_nous_access_token,
|
||||
)
|
||||
if managed_gateway is None:
|
||||
logger.error("Firecrawl client initialization failed: missing direct config and tool-gateway auth.")
|
||||
_raise_web_backend_configuration_error()
|
||||
|
||||
kwargs = {
|
||||
"api_key": managed_gateway.nous_user_token,
|
||||
"api_url": managed_gateway.gateway_origin,
|
||||
}
|
||||
client_config = (
|
||||
"tool-gateway",
|
||||
kwargs["api_url"],
|
||||
managed_gateway.nous_user_token,
|
||||
)
|
||||
|
||||
if _firecrawl_client is not None and _firecrawl_client_config == client_config:
|
||||
return _firecrawl_client
|
||||
|
||||
_firecrawl_client = Firecrawl(**kwargs)
|
||||
_firecrawl_client_config = client_config
|
||||
return _firecrawl_client
|
||||
|
||||
# ─── Parallel Client ─────────────────────────────────────────────────────────
|
||||
|
|
@ -304,10 +423,115 @@ def _normalize_tavily_documents(response: dict, fallback_url: str = "") -> List[
|
|||
return documents
|
||||
|
||||
|
||||
def _to_plain_object(value: Any) -> Any:
|
||||
"""Convert SDK objects to plain python data structures when possible."""
|
||||
if value is None:
|
||||
return None
|
||||
|
||||
if isinstance(value, (dict, list, str, int, float, bool)):
|
||||
return value
|
||||
|
||||
if hasattr(value, "model_dump"):
|
||||
try:
|
||||
return value.model_dump()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if hasattr(value, "__dict__"):
|
||||
try:
|
||||
return {k: v for k, v in value.__dict__.items() if not k.startswith("_")}
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return value
|
||||
|
||||
|
||||
def _normalize_result_list(values: Any) -> List[Dict[str, Any]]:
|
||||
"""Normalize mixed SDK/list payloads into a list of dicts."""
|
||||
if not isinstance(values, list):
|
||||
return []
|
||||
|
||||
normalized: List[Dict[str, Any]] = []
|
||||
for item in values:
|
||||
plain = _to_plain_object(item)
|
||||
if isinstance(plain, dict):
|
||||
normalized.append(plain)
|
||||
return normalized
|
||||
|
||||
|
||||
def _extract_web_search_results(response: Any) -> List[Dict[str, Any]]:
    """Pull the list of search hits out of any known Firecrawl response shape.

    Handles ``{"data": [...]}``, ``{"data": {"web"/"results": [...]}}``,
    top-level ``"web"``/``"results"`` keys, and SDK objects exposing a
    ``web`` attribute. Returns an empty list when nothing matches.
    """
    plain = _to_plain_object(response)

    if isinstance(plain, dict):
        data = plain.get("data")
        if isinstance(data, list):
            return _normalize_result_list(data)

        if isinstance(data, dict):
            # Gateway responses nest results one level deeper under "data".
            for key in ("web", "results"):
                nested = _normalize_result_list(data.get(key))
                if nested:
                    return nested

        for key in ("web", "results"):
            top_level = _normalize_result_list(plain.get(key))
            if top_level:
                return top_level

    # SDK objects may expose results only as an attribute.
    if hasattr(response, "web"):
        return _normalize_result_list(getattr(response, "web", []))

    return []
|
||||
|
||||
|
||||
def _extract_scrape_payload(scrape_result: Any) -> Dict[str, Any]:
    """Unwrap a Firecrawl scrape response into its payload dict.

    Some response variants nest the payload under a "data" key; others are
    the payload itself. Anything that isn't dict-like becomes an empty dict.
    """
    payload = _to_plain_object(scrape_result)
    if not isinstance(payload, dict):
        return {}

    inner = payload.get("data")
    return inner if isinstance(inner, dict) else payload
|
||||
|
||||
|
||||
DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION = 5000
|
||||
|
||||
# Allow per-task override via env var
|
||||
DEFAULT_SUMMARIZER_MODEL = os.getenv("AUXILIARY_WEB_EXTRACT_MODEL", "").strip() or None
|
||||
def _is_nous_auxiliary_client(client: Any) -> bool:
|
||||
"""Return True when the resolved auxiliary backend is Nous Portal."""
|
||||
from urllib.parse import urlparse
|
||||
|
||||
base_url = str(getattr(client, "base_url", "") or "")
|
||||
host = (urlparse(base_url).hostname or "").lower()
|
||||
return host == "nousresearch.com" or host.endswith(".nousresearch.com")
|
||||
|
||||
|
||||
def _resolve_web_extract_auxiliary(model: Optional[str] = None) -> tuple[Optional[Any], Optional[str], Dict[str, Any]]:
    """Resolve the (client, model, extra_body) triple for web-extract summarization.

    Model precedence: the explicit *model* argument, then the
    AUXILIARY_WEB_EXTRACT_MODEL env var, then the backend's default model.
    An extra request body is attached only for the Nous Portal backend.
    """
    client, default_model = get_async_text_auxiliary_client("web_extract")

    env_override = os.getenv("AUXILIARY_WEB_EXTRACT_MODEL", "").strip()
    chosen_model = model or env_override or default_model

    body: Dict[str, Any] = {}
    if client is not None and _is_nous_auxiliary_client(client):
        # Local import keeps the Nous-specific helper off the default path.
        from agent.auxiliary_client import get_auxiliary_extra_body

        body = get_auxiliary_extra_body() or {"tags": ["product=hermes-agent"]}

    return client, chosen_model, body
|
||||
|
||||
|
||||
def _get_default_summarizer_model() -> Optional[str]:
    """Look up the model that web-extraction summarization uses by default."""
    resolved = _resolve_web_extract_auxiliary()
    return resolved[1]
|
||||
|
||||
_debug = DebugSession("web_tools", env_var="WEB_TOOLS_DEBUG")
|
||||
|
||||
|
|
@ -316,7 +540,7 @@ async def process_content_with_llm(
|
|||
content: str,
|
||||
url: str = "",
|
||||
title: str = "",
|
||||
model: str = DEFAULT_SUMMARIZER_MODEL,
|
||||
model: Optional[str] = None,
|
||||
min_length: int = DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION
|
||||
) -> Optional[str]:
|
||||
"""
|
||||
|
|
@ -392,14 +616,30 @@ async def process_content_with_llm(
|
|||
return processed_content
|
||||
|
||||
except Exception as e:
|
||||
logger.debug("Error processing content with LLM: %s", e)
|
||||
return f"[Failed to process content: {str(e)[:100]}. Content size: {len(content):,} chars]"
|
||||
logger.warning(
|
||||
"web_extract LLM summarization failed (%s). "
|
||||
"Tip: increase auxiliary.web_extract.timeout in config.yaml "
|
||||
"or switch to a faster auxiliary model.",
|
||||
str(e)[:120],
|
||||
)
|
||||
# Fall back to truncated raw content instead of returning a useless
|
||||
# error message. The first ~5000 chars are almost always more useful
|
||||
# to the model than "[Failed to process content: ...]".
|
||||
truncated = content[:MAX_OUTPUT_SIZE]
|
||||
if len(content) > MAX_OUTPUT_SIZE:
|
||||
truncated += (
|
||||
f"\n\n[Content truncated — showing first {MAX_OUTPUT_SIZE:,} of "
|
||||
f"{len(content):,} chars. LLM summarization timed out. "
|
||||
f"To fix: increase auxiliary.web_extract.timeout in config.yaml, "
|
||||
f"or use a faster auxiliary model. Use browser_navigate for the full page.]"
|
||||
)
|
||||
return truncated
|
||||
|
||||
|
||||
async def _call_summarizer_llm(
|
||||
content: str,
|
||||
context_str: str,
|
||||
model: str,
|
||||
model: Optional[str],
|
||||
max_tokens: int = 20000,
|
||||
is_chunk: bool = False,
|
||||
chunk_info: str = ""
|
||||
|
|
@ -458,24 +698,33 @@ Your goal is to preserve ALL important information while reducing length. Never
|
|||
|
||||
Create a markdown summary that captures all key information in a well-organized, scannable format. Include important quotes and code snippets in their original formatting. Focus on actionable information, specific details, and unique insights."""
|
||||
|
||||
# Call the LLM with retry logic
|
||||
max_retries = 6
|
||||
# Call the LLM with retry logic — keep retries low since summarization
|
||||
# is a nice-to-have; the caller falls back to truncated content on failure.
|
||||
max_retries = 2
|
||||
retry_delay = 2
|
||||
last_error = None
|
||||
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
aux_client, effective_model, extra_body = _resolve_web_extract_auxiliary(model)
|
||||
if aux_client is None or not effective_model:
|
||||
logger.warning("No auxiliary model available for web content processing")
|
||||
return None
|
||||
call_kwargs = {
|
||||
"task": "web_extract",
|
||||
"model": effective_model,
|
||||
"messages": [
|
||||
{"role": "system", "content": system_prompt},
|
||||
{"role": "user", "content": user_prompt}
|
||||
{"role": "user", "content": user_prompt},
|
||||
],
|
||||
"temperature": 0.1,
|
||||
"max_tokens": max_tokens,
|
||||
# No explicit timeout — async_call_llm reads auxiliary.web_extract.timeout
|
||||
# from config (default 360s / 6min). Users with slow local models can
|
||||
# increase it in config.yaml.
|
||||
}
|
||||
if model:
|
||||
call_kwargs["model"] = model
|
||||
if extra_body:
|
||||
call_kwargs["extra_body"] = extra_body
|
||||
response = await async_call_llm(**call_kwargs)
|
||||
content = extract_content_or_reasoning(response)
|
||||
if content:
|
||||
|
|
@ -506,7 +755,7 @@ Create a markdown summary that captures all key information in a well-organized,
|
|||
async def _process_large_content_chunked(
|
||||
content: str,
|
||||
context_str: str,
|
||||
model: str,
|
||||
model: Optional[str],
|
||||
chunk_size: int,
|
||||
max_output_size: int
|
||||
) -> Optional[str]:
|
||||
|
|
@ -593,17 +842,26 @@ Synthesize these into ONE cohesive, comprehensive summary that:
|
|||
Create a single, unified markdown summary."""
|
||||
|
||||
try:
|
||||
aux_client, effective_model, extra_body = _resolve_web_extract_auxiliary(model)
|
||||
if aux_client is None or not effective_model:
|
||||
logger.warning("No auxiliary model for synthesis, concatenating summaries")
|
||||
fallback = "\n\n".join(summaries)
|
||||
if len(fallback) > max_output_size:
|
||||
fallback = fallback[:max_output_size] + "\n\n[... truncated ...]"
|
||||
return fallback
|
||||
|
||||
call_kwargs = {
|
||||
"task": "web_extract",
|
||||
"model": effective_model,
|
||||
"messages": [
|
||||
{"role": "system", "content": "You synthesize multiple summaries into one cohesive, comprehensive summary. Be thorough but concise."},
|
||||
{"role": "user", "content": synthesis_prompt}
|
||||
{"role": "user", "content": synthesis_prompt},
|
||||
],
|
||||
"temperature": 0.1,
|
||||
"max_tokens": 20000,
|
||||
}
|
||||
if model:
|
||||
call_kwargs["model"] = model
|
||||
if extra_body:
|
||||
call_kwargs["extra_body"] = extra_body
|
||||
response = await async_call_llm(**call_kwargs)
|
||||
final_summary = extract_content_or_reasoning(response)
|
||||
|
||||
|
|
@ -613,6 +871,14 @@ Create a single, unified markdown summary."""
|
|||
response = await async_call_llm(**call_kwargs)
|
||||
final_summary = extract_content_or_reasoning(response)
|
||||
|
||||
# If still None after retry, fall back to concatenated summaries
|
||||
if not final_summary:
|
||||
logger.warning("Synthesis failed after retry — concatenating chunk summaries")
|
||||
fallback = "\n\n".join(summaries)
|
||||
if len(fallback) > max_output_size:
|
||||
fallback = fallback[:max_output_size] + "\n\n[... truncated ...]"
|
||||
return fallback
|
||||
|
||||
# Enforce hard cap
|
||||
if len(final_summary) > max_output_size:
|
||||
final_summary = final_summary[:max_output_size] + "\n\n[... summary truncated for context management ...]"
|
||||
|
|
@ -875,7 +1141,7 @@ def web_search_tool(query: str, limit: int = 5) -> str:
|
|||
try:
|
||||
from tools.interrupt import is_interrupted
|
||||
if is_interrupted():
|
||||
return json.dumps({"error": "Interrupted", "success": False})
|
||||
return tool_error("Interrupted", success=False)
|
||||
|
||||
# Dispatch to the configured backend
|
||||
backend = _get_backend()
|
||||
|
|
@ -935,35 +1201,7 @@ def web_search_tool(query: str, limit: int = 5) -> str:
|
|||
limit=limit
|
||||
)
|
||||
|
||||
# The response is a SearchData object with web, news, and images attributes
|
||||
# When not scraping, the results are directly in these attributes
|
||||
web_results = []
|
||||
|
||||
# Check if response has web attribute (SearchData object)
|
||||
if hasattr(response, 'web'):
|
||||
# Response is a SearchData object with web attribute
|
||||
if response.web:
|
||||
# Convert each SearchResultWeb object to dict
|
||||
for result in response.web:
|
||||
if hasattr(result, 'model_dump'):
|
||||
# Pydantic model - use model_dump
|
||||
web_results.append(result.model_dump())
|
||||
elif hasattr(result, '__dict__'):
|
||||
# Regular object - use __dict__
|
||||
web_results.append(result.__dict__)
|
||||
elif isinstance(result, dict):
|
||||
# Already a dict
|
||||
web_results.append(result)
|
||||
elif hasattr(response, 'model_dump'):
|
||||
# Response has model_dump method - use it to get dict
|
||||
response_dict = response.model_dump()
|
||||
if 'web' in response_dict and response_dict['web']:
|
||||
web_results = response_dict['web']
|
||||
elif isinstance(response, dict):
|
||||
# Response is already a dictionary
|
||||
if 'web' in response and response['web']:
|
||||
web_results = response['web']
|
||||
|
||||
web_results = _extract_web_search_results(response)
|
||||
results_count = len(web_results)
|
||||
logger.info("Found %d search results", results_count)
|
||||
|
||||
|
|
@ -992,33 +1230,35 @@ def web_search_tool(query: str, limit: int = 5) -> str:
|
|||
except Exception as e:
|
||||
error_msg = f"Error searching web: {str(e)}"
|
||||
logger.debug("%s", error_msg)
|
||||
|
||||
|
||||
debug_call_data["error"] = error_msg
|
||||
_debug.log_call("web_search_tool", debug_call_data)
|
||||
_debug.save()
|
||||
|
||||
return json.dumps({"error": error_msg}, ensure_ascii=False)
|
||||
|
||||
return tool_error(error_msg)
|
||||
|
||||
|
||||
async def web_extract_tool(
|
||||
urls: List[str],
|
||||
format: str = None,
|
||||
urls: List[str],
|
||||
format: str = None,
|
||||
use_llm_processing: bool = True,
|
||||
model: str = DEFAULT_SUMMARIZER_MODEL,
|
||||
model: Optional[str] = None,
|
||||
min_length: int = DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION
|
||||
) -> str:
|
||||
"""
|
||||
Extract content from specific web pages using available extraction API backend.
|
||||
|
||||
|
||||
This function provides a generic interface for web content extraction that
|
||||
can work with multiple backends. Currently uses Firecrawl.
|
||||
|
||||
|
||||
Args:
|
||||
urls (List[str]): List of URLs to extract content from
|
||||
format (str): Desired output format ("markdown" or "html", optional)
|
||||
use_llm_processing (bool): Whether to process content with LLM for summarization (default: True)
|
||||
model (str): The model to use for LLM processing (default: google/gemini-3-flash-preview)
|
||||
model (Optional[str]): The model to use for LLM processing (defaults to current auxiliary backend model)
|
||||
min_length (int): Minimum content length to trigger LLM processing (default: 5000)
|
||||
|
||||
Security: URLs are checked for embedded secrets before fetching.
|
||||
|
||||
Returns:
|
||||
str: JSON string containing extracted content. If LLM processing is enabled and successful,
|
||||
|
|
@ -1027,6 +1267,18 @@ async def web_extract_tool(
|
|||
Raises:
|
||||
Exception: If extraction fails or API key is not set
|
||||
"""
|
||||
# Block URLs containing embedded secrets (exfiltration prevention).
|
||||
# URL-decode first so percent-encoded secrets (%73k- = sk-) are caught.
|
||||
from agent.redact import _PREFIX_RE
|
||||
from urllib.parse import unquote
|
||||
for _url in urls:
|
||||
if _PREFIX_RE.search(_url) or _PREFIX_RE.search(unquote(_url)):
|
||||
return json.dumps({
|
||||
"success": False,
|
||||
"error": "Blocked: URL contains what appears to be an API key or token. "
|
||||
"Secrets must not be sent in URLs.",
|
||||
})
|
||||
|
||||
debug_call_data = {
|
||||
"parameters": {
|
||||
"urls": urls,
|
||||
|
|
@ -1114,44 +1366,30 @@ async def web_extract_tool(
|
|||
|
||||
try:
|
||||
logger.info("Scraping: %s", url)
|
||||
scrape_result = _get_firecrawl_client().scrape(
|
||||
url=url,
|
||||
formats=formats
|
||||
)
|
||||
# Run synchronous Firecrawl scrape in a thread with a
|
||||
# 60s timeout so a hung fetch doesn't block the session.
|
||||
try:
|
||||
scrape_result = await asyncio.wait_for(
|
||||
asyncio.to_thread(
|
||||
_get_firecrawl_client().scrape,
|
||||
url=url,
|
||||
formats=formats,
|
||||
),
|
||||
timeout=60,
|
||||
)
|
||||
except asyncio.TimeoutError:
|
||||
logger.warning("Firecrawl scrape timed out for %s", url)
|
||||
results.append({
|
||||
"url": url, "title": "", "content": "",
|
||||
"error": "Scrape timed out after 60s — page may be too large or unresponsive. Try browser_navigate instead.",
|
||||
})
|
||||
continue
|
||||
|
||||
# Process the result - properly handle object serialization
|
||||
metadata = {}
|
||||
scrape_payload = _extract_scrape_payload(scrape_result)
|
||||
metadata = scrape_payload.get("metadata", {})
|
||||
title = ""
|
||||
content_markdown = None
|
||||
content_html = None
|
||||
|
||||
# Extract data from the scrape result
|
||||
if hasattr(scrape_result, 'model_dump'):
|
||||
# Pydantic model - use model_dump to get dict
|
||||
result_dict = scrape_result.model_dump()
|
||||
content_markdown = result_dict.get('markdown')
|
||||
content_html = result_dict.get('html')
|
||||
metadata = result_dict.get('metadata', {})
|
||||
elif hasattr(scrape_result, '__dict__'):
|
||||
# Regular object with attributes
|
||||
content_markdown = getattr(scrape_result, 'markdown', None)
|
||||
content_html = getattr(scrape_result, 'html', None)
|
||||
|
||||
# Handle metadata - convert to dict if it's an object
|
||||
metadata_obj = getattr(scrape_result, 'metadata', {})
|
||||
if hasattr(metadata_obj, 'model_dump'):
|
||||
metadata = metadata_obj.model_dump()
|
||||
elif hasattr(metadata_obj, '__dict__'):
|
||||
metadata = metadata_obj.__dict__
|
||||
elif isinstance(metadata_obj, dict):
|
||||
metadata = metadata_obj
|
||||
else:
|
||||
metadata = {}
|
||||
elif isinstance(scrape_result, dict):
|
||||
# Already a dictionary
|
||||
content_markdown = scrape_result.get('markdown')
|
||||
content_html = scrape_result.get('html')
|
||||
metadata = scrape_result.get('metadata', {})
|
||||
content_markdown = scrape_payload.get("markdown")
|
||||
content_html = scrape_payload.get("html")
|
||||
|
||||
# Ensure metadata is a dict (not an object)
|
||||
if not isinstance(metadata, dict):
|
||||
|
|
@ -1209,9 +1447,11 @@ async def web_extract_tool(
|
|||
|
||||
debug_call_data["pages_extracted"] = pages_extracted
|
||||
debug_call_data["original_response_size"] = len(json.dumps(response))
|
||||
effective_model = model or _get_default_summarizer_model()
|
||||
auxiliary_available = check_auxiliary_model()
|
||||
|
||||
# Process each result with LLM if enabled
|
||||
if use_llm_processing:
|
||||
if use_llm_processing and auxiliary_available:
|
||||
logger.info("Processing extracted content with LLM (parallel)...")
|
||||
debug_call_data["processing_applied"].append("llm_processing")
|
||||
|
||||
|
|
@ -1229,7 +1469,7 @@ async def web_extract_tool(
|
|||
|
||||
# Process content with LLM
|
||||
processed = await process_content_with_llm(
|
||||
raw_content, url, title, model, min_length
|
||||
raw_content, url, title, effective_model, min_length
|
||||
)
|
||||
|
||||
if processed:
|
||||
|
|
@ -1245,7 +1485,7 @@ async def web_extract_tool(
|
|||
"original_size": original_size,
|
||||
"processed_size": processed_size,
|
||||
"compression_ratio": compression_ratio,
|
||||
"model_used": model
|
||||
"model_used": effective_model
|
||||
}
|
||||
return result, metrics, "processed"
|
||||
else:
|
||||
|
|
@ -1277,6 +1517,9 @@ async def web_extract_tool(
|
|||
else:
|
||||
logger.warning("%s (no content to process)", url)
|
||||
else:
|
||||
if use_llm_processing and not auxiliary_available:
|
||||
logger.warning("LLM processing requested but no auxiliary model available, returning raw content")
|
||||
debug_call_data["processing_applied"].append("llm_processing_unavailable")
|
||||
# Print summary of extracted pages for debugging (original behavior)
|
||||
for result in response.get('results', []):
|
||||
url = result.get('url', 'Unknown URL')
|
||||
|
|
@ -1297,7 +1540,7 @@ async def web_extract_tool(
|
|||
trimmed_response = {"results": trimmed_results}
|
||||
|
||||
if trimmed_response.get("results") == []:
|
||||
result_json = json.dumps({"error": "Content was inaccessible or not found"}, ensure_ascii=False)
|
||||
result_json = tool_error("Content was inaccessible or not found")
|
||||
|
||||
cleaned_result = clean_base64_images(result_json)
|
||||
|
||||
|
|
@ -1323,7 +1566,7 @@ async def web_extract_tool(
|
|||
_debug.log_call("web_extract_tool", debug_call_data)
|
||||
_debug.save()
|
||||
|
||||
return json.dumps({"error": error_msg}, ensure_ascii=False)
|
||||
return tool_error(error_msg)
|
||||
|
||||
|
||||
async def web_crawl_tool(
|
||||
|
|
@ -1331,7 +1574,7 @@ async def web_crawl_tool(
|
|||
instructions: str = None,
|
||||
depth: str = "basic",
|
||||
use_llm_processing: bool = True,
|
||||
model: str = DEFAULT_SUMMARIZER_MODEL,
|
||||
model: Optional[str] = None,
|
||||
min_length: int = DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION
|
||||
) -> str:
|
||||
"""
|
||||
|
|
@ -1345,7 +1588,7 @@ async def web_crawl_tool(
|
|||
instructions (str): Instructions for what to crawl/extract using LLM intelligence (optional)
|
||||
depth (str): Depth of extraction ("basic" or "advanced", default: "basic")
|
||||
use_llm_processing (bool): Whether to process content with LLM for summarization (default: True)
|
||||
model (str): The model to use for LLM processing (default: google/gemini-3-flash-preview)
|
||||
model (Optional[str]): The model to use for LLM processing (defaults to current auxiliary backend model)
|
||||
min_length (int): Minimum content length to trigger LLM processing (default: 5000)
|
||||
|
||||
Returns:
|
||||
|
|
@ -1375,6 +1618,8 @@ async def web_crawl_tool(
|
|||
}
|
||||
|
||||
try:
|
||||
effective_model = model or _get_default_summarizer_model()
|
||||
auxiliary_available = check_auxiliary_model()
|
||||
backend = _get_backend()
|
||||
|
||||
# Tavily supports crawl via its /crawl endpoint
|
||||
|
|
@ -1397,7 +1642,7 @@ async def web_crawl_tool(
|
|||
|
||||
from tools.interrupt import is_interrupted as _is_int
|
||||
if _is_int():
|
||||
return json.dumps({"error": "Interrupted", "success": False})
|
||||
return tool_error("Interrupted", success=False)
|
||||
|
||||
logger.info("Tavily crawl: %s", url)
|
||||
payload: Dict[str, Any] = {
|
||||
|
|
@ -1419,7 +1664,7 @@ async def web_crawl_tool(
|
|||
debug_call_data["original_response_size"] = len(json.dumps(response))
|
||||
|
||||
# Process each result with LLM if enabled
|
||||
if use_llm_processing:
|
||||
if use_llm_processing and auxiliary_available:
|
||||
logger.info("Processing crawled content with LLM (parallel)...")
|
||||
debug_call_data["processing_applied"].append("llm_processing")
|
||||
|
||||
|
|
@ -1430,12 +1675,12 @@ async def web_crawl_tool(
|
|||
if not content:
|
||||
return result, None, "no_content"
|
||||
original_size = len(content)
|
||||
processed = await process_content_with_llm(content, page_url, title, model, min_length)
|
||||
processed = await process_content_with_llm(content, page_url, title, effective_model, min_length)
|
||||
if processed:
|
||||
result['raw_content'] = content
|
||||
result['content'] = processed
|
||||
metrics = {"url": page_url, "original_size": original_size, "processed_size": len(processed),
|
||||
"compression_ratio": len(processed) / original_size if original_size else 1.0, "model_used": model}
|
||||
"compression_ratio": len(processed) / original_size if original_size else 1.0, "model_used": effective_model}
|
||||
return result, metrics, "processed"
|
||||
metrics = {"url": page_url, "original_size": original_size, "processed_size": original_size,
|
||||
"compression_ratio": 1.0, "model_used": None, "reason": "content_too_short"}
|
||||
|
|
@ -1448,6 +1693,10 @@ async def web_crawl_tool(
|
|||
debug_call_data["compression_metrics"].append(metrics)
|
||||
debug_call_data["pages_processed_with_llm"] += 1
|
||||
|
||||
if use_llm_processing and not auxiliary_available:
|
||||
logger.warning("LLM processing requested but no auxiliary model available, returning raw content")
|
||||
debug_call_data["processing_applied"].append("llm_processing_unavailable")
|
||||
|
||||
trimmed_results = [{"url": r.get("url", ""), "title": r.get("title", ""), "content": r.get("content", ""), "error": r.get("error"),
|
||||
**({ "blocked_by_policy": r["blocked_by_policy"]} if "blocked_by_policy" in r else {})} for r in response.get("results", [])]
|
||||
result_json = json.dumps({"results": trimmed_results}, indent=2, ensure_ascii=False)
|
||||
|
|
@ -1457,11 +1706,11 @@ async def web_crawl_tool(
|
|||
_debug.save()
|
||||
return cleaned_result
|
||||
|
||||
# web_crawl requires Firecrawl — Parallel has no crawl API
|
||||
if not (os.getenv("FIRECRAWL_API_KEY") or os.getenv("FIRECRAWL_API_URL")):
|
||||
# web_crawl requires Firecrawl or the Firecrawl tool-gateway — Parallel has no crawl API
|
||||
if not check_firecrawl_api_key():
|
||||
return json.dumps({
|
||||
"error": "web_crawl requires Firecrawl. Set FIRECRAWL_API_KEY, "
|
||||
"or use web_search + web_extract instead.",
|
||||
"error": "web_crawl requires Firecrawl. Set FIRECRAWL_API_KEY, FIRECRAWL_API_URL"
|
||||
f"{_firecrawl_backend_help_suffix()}, or use web_search + web_extract instead.",
|
||||
"success": False,
|
||||
}, ensure_ascii=False)
|
||||
|
||||
|
|
@ -1504,7 +1753,7 @@ async def web_crawl_tool(
|
|||
|
||||
from tools.interrupt import is_interrupted as _is_int
|
||||
if _is_int():
|
||||
return json.dumps({"error": "Interrupted", "success": False})
|
||||
return tool_error("Interrupted", success=False)
|
||||
|
||||
try:
|
||||
crawl_result = _get_firecrawl_client().crawl(
|
||||
|
|
@ -1621,7 +1870,7 @@ async def web_crawl_tool(
|
|||
debug_call_data["original_response_size"] = len(json.dumps(response))
|
||||
|
||||
# Process each result with LLM if enabled
|
||||
if use_llm_processing:
|
||||
if use_llm_processing and auxiliary_available:
|
||||
logger.info("Processing crawled content with LLM (parallel)...")
|
||||
debug_call_data["processing_applied"].append("llm_processing")
|
||||
|
||||
|
|
@ -1639,7 +1888,7 @@ async def web_crawl_tool(
|
|||
|
||||
# Process content with LLM
|
||||
processed = await process_content_with_llm(
|
||||
content, page_url, title, model, min_length
|
||||
content, page_url, title, effective_model, min_length
|
||||
)
|
||||
|
||||
if processed:
|
||||
|
|
@ -1655,7 +1904,7 @@ async def web_crawl_tool(
|
|||
"original_size": original_size,
|
||||
"processed_size": processed_size,
|
||||
"compression_ratio": compression_ratio,
|
||||
"model_used": model
|
||||
"model_used": effective_model
|
||||
}
|
||||
return result, metrics, "processed"
|
||||
else:
|
||||
|
|
@ -1687,6 +1936,9 @@ async def web_crawl_tool(
|
|||
else:
|
||||
logger.warning("%s (no content to process)", page_url)
|
||||
else:
|
||||
if use_llm_processing and not auxiliary_available:
|
||||
logger.warning("LLM processing requested but no auxiliary model available, returning raw content")
|
||||
debug_call_data["processing_applied"].append("llm_processing_unavailable")
|
||||
# Print summary of crawled pages for debugging (original behavior)
|
||||
for result in response.get('results', []):
|
||||
page_url = result.get('url', 'Unknown URL')
|
||||
|
|
@ -1727,42 +1979,37 @@ async def web_crawl_tool(
|
|||
_debug.log_call("web_crawl_tool", debug_call_data)
|
||||
_debug.save()
|
||||
|
||||
return json.dumps({"error": error_msg}, ensure_ascii=False)
|
||||
return tool_error(error_msg)
|
||||
|
||||
|
||||
# Convenience function to check if API key is available
|
||||
# Convenience function to check Firecrawl credentials
|
||||
def check_firecrawl_api_key() -> bool:
|
||||
"""
|
||||
Check if the Firecrawl API key is available in environment variables.
|
||||
Check whether the Firecrawl backend is available.
|
||||
|
||||
Availability is true when either:
|
||||
1) direct Firecrawl config (`FIRECRAWL_API_KEY` or `FIRECRAWL_API_URL`), or
|
||||
2) Firecrawl gateway origin + Nous Subscriber access token
|
||||
(fallback when direct Firecrawl is not configured).
|
||||
|
||||
Returns:
|
||||
bool: True if API key is set, False otherwise
|
||||
bool: True if direct Firecrawl or the tool-gateway can be used.
|
||||
"""
|
||||
return bool(os.getenv("FIRECRAWL_API_KEY"))
|
||||
return _has_direct_firecrawl_config() or _is_tool_gateway_ready()
|
||||
|
||||
|
||||
def check_web_api_key() -> bool:
|
||||
"""Check if any web backend API key is available (Exa, Parallel, Firecrawl, or Tavily)."""
|
||||
return bool(
|
||||
os.getenv("EXA_API_KEY")
|
||||
or os.getenv("PARALLEL_API_KEY")
|
||||
or os.getenv("FIRECRAWL_API_KEY")
|
||||
or os.getenv("FIRECRAWL_API_URL")
|
||||
or os.getenv("TAVILY_API_KEY")
|
||||
)
|
||||
"""Check whether the configured web backend is available."""
|
||||
configured = _load_web_config().get("backend", "").lower().strip()
|
||||
if configured in ("exa", "parallel", "firecrawl", "tavily"):
|
||||
return _is_backend_available(configured)
|
||||
return any(_is_backend_available(backend) for backend in ("exa", "parallel", "firecrawl", "tavily"))
|
||||
|
||||
|
||||
def check_auxiliary_model() -> bool:
|
||||
"""Check if an auxiliary text model is available for LLM content processing."""
|
||||
try:
|
||||
from agent.auxiliary_client import resolve_provider_client
|
||||
for p in ("openrouter", "nous", "custom", "codex"):
|
||||
client, _ = resolve_provider_client(p)
|
||||
if client is not None:
|
||||
return True
|
||||
return False
|
||||
except Exception:
|
||||
return False
|
||||
client, _, _ = _resolve_web_extract_auxiliary()
|
||||
return client is not None
|
||||
|
||||
|
||||
def get_debug_session_info() -> Dict[str, Any]:
|
||||
|
|
@ -1779,7 +2026,11 @@ if __name__ == "__main__":
|
|||
|
||||
# Check if API keys are available
|
||||
web_available = check_web_api_key()
|
||||
tool_gateway_available = _is_tool_gateway_ready()
|
||||
firecrawl_key_available = bool(os.getenv("FIRECRAWL_API_KEY", "").strip())
|
||||
firecrawl_url_available = bool(os.getenv("FIRECRAWL_API_URL", "").strip())
|
||||
nous_available = check_auxiliary_model()
|
||||
default_summarizer_model = _get_default_summarizer_model()
|
||||
|
||||
if web_available:
|
||||
backend = _get_backend()
|
||||
|
|
@ -1791,17 +2042,27 @@ if __name__ == "__main__":
|
|||
elif backend == "tavily":
|
||||
print(" Using Tavily API (https://tavily.com)")
|
||||
else:
|
||||
print(" Using Firecrawl API (https://firecrawl.dev)")
|
||||
if firecrawl_url_available:
|
||||
print(f" Using self-hosted Firecrawl: {os.getenv('FIRECRAWL_API_URL').strip().rstrip('/')}")
|
||||
elif firecrawl_key_available:
|
||||
print(" Using direct Firecrawl cloud API")
|
||||
elif tool_gateway_available:
|
||||
print(f" Using Firecrawl tool-gateway: {_get_firecrawl_gateway_url()}")
|
||||
else:
|
||||
print(" Firecrawl backend selected but not configured")
|
||||
else:
|
||||
print("❌ No web search backend configured")
|
||||
print("Set EXA_API_KEY, PARALLEL_API_KEY, TAVILY_API_KEY, or FIRECRAWL_API_KEY")
|
||||
print(
|
||||
"Set EXA_API_KEY, PARALLEL_API_KEY, TAVILY_API_KEY, FIRECRAWL_API_KEY, FIRECRAWL_API_URL"
|
||||
f"{_firecrawl_backend_help_suffix()}"
|
||||
)
|
||||
|
||||
if not nous_available:
|
||||
print("❌ No auxiliary model available for LLM content processing")
|
||||
print("Set OPENROUTER_API_KEY, configure Nous Portal, or set OPENAI_BASE_URL + OPENAI_API_KEY")
|
||||
print("⚠️ Without an auxiliary model, LLM content processing will be disabled")
|
||||
else:
|
||||
print(f"✅ Auxiliary model available: {DEFAULT_SUMMARIZER_MODEL}")
|
||||
print(f"✅ Auxiliary model available: {default_summarizer_model}")
|
||||
|
||||
if not web_available:
|
||||
exit(1)
|
||||
|
|
@ -1809,7 +2070,7 @@ if __name__ == "__main__":
|
|||
print("🛠️ Web tools ready for use!")
|
||||
|
||||
if nous_available:
|
||||
print(f"🧠 LLM content processing available with {DEFAULT_SUMMARIZER_MODEL}")
|
||||
print(f"🧠 LLM content processing available with {default_summarizer_model}")
|
||||
print(f" Default min length for processing: {DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION} chars")
|
||||
|
||||
# Show debug mode status
|
||||
|
|
@ -1864,7 +2125,7 @@ if __name__ == "__main__":
|
|||
# ---------------------------------------------------------------------------
|
||||
# Registry
|
||||
# ---------------------------------------------------------------------------
|
||||
from tools.registry import registry
|
||||
from tools.registry import registry, tool_error
|
||||
|
||||
WEB_SEARCH_SCHEMA = {
|
||||
"name": "web_search",
|
||||
|
|
@ -1904,8 +2165,9 @@ registry.register(
|
|||
schema=WEB_SEARCH_SCHEMA,
|
||||
handler=lambda args, **kw: web_search_tool(args.get("query", ""), limit=5),
|
||||
check_fn=check_web_api_key,
|
||||
requires_env=["EXA_API_KEY", "PARALLEL_API_KEY", "FIRECRAWL_API_KEY", "TAVILY_API_KEY"],
|
||||
requires_env=_web_requires_env(),
|
||||
emoji="🔍",
|
||||
max_result_size_chars=100_000,
|
||||
)
|
||||
registry.register(
|
||||
name="web_extract",
|
||||
|
|
@ -1914,7 +2176,8 @@ registry.register(
|
|||
handler=lambda args, **kw: web_extract_tool(
|
||||
args.get("urls", [])[:5] if isinstance(args.get("urls"), list) else [], "markdown"),
|
||||
check_fn=check_web_api_key,
|
||||
requires_env=["EXA_API_KEY", "PARALLEL_API_KEY", "FIRECRAWL_API_KEY", "TAVILY_API_KEY"],
|
||||
requires_env=_web_requires_env(),
|
||||
is_async=True,
|
||||
emoji="📄",
|
||||
max_result_size_chars=100_000,
|
||||
)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue