feat: add SearXNG as a native web search backend

Adds SearXNG (https://docs.searxng.org) as a self-hosted, privacy-first
web search backend alongside Firecrawl, Tavily, Exa, and Parallel.

SearXNG is a meta-search engine that aggregates results from 70+ search
engines. No API key needed -- just set SEARXNG_URL to your instance.

Changes:
- tools/web_tools.py: _get_searxng_url(), _searxng_search(), search
  dispatch, extract falls back to Firecrawl (SearXNG is search-only)
- hermes_cli/tools_config.py: SearXNG provider in web tool picker
- hermes_cli/config.py: SEARXNG_URL env var, diagnostics, set command
- tests/tools/test_web_tools_searxng.py: 15 tests
- optional-skills/research/searxng-search/: agent-guided skill
- Docs: configuration.md, environment-variables.md, skills catalogs

Based on #6071 by @gnanam1990, #8106 by @cro, #2572 by @bhovig,
#2710 and #9961 by @StreamOfRon, #7258 by @coldxiangyu163
This commit is contained in:
kshitijk4poor 2026-04-17 14:37:59 +05:30
parent 2367c6ffd5
commit 130b021d74
10 changed files with 600 additions and 6 deletions

View file

@ -16,7 +16,8 @@ Backend compatibility:
- Exa: https://exa.ai (search, extract)
- Firecrawl: https://docs.firecrawl.dev/introduction (search, extract, crawl; direct or derived firecrawl-gateway.<domain> for Nous Subscribers)
- Parallel: https://docs.parallel.ai (search, extract)
- Tavily: https://tavily.com (search, extract, crawl)
- SearXNG: https://docs.searxng.org (search; self-hosted open-source federated search)
- Tavily: https://tavily.com (search, extract, crawl)
LLM Processing:
- Uses OpenRouter API with Gemini 3 Flash Preview for intelligent content extraction
@ -88,7 +89,7 @@ def _get_backend() -> str:
keys manually without running setup.
"""
configured = (_load_web_config().get("backend") or "").lower().strip()
if configured in ("parallel", "firecrawl", "tavily", "exa"):
if configured in ("parallel", "firecrawl", "tavily", "exa", "searxng"):
return configured
# Fallback for manual / legacy config — pick the highest-priority
@ -96,6 +97,7 @@ def _get_backend() -> str:
# tool gateway is configured for Nous subscribers.
backend_candidates = (
("firecrawl", _has_env("FIRECRAWL_API_KEY") or _has_env("FIRECRAWL_API_URL") or _is_tool_gateway_ready()),
("searxng", _has_env("SEARXNG_URL")),
("parallel", _has_env("PARALLEL_API_KEY")),
("tavily", _has_env("TAVILY_API_KEY")),
("exa", _has_env("EXA_API_KEY")),
@ -117,6 +119,8 @@ def _is_backend_available(backend: str) -> bool:
return check_firecrawl_api_key()
if backend == "tavily":
return _has_env("TAVILY_API_KEY")
if backend == "searxng":
return _has_env("SEARXNG_URL")
return False
# ─── Firecrawl Client ────────────────────────────────────────────────────────
@ -189,6 +193,7 @@ def _web_requires_env() -> list[str]:
"TAVILY_API_KEY",
"FIRECRAWL_API_KEY",
"FIRECRAWL_API_URL",
"SEARXNG_URL",
]
if managed_nous_tools_enabled():
requires.extend(
@ -956,6 +961,58 @@ def _exa_extract(urls: List[str]) -> List[Dict[str, Any]]:
return results
# ─── SearXNG Search Helper ───────────────────────────────────────────────────
def _get_searxng_url() -> str:
"""Return the configured SearXNG instance URL.
Requires the ``SEARXNG_URL`` environment variable pointing to a running
SearXNG instance (e.g. ``https://searx.example.com``).
"""
url = os.getenv("SEARXNG_URL", "").strip().rstrip("/")
if not url:
raise ValueError(
"SEARXNG_URL environment variable not set. "
"Set it to your SearXNG instance URL (e.g., https://searx.example.com)"
)
return url
def _searxng_search(query: str, limit: int = 5) -> dict:
    """Query a SearXNG instance and return results in the normalized schema.

    Args:
        query: Search query string, passed verbatim to SearXNG.
        limit: Maximum number of results to keep from the response.

    Returns:
        ``{"success": True, "data": {"web": [...]}}`` on success, where each
        entry has ``url``, ``title``, ``description`` and 1-based ``position``;
        or ``{"error": "Interrupted", "success": False}`` if an interrupt is
        pending. HTTP errors propagate via ``raise_for_status()``.
    """
    from tools.interrupt import is_interrupted

    if is_interrupted():
        return {"error": "Interrupted", "success": False}

    base_url = _get_searxng_url()
    logger.info("SearXNG search: '%s' (limit=%d, base=%s)", query, limit, base_url)

    response = httpx.get(
        f"{base_url}/search",
        params={"q": query, "format": "json", "pageno": 1},
        timeout=30,
    )
    response.raise_for_status()
    payload = response.json()

    # Keep at most `limit` results; max(limit, 0) mirrors the original
    # cut-off check so non-positive limits yield an empty list.
    kept = payload.get("results", [])[: max(limit, 0)]
    web_results = [
        {
            "url": entry.get("url", ""),
            "title": entry.get("title", ""),
            "description": entry.get("content", ""),
            "position": position,
        }
        for position, entry in enumerate(kept, start=1)
    ]
    return {"success": True, "data": {"web": web_results}}
# ─── Parallel Search & Extract Helpers ────────────────────────────────────────
def _parallel_search(query: str, limit: int = 5) -> dict:
@ -1102,6 +1159,15 @@ def web_search_tool(query: str, limit: int = 5) -> str:
_debug.save()
return result_json
if backend == "searxng":
response_data = _searxng_search(query, limit)
debug_call_data["results_count"] = len(response_data.get("data", {}).get("web", []))
result_json = json.dumps(response_data, indent=2, ensure_ascii=False)
debug_call_data["final_response_size"] = len(result_json)
_debug.log_call("web_search_tool", debug_call_data)
_debug.save()
return result_json
if backend == "tavily":
logger.info("Tavily search: '%s' (limit: %d)", query, limit)
raw = _tavily_request("search", {
@ -1252,6 +1318,37 @@ async def web_extract_tool(
"include_images": False,
})
results = _normalize_tavily_documents(raw, fallback_url=safe_urls[0] if safe_urls else "")
elif backend == "searxng":
# SearXNG is search-only — fall through to Firecrawl for extraction
logger.info("SearXNG does not support extraction, falling back to Firecrawl")
formats: List[str] = []
if format == "markdown":
formats = ["markdown"]
elif format == "html":
formats = ["html"]
results: List[Dict[str, Any]] = []
for url in safe_urls:
try:
scrape_params = {}
if formats:
scrape_params["formats"] = formats
response = _get_firecrawl_client().scrape(url=url, **scrape_params)
content = ""
if isinstance(response, dict):
content = response.get("markdown") or response.get("html") or response.get("rawHtml", "")
elif hasattr(response, "markdown"):
content = response.markdown or getattr(response, "html", "") or ""
results.append({
"url": url,
"title": getattr(response, "metadata", {}).get("title", "") if hasattr(response, "metadata") else (response.get("metadata", {}) or {}).get("title", ""),
"content": content,
"raw_content": content,
"metadata": {"sourceURL": url, "title": ""},
})
except Exception as e:
logger.warning("Firecrawl fallback extraction failed for %s: %s", url, e)
results.append({"url": url, "title": "", "content": "", "error": f"Extraction failed: {e}"})
else:
# ── Firecrawl extraction ──
# Determine requested formats for Firecrawl v2