feat: add SearXNG as a native web search backend
Adds SearXNG (https://docs.searxng.org) as a self-hosted, privacy-first web search backend alongside Firecrawl, Tavily, Exa, and Parallel. SearXNG is a meta-search engine that aggregates results from 70+ search engines. No API key is needed; just set SEARXNG_URL to your instance.

Changes:
- tools/web_tools.py: _get_searxng_url(), _searxng_search(), search dispatch; extract falls back to Firecrawl (SearXNG is search-only)
- hermes_cli/tools_config.py: SearXNG provider in the web tool picker
- hermes_cli/config.py: SEARXNG_URL env var, diagnostics, set command
- tests/tools/test_web_tools_searxng.py: 15 tests
- optional-skills/research/searxng-search/: agent-guided skill
- Docs: configuration.md, environment-variables.md, skills catalogs

Based on #6071 by @gnanam1990, #8106 by @cro, #2572 by @bhovig, #2710 and #9961 by @StreamOfRon, #7258 by @coldxiangyu163
parent 2367c6ffd5 · commit 130b021d74
10 changed files with 600 additions and 6 deletions
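For context, the new backend talks to SearXNG's JSON API. Below is a minimal standalone probe of that endpoint, assuming a reachable instance in SEARXNG_URL and an instance configured to serve JSON (the json format must be enabled under search.formats in the instance's settings.yml). The query string here is made up; only httpx is required:

import os
import httpx

# Hypothetical standalone probe of the same endpoint _searxng_search()
# hits in the diff below. Assumes SEARXNG_URL points at a running
# instance with JSON output enabled.
base_url = os.environ["SEARXNG_URL"].rstrip("/")
resp = httpx.get(
    f"{base_url}/search",
    params={"q": "open source search", "format": "json", "pageno": 1},
    timeout=30,
)
resp.raise_for_status()
for i, r in enumerate(resp.json().get("results", [])[:5], start=1):
    print(i, r.get("title", ""), r.get("url", ""))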
tools/web_tools.py

@@ -16,7 +16,8 @@ Backend compatibility:
 - Exa: https://exa.ai (search, extract)
 - Firecrawl: https://docs.firecrawl.dev/introduction (search, extract, crawl; direct or derived firecrawl-gateway.<domain> for Nous Subscribers)
 - Parallel: https://docs.parallel.ai (search, extract)
+- SearXNG: https://docs.searxng.org (search; self-hosted open-source federated search)
 - Tavily: https://tavily.com (search, extract, crawl)
 
 LLM Processing:
 - Uses OpenRouter API with Gemini 3 Flash Preview for intelligent content extraction
@@ -88,7 +89,7 @@ def _get_backend() -> str:
     keys manually without running setup.
     """
     configured = (_load_web_config().get("backend") or "").lower().strip()
-    if configured in ("parallel", "firecrawl", "tavily", "exa"):
+    if configured in ("parallel", "firecrawl", "tavily", "exa", "searxng"):
         return configured
 
     # Fallback for manual / legacy config — pick the highest-priority
@@ -96,6 +97,7 @@ def _get_backend() -> str:
     # tool gateway is configured for Nous subscribers.
     backend_candidates = (
         ("firecrawl", _has_env("FIRECRAWL_API_KEY") or _has_env("FIRECRAWL_API_URL") or _is_tool_gateway_ready()),
+        ("searxng", _has_env("SEARXNG_URL")),
         ("parallel", _has_env("PARALLEL_API_KEY")),
         ("tavily", _has_env("TAVILY_API_KEY")),
         ("exa", _has_env("EXA_API_KEY")),
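Note the ordering: a Firecrawl key or gateway still outranks a configured SEARXNG_URL in the fallback path. The loop that consumes this tuple is outside the hunk; presumably it is first-match-wins, along the lines of this self-contained sketch (the default at the end is hypothetical, not shown in the diff):

import os

def _has_env(name: str) -> bool:
    return bool(os.getenv(name, "").strip())

# Hypothetical re-statement of the candidate tuple for illustration.
backend_candidates = (
    ("firecrawl", _has_env("FIRECRAWL_API_KEY")),
    ("searxng", _has_env("SEARXNG_URL")),
    ("parallel", _has_env("PARALLEL_API_KEY")),
)

def pick_backend() -> str:
    # First candidate whose availability flag is truthy wins, so a
    # Firecrawl key beats a configured SEARXNG_URL.
    for name, available in backend_candidates:
        if available:
            return name
    return "firecrawl"  # hypothetical default; the real fallthrough is not in this hunk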
@@ -117,6 +119,8 @@ def _is_backend_available(backend: str) -> bool:
         return check_firecrawl_api_key()
     if backend == "tavily":
         return _has_env("TAVILY_API_KEY")
+    if backend == "searxng":
+        return _has_env("SEARXNG_URL")
     return False
 
 # ─── Firecrawl Client ────────────────────────────────────────────────────────
@@ -189,6 +193,7 @@ def _web_requires_env() -> list[str]:
         "TAVILY_API_KEY",
         "FIRECRAWL_API_KEY",
         "FIRECRAWL_API_URL",
+        "SEARXNG_URL",
     ]
     if managed_nous_tools_enabled():
         requires.extend(
@@ -956,6 +961,58 @@ def _exa_extract(urls: List[str]) -> List[Dict[str, Any]]:
     return results
 
 
+# ─── SearXNG Search Helper ───────────────────────────────────────────────────
+
+def _get_searxng_url() -> str:
+    """Return the configured SearXNG instance URL.
+
+    Requires the ``SEARXNG_URL`` environment variable pointing to a running
+    SearXNG instance (e.g. ``https://searx.example.com``).
+    """
+    url = os.getenv("SEARXNG_URL", "").strip().rstrip("/")
+    if not url:
+        raise ValueError(
+            "SEARXNG_URL environment variable not set. "
+            "Set it to your SearXNG instance URL (e.g., https://searx.example.com)"
+        )
+    return url
+
+
+def _searxng_search(query: str, limit: int = 5) -> dict:
+    """Search using a SearXNG instance and return normalized results."""
+    from tools.interrupt import is_interrupted
+    if is_interrupted():
+        return {"error": "Interrupted", "success": False}
+
+    base_url = _get_searxng_url()
+    logger.info("SearXNG search: '%s' (limit=%d, base=%s)", query, limit, base_url)
+
+    response = httpx.get(
+        f"{base_url}/search",
+        params={
+            "q": query,
+            "format": "json",
+            "pageno": 1,
+        },
+        timeout=30,
+    )
+    response.raise_for_status()
+    raw = response.json()
+
+    web_results = []
+    for i, result in enumerate(raw.get("results", [])):
+        if i >= limit:
+            break
+        web_results.append({
+            "url": result.get("url", ""),
+            "title": result.get("title", ""),
+            "description": result.get("content", ""),
+            "position": i + 1,
+        })
+
+    return {"success": True, "data": {"web": web_results}}
+
+
 # ─── Parallel Search & Extract Helpers ────────────────────────────────────────
 
 def _parallel_search(query: str, limit: int = 5) -> dict:
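SearXNG returns each hit's snippet under "content"; the helper renames it to "description" and adds a 1-based "position" so the envelope matches what the other backends emit. A worked example of that mapping against a made-up payload:

# Made-up fragment shaped like SearXNG's JSON API output.
raw = {"results": [
    {"url": "https://example.com", "title": "Example", "content": "A snippet.",
     "engine": "duckduckgo", "score": 1.0},
]}

# Same normalization as _searxng_search(): rename content -> description,
# add a 1-based position, cap at the limit.
web = [
    {"url": r.get("url", ""), "title": r.get("title", ""),
     "description": r.get("content", ""), "position": i + 1}
    for i, r in enumerate(raw["results"][:5])
]
print({"success": True, "data": {"web": web}})
# -> {'success': True, 'data': {'web': [{'url': 'https://example.com', ...}]}}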
@@ -1102,6 +1159,15 @@ def web_search_tool(query: str, limit: int = 5) -> str:
         _debug.save()
         return result_json
 
+    if backend == "searxng":
+        response_data = _searxng_search(query, limit)
+        debug_call_data["results_count"] = len(response_data.get("data", {}).get("web", []))
+        result_json = json.dumps(response_data, indent=2, ensure_ascii=False)
+        debug_call_data["final_response_size"] = len(result_json)
+        _debug.log_call("web_search_tool", debug_call_data)
+        _debug.save()
+        return result_json
+
     if backend == "tavily":
         logger.info("Tavily search: '%s' (limit: %d)", query, limit)
         raw = _tavily_request("search", {
@@ -1252,6 +1318,37 @@ async def web_extract_tool(
             "include_images": False,
         })
         results = _normalize_tavily_documents(raw, fallback_url=safe_urls[0] if safe_urls else "")
+    elif backend == "searxng":
+        # SearXNG is search-only — fall through to Firecrawl for extraction
+        logger.info("SearXNG does not support extraction, falling back to Firecrawl")
+        formats: List[str] = []
+        if format == "markdown":
+            formats = ["markdown"]
+        elif format == "html":
+            formats = ["html"]
+
+        results: List[Dict[str, Any]] = []
+        for url in safe_urls:
+            try:
+                scrape_params = {}
+                if formats:
+                    scrape_params["formats"] = formats
+                response = _get_firecrawl_client().scrape(url=url, **scrape_params)
+                content = ""
+                if isinstance(response, dict):
+                    content = response.get("markdown") or response.get("html") or response.get("rawHtml", "")
+                elif hasattr(response, "markdown"):
+                    content = response.markdown or getattr(response, "html", "") or ""
+                results.append({
+                    "url": url,
+                    "title": getattr(response, "metadata", {}).get("title", "") if hasattr(response, "metadata") else (response.get("metadata", {}) or {}).get("title", ""),
+                    "content": content,
+                    "raw_content": content,
+                    "metadata": {"sourceURL": url, "title": ""},
+                })
+            except Exception as e:
+                logger.warning("Firecrawl fallback extraction failed for %s: %s", url, e)
+                results.append({"url": url, "title": "", "content": "", "error": f"Extraction failed: {e}"})
     else:
         # ── Firecrawl extraction ──
         # Determine requested formats for Firecrawl v2
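The fallback probes the Firecrawl response both as a dict and as an attribute-bearing object, presumably because the SDK may return either shape. A compact, self-contained illustration of that duck-typing (FakeDoc is a stand-in for an SDK object, not the real Firecrawl type):

from dataclasses import dataclass, field
from typing import Any, Dict, Union

@dataclass
class FakeDoc:  # hypothetical stand-in for an SDK object response
    markdown: str = "# Hello"
    metadata: Dict[str, Any] = field(default_factory=lambda: {"title": "Hello"})

def extract_content(response: Union[Dict[str, Any], Any]) -> str:
    # Same probe order as the diff: markdown, then html, then rawHtml.
    if isinstance(response, dict):
        return response.get("markdown") or response.get("html") or response.get("rawHtml", "")
    if hasattr(response, "markdown"):
        return response.markdown or getattr(response, "html", "") or ""
    return ""

print(extract_content({"markdown": "# From dict"}))  # -> "# From dict"
print(extract_content(FakeDoc()))                    # -> "# Hello"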