From 130b021d74ca4ff6382563ac3077df8bffb7b31c Mon Sep 17 00:00:00 2001 From: kshitijk4poor Date: Fri, 17 Apr 2026 14:37:59 +0530 Subject: [PATCH] feat: add SearXNG as a native web search backend Adds SearXNG (https://docs.searxng.org) as a self-hosted, privacy-first web search backend alongside Firecrawl, Tavily, Exa, and Parallel. SearXNG is a meta-search engine that aggregates results from 70+ search engines. No API key needed -- just set SEARXNG_URL to your instance. Changes: - tools/web_tools.py: _get_searxng_url(), _searxng_search(), search dispatch, extract falls back to Firecrawl (SearXNG is search-only) - hermes_cli/tools_config.py: SearXNG provider in web tool picker - hermes_cli/config.py: SEARXNG_URL env var, diagnostics, set command - tests/tools/test_web_tools_searxng.py: 15 tests - optional-skills/research/searxng-search/: agent-guided skill - Docs: configuration.md, environment-variables.md, skills catalogs Based on #6071 by @gnanam1990, #8106 by @cro, #2572 by @bhovig, #2710 and #9961 by @StreamOfRon, #7258 by @coldxiangyu163 --- hermes_cli/config.py | 11 +- hermes_cli/tools_config.py | 9 + .../research/searxng-search/SKILL.md | 211 +++++++++++++++ .../searxng-search/scripts/searxng.sh | 22 ++ tests/tools/test_web_tools_searxng.py | 240 ++++++++++++++++++ tools/web_tools.py | 101 +++++++- .../docs/reference/environment-variables.md | 1 + .../docs/reference/optional-skills-catalog.md | 1 + website/docs/reference/skills-catalog.md | 1 + website/docs/user-guide/configuration.md | 9 +- 10 files changed, 600 insertions(+), 6 deletions(-) create mode 100644 optional-skills/research/searxng-search/SKILL.md create mode 100755 optional-skills/research/searxng-search/scripts/searxng.sh create mode 100644 tests/tools/test_web_tools_searxng.py diff --git a/hermes_cli/config.py b/hermes_cli/config.py index c7df03370..ca1f29059 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -1194,6 +1194,14 @@ OPTIONAL_ENV_VARS = { "password": True, "category": 
"tool", }, + "SEARXNG_URL": { + "description": "SearXNG instance URL for open-source federated web search", + "prompt": "SearXNG instance URL", + "url": None, + "tools": ["web_search"], + "password": False, + "category": "tool", + }, "BROWSERBASE_API_KEY": { "description": "Browserbase API key for cloud browser (optional — local browser works without this)", "prompt": "Browserbase API key", @@ -3224,6 +3232,7 @@ def show_config(): ("PARALLEL_API_KEY", "Parallel"), ("FIRECRAWL_API_KEY", "Firecrawl"), ("TAVILY_API_KEY", "Tavily"), + ("SEARXNG_URL", "SearXNG"), ("BROWSERBASE_API_KEY", "Browserbase"), ("BROWSER_USE_API_KEY", "Browser Use"), ("FAL_KEY", "FAL"), @@ -3401,7 +3410,7 @@ def set_config_value(key: str, value: str): 'OPENROUTER_API_KEY', 'OPENAI_API_KEY', 'ANTHROPIC_API_KEY', 'VOICE_TOOLS_OPENAI_KEY', 'EXA_API_KEY', 'PARALLEL_API_KEY', 'FIRECRAWL_API_KEY', 'FIRECRAWL_API_URL', 'FIRECRAWL_GATEWAY_URL', 'TOOL_GATEWAY_DOMAIN', 'TOOL_GATEWAY_SCHEME', - 'TOOL_GATEWAY_USER_TOKEN', 'TAVILY_API_KEY', + 'TOOL_GATEWAY_USER_TOKEN', 'TAVILY_API_KEY', 'SEARXNG_URL', 'BROWSERBASE_API_KEY', 'BROWSERBASE_PROJECT_ID', 'BROWSER_USE_API_KEY', 'FAL_KEY', 'TELEGRAM_BOT_TOKEN', 'DISCORD_BOT_TOKEN', 'TERMINAL_SSH_HOST', 'TERMINAL_SSH_USER', 'TERMINAL_SSH_KEY', diff --git a/hermes_cli/tools_config.py b/hermes_cli/tools_config.py index 8bfbc059f..62ef0f9f6 100644 --- a/hermes_cli/tools_config.py +++ b/hermes_cli/tools_config.py @@ -235,6 +235,15 @@ TOOL_CATEGORIES = { {"key": "TAVILY_API_KEY", "prompt": "Tavily API key", "url": "https://app.tavily.com/home"}, ], }, + { + "name": "SearXNG", + "badge": "free · self-hosted", + "tag": "Open-source federated search (self-hosted)", + "web_backend": "searxng", + "env_vars": [ + {"key": "SEARXNG_URL", "prompt": "SearXNG instance URL (e.g., https://searx.example.com)"}, + ], + }, { "name": "Firecrawl Self-Hosted", "badge": "free · self-hosted", diff --git a/optional-skills/research/searxng-search/SKILL.md 
b/optional-skills/research/searxng-search/SKILL.md new file mode 100644 index 000000000..c2d170591 --- /dev/null +++ b/optional-skills/research/searxng-search/SKILL.md @@ -0,0 +1,211 @@ +--- +name: searxng-search +description: Free meta-search via SearXNG — aggregates results from 70+ search engines. Self-hosted or use a public instance. No API key needed. Falls back automatically when the web search toolset is unavailable. +version: 1.0.0 +author: hermes-agent +license: MIT +metadata: + hermes: + tags: [search, searxng, meta-search, self-hosted, free, fallback] + related_skills: [duckduckgo-search, domain-intel] + fallback_for_toolsets: [web] +--- + +# SearXNG Search + +Free meta-search using [SearXNG](https://docs.searxng.org/) — a privacy-respecting, self-hosted search aggregator that queries 70+ search engines simultaneously. + +**No API key required** when using a public instance. Can also be self-hosted for full control. Automatically appears as a fallback when the main web search toolset (`FIRECRAWL_API_KEY`) is not configured. + +## Configuration + +SearXNG requires a `SEARXNG_URL` environment variable pointing to your SearXNG instance: + +```bash +# Public instances (no setup required) +SEARXNG_URL=https://searxng.example.com + +# Self-hosted SearXNG +SEARXNG_URL=http://localhost:8888 +``` + +If no instance is configured, this skill is unavailable and the agent falls back to other search options. + +## Detection Flow + +Check what is actually available before choosing an approach: + +```bash +# Check if SEARXNG_URL is set and the instance is reachable +curl -s --max-time 5 "${SEARXNG_URL}/search?q=test&format=json" | head -c 200 +``` + +Decision tree: +1. If `SEARXNG_URL` is set and the instance responds, use SearXNG +2. If `SEARXNG_URL` is unset or unreachable, fall back to other available search tools +3. 
If the user wants SearXNG specifically, help them set up an instance or find a public one + +## Method 1: CLI via curl (Preferred) + +Use `curl` via `terminal` to call the SearXNG JSON API. This avoids assuming any particular Python package is installed. + +```bash +# Text search (JSON output) +curl -s --max-time 10 \ + "${SEARXNG_URL}/search?q=python+async+programming&format=json&engines=google,bing&limit=10" + +# With Safesearch off +curl -s --max-time 10 \ + "${SEARXNG_URL}/search?q=example&format=json&safesearch=0" + +# Specific categories (general, news, science, etc.) +curl -s --max-time 10 \ + "${SEARXNG_URL}/search?q=AI+news&format=json&categories=news" +``` + +### Common CLI Flags + +| Flag | Description | Example | +|------|-------------|---------| +| `q` | Query string (URL-encoded) | `q=python+async` | +| `format` | Output format: `json`, `csv`, `rss` | `format=json` | +| `engines` | Comma-separated engine names | `engines=google,bing,ddg` | +| `limit` | Max results per engine (default 10) | `limit=5` | +| `categories` | Filter by category | `categories=news,science` | +| `safesearch` | 0=none, 1=moderate, 2=strict | `safesearch=0` | +| `time_range` | Filter: `day`, `week`, `month`, `year` | `time_range=week` | + +### Parsing JSON Results + +```bash +# Extract titles and URLs from JSON +curl -s --max-time 10 "${SEARXNG_URL}/search?q=fastapi&format=json&limit=5" \ + | python3 -c " +import json, sys +data = json.load(sys.stdin) +for r in data.get('results', []): + print(r.get('title','')) + print(r.get('url','')) + print(r.get('content','')[:200]) + print() +" +``` + +Returns per result: `title`, `url`, `content` (snippet), `engine`, `parsed_url`, `img_src`, `thumbnail`, `author`, `published_date` + +## Method 2: Python API via `requests` + +Use the SearXNG REST API directly from Python with the `requests` library: + +```python +import os, requests, urllib.parse + +base_url = os.environ.get("SEARXNG_URL", "") +if not base_url: + raise 
RuntimeError("SEARXNG_URL is not set") + +query = "fastapi deployment guide" +params = { + "q": query, + "format": "json", + "limit": 5, + "engines": "google,bing", +} + +resp = requests.get(f"{base_url}/search", params=params, timeout=10) +resp.raise_for_status() +data = resp.json() + +for r in data.get("results", []): + print(r["title"]) + print(r["url"]) + print(r.get("content", "")[:200]) + print() +``` + +## Method 3: searxng-data Python Package + +For more structured access, install the `searxng-data` package: + +```bash +pip install searxng-data +``` + +```python +from searxng_data import engines + +# List available engines +print(engines.list_engines()) +``` + +Note: This package only provides engine metadata, not the search API itself. + +## Self-Hosting SearXNG + +To run your own SearXNG instance: + +```bash +# Using Docker +docker run -d -p 8888:8080 \ + -v $(pwd)/searxng:/etc/searxng \ + searxng/searxng:latest + +# Then set +SEARXNG_URL=http://localhost:8888 +``` + +Or install via pip: +```bash +pip install searxng +# Edit /etc/searxng/settings.yml +searxng-run +``` + +Public SearXNG instances are available at: +- `https://searxng.example.com` (replace with any public instance) + +## Workflow: Search then Extract + +SearXNG returns titles, URLs, and snippets — not full page content. To get full page content, search first and then extract the most relevant URL with `web_extract`, browser tools, or `curl`. + +```bash +# Search for relevant pages +curl -s "${SEARXNG_URL}/search?q=fastapi+deployment&format=json&limit=3" +# Output: list of results with titles and URLs + +# Then extract the best URL with web_extract +``` + +## Limitations + +- **Instance availability**: If the SearXNG instance is down or unreachable, search fails. Always check `SEARXNG_URL` is set and the instance is reachable. +- **No content extraction**: SearXNG returns snippets, not full page content. Use `web_extract`, browser tools, or `curl` for full articles. 
+- **Rate limiting**: Some public instances limit requests. Self-hosting avoids this. +- **Engine coverage**: Available engines depend on the SearXNG instance configuration. Some engines may be disabled. +- **Results freshness**: Meta-search aggregates external engines — result freshness depends on those engines. + +## Troubleshooting + +| Problem | Likely Cause | What To Do | +|---------|--------------|------------| +| `SEARXNG_URL` not set | No instance configured | Use a public SearXNG instance or set up your own | +| Connection refused | Instance not running or wrong URL | Check the URL is correct and the instance is running | +| Empty results | Instance blocks the query | Try a different instance or self-host | +| Slow responses | Public instance under load | Self-host or use a less-loaded public instance | +| `json` format not supported (often a 403) | JSON output not enabled on the instance | Enable `json` under `search.formats` in the instance's settings.yml, or use a different instance | + +## Pitfalls + +- **Always set `SEARXNG_URL`**: Without it, the skill cannot function. +- **URL-encode queries**: Spaces and special characters must be URL-encoded in curl, or use `urllib.parse.quote()` in Python. +- **Use `format=json`**: The default format may not be machine-readable. Always request JSON explicitly. +- **Set a timeout**: Always use `--max-time` or `timeout=` to avoid hanging on unreachable instances. +- **Self-hosting is best**: Public instances may go down, rate-limit, or block. A self-hosted instance is reliable. + +## Instance Discovery + +If `SEARXNG_URL` is not set and the user asks about SearXNG, help them either: +1. Find a public SearXNG instance (search for "public searxng instance") +2. 
Set up their own with Docker or the official installation scripts + +Public instances are listed at: https://searx.space/ diff --git a/optional-skills/research/searxng-search/scripts/searxng.sh b/optional-skills/research/searxng-search/scripts/searxng.sh new file mode 100755 index 000000000..12fe792d0 --- /dev/null +++ b/optional-skills/research/searxng-search/scripts/searxng.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# Usage: ./searxng.sh <query> [max_results] [engines] +# Example: ./searxng.sh "python async" 10 "google,bing" + +QUERY="${1:-}" +MAX="${2:-5}" +ENGINES="${3:-google,bing}" + +if [ -z "$SEARXNG_URL" ]; then + echo "Error: SEARXNG_URL is not set" + exit 1 +fi + +if [ -z "$QUERY" ]; then + echo "Usage: $0 <query> [max_results] [engines]" + exit 1 +fi + +ENCODED_QUERY=$(echo "$QUERY" | sed 's/ /+/g') + +curl -s --max-time 10 \ + "${SEARXNG_URL}/search?q=${ENCODED_QUERY}&format=json&limit=${MAX}&engines=${ENGINES}" diff --git a/tests/tools/test_web_tools_searxng.py b/tests/tools/test_web_tools_searxng.py new file mode 100644 index 000000000..a2a6a91f5 --- /dev/null +++ b/tests/tools/test_web_tools_searxng.py @@ -0,0 +1,240 @@ +"""Tests for SearXNG web backend integration. + +Coverage: + _get_searxng_url() — URL handling, missing env var. + _searxng_search() — search request construction, result normalization. + web_search_tool — SearXNG dispatch path. + web_extract_tool — SearXNG graceful fallback to Firecrawl. + Backend selection — SearXNG in _get_backend and _is_backend_available. 
+""" + +import json +import os +import pytest +from unittest.mock import patch, MagicMock + + +# ─── _get_searxng_url ─────────────────────────────────────────────────────── + +class TestGetSearxngUrl: + """Test suite for the _get_searxng_url helper.""" + + def test_raises_without_url(self): + """No SEARXNG_URL → ValueError with guidance.""" + with patch.dict(os.environ, {}, clear=False): + os.environ.pop("SEARXNG_URL", None) + from tools.web_tools import _get_searxng_url + with pytest.raises(ValueError, match="SEARXNG_URL"): + _get_searxng_url() + + def test_returns_trimmed_url(self): + """Trailing slashes and whitespace are stripped.""" + with patch.dict(os.environ, {"SEARXNG_URL": " https://searx.example.com/ "}): + from tools.web_tools import _get_searxng_url + assert _get_searxng_url() == "https://searx.example.com" + + def test_returns_url_without_trailing_slash(self): + """Trailing slash is removed for clean URL joining.""" + with patch.dict(os.environ, {"SEARXNG_URL": "https://searx.example.com/"}): + from tools.web_tools import _get_searxng_url + assert _get_searxng_url() == "https://searx.example.com" + + +# ─── _searxng_search ──────────────────────────────────────────────────────── + +class TestSearxngSearch: + """Test suite for the _searxng_search helper.""" + + def test_returns_normalized_results(self): + """SearXNG JSON response is normalized to the standard format.""" + mock_response = MagicMock() + mock_response.json.return_value = { + "results": [ + {"title": "Python Docs", "url": "https://docs.python.org", "content": "Official docs"}, + {"title": "Tutorial", "url": "https://example.com", "content": "A tutorial"}, + ] + } + mock_response.raise_for_status = MagicMock() + + with patch.dict(os.environ, {"SEARXNG_URL": "https://searx.example.com"}): + with patch("tools.web_tools.httpx.get", return_value=mock_response) as mock_get: + with patch("tools.interrupt.is_interrupted", return_value=False): + from tools.web_tools import _searxng_search + result 
= _searxng_search("python docs", limit=5) + + assert result["success"] is True + web = result["data"]["web"] + assert len(web) == 2 + assert web[0]["title"] == "Python Docs" + assert web[0]["url"] == "https://docs.python.org" + assert web[0]["description"] == "Official docs" + assert web[0]["position"] == 1 + assert web[1]["position"] == 2 + + # Verify correct URL and params + mock_get.assert_called_once() + call_args = mock_get.call_args + assert "searx.example.com/search" in call_args.args[0] + params = call_args.kwargs.get("params") or call_args[1].get("params") + assert params["q"] == "python docs" + assert params["format"] == "json" + + def test_respects_limit(self): + """Results are truncated to the requested limit.""" + mock_response = MagicMock() + mock_response.json.return_value = { + "results": [ + {"title": f"Result {i}", "url": f"https://r{i}.com", "content": f"desc {i}"} + for i in range(10) + ] + } + mock_response.raise_for_status = MagicMock() + + with patch.dict(os.environ, {"SEARXNG_URL": "https://searx.example.com"}): + with patch("tools.web_tools.httpx.get", return_value=mock_response): + with patch("tools.interrupt.is_interrupted", return_value=False): + from tools.web_tools import _searxng_search + result = _searxng_search("test", limit=3) + assert len(result["data"]["web"]) == 3 + + def test_empty_results(self): + """Empty SearXNG response returns empty web list.""" + mock_response = MagicMock() + mock_response.json.return_value = {"results": []} + mock_response.raise_for_status = MagicMock() + + with patch.dict(os.environ, {"SEARXNG_URL": "https://searx.example.com"}): + with patch("tools.web_tools.httpx.get", return_value=mock_response): + with patch("tools.interrupt.is_interrupted", return_value=False): + from tools.web_tools import _searxng_search + result = _searxng_search("nothing") + assert result["success"] is True + assert result["data"]["web"] == [] + + def test_missing_fields(self): + """Results with missing fields default to empty 
strings.""" + mock_response = MagicMock() + mock_response.json.return_value = {"results": [{}]} + mock_response.raise_for_status = MagicMock() + + with patch.dict(os.environ, {"SEARXNG_URL": "https://searx.example.com"}): + with patch("tools.web_tools.httpx.get", return_value=mock_response): + with patch("tools.interrupt.is_interrupted", return_value=False): + from tools.web_tools import _searxng_search + result = _searxng_search("test") + web = result["data"]["web"] + assert web[0]["title"] == "" + assert web[0]["url"] == "" + assert web[0]["description"] == "" + + def test_interrupted_returns_error(self): + """Interrupted search returns error dict.""" + with patch.dict(os.environ, {"SEARXNG_URL": "https://searx.example.com"}): + with patch("tools.interrupt.is_interrupted", return_value=True): + from tools.web_tools import _searxng_search + result = _searxng_search("test") + assert result["success"] is False + assert "Interrupted" in result["error"] + + def test_raises_on_http_error(self): + """Non-2xx responses propagate as httpx.HTTPStatusError.""" + import httpx as _httpx + mock_response = MagicMock() + mock_response.raise_for_status.side_effect = _httpx.HTTPStatusError( + "500 Server Error", request=MagicMock(), response=mock_response + ) + + with patch.dict(os.environ, {"SEARXNG_URL": "https://searx.example.com"}): + with patch("tools.web_tools.httpx.get", return_value=mock_response): + with patch("tools.interrupt.is_interrupted", return_value=False): + from tools.web_tools import _searxng_search + with pytest.raises(_httpx.HTTPStatusError): + _searxng_search("test") + + +# ─── web_search_tool (SearXNG dispatch) ───────────────────────────────────── + +class TestWebSearchSearxng: + """Test web_search_tool dispatch to SearXNG.""" + + def test_search_dispatches_to_searxng(self): + mock_response = MagicMock() + mock_response.json.return_value = { + "results": [{"title": "Result", "url": "https://r.com", "content": "desc"}] + } + mock_response.raise_for_status = 
MagicMock() + + with patch("tools.web_tools._get_backend", return_value="searxng"), \ + patch.dict(os.environ, {"SEARXNG_URL": "https://searx.example.com"}), \ + patch("tools.web_tools.httpx.get", return_value=mock_response), \ + patch("tools.interrupt.is_interrupted", return_value=False): + from tools.web_tools import web_search_tool + result = json.loads(web_search_tool("test query", limit=3)) + assert result["success"] is True + assert len(result["data"]["web"]) == 1 + assert result["data"]["web"][0]["title"] == "Result" + + +# ─── web_extract_tool (SearXNG falls back to Firecrawl) ───────────────────── + +class TestWebExtractSearxng: + """Test web_extract_tool Firecrawl fallback when SearXNG is selected.""" + + def test_extract_falls_back_to_firecrawl(self): + mock_firecrawl = MagicMock() + mock_firecrawl.scrape.return_value = { + "markdown": "# Page Content", + "metadata": {"title": "Example Page"}, + } + + with patch("tools.web_tools._get_backend", return_value="searxng"), \ + patch("tools.web_tools.is_safe_url", return_value=True), \ + patch("tools.web_tools._get_firecrawl_client", return_value=mock_firecrawl), \ + patch("tools.interrupt.is_interrupted", return_value=False): + from tools.web_tools import web_extract_tool + import asyncio + result = json.loads(asyncio.get_event_loop().run_until_complete( + web_extract_tool(["https://example.com"], use_llm_processing=False) + )) + assert "results" in result + assert result["results"][0]["content"] == "# Page Content" + mock_firecrawl.scrape.assert_called_once() + + def test_extract_fallback_handles_firecrawl_error(self): + mock_firecrawl = MagicMock() + mock_firecrawl.scrape.side_effect = ValueError("No API key") + + with patch("tools.web_tools._get_backend", return_value="searxng"), \ + patch("tools.web_tools.is_safe_url", return_value=True), \ + patch("tools.web_tools._get_firecrawl_client", return_value=mock_firecrawl), \ + patch("tools.interrupt.is_interrupted", return_value=False): + from tools.web_tools 
import web_extract_tool + import asyncio + result = json.loads(asyncio.get_event_loop().run_until_complete( + web_extract_tool(["https://example.com"], use_llm_processing=False) + )) + assert "results" in result + assert "error" in result["results"][0] + assert "failed" in result["results"][0]["error"].lower() + + +# ─── Backend selection ────────────────────────────────────────────────────── + +class TestSearxngBackendSelection: + """Test that SearXNG is correctly selected as a backend.""" + + def test_searxng_selected_from_config(self): + with patch("tools.web_tools._load_web_config", return_value={"backend": "searxng"}): + from tools.web_tools import _get_backend + assert _get_backend() == "searxng" + + def test_searxng_available_with_url(self): + with patch.dict(os.environ, {"SEARXNG_URL": "https://searx.example.com"}): + from tools.web_tools import _is_backend_available + assert _is_backend_available("searxng") is True + + def test_searxng_unavailable_without_url(self): + with patch.dict(os.environ, {}, clear=False): + os.environ.pop("SEARXNG_URL", None) + from tools.web_tools import _is_backend_available + assert _is_backend_available("searxng") is False diff --git a/tools/web_tools.py b/tools/web_tools.py index c24f1fc38..4f97e4fca 100644 --- a/tools/web_tools.py +++ b/tools/web_tools.py @@ -16,7 +16,8 @@ Backend compatibility: - Exa: https://exa.ai (search, extract) - Firecrawl: https://docs.firecrawl.dev/introduction (search, extract, crawl; direct or derived firecrawl-gateway. for Nous Subscribers) - Parallel: https://docs.parallel.ai (search, extract) -- Tavily: https://tavily.com (search, extract, crawl) + - SearXNG: https://docs.searxng.org (search; self-hosted open-source federated search) + - Tavily: https://tavily.com (search, extract, crawl) LLM Processing: - Uses OpenRouter API with Gemini 3 Flash Preview for intelligent content extraction @@ -88,7 +89,7 @@ def _get_backend() -> str: keys manually without running setup. 
""" configured = (_load_web_config().get("backend") or "").lower().strip() - if configured in ("parallel", "firecrawl", "tavily", "exa"): + if configured in ("parallel", "firecrawl", "tavily", "exa", "searxng"): return configured # Fallback for manual / legacy config — pick the highest-priority @@ -96,6 +97,7 @@ def _get_backend() -> str: # tool gateway is configured for Nous subscribers. backend_candidates = ( ("firecrawl", _has_env("FIRECRAWL_API_KEY") or _has_env("FIRECRAWL_API_URL") or _is_tool_gateway_ready()), + ("searxng", _has_env("SEARXNG_URL")), ("parallel", _has_env("PARALLEL_API_KEY")), ("tavily", _has_env("TAVILY_API_KEY")), ("exa", _has_env("EXA_API_KEY")), @@ -117,6 +119,8 @@ def _is_backend_available(backend: str) -> bool: return check_firecrawl_api_key() if backend == "tavily": return _has_env("TAVILY_API_KEY") + if backend == "searxng": + return _has_env("SEARXNG_URL") return False # ─── Firecrawl Client ──────────────────────────────────────────────────────── @@ -189,6 +193,7 @@ def _web_requires_env() -> list[str]: "TAVILY_API_KEY", "FIRECRAWL_API_KEY", "FIRECRAWL_API_URL", + "SEARXNG_URL", ] if managed_nous_tools_enabled(): requires.extend( @@ -956,6 +961,58 @@ def _exa_extract(urls: List[str]) -> List[Dict[str, Any]]: return results +# ─── SearXNG Search Helper ─────────────────────────────────────────────────── + +def _get_searxng_url() -> str: + """Return the configured SearXNG instance URL. + + Requires the ``SEARXNG_URL`` environment variable pointing to a running + SearXNG instance (e.g. ``https://searx.example.com``). + """ + url = os.getenv("SEARXNG_URL", "").strip().rstrip("/") + if not url: + raise ValueError( + "SEARXNG_URL environment variable not set. 
" + "Set it to your SearXNG instance URL (e.g., https://searx.example.com)" + ) + return url + + +def _searxng_search(query: str, limit: int = 5) -> dict: + """Search using a SearXNG instance and return normalized results.""" + from tools.interrupt import is_interrupted + if is_interrupted(): + return {"error": "Interrupted", "success": False} + + base_url = _get_searxng_url() + logger.info("SearXNG search: '%s' (limit=%d, base=%s)", query, limit, base_url) + + response = httpx.get( + f"{base_url}/search", + params={ + "q": query, + "format": "json", + "pageno": 1, + }, + timeout=30, + ) + response.raise_for_status() + raw = response.json() + + web_results = [] + for i, result in enumerate(raw.get("results", [])): + if i >= limit: + break + web_results.append({ + "url": result.get("url", ""), + "title": result.get("title", ""), + "description": result.get("content", ""), + "position": i + 1, + }) + + return {"success": True, "data": {"web": web_results}} + + # ─── Parallel Search & Extract Helpers ──────────────────────────────────────── def _parallel_search(query: str, limit: int = 5) -> dict: @@ -1102,6 +1159,15 @@ def web_search_tool(query: str, limit: int = 5) -> str: _debug.save() return result_json + if backend == "searxng": + response_data = _searxng_search(query, limit) + debug_call_data["results_count"] = len(response_data.get("data", {}).get("web", [])) + result_json = json.dumps(response_data, indent=2, ensure_ascii=False) + debug_call_data["final_response_size"] = len(result_json) + _debug.log_call("web_search_tool", debug_call_data) + _debug.save() + return result_json + if backend == "tavily": logger.info("Tavily search: '%s' (limit: %d)", query, limit) raw = _tavily_request("search", { @@ -1252,6 +1318,37 @@ async def web_extract_tool( "include_images": False, }) results = _normalize_tavily_documents(raw, fallback_url=safe_urls[0] if safe_urls else "") + elif backend == "searxng": + # SearXNG is search-only — fall through to Firecrawl for extraction 
+ logger.info("SearXNG does not support extraction, falling back to Firecrawl") + formats: List[str] = [] + if format == "markdown": + formats = ["markdown"] + elif format == "html": + formats = ["html"] + + results: List[Dict[str, Any]] = [] + for url in safe_urls: + try: + scrape_params = {} + if formats: + scrape_params["formats"] = formats + response = _get_firecrawl_client().scrape(url=url, **scrape_params) + content = "" + if isinstance(response, dict): + content = response.get("markdown") or response.get("html") or response.get("rawHtml", "") + elif hasattr(response, "markdown"): + content = response.markdown or getattr(response, "html", "") or "" + results.append({ + "url": url, + "title": getattr(response, "metadata", {}).get("title", "") if hasattr(response, "metadata") else (response.get("metadata", {}) or {}).get("title", ""), + "content": content, + "raw_content": content, + "metadata": {"sourceURL": url, "title": ""}, + }) + except Exception as e: + logger.warning("Firecrawl fallback extraction failed for %s: %s", url, e) + results.append({"url": url, "title": "", "content": "", "error": f"Extraction failed: {e}"}) else: # ── Firecrawl extraction ── # Determine requested formats for Firecrawl v2 diff --git a/website/docs/reference/environment-variables.md b/website/docs/reference/environment-variables.md index 63844b3f9..0c601d230 100644 --- a/website/docs/reference/environment-variables.md +++ b/website/docs/reference/environment-variables.md @@ -90,6 +90,7 @@ For native Anthropic auth, Hermes prefers Claude Code's own credential files whe | `FIRECRAWL_API_KEY` | Web scraping and cloud browser ([firecrawl.dev](https://firecrawl.dev/)) | | `FIRECRAWL_API_URL` | Custom Firecrawl API endpoint for self-hosted instances (optional) | | `TAVILY_API_KEY` | Tavily API key for AI-native web search, extract, and crawl ([app.tavily.com](https://app.tavily.com/home)) | +| `SEARXNG_URL` | SearXNG instance URL for self-hosted federated web search 
([docs.searxng.org](https://docs.searxng.org)) | | `EXA_API_KEY` | Exa API key for AI-native web search and contents ([exa.ai](https://exa.ai/)) | | `BROWSERBASE_API_KEY` | Browser automation ([browserbase.com](https://browserbase.com/)) | | `BROWSERBASE_PROJECT_ID` | Browserbase project ID | diff --git a/website/docs/reference/optional-skills-catalog.md b/website/docs/reference/optional-skills-catalog.md index 18ec4b381..bdb6af7ae 100644 --- a/website/docs/reference/optional-skills-catalog.md +++ b/website/docs/reference/optional-skills-catalog.md @@ -128,6 +128,7 @@ The largest optional category — covers the full ML pipeline from data curation | **bioinformatics** | Gateway to 400+ bioinformatics skills from bioSkills and ClawBio. Covers genomics, transcriptomics, single-cell, variant calling, pharmacogenomics, metagenomics, and structural biology. | | **domain-intel** | Passive domain reconnaissance using Python stdlib. Subdomain discovery, SSL certificate inspection, WHOIS lookups, DNS records, and bulk multi-domain analysis. No API keys required. | | **duckduckgo-search** | Free web search via DuckDuckGo — text, news, images, videos. No API key needed. | +| **searxng-search** | Free meta-search via SearXNG — aggregates results from 70+ search engines. Self-hosted or use a public instance. No API key needed. | | **gitnexus-explorer** | Index a codebase with GitNexus and serve an interactive knowledge graph via web UI and Cloudflare tunnel. | | **parallel-cli** | Vendor skill for Parallel CLI — agent-native web search, extraction, deep research, enrichment, and monitoring. | | **qmd** | Search personal knowledge bases, notes, docs, and meeting transcripts locally using qmd — a hybrid retrieval engine with BM25, vector search, and LLM reranking. 
| diff --git a/website/docs/reference/skills-catalog.md b/website/docs/reference/skills-catalog.md index 13ef2f7fc..ba0d03ec1 100644 --- a/website/docs/reference/skills-catalog.md +++ b/website/docs/reference/skills-catalog.md @@ -256,6 +256,7 @@ Skills for academic research, paper discovery, literature review, domain reconna | `llm-wiki` | Karpathy's LLM Wiki — build and maintain a persistent, interlinked markdown knowledge base. Ingest sources, query compiled knowledge, and lint for consistency. Unlike RAG, the wiki compiles knowledge once and keeps it current. Works as an Obsidian vault. Wiki path is controlled by the `WIKI_PATH` env var (defaults to `~/wiki`). | `research/llm-wiki` | | `domain-intel` | Passive domain reconnaissance using Python stdlib. Subdomain discovery, SSL certificate inspection, WHOIS lookups, DNS records, domain availability checks, and bulk multi-domain analysis. No API keys required. | `research/domain-intel` | | `duckduckgo-search` | Free web search via DuckDuckGo — text, news, images, videos. No API key needed. Prefer the `ddgs` CLI when installed; use the Python DDGS library only after verifying that `ddgs` is available in the current runtime. | `research/duckduckgo-search` | +| `searxng-search` | Free meta-search via SearXNG — aggregates results from 70+ search engines. Self-hosted or use a public instance. No API key needed. Falls back automatically when the web search toolset is unavailable. | `research/searxng-search` | | `ml-paper-writing` | Write publication-ready ML/AI papers for NeurIPS, ICML, ICLR, ACL, AAAI, COLM. Use when drafting papers from research repos, structuring arguments, verifying citations, or preparing camera-ready submissions. Includes LaTeX templates, reviewer guidelines, and citation verificatio… | `research/ml-paper-writing` | | `polymarket` | Query Polymarket prediction market data — search markets, get prices, orderbooks, and price history. Read-only via public REST APIs, no API key needed. 
| `research/polymarket` | diff --git a/website/docs/user-guide/configuration.md b/website/docs/user-guide/configuration.md index bef9b5cfd..b33d73473 100644 --- a/website/docs/user-guide/configuration.md +++ b/website/docs/user-guide/configuration.md @@ -1102,21 +1102,24 @@ code_execution: ## Web Search Backends -The `web_search`, `web_extract`, and `web_crawl` tools support four backend providers. Configure the backend in `config.yaml` or via `hermes tools`: +The `web_search`, `web_extract`, and `web_crawl` tools support five backend providers. Configure the backend in `config.yaml` or via `hermes tools`: ```yaml web: - backend: firecrawl # firecrawl | parallel | tavily | exa + backend: firecrawl # firecrawl | searxng | parallel | tavily | exa ``` | Backend | Env Var | Search | Extract | Crawl | |---------|---------|--------|---------|-------| | **Firecrawl** (default) | `FIRECRAWL_API_KEY` | ✔ | ✔ | ✔ | +| **SearXNG** | `SEARXNG_URL` | ✔ | — (falls back to Firecrawl) | — | | **Parallel** | `PARALLEL_API_KEY` | ✔ | ✔ | — | | **Tavily** | `TAVILY_API_KEY` | ✔ | ✔ | ✔ | | **Exa** | `EXA_API_KEY` | ✔ | ✔ | — | -**Backend selection:** If `web.backend` is not set, the backend is auto-detected from available API keys. If only `EXA_API_KEY` is set, Exa is used. If only `TAVILY_API_KEY` is set, Tavily is used. If only `PARALLEL_API_KEY` is set, Parallel is used. Otherwise Firecrawl is the default. +**Backend selection:** If `web.backend` is not set, the backend is auto-detected from available API keys. If only `SEARXNG_URL` is set, SearXNG is used. If only `EXA_API_KEY` is set, Exa is used. If only `TAVILY_API_KEY` is set, Tavily is used. If only `PARALLEL_API_KEY` is set, Parallel is used. Otherwise Firecrawl is the default. + +**SearXNG** is a self-hosted open-source meta-search engine that aggregates results from 70+ search engines. No API key is needed — just set `SEARXNG_URL` to your instance (e.g., `https://searx.example.com`). 
SearXNG only supports search; extract and crawl automatically fall back to Firecrawl. **Self-hosted Firecrawl:** Set `FIRECRAWL_API_URL` to point at your own instance. When a custom URL is set, the API key becomes optional (set `USE_DB_AUTHENTICATION=false` on the server to disable auth).