mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix: harden website blocklist — default off, TTL cache, fail-open, guarded imports
- Default enabled: false (zero overhead when not configured)
- Fast path: cached disabled state skips all work immediately
- TTL cache (30s) for parsed policy — avoids re-reading config.yaml on every URL check
- Missing shared files warn + skip instead of crashing all web tools
- Lazy yaml import — missing PyYAML doesn't break browser toolset
- Guarded browser_tool import — fail-open lambda fallback
- check_website_access never raises for default path (fail-open with warning log); only raises with explicit config_path (test mode)
- Simplified enforcement code in web_tools/browser_tool — no more try/except wrappers since errors are handled internally
This commit is contained in:
parent
d132a3dfbb
commit
6fc76ef954
5 changed files with 136 additions and 53 deletions
|
|
@@ -49,7 +49,7 @@ from typing import List, Dict, Any, Optional
|
|||
from firecrawl import Firecrawl
|
||||
from agent.auxiliary_client import async_call_llm
|
||||
from tools.debug_helpers import DebugSession
|
||||
from tools.website_policy import WebsitePolicyError, check_website_access
|
||||
from tools.website_policy import check_website_access
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
|
@@ -618,11 +618,7 @@ async def web_extract_tool(
|
|||
continue
|
||||
|
||||
# Website policy check — block before fetching
|
||||
try:
|
||||
blocked = check_website_access(url)
|
||||
except WebsitePolicyError as policy_err:
|
||||
results.append({"url": url, "title": "", "content": "", "error": f"Website policy error: {policy_err}"})
|
||||
continue
|
||||
blocked = check_website_access(url)
|
||||
if blocked:
|
||||
logger.info("Blocked web_extract for %s by rule %s", blocked["host"], blocked["rule"])
|
||||
results.append({
|
||||
|
|
@@ -687,10 +683,7 @@ async def web_extract_tool(
|
|||
|
||||
# Re-check final URL after redirect
|
||||
final_url = metadata.get("sourceURL", url)
|
||||
try:
|
||||
final_blocked = check_website_access(final_url)
|
||||
except WebsitePolicyError:
|
||||
final_blocked = None
|
||||
final_blocked = check_website_access(final_url)
|
||||
if final_blocked:
|
||||
logger.info("Blocked redirected web_extract for %s by rule %s", final_blocked["host"], final_blocked["rule"])
|
||||
results.append({
|
||||
|
|
@@ -903,10 +896,7 @@ async def web_crawl_tool(
|
|||
logger.info("Crawling %s%s", url, instructions_text)
|
||||
|
||||
# Website policy check — block before crawling
|
||||
try:
|
||||
blocked = check_website_access(url)
|
||||
except WebsitePolicyError as policy_err:
|
||||
return json.dumps({"results": [{"url": url, "title": "", "content": "", "error": f"Website policy error: {policy_err}"}]}, ensure_ascii=False)
|
||||
blocked = check_website_access(url)
|
||||
if blocked:
|
||||
logger.info("Blocked web_crawl for %s by rule %s", blocked["host"], blocked["rule"])
|
||||
return json.dumps({"results": [{"url": url, "title": "", "content": "", "error": blocked["message"],
|
||||
|
|
@@ -1018,10 +1008,7 @@ async def web_crawl_tool(
|
|||
title = metadata.get("title", "")
|
||||
|
||||
# Re-check crawled page URL against policy
|
||||
try:
|
||||
page_blocked = check_website_access(page_url)
|
||||
except WebsitePolicyError:
|
||||
page_blocked = None
|
||||
page_blocked = check_website_access(page_url)
|
||||
if page_blocked:
|
||||
logger.info("Blocked crawled page %s by rule %s", page_blocked["host"], page_blocked["rule"])
|
||||
pages.append({
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue