# hermes-agent/code_stats.py

#!/usr/bin/env python3
"""代码统计分析工具 - 按目录/类型分组，区分代码行/注释行/空行，ASCII柱状图"""

import os
import sys
import unicodedata
from collections import defaultdict
from pathlib import Path
from typing import Optional

# File extension -> (line-comment marker, block-comment opener, block-comment closer).
# A None entry means that kind of comment is not looked for in that file type.
_C_FAMILY = ("//", "/*", "*/")
_HASH_LINE_ONLY = ("#", None, None)

LANG_MAP = {
    ".py": ("#", '"""', '"""'),
    ".js": _C_FAMILY,
    ".ts": _C_FAMILY,
    ".tsx": _C_FAMILY,
    ".jsx": _C_FAMILY,
    ".java": _C_FAMILY,
    ".c": _C_FAMILY,
    ".cpp": _C_FAMILY,
    ".h": _C_FAMILY,
    ".go": _C_FAMILY,
    ".rs": _C_FAMILY,
    ".rb": ("#", "=begin", "=end"),
    ".sh": _HASH_LINE_ONLY,
    ".bash": _HASH_LINE_ONLY,
    ".yml": _HASH_LINE_ONLY,
    ".yaml": _HASH_LINE_ONLY,
    ".toml": _HASH_LINE_ONLY,
    ".sql": ("--", "/*", "*/"),
    ".html": (None, "<!--", "-->"),
    ".css": (None, "/*", "*/"),
    ".vue": _C_FAMILY,
    ".swift": _C_FAMILY,
    ".kt": _C_FAMILY,
    ".lua": ("--", "--[[", "]]"),
    ".r": _HASH_LINE_ONLY,
    ".php": _C_FAMILY,
}

# Path component names that cause a file to be left out of the statistics.
# The grouping below is informal and only meant to make the list easier to scan.
SKIP_DIRS = {
    # version-control metadata
    ".git", ".svn", ".hg",
    # dependency trees and virtual environments
    "node_modules", "vendor", ".venv", "venv", "env", ".env", ".eggs",
    # tool caches
    "__pycache__", ".tox", ".mypy_cache", ".pytest_cache",
    # build and report output
    "dist", "build", "target", ".next", ".nuxt", "coverage",
}


def should_skip(path: Path) -> bool:
    """Return True when at least one component of *path* is named in SKIP_DIRS."""
    # "Some part is a skipped name" is the same as "the two sets of names overlap".
    return not SKIP_DIRS.isdisjoint(path.parts)


def analyze_file(filepath: Path) -> Optional[dict]:
    """Classify every line of one source file as code, comment or blank.

    Args:
        filepath: File to analyze. Its lower-cased suffix selects the comment
            markers from LANG_MAP.

    Returns:
        A dict with the keys ``ext``, ``total``, ``code``, ``comment`` and
        ``blank``, or ``None`` when the suffix is not in LANG_MAP or the file
        cannot be opened or read.

    The classification is a line-based heuristic, not a parser:

    * a block comment is only recognized when its opener is the first
      non-whitespace text on a line;
    * a line is counted as comment as a whole, even if code follows the
      closing marker on the same line;
    * blank lines inside a block comment are counted as blank;
    * undecodable bytes are dropped (``errors="ignore"``) rather than
      failing the file.
    """
    ext = filepath.suffix.lower()
    markers = LANG_MAP.get(ext)
    if markers is None:
        return None
    line_comment, block_start, block_end = markers

    try:
        with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
            lines = f.readlines()
    except OSError:
        # PermissionError is a subclass of OSError, so this covers it too.
        return None

    total = len(lines)
    blank = 0
    comment = 0
    in_block = False

    for line in lines:
        stripped = line.strip()
        if not stripped:
            blank += 1
            continue
        if in_block:
            # Still inside a multi-line comment; it ends on the first line
            # that contains the closing marker anywhere.
            comment += 1
            if block_end and block_end in stripped:
                in_block = False
            continue
        if block_start and stripped.startswith(block_start):
            comment += 1
            # Look for the closer only *after* the opener: for Python the two
            # markers are the same string, so searching the whole line would
            # always find the opener itself.
            if block_end and block_end not in stripped[len(block_start):]:
                in_block = True
            continue
        if line_comment and stripped.startswith(line_comment):
            comment += 1

    # Whatever is neither blank nor comment is counted as code.
    return {
        "ext": ext,
        "total": total,
        "code": total - blank - comment,
        "comment": comment,
        "blank": blank,
    }


def bar(value, max_val, width=25):
    """Render *value* relative to *max_val* as a bar of exactly *width* cells.

    Args:
        value: Quantity to draw.
        max_val: Quantity that corresponds to a completely filled bar.
        width: Number of character cells in the bar.

    Returns:
        A string of ``█`` (filled) followed by ``░`` (empty) cells. When
        *max_val* is not positive there is nothing to scale against, so an
        all-empty track is returned; this keeps the output the same width as
        every other bar and matches what ``stacked_bar`` does for a zero total.
    """
    if max_val <= 0:
        return "░" * width
    # Clamp so an out-of-range value can neither overflow the track nor
    # produce a negative repeat count.
    filled = min(max(int(width * value / max_val), 0), width)
    return "█" * filled + "░" * (width - filled)


def stacked_bar(code, comment, blank, total, width=30):
    """Render a three-segment bar: █ code, ▓ comment, ░ blank.

    The code and comment segments are scaled against *total* and truncated to
    whole cells; the blank segment takes whatever is left of *width*, so the
    *blank* argument itself does not enter the calculation.
    """
    if total == 0:
        return "░" * width
    code_cells = int(width * code / total)
    comment_cells = int(width * comment / total)
    segments = (
        "█" * code_cells,
        "▓" * comment_cells,
        "░" * (width - code_cells - comment_cells),
    )
    return "".join(segments)


def fmt(n):
    """Format a number with comma thousands separators."""
    return format(n, ",")


def _display_width(text):
    """Return the number of terminal cells *text* is expected to occupy.

    Characters whose East Asian Width is Wide or Fullwidth (CJK ideographs,
    for example) are counted as two cells, everything else as one.
    """
    return sum(
        2 if unicodedata.east_asian_width(ch) in ("W", "F") else 1
        for ch in text
    )


def _rjust_display(value, width):
    """Right-align ``str(value)`` in *width* terminal cells, never truncating."""
    text = str(value)
    return " " * max(width - _display_width(text), 0) + text


def print_table(title, rows, headers, col_widths, grand_row=None):
    """Print a right-aligned text table to stdout.

    Args:
        title: Caption printed above the table.
        rows: Iterable of rows; each row is a sequence of cell values.
        headers: Column header texts.
        col_widths: Width of each column in terminal cells.
        grand_row: Optional summary row, printed under a second rule.

    Padding is computed from display width rather than ``len()``:
    ``str.rjust`` counts code points, which misaligns double-width CJK
    headers and labels against the ASCII number columns.
    """
    print(f"\n📁 {title}")
    header_line = "".join(_rjust_display(h, w) for h, w in zip(headers, col_widths))
    # The rule spans the header's on-screen width, not its character count.
    rule = "─" * _display_width(header_line)
    print(header_line)
    print(rule)

    for row in rows:
        print("".join(_rjust_display(v, w) for v, w in zip(row, col_widths)))

    if grand_row:
        print(rule)
        print("".join(_rjust_display(v, w) for v, w in zip(grand_row, col_widths)))


# Per-file counters that are summed into every aggregate bucket.
_COUNT_KEYS = ("total", "code", "comment", "blank")


def _collect_file_results(target):
    """Analyze every recognized file under *target*, never entering skipped dirs.

    Returns a list of the dicts produced by ``analyze_file``, each extended
    with a ``path`` key holding the file's path relative to *target*.
    """
    results = []
    for dirpath, dirnames, filenames in os.walk(target):
        # Prune in place so os.walk skips these subtrees entirely instead of
        # listing (for example) all of node_modules only to discard it. Only
        # names *below* target are tested, so a target that itself lives under
        # a directory called "build" or "env" is still scanned.
        dirnames[:] = [d for d in dirnames if not should_skip(Path(d))]
        for name in filenames:
            filepath = Path(dirpath) / name
            # Leaves out broken symlinks and special files such as FIFOs.
            if not filepath.is_file():
                continue
            result = analyze_file(filepath)
            if result:
                results.append({"path": filepath.relative_to(target), **result})
    return results


def _group_stats(file_results, key_of):
    """Sum the per-file counters into buckets keyed by ``key_of(result)``."""
    stats = defaultdict(lambda: dict.fromkeys(("files",) + _COUNT_KEYS, 0))
    for r in file_results:
        bucket = stats[key_of(r)]
        bucket["files"] += 1
        for k in _COUNT_KEYS:
            bucket[k] += r[k]
    return stats


def _by_code_desc(stats):
    """Return the (label, bucket) pairs of *stats*, most code lines first."""
    return sorted(stats.items(), key=lambda item: item[1]["code"], reverse=True)


def _comment_rate(bucket):
    """Comment lines as a percentage of code + comment lines (0 if both are 0)."""
    return bucket["comment"] / max(bucket["code"] + bucket["comment"], 1) * 100


def _stat_row(label, bucket):
    """Build one table row (all cells as display strings) for *bucket*."""
    return [
        label,
        str(bucket["files"]),
        fmt(bucket["code"]),
        fmt(bucket["comment"]),
        fmt(bucket["blank"]),
        f"{_comment_rate(bucket):.1f}%",
    ]


def _print_distribution(title, ordered, grand_code, label_width):
    """Print one bar per bucket, scaled to the bucket with the most code lines."""
    max_code = max(b["code"] for _, b in ordered)
    print(f"\n📊 {title}")
    for label, b in ordered:
        # The bar is relative to the largest bucket, the percentage to the total.
        pct = b["code"] / max(grand_code, 1) * 100
        print(f"  {label:<{label_width}} {bar(b['code'], max_code, 30)} {pct:5.1f}%")


def main():
    """Command-line entry point: print line statistics for a directory tree.

    The directory is taken from ``sys.argv[1]`` and defaults to the current
    directory. Exits with status 1 when it is not a directory. Otherwise
    prints an overall summary, a table and bar chart per top-level directory,
    a table and bar chart per file extension, and a stacked
    code/comment/blank bar per top-level directory.
    """
    target = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(".")
    if not target.is_dir():
        # Diagnostics go to stderr so they do not mix with report output.
        print(f"❌ 不是有效目录: {target}", file=sys.stderr)
        sys.exit(1)

    file_results = _collect_file_results(target)
    if not file_results:
        print("未找到可分析的代码文件。")
        return

    # Grand totals over all analyzed files.
    grand = {"files": len(file_results)}
    for k in _COUNT_KEYS:
        grand[k] = sum(r[k] for r in file_results)
    grand_row = _stat_row("合计", grand)
    code_share = grand["code"] / max(grand["total"], 1) * 100

    print(f"\n📊 代码统计 — {target.resolve()}")
    print(f"   {grand['files']} 个文件 | {fmt(grand['total'])} 行 (代码 {fmt(grand['code'])} / 注释 {fmt(grand['comment'])} / 空行 {fmt(grand['blank'])})")
    print(f"   代码占比 {code_share:.1f}% | 注释率 {_comment_rate(grand):.1f}%")

    # -- Grouped by top-level directory; files directly in target go to "[root]" --
    dir_stats = _group_stats(
        file_results,
        lambda r: r["path"].parts[0] if len(r["path"].parts) > 1 else "[root]",
    )
    dirs = _by_code_desc(dir_stats)
    print_table(
        "按目录分组",
        [_stat_row(d, b) for d, b in dirs],
        ["目录", "文件数", "代码行", "注释行", "空行", "注释率"],
        [16, 8, 10, 10, 10, 9],
        grand_row=grand_row,
    )
    _print_distribution("目录代码量分布", dirs, grand["code"], 14)

    # -- Grouped by file extension --
    ext_stats = _group_stats(file_results, lambda r: r["ext"])
    exts = _by_code_desc(ext_stats)
    print_table(
        "按文件类型",
        [_stat_row(ext, b) for ext, b in exts],
        ["类型", "文件数", "代码行", "注释行", "空行", "注释率"],
        [10, 8, 10, 10, 10, 9],
        grand_row=grand_row,
    )
    _print_distribution("类型代码量分布", exts, grand["code"], 8)

    # -- Code / comment / blank composition per top-level directory --
    print("\n📊 目录代码结构（█代码 ▓注释 ░空行）")
    for d, b in dirs:
        print(f"  {d:<14} {stacked_bar(b['code'], b['comment'], b['blank'], b['total'], 40)}")

    print()


# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()