hermes-agent/self_evolution/git_analyzer.py

"""
Self Evolution Plugin — Git Analysis
=====================================

Analyzes git commit history for the dream consolidation engine.

Uses a single batched ``git log --stat --name-only`` call instead of
25+ individual subprocess invocations.

Extracted from reflection_engine.py for single-responsibility.
"""

from __future__ import annotations

import logging
import re
import subprocess
import time
from pathlib import Path
from typing import Dict

from self_evolution.models import CodeChangeAnalysis, CommitInfo

logger = logging.getLogger(__name__)


def analyze_code_changes(hours: int = 24) -> CodeChangeAnalysis:
    """Analyze git commits from the previous period.

    Uses a single batched git log call with --stat --name-only
    instead of 25+ individual subprocess calls.
    """
    project_root = str(Path(__file__).resolve().parent.parent)

    cutoff_epoch = time.time() - (hours * 3600)
    cutoff_date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(cutoff_epoch))

    try:
        # Single batched call: format + shortstat + name-only
        result = subprocess.run(
            ["git", "log",
             "--format=COMMITSTART%h%n%s%n%an%n%at%n%b%nENDHEADER",
             "--shortstat", "--name-only",
             "--no-merges", f"--since={cutoff_date}", "-15"],
            capture_output=True, text=True, timeout=30,
            cwd=project_root,
        )
        if result.returncode != 0 or not result.stdout.strip():
            return CodeChangeAnalysis()

        commits = _parse_batched_output(result.stdout)
        if not commits:
            return CodeChangeAnalysis()

        # Aggregate stats
        total_ins = sum(c.insertions for c in commits)
        total_del = sum(c.deletions for c in commits)
        total_files = sum(c.files_changed for c in commits)
        authors = list(dict.fromkeys(c.author for c in commits))

        # Categorize by conventional commit prefix
        categories: Dict[str, int] = {}
        for c in commits:
            cat = _categorize_commit(c.subject)
            categories[cat] = categories.get(cat, 0) + 1

        # Extract top-level module areas
        all_files = []
        for c in commits:
            all_files.extend(c.file_list)
        areas = list(dict.fromkeys(
            f.split("/")[0] for f in all_files
            if "/" in f and not f.startswith(".")
        ))[:10]

        return CodeChangeAnalysis(
            commits=commits,
            total_commits=len(commits),
            total_insertions=total_ins,
            total_deletions=total_del,
            total_files_changed=total_files,
            authors=authors,
            change_categories=categories,
            areas_touched=areas,
        )

    except (subprocess.SubprocessError, FileNotFoundError, OSError):
        logger.debug("git analysis unavailable", exc_info=True)
        return CodeChangeAnalysis()


def _parse_batched_output(stdout: str) -> list:
    """Parse the batched git log output into CommitInfo objects."""
    commits = []
    raw_commits = stdout.split("COMMITSTART")
    for raw in raw_commits:
        raw = raw.strip()
        if not raw:
            continue

        header_end = raw.find("ENDHEADER")
        if header_end < 0:
            continue
        header = raw[:header_end].strip()
        lines = header.split("\n")
        if len(lines) < 4:
            continue

        hash_short = lines[0].strip()
        subject = lines[1].strip()
        author = lines[2].strip()
        try:
            timestamp = float(lines[3].strip())
        except ValueError:
            continue
        body = "\n".join(lines[4:]).strip()[:500]

        # After ENDHEADER: shortstat line(s) + file list
        rest = raw[header_end + len("ENDHEADER"):].strip()

        files_changed = 0
        insertions = 0
        deletions = 0
        file_list = []
        stat_done = False
        for rline in rest.split("\n"):
            rline = rline.strip()
            if not rline:
                continue
            if not stat_done and ("files changed" in rline or "file changed" in rline
                                  or "insertion" in rline or "deletion" in rline):
                files_changed = _parse_int(r'(\d+) files? changed', rline)
                insertions = _parse_int(r'(\d+) insertion', rline)
                deletions = _parse_int(r'(\d+) deletion', rline)
                stat_done = True
                continue
            if "/" in rline or "." in rline:
                file_list.append(rline)

        commits.append(CommitInfo(
            hash_short=hash_short,
            subject=subject,
            body=body,
            author=author,
            timestamp=timestamp,
            files_changed=files_changed,
            insertions=insertions,
            deletions=deletions,
            file_list=file_list[:20],
        ))

    return commits


# ── Helpers ───────────────────────────────────────────────────────────────


def _parse_int(pattern: str, text: str) -> int:
    """Extract first integer matching regex pattern from text."""
    m = re.search(pattern, text)
    return int(m.group(1)) if m else 0


def _categorize_commit(subject: str) -> str:
    """Categorize commit by conventional commit prefix."""
    s = subject.lower()
    for prefix in ("feat", "fix", "refactor", "test", "docs", "chore", "perf", "style", "ci", "build"):
        if s.startswith(prefix):
            return prefix
    return "other"