diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json index 3f1b6b2..5c75bd7 100644 --- a/.claude-plugin/plugin.json +++ b/.claude-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "understand-anything", "description": "AI-powered codebase understanding — analyze, visualize, and explain any project", - "version": "2.7.4", + "version": "2.7.5", "author": { "name": "Lum1104" }, diff --git a/.copilot-plugin/plugin.json b/.copilot-plugin/plugin.json index b5b668f..b18679d 100644 --- a/.copilot-plugin/plugin.json +++ b/.copilot-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "understand-anything", "description": "AI-powered codebase understanding — analyze, visualize, and explain any project", - "version": "2.7.4", + "version": "2.7.5", "author": { "name": "Lum1104" }, diff --git a/.cursor-plugin/plugin.json b/.cursor-plugin/plugin.json index 0e5ba34..075114d 100644 --- a/.cursor-plugin/plugin.json +++ b/.cursor-plugin/plugin.json @@ -2,7 +2,7 @@ "name": "understand-anything", "displayName": "Understand Anything", "description": "AI-powered codebase understanding — analyze, visualize, and explain any project", - "version": "2.7.4", + "version": "2.7.5", "author": { "name": "Lum1104" }, diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c0063d1..69771eb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -33,4 +33,4 @@ jobs: run: pnpm --filter @understand-anything/core test - name: Test skill - run: pnpm --filter @understand-anything/skill test + run: pnpm test diff --git a/CLAUDE.md b/CLAUDE.md index 9ab3dc7..f79ae95 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -35,7 +35,7 @@ An open-source tool combining LLM intelligence + static analysis to produce inte - `pnpm --filter @understand-anything/core build` — Build the core package - `pnpm --filter @understand-anything/core test` — Run core tests - `pnpm --filter @understand-anything/skill build` — Build the plugin package -- `pnpm --filter @understand-anything/skill test` — Run 
plugin tests +- `pnpm test` — Run all tests (skill tests live at repo-root `tests/skill/`, picked up by root `vitest.config.ts`) - `pnpm --filter @understand-anything/dashboard build` — Build the dashboard - `pnpm dev:dashboard` — Start dashboard dev server - `pnpm lint` — Run ESLint across the project diff --git a/docs/superpowers/plans/2026-05-24-semantic-batching-and-output-chunking-impl.md b/docs/superpowers/plans/2026-05-24-semantic-batching-and-output-chunking-impl.md new file mode 100644 index 0000000..06e5361 --- /dev/null +++ b/docs/superpowers/plans/2026-05-24-semantic-batching-and-output-chunking-impl.md @@ -0,0 +1,2353 @@ +# Semantic Batching and Output Chunking Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. **All dispatched subagents must use `model="opus"`** (project convention). + +**Goal:** Replace count-based file-analyzer batching with Louvain semantic batching (Phase 1.5), and add defensive output chunking in file-analyzer (60 nodes / 120 edges per part), so `/understand` stops hitting Bedrock OPUS output caps and produces better cross-batch semantic edge coverage. One PR. + +**Architecture:** Add `compute-batches.mjs` (Phase 1.5) which runs Louvain on the import graph from `scan-result.json` and writes `batches.json` containing pre-built `batchImportData` + `neighborMap` (paths + exported symbols). file-analyzer reads neighborMap to confidently emit cross-batch edges, and self-splits its output into `batch-<N>-part-<K>.json` when above thresholds. The `merge-batch-graphs.py` glob already accepts multi-part naming (no glob change needed; the only additions are a multi-part stderr report and a missing-part warning). 
+ +**Tech Stack:** Node.js ≥22 + pnpm ≥10, `graphology` + `graphology-communities-louvain` (new deps), `@understand-anything/core` TreeSitterPlugin (existing), Vitest for `.mjs` tests, Python `unittest` for `merge-batch-graphs.py` tests. + +**Source spec:** [`docs/superpowers/specs/2026-05-24-semantic-batching-and-output-chunking-design.md`](../specs/2026-05-24-semantic-batching-and-output-chunking-design.md) + +**Branch:** `feat/semantic-batching-and-output-chunking` (already created). + +--- + +## File Structure + +**Create:** + +- `understand-anything-plugin/skills/understand/compute-batches.mjs` — Phase 1.5 script +- `understand-anything-plugin/skills/understand/test_compute_batches.test.mjs` — Vitest unit tests +- `understand-anything-plugin/skills/understand/test/fixtures/scan-result-3-cliques.json` — synthetic test fixture (3 disjoint import cliques) +- `understand-anything-plugin/skills/understand/test/fixtures/scan-result-large-community.json` — synthetic test fixture (40-node complete graph) +- `understand-anything-plugin/skills/understand/test/fixtures/scan-result-non-code.json` — synthetic test fixture (Dockerfile/CI/SQL groups) + +**Modify:** + +- `understand-anything-plugin/package.json` — add `graphology` + `graphology-communities-louvain` to `dependencies` +- `understand-anything-plugin/skills/understand/SKILL.md` — insert Phase 1.5; replace Phase 2 ANALYZE batching prose; replace Incremental update path +- `understand-anything-plugin/agents/file-analyzer.md` — add Cross-batch context (neighborMap) section; replace Writing Results with multi-part protocol +- `understand-anything-plugin/skills/understand/merge-batch-graphs.py` — multi-part stderr summary + missing-part warning +- `understand-anything-plugin/skills/understand/test_merge_batch_graphs.py` — new `TestMultiPart` class +- `understand-anything-plugin/package.json`, `understand-anything-plugin/.claude-plugin/plugin.json`, `.claude-plugin/plugin.json`, `.cursor-plugin/plugin.json`, 
`.copilot-plugin/plugin.json` — version bump (Task 16) + +--- + +## Task 1: Add graphology dependencies + +**Files:** +- Modify: `understand-anything-plugin/package.json` + +- [ ] **Step 1: Add deps to package.json** + +Edit `understand-anything-plugin/package.json` `dependencies` block: + +```json +{ + "name": "@understand-anything/skill", + "version": "2.7.4", + "type": "module", + "main": "dist/index.js", + "types": "dist/index.d.ts", + "scripts": { + "build": "tsc", + "test": "vitest run" + }, + "dependencies": { + "@understand-anything/core": "workspace:*", + "graphology": "^0.26.0", + "graphology-communities-louvain": "^2.0.2" + }, + "devDependencies": { + "@types/node": "^22.0.0", + "typescript": "^5.7.0", + "vitest": "^3.1.0" + } +} +``` + +- [ ] **Step 2: Install** + +Run from repo root: +```bash +pnpm install +``` +Expected: lockfile updates with graphology + graphology-communities-louvain; no other version churn. + +- [ ] **Step 3: Smoke test the imports work** + +Run from `understand-anything-plugin/`: +```bash +node -e "import('graphology').then(m => { const G = m.default; const g = new G({type:'undirected'}); g.addNode('a'); g.addNode('b'); g.addEdge('a','b'); console.log('graphology ok, edges:', g.size); })" +node -e "Promise.all([import('graphology'), import('graphology-communities-louvain')]).then(([G,L]) => { const g = new G.default({type:'undirected'}); ['a','b','c'].forEach(n => g.addNode(n)); g.addEdge('a','b'); g.addEdge('b','c'); console.log('louvain ok:', JSON.stringify(L.default(g))); })" +``` +Expected: prints `graphology ok, edges: 1` and `louvain ok: {...}` with community ids assigned. 
+ +- [ ] **Step 4: Commit** + +```bash +git add understand-anything-plugin/package.json pnpm-lock.yaml +git commit -m "deps: add graphology + graphology-communities-louvain" +``` + +--- + +## Task 2: Prototype compute-batches.mjs (load + Louvain print) + +This is the **feasibility prototype** — the spec gates the size-enforcement design on what real community sizes look like. Build the skeleton, then run it against a synthetic fixture (and optionally a real `scan-result.json` from this repo if one exists) before adding more code. + +**Files:** +- Create: `understand-anything-plugin/skills/understand/compute-batches.mjs` +- Create: `understand-anything-plugin/skills/understand/test/fixtures/scan-result-3-cliques.json` + +- [ ] **Step 1: Create test fixture (3 disjoint import cliques)** + +Create `understand-anything-plugin/skills/understand/test/fixtures/scan-result-3-cliques.json`: + +```json +{ + "name": "fixture-3-cliques", + "description": "Three disjoint import cliques for Louvain testing", + "languages": ["typescript"], + "frameworks": [], + "files": [ + {"path": "src/auth/login.ts", "language": "typescript", "sizeLines": 50, "fileCategory": "code"}, + {"path": "src/auth/session.ts", "language": "typescript", "sizeLines": 40, "fileCategory": "code"}, + {"path": "src/auth/tokens.ts", "language": "typescript", "sizeLines": 60, "fileCategory": "code"}, + {"path": "src/api/handlers.ts", "language": "typescript", "sizeLines": 80, "fileCategory": "code"}, + {"path": "src/api/middleware.ts", "language": "typescript", "sizeLines": 30, "fileCategory": "code"}, + {"path": "src/api/routes.ts", "language": "typescript", "sizeLines": 45, "fileCategory": "code"}, + {"path": "src/db/users.ts", "language": "typescript", "sizeLines": 70, "fileCategory": "code"}, + {"path": "src/db/queries.ts", "language": "typescript", "sizeLines": 55, "fileCategory": "code"}, + {"path": "src/db/migrations.ts", "language": "typescript", "sizeLines": 35, "fileCategory": "code"} + ], + 
"totalFiles": 9, + "filteredByIgnore": 0, + "estimatedComplexity": "small", + "importMap": { + "src/auth/login.ts": ["src/auth/session.ts", "src/auth/tokens.ts"], + "src/auth/session.ts": ["src/auth/tokens.ts"], + "src/auth/tokens.ts": [], + "src/api/handlers.ts": ["src/api/middleware.ts", "src/api/routes.ts"], + "src/api/middleware.ts": ["src/api/routes.ts"], + "src/api/routes.ts": [], + "src/db/users.ts": ["src/db/queries.ts", "src/db/migrations.ts"], + "src/db/queries.ts": ["src/db/migrations.ts"], + "src/db/migrations.ts": [] + } +} +``` + +- [ ] **Step 2: Write skeleton compute-batches.mjs (Louvain only, no neighborMap, no exports, no fallback)** + +Create `understand-anything-plugin/skills/understand/compute-batches.mjs`: + +```javascript +#!/usr/bin/env node +/** + * compute-batches.mjs — Phase 1.5 of /understand + * + * Reads scan-result.json, runs Louvain community detection on the import + * graph, and writes batches.json containing batches + neighborMap. + * + * Usage: + * node compute-batches.mjs [--changed-files=] + * + * Input: /.understand-anything/intermediate/scan-result.json + * Output: /.understand-anything/intermediate/batches.json + */ + +import { readFileSync, writeFileSync, existsSync } from 'node:fs'; +import { dirname, resolve, join } from 'node:path'; +import { fileURLToPath } from 'node:url'; + +import Graph from 'graphology'; +import louvain from 'graphology-communities-louvain'; + +// ── Skeleton main: load → Louvain → print sizes ─────────────────────────── +async function main() { + const projectRoot = process.argv[2]; + if (!projectRoot) { + process.stderr.write('Usage: node compute-batches.mjs [--changed-files=]\n'); + process.exit(1); + } + + const scanPath = join(projectRoot, '.understand-anything', 'intermediate', 'scan-result.json'); + if (!existsSync(scanPath)) { + process.stderr.write(`Error: scan-result.json not found at ${scanPath}\n`); + process.exit(1); + } + + const scan = JSON.parse(readFileSync(scanPath, 'utf-8')); + 
const codeFiles = (scan.files || []).filter(f => f.fileCategory === 'code'); + const importMap = scan.importMap || {}; + + process.stderr.write(`Loaded ${scan.files.length} files (${codeFiles.length} code).\n`); + + // Build undirected import graph + const g = new Graph({ type: 'undirected', allowSelfLoops: false }); + for (const f of codeFiles) g.addNode(f.path); + for (const [src, targets] of Object.entries(importMap)) { + if (!g.hasNode(src)) continue; + for (const tgt of targets) { + if (!g.hasNode(tgt) || src === tgt || g.hasEdge(src, tgt)) continue; + g.addEdge(src, tgt); + } + } + + // Run Louvain + const communities = louvain(g); // { nodeId: communityId } + + // Print size distribution + const sizeByCommunity = new Map(); + for (const [, cid] of Object.entries(communities)) { + sizeByCommunity.set(cid, (sizeByCommunity.get(cid) || 0) + 1); + } + const sizes = [...sizeByCommunity.values()].sort((a, b) => b - a); + process.stderr.write( + `Louvain produced ${sizes.length} communities. Size distribution: [${sizes.join(', ')}]\n`, + ); + process.stderr.write( + `Max community size: ${sizes[0] ?? 0}, min: ${sizes.at(-1) ?? 
0}, ` + + `>35: ${sizes.filter(s => s > 35).length}, <5: ${sizes.filter(s => s < 5).length}\n`, + ); +} + +// CLI entry guard (mirrors extract-structure.mjs pattern) +import { realpathSync } from 'node:fs'; +function isCliEntry() { + if (!process.argv[1]) return false; + try { + return realpathSync(fileURLToPath(import.meta.url)) === realpathSync(process.argv[1]); + } catch { + return false; + } +} + +if (isCliEntry()) { + try { + await main(); + } catch (err) { + process.stderr.write(`compute-batches.mjs failed: ${err.message}\n${err.stack}\n`); + process.exit(1); + } +} +``` + +- [ ] **Step 3: Run skeleton against the fixture** + +Create a temporary scratch directory with the fixture in the expected layout: + +```bash +mkdir -p /tmp/ua-prototype/.understand-anything/intermediate +cp understand-anything-plugin/skills/understand/test/fixtures/scan-result-3-cliques.json \ + /tmp/ua-prototype/.understand-anything/intermediate/scan-result.json +node understand-anything-plugin/skills/understand/compute-batches.mjs /tmp/ua-prototype +``` + +Expected stderr: +``` +Loaded 9 files (9 code). +Louvain produced 3 communities. Size distribution: [3, 3, 3] +Max community size: 3, min: 3, >35: 0, <5: 3 +``` + +(All 9 files split into 3 cliques of 3. All under min=5 — that's expected for the fixture; in the real plan we accept this and don't merge.) + +- [ ] **Step 4: (Optional) Run against this repo's scan-result.json if it exists** + +```bash +if [ -f .understand-anything/intermediate/scan-result.json ]; then + node understand-anything-plugin/skills/understand/compute-batches.mjs "$(pwd)" +else + echo "No real scan-result.json — skipping (fixture run is sufficient for prototype)." +fi +``` + +Record the output: if the real-repo run shows any community size > 35, implement edge-betweenness split in Task 4. Otherwise, Task 4 can be a minimal defensive WCC partition. 
+ +- [ ] **Step 5: Commit skeleton** + +```bash +git add understand-anything-plugin/skills/understand/compute-batches.mjs \ + understand-anything-plugin/skills/understand/test/fixtures/scan-result-3-cliques.json +git commit -m "feat(compute-batches): skeleton — Louvain on import graph (prototype)" +``` + +--- + +## Task 3: Write Vitest harness + first Louvain unit test + +**Files:** +- Create: `understand-anything-plugin/skills/understand/test_compute_batches.test.mjs` + +- [ ] **Step 1: Write failing test (Louvain produces 3 batches for 3 cliques)** + +Create `understand-anything-plugin/skills/understand/test_compute_batches.test.mjs`: + +```javascript +import { describe, it, expect, beforeEach } from 'vitest'; +import { mkdtempSync, mkdirSync, writeFileSync, readFileSync, rmSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { spawnSync } from 'node:child_process'; +import { fileURLToPath } from 'node:url'; +import { dirname, resolve } from 'node:path'; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const SCRIPT = resolve(__dirname, 'compute-batches.mjs'); +const FIXTURES = resolve(__dirname, 'test/fixtures'); + +function runScript(projectRoot, extraArgs = []) { + return spawnSync('node', [SCRIPT, projectRoot, ...extraArgs], { + encoding: 'utf-8', + }); +} + +function setupProject(fixtureName) { + const root = mkdtempSync(join(tmpdir(), 'ua-cb-test-')); + mkdirSync(join(root, '.understand-anything', 'intermediate'), { recursive: true }); + const fixturePath = join(FIXTURES, fixtureName); + const dest = join(root, '.understand-anything', 'intermediate', 'scan-result.json'); + writeFileSync(dest, readFileSync(fixturePath, 'utf-8')); + return root; +} + +function readBatches(projectRoot) { + const p = join(projectRoot, '.understand-anything', 'intermediate', 'batches.json'); + return JSON.parse(readFileSync(p, 'utf-8')); +} + +describe('compute-batches.mjs — Louvain basic', () => { + let projectRoot; + 
+ + beforeEach(() => { + projectRoot = setupProject('scan-result-3-cliques.json'); + }); + + it('produces 3 batches for 3 disjoint cliques', () => { + const result = runScript(projectRoot); + expect(result.status).toBe(0); + + const batches = readBatches(projectRoot); + expect(batches.algorithm).toBe('louvain'); + expect(batches.totalFiles).toBe(9); + expect(batches.batches.length).toBe(3); + + // Each batch should contain exactly one clique (3 files) + for (const b of batches.batches) { + expect(b.files.length).toBe(3); + const dirs = new Set(b.files.map(f => f.path.split('/')[1])); + expect(dirs.size).toBe(1); // all files in the batch share the same src/<dir>/ prefix + } + }); +}); +``` + +- [ ] **Step 2: Run test, expect FAIL** + +```bash +pnpm --filter @understand-anything/skill exec vitest run skills/understand/test_compute_batches.test.mjs -t "Louvain basic" +``` + +Expected: FAIL — `compute-batches.mjs` skeleton from Task 2 only prints to stderr, doesn't write `batches.json`. Test fails on `readBatches` → ENOENT. + +- [ ] **Step 3: Make skeleton write batches.json** + +Replace the trailing `process.stderr.write(...)` lines in `compute-batches.mjs` `main()` with the full minimal-batches output. 
Replace lines starting from `// Print size distribution` to end of `main()`: + +```javascript + // Group files by community id, sorted by largest first for stable assignment + const filesByCommunity = new Map(); + for (const [path, cid] of Object.entries(communities)) { + if (!filesByCommunity.has(cid)) filesByCommunity.set(cid, []); + filesByCommunity.get(cid).push(path); + } + + // Sort communities by size desc, then by min-path asc for determinism + const sortedCommunities = [...filesByCommunity.entries()] + .sort((a, b) => { + if (b[1].length !== a[1].length) return b[1].length - a[1].length; + const minA = [...a[1]].sort()[0]; + const minB = [...b[1]].sort()[0]; + return minA.localeCompare(minB); + }); + + // Build per-batch file list with full file metadata from scan + const fileMetaByPath = new Map(scan.files.map(f => [f.path, f])); + const batches = sortedCommunities.map(([, paths], idx) => ({ + batchIndex: idx + 1, + files: paths.sort().map(p => fileMetaByPath.get(p)), + batchImportData: {}, + neighborMap: {}, + })); + + const output = { + schemaVersion: 1, + algorithm: 'louvain', + totalFiles: scan.files.length, + totalBatches: batches.length, + batches, + }; + + const outPath = join(projectRoot, '.understand-anything', 'intermediate', 'batches.json'); + writeFileSync(outPath, JSON.stringify(output, null, 2), 'utf-8'); + process.stderr.write(`Wrote ${batches.length} batches to ${outPath}\n`); +``` + +- [ ] **Step 4: Run test, expect PASS** + +```bash +pnpm --filter @understand-anything/skill exec vitest run skills/understand/test_compute_batches.test.mjs -t "Louvain basic" +``` + +Expected: PASS. 
+ +- [ ] **Step 5: Commit** + +```bash +git add understand-anything-plugin/skills/understand/compute-batches.mjs \ + understand-anything-plugin/skills/understand/test_compute_batches.test.mjs +git commit -m "feat(compute-batches): emit batches.json with code communities" +``` + +--- + +## Task 4: Size enforcement — split oversized communities + +If the Task 2 prototype run showed any community > 35 files, consider a modularity-aware edge-betweenness split. Otherwise, implement the minimal defensive guard described in the steps below: deterministic alphabetical chunking of each oversized community. (A weakly-connected-component (WCC) split is not sufficient as the guard — the 40-node clique fixture used here is a single connected component, so a WCC split would leave it unsplit.) + +**Files:** +- Modify: `understand-anything-plugin/skills/understand/compute-batches.mjs` +- Modify: `understand-anything-plugin/skills/understand/test_compute_batches.test.mjs` +- Create: `understand-anything-plugin/skills/understand/test/fixtures/scan-result-large-community.json` + +- [ ] **Step 1: Create large-community fixture (40-node complete graph in one community)** + +Create `understand-anything-plugin/skills/understand/test/fixtures/scan-result-large-community.json`. 
Build programmatically once and commit the JSON: + +```bash +node -e " +const files = []; +const importMap = {}; +for (let i = 0; i < 40; i++) { + const p = 'src/big/f' + i + '.ts'; + files.push({ path: p, language: 'typescript', sizeLines: 50, fileCategory: 'code' }); + importMap[p] = []; + // Every file imports every other — guarantees a single community of 40 + for (let j = 0; j < 40; j++) if (i !== j) importMap[p].push('src/big/f' + j + '.ts'); +} +const out = { + name: 'fixture-large-community', + description: '40 files all importing each other — one community over the max=35 cap', + languages: ['typescript'], + frameworks: [], + files, + totalFiles: 40, + filteredByIgnore: 0, + estimatedComplexity: 'moderate', + importMap, +}; +console.log(JSON.stringify(out, null, 2)); +" > understand-anything-plugin/skills/understand/test/fixtures/scan-result-large-community.json +``` + +- [ ] **Step 2: Write failing test (large community splits to ≤ 35)** + +Append to `test_compute_batches.test.mjs`: + +```javascript +describe('compute-batches.mjs — size enforcement', () => { + it('splits a 40-node clique into batches ≤ 35', () => { + const root = setupProject('scan-result-large-community.json'); + const result = runScript(root); + expect(result.status).toBe(0); + + const batches = readBatches(root); + expect(batches.totalFiles).toBe(40); + for (const b of batches.batches) { + expect(b.files.length).toBeLessThanOrEqual(35); + } + // Sum of all batch file counts equals total files + const sum = batches.batches.reduce((acc, b) => acc + b.files.length, 0); + expect(sum).toBe(40); + // Warning was emitted to stderr + expect(result.stderr).toMatch(/Warning: compute-batches: community size 40 > max 35/); + }); +}); +``` + +- [ ] **Step 3: Run test, expect FAIL** + +```bash +pnpm --filter @understand-anything/skill exec vitest run skills/understand/test_compute_batches.test.mjs -t "size enforcement" +``` + +Expected: FAIL — at least one batch has 40 files; no warning emitted. 
+ +- [ ] **Step 4: Implement alphabetical-chunk split + warning** + +In `compute-batches.mjs`, replace the existing group-by-community block (the code immediately after the `const communities = louvain(g);` line) with the version below, which keeps the grouping and adds size enforcement: + +```javascript + // Group files by community id + const filesByCommunity = new Map(); + for (const [path, cid] of Object.entries(communities)) { + if (!filesByCommunity.has(cid)) filesByCommunity.set(cid, []); + filesByCommunity.get(cid).push(path); + } + + // Size enforcement: split any community > MAX_COMMUNITY_SIZE. + // Strategy: deterministic alphabetical chunking within the oversize community. + // Edge-betweenness would be more modularity-aware but adds dependency surface; + // alphabetical chunking is deterministic, locality-preserving for co-located + // files, and bounded by the cap. Each sub-community gets a fresh synthetic id. + const MAX_COMMUNITY_SIZE = 35; + const splitCommunities = new Map(); + let nextSyntheticId = 0; + for (const [cid, paths] of filesByCommunity) { + if (paths.length <= MAX_COMMUNITY_SIZE) { + splitCommunities.set(cid, paths); + continue; + } + process.stderr.write( + `Warning: compute-batches: community size ${paths.length} > max ${MAX_COMMUNITY_SIZE} ` + + `— splitting via alphabetical chunking — modularity may decrease\n`, + ); + const sorted = [...paths].sort(); + const parts = Math.ceil(paths.length / MAX_COMMUNITY_SIZE); + const perPart = Math.ceil(paths.length / parts); + for (let i = 0; i < parts; i++) { + const slice = sorted.slice(i * perPart, (i + 1) * perPart); + const synthId = `__split_${cid}_${nextSyntheticId++}`; + splitCommunities.set(synthId, slice); + } + } +``` + +Then update the `sortedCommunities` line to use `splitCommunities` instead of `filesByCommunity`: + +```javascript + const sortedCommunities = [...splitCommunities.entries()] +``` + +- [ ] **Step 5: Run test, expect PASS** + +```bash +pnpm --filter @understand-anything/skill exec vitest run 
skills/understand/test_compute_batches.test.mjs -t "size enforcement" +``` + +Expected: PASS — 40 files split into 2 batches of 20 each, warning emitted. + +- [ ] **Step 6: Run prior test too, expect still PASS** + +```bash +pnpm --filter @understand-anything/skill exec vitest run skills/understand/test_compute_batches.test.mjs +``` + +Expected: all tests PASS. + +- [ ] **Step 7: Commit** + +```bash +git add understand-anything-plugin/skills/understand/compute-batches.mjs \ + understand-anything-plugin/skills/understand/test_compute_batches.test.mjs \ + understand-anything-plugin/skills/understand/test/fixtures/scan-result-large-community.json +git commit -m "feat(compute-batches): split communities > 35 with visible warning" +``` + +--- + +## Task 5: Exports extraction via TreeSitterPlugin + +**Files:** +- Modify: `understand-anything-plugin/skills/understand/compute-batches.mjs` +- Modify: `understand-anything-plugin/skills/understand/test_compute_batches.test.mjs` + +- [ ] **Step 1: Write failing test (exports populated on real TS files)** + +Add a fixture-on-disk test that writes real source files and points the fixture at them. 
Append to `test_compute_batches.test.mjs`: + +```javascript +describe('compute-batches.mjs — exports extraction', () => { + it('populates exports for code files via tree-sitter', () => { + const root = mkdtempSync(join(tmpdir(), 'ua-cb-exp-')); + mkdirSync(join(root, '.understand-anything', 'intermediate'), { recursive: true }); + mkdirSync(join(root, 'src'), { recursive: true }); + writeFileSync(join(root, 'src', 'a.ts'), + 'export function greet(name: string) { return "hi " + name; }\n' + + 'export class Greeter { greet(n: string) { return "hi " + n; } }\n'); + writeFileSync(join(root, 'src', 'b.ts'), + 'import { greet } from "./a";\nexport const helper = () => greet("world");\n'); + + const scan = { + name: 'exports-test', + description: '', + languages: ['typescript'], + frameworks: [], + files: [ + { path: 'src/a.ts', language: 'typescript', sizeLines: 2, fileCategory: 'code' }, + { path: 'src/b.ts', language: 'typescript', sizeLines: 2, fileCategory: 'code' }, + ], + totalFiles: 2, filteredByIgnore: 0, estimatedComplexity: 'small', + importMap: { 'src/a.ts': [], 'src/b.ts': ['src/a.ts'] }, + }; + writeFileSync( + join(root, '.understand-anything', 'intermediate', 'scan-result.json'), + JSON.stringify(scan)); + + const result = runScript(root); + expect(result.status).toBe(0); + + const batches = readBatches(root); + // batches.json doesn't directly store exports — they live in neighborMap. + // For this test, dig into the script's internal exports map by re-reading + // it. Add an `exportsByPath` debug field to batches.json output (see impl). + expect(batches.exportsByPath).toBeDefined(); + expect(batches.exportsByPath['src/a.ts']).toEqual( + expect.arrayContaining(['greet', 'Greeter'])); + expect(batches.exportsByPath['src/b.ts']).toEqual( + expect.arrayContaining(['helper'])); + }); +}); +``` + +(The `exportsByPath` debug field is a temporary affordance that we keep so future tasks can inspect exports without going through neighborMap. 
It's emitted in the script output but not consumed by Phase 2 — it's a side-channel for testing and observability.) + +- [ ] **Step 2: Run test, expect FAIL** + +```bash +pnpm --filter @understand-anything/skill exec vitest run skills/understand/test_compute_batches.test.mjs -t "exports extraction" +``` + +Expected: FAIL — `batches.exportsByPath` is undefined. + +- [ ] **Step 3: Add TreeSitterPlugin loader + exports loop** + +In `compute-batches.mjs`, add core import dance at top of the file (after existing imports): + +```javascript +import { createRequire } from 'node:module'; +import { pathToFileURL } from 'node:url'; + +const __filename = fileURLToPath(import.meta.url); +const PLUGIN_ROOT = resolve(dirname(__filename), '../..'); +const require = createRequire(resolve(PLUGIN_ROOT, 'package.json')); + +let core; +try { + core = await import(pathToFileURL(require.resolve('@understand-anything/core')).href); +} catch { + core = await import(pathToFileURL(resolve(PLUGIN_ROOT, 'packages/core/dist/index.js')).href); +} +const { TreeSitterPlugin, PluginRegistry, builtinLanguageConfigs, registerAllParsers } = core; +``` + +Then add an `extractExports(projectRoot, codeFiles)` function before `main()`: + +```javascript +/** + * For each code file, returns its top-level exported symbol names (functions, + * classes, exported consts). Per-file errors are swallowed into [] with a + * visible warning so a single bad file does not abort batching. + * + * Returns Map<string, string[]> (file path → exported symbol names). 
+ */ +async function extractExports(projectRoot, codeFiles) { + const tsConfigs = builtinLanguageConfigs.filter(c => c.treeSitter); + const tsPlugin = new TreeSitterPlugin(tsConfigs); + await tsPlugin.init(); + const registry = new PluginRegistry(); + registry.register(tsPlugin); + registerAllParsers(registry); + + const exportsByPath = new Map(); + for (const file of codeFiles) { + const abs = join(projectRoot, file.path); + let content; + try { + content = readFileSync(abs, 'utf-8'); + } catch (err) { + process.stderr.write( + `Warning: compute-batches: exports extraction failed for ${file.path} ` + + `(read error: ${err.message}) — symbols=[] in neighborMap — ` + + `cross-batch edges to this file limited to file-level\n`, + ); + exportsByPath.set(file.path, []); + continue; + } + try { + const analysis = registry.analyzeFile(file.path, content); + const names = (analysis?.exports || []).map(e => e.name).filter(Boolean); + exportsByPath.set(file.path, names); + } catch (err) { + process.stderr.write( + `Warning: compute-batches: exports extraction failed for ${file.path} ` + + `(${err.message}) — symbols=[] in neighborMap — ` + + `cross-batch edges to this file limited to file-level\n`, + ); + exportsByPath.set(file.path, []); + } + } + return exportsByPath; +} +``` + +In `main()`, after building `codeFiles` and before Louvain, call: + +```javascript + const exportsByPath = await extractExports(projectRoot, codeFiles); +``` + +In the output object, attach the debug field: + +```javascript + const output = { + schemaVersion: 1, + algorithm: 'louvain', + totalFiles: scan.files.length, + totalBatches: batches.length, + exportsByPath: Object.fromEntries(exportsByPath), + batches, + }; +``` + +- [ ] **Step 4: Run test, expect PASS** + +```bash +pnpm --filter @understand-anything/skill exec vitest run skills/understand/test_compute_batches.test.mjs -t "exports extraction" +``` + +Expected: PASS. 
+ +- [ ] **Step 5: Run all tests, expect still PASS** + +```bash +pnpm --filter @understand-anything/skill exec vitest run skills/understand/test_compute_batches.test.mjs +``` + +Expected: all PASS. + +- [ ] **Step 6: Commit** + +```bash +git add understand-anything-plugin/skills/understand/compute-batches.mjs \ + understand-anything-plugin/skills/understand/test_compute_batches.test.mjs +git commit -m "feat(compute-batches): extract top-level exports via TreeSitter, warn on failure" +``` + +--- + +## Task 6: Non-code batching (Groups A-E) + +**Files:** +- Modify: `understand-anything-plugin/skills/understand/compute-batches.mjs` +- Modify: `understand-anything-plugin/skills/understand/test_compute_batches.test.mjs` +- Create: `understand-anything-plugin/skills/understand/test/fixtures/scan-result-non-code.json` + +- [ ] **Step 1: Create non-code fixture** + +Create `understand-anything-plugin/skills/understand/test/fixtures/scan-result-non-code.json`: + +```json +{ + "name": "fixture-non-code", + "description": "Mix of non-code files exercising Groups A-E", + "languages": ["typescript", "dockerfile", "yaml", "sql", "markdown"], + "frameworks": [], + "files": [ + {"path": "src/index.ts", "language": "typescript", "sizeLines": 10, "fileCategory": "code"}, + {"path": "Dockerfile", "language": "dockerfile", "sizeLines": 20, "fileCategory": "infra"}, + {"path": "docker-compose.yml", "language": "yaml", "sizeLines": 15, "fileCategory": "infra"}, + {"path": ".dockerignore", "language": "config", "sizeLines": 5, "fileCategory": "config"}, + {"path": "services/api/Dockerfile", "language": "dockerfile", "sizeLines": 18, "fileCategory": "infra"}, + {"path": "services/api/docker-compose.yml", "language": "yaml", "sizeLines": 12, "fileCategory": "infra"}, + {"path": ".github/workflows/ci.yml", "language": "yaml", "sizeLines": 30, "fileCategory": "infra"}, + {"path": ".github/workflows/deploy.yml", "language": "yaml", "sizeLines": 25, "fileCategory": "infra"}, + {"path": 
"migrations/001_init.sql", "language": "sql", "sizeLines": 40, "fileCategory": "data"}, + {"path": "migrations/002_users.sql", "language": "sql", "sizeLines": 20, "fileCategory": "data"}, + {"path": "docs/getting-started.md", "language": "markdown", "sizeLines": 100, "fileCategory": "docs"}, + {"path": "README.md", "language": "markdown", "sizeLines": 200, "fileCategory": "docs"} + ], + "totalFiles": 12, + "filteredByIgnore": 0, + "estimatedComplexity": "small", + "importMap": { + "src/index.ts": [], + "Dockerfile": [], "docker-compose.yml": [], ".dockerignore": [], + "services/api/Dockerfile": [], "services/api/docker-compose.yml": [], + ".github/workflows/ci.yml": [], ".github/workflows/deploy.yml": [], + "migrations/001_init.sql": [], "migrations/002_users.sql": [], + "docs/getting-started.md": [], "README.md": [] + } +} +``` + +- [ ] **Step 2: Write failing tests for each non-code group** + +Append to `test_compute_batches.test.mjs`: + +```javascript +describe('compute-batches.mjs — non-code grouping', () => { + let root; + let batches; + + beforeEach(() => { + root = setupProject('scan-result-non-code.json'); + const result = runScript(root); + expect(result.status).toBe(0); + batches = readBatches(root); + }); + + it('Group A: bundles Dockerfile cluster per directory', () => { + // Root-level cluster: Dockerfile + docker-compose.yml + .dockerignore → one batch + const rootDockerBatch = batches.batches.find(b => + b.files.some(f => f.path === 'Dockerfile')); + expect(rootDockerBatch).toBeDefined(); + const paths = rootDockerBatch.files.map(f => f.path).sort(); + expect(paths).toEqual(['.dockerignore', 'Dockerfile', 'docker-compose.yml']); + + // services/api cluster is a separate batch + const apiDockerBatch = batches.batches.find(b => + b.files.some(f => f.path === 'services/api/Dockerfile')); + expect(apiDockerBatch).toBeDefined(); + expect(apiDockerBatch).not.toBe(rootDockerBatch); + expect(apiDockerBatch.files.map(f => f.path).sort()).toEqual([ + 
'services/api/Dockerfile', 'services/api/docker-compose.yml', + ]); + }); + + it('Group B: .github/workflows/* all in one batch', () => { + const wfBatch = batches.batches.find(b => + b.files.some(f => f.path.startsWith('.github/workflows/'))); + expect(wfBatch).toBeDefined(); + const wfPaths = wfBatch.files.map(f => f.path).filter(p => p.startsWith('.github/workflows/')); + expect(wfPaths.sort()).toEqual([ + '.github/workflows/ci.yml', '.github/workflows/deploy.yml', + ]); + }); + + it('Group D: SQL migrations under migrations/ in one batch', () => { + const migBatch = batches.batches.find(b => + b.files.some(f => f.path.startsWith('migrations/'))); + expect(migBatch).toBeDefined(); + const migPaths = migBatch.files.map(f => f.path).filter(p => p.startsWith('migrations/')); + expect(migPaths.sort()).toEqual([ + 'migrations/001_init.sql', 'migrations/002_users.sql', + ]); + }); + + it('non-code batch indices follow code batches', () => { + const codeBatches = batches.batches.filter(b => + b.files.every(f => f.fileCategory === 'code')); + const nonCodeBatches = batches.batches.filter(b => + b.files.some(f => f.fileCategory !== 'code')); + expect(codeBatches.length).toBeGreaterThan(0); + expect(nonCodeBatches.length).toBeGreaterThan(0); + const maxCodeIdx = Math.max(...codeBatches.map(b => b.batchIndex)); + const minNonCodeIdx = Math.min(...nonCodeBatches.map(b => b.batchIndex)); + expect(minNonCodeIdx).toBeGreaterThan(maxCodeIdx); + }); +}); +``` + +- [ ] **Step 3: Run tests, expect FAIL** + +```bash +pnpm --filter @understand-anything/skill exec vitest run skills/understand/test_compute_batches.test.mjs -t "non-code grouping" +``` + +Expected: FAIL on all four (non-code files currently end up nowhere — they're not in `codeFiles`, not in any batch). 
+ +- [ ] **Step 4: Implement non-code grouping** + +In `compute-batches.mjs`, add a `buildNonCodeBatches(nonCodeFiles)` function before `main()`: + +```javascript +/** + * Build batches for non-code files per Groups A-E in the design spec. + * Returns Array<{ files: FileMeta[] }> (without batchIndex — caller assigns). + */ +function buildNonCodeBatches(nonCodeFiles) { + const byPath = new Map(nonCodeFiles.map(f => [f.path, f])); + const consumed = new Set(); + const groups = []; + + const dirOf = p => p.includes('/') ? p.slice(0, p.lastIndexOf('/')) : ''; + const baseOf = p => p.includes('/') ? p.slice(p.lastIndexOf('/') + 1) : p; + + // Group A: per-directory Dockerfile clusters. + const dirsWithDockerfile = new Set( + [...byPath.keys()] + .filter(p => baseOf(p) === 'Dockerfile') + .map(dirOf), + ); + for (const dir of dirsWithDockerfile) { + const inDir = [...byPath.keys()].filter(p => dirOf(p) === dir); + const cluster = inDir.filter(p => { + const b = baseOf(p); + return b === 'Dockerfile' + || b === '.dockerignore' + || b.startsWith('docker-compose.'); + }); + if (cluster.length) { + groups.push({ files: cluster.map(p => byPath.get(p)) }); + cluster.forEach(p => consumed.add(p)); + } + } + + // Group B: .github/workflows/* + const ghWorkflows = [...byPath.keys()].filter( + p => p.startsWith('.github/workflows/') && (p.endsWith('.yml') || p.endsWith('.yaml')), + ).filter(p => !consumed.has(p)); + if (ghWorkflows.length) { + groups.push({ files: ghWorkflows.map(p => byPath.get(p)) }); + ghWorkflows.forEach(p => consumed.add(p)); + } + + // Group C: .gitlab-ci.yml + .circleci/* + const ciFiles = [...byPath.keys()].filter( + p => (p === '.gitlab-ci.yml' || p.startsWith('.circleci/')) + && !consumed.has(p), + ); + if (ciFiles.length) { + groups.push({ files: ciFiles.map(p => byPath.get(p)) }); + ciFiles.forEach(p => consumed.add(p)); + } + + // Group D: SQL migrations per migrations/ or migration/ directory + const migrationDirs = new Set( + 
[...byPath.keys()] + .filter(p => p.endsWith('.sql')) + .map(dirOf) + .filter(d => /(^|\/)migrations?$/.test(d)), + ); + for (const dir of migrationDirs) { + const sqls = [...byPath.keys()] + .filter(p => dirOf(p) === dir && p.endsWith('.sql') && !consumed.has(p)) + .sort(); + if (sqls.length) { + groups.push({ files: sqls.map(p => byPath.get(p)) }); + sqls.forEach(p => consumed.add(p)); + } + } + + // Group E: all remaining grouped by immediate parent dir, max 20 per batch + const remainingByDir = new Map(); + for (const p of [...byPath.keys()].sort()) { + if (consumed.has(p)) continue; + const dir = dirOf(p); + if (!remainingByDir.has(dir)) remainingByDir.set(dir, []); + remainingByDir.get(dir).push(p); + } + const MAX_E = 20; + for (const [, paths] of remainingByDir) { + for (let i = 0; i < paths.length; i += MAX_E) { + const slice = paths.slice(i, i + MAX_E); + groups.push({ files: slice.map(p => byPath.get(p)) }); + } + } + + return groups; +} +``` + +In `main()`, after `const codeFiles = ...` add: + +```javascript + const nonCodeFiles = (scan.files || []).filter(f => f.fileCategory !== 'code'); +``` + +After the `sortedCommunities`/batches construction for code, build non-code batches and append: + +```javascript + // Assign code batchIndex first + const codeBatchObjs = sortedCommunities.map(([, paths], idx) => ({ + batchIndex: idx + 1, + files: paths.sort().map(p => fileMetaByPath.get(p)), + batchImportData: {}, + neighborMap: {}, + })); + + // Append non-code batches after code + const nonCodeGroups = buildNonCodeBatches(nonCodeFiles); + const nonCodeBatchObjs = nonCodeGroups.map((g, i) => ({ + batchIndex: codeBatchObjs.length + i + 1, + files: g.files, + batchImportData: {}, + neighborMap: {}, + })); + + const batches = [...codeBatchObjs, ...nonCodeBatchObjs]; +``` + +(Remove the old `const batches = sortedCommunities.map(...)` line — it's been replaced.) 
+ +- [ ] **Step 5: Run tests, expect PASS** + +```bash +pnpm --filter @understand-anything/skill exec vitest run skills/understand/test_compute_batches.test.mjs +``` + +Expected: all PASS. + +- [ ] **Step 6: Commit** + +```bash +git add understand-anything-plugin/skills/understand/compute-batches.mjs \ + understand-anything-plugin/skills/understand/test_compute_batches.test.mjs \ + understand-anything-plugin/skills/understand/test/fixtures/scan-result-non-code.json +git commit -m "feat(compute-batches): non-code grouping Groups A-E" +``` + +--- + +## Task 7: batchImportData + neighborMap + +**Files:** +- Modify: `understand-anything-plugin/skills/understand/compute-batches.mjs` +- Modify: `understand-anything-plugin/skills/understand/test_compute_batches.test.mjs` + +- [ ] **Step 1: Write failing tests (batchImportData populated, neighborMap correct, excludes same-batch)** + +Append to `test_compute_batches.test.mjs`: + +```javascript +describe('compute-batches.mjs — neighborMap + batchImportData', () => { + let batches; + let batchOf; // path → batchIndex + + beforeEach(() => { + const root = setupProject('scan-result-3-cliques.json'); + const result = runScript(root); + expect(result.status).toBe(0); + batches = readBatches(root); + batchOf = new Map(); + for (const b of batches.batches) { + for (const f of b.files) batchOf.set(f.path, b.batchIndex); + } + }); + + it('batchImportData mirrors scan importMap per batch', () => { + for (const b of batches.batches) { + for (const f of b.files) { + expect(b.batchImportData[f.path]).toBeDefined(); + // each file's batchImportData should be an array (possibly empty) + expect(Array.isArray(b.batchImportData[f.path])).toBe(true); + } + } + // src/auth/login.ts imports src/auth/session.ts and src/auth/tokens.ts + const loginBatch = batches.batches.find(b => + b.files.some(f => f.path === 'src/auth/login.ts')); + expect(loginBatch.batchImportData['src/auth/login.ts'].sort()).toEqual([ + 'src/auth/session.ts', 
'src/auth/tokens.ts', + ]); + }); + + it('neighborMap excludes same-batch files', () => { + // The fixture's three cliques each go into one batch — all imports are + // intra-batch, so no neighbor map should reference any same-batch file. + for (const b of batches.batches) { + const sameBatchPaths = new Set(b.files.map(f => f.path)); + for (const [file, neighbors] of Object.entries(b.neighborMap)) { + for (const n of neighbors) { + expect(sameBatchPaths.has(n.path)).toBe(false); + } + } + } + }); + + it('neighborMap entries carry symbols when target has exports', () => { + // For a custom case where two cliques cross-import each other, ensure + // the neighborMap entry includes the target's exported symbol names. + // Build a custom fixture inline. + const root = mkdtempSync(join(tmpdir(), 'ua-cb-nbr-')); + mkdirSync(join(root, '.understand-anything', 'intermediate'), { recursive: true }); + mkdirSync(join(root, 'src'), { recursive: true }); + writeFileSync(join(root, 'src', 'a.ts'), + 'export function findUser(id: string) { return null; }\nexport class User {}\n'); + writeFileSync(join(root, 'src', 'b.ts'), + 'import { findUser } from "./a";\nexport const wrap = () => findUser("x");\n'); + // To force a/b into different batches, add a third unrelated clique that + // dominates one community; here we just rely on small graph behavior. 
+ const scan = { + name: 't', description: '', + languages: ['typescript'], frameworks: [], + files: [ + { path: 'src/a.ts', language: 'typescript', sizeLines: 2, fileCategory: 'code' }, + { path: 'src/b.ts', language: 'typescript', sizeLines: 2, fileCategory: 'code' }, + ], + totalFiles: 2, filteredByIgnore: 0, estimatedComplexity: 'small', + importMap: { 'src/a.ts': [], 'src/b.ts': ['src/a.ts'] }, + }; + writeFileSync( + join(root, '.understand-anything', 'intermediate', 'scan-result.json'), + JSON.stringify(scan)); + const result = runScript(root); + expect(result.status).toBe(0); + const out = readBatches(root); + // If Louvain puts a and b in the same community, this test is degenerate. + // We just assert: for every cross-batch neighbor entry that points to a.ts, + // the symbols list includes findUser and User. + for (const b of out.batches) { + for (const [, neighbors] of Object.entries(b.neighborMap)) { + for (const n of neighbors) { + if (n.path === 'src/a.ts') { + expect(n.symbols).toEqual(expect.arrayContaining(['findUser', 'User'])); + } + } + } + } + }); +}); +``` + +- [ ] **Step 2: Run tests, expect FAIL** + +```bash +pnpm --filter @understand-anything/skill exec vitest run skills/understand/test_compute_batches.test.mjs -t "neighborMap" +``` + +Expected: FAIL — `batchImportData` and `neighborMap` are currently empty `{}` on every batch. + +- [ ] **Step 3: Implement batchImportData + neighborMap construction** + +In `compute-batches.mjs`, before the final `output = {...}` write, add a populate step. Replace the `codeBatchObjs` + `nonCodeBatchObjs` construction with the following: + +```javascript + // Helper: lookup batchIndex by path (any batch — code or non-code) + // Build it after we know batch assignments. 
+ function buildBatchOfMap(allBatches) { + const m = new Map(); + for (const b of allBatches) { + for (const f of b.files) m.set(f.path, b.batchIndex); + } + return m; + } + + // First-pass: assemble files-only batches + const codeBatchObjsBare = sortedCommunities.map(([, paths], idx) => ({ + batchIndex: idx + 1, + files: paths.sort().map(p => fileMetaByPath.get(p)), + })); + const nonCodeGroups = buildNonCodeBatches(nonCodeFiles); + const nonCodeBatchObjsBare = nonCodeGroups.map((g, i) => ({ + batchIndex: codeBatchObjsBare.length + i + 1, + files: g.files, + })); + const bareBatches = [...codeBatchObjsBare, ...nonCodeBatchObjsBare]; + const batchOf = buildBatchOfMap(bareBatches); + + // Build reverse import map: target → [sources that import target] + const reverseImportMap = new Map(); + for (const [src, targets] of Object.entries(importMap)) { + for (const tgt of targets) { + if (!reverseImportMap.has(tgt)) reverseImportMap.set(tgt, []); + reverseImportMap.get(tgt).push(src); + } + } + + // Compute neighbor degree (number of import relations) per path, used for + // truncation when neighborMap[file] has > MAX_NEIGHBORS entries. + const NEIGHBOR_DEGREE = new Map(); + for (const f of codeFiles) { + const outDeg = (importMap[f.path] || []).length; + const inDeg = (reverseImportMap.get(f.path) || []).length; + NEIGHBOR_DEGREE.set(f.path, outDeg + inDeg); + } + + const MAX_NEIGHBORS = 50; + + // Second-pass: enrich each batch with batchImportData + neighborMap + const batches = bareBatches.map(b => { + const batchPaths = new Set(b.files.map(f => f.path)); + const batchImportData = {}; + const neighborMap = {}; + for (const f of b.files) { + batchImportData[f.path] = (importMap[f.path] || []).slice(); + + // 1-hop neighbors: imports out + imported-by in, excluding same batch. 
+ const outNeighbors = importMap[f.path] || []; + const inNeighbors = reverseImportMap.get(f.path) || []; + const all = new Set([...outNeighbors, ...inNeighbors]); + const filtered = [...all].filter(p => batchOf.has(p) && !batchPaths.has(p)); + + let kept = filtered.map(p => ({ + path: p, + batchIndex: batchOf.get(p), + symbols: exportsByPath.get(p) || [], + })); + + if (kept.length > MAX_NEIGHBORS) { + const original = kept.length; + kept.sort((a, b2) => (NEIGHBOR_DEGREE.get(b2.path) || 0) + - (NEIGHBOR_DEGREE.get(a.path) || 0)); + kept = kept.slice(0, MAX_NEIGHBORS); + process.stderr.write( + `Warning: compute-batches: neighborMap for ${f.path} truncated from ` + + `${original} to top ${MAX_NEIGHBORS} (by neighbor degree)\n`, + ); + } + + if (kept.length) neighborMap[f.path] = kept; + } + return { batchIndex: b.batchIndex, files: b.files, batchImportData, neighborMap }; + }); +``` + +- [ ] **Step 4: Run tests, expect PASS** + +```bash +pnpm --filter @understand-anything/skill exec vitest run skills/understand/test_compute_batches.test.mjs +``` + +Expected: all PASS. 
+ +- [ ] **Step 5: Add neighborMap truncation test** + +Append: + +```javascript +describe('compute-batches.mjs — neighborMap truncation', () => { + it('truncates and warns when neighbors > 50', () => { + const root = mkdtempSync(join(tmpdir(), 'ua-cb-trunc-')); + mkdirSync(join(root, '.understand-anything', 'intermediate'), { recursive: true }); + // hub.ts imported by 60 other files + const files = [{ path: 'src/hub.ts', language: 'typescript', sizeLines: 1, fileCategory: 'code' }]; + const importMap = { 'src/hub.ts': [] }; + for (let i = 0; i < 60; i++) { + const p = `src/leaf${i}.ts`; + files.push({ path: p, language: 'typescript', sizeLines: 1, fileCategory: 'code' }); + importMap[p] = ['src/hub.ts']; + } + const scan = { + name: 't', description: '', languages: ['typescript'], frameworks: [], + files, totalFiles: files.length, filteredByIgnore: 0, + estimatedComplexity: 'moderate', importMap, + }; + writeFileSync( + join(root, '.understand-anything', 'intermediate', 'scan-result.json'), + JSON.stringify(scan)); + const result = runScript(root); + expect(result.status).toBe(0); + expect(result.stderr).toMatch(/neighborMap for src\/hub\.ts truncated from 60 to top 50/); + const out = readBatches(root); + // Find hub.ts and confirm its neighbor list capped at 50 (in whichever batch it landed) + for (const b of out.batches) { + const nbrs = b.neighborMap['src/hub.ts']; + if (nbrs) expect(nbrs.length).toBeLessThanOrEqual(50); + } + }); +}); +``` + +- [ ] **Step 6: Run tests, expect PASS** + +```bash +pnpm --filter @understand-anything/skill exec vitest run skills/understand/test_compute_batches.test.mjs +``` + +Expected: all PASS. 
+ +- [ ] **Step 7: Commit** + +```bash +git add understand-anything-plugin/skills/understand/compute-batches.mjs \ + understand-anything-plugin/skills/understand/test_compute_batches.test.mjs +git commit -m "feat(compute-batches): batchImportData + neighborMap with truncation warning" +``` + +--- + +## Task 8: Fallback path + Louvain warning + +**Files:** +- Modify: `understand-anything-plugin/skills/understand/compute-batches.mjs` +- Modify: `understand-anything-plugin/skills/understand/test_compute_batches.test.mjs` + +- [ ] **Step 1: Write failing test (Louvain crash → fallback, warning emitted, batches still valid)** + +Append to `test_compute_batches.test.mjs`: + +```javascript +describe('compute-batches.mjs — fallback', () => { + it('falls back to count-based when Louvain throws (env-injected mock)', () => { + // We can't easily monkey-patch louvain mid-script in Vitest because the + // script runs in a subprocess. Instead, set an env var the script honors: + // UA_COMPUTE_BATCHES_FORCE_LOUVAIN_THROW=1 → script throws inside its + // Louvain branch, exercising the fallback path. 
+ const root = setupProject('scan-result-3-cliques.json'); + const result = spawnSync('node', + [SCRIPT, root], + { encoding: 'utf-8', env: { ...process.env, UA_COMPUTE_BATCHES_FORCE_LOUVAIN_THROW: '1' } }, + ); + expect(result.status).toBe(0); + expect(result.stderr).toMatch( + /Warning: compute-batches: Louvain failed.*falling back to count-based grouping/); + const out = readBatches(root); + expect(out.algorithm).toBe('count-fallback'); + expect(out.totalFiles).toBe(9); + // Count-based: 12 files per batch → all 9 fit in one batch + const codeBatchFileCount = out.batches + .filter(b => b.files.every(f => f.fileCategory === 'code')) + .reduce((sum, b) => sum + b.files.length, 0); + expect(codeBatchFileCount).toBe(9); + }); +}); +``` + +- [ ] **Step 2: Run test, expect FAIL** + +```bash +pnpm --filter @understand-anything/skill exec vitest run skills/understand/test_compute_batches.test.mjs -t "fallback" +``` + +Expected: FAIL — no fallback path exists; script crashes or produces `algorithm: "louvain"`. + +- [ ] **Step 3: Implement fallback** + +In `compute-batches.mjs`, refactor the Louvain section into a function and wrap it in try/catch. + +**Boundary explicitly:** the block to replace **starts** at `const g = new Graph({ type: 'undirected', allowSelfLoops: false });` and **ends** at the closing brace of the `for (const [cid, paths] of filesByCommunity) { ... }` size-enforcement loop (the loop introduced in Task 4 step 4). Do NOT replace the `const sortedCommunities = [...splitCommunities.entries()] ...` line that follows — it stays as-is and continues to work because the replacement still produces `splitCommunities`. + +Add a `runLouvain(codeFiles, importMap)` function before `main()`: + +```javascript +/** + * Returns Map<path, communityId> via Louvain. May throw — caller must catch + * and fall back if it does. Honors UA_COMPUTE_BATCHES_FORCE_LOUVAIN_THROW=1 + * to allow tests to exercise the fallback path. 
+ */ +function runLouvain(codeFiles, importMap) { + if (process.env.UA_COMPUTE_BATCHES_FORCE_LOUVAIN_THROW === '1') { + throw new Error('forced throw for test'); + } + const g = new Graph({ type: 'undirected', allowSelfLoops: false }); + for (const f of codeFiles) g.addNode(f.path); + for (const [src, targets] of Object.entries(importMap)) { + if (!g.hasNode(src)) continue; + for (const tgt of targets) { + if (!g.hasNode(tgt) || src === tgt || g.hasEdge(src, tgt)) continue; + g.addEdge(src, tgt); + } + } + const cs = louvain(g); // { nodeId: communityId } + return new Map(Object.entries(cs)); +} + +/** + * Returns Map<path, communityId> via alphabetical chunking of `batchSize` + * files per batch (default 12). Deterministic, used as fallback when Louvain fails. + */ +function countBasedAssignment(codeFiles, batchSize = 12) { + const out = new Map(); + const sorted = [...codeFiles].map(f => f.path).sort(); + for (let i = 0; i < sorted.length; i++) { + out.set(sorted[i], `count_${Math.floor(i / batchSize)}`); + } + return out; +} +``` + +In `main()`, replace the Louvain call + size-enforcement block with: + +```javascript + let algorithm = 'louvain'; + let perFileCommunity; + try { + perFileCommunity = runLouvain(codeFiles, importMap); + } catch (err) { + process.stderr.write( + `Warning: compute-batches: Louvain failed (${err.message}) ` + + `— falling back to count-based grouping (12 files/batch) ` + + `— module semantic boundaries lost\n`, + ); + perFileCommunity = countBasedAssignment(codeFiles, 12); + algorithm = 'count-fallback'; + } + + // Group files by community id + const filesByCommunity = new Map(); + for (const [path, cid] of perFileCommunity) { + if (!filesByCommunity.has(cid)) filesByCommunity.set(cid, []); + filesByCommunity.get(cid).push(path); + } + + // Size enforcement only on louvain output. count-fallback already chunked. 
+ const MAX_COMMUNITY_SIZE = 35; + const splitCommunities = new Map(); + let nextSyntheticId = 0; + if (algorithm === 'louvain') { + for (const [cid, paths] of filesByCommunity) { + if (paths.length <= MAX_COMMUNITY_SIZE) { + splitCommunities.set(cid, paths); + continue; + } + process.stderr.write( + `Warning: compute-batches: community size ${paths.length} > max ${MAX_COMMUNITY_SIZE} ` + + `— splitting via alphabetical chunking — modularity may decrease\n`, + ); + const sorted = [...paths].sort(); + const parts = Math.ceil(paths.length / MAX_COMMUNITY_SIZE); + const perPart = Math.ceil(paths.length / parts); + for (let i = 0; i < parts; i++) { + const slice = sorted.slice(i * perPart, (i + 1) * perPart); + const synthId = `__split_${cid}_${nextSyntheticId++}`; + splitCommunities.set(synthId, slice); + } + } + } else { + for (const [cid, paths] of filesByCommunity) splitCommunities.set(cid, paths); + } +``` + +And update the output object's `algorithm` field: + +```javascript + const output = { + schemaVersion: 1, + algorithm, + totalFiles: scan.files.length, + totalBatches: batches.length, + exportsByPath: Object.fromEntries(exportsByPath), + batches, + }; +``` + +- [ ] **Step 4: Run tests, expect PASS** + +```bash +pnpm --filter @understand-anything/skill exec vitest run skills/understand/test_compute_batches.test.mjs +``` + +Expected: all PASS including new fallback test. 
+ +- [ ] **Step 5: Commit** + +```bash +git add understand-anything-plugin/skills/understand/compute-batches.mjs \ + understand-anything-plugin/skills/understand/test_compute_batches.test.mjs +git commit -m "feat(compute-batches): count-based fallback with visible warning" +``` + +--- + +## Task 9: --changed-files mode + +**Files:** +- Modify: `understand-anything-plugin/skills/understand/compute-batches.mjs` +- Modify: `understand-anything-plugin/skills/understand/test_compute_batches.test.mjs` + +- [ ] **Step 1: Write failing test** + +Append: + +```javascript +describe('compute-batches.mjs — --changed-files', () => { + it('emits only batches containing changed files', () => { + const root = setupProject('scan-result-3-cliques.json'); + const changedPath = join(root, 'changed.txt'); + // Only the auth clique is changed + writeFileSync(changedPath, ['src/auth/login.ts', 'src/auth/tokens.ts'].join('\n')); + + const result = runScript(root, [`--changed-files=${changedPath}`]); + expect(result.status).toBe(0); + + const out = readBatches(root); + // Auth files are in batches; other cliques' batches must be omitted + const allPaths = out.batches.flatMap(b => b.files.map(f => f.path)); + expect(allPaths).toContain('src/auth/login.ts'); + expect(allPaths).toContain('src/auth/tokens.ts'); + expect(allPaths).not.toContain('src/api/handlers.ts'); + expect(allPaths).not.toContain('src/db/users.ts'); + + // neighborMap may still reference unchanged files (with their full-graph batchIndex) + const loginBatch = out.batches.find(b => + b.files.some(f => f.path === 'src/auth/login.ts')); + // No assertion on neighborMap content here — the auth clique is fully + // changed, so neighborMap entries may be empty. The point is the script + // doesn't crash and only emits relevant batches. 
+ expect(loginBatch).toBeDefined(); + }); +}); +``` + +- [ ] **Step 2: Run test, expect FAIL** + +```bash +pnpm --filter @understand-anything/skill exec vitest run skills/understand/test_compute_batches.test.mjs -t "changed-files" +``` + +Expected: FAIL — flag is unrecognized; output contains all batches. + +- [ ] **Step 3: Implement --changed-files filtering** + +In `compute-batches.mjs`, at the start of `main()`, after reading `projectRoot`: + +```javascript + let changedFiles = null; + for (const arg of process.argv.slice(3)) { + const m = arg.match(/^--changed-files=(.+)$/); + if (m) { + const p = m[1]; + const lines = readFileSync(p, 'utf-8') + .split('\n') + .map(s => s.trim()) + .filter(Boolean); + changedFiles = new Set(lines); + } + } +``` + +Just before writing the output (after `batches` is assembled), filter: + +```javascript + let finalBatches = batches; + if (changedFiles) { + finalBatches = batches.filter(b => b.files.some(f => changedFiles.has(f.path))); + // batchIndex on filtered batches retains the full-graph assignment + // (the design says neighborMap should still reference unchanged files' + // full-graph batchIndex). No renumbering. + } + + const output = { + schemaVersion: 1, + algorithm, + totalFiles: scan.files.length, + totalBatches: finalBatches.length, + exportsByPath: Object.fromEntries(exportsByPath), + batches: finalBatches, + }; +``` + +- [ ] **Step 4: Run test, expect PASS** + +```bash +pnpm --filter @understand-anything/skill exec vitest run skills/understand/test_compute_batches.test.mjs +``` + +Expected: all PASS. 
+ +- [ ] **Step 5: Commit** + +```bash +git add understand-anything-plugin/skills/understand/compute-batches.mjs \ + understand-anything-plugin/skills/understand/test_compute_batches.test.mjs +git commit -m "feat(compute-batches): --changed-files mode for incremental updates" +``` + +--- + +## Task 10: file-analyzer.md — add Cross-batch context (neighborMap) section + +**Files:** +- Modify: `understand-anything-plugin/agents/file-analyzer.md` + +- [ ] **Step 1: Insert the new section** + +In `understand-anything-plugin/agents/file-analyzer.md`, find the existing line: +``` +### Step 1 — Prepare the input JSON +``` + +(This is at approximately line 32.) + +After Step 1's closing code block (the bash heredoc that ends with `ENDJSON`), and **before** `### Step 2 — Execute the bundled extraction script`, insert a new sub-section. Use the Edit tool: + +Old text (the boundary between Step 1 and Step 2): +``` +ENDJSON +``` + +### Step 2 — Execute the bundled extraction script +``` + +New text: +``` +ENDJSON +``` + +### Cross-batch context (neighborMap) + +Your dispatch prompt includes a `neighborMap` — for each file in your batch, it lists project-internal neighbors in OTHER batches (files that import yours or that you import), with their exported symbols. + +Use neighborMap as a confidence boost for cross-batch edges (`calls`, `related`, `inherits`, `implements` to nodes outside your batch): + +- If your source clearly references a symbol that appears in some `neighbor.symbols`, emit the edge to `function:<path>:<name>` or `class:<path>:<name>` with confidence. +- If your source references a cross-batch symbol that is NOT in neighborMap (export extraction in `compute-batches.mjs` may have missed it), you may still emit the edge if you saw it explicitly in the imported file's surface — but prefer matching neighborMap symbols when available. +- Imports continue to use `batchImportData` (fully resolved), not neighborMap. 
+ +The merge script's dangling-edge dropper is the safety net for genuinely unresolvable targets. + +### Step 2 — Execute the bundled extraction script +``` + +- [ ] **Step 2: Verify the section was inserted correctly** + +```bash +grep -n "Cross-batch context (neighborMap)" understand-anything-plugin/agents/file-analyzer.md +grep -n "Step 1 — Prepare the input JSON" understand-anything-plugin/agents/file-analyzer.md +grep -n "Step 2 — Execute the bundled extraction script" understand-anything-plugin/agents/file-analyzer.md +``` + +Expected: all three lines exist, and the Cross-batch context line number is between Step 1's and Step 2's line numbers. + +- [ ] **Step 3: Commit** + +```bash +git add understand-anything-plugin/agents/file-analyzer.md +git commit -m "docs(file-analyzer): add Cross-batch context (neighborMap) section" +``` + +--- + +## Task 11: file-analyzer.md — replace Writing Results with multi-part protocol + +**Files:** +- Modify: `understand-anything-plugin/agents/file-analyzer.md` + +- [ ] **Step 1: Replace the Writing Results section** + +In `understand-anything-plugin/agents/file-analyzer.md`, find the existing block (at approximately lines 467-475): + +Old text: +``` +## Writing Results + +After producing the JSON: + +1. Write the JSON to: `/.understand-anything/intermediate/batch-.json` +2. The project root and batch index will be provided in your prompt. +3. Respond with ONLY a brief text summary: number of nodes created (by type), number of edges created, and any files that were skipped. + +Do NOT include the full JSON in your text response. +``` + +New text: +``` +## Writing Results — single or multi-part + +**Step A — Compute totals.** +``` +nodeCount = nodes.length +edgeCount = edges.length +``` + +**Step B — Decide split.** +- If `nodeCount ≤ 60` AND `edgeCount ≤ 120`: write ONE file to `.understand-anything/intermediate/batch-<batchIndex>.json`. Done. Skip to Step F. +- Otherwise: `parts = ceil(max(nodeCount / 60, edgeCount / 120))`. 
+ +**Step C — Partition.** +Sort files in your batch alphabetically by path. Chunk them sequentially into `parts` groups of size `ceil(N / parts)`, where `N` is the number of files in your batch. For each part: +- All nodes whose `filePath` is in this part's files (for non-file nodes like `module`/`concept`, use the file they belong to). +- All edges whose `source` is in this part's nodes (target may be anywhere — same part, different part of same batch, different batch). + +**Step D — Write each part.** +Write part `k` (1-indexed) to `.understand-anything/intermediate/batch-<batchIndex>-part-<k>.json`. Each part is a valid GraphFragment: `{ "nodes": [...], "edges": [...] }`. + +**Step E — Self-validate.** +For each file written, verify: +- Valid JSON. +- `nodes` array exists and is well-formed. +- For every edge: `source` and `target` both appear as either (a) a node `id` in this part's nodes or in another part of the same batch, OR (b) a `file:<path>` reference where `<path>` is in `neighborMap` or `batchImportData`, OR (c) a `function:<path>:<name>` / `class:<path>:<name>` reference where `<name>` is in some `neighbor.symbols`. + +If validation fails on a part, do NOT silently rebuild. Respond with an explicit error stating which part failed, which edge(s) failed validation, and why. The dispatching session can then retry. + +**Step F — Respond.** +Respond with ONLY a brief text summary: parts written (1 or more), total nodes/edges across all parts, any files skipped. Do NOT include JSON content in the response. +``` + +- [ ] **Step 2: Verify** + +```bash +grep -n "Writing Results — single or multi-part" understand-anything-plugin/agents/file-analyzer.md +grep -n "Step A — Compute totals" understand-anything-plugin/agents/file-analyzer.md +grep -n "Step F — Respond" understand-anything-plugin/agents/file-analyzer.md +# Confirm old prose is gone: +! 
grep -n "After producing the JSON:" understand-anything-plugin/agents/file-analyzer.md +``` + +Expected: first three exist, last `grep` returns non-zero (i.e. no match). 
+ +- [ ] **Step 3: Commit** + +```bash +git add understand-anything-plugin/agents/file-analyzer.md +git commit -m "docs(file-analyzer): replace Writing Results with multi-part output protocol" +``` + +--- + +## Task 12: SKILL.md — Phase 1.5 + Phase 2 rewrite + Incremental path rewrite + +**Files:** +- Modify: `understand-anything-plugin/skills/understand/SKILL.md` + +- [ ] **Step 1: Insert Phase 1.5 after Phase 1** + +In `understand-anything-plugin/skills/understand/SKILL.md`, find the line: +``` +## Phase 2 — ANALYZE +``` + +(At approximately line 278.) + +Immediately before that line, insert the Phase 1.5 block. The boundary is the `---` separator above `## Phase 2 — ANALYZE`. Use the Edit tool to replace: + +Old text (the separator + Phase 2 header): +``` +--- + +## Phase 2 — ANALYZE +``` + +New text: +``` +--- + +## Phase 1.5 — BATCH + +Report: `[Phase 1.5/7] Computing semantic batches...` + +Run the bundled batching script: +```bash +node /compute-batches.mjs $PROJECT_ROOT +``` + +Reads `.understand-anything/intermediate/scan-result.json`, writes `.understand-anything/intermediate/batches.json`. + +Capture stderr. Append any line starting with `Warning:` to `$PHASE_WARNINGS` for the final report. + +If the script exits non-zero, the failure is hard — relay the full stderr to the user as a Phase 1.5 failure. Do not attempt to recover; the script's internal fallback (count-based) already handles recoverable issues. A non-zero exit means a fundamental problem (missing input file, malformed JSON, etc.). + +--- + +## Phase 2 — ANALYZE +``` + +- [ ] **Step 2: Replace Phase 2 ANALYZE Full analysis path** + +In SKILL.md, find the block starting `### Full analysis path` (at approximately line 280) and ending just before `### Incremental update path`. 
+ +Old text (the entire Full analysis path section — multi-paragraph; use Edit to replace from `### Full analysis path` through the line `Include the script's warnings in \`$PHASE_WARNINGS\` for the reviewer.`): + +``` +### Full analysis path + +Batch the file list from Phase 1 into groups of **20-30 files each** (aim for ~25 files per batch for balanced sizes). + +**Batching strategy for non-code files:** +- Group related non-code files together in the same batch when possible: + - Dockerfile + docker-compose.yml + .dockerignore → same batch + - SQL migration files → same batch (ordered by filename) + - CI/CD config files (.github/workflows/*) → same batch + - Documentation files (docs/*.md) → same batch +- This allows the file-analyzer to create cross-file edges (e.g., docker-compose `depends_on` Dockerfile) +- Non-code files can be mixed with code files in the same batch if batch sizes are small +- Each file's `fileCategory` from Phase 1 must be included in the batch file list + +After batching, report the plan to the user: +> `[Phase 2/7] Analyzing files — files in batches (up to 5 concurrent)...` + +For each batch, dispatch a subagent using the `file-analyzer` agent definition (at `agents/file-analyzer.md`). Run up to **5 subagents concurrently** using parallel dispatch. Append the following additional context: + +> **Additional context from main session:** +> +> Project: `` — `` +> Languages: `` +> +> $LANGUAGE_DIRECTIVE + +Before dispatching each batch, construct `batchImportData` from `$IMPORT_MAP`: +```json +batchImportData = {} +for each file in this batch: + batchImportData[file.path] = $IMPORT_MAP[file.path] ?? [] +``` + +Fill in batch-specific parameters below and dispatch: + +> Analyze these files and produce GraphNode and GraphEdge objects. 
+> Project root: `$PROJECT_ROOT` +> Project: `` +> Languages: `` +> Batch: `/` +> Skill directory (for bundled scripts): `` +> Write output to: `$PROJECT_ROOT/.understand-anything/intermediate/batch-.json` +> +> Pre-resolved import data for this batch (use this for all import edge creation — do NOT re-resolve imports from source): +> ```json +> +> ``` +> +> Files to analyze in this batch (every entry MUST be passed through to `batchFiles` with all four fields — `path`, `language`, `sizeLines`, `fileCategory`): +> 1. `` ( lines, language: ``, fileCategory: ``) +> 2. `` ( lines, language: ``, fileCategory: ``) +> ... + +After ALL batches complete, report to the user: `Phase 2 complete. All batches analyzed.` + +Run the merge-and-normalize script bundled with this skill (located next to this SKILL.md file — use the skill directory path, not the project root): +```bash +python /merge-batch-graphs.py $PROJECT_ROOT +``` + +This script reads all `batch-*.json` files from `$PROJECT_ROOT/.understand-anything/intermediate/`, then in one pass: +- Combines all nodes and edges across batches +- Normalizes node IDs (strips double prefixes, project-name prefixes, adds missing prefixes) +- Normalizes complexity values (`low`→`simple`, `medium`→`moderate`, `high`→`complex`, etc.) +- Rewrites edge references to match corrected node IDs +- Deduplicates nodes by ID (keeps last occurrence) and edges by `(source, target, type)` +- Drops dangling edges referencing missing nodes +- Logs all corrections and dropped items to stderr + +The merge script also runs a `tested_by` linker that canonicalizes test-coverage edges in two passes. **Pass 1** walks LLM-emitted `tested_by` edges and flips inverted ones in place (the LLM systematically emits `test → production` because it sees the import only when analyzing the test file); semantically broken edges (test↔test, prod↔prod, orphan endpoints) are dropped. 
**Pass 2** supplements with path-convention pairings (`X.ts` ↔ `X.test.ts`, JS/TS `__tests__/` and `/test/` walk-out, Python in-package `tests/`, Go `_test.go` sibling, Maven/Gradle `src/test/...` ↔ `src/main/...`, .NET `/tests/` ↔ `/src/...` and `.Tests/` ↔ `/`). Production nodes that end up sourcing any `tested_by` edge get a `"tested"` tag. All resulting edges run `production → test`. + +Output: `$PROJECT_ROOT/.understand-anything/intermediate/assembled-graph.json` + +Include the script's warnings in `$PHASE_WARNINGS` for the reviewer. +``` + +New text: +``` +### Full analysis path + +Load `.understand-anything/intermediate/batches.json` (produced by Phase 1.5). Iterate the `batches[]` array. + +Report: `[Phase 2/7] Analyzing files — files in batches (up to 5 concurrent)...` + +For each batch, dispatch a subagent using the `file-analyzer` agent definition (at `agents/file-analyzer.md`). Run up to **5 subagents concurrently**. Append the following additional context: + +> **Additional context from main session:** +> +> Project: `` — `` +> Languages: `` +> +> $LANGUAGE_DIRECTIVE + +Dispatch prompt template (fill in batch-specific values from `batches.json[i]`): + +> Analyze these files and produce GraphNode and GraphEdge objects. +> Project root: `$PROJECT_ROOT` +> Project: `` +> Languages: `` +> Batch: `/` +> Skill directory (for bundled scripts): `` +> Output: write to `$PROJECT_ROOT/.understand-anything/intermediate/batch-.json` (single-file mode) OR `batch--part-.json` (split mode, per Step B of your output protocol). +> +> Pre-resolved import data for this batch (use directly — do NOT re-resolve imports from source): +> ```json +> +> ``` +> +> Cross-batch neighbors with their exported symbols (confidence boost for cross-batch edges): +> ```json +> +> ``` +> +> Files to analyze in this batch (every entry MUST be passed through to `batchFiles` with all four fields — `path`, `language`, `sizeLines`, `fileCategory`): +> 1. 
`` ( lines, language: ``, fileCategory: ``) +> 2. `` ( lines, language: ``, fileCategory: ``) +> ... + +After ALL batches complete, report to the user: `Phase 2 complete. All batches analyzed.` + +Run the merge-and-normalize script bundled with this skill: +```bash +python /merge-batch-graphs.py $PROJECT_ROOT +``` + +This script reads all `batch-*.json` files (including `batch--part-.json` produced by file-analyzers that split their output) from `$PROJECT_ROOT/.understand-anything/intermediate/`, then in one pass: +- Combines all nodes and edges across batches +- Normalizes node IDs (strips double prefixes, project-name prefixes, adds missing prefixes) +- Normalizes complexity values (`low`→`simple`, `medium`→`moderate`, `high`→`complex`, etc.) +- Rewrites edge references to match corrected node IDs +- Deduplicates nodes by ID (keeps last occurrence) and edges by `(source, target, type)` +- Drops dangling edges referencing missing nodes +- Logs all corrections and dropped items to stderr + +The merge script also runs a `tested_by` linker that canonicalizes test-coverage edges in two passes. **Pass 1** walks LLM-emitted `tested_by` edges and flips inverted ones in place; semantically broken edges (test↔test, prod↔prod, orphan endpoints) are dropped. **Pass 2** supplements with path-convention pairings. Production nodes that end up sourcing any `tested_by` edge get a `"tested"` tag. All resulting edges run `production → test`. + +Output: `$PROJECT_ROOT/.understand-anything/intermediate/assembled-graph.json` + +Include the script's warnings in `$PHASE_WARNINGS` for the reviewer. +``` + +- [ ] **Step 3: Replace Incremental update path** + +Find: +``` +### Incremental update path + +Use the changed files list from Phase 0. Batch and dispatch file-analyzer subagents using the same process as above (20-30 files per batch, up to 5 concurrent, with batchImportData constructed from $IMPORT_MAP), but only for changed files. + +After batches complete: +1. 
Remove old nodes whose `filePath` matches any changed file from the existing graph +2. Remove old edges whose `source` or `target` references a removed node +3. Write the pruned existing nodes/edges as `batch-existing.json` in the intermediate directory +4. Run the same merge script — it will combine `batch-existing.json` with the fresh `batch-*.json` files: + ```bash + python /merge-batch-graphs.py $PROJECT_ROOT + ``` +``` + +Replace with: +``` +### Incremental update path + +Write the changed-files list (one path per line) to a temp file: +```bash +git diff ..HEAD --name-only > $PROJECT_ROOT/.understand-anything/tmp/changed-files.txt +``` + +Run compute-batches with `--changed-files`: +```bash +node /compute-batches.mjs $PROJECT_ROOT \ + --changed-files=$PROJECT_ROOT/.understand-anything/tmp/changed-files.txt +``` + +This produces a `batches.json` that contains only batches with changed files, but neighborMap entries still reference unchanged files (with their full-graph batchIndex) so cross-batch edges remain emittable. + +Then dispatch file-analyzer subagents per the same template as the full path. + +After batches complete: +1. Remove old nodes whose `filePath` matches any changed file from the existing graph +2. Remove old edges whose `source` or `target` references a removed node +3. Write the pruned existing nodes/edges as `batch-existing.json` in the intermediate directory +4. Run the same merge script — it will combine `batch-existing.json` with the fresh `batch-*.json` files: + ```bash + python /merge-batch-graphs.py $PROJECT_ROOT + ``` +``` + +- [ ] **Step 4: Verify** + +```bash +grep -n "Phase 1.5 — BATCH" understand-anything-plugin/skills/understand/SKILL.md +grep -n "Load \`.understand-anything/intermediate/batches.json\`" understand-anything-plugin/skills/understand/SKILL.md +grep -n "compute-batches.mjs" understand-anything-plugin/skills/understand/SKILL.md +# Confirm old prose is gone (each command should print "OK: ... 
absent"): +if grep -q "groups of \*\*20-30 files each\*\*" understand-anything-plugin/skills/understand/SKILL.md; then echo "FAIL: old batching prose still present"; else echo "OK: old batching prose absent"; fi +if grep -qF "Dockerfile + docker-compose.yml + .dockerignore → same batch" understand-anything-plugin/skills/understand/SKILL.md; then echo "FAIL: old non-code prose still present"; else echo "OK: old non-code prose absent"; fi +``` + +Expected: first three exist (compute-batches.mjs should appear at least 2 times — once in Phase 1.5, once in the Incremental update path); both check commands print "OK: ... absent". + +- [ ] **Step 5: Commit** + +```bash +git add understand-anything-plugin/skills/understand/SKILL.md +git commit -m "feat(understand): introduce Phase 1.5 (compute-batches) and rewrite Phase 2 prose" +``` + +--- + +## Task 13: merge-batch-graphs.py — multi-part stderr report + missing-part warning + +**Files:** +- Modify: `understand-anything-plugin/skills/understand/merge-batch-graphs.py` + +- [ ] **Step 1: Replace the "Found N batch files:" report** + +In `merge-batch-graphs.py`, find the block at approximately line 1026: + +Old text: +```python + print(f"Found {len(batch_files)} batch files:", file=sys.stderr) +``` + +New text: +```python + # Group by logical batch index so the report distinguishes single-batch + # files from multi-part file-analyzer outputs. 
+ from collections import defaultdict as _dd + by_batch = _dd(list) + for f in batch_files: + m = re.match(r"batch-(\d+)(?:-part-(\d+))?\.json", f.name) + if m: + by_batch[int(m.group(1))].append((f.name, int(m.group(2)) if m.group(2) else None)) + + logical_count = len(by_batch) + multi_part = sum(1 for entries in by_batch.values() if len(entries) > 1) + print( + f"Found {len(batch_files)} batch files " + f"({logical_count} logical batches, {multi_part} multi-part):", + file=sys.stderr, + ) + + # Missing-part detection: for any logical batch with parts (len > 1), the + # set of part numbers MUST be contiguous starting at 1. Gaps suggest a + # truncated write — emit a visible warning so the user can investigate. + for idx, entries in by_batch.items(): + part_nums = [p for (_n, p) in entries if p is not None] + if not part_nums: + continue + present = set(part_nums) + expected = set(range(1, max(part_nums) + 1)) + missing = sorted(expected - present) + if missing: + print( + f"Warning: merge: batch {idx} has parts {sorted(present)} but " + f"missing part {missing} — possible truncated write — " + f"affected nodes/edges may be lost", + file=sys.stderr, + ) +``` + +- [ ] **Step 2: Verify the file still parses** + +```bash +python3 -c "import ast; ast.parse(open('understand-anything-plugin/skills/understand/merge-batch-graphs.py').read())" && echo "OK" +``` + +Expected: prints `OK`. + +- [ ] **Step 3: Smoke-test the existing test suite still passes** + +```bash +cd understand-anything-plugin/skills/understand && python3 -m unittest test_merge_batch_graphs.py -v 2>&1 | tail -20 +``` + +Expected: all existing tests pass (we haven't broken anything). 
+ +- [ ] **Step 4: Commit** + +```bash +git add understand-anything-plugin/skills/understand/merge-batch-graphs.py +git commit -m "feat(merge-batch-graphs): multi-part aware stderr report + missing-part warning" +``` + +--- + +## Task 14: merge-batch-graphs.py — multi-part unit tests + +**Files:** +- Modify: `understand-anything-plugin/skills/understand/test_merge_batch_graphs.py` + +- [ ] **Step 1: Append TestMultiPart class** + +Append to `understand-anything-plugin/skills/understand/test_merge_batch_graphs.py`: + +```python + + +# ── Multi-part batch handling ───────────────────────────────────────────── + + +class TestMultiPart(unittest.TestCase): + """End-to-end tests for batch--part-.json input handling. + + These tests invoke merge-batch-graphs.py as a subprocess in a temp + directory so we exercise the full path: glob → load → merge → write. + """ + + def setUp(self) -> None: + import tempfile + self.tmp = Path(tempfile.mkdtemp(prefix="ua-mbg-")) + self.intermediate = self.tmp / ".understand-anything" / "intermediate" + self.intermediate.mkdir(parents=True, exist_ok=True) + + def tearDown(self) -> None: + import shutil + shutil.rmtree(self.tmp, ignore_errors=True) + + def _write_batch(self, name: str, nodes: list, edges: list) -> None: + import json as _j + (self.intermediate / name).write_text( + _j.dumps({"nodes": nodes, "edges": edges}), + encoding="utf-8", + ) + + def _run_merge(self) -> tuple[int, str, dict]: + import subprocess + import json as _j + result = subprocess.run( + ["python3", str(_MODULE_PATH), str(self.tmp)], + capture_output=True, text=True, + ) + out_path = self.intermediate / "assembled-graph.json" + assembled = _j.loads(out_path.read_text()) if out_path.exists() else {} + return result.returncode, result.stderr, assembled + + def test_two_parts_of_one_logical_batch_merge(self) -> None: + self._write_batch("batch-1-part-1.json", + [_file_node("src/a.ts")], + [{"source": "file:src/a.ts", "target": "file:src/b.ts", + "type": "imports", 
"direction": "forward", "weight": 0.7}]) + self._write_batch("batch-1-part-2.json", + [_file_node("src/b.ts")], + []) + rc, _stderr, assembled = self._run_merge() + self.assertEqual(rc, 0) + node_ids = {n["id"] for n in assembled["nodes"]} + self.assertEqual(node_ids, {"file:src/a.ts", "file:src/b.ts"}) + # Cross-part edge survived + edge_keys = {(e["source"], e["target"], e["type"]) for e in assembled["edges"]} + self.assertIn( + ("file:src/a.ts", "file:src/b.ts", "imports"), edge_keys) + + def test_three_parts_of_one_logical_batch_merge(self) -> None: + for k, path in enumerate(["src/a.ts", "src/b.ts", "src/c.ts"], start=1): + self._write_batch(f"batch-1-part-{k}.json", + [_file_node(path)], []) + rc, _stderr, assembled = self._run_merge() + self.assertEqual(rc, 0) + node_ids = {n["id"] for n in assembled["nodes"]} + self.assertEqual(node_ids, + {"file:src/a.ts", "file:src/b.ts", "file:src/c.ts"}) + + def test_malformed_part_is_skipped_with_warning(self) -> None: + (self.intermediate / "batch-1-part-1.json").write_text( + "{ this is not valid json", encoding="utf-8") + self._write_batch("batch-1-part-2.json", + [_file_node("src/b.ts")], []) + rc, stderr, assembled = self._run_merge() + self.assertEqual(rc, 0) + # The skip warning is from existing load_batch logic + self.assertIn("skipping batch-1-part-1.json", stderr) + # part-2 content still made it in + node_ids = {n["id"] for n in assembled["nodes"]} + self.assertEqual(node_ids, {"file:src/b.ts"}) + + def test_mixed_single_and_multi_part(self) -> None: + self._write_batch("batch-1.json", + [_file_node("src/single.ts")], []) + self._write_batch("batch-2-part-1.json", + [_file_node("src/multi-a.ts")], []) + self._write_batch("batch-2-part-2.json", + [_file_node("src/multi-b.ts")], []) + self._write_batch("batch-3.json", + [_file_node("src/another-single.ts")], []) + rc, _stderr, assembled = self._run_merge() + self.assertEqual(rc, 0) + node_ids = {n["id"] for n in assembled["nodes"]} + self.assertEqual(node_ids, 
{ + "file:src/single.ts", "file:src/multi-a.ts", + "file:src/multi-b.ts", "file:src/another-single.ts", + }) + + def test_missing_part_emits_warning(self) -> None: + # parts {2, 3} present, part-1 missing + self._write_batch("batch-1-part-2.json", + [_file_node("src/b.ts")], []) + self._write_batch("batch-1-part-3.json", + [_file_node("src/c.ts")], []) + rc, stderr, assembled = self._run_merge() + self.assertEqual(rc, 0) + self.assertRegex(stderr, + r"Warning: merge: batch 1 has parts \[2, 3\] but " + r"missing part \[1\] — possible truncated write") + + def test_stderr_report_format(self) -> None: + self._write_batch("batch-1.json", [_file_node("src/a.ts")], []) + self._write_batch("batch-2-part-1.json", [_file_node("src/b.ts")], []) + self._write_batch("batch-2-part-2.json", [_file_node("src/c.ts")], []) + rc, stderr, _assembled = self._run_merge() + self.assertEqual(rc, 0) + # 3 files on disk, 2 logical batches, 1 multi-part + self.assertIn( + "Found 3 batch files (2 logical batches, 1 multi-part)", stderr) +``` + +- [ ] **Step 2: Run tests, expect PASS** + +```bash +cd understand-anything-plugin/skills/understand && python3 -m unittest test_merge_batch_graphs.TestMultiPart -v +``` + +Expected: all 6 tests PASS. + +- [ ] **Step 3: Run full test suite** + +```bash +cd understand-anything-plugin/skills/understand && python3 -m unittest test_merge_batch_graphs -v 2>&1 | tail -5 +``` + +Expected: all tests PASS (pre-existing + new). + +- [ ] **Step 4: Commit** + +```bash +git add understand-anything-plugin/skills/understand/test_merge_batch_graphs.py +git commit -m "test(merge-batch-graphs): TestMultiPart for batch-i-part-k handling" +``` + +--- + +## Task 15: Integration acceptance gate (manual) + +This task is a **gated manual checklist** — execute interactively, mark each item, do not auto-merge without all green. 
+ +**Files:** none (this is a verification step) + +- [ ] **Step 1: Install + build clean** + +```bash +pnpm install +pnpm --filter @understand-anything/core build +pnpm --filter @understand-anything/skill build +``` + +Expected: all succeed. + +- [ ] **Step 2: Sync local plugin into Claude Code's plugin cache for testing** + +Per project's CLAUDE.md "Testing Local Plugin Changes" section. From repo root: + +```bash +INSTALLED_VERSION=$(ls ~/.claude/plugins/cache/understand-anything/understand-anything/ | head -1) +echo "Installed version: $INSTALLED_VERSION" +rm -rf ~/.claude/plugins/cache/understand-anything/understand-anything/$INSTALLED_VERSION +cp -R ./understand-anything-plugin ~/.claude/plugins/cache/understand-anything/understand-anything/$INSTALLED_VERSION +``` + +- [ ] **Step 3: Start a fresh Claude Code session and run /understand --full on this repo** + +In a fresh session in this repo's directory: +``` +/understand --full +``` + +Expected during run: +- `[Phase 1.5/7] Computing semantic batches...` appears +- Phase 2 reports batch count from `batches.json` (not arbitrary count-based) +- At least one batch with > 60 nodes / > 120 edges triggers multi-part output (look in `.understand-anything/intermediate/` for any `batch--part-.json` files) + +Expected after run: +- `knowledge-graph.json` exists with reasonable node/edge counts compared to current main +- Dashboard renders normally +- Phase 7 final report's warnings section includes any compute-batches warnings IF they fired + +- [ ] **Step 4: Sanity-check batches.json contents** + +```bash +jq '.algorithm, .totalFiles, .totalBatches, (.batches | length), [.batches[].files | length]' \ + .understand-anything/intermediate/batches.json 2>/dev/null \ + || echo "batches.json was cleaned up by Phase 7 — re-run with /understand --full and inspect before Phase 7 cleanup, or check git diff for the script's behavior." 
+``` + +Note: Phase 7 cleans up `.understand-anything/intermediate/` so this is best inspected mid-run, not after. + +- [ ] **Step 5: Run on a small repo (5-10 files) to verify fallback batch path** + +```bash +mkdir -p /tmp/ua-smoke-small/src +cd /tmp/ua-smoke-small +git init && git commit --allow-empty -m init +echo 'export const a = 1;' > src/a.ts +echo 'export const b = 2;' > src/b.ts +echo 'export const c = 3;' > src/c.ts +echo '{"name":"smoke","version":"0.0.1"}' > package.json +git add . && git commit -m setup +``` + +Then `cd /tmp/ua-smoke-small` in a Claude Code session and run `/understand --full`. Expected: completes without errors, single small batch. + +- [ ] **Step 6: Run on a ~100-file repo to validate the bug fix** + +If you have a ~100-file repo handy (or use the largest test fixture from the project), run `/understand --full` and confirm no "output limit" errors appear, even on Bedrock OPUS. + +If you do not have a suitable repo, document this in the PR description as a deferred manual verification step. + +- [ ] **Step 7: Stage results** + +This task does not commit anything — it's a verification gate. If Step 3 reveals bugs, go back to the relevant task and fix; otherwise proceed to Task 16. + +--- + +## Task 16: Version bump in 5 files + +Per project CLAUDE.md: when pushing to remote, bump version in **all five** files listed. + +**Files:** +- Modify: `understand-anything-plugin/package.json` +- Modify: `understand-anything-plugin/.claude-plugin/plugin.json` +- Modify: `.claude-plugin/plugin.json` +- Modify: `.cursor-plugin/plugin.json` +- Modify: `.copilot-plugin/plugin.json` + +- [ ] **Step 1: Determine new version** + +Current version is `2.7.4` (per `understand-anything-plugin/package.json` line 3). This PR adds a substantial feature (Phase 1.5 + multi-part output) — bump **minor**: `2.8.0`. 
+ +- [ ] **Step 2: Confirm all five files have the same current version** + +```bash +grep -H '"version"' \ + understand-anything-plugin/package.json \ + understand-anything-plugin/.claude-plugin/plugin.json \ + .claude-plugin/plugin.json \ + .cursor-plugin/plugin.json \ + .copilot-plugin/plugin.json +``` + +Expected: all five print `"version": "2.7.4"` (or whatever the current version is — use that as the baseline). If they diverge, stop and reconcile with the user. + +- [ ] **Step 3: Bump each file from `2.7.4` to `2.8.0`** + +Use the Edit tool on each of the five files. For each, replace `"version": "2.7.4"` with `"version": "2.8.0"`. + +- [ ] **Step 4: Verify all five updated** + +```bash +grep -H '"version"' \ + understand-anything-plugin/package.json \ + understand-anything-plugin/.claude-plugin/plugin.json \ + .claude-plugin/plugin.json \ + .cursor-plugin/plugin.json \ + .copilot-plugin/plugin.json +``` + +Expected: all five print `"version": "2.8.0"`. + +- [ ] **Step 5: Commit** + +```bash +git add understand-anything-plugin/package.json \ + understand-anything-plugin/.claude-plugin/plugin.json \ + .claude-plugin/plugin.json \ + .cursor-plugin/plugin.json \ + .copilot-plugin/plugin.json +git commit -m "chore: bump version to 2.8.0" +``` + +- [ ] **Step 6: Push branch and open PR** + +```bash +git push -u origin feat/semantic-batching-and-output-chunking +gh pr create --title "feat(understand): semantic batching (Phase 1.5) + output chunking — fixes #159" --body "$(cat <<'EOF' +## Summary +- Replace count-based file-analyzer batching with Louvain community detection on the import graph (new Phase 1.5, deterministic `compute-batches.mjs` script). +- file-analyzer self-splits its output into `batch--part-.json` when above 60 nodes / 120 edges per part (Bedrock OPUS output cap safety). 
+- Cross-batch neighbors (with their exported symbols) passed to file-analyzer via `neighborMap` so semantic edges like `calls` and `inherits` can be confidently emitted across batches. +- Every fallback path emits a visible `Warning:` line that bubbles to `$PHASE_WARNINGS` in the Phase 7 final report. +- merge-batch-graphs.py multi-part-aware stderr report + missing-part warning; glob/sort-key already accepted multi-part naming so no algorithmic change required there. + +Fixes #159. + +Design: `docs/superpowers/specs/2026-05-24-semantic-batching-and-output-chunking-design.md` +Plan: `docs/superpowers/plans/2026-05-24-semantic-batching-and-output-chunking-impl.md` + +## Test plan +- [x] `pnpm install` (graphology + graphology-communities-louvain install cleanly) +- [x] `pnpm --filter @understand-anything/core build` +- [x] `pnpm --filter @understand-anything/skill exec vitest run skills/understand/test_compute_batches.test.mjs` — all green +- [x] `cd understand-anything-plugin/skills/understand && python3 -m unittest test_merge_batch_graphs -v` — all green +- [x] Run `/understand --full` on this repo — `batches.json` generated; multi-part triggered on at least one batch; assembled-graph node/edge counts within expected range vs current main; dashboard renders normally; Phase 7 warnings section includes any compute-batches warnings. +- [ ] (Deferred / external) Run on a ~100-file repo on Bedrock OPUS — confirm no "output limit" errors. Document any deferred verification in PR comments. +EOF +)" +``` + +Expected: PR URL returned. + +--- + +## Implementation done. Final check before merge: + +- [ ] All 16 tasks above complete with checkboxes ticked. 
+- [ ] Branch builds + tests green: `pnpm install && pnpm --filter @understand-anything/core build && pnpm --filter @understand-anything/skill exec vitest run skills/understand/ && cd understand-anything-plugin/skills/understand && python3 -m unittest test_merge_batch_graphs 2>&1 | tail -10` (note: the `test_compute_batches` suite is Vitest-only and is covered by the `vitest run` step; `python3 -m unittest` runs only the Python merge tests) +- [ ] No `try { ... } catch { /* silent */ }` or `except: pass` patterns added (grep your diff). +- [ ] Spec ↔ plan ↔ code alignment spot-checked: every Failure-mode warning string in the spec is asserted by at least one unit test. diff --git a/docs/superpowers/specs/2026-05-24-semantic-batching-and-output-chunking-design.md b/docs/superpowers/specs/2026-05-24-semantic-batching-and-output-chunking-design.md new file mode 100644 index 0000000..d100632 --- /dev/null +++ b/docs/superpowers/specs/2026-05-24-semantic-batching-and-output-chunking-design.md @@ -0,0 +1,587 @@ +# Semantic Batching and Output Chunking Design + +**Date:** 2026-05-24 +**Status:** Draft +**Branch:** `feat/semantic-batching-and-output-chunking` +**Issue:** [#159](https://github.com/Lum1104/Understand-Anything/issues/159) — Frequently seeing output limit exceeded + +--- + +## Problem + +The `/understand` skill's Phase 2 dispatches `file-analyzer` subagents in batches of 20-30 files each (`skills/understand/SKILL.md:282`). Two issues compound on output-constrained LLM backends (notably Bedrock OPUS with default max_tokens of 4096-8192): + +1. **Output cap pressure.** Each `file-analyzer` writes one `batch-<i>.json` containing all nodes (file + functions + classes) and edges for its batch. For 25 dense files the JSON content easily exceeds the per-turn `Write(content=...)` token budget. The agent improvises by entering an undefined "minimal output mode" and drops nodes/edges silently. Issue #159 reports this for OPUS on Bedrock at the 100-file scale. + +2. 
**Count-based batching breaks module semantics.** Files are batched by count, not by logical relationship. Files that import each other (and would together form an `auth` module, an `api` module, etc.) get split across batches. The file-analyzer only sees within-batch edges confidently; `calls`/`related`/`inherits`/`implements` edges between modules get dropped at batch boundaries. + +The existing `recover_imports_from_scan` in `merge-batch-graphs.py:913` is a deterministic safety net for `imports` edges — but it cannot recover semantic edges (calls / related / inherits / implements). Those are lost. + +--- + +## Goals + +- Eliminate "Batch X failed (output limit)" from `/understand` runs on Bedrock OPUS for projects up to 500 files. +- Improve cross-batch semantic edge coverage by replacing count-based batching with Louvain community detection on the import graph. +- Maintain `imports` edge coverage parity (no regression on existing safety net). +- Stay within one PR — defer broader refactors to follow-ups (Section "Out of scope"). + +## Non-goals + +- Refactoring Phase 1 / 2 tree-sitter usage to deduplicate per-batch extraction. +- Adding LLM-generated file summaries to neighborMap. +- Auto-tuning output thresholds per provider. + +--- + +## Architecture + +Pipeline before: + +``` +Phase 1 project-scanner → scan-result.json (files + importMap) +Phase 2 file-analyzer (×N concur) → batch-<i>.json (one per batch; SKILL.md prose batching) +Phase 2 end merge-batch-graphs.py → assembled-graph.json +``` + +Pipeline after: + +``` +Phase 1 project-scanner → scan-result.json (unchanged) +Phase 1.5 compute-batches.mjs → batches.json (NEW — semantic batching + neighborMap) +Phase 2 file-analyzer (×N concur) → batch-<i>.json (single) OR batch-<i>-part-<k>.json (split) +Phase 2 end merge-batch-graphs.py → assembled-graph.json (merge logic unchanged; stderr report enhanced) +``` + +**Phase 1.5 single responsibility:** topology decision + neighborMap construction. 
Pure algorithm — reads `scan-result.json`, writes `batches.json`, no LLM calls. + +**Phase 2 changes:** SKILL.md stops doing prose batching; iterates `batches.json` and dispatches one file-analyzer per batch. + +**file-analyzer changes:** consumes neighborMap; self-checks output size before writing; splits into `batch-<i>-part-<k>.json` when above thresholds. + +**merge-batch-graphs.py:** no algorithmic changes — the `batch-*.json` glob and sort-key regex already accept multi-part naming. Only the stderr report is extended (logical-batch / multi-part summary plus a missing-part warning), and multi-part test fixtures are added. + +--- + +## Component 1 — `compute-batches.mjs` + +**Location:** `understand-anything-plugin/skills/understand/compute-batches.mjs` + +**Invocation:** `node <SKILL_DIR>/compute-batches.mjs $PROJECT_ROOT [--changed-files=<path>]` + +**Input:** `$PROJECT_ROOT/.understand-anything/intermediate/scan-result.json` + +**Output:** `$PROJECT_ROOT/.understand-anything/intermediate/batches.json` + +### Dependencies + +Added to `understand-anything-plugin/package.json`: + +- `graphology` (~10KB) +- `graphology-communities-louvain` (~30KB) + +Reuses `@understand-anything/core`'s `TreeSitterPlugin` and `PluginRegistry` (already imported by `extract-structure.mjs`). + +### Algorithm + +``` +1. Load scan-result.json. + +2. Partition files by fileCategory: + - codeFiles = files where fileCategory === "code" + - nonCodeFiles = the rest + +3. Code batching (Louvain on import graph): + a. Build undirected graph: nodes = codeFiles, edges = importMap relations + (weight=1, undirected so import and imported-by both count). + b. Run graphology-communities-louvain → community assignment per file. + c. For any community with size > 35 (max): split via edge-betweenness greedy + cut (or simpler weakly-connected-component partition) until each + sub-community ≤ 35. Log warning per split. + (Whether this branch fires is decided by the implementation prototype + step — see "Prototype-first implementation" below.) + d. Communities with size < 5 are kept as-is. 
Wasted dispatches are + bounded by the 5-concurrent cap, and the alternative ("merge small") + adds edge cases without proportional value. + +4. Non-code batching (hardcoded heuristics, moved from SKILL.md prose): + - Group A: For each directory containing a `Dockerfile`, bundle that + directory's `Dockerfile` + any `docker-compose.*` + any + `.dockerignore` → one batch per such directory (so multi-service + repos with several Dockerfiles get one batch per service). + - Group B: `.github/workflows/*.yml` files → one batch. + - Group C: `.gitlab-ci.yml` + files under `.circleci/` → one batch. + - Group D: SQL files under any `migrations/` or `migration/` directory, + sorted by filename → one batch per directory. + - Group E: All other non-code files grouped by their immediate parent + directory, max 20 per batch. + +5. Assign batchIndex: code communities first (1..N), non-code groups + second (N+1..M). + +6. Exports extraction: + - For each code file, run TreeSitterPlugin.extract() and collect + top-level exports (function names, class names, exported const names). + - Per-file failures: catch, set exports = [], emit warning. + - Non-code files: exports = []. + +7. Construct neighborMap (1-hop): + For each file F in batch B: + neighborMap[F.path] = [ + { path: G.path, batchIndex: G.batch, symbols: G.exports } + for G in importMap[F.path] ∪ reverseImportMap[F.path] + where G.batch ≠ B + ] + If neighborMap[F.path].length > 50, truncate to top 50 by neighbor + degree (highest-imported neighbors kept), emit warning. + +8. Construct batchImportData: + For each batch B: + batchImportData[F.path] = importMap[F.path] for F in B.files + +9. Write batches.json. + +Fallback (script-internal): If steps 3a-3c throw, catch → emit warning +→ assign batches by alphabetical chunking (12 files per code batch). +Steps 4, 6, 7, 8 still run normally. Set `algorithm: "count-fallback"` +in the output. 
+``` + +### Louvain implementation + +Use `graphology-communities-louvain`'s default modularity-greedy algorithm: + +```js +import Graph from 'graphology'; +import louvain from 'graphology-communities-louvain'; + +const graph = new Graph({ type: 'undirected' }); +for (const file of codeFiles) graph.addNode(file.path); +for (const [src, targets] of Object.entries(importMap)) { + for (const tgt of targets) { + if (graph.hasNode(src) && graph.hasNode(tgt) && !graph.hasEdge(src, tgt)) { + graph.addEdge(src, tgt); + } + } +} +const communities = louvain(graph); // { nodeId: communityId } +``` + +### Output schema (`batches.json`) + +```json +{ + "schemaVersion": 1, + "algorithm": "louvain", + "totalFiles": 100, + "totalBatches": 7, + "batches": [ + { + "batchIndex": 1, + "files": [ + { "path": "src/auth/login.ts", "language": "typescript", + "sizeLines": 120, "fileCategory": "code" } + ], + "batchImportData": { + "src/auth/login.ts": ["src/auth/session.ts", "src/db/users.ts"] + }, + "neighborMap": { + "src/auth/login.ts": [ + { "path": "src/db/users.ts", "batchIndex": 3, + "symbols": ["User", "findById", "createUser"] } + ] + } + } + ] +} +``` + +`algorithm` is `"louvain"` on the happy path, `"count-fallback"` when the Louvain branch crashed. + +### `--changed-files` mode + +When invoked with `--changed-files=`, the script: + +- Loads file paths from `` (one per line). +- Still builds the full project import graph (for accurate neighborMap construction). +- Only emits batches containing changed files. +- neighborMap entries reference unchanged files with their batchIndex from the deterministic full-graph Louvain re-run. The seed is fixed so the assignment is reproducible across incremental invocations. + +### Prototype-first implementation + +Before writing the full script, build a minimal skeleton: + +1. Load `scan-result.json` from this repo's `.understand-anything/` directory (if absent, generate via `/understand --full`). +2. 
Run Louvain only — no size enforcement, no neighborMap. +3. Print community size distribution. +4. Decide: do real-world communities cluster in [5, 35]? If yes, size enforcement branch may be unnecessary or trivially defensive. If no, implement edge-betweenness split. + +This gates the more speculative code (size enforcement) on empirical observation rather than upfront design. + +--- + +## Component 2 — `skills/understand/SKILL.md` changes + +### Add — Phase 1.5 section (after Phase 1) + +```markdown +## Phase 1.5 — BATCH + +Report: `[Phase 1.5/7] Computing semantic batches...` + +Run the bundled batching script: +\`\`\`bash +node /compute-batches.mjs $PROJECT_ROOT +\`\`\` + +Reads `.understand-anything/intermediate/scan-result.json`, writes +`.understand-anything/intermediate/batches.json`. + +Capture stderr. Append any line starting with `Warning:` to +$PHASE_WARNINGS for the final report. + +If the script exits non-zero, the failure is hard — relay the full +stderr to the user as a Phase 1.5 failure. Do not attempt to recover; +the script's internal fallback (count-based) already handles recoverable +issues. A non-zero exit means a fundamental problem (missing input file, +malformed JSON, etc.). +``` + +### Replace — Phase 2 ANALYZE section (current SKILL.md:280-332) + +Delete the existing "Batch the file list from Phase 1 into groups of 20-30 files each" prose, the non-code grouping prose (now in compute-batches), and the dispatch-time `batchImportData` construction prose (now provided in batches.json). Replace with: + +```markdown +## Phase 2 — ANALYZE + +### Full analysis path + +Load `.understand-anything/intermediate/batches.json` (produced by +Phase 1.5). Iterate the `batches[]` array. + +Report: `[Phase 2/7] Analyzing files — files in + batches (up to 5 concurrent)...` + +For each batch, dispatch a `file-analyzer` subagent (up to 5 +concurrent). Dispatch prompt template: + +> Analyze these files and produce GraphNode and GraphEdge objects. 
+> Project root: `$PROJECT_ROOT` +> Project: `` +> Languages: `` +> Batch: `/` +> Skill directory: `` +> Output: write to +> `$PROJECT_ROOT/.understand-anything/intermediate/batch-.json` +> (single-file mode) OR `batch--part-.json` (split mode, +> per Step B of your output protocol). +> +> Pre-resolved import data (use directly — do NOT re-resolve from source): +> \`\`\`json +> +> \`\`\` +> +> Cross-batch neighbors with their exported symbols (confidence boost +> for cross-batch edges): +> \`\`\`json +> +> \`\`\` +> +> Files to analyze: +> 1. `` ( lines, language: ``, +> fileCategory: ``) +> ... + +$LANGUAGE_DIRECTIVE + +After ALL batches complete, run the merge-and-normalize script: +\`\`\`bash +python /merge-batch-graphs.py $PROJECT_ROOT +\`\`\` + +(Rest of Phase 2 unchanged.) +``` + +### Replace — Incremental update path (current SKILL.md:355-366) + +```markdown +### Incremental update path + +Run compute-batches.mjs with `--changed-files=`, where `` +is a temp file listing changed file paths (one per line). The script +reuses the full project's import graph for neighborMap computation +but only emits batches containing changed files. Dispatch file-analyzer +subagents per the same template as the full path. +``` + +### Line budget + +Net added LLM-context prose: Phase 1.5 (~12 lines) + Phase 2 template clarifications (~5 lines) − removed batching prose (~15 lines) − removed batchImportData construction prose (~6 lines) ≈ **−4 lines**. + +--- + +## Component 3 — `agents/file-analyzer.md` changes + +### Add — Cross-batch context section + +Insert after "Step 1: Input file construction": + +```markdown +### Cross-batch context (neighborMap) + +Your dispatch prompt includes a `neighborMap` — for each file in your +batch, it lists project-internal neighbors in OTHER batches (files that +import yours or that you import), with their exported symbols. 
+ +Use neighborMap as a confidence boost for cross-batch edges (`calls`, +`related`, `inherits`, `implements` to nodes outside your batch): + +- If your source clearly references a symbol that appears in some + `neighbor.symbols`, emit the edge to + `function::` or + `class::` with confidence. +- If your source references a cross-batch symbol that is NOT in + neighborMap (the project-scanner may not have extracted it), you may + still emit the edge if you saw it explicitly in the imported file's + surface — but prefer matching neighborMap symbols when available. +- Imports continue to use `batchImportData` (fully resolved), not + neighborMap. + +The merge script's dangling-edge dropper is the safety net for +genuinely unresolvable targets. +``` + +### Replace — Writing Results section (current file-analyzer.md:467-475) + +```markdown +## Writing Results — single or multi-part + +**Step A — Compute totals.** +\`\`\` +nodeCount = nodes.length +edgeCount = edges.length +\`\`\` + +**Step B — Decide split.** +- If `nodeCount ≤ 60` AND `edgeCount ≤ 120`: write ONE file to + `.understand-anything/intermediate/batch-.json`. Done. + Skip to Step E. +- Otherwise: `parts = ceil(max(nodeCount / 60, edgeCount / 120))`. + +**Step C — Partition.** +Sort files in your batch alphabetically by path. Chunk them sequentially +into `parts` groups of size `ceil(N / parts)`. For each part: +- All nodes whose `filePath` is in this part's files (for non-file + nodes like `module`/`concept`, use the file they belong to). +- All edges whose `source` is in this part's nodes (target may be + anywhere — same part, different part of same batch, different batch). + +**Step D — Write each part.** +Write part `k` (1-indexed) to +`.understand-anything/intermediate/batch--part-.json`. +Each part is a valid GraphFragment: `{ "nodes": [...], "edges": [...] }`. + +**Step E — Self-validate.** +For each file written, verify: +- Valid JSON. +- `nodes` array exists and is well-formed. 
+- For every edge: `source` and `target` both appear as either (a) a + node `id` in this batch's nodes (this part or a sibling part), OR (b) a `file:` reference + where `` is in `neighborMap` or `batchImportData`, OR (c) a + `function::` / `class::` reference where + `` is in some `neighbor.symbols`. + +If validation fails on a part, do NOT silently rebuild. Respond with +an explicit error stating which part failed, which edge(s) failed +validation, and why. The dispatching session can then retry. + +**Step F — Respond.** +Respond with ONLY a brief text summary: parts written (1 or more), +total nodes/edges across all parts, any files skipped. Do NOT include +JSON content in the response. +``` + +### Threshold rationale + +`60 nodes / 120 edges per part` derives from: + +- File node JSON serialized ≈ 150-300 chars; function/class ≈ 80-150 chars; edge ≈ 100-150 chars. +- 60 nodes + 120 edges ≈ 25-35KB JSON ≈ 7000-9000 output tokens (JSON tokenization is dense). +- Bedrock OPUS default `max_tokens` 4096-8192 → ~10% safety margin. + +These constants live as file-analyzer.md prose for now. Auto-tuning per provider is deferred to follow-up. + +--- + +## Component 4 — `merge-batch-graphs.py` (verify-only) + +### Confirmed compatibility + +The existing glob and sort-key already handle multi-part files transparently: + +- `intermediate_dir.glob("batch-*.json")` matches `batch-3-part-1.json`. +- `re.search(r"batch-(\d+)", p.stem)` extracts `3` from `batch-3-part-1`, giving the same sort key as `batch-3.json`. Python `sorted` is stable, so parts that share a key keep their input order (the `glob` iteration order unless pre-sorted, which is filesystem-dependent rather than guaranteed lexicographic) — harmless, since load order does not affect the merge result. +- `merge_and_normalize` walks `all_nodes.extend(...)` / `all_edges.extend(...)`; load order does not affect dedup correctness. +- `recover_imports_from_scan` operates on the merged graph — transparent to multi-part inputs. +- `link_tests` operates on the merged node pool — transparent. + +No code change required for correctness.
+ +### Add — Multi-part awareness in stderr report + +`merge-batch-graphs.py:1026` currently prints `Found {N} batch files:`. Enhance: + +```python +from collections import defaultdict +by_batch = defaultdict(list) +for f in batch_files: + m = re.match(r"batch-(\d+)(?:-part-(\d+))?\.json", f.name) + if m: + by_batch[int(m.group(1))].append(f.name) + +logical_count = len(by_batch) +multi_part = sum(1 for files in by_batch.values() if len(files) > 1) +print( + f"Found {len(batch_files)} batch files " + f"({logical_count} logical batches, {multi_part} multi-part)", + file=sys.stderr, +) +``` + +### Add — Missing-part warning + +After grouping, detect logical batches with non-contiguous part numbers (e.g. parts `{2, 3}` present but `1` missing) and emit: + +``` +Warning: merge: batch has parts {} but missing part {} + — possible truncated write — affected nodes/edges may be lost +``` + +--- + +## Failure modes & observability + +| Failure point | Behavior | Safety net | Required warning text | +|---|---|---|---| +| Louvain library throws | exception | Script-internal: catch → count-based fallback (12 files/batch); neighborMap still built | `Warning: compute-batches: Louvain failed () — falling back to count-based grouping (12 files/batch) — module semantic boundaries lost` | +| tree-sitter exports per-file failure | empty exports | symbols=[] in neighborMap | `Warning: compute-batches: exports extraction failed for () — symbols=[] in neighborMap — cross-batch edges to this file limited to file-level` | +| Louvain produces oversized community | size > 35 | Edge-betweenness split | `Warning: compute-batches: community size > max 35 — splitting via edge-betweenness — modularity may decrease` | +| compute-batches complete crash | exit non-zero, no batches.json | SKILL.md surfaces full stderr to user; no Phase 2 fallback | (script's own error to stderr; SKILL.md relays verbatim) | +| neighborMap truncation | > 50 neighbors | Top-50 by degree kept | `Warning: 
compute-batches: neighborMap for truncated from to top 50 (by neighbor degree)` | +| file-analyzer part JSON malformed | `load_batch` skips | Existing `load_batch:139` warns and skips | (existing — verify the warning is not swallowed) | +| Missing part in multi-part batch | gap in parts | merge detects and warns | `Warning: merge: batch has parts {} but missing part {} — possible truncated write — affected nodes/edges may be lost` | +| file-analyzer dangling edges | source/target missing | merge drops, adds to `unfixable` (existing) | (existing) | +| file-analyzer dispatch fails | subagent error | existing retry-once mechanism | (existing) | + +### Observability invariant + +Every fallback / degrade / drop MUST: + +1. Write a stderr line in `Warning: : ` format. +2. Bubble up to `$PHASE_WARNINGS` (SKILL.md existing mechanism) → user-facing Phase 7 final report. +3. Never use silent `catch {}` / `except: pass`. Code review treats this as a blocker. + +### Invariants + +1. **scan-result.json is source of truth.** Any batching/topology change preserves importMap; `recover_imports_from_scan` always restores `imports` edges. +2. **Dangling-edge dropper is final defense.** No batch-generated edge can connect to a nonexistent node in the assembled graph. +3. **No silent fallback.** `batches.json` missing → loud failure. Internal compute-batches fallback → loud warning that bubbles to user. + +--- + +## Testing + +### Unit tests — `compute-batches.mjs` + +New file: `understand-anything-plugin/skills/understand/test_compute_batches.test.mjs` (Vitest). + +Required cases: + +- **Louvain basic:** 3 disjoint cliques → 3 batches. +- **Empty importMap:** independent files → count-fallback batches by alphabetical chunking. +- **Oversized community:** 50-node complete graph → split triggered, all sub-batches ≤ 35. +- **Non-code grouping A:** `Dockerfile` + `docker-compose.yml` + `.dockerignore` siblings → one batch per directory cluster. 
+- **Non-code grouping B:** `.github/workflows/*.yml` → one batch. +- **Non-code grouping D:** SQL migrations under `migrations/` → one batch per directory. +- **Mixed code + non-code:** non-code batchIndex follows code batches. +- **neighborMap correctness:** file A imports file B across batches → `neighborMap[A]` contains `{path: B, batchIndex: B's, symbols: B's exports}`. +- **neighborMap excludes same-batch:** A and C in same batch → `neighborMap[A]` does not contain C. +- **Exports failure tolerance:** mock TreeSitter to throw on one file → `exports = []` for that file, others unaffected. +- **`--changed-files`:** input subset → output contains only batches with changed files; neighborMap may reference unchanged files. +- **Fallback triggers:** mock Louvain throw → `algorithm` field = `"count-fallback"`, warning in stderr. +- **Warning assertion per fallback:** for each of {Louvain crash, exports failure, oversize split, neighborMap truncation}, assert the exact warning string appears in stderr. + +### Unit tests — `merge-batch-graphs.py` + +New test class `TestMultiPart` in `test_merge_batch_graphs.py`: + +- Two parts of one logical batch: `batch-1-part-1.json` + `batch-1-part-2.json` → assembled contains all nodes/edges from both. +- Three parts of one logical batch. +- Cross-part edges: edge with source in part-1, target node in part-2 → connected after merge. +- Malformed part-1 + valid part-2: part-1 skipped with warning, part-2 contents present. +- Mixed single-batch and multi-part inputs. +- Missing part detection: `batch-1-part-2.json` + `batch-1-part-3.json` (no part-1) → warning emitted with exact text. +- stderr format: assert `"X logical batches, Y multi-part"` appears. + +### Integration — PR acceptance gate (manual) + +Documented in the PR's Test plan: + +- [ ] `pnpm install` (graphology installs cleanly). +- [ ] `pnpm --filter @understand-anything/core build`.
+- [ ] Run `/understand --full` on this repo (Understand-Anything itself): + - `batches.json` generated; community size distribution sanity-check (mix of small and medium batches). + - At least one batch produces multi-part output. + - `assembled-graph.json` node/edge counts within expected range vs current main. + - Dashboard renders normally. + - Phase 7 final report includes any `$PHASE_WARNINGS` from compute-batches (visually verify warnings reach user-facing output, not just stderr). +- [ ] Run on a ~100-file repo matching ayushghosh's scenario; confirm no "output limit" errors. +- [ ] Run on a 5-10 file small repo: fallback path (all one batch) works correctly. + +### Not tested + +- Louvain algorithm correctness (trust `graphology-communities-louvain`'s own tests). +- Performance benchmarks (sub-second on 100-500 files is empirical; not gated). +- Multiple LLM provider output-cap variations (thresholds are conservative for Bedrock OPUS; first-party Anthropic is more permissive). + +--- + +## Out of scope (tracked for follow-up) + +### Tree-sitter deduplication + +Currently Phase 1 (project-scanner), Phase 1.5 (compute-batches), and Phase 2 (file-analyzer per-batch) each run tree-sitter independently. Consolidating into a single Phase 1.5 structure extraction would simplify file-analyzer and save time on large projects. Defer because it requires reorganizing file-analyzer's protocol significantly. + +### neighborMap LLM summaries + +Adding one-sentence summaries per file to neighborMap would enable file-analyzer to emit `related` edges across batches with semantic justification. Requires a new lightweight summary-pass agent; defer until the tree-sitter dedup lands (Phase 1.5 will already have full structure → cheaper to add). + +### Adaptive thresholds + +`60 nodes / 120 edges` are conservative for Bedrock OPUS. Anthropic first-party supports much larger output caps. 
Adding a `--output-cap=` CLI to compute-batches and propagating to file-analyzer would unlock larger parts on permissive backends. Track real-world part counts before implementing. + +### Cross-batch edge audit + +A post-merge audit comparing neighborMap-suggested edges vs actually-emitted edges would surface gaps. Mirror the existing `recover_imports_from_scan` pattern. Requires preserving `batches.json` for merge-time consumption. + +### Multi-language monorepo handling + +Multi-language repos (TS + Python) tend to naturally split via Louvain (no cross-language imports). Bridge files (OpenAPI, protobuf) might create odd communities. Address only if real reports surface. + +--- + +## Implementation order + +1. **Prototype:** minimal `compute-batches.mjs` skeleton — load scan-result.json, run Louvain, print community sizes. Run against this repo's `scan-result.json` (generate if missing via `/understand --full`). Decide whether size-enforcement branch is needed; if needed, choose between edge-betweenness and weakly-connected-component split. +2. Add exports extraction (reuse TreeSitterPlugin). +3. Add neighborMap construction + batchImportData passthrough. +4. Add non-code grouping heuristics (Groups A-E). +5. Add fallback path + warning emissions for every failure mode listed in the Failure modes table. +6. Write unit tests for compute-batches (per Testing section), including warning-text assertions. +7. Modify `agents/file-analyzer.md` — add Cross-batch context section, replace Writing Results. +8. Modify `skills/understand/SKILL.md` — add Phase 1.5, replace Phase 2 ANALYZE batching prose, replace incremental path. +9. Add multi-part stderr report + missing-part warning to `merge-batch-graphs.py`. +10. Write unit tests for `merge-batch-graphs.py` multi-part handling. +11. Add `graphology` + `graphology-communities-louvain` to `understand-anything-plugin/package.json`. +12. Run integration acceptance gate. +13. 
Bump version in all five `package.json` / `plugin.json` files per the project's CLAUDE.md versioning rule. diff --git a/package.json b/package.json index 888b735..efae49a 100644 --- a/package.json +++ b/package.json @@ -7,7 +7,7 @@ "scripts": { "prepare": "pnpm --filter @understand-anything/core build", "build": "pnpm -r build", - "test": "vitest", + "test": "vitest run", "dev:dashboard": "pnpm --filter @understand-anything/dashboard dev", "lint": "eslint ." }, diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index bcb3810..3eb7e7a 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -38,6 +38,12 @@ importers: '@understand-anything/core': specifier: workspace:* version: link:packages/core + graphology: + specifier: ~0.26.0 + version: 0.26.0(graphology-types@0.24.8) + graphology-communities-louvain: + specifier: ^2.0.2 + version: 2.0.2(graphology-types@0.24.8) devDependencies: '@types/node': specifier: ^22.0.0 @@ -1861,6 +1867,11 @@ packages: peerDependencies: graphology-types: '>=0.24.0' + graphology@0.26.0: + resolution: {integrity: sha512-8SSImzgUUYC89Z042s+0r/vMibY7GX/Emz4LDO5e7jYXhuoWfHISPFJYjpRLUSJGq6UQ6xlenvX1p/hJdfXuXg==} + peerDependencies: + graphology-types: '>=0.24.0' + h3@1.15.11: resolution: {integrity: sha512-L3THSe2MPeBwgIZVSH5zLdBBU90TOxarvhK9d04IDY2AmVS8j2Jz2LIWtwsGOU3lu2I5jCN7FNvVfY2+XyF+mg==} @@ -4966,6 +4977,11 @@ snapshots: graphology-types: 0.24.8 obliterator: 2.0.5 + graphology@0.26.0(graphology-types@0.24.8): + dependencies: + events: 3.3.0 + graphology-types: 0.24.8 + h3@1.15.11: dependencies: cookie-es: 1.2.3 diff --git a/tests/skill/understand/fixtures/scan-result-3-cliques.json b/tests/skill/understand/fixtures/scan-result-3-cliques.json new file mode 100644 index 0000000..fb4d4b3 --- /dev/null +++ b/tests/skill/understand/fixtures/scan-result-3-cliques.json @@ -0,0 +1,31 @@ +{ + "name": "fixture-3-cliques", + "description": "Three disjoint import cliques for Louvain testing", + "languages": ["typescript"], + "frameworks": [], + "files": [ + 
{"path": "src/auth/login.ts", "language": "typescript", "sizeLines": 50, "fileCategory": "code"}, + {"path": "src/auth/session.ts", "language": "typescript", "sizeLines": 40, "fileCategory": "code"}, + {"path": "src/auth/tokens.ts", "language": "typescript", "sizeLines": 60, "fileCategory": "code"}, + {"path": "src/api/handlers.ts", "language": "typescript", "sizeLines": 80, "fileCategory": "code"}, + {"path": "src/api/middleware.ts", "language": "typescript", "sizeLines": 30, "fileCategory": "code"}, + {"path": "src/api/routes.ts", "language": "typescript", "sizeLines": 45, "fileCategory": "code"}, + {"path": "src/db/users.ts", "language": "typescript", "sizeLines": 70, "fileCategory": "code"}, + {"path": "src/db/queries.ts", "language": "typescript", "sizeLines": 55, "fileCategory": "code"}, + {"path": "src/db/migrations.ts", "language": "typescript", "sizeLines": 35, "fileCategory": "code"} + ], + "totalFiles": 9, + "filteredByIgnore": 0, + "estimatedComplexity": "small", + "importMap": { + "src/auth/login.ts": ["src/auth/session.ts", "src/auth/tokens.ts"], + "src/auth/session.ts": ["src/auth/tokens.ts"], + "src/auth/tokens.ts": [], + "src/api/handlers.ts": ["src/api/middleware.ts", "src/api/routes.ts"], + "src/api/middleware.ts": ["src/api/routes.ts", "src/auth/session.ts"], + "src/api/routes.ts": [], + "src/db/users.ts": ["src/db/queries.ts", "src/db/migrations.ts"], + "src/db/queries.ts": ["src/db/migrations.ts"], + "src/db/migrations.ts": [] + } +} diff --git a/tests/skill/understand/fixtures/scan-result-large-community.json b/tests/skill/understand/fixtures/scan-result-large-community.json new file mode 100644 index 0000000..3f25ee0 --- /dev/null +++ b/tests/skill/understand/fixtures/scan-result-large-community.json @@ -0,0 +1,1895 @@ +{ + "name": "fixture-large-community", + "description": "40 files all importing each other — one community over the max=35 cap", + "languages": [ + "typescript" + ], + "frameworks": [], + "files": [ + { + "path": 
"src/big/f0.ts", + "language": "typescript", + "sizeLines": 50, + "fileCategory": "code" + }, + { + "path": "src/big/f1.ts", + "language": "typescript", + "sizeLines": 50, + "fileCategory": "code" + }, + { + "path": "src/big/f2.ts", + "language": "typescript", + "sizeLines": 50, + "fileCategory": "code" + }, + { + "path": "src/big/f3.ts", + "language": "typescript", + "sizeLines": 50, + "fileCategory": "code" + }, + { + "path": "src/big/f4.ts", + "language": "typescript", + "sizeLines": 50, + "fileCategory": "code" + }, + { + "path": "src/big/f5.ts", + "language": "typescript", + "sizeLines": 50, + "fileCategory": "code" + }, + { + "path": "src/big/f6.ts", + "language": "typescript", + "sizeLines": 50, + "fileCategory": "code" + }, + { + "path": "src/big/f7.ts", + "language": "typescript", + "sizeLines": 50, + "fileCategory": "code" + }, + { + "path": "src/big/f8.ts", + "language": "typescript", + "sizeLines": 50, + "fileCategory": "code" + }, + { + "path": "src/big/f9.ts", + "language": "typescript", + "sizeLines": 50, + "fileCategory": "code" + }, + { + "path": "src/big/f10.ts", + "language": "typescript", + "sizeLines": 50, + "fileCategory": "code" + }, + { + "path": "src/big/f11.ts", + "language": "typescript", + "sizeLines": 50, + "fileCategory": "code" + }, + { + "path": "src/big/f12.ts", + "language": "typescript", + "sizeLines": 50, + "fileCategory": "code" + }, + { + "path": "src/big/f13.ts", + "language": "typescript", + "sizeLines": 50, + "fileCategory": "code" + }, + { + "path": "src/big/f14.ts", + "language": "typescript", + "sizeLines": 50, + "fileCategory": "code" + }, + { + "path": "src/big/f15.ts", + "language": "typescript", + "sizeLines": 50, + "fileCategory": "code" + }, + { + "path": "src/big/f16.ts", + "language": "typescript", + "sizeLines": 50, + "fileCategory": "code" + }, + { + "path": "src/big/f17.ts", + "language": "typescript", + "sizeLines": 50, + "fileCategory": "code" + }, + { + "path": "src/big/f18.ts", + "language": "typescript", + 
"sizeLines": 50, + "fileCategory": "code" + }, + { + "path": "src/big/f19.ts", + "language": "typescript", + "sizeLines": 50, + "fileCategory": "code" + }, + { + "path": "src/big/f20.ts", + "language": "typescript", + "sizeLines": 50, + "fileCategory": "code" + }, + { + "path": "src/big/f21.ts", + "language": "typescript", + "sizeLines": 50, + "fileCategory": "code" + }, + { + "path": "src/big/f22.ts", + "language": "typescript", + "sizeLines": 50, + "fileCategory": "code" + }, + { + "path": "src/big/f23.ts", + "language": "typescript", + "sizeLines": 50, + "fileCategory": "code" + }, + { + "path": "src/big/f24.ts", + "language": "typescript", + "sizeLines": 50, + "fileCategory": "code" + }, + { + "path": "src/big/f25.ts", + "language": "typescript", + "sizeLines": 50, + "fileCategory": "code" + }, + { + "path": "src/big/f26.ts", + "language": "typescript", + "sizeLines": 50, + "fileCategory": "code" + }, + { + "path": "src/big/f27.ts", + "language": "typescript", + "sizeLines": 50, + "fileCategory": "code" + }, + { + "path": "src/big/f28.ts", + "language": "typescript", + "sizeLines": 50, + "fileCategory": "code" + }, + { + "path": "src/big/f29.ts", + "language": "typescript", + "sizeLines": 50, + "fileCategory": "code" + }, + { + "path": "src/big/f30.ts", + "language": "typescript", + "sizeLines": 50, + "fileCategory": "code" + }, + { + "path": "src/big/f31.ts", + "language": "typescript", + "sizeLines": 50, + "fileCategory": "code" + }, + { + "path": "src/big/f32.ts", + "language": "typescript", + "sizeLines": 50, + "fileCategory": "code" + }, + { + "path": "src/big/f33.ts", + "language": "typescript", + "sizeLines": 50, + "fileCategory": "code" + }, + { + "path": "src/big/f34.ts", + "language": "typescript", + "sizeLines": 50, + "fileCategory": "code" + }, + { + "path": "src/big/f35.ts", + "language": "typescript", + "sizeLines": 50, + "fileCategory": "code" + }, + { + "path": "src/big/f36.ts", + "language": "typescript", + "sizeLines": 50, + "fileCategory": 
"code" + }, + { + "path": "src/big/f37.ts", + "language": "typescript", + "sizeLines": 50, + "fileCategory": "code" + }, + { + "path": "src/big/f38.ts", + "language": "typescript", + "sizeLines": 50, + "fileCategory": "code" + }, + { + "path": "src/big/f39.ts", + "language": "typescript", + "sizeLines": 50, + "fileCategory": "code" + } + ], + "totalFiles": 40, + "filteredByIgnore": 0, + "estimatedComplexity": "moderate", + "importMap": { + "src/big/f0.ts": [ + "src/big/f1.ts", + "src/big/f2.ts", + "src/big/f3.ts", + "src/big/f4.ts", + "src/big/f5.ts", + "src/big/f6.ts", + "src/big/f7.ts", + "src/big/f8.ts", + "src/big/f9.ts", + "src/big/f10.ts", + "src/big/f11.ts", + "src/big/f12.ts", + "src/big/f13.ts", + "src/big/f14.ts", + "src/big/f15.ts", + "src/big/f16.ts", + "src/big/f17.ts", + "src/big/f18.ts", + "src/big/f19.ts", + "src/big/f20.ts", + "src/big/f21.ts", + "src/big/f22.ts", + "src/big/f23.ts", + "src/big/f24.ts", + "src/big/f25.ts", + "src/big/f26.ts", + "src/big/f27.ts", + "src/big/f28.ts", + "src/big/f29.ts", + "src/big/f30.ts", + "src/big/f31.ts", + "src/big/f32.ts", + "src/big/f33.ts", + "src/big/f34.ts", + "src/big/f35.ts", + "src/big/f36.ts", + "src/big/f37.ts", + "src/big/f38.ts", + "src/big/f39.ts" + ], + "src/big/f1.ts": [ + "src/big/f0.ts", + "src/big/f2.ts", + "src/big/f3.ts", + "src/big/f4.ts", + "src/big/f5.ts", + "src/big/f6.ts", + "src/big/f7.ts", + "src/big/f8.ts", + "src/big/f9.ts", + "src/big/f10.ts", + "src/big/f11.ts", + "src/big/f12.ts", + "src/big/f13.ts", + "src/big/f14.ts", + "src/big/f15.ts", + "src/big/f16.ts", + "src/big/f17.ts", + "src/big/f18.ts", + "src/big/f19.ts", + "src/big/f20.ts", + "src/big/f21.ts", + "src/big/f22.ts", + "src/big/f23.ts", + "src/big/f24.ts", + "src/big/f25.ts", + "src/big/f26.ts", + "src/big/f27.ts", + "src/big/f28.ts", + "src/big/f29.ts", + "src/big/f30.ts", + "src/big/f31.ts", + "src/big/f32.ts", + "src/big/f33.ts", + "src/big/f34.ts", + "src/big/f35.ts", + "src/big/f36.ts", + "src/big/f37.ts", + 
"src/big/f38.ts", + "src/big/f39.ts" + ], + "src/big/f2.ts": [ + "src/big/f0.ts", + "src/big/f1.ts", + "src/big/f3.ts", + "src/big/f4.ts", + "src/big/f5.ts", + "src/big/f6.ts", + "src/big/f7.ts", + "src/big/f8.ts", + "src/big/f9.ts", + "src/big/f10.ts", + "src/big/f11.ts", + "src/big/f12.ts", + "src/big/f13.ts", + "src/big/f14.ts", + "src/big/f15.ts", + "src/big/f16.ts", + "src/big/f17.ts", + "src/big/f18.ts", + "src/big/f19.ts", + "src/big/f20.ts", + "src/big/f21.ts", + "src/big/f22.ts", + "src/big/f23.ts", + "src/big/f24.ts", + "src/big/f25.ts", + "src/big/f26.ts", + "src/big/f27.ts", + "src/big/f28.ts", + "src/big/f29.ts", + "src/big/f30.ts", + "src/big/f31.ts", + "src/big/f32.ts", + "src/big/f33.ts", + "src/big/f34.ts", + "src/big/f35.ts", + "src/big/f36.ts", + "src/big/f37.ts", + "src/big/f38.ts", + "src/big/f39.ts" + ], + "src/big/f3.ts": [ + "src/big/f0.ts", + "src/big/f1.ts", + "src/big/f2.ts", + "src/big/f4.ts", + "src/big/f5.ts", + "src/big/f6.ts", + "src/big/f7.ts", + "src/big/f8.ts", + "src/big/f9.ts", + "src/big/f10.ts", + "src/big/f11.ts", + "src/big/f12.ts", + "src/big/f13.ts", + "src/big/f14.ts", + "src/big/f15.ts", + "src/big/f16.ts", + "src/big/f17.ts", + "src/big/f18.ts", + "src/big/f19.ts", + "src/big/f20.ts", + "src/big/f21.ts", + "src/big/f22.ts", + "src/big/f23.ts", + "src/big/f24.ts", + "src/big/f25.ts", + "src/big/f26.ts", + "src/big/f27.ts", + "src/big/f28.ts", + "src/big/f29.ts", + "src/big/f30.ts", + "src/big/f31.ts", + "src/big/f32.ts", + "src/big/f33.ts", + "src/big/f34.ts", + "src/big/f35.ts", + "src/big/f36.ts", + "src/big/f37.ts", + "src/big/f38.ts", + "src/big/f39.ts" + ], + "src/big/f4.ts": [ + "src/big/f0.ts", + "src/big/f1.ts", + "src/big/f2.ts", + "src/big/f3.ts", + "src/big/f5.ts", + "src/big/f6.ts", + "src/big/f7.ts", + "src/big/f8.ts", + "src/big/f9.ts", + "src/big/f10.ts", + "src/big/f11.ts", + "src/big/f12.ts", + "src/big/f13.ts", + "src/big/f14.ts", + "src/big/f15.ts", + "src/big/f16.ts", + "src/big/f17.ts", + 
"src/big/f18.ts", + "src/big/f19.ts", + "src/big/f20.ts", + "src/big/f21.ts", + "src/big/f22.ts", + "src/big/f23.ts", + "src/big/f24.ts", + "src/big/f25.ts", + "src/big/f26.ts", + "src/big/f27.ts", + "src/big/f28.ts", + "src/big/f29.ts", + "src/big/f30.ts", + "src/big/f31.ts", + "src/big/f32.ts", + "src/big/f33.ts", + "src/big/f34.ts", + "src/big/f35.ts", + "src/big/f36.ts", + "src/big/f37.ts", + "src/big/f38.ts", + "src/big/f39.ts" + ], + "src/big/f5.ts": [ + "src/big/f0.ts", + "src/big/f1.ts", + "src/big/f2.ts", + "src/big/f3.ts", + "src/big/f4.ts", + "src/big/f6.ts", + "src/big/f7.ts", + "src/big/f8.ts", + "src/big/f9.ts", + "src/big/f10.ts", + "src/big/f11.ts", + "src/big/f12.ts", + "src/big/f13.ts", + "src/big/f14.ts", + "src/big/f15.ts", + "src/big/f16.ts", + "src/big/f17.ts", + "src/big/f18.ts", + "src/big/f19.ts", + "src/big/f20.ts", + "src/big/f21.ts", + "src/big/f22.ts", + "src/big/f23.ts", + "src/big/f24.ts", + "src/big/f25.ts", + "src/big/f26.ts", + "src/big/f27.ts", + "src/big/f28.ts", + "src/big/f29.ts", + "src/big/f30.ts", + "src/big/f31.ts", + "src/big/f32.ts", + "src/big/f33.ts", + "src/big/f34.ts", + "src/big/f35.ts", + "src/big/f36.ts", + "src/big/f37.ts", + "src/big/f38.ts", + "src/big/f39.ts" + ], + "src/big/f6.ts": [ + "src/big/f0.ts", + "src/big/f1.ts", + "src/big/f2.ts", + "src/big/f3.ts", + "src/big/f4.ts", + "src/big/f5.ts", + "src/big/f7.ts", + "src/big/f8.ts", + "src/big/f9.ts", + "src/big/f10.ts", + "src/big/f11.ts", + "src/big/f12.ts", + "src/big/f13.ts", + "src/big/f14.ts", + "src/big/f15.ts", + "src/big/f16.ts", + "src/big/f17.ts", + "src/big/f18.ts", + "src/big/f19.ts", + "src/big/f20.ts", + "src/big/f21.ts", + "src/big/f22.ts", + "src/big/f23.ts", + "src/big/f24.ts", + "src/big/f25.ts", + "src/big/f26.ts", + "src/big/f27.ts", + "src/big/f28.ts", + "src/big/f29.ts", + "src/big/f30.ts", + "src/big/f31.ts", + "src/big/f32.ts", + "src/big/f33.ts", + "src/big/f34.ts", + "src/big/f35.ts", + "src/big/f36.ts", + "src/big/f37.ts", + 
"src/big/f38.ts", + "src/big/f39.ts" + ], + "src/big/f7.ts": [ + "src/big/f0.ts", + "src/big/f1.ts", + "src/big/f2.ts", + "src/big/f3.ts", + "src/big/f4.ts", + "src/big/f5.ts", + "src/big/f6.ts", + "src/big/f8.ts", + "src/big/f9.ts", + "src/big/f10.ts", + "src/big/f11.ts", + "src/big/f12.ts", + "src/big/f13.ts", + "src/big/f14.ts", + "src/big/f15.ts", + "src/big/f16.ts", + "src/big/f17.ts", + "src/big/f18.ts", + "src/big/f19.ts", + "src/big/f20.ts", + "src/big/f21.ts", + "src/big/f22.ts", + "src/big/f23.ts", + "src/big/f24.ts", + "src/big/f25.ts", + "src/big/f26.ts", + "src/big/f27.ts", + "src/big/f28.ts", + "src/big/f29.ts", + "src/big/f30.ts", + "src/big/f31.ts", + "src/big/f32.ts", + "src/big/f33.ts", + "src/big/f34.ts", + "src/big/f35.ts", + "src/big/f36.ts", + "src/big/f37.ts", + "src/big/f38.ts", + "src/big/f39.ts" + ], + "src/big/f8.ts": [ + "src/big/f0.ts", + "src/big/f1.ts", + "src/big/f2.ts", + "src/big/f3.ts", + "src/big/f4.ts", + "src/big/f5.ts", + "src/big/f6.ts", + "src/big/f7.ts", + "src/big/f9.ts", + "src/big/f10.ts", + "src/big/f11.ts", + "src/big/f12.ts", + "src/big/f13.ts", + "src/big/f14.ts", + "src/big/f15.ts", + "src/big/f16.ts", + "src/big/f17.ts", + "src/big/f18.ts", + "src/big/f19.ts", + "src/big/f20.ts", + "src/big/f21.ts", + "src/big/f22.ts", + "src/big/f23.ts", + "src/big/f24.ts", + "src/big/f25.ts", + "src/big/f26.ts", + "src/big/f27.ts", + "src/big/f28.ts", + "src/big/f29.ts", + "src/big/f30.ts", + "src/big/f31.ts", + "src/big/f32.ts", + "src/big/f33.ts", + "src/big/f34.ts", + "src/big/f35.ts", + "src/big/f36.ts", + "src/big/f37.ts", + "src/big/f38.ts", + "src/big/f39.ts" + ], + "src/big/f9.ts": [ + "src/big/f0.ts", + "src/big/f1.ts", + "src/big/f2.ts", + "src/big/f3.ts", + "src/big/f4.ts", + "src/big/f5.ts", + "src/big/f6.ts", + "src/big/f7.ts", + "src/big/f8.ts", + "src/big/f10.ts", + "src/big/f11.ts", + "src/big/f12.ts", + "src/big/f13.ts", + "src/big/f14.ts", + "src/big/f15.ts", + "src/big/f16.ts", + "src/big/f17.ts", + 
"src/big/f18.ts", + "src/big/f19.ts", + "src/big/f20.ts", + "src/big/f21.ts", + "src/big/f22.ts", + "src/big/f23.ts", + "src/big/f24.ts", + "src/big/f25.ts", + "src/big/f26.ts", + "src/big/f27.ts", + "src/big/f28.ts", + "src/big/f29.ts", + "src/big/f30.ts", + "src/big/f31.ts", + "src/big/f32.ts", + "src/big/f33.ts", + "src/big/f34.ts", + "src/big/f35.ts", + "src/big/f36.ts", + "src/big/f37.ts", + "src/big/f38.ts", + "src/big/f39.ts" + ], + "src/big/f10.ts": [ + "src/big/f0.ts", + "src/big/f1.ts", + "src/big/f2.ts", + "src/big/f3.ts", + "src/big/f4.ts", + "src/big/f5.ts", + "src/big/f6.ts", + "src/big/f7.ts", + "src/big/f8.ts", + "src/big/f9.ts", + "src/big/f11.ts", + "src/big/f12.ts", + "src/big/f13.ts", + "src/big/f14.ts", + "src/big/f15.ts", + "src/big/f16.ts", + "src/big/f17.ts", + "src/big/f18.ts", + "src/big/f19.ts", + "src/big/f20.ts", + "src/big/f21.ts", + "src/big/f22.ts", + "src/big/f23.ts", + "src/big/f24.ts", + "src/big/f25.ts", + "src/big/f26.ts", + "src/big/f27.ts", + "src/big/f28.ts", + "src/big/f29.ts", + "src/big/f30.ts", + "src/big/f31.ts", + "src/big/f32.ts", + "src/big/f33.ts", + "src/big/f34.ts", + "src/big/f35.ts", + "src/big/f36.ts", + "src/big/f37.ts", + "src/big/f38.ts", + "src/big/f39.ts" + ], + "src/big/f11.ts": [ + "src/big/f0.ts", + "src/big/f1.ts", + "src/big/f2.ts", + "src/big/f3.ts", + "src/big/f4.ts", + "src/big/f5.ts", + "src/big/f6.ts", + "src/big/f7.ts", + "src/big/f8.ts", + "src/big/f9.ts", + "src/big/f10.ts", + "src/big/f12.ts", + "src/big/f13.ts", + "src/big/f14.ts", + "src/big/f15.ts", + "src/big/f16.ts", + "src/big/f17.ts", + "src/big/f18.ts", + "src/big/f19.ts", + "src/big/f20.ts", + "src/big/f21.ts", + "src/big/f22.ts", + "src/big/f23.ts", + "src/big/f24.ts", + "src/big/f25.ts", + "src/big/f26.ts", + "src/big/f27.ts", + "src/big/f28.ts", + "src/big/f29.ts", + "src/big/f30.ts", + "src/big/f31.ts", + "src/big/f32.ts", + "src/big/f33.ts", + "src/big/f34.ts", + "src/big/f35.ts", + "src/big/f36.ts", + "src/big/f37.ts", + 
"src/big/f38.ts", + "src/big/f39.ts" + ], + "src/big/f12.ts": [ + "src/big/f0.ts", + "src/big/f1.ts", + "src/big/f2.ts", + "src/big/f3.ts", + "src/big/f4.ts", + "src/big/f5.ts", + "src/big/f6.ts", + "src/big/f7.ts", + "src/big/f8.ts", + "src/big/f9.ts", + "src/big/f10.ts", + "src/big/f11.ts", + "src/big/f13.ts", + "src/big/f14.ts", + "src/big/f15.ts", + "src/big/f16.ts", + "src/big/f17.ts", + "src/big/f18.ts", + "src/big/f19.ts", + "src/big/f20.ts", + "src/big/f21.ts", + "src/big/f22.ts", + "src/big/f23.ts", + "src/big/f24.ts", + "src/big/f25.ts", + "src/big/f26.ts", + "src/big/f27.ts", + "src/big/f28.ts", + "src/big/f29.ts", + "src/big/f30.ts", + "src/big/f31.ts", + "src/big/f32.ts", + "src/big/f33.ts", + "src/big/f34.ts", + "src/big/f35.ts", + "src/big/f36.ts", + "src/big/f37.ts", + "src/big/f38.ts", + "src/big/f39.ts" + ], + "src/big/f13.ts": [ + "src/big/f0.ts", + "src/big/f1.ts", + "src/big/f2.ts", + "src/big/f3.ts", + "src/big/f4.ts", + "src/big/f5.ts", + "src/big/f6.ts", + "src/big/f7.ts", + "src/big/f8.ts", + "src/big/f9.ts", + "src/big/f10.ts", + "src/big/f11.ts", + "src/big/f12.ts", + "src/big/f14.ts", + "src/big/f15.ts", + "src/big/f16.ts", + "src/big/f17.ts", + "src/big/f18.ts", + "src/big/f19.ts", + "src/big/f20.ts", + "src/big/f21.ts", + "src/big/f22.ts", + "src/big/f23.ts", + "src/big/f24.ts", + "src/big/f25.ts", + "src/big/f26.ts", + "src/big/f27.ts", + "src/big/f28.ts", + "src/big/f29.ts", + "src/big/f30.ts", + "src/big/f31.ts", + "src/big/f32.ts", + "src/big/f33.ts", + "src/big/f34.ts", + "src/big/f35.ts", + "src/big/f36.ts", + "src/big/f37.ts", + "src/big/f38.ts", + "src/big/f39.ts" + ], + "src/big/f14.ts": [ + "src/big/f0.ts", + "src/big/f1.ts", + "src/big/f2.ts", + "src/big/f3.ts", + "src/big/f4.ts", + "src/big/f5.ts", + "src/big/f6.ts", + "src/big/f7.ts", + "src/big/f8.ts", + "src/big/f9.ts", + "src/big/f10.ts", + "src/big/f11.ts", + "src/big/f12.ts", + "src/big/f13.ts", + "src/big/f15.ts", + "src/big/f16.ts", + "src/big/f17.ts", + 
"src/big/f18.ts", + "src/big/f19.ts", + "src/big/f20.ts", + "src/big/f21.ts", + "src/big/f22.ts", + "src/big/f23.ts", + "src/big/f24.ts", + "src/big/f25.ts", + "src/big/f26.ts", + "src/big/f27.ts", + "src/big/f28.ts", + "src/big/f29.ts", + "src/big/f30.ts", + "src/big/f31.ts", + "src/big/f32.ts", + "src/big/f33.ts", + "src/big/f34.ts", + "src/big/f35.ts", + "src/big/f36.ts", + "src/big/f37.ts", + "src/big/f38.ts", + "src/big/f39.ts" + ], + "src/big/f15.ts": [ + "src/big/f0.ts", + "src/big/f1.ts", + "src/big/f2.ts", + "src/big/f3.ts", + "src/big/f4.ts", + "src/big/f5.ts", + "src/big/f6.ts", + "src/big/f7.ts", + "src/big/f8.ts", + "src/big/f9.ts", + "src/big/f10.ts", + "src/big/f11.ts", + "src/big/f12.ts", + "src/big/f13.ts", + "src/big/f14.ts", + "src/big/f16.ts", + "src/big/f17.ts", + "src/big/f18.ts", + "src/big/f19.ts", + "src/big/f20.ts", + "src/big/f21.ts", + "src/big/f22.ts", + "src/big/f23.ts", + "src/big/f24.ts", + "src/big/f25.ts", + "src/big/f26.ts", + "src/big/f27.ts", + "src/big/f28.ts", + "src/big/f29.ts", + "src/big/f30.ts", + "src/big/f31.ts", + "src/big/f32.ts", + "src/big/f33.ts", + "src/big/f34.ts", + "src/big/f35.ts", + "src/big/f36.ts", + "src/big/f37.ts", + "src/big/f38.ts", + "src/big/f39.ts" + ], + "src/big/f16.ts": [ + "src/big/f0.ts", + "src/big/f1.ts", + "src/big/f2.ts", + "src/big/f3.ts", + "src/big/f4.ts", + "src/big/f5.ts", + "src/big/f6.ts", + "src/big/f7.ts", + "src/big/f8.ts", + "src/big/f9.ts", + "src/big/f10.ts", + "src/big/f11.ts", + "src/big/f12.ts", + "src/big/f13.ts", + "src/big/f14.ts", + "src/big/f15.ts", + "src/big/f17.ts", + "src/big/f18.ts", + "src/big/f19.ts", + "src/big/f20.ts", + "src/big/f21.ts", + "src/big/f22.ts", + "src/big/f23.ts", + "src/big/f24.ts", + "src/big/f25.ts", + "src/big/f26.ts", + "src/big/f27.ts", + "src/big/f28.ts", + "src/big/f29.ts", + "src/big/f30.ts", + "src/big/f31.ts", + "src/big/f32.ts", + "src/big/f33.ts", + "src/big/f34.ts", + "src/big/f35.ts", + "src/big/f36.ts", + "src/big/f37.ts", + 
"src/big/f38.ts", + "src/big/f39.ts" + ], + "src/big/f17.ts": [ + "src/big/f0.ts", + "src/big/f1.ts", + "src/big/f2.ts", + "src/big/f3.ts", + "src/big/f4.ts", + "src/big/f5.ts", + "src/big/f6.ts", + "src/big/f7.ts", + "src/big/f8.ts", + "src/big/f9.ts", + "src/big/f10.ts", + "src/big/f11.ts", + "src/big/f12.ts", + "src/big/f13.ts", + "src/big/f14.ts", + "src/big/f15.ts", + "src/big/f16.ts", + "src/big/f18.ts", + "src/big/f19.ts", + "src/big/f20.ts", + "src/big/f21.ts", + "src/big/f22.ts", + "src/big/f23.ts", + "src/big/f24.ts", + "src/big/f25.ts", + "src/big/f26.ts", + "src/big/f27.ts", + "src/big/f28.ts", + "src/big/f29.ts", + "src/big/f30.ts", + "src/big/f31.ts", + "src/big/f32.ts", + "src/big/f33.ts", + "src/big/f34.ts", + "src/big/f35.ts", + "src/big/f36.ts", + "src/big/f37.ts", + "src/big/f38.ts", + "src/big/f39.ts" + ], + "src/big/f18.ts": [ + "src/big/f0.ts", + "src/big/f1.ts", + "src/big/f2.ts", + "src/big/f3.ts", + "src/big/f4.ts", + "src/big/f5.ts", + "src/big/f6.ts", + "src/big/f7.ts", + "src/big/f8.ts", + "src/big/f9.ts", + "src/big/f10.ts", + "src/big/f11.ts", + "src/big/f12.ts", + "src/big/f13.ts", + "src/big/f14.ts", + "src/big/f15.ts", + "src/big/f16.ts", + "src/big/f17.ts", + "src/big/f19.ts", + "src/big/f20.ts", + "src/big/f21.ts", + "src/big/f22.ts", + "src/big/f23.ts", + "src/big/f24.ts", + "src/big/f25.ts", + "src/big/f26.ts", + "src/big/f27.ts", + "src/big/f28.ts", + "src/big/f29.ts", + "src/big/f30.ts", + "src/big/f31.ts", + "src/big/f32.ts", + "src/big/f33.ts", + "src/big/f34.ts", + "src/big/f35.ts", + "src/big/f36.ts", + "src/big/f37.ts", + "src/big/f38.ts", + "src/big/f39.ts" + ], + "src/big/f19.ts": [ + "src/big/f0.ts", + "src/big/f1.ts", + "src/big/f2.ts", + "src/big/f3.ts", + "src/big/f4.ts", + "src/big/f5.ts", + "src/big/f6.ts", + "src/big/f7.ts", + "src/big/f8.ts", + "src/big/f9.ts", + "src/big/f10.ts", + "src/big/f11.ts", + "src/big/f12.ts", + "src/big/f13.ts", + "src/big/f14.ts", + "src/big/f15.ts", + "src/big/f16.ts", + 
"src/big/f17.ts", + "src/big/f18.ts", + "src/big/f20.ts", + "src/big/f21.ts", + "src/big/f22.ts", + "src/big/f23.ts", + "src/big/f24.ts", + "src/big/f25.ts", + "src/big/f26.ts", + "src/big/f27.ts", + "src/big/f28.ts", + "src/big/f29.ts", + "src/big/f30.ts", + "src/big/f31.ts", + "src/big/f32.ts", + "src/big/f33.ts", + "src/big/f34.ts", + "src/big/f35.ts", + "src/big/f36.ts", + "src/big/f37.ts", + "src/big/f38.ts", + "src/big/f39.ts" + ], + "src/big/f20.ts": [ + "src/big/f0.ts", + "src/big/f1.ts", + "src/big/f2.ts", + "src/big/f3.ts", + "src/big/f4.ts", + "src/big/f5.ts", + "src/big/f6.ts", + "src/big/f7.ts", + "src/big/f8.ts", + "src/big/f9.ts", + "src/big/f10.ts", + "src/big/f11.ts", + "src/big/f12.ts", + "src/big/f13.ts", + "src/big/f14.ts", + "src/big/f15.ts", + "src/big/f16.ts", + "src/big/f17.ts", + "src/big/f18.ts", + "src/big/f19.ts", + "src/big/f21.ts", + "src/big/f22.ts", + "src/big/f23.ts", + "src/big/f24.ts", + "src/big/f25.ts", + "src/big/f26.ts", + "src/big/f27.ts", + "src/big/f28.ts", + "src/big/f29.ts", + "src/big/f30.ts", + "src/big/f31.ts", + "src/big/f32.ts", + "src/big/f33.ts", + "src/big/f34.ts", + "src/big/f35.ts", + "src/big/f36.ts", + "src/big/f37.ts", + "src/big/f38.ts", + "src/big/f39.ts" + ], + "src/big/f21.ts": [ + "src/big/f0.ts", + "src/big/f1.ts", + "src/big/f2.ts", + "src/big/f3.ts", + "src/big/f4.ts", + "src/big/f5.ts", + "src/big/f6.ts", + "src/big/f7.ts", + "src/big/f8.ts", + "src/big/f9.ts", + "src/big/f10.ts", + "src/big/f11.ts", + "src/big/f12.ts", + "src/big/f13.ts", + "src/big/f14.ts", + "src/big/f15.ts", + "src/big/f16.ts", + "src/big/f17.ts", + "src/big/f18.ts", + "src/big/f19.ts", + "src/big/f20.ts", + "src/big/f22.ts", + "src/big/f23.ts", + "src/big/f24.ts", + "src/big/f25.ts", + "src/big/f26.ts", + "src/big/f27.ts", + "src/big/f28.ts", + "src/big/f29.ts", + "src/big/f30.ts", + "src/big/f31.ts", + "src/big/f32.ts", + "src/big/f33.ts", + "src/big/f34.ts", + "src/big/f35.ts", + "src/big/f36.ts", + "src/big/f37.ts", + 
"src/big/f38.ts", + "src/big/f39.ts" + ], + "src/big/f22.ts": [ + "src/big/f0.ts", + "src/big/f1.ts", + "src/big/f2.ts", + "src/big/f3.ts", + "src/big/f4.ts", + "src/big/f5.ts", + "src/big/f6.ts", + "src/big/f7.ts", + "src/big/f8.ts", + "src/big/f9.ts", + "src/big/f10.ts", + "src/big/f11.ts", + "src/big/f12.ts", + "src/big/f13.ts", + "src/big/f14.ts", + "src/big/f15.ts", + "src/big/f16.ts", + "src/big/f17.ts", + "src/big/f18.ts", + "src/big/f19.ts", + "src/big/f20.ts", + "src/big/f21.ts", + "src/big/f23.ts", + "src/big/f24.ts", + "src/big/f25.ts", + "src/big/f26.ts", + "src/big/f27.ts", + "src/big/f28.ts", + "src/big/f29.ts", + "src/big/f30.ts", + "src/big/f31.ts", + "src/big/f32.ts", + "src/big/f33.ts", + "src/big/f34.ts", + "src/big/f35.ts", + "src/big/f36.ts", + "src/big/f37.ts", + "src/big/f38.ts", + "src/big/f39.ts" + ], + "src/big/f23.ts": [ + "src/big/f0.ts", + "src/big/f1.ts", + "src/big/f2.ts", + "src/big/f3.ts", + "src/big/f4.ts", + "src/big/f5.ts", + "src/big/f6.ts", + "src/big/f7.ts", + "src/big/f8.ts", + "src/big/f9.ts", + "src/big/f10.ts", + "src/big/f11.ts", + "src/big/f12.ts", + "src/big/f13.ts", + "src/big/f14.ts", + "src/big/f15.ts", + "src/big/f16.ts", + "src/big/f17.ts", + "src/big/f18.ts", + "src/big/f19.ts", + "src/big/f20.ts", + "src/big/f21.ts", + "src/big/f22.ts", + "src/big/f24.ts", + "src/big/f25.ts", + "src/big/f26.ts", + "src/big/f27.ts", + "src/big/f28.ts", + "src/big/f29.ts", + "src/big/f30.ts", + "src/big/f31.ts", + "src/big/f32.ts", + "src/big/f33.ts", + "src/big/f34.ts", + "src/big/f35.ts", + "src/big/f36.ts", + "src/big/f37.ts", + "src/big/f38.ts", + "src/big/f39.ts" + ], + "src/big/f24.ts": [ + "src/big/f0.ts", + "src/big/f1.ts", + "src/big/f2.ts", + "src/big/f3.ts", + "src/big/f4.ts", + "src/big/f5.ts", + "src/big/f6.ts", + "src/big/f7.ts", + "src/big/f8.ts", + "src/big/f9.ts", + "src/big/f10.ts", + "src/big/f11.ts", + "src/big/f12.ts", + "src/big/f13.ts", + "src/big/f14.ts", + "src/big/f15.ts", + "src/big/f16.ts", + 
"src/big/f17.ts", + "src/big/f18.ts", + "src/big/f19.ts", + "src/big/f20.ts", + "src/big/f21.ts", + "src/big/f22.ts", + "src/big/f23.ts", + "src/big/f25.ts", + "src/big/f26.ts", + "src/big/f27.ts", + "src/big/f28.ts", + "src/big/f29.ts", + "src/big/f30.ts", + "src/big/f31.ts", + "src/big/f32.ts", + "src/big/f33.ts", + "src/big/f34.ts", + "src/big/f35.ts", + "src/big/f36.ts", + "src/big/f37.ts", + "src/big/f38.ts", + "src/big/f39.ts" + ], + "src/big/f25.ts": [ + "src/big/f0.ts", + "src/big/f1.ts", + "src/big/f2.ts", + "src/big/f3.ts", + "src/big/f4.ts", + "src/big/f5.ts", + "src/big/f6.ts", + "src/big/f7.ts", + "src/big/f8.ts", + "src/big/f9.ts", + "src/big/f10.ts", + "src/big/f11.ts", + "src/big/f12.ts", + "src/big/f13.ts", + "src/big/f14.ts", + "src/big/f15.ts", + "src/big/f16.ts", + "src/big/f17.ts", + "src/big/f18.ts", + "src/big/f19.ts", + "src/big/f20.ts", + "src/big/f21.ts", + "src/big/f22.ts", + "src/big/f23.ts", + "src/big/f24.ts", + "src/big/f26.ts", + "src/big/f27.ts", + "src/big/f28.ts", + "src/big/f29.ts", + "src/big/f30.ts", + "src/big/f31.ts", + "src/big/f32.ts", + "src/big/f33.ts", + "src/big/f34.ts", + "src/big/f35.ts", + "src/big/f36.ts", + "src/big/f37.ts", + "src/big/f38.ts", + "src/big/f39.ts" + ], + "src/big/f26.ts": [ + "src/big/f0.ts", + "src/big/f1.ts", + "src/big/f2.ts", + "src/big/f3.ts", + "src/big/f4.ts", + "src/big/f5.ts", + "src/big/f6.ts", + "src/big/f7.ts", + "src/big/f8.ts", + "src/big/f9.ts", + "src/big/f10.ts", + "src/big/f11.ts", + "src/big/f12.ts", + "src/big/f13.ts", + "src/big/f14.ts", + "src/big/f15.ts", + "src/big/f16.ts", + "src/big/f17.ts", + "src/big/f18.ts", + "src/big/f19.ts", + "src/big/f20.ts", + "src/big/f21.ts", + "src/big/f22.ts", + "src/big/f23.ts", + "src/big/f24.ts", + "src/big/f25.ts", + "src/big/f27.ts", + "src/big/f28.ts", + "src/big/f29.ts", + "src/big/f30.ts", + "src/big/f31.ts", + "src/big/f32.ts", + "src/big/f33.ts", + "src/big/f34.ts", + "src/big/f35.ts", + "src/big/f36.ts", + "src/big/f37.ts", + 
"src/big/f38.ts", + "src/big/f39.ts" + ], + "src/big/f27.ts": [ + "src/big/f0.ts", + "src/big/f1.ts", + "src/big/f2.ts", + "src/big/f3.ts", + "src/big/f4.ts", + "src/big/f5.ts", + "src/big/f6.ts", + "src/big/f7.ts", + "src/big/f8.ts", + "src/big/f9.ts", + "src/big/f10.ts", + "src/big/f11.ts", + "src/big/f12.ts", + "src/big/f13.ts", + "src/big/f14.ts", + "src/big/f15.ts", + "src/big/f16.ts", + "src/big/f17.ts", + "src/big/f18.ts", + "src/big/f19.ts", + "src/big/f20.ts", + "src/big/f21.ts", + "src/big/f22.ts", + "src/big/f23.ts", + "src/big/f24.ts", + "src/big/f25.ts", + "src/big/f26.ts", + "src/big/f28.ts", + "src/big/f29.ts", + "src/big/f30.ts", + "src/big/f31.ts", + "src/big/f32.ts", + "src/big/f33.ts", + "src/big/f34.ts", + "src/big/f35.ts", + "src/big/f36.ts", + "src/big/f37.ts", + "src/big/f38.ts", + "src/big/f39.ts" + ], + "src/big/f28.ts": [ + "src/big/f0.ts", + "src/big/f1.ts", + "src/big/f2.ts", + "src/big/f3.ts", + "src/big/f4.ts", + "src/big/f5.ts", + "src/big/f6.ts", + "src/big/f7.ts", + "src/big/f8.ts", + "src/big/f9.ts", + "src/big/f10.ts", + "src/big/f11.ts", + "src/big/f12.ts", + "src/big/f13.ts", + "src/big/f14.ts", + "src/big/f15.ts", + "src/big/f16.ts", + "src/big/f17.ts", + "src/big/f18.ts", + "src/big/f19.ts", + "src/big/f20.ts", + "src/big/f21.ts", + "src/big/f22.ts", + "src/big/f23.ts", + "src/big/f24.ts", + "src/big/f25.ts", + "src/big/f26.ts", + "src/big/f27.ts", + "src/big/f29.ts", + "src/big/f30.ts", + "src/big/f31.ts", + "src/big/f32.ts", + "src/big/f33.ts", + "src/big/f34.ts", + "src/big/f35.ts", + "src/big/f36.ts", + "src/big/f37.ts", + "src/big/f38.ts", + "src/big/f39.ts" + ], + "src/big/f29.ts": [ + "src/big/f0.ts", + "src/big/f1.ts", + "src/big/f2.ts", + "src/big/f3.ts", + "src/big/f4.ts", + "src/big/f5.ts", + "src/big/f6.ts", + "src/big/f7.ts", + "src/big/f8.ts", + "src/big/f9.ts", + "src/big/f10.ts", + "src/big/f11.ts", + "src/big/f12.ts", + "src/big/f13.ts", + "src/big/f14.ts", + "src/big/f15.ts", + "src/big/f16.ts", + 
"src/big/f17.ts", + "src/big/f18.ts", + "src/big/f19.ts", + "src/big/f20.ts", + "src/big/f21.ts", + "src/big/f22.ts", + "src/big/f23.ts", + "src/big/f24.ts", + "src/big/f25.ts", + "src/big/f26.ts", + "src/big/f27.ts", + "src/big/f28.ts", + "src/big/f30.ts", + "src/big/f31.ts", + "src/big/f32.ts", + "src/big/f33.ts", + "src/big/f34.ts", + "src/big/f35.ts", + "src/big/f36.ts", + "src/big/f37.ts", + "src/big/f38.ts", + "src/big/f39.ts" + ], + "src/big/f30.ts": [ + "src/big/f0.ts", + "src/big/f1.ts", + "src/big/f2.ts", + "src/big/f3.ts", + "src/big/f4.ts", + "src/big/f5.ts", + "src/big/f6.ts", + "src/big/f7.ts", + "src/big/f8.ts", + "src/big/f9.ts", + "src/big/f10.ts", + "src/big/f11.ts", + "src/big/f12.ts", + "src/big/f13.ts", + "src/big/f14.ts", + "src/big/f15.ts", + "src/big/f16.ts", + "src/big/f17.ts", + "src/big/f18.ts", + "src/big/f19.ts", + "src/big/f20.ts", + "src/big/f21.ts", + "src/big/f22.ts", + "src/big/f23.ts", + "src/big/f24.ts", + "src/big/f25.ts", + "src/big/f26.ts", + "src/big/f27.ts", + "src/big/f28.ts", + "src/big/f29.ts", + "src/big/f31.ts", + "src/big/f32.ts", + "src/big/f33.ts", + "src/big/f34.ts", + "src/big/f35.ts", + "src/big/f36.ts", + "src/big/f37.ts", + "src/big/f38.ts", + "src/big/f39.ts" + ], + "src/big/f31.ts": [ + "src/big/f0.ts", + "src/big/f1.ts", + "src/big/f2.ts", + "src/big/f3.ts", + "src/big/f4.ts", + "src/big/f5.ts", + "src/big/f6.ts", + "src/big/f7.ts", + "src/big/f8.ts", + "src/big/f9.ts", + "src/big/f10.ts", + "src/big/f11.ts", + "src/big/f12.ts", + "src/big/f13.ts", + "src/big/f14.ts", + "src/big/f15.ts", + "src/big/f16.ts", + "src/big/f17.ts", + "src/big/f18.ts", + "src/big/f19.ts", + "src/big/f20.ts", + "src/big/f21.ts", + "src/big/f22.ts", + "src/big/f23.ts", + "src/big/f24.ts", + "src/big/f25.ts", + "src/big/f26.ts", + "src/big/f27.ts", + "src/big/f28.ts", + "src/big/f29.ts", + "src/big/f30.ts", + "src/big/f32.ts", + "src/big/f33.ts", + "src/big/f34.ts", + "src/big/f35.ts", + "src/big/f36.ts", + "src/big/f37.ts", + 
"src/big/f38.ts", + "src/big/f39.ts" + ], + "src/big/f32.ts": [ + "src/big/f0.ts", + "src/big/f1.ts", + "src/big/f2.ts", + "src/big/f3.ts", + "src/big/f4.ts", + "src/big/f5.ts", + "src/big/f6.ts", + "src/big/f7.ts", + "src/big/f8.ts", + "src/big/f9.ts", + "src/big/f10.ts", + "src/big/f11.ts", + "src/big/f12.ts", + "src/big/f13.ts", + "src/big/f14.ts", + "src/big/f15.ts", + "src/big/f16.ts", + "src/big/f17.ts", + "src/big/f18.ts", + "src/big/f19.ts", + "src/big/f20.ts", + "src/big/f21.ts", + "src/big/f22.ts", + "src/big/f23.ts", + "src/big/f24.ts", + "src/big/f25.ts", + "src/big/f26.ts", + "src/big/f27.ts", + "src/big/f28.ts", + "src/big/f29.ts", + "src/big/f30.ts", + "src/big/f31.ts", + "src/big/f33.ts", + "src/big/f34.ts", + "src/big/f35.ts", + "src/big/f36.ts", + "src/big/f37.ts", + "src/big/f38.ts", + "src/big/f39.ts" + ], + "src/big/f33.ts": [ + "src/big/f0.ts", + "src/big/f1.ts", + "src/big/f2.ts", + "src/big/f3.ts", + "src/big/f4.ts", + "src/big/f5.ts", + "src/big/f6.ts", + "src/big/f7.ts", + "src/big/f8.ts", + "src/big/f9.ts", + "src/big/f10.ts", + "src/big/f11.ts", + "src/big/f12.ts", + "src/big/f13.ts", + "src/big/f14.ts", + "src/big/f15.ts", + "src/big/f16.ts", + "src/big/f17.ts", + "src/big/f18.ts", + "src/big/f19.ts", + "src/big/f20.ts", + "src/big/f21.ts", + "src/big/f22.ts", + "src/big/f23.ts", + "src/big/f24.ts", + "src/big/f25.ts", + "src/big/f26.ts", + "src/big/f27.ts", + "src/big/f28.ts", + "src/big/f29.ts", + "src/big/f30.ts", + "src/big/f31.ts", + "src/big/f32.ts", + "src/big/f34.ts", + "src/big/f35.ts", + "src/big/f36.ts", + "src/big/f37.ts", + "src/big/f38.ts", + "src/big/f39.ts" + ], + "src/big/f34.ts": [ + "src/big/f0.ts", + "src/big/f1.ts", + "src/big/f2.ts", + "src/big/f3.ts", + "src/big/f4.ts", + "src/big/f5.ts", + "src/big/f6.ts", + "src/big/f7.ts", + "src/big/f8.ts", + "src/big/f9.ts", + "src/big/f10.ts", + "src/big/f11.ts", + "src/big/f12.ts", + "src/big/f13.ts", + "src/big/f14.ts", + "src/big/f15.ts", + "src/big/f16.ts", + 
"src/big/f17.ts", + "src/big/f18.ts", + "src/big/f19.ts", + "src/big/f20.ts", + "src/big/f21.ts", + "src/big/f22.ts", + "src/big/f23.ts", + "src/big/f24.ts", + "src/big/f25.ts", + "src/big/f26.ts", + "src/big/f27.ts", + "src/big/f28.ts", + "src/big/f29.ts", + "src/big/f30.ts", + "src/big/f31.ts", + "src/big/f32.ts", + "src/big/f33.ts", + "src/big/f35.ts", + "src/big/f36.ts", + "src/big/f37.ts", + "src/big/f38.ts", + "src/big/f39.ts" + ], + "src/big/f35.ts": [ + "src/big/f0.ts", + "src/big/f1.ts", + "src/big/f2.ts", + "src/big/f3.ts", + "src/big/f4.ts", + "src/big/f5.ts", + "src/big/f6.ts", + "src/big/f7.ts", + "src/big/f8.ts", + "src/big/f9.ts", + "src/big/f10.ts", + "src/big/f11.ts", + "src/big/f12.ts", + "src/big/f13.ts", + "src/big/f14.ts", + "src/big/f15.ts", + "src/big/f16.ts", + "src/big/f17.ts", + "src/big/f18.ts", + "src/big/f19.ts", + "src/big/f20.ts", + "src/big/f21.ts", + "src/big/f22.ts", + "src/big/f23.ts", + "src/big/f24.ts", + "src/big/f25.ts", + "src/big/f26.ts", + "src/big/f27.ts", + "src/big/f28.ts", + "src/big/f29.ts", + "src/big/f30.ts", + "src/big/f31.ts", + "src/big/f32.ts", + "src/big/f33.ts", + "src/big/f34.ts", + "src/big/f36.ts", + "src/big/f37.ts", + "src/big/f38.ts", + "src/big/f39.ts" + ], + "src/big/f36.ts": [ + "src/big/f0.ts", + "src/big/f1.ts", + "src/big/f2.ts", + "src/big/f3.ts", + "src/big/f4.ts", + "src/big/f5.ts", + "src/big/f6.ts", + "src/big/f7.ts", + "src/big/f8.ts", + "src/big/f9.ts", + "src/big/f10.ts", + "src/big/f11.ts", + "src/big/f12.ts", + "src/big/f13.ts", + "src/big/f14.ts", + "src/big/f15.ts", + "src/big/f16.ts", + "src/big/f17.ts", + "src/big/f18.ts", + "src/big/f19.ts", + "src/big/f20.ts", + "src/big/f21.ts", + "src/big/f22.ts", + "src/big/f23.ts", + "src/big/f24.ts", + "src/big/f25.ts", + "src/big/f26.ts", + "src/big/f27.ts", + "src/big/f28.ts", + "src/big/f29.ts", + "src/big/f30.ts", + "src/big/f31.ts", + "src/big/f32.ts", + "src/big/f33.ts", + "src/big/f34.ts", + "src/big/f35.ts", + "src/big/f37.ts", + 
"src/big/f38.ts", + "src/big/f39.ts" + ], + "src/big/f37.ts": [ + "src/big/f0.ts", + "src/big/f1.ts", + "src/big/f2.ts", + "src/big/f3.ts", + "src/big/f4.ts", + "src/big/f5.ts", + "src/big/f6.ts", + "src/big/f7.ts", + "src/big/f8.ts", + "src/big/f9.ts", + "src/big/f10.ts", + "src/big/f11.ts", + "src/big/f12.ts", + "src/big/f13.ts", + "src/big/f14.ts", + "src/big/f15.ts", + "src/big/f16.ts", + "src/big/f17.ts", + "src/big/f18.ts", + "src/big/f19.ts", + "src/big/f20.ts", + "src/big/f21.ts", + "src/big/f22.ts", + "src/big/f23.ts", + "src/big/f24.ts", + "src/big/f25.ts", + "src/big/f26.ts", + "src/big/f27.ts", + "src/big/f28.ts", + "src/big/f29.ts", + "src/big/f30.ts", + "src/big/f31.ts", + "src/big/f32.ts", + "src/big/f33.ts", + "src/big/f34.ts", + "src/big/f35.ts", + "src/big/f36.ts", + "src/big/f38.ts", + "src/big/f39.ts" + ], + "src/big/f38.ts": [ + "src/big/f0.ts", + "src/big/f1.ts", + "src/big/f2.ts", + "src/big/f3.ts", + "src/big/f4.ts", + "src/big/f5.ts", + "src/big/f6.ts", + "src/big/f7.ts", + "src/big/f8.ts", + "src/big/f9.ts", + "src/big/f10.ts", + "src/big/f11.ts", + "src/big/f12.ts", + "src/big/f13.ts", + "src/big/f14.ts", + "src/big/f15.ts", + "src/big/f16.ts", + "src/big/f17.ts", + "src/big/f18.ts", + "src/big/f19.ts", + "src/big/f20.ts", + "src/big/f21.ts", + "src/big/f22.ts", + "src/big/f23.ts", + "src/big/f24.ts", + "src/big/f25.ts", + "src/big/f26.ts", + "src/big/f27.ts", + "src/big/f28.ts", + "src/big/f29.ts", + "src/big/f30.ts", + "src/big/f31.ts", + "src/big/f32.ts", + "src/big/f33.ts", + "src/big/f34.ts", + "src/big/f35.ts", + "src/big/f36.ts", + "src/big/f37.ts", + "src/big/f39.ts" + ], + "src/big/f39.ts": [ + "src/big/f0.ts", + "src/big/f1.ts", + "src/big/f2.ts", + "src/big/f3.ts", + "src/big/f4.ts", + "src/big/f5.ts", + "src/big/f6.ts", + "src/big/f7.ts", + "src/big/f8.ts", + "src/big/f9.ts", + "src/big/f10.ts", + "src/big/f11.ts", + "src/big/f12.ts", + "src/big/f13.ts", + "src/big/f14.ts", + "src/big/f15.ts", + "src/big/f16.ts", + 
"src/big/f17.ts", + "src/big/f18.ts", + "src/big/f19.ts", + "src/big/f20.ts", + "src/big/f21.ts", + "src/big/f22.ts", + "src/big/f23.ts", + "src/big/f24.ts", + "src/big/f25.ts", + "src/big/f26.ts", + "src/big/f27.ts", + "src/big/f28.ts", + "src/big/f29.ts", + "src/big/f30.ts", + "src/big/f31.ts", + "src/big/f32.ts", + "src/big/f33.ts", + "src/big/f34.ts", + "src/big/f35.ts", + "src/big/f36.ts", + "src/big/f37.ts", + "src/big/f38.ts" + ] + } +} diff --git a/tests/skill/understand/fixtures/scan-result-merge-respects-non-mergeable.json b/tests/skill/understand/fixtures/scan-result-merge-respects-non-mergeable.json new file mode 100644 index 0000000..614b6f5 --- /dev/null +++ b/tests/skill/understand/fixtures/scan-result-merge-respects-non-mergeable.json @@ -0,0 +1,233 @@ +{ + "name": "fixture-merge-respects-non-mergeable", + "description": "Regression guard for mergeSmallBatches: a small non-mergeable batch (Dockerfile cluster, marked mergeable=false by buildNonCodeBatches Group A) must NOT be pooled into the misc bucket alongside isolated code singletons, even though its size (1) is well below MIN_BATCH_SIZE=3. 
Pooling Dockerfiles into misc would destroy the semantic atom — an LLM analyzing the misc batch loses the per-service infra context.", + "languages": [ + "typescript", + "dockerfile" + ], + "frameworks": [], + "files": [ + { + "path": "src/leaf000.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf001.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf002.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf003.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf004.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf005.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf006.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf007.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf008.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf009.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf010.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf011.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf012.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf013.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf014.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf015.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf016.ts", + 
"language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf017.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf018.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf019.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf020.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf021.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf022.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf023.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf024.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf025.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf026.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf027.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf028.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf029.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "services/api/Dockerfile", + "language": "dockerfile", + "sizeLines": 18, + "fileCategory": "infra" + } + ], + "totalFiles": 31, + "filteredByIgnore": 0, + "estimatedComplexity": "moderate", + "importMap": { + "src/leaf000.ts": [], + "src/leaf001.ts": [], + "src/leaf002.ts": [], + "src/leaf003.ts": [], + "src/leaf004.ts": [], + "src/leaf005.ts": [], + "src/leaf006.ts": [], + "src/leaf007.ts": [], + "src/leaf008.ts": [], + "src/leaf009.ts": [], + "src/leaf010.ts": [], + "src/leaf011.ts": 
[], + "src/leaf012.ts": [], + "src/leaf013.ts": [], + "src/leaf014.ts": [], + "src/leaf015.ts": [], + "src/leaf016.ts": [], + "src/leaf017.ts": [], + "src/leaf018.ts": [], + "src/leaf019.ts": [], + "src/leaf020.ts": [], + "src/leaf021.ts": [], + "src/leaf022.ts": [], + "src/leaf023.ts": [], + "src/leaf024.ts": [], + "src/leaf025.ts": [], + "src/leaf026.ts": [], + "src/leaf027.ts": [], + "src/leaf028.ts": [], + "src/leaf029.ts": [], + "services/api/Dockerfile": [] + } +} \ No newline at end of file diff --git a/tests/skill/understand/fixtures/scan-result-non-code.json b/tests/skill/understand/fixtures/scan-result-non-code.json new file mode 100644 index 0000000..9867a71 --- /dev/null +++ b/tests/skill/understand/fixtures/scan-result-non-code.json @@ -0,0 +1,38 @@ +{ + "name": "fixture-non-code", + "description": "Mix of non-code files exercising Groups A-E. The src/ clique has 3 mutually-importing files so it survives merge-small (size >= MIN_BATCH_SIZE=3) and stays a pure-code batch — required by the 'non-code batch indices follow code batches' assertion.", + "languages": ["typescript", "dockerfile", "yaml", "sql", "markdown"], + "frameworks": [], + "files": [ + {"path": "src/index.ts", "language": "typescript", "sizeLines": 10, "fileCategory": "code"}, + {"path": "src/server.ts", "language": "typescript", "sizeLines": 15, "fileCategory": "code"}, + {"path": "src/router.ts", "language": "typescript", "sizeLines": 12, "fileCategory": "code"}, + {"path": "Dockerfile", "language": "dockerfile", "sizeLines": 20, "fileCategory": "infra"}, + {"path": "docker-compose.yml", "language": "yaml", "sizeLines": 15, "fileCategory": "infra"}, + {"path": ".dockerignore", "language": "config", "sizeLines": 5, "fileCategory": "config"}, + {"path": "services/api/Dockerfile", "language": "dockerfile", "sizeLines": 18, "fileCategory": "infra"}, + {"path": "services/api/docker-compose.yml", "language": "yaml", "sizeLines": 12, "fileCategory": "infra"}, + {"path": 
".github/workflows/ci.yml", "language": "yaml", "sizeLines": 30, "fileCategory": "infra"}, + {"path": ".github/workflows/deploy.yml", "language": "yaml", "sizeLines": 25, "fileCategory": "infra"}, + {"path": ".gitlab-ci.yml", "language": "yaml", "sizeLines": 20, "fileCategory": "infra"}, + {"path": ".circleci/config.yml", "language": "yaml", "sizeLines": 25, "fileCategory": "infra"}, + {"path": "migrations/001_init.sql", "language": "sql", "sizeLines": 40, "fileCategory": "data"}, + {"path": "migrations/002_users.sql", "language": "sql", "sizeLines": 20, "fileCategory": "data"}, + {"path": "docs/getting-started.md", "language": "markdown", "sizeLines": 100, "fileCategory": "docs"}, + {"path": "README.md", "language": "markdown", "sizeLines": 200, "fileCategory": "docs"} + ], + "totalFiles": 16, + "filteredByIgnore": 0, + "estimatedComplexity": "small", + "importMap": { + "src/index.ts": ["src/server.ts", "src/router.ts"], + "src/server.ts": ["src/router.ts"], + "src/router.ts": [], + "Dockerfile": [], "docker-compose.yml": [], ".dockerignore": [], + "services/api/Dockerfile": [], "services/api/docker-compose.yml": [], + ".github/workflows/ci.yml": [], ".github/workflows/deploy.yml": [], + ".gitlab-ci.yml": [], ".circleci/config.yml": [], + "migrations/001_init.sql": [], "migrations/002_users.sql": [], + "docs/getting-started.md": [], "README.md": [] + } +} diff --git a/tests/skill/understand/fixtures/scan-result-singletons.json b/tests/skill/understand/fixtures/scan-result-singletons.json new file mode 100644 index 0000000..f7f7c99 --- /dev/null +++ b/tests/skill/understand/fixtures/scan-result-singletons.json @@ -0,0 +1,715 @@ +{ + "name": "fixture-singletons", + "description": "100 isolated TS files that should merge into ~4 misc batches", + "languages": [ + "typescript" + ], + "frameworks": [], + "files": [ + { + "path": "src/leaf000.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf001.ts", + "language": 
"typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf002.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf003.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf004.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf005.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf006.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf007.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf008.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf009.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf010.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf011.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf012.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf013.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf014.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf015.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf016.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf017.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf018.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf019.ts", + "language": "typescript", + "sizeLines": 10, + 
"fileCategory": "code" + }, + { + "path": "src/leaf020.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf021.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf022.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf023.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf024.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf025.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf026.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf027.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf028.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf029.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf030.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf031.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf032.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf033.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf034.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf035.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf036.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf037.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + 
"path": "src/leaf038.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf039.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf040.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf041.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf042.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf043.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf044.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf045.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf046.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf047.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf048.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf049.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf050.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf051.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf052.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf053.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf054.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf055.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf056.ts", + 
"language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf057.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf058.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf059.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf060.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf061.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf062.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf063.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf064.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf065.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf066.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf067.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf068.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf069.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf070.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf071.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf072.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf073.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf074.ts", + "language": "typescript", + 
"sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf075.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf076.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf077.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf078.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf079.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf080.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf081.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf082.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf083.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf084.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf085.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf086.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf087.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf088.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf089.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf090.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf091.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf092.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": 
"code" + }, + { + "path": "src/leaf093.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf094.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf095.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf096.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf097.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf098.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + }, + { + "path": "src/leaf099.ts", + "language": "typescript", + "sizeLines": 10, + "fileCategory": "code" + } + ], + "totalFiles": 100, + "filteredByIgnore": 0, + "estimatedComplexity": "moderate", + "importMap": { + "src/leaf000.ts": [], + "src/leaf001.ts": [], + "src/leaf002.ts": [], + "src/leaf003.ts": [], + "src/leaf004.ts": [], + "src/leaf005.ts": [], + "src/leaf006.ts": [], + "src/leaf007.ts": [], + "src/leaf008.ts": [], + "src/leaf009.ts": [], + "src/leaf010.ts": [], + "src/leaf011.ts": [], + "src/leaf012.ts": [], + "src/leaf013.ts": [], + "src/leaf014.ts": [], + "src/leaf015.ts": [], + "src/leaf016.ts": [], + "src/leaf017.ts": [], + "src/leaf018.ts": [], + "src/leaf019.ts": [], + "src/leaf020.ts": [], + "src/leaf021.ts": [], + "src/leaf022.ts": [], + "src/leaf023.ts": [], + "src/leaf024.ts": [], + "src/leaf025.ts": [], + "src/leaf026.ts": [], + "src/leaf027.ts": [], + "src/leaf028.ts": [], + "src/leaf029.ts": [], + "src/leaf030.ts": [], + "src/leaf031.ts": [], + "src/leaf032.ts": [], + "src/leaf033.ts": [], + "src/leaf034.ts": [], + "src/leaf035.ts": [], + "src/leaf036.ts": [], + "src/leaf037.ts": [], + "src/leaf038.ts": [], + "src/leaf039.ts": [], + "src/leaf040.ts": [], + "src/leaf041.ts": [], + "src/leaf042.ts": [], + "src/leaf043.ts": [], + "src/leaf044.ts": [], + "src/leaf045.ts": [], + 
"src/leaf046.ts": [], + "src/leaf047.ts": [], + "src/leaf048.ts": [], + "src/leaf049.ts": [], + "src/leaf050.ts": [], + "src/leaf051.ts": [], + "src/leaf052.ts": [], + "src/leaf053.ts": [], + "src/leaf054.ts": [], + "src/leaf055.ts": [], + "src/leaf056.ts": [], + "src/leaf057.ts": [], + "src/leaf058.ts": [], + "src/leaf059.ts": [], + "src/leaf060.ts": [], + "src/leaf061.ts": [], + "src/leaf062.ts": [], + "src/leaf063.ts": [], + "src/leaf064.ts": [], + "src/leaf065.ts": [], + "src/leaf066.ts": [], + "src/leaf067.ts": [], + "src/leaf068.ts": [], + "src/leaf069.ts": [], + "src/leaf070.ts": [], + "src/leaf071.ts": [], + "src/leaf072.ts": [], + "src/leaf073.ts": [], + "src/leaf074.ts": [], + "src/leaf075.ts": [], + "src/leaf076.ts": [], + "src/leaf077.ts": [], + "src/leaf078.ts": [], + "src/leaf079.ts": [], + "src/leaf080.ts": [], + "src/leaf081.ts": [], + "src/leaf082.ts": [], + "src/leaf083.ts": [], + "src/leaf084.ts": [], + "src/leaf085.ts": [], + "src/leaf086.ts": [], + "src/leaf087.ts": [], + "src/leaf088.ts": [], + "src/leaf089.ts": [], + "src/leaf090.ts": [], + "src/leaf091.ts": [], + "src/leaf092.ts": [], + "src/leaf093.ts": [], + "src/leaf094.ts": [], + "src/leaf095.ts": [], + "src/leaf096.ts": [], + "src/leaf097.ts": [], + "src/leaf098.ts": [], + "src/leaf099.ts": [] + } +} diff --git a/tests/skill/understand/test_compute_batches.test.mjs b/tests/skill/understand/test_compute_batches.test.mjs new file mode 100644 index 0000000..14e09de --- /dev/null +++ b/tests/skill/understand/test_compute_batches.test.mjs @@ -0,0 +1,602 @@ +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import { mkdtempSync, mkdirSync, writeFileSync, readFileSync, rmSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { spawnSync } from 'node:child_process'; +import { fileURLToPath } from 'node:url'; +import { dirname, resolve } from 'node:path'; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const SCRIPT 
= resolve(__dirname, '../../../understand-anything-plugin/skills/understand/compute-batches.mjs'); +const FIXTURES = resolve(__dirname, 'fixtures'); + +function runScript(projectRoot, extraArgs = []) { + return spawnSync('node', [SCRIPT, projectRoot, ...extraArgs], { + encoding: 'utf-8', + }); +} + +function setupProject(fixtureName) { + const root = mkdtempSync(join(tmpdir(), 'ua-cb-test-')); + mkdirSync(join(root, '.understand-anything', 'intermediate'), { recursive: true }); + const fixturePath = join(FIXTURES, fixtureName); + const dest = join(root, '.understand-anything', 'intermediate', 'scan-result.json'); + writeFileSync(dest, readFileSync(fixturePath, 'utf-8')); + return root; +} + +function readBatches(projectRoot) { + const p = join(projectRoot, '.understand-anything', 'intermediate', 'batches.json'); + return JSON.parse(readFileSync(p, 'utf-8')); +} + +describe('compute-batches.mjs — Louvain basic', () => { + let projectRoot; + + beforeEach(() => { + projectRoot = setupProject('scan-result-3-cliques.json'); + }); + + afterEach(() => { + if (projectRoot) rmSync(projectRoot, { recursive: true, force: true }); + }); + + it('produces 3 batches for 3 disjoint cliques', () => { + const result = runScript(projectRoot); + expect(result.status).toBe(0); + + const batches = readBatches(projectRoot); + expect(batches.algorithm).toBe('louvain'); + expect(batches.totalFiles).toBe(9); + expect(batches.batches.length).toBe(3); + expect(batches.schemaVersion).toBe(1); + expect(batches.totalBatches).toBe(3); + expect(batches.batches.map(b => b.batchIndex)).toEqual([1, 2, 3]); + + // Each batch should contain exactly one clique (3 files) + for (const b of batches.batches) { + expect(b.files.length).toBe(3); + const dirs = new Set(b.files.map(f => f.path.split('/')[1])); + expect(dirs.size).toBe(1); // all files in the batch share one src/{clique}/ directory (second path segment) + } + }); + + it('produces deterministic output across runs', () => { + const r1 = runScript(projectRoot); + expect(r1.status).toBe(0); + 
const json1 = readFileSync( + join(projectRoot, '.understand-anything', 'intermediate', 'batches.json'), + 'utf-8', + ); + + const r2 = runScript(projectRoot); + expect(r2.status).toBe(0); + const json2 = readFileSync( + join(projectRoot, '.understand-anything', 'intermediate', 'batches.json'), + 'utf-8', + ); + + expect(json1).toBe(json2); + }); +}); + +describe('compute-batches.mjs — size enforcement', () => { + let projectRoot; + + beforeEach(() => { + projectRoot = setupProject('scan-result-large-community.json'); + }); + + afterEach(() => { + if (projectRoot) rmSync(projectRoot, { recursive: true, force: true }); + }); + + it('splits a 40-node clique into batches ≤ 35', () => { + const result = runScript(projectRoot); + expect(result.status).toBe(0); + + const batches = readBatches(projectRoot); + expect(batches.algorithm).toBe('louvain'); // confirm fallback didn't fire + expect(batches.totalFiles).toBe(40); + expect(batches.batches.length).toBe(2); + expect(batches.batches.map(b => b.files.length).sort()).toEqual([20, 20]); + // Sum of all batch file counts equals total files + const sum = batches.batches.reduce((acc, b) => acc + b.files.length, 0); + expect(sum).toBe(40); + // Warning was emitted to stderr + expect(result.stderr).toMatch(/Warning: compute-batches: community size 40 > max 35/); + }); +}); + +describe('compute-batches.mjs — exports extraction', () => { + let root; + + afterEach(() => { + if (root) rmSync(root, { recursive: true, force: true }); + }); + + it('populates exports for code files via tree-sitter', () => { + root = mkdtempSync(join(tmpdir(), 'ua-cb-exp-')); + mkdirSync(join(root, '.understand-anything', 'intermediate'), { recursive: true }); + mkdirSync(join(root, 'src'), { recursive: true }); + writeFileSync(join(root, 'src', 'a.ts'), + 'export function greet(name: string) { return "hi " + name; }\n' + + 'export class Greeter { greet(n: string) { return "hi " + n; } }\n'); + writeFileSync(join(root, 'src', 'b.ts'), + 'import { 
greet } from "./a";\nexport const helper = () => greet("world");\n'); + + const scan = { + name: 'exports-test', + description: '', + languages: ['typescript'], + frameworks: [], + files: [ + { path: 'src/a.ts', language: 'typescript', sizeLines: 2, fileCategory: 'code' }, + { path: 'src/b.ts', language: 'typescript', sizeLines: 2, fileCategory: 'code' }, + ], + totalFiles: 2, filteredByIgnore: 0, estimatedComplexity: 'small', + importMap: { 'src/a.ts': [], 'src/b.ts': ['src/a.ts'] }, + }; + writeFileSync( + join(root, '.understand-anything', 'intermediate', 'scan-result.json'), + JSON.stringify(scan)); + + const result = runScript(root); + expect(result.status).toBe(0); + + const batches = readBatches(root); + expect(batches.exportsByPath).toBeDefined(); + expect(batches.exportsByPath['src/a.ts']).toEqual( + expect.arrayContaining(['greet', 'Greeter'])); + expect(batches.exportsByPath['src/b.ts']).toEqual( + expect.arrayContaining(['helper'])); + }); + + it('emits warning when file is missing from disk (read error path)', () => { + root = mkdtempSync(join(tmpdir(), 'ua-cb-exp-err-')); + mkdirSync(join(root, '.understand-anything', 'intermediate'), { recursive: true }); + // Note: NOT creating the file on disk — scan-result.json references it, + // but the file doesn't exist, so the read branch fires. 
+ const scan = { + name: 'missing-file-test', + description: '', + languages: ['typescript'], + frameworks: [], + files: [ + { path: 'src/missing.ts', language: 'typescript', sizeLines: 1, fileCategory: 'code' }, + ], + totalFiles: 1, filteredByIgnore: 0, estimatedComplexity: 'small', + importMap: { 'src/missing.ts': [] }, + }; + writeFileSync( + join(root, '.understand-anything', 'intermediate', 'scan-result.json'), + JSON.stringify(scan)); + + const result = runScript(root); + expect(result.status).toBe(0); // script must still succeed + expect(result.stderr).toMatch( + /Warning: compute-batches: exports extraction failed for src\/missing\.ts \(read error:/); + + const batches = readBatches(root); + expect(batches.exportsByPath['src/missing.ts']).toEqual([]); + }); +}); + +describe('compute-batches.mjs — non-code grouping', () => { + let root; + let batches; + + beforeEach(() => { + root = setupProject('scan-result-non-code.json'); + const result = runScript(root); + expect(result.status).toBe(0); + batches = readBatches(root); + }); + + afterEach(() => { + if (root) rmSync(root, { recursive: true, force: true }); + }); + + it('Group A: bundles Dockerfile cluster per directory', () => { + // Root-level cluster: Dockerfile + docker-compose.yml + .dockerignore → one batch + const rootDockerBatch = batches.batches.find(b => + b.files.some(f => f.path === 'Dockerfile')); + expect(rootDockerBatch).toBeDefined(); + const paths = rootDockerBatch.files.map(f => f.path).sort(); + expect(paths).toEqual(['.dockerignore', 'Dockerfile', 'docker-compose.yml']); + + // services/api cluster is a separate batch + const apiDockerBatch = batches.batches.find(b => + b.files.some(f => f.path === 'services/api/Dockerfile')); + expect(apiDockerBatch).toBeDefined(); + expect(apiDockerBatch).not.toBe(rootDockerBatch); + expect(apiDockerBatch.files.map(f => f.path).sort()).toEqual([ + 'services/api/Dockerfile', 'services/api/docker-compose.yml', + ]); + }); + + it('Group B: 
.github/workflows/* all in one batch', () => { + const wfBatch = batches.batches.find(b => + b.files.some(f => f.path.startsWith('.github/workflows/'))); + expect(wfBatch).toBeDefined(); + const wfPaths = wfBatch.files.map(f => f.path).filter(p => p.startsWith('.github/workflows/')); + expect(wfPaths.sort()).toEqual([ + '.github/workflows/ci.yml', '.github/workflows/deploy.yml', + ]); + }); + + it('Group C: .gitlab-ci.yml + .circleci/* in one batch', () => { + const ciBatch = batches.batches.find(b => + b.files.some(f => f.path === '.gitlab-ci.yml')); + expect(ciBatch).toBeDefined(); + const ciPaths = ciBatch.files.map(f => f.path).sort(); + expect(ciPaths).toEqual(['.circleci/config.yml', '.gitlab-ci.yml']); + }); + + it('Group D: SQL migrations under migrations/ in one batch', () => { + const migBatch = batches.batches.find(b => + b.files.some(f => f.path.startsWith('migrations/'))); + expect(migBatch).toBeDefined(); + const migPaths = migBatch.files.map(f => f.path).filter(p => p.startsWith('migrations/')); + expect(migPaths.sort()).toEqual([ + 'migrations/001_init.sql', 'migrations/002_users.sql', + ]); + }); + + it('non-code batch indices follow code batches', () => { + const codeBatches = batches.batches.filter(b => + b.files.every(f => f.fileCategory === 'code')); + const nonCodeBatches = batches.batches.filter(b => + b.files.some(f => f.fileCategory !== 'code')); + expect(codeBatches.length).toBeGreaterThan(0); + expect(nonCodeBatches.length).toBeGreaterThan(0); + const maxCodeIdx = Math.max(...codeBatches.map(b => b.batchIndex)); + const minNonCodeIdx = Math.min(...nonCodeBatches.map(b => b.batchIndex)); + expect(minNonCodeIdx).toBeGreaterThan(maxCodeIdx); + }); +}); + +describe('compute-batches.mjs — Group E MAX_E split', () => { + let root; + + afterEach(() => { + if (root) rmSync(root, { recursive: true, force: true }); + }); + + it('splits 25 .md files under docs/ into [20, 5]', () => { + root = mkdtempSync(join(tmpdir(), 'ua-cb-maxe-')); + 
mkdirSync(join(root, '.understand-anything', 'intermediate'), { recursive: true }); + + const files = []; + const importMap = {}; + for (let i = 0; i < 25; i++) { + const p = `docs/page${String(i).padStart(2, '0')}.md`; + files.push({ path: p, language: 'markdown', sizeLines: 10, fileCategory: 'docs' }); + importMap[p] = []; + } + const scan = { + name: 'maxe-test', description: '', + languages: ['markdown'], frameworks: [], + files, totalFiles: 25, filteredByIgnore: 0, + estimatedComplexity: 'small', importMap, + }; + writeFileSync( + join(root, '.understand-anything', 'intermediate', 'scan-result.json'), + JSON.stringify(scan)); + + const result = runScript(root); + expect(result.status).toBe(0); + + const batches = readBatches(root); + // All 25 docs/ files go through Group E with MAX_E = 20, split into [20, 5]. + const docsBatches = batches.batches.filter(b => + b.files.every(f => f.path.startsWith('docs/'))); + expect(docsBatches.length).toBe(2); + const sizes = docsBatches.map(b => b.files.length).sort((a, b) => b - a); + expect(sizes).toEqual([20, 5]); + }); +}); + +describe('compute-batches.mjs — neighborMap + batchImportData', () => { + let batches; + let batchOf; // path → batchIndex + let projectRoot; + + beforeEach(() => { + projectRoot = setupProject('scan-result-3-cliques.json'); + const result = runScript(projectRoot); + expect(result.status).toBe(0); + batches = readBatches(projectRoot); + batchOf = new Map(); + for (const b of batches.batches) { + for (const f of b.files) batchOf.set(f.path, b.batchIndex); + } + }); + + afterEach(() => { + if (projectRoot) rmSync(projectRoot, { recursive: true, force: true }); + }); + + it('batchImportData mirrors scan importMap per batch', () => { + for (const b of batches.batches) { + for (const f of b.files) { + expect(b.batchImportData[f.path]).toBeDefined(); + expect(Array.isArray(b.batchImportData[f.path])).toBe(true); + } + } + // src/auth/login.ts imports src/auth/session.ts and src/auth/tokens.ts + const 
loginBatch = batches.batches.find(b => + b.files.some(f => f.path === 'src/auth/login.ts')); + expect(loginBatch.batchImportData['src/auth/login.ts'].sort()).toEqual([ + 'src/auth/session.ts', 'src/auth/tokens.ts', + ]); + }); + + it('neighborMap excludes same-batch files', () => { + // The fixture's three cliques each go into one batch — all imports are + // intra-batch, so no neighbor map should reference any same-batch file. + for (const b of batches.batches) { + const sameBatchPaths = new Set(b.files.map(f => f.path)); + for (const [, neighbors] of Object.entries(b.neighborMap)) { + for (const n of neighbors) { + expect(sameBatchPaths.has(n.path)).toBe(false); + } + } + } + }); + + it('neighborMap entries carry symbols when target has exports', () => { + const root = mkdtempSync(join(tmpdir(), 'ua-cb-nbr-')); + mkdirSync(join(root, '.understand-anything', 'intermediate'), { recursive: true }); + mkdirSync(join(root, 'src', 'a'), { recursive: true }); + mkdirSync(join(root, 'src', 'b'), { recursive: true }); + + // Cluster A: 3 tightly-imported files. a/core.ts exports symbols. + writeFileSync(join(root, 'src', 'a', 'core.ts'), + 'export function findUser(id: string) { return null; }\nexport class User {}\n'); + writeFileSync(join(root, 'src', 'a', 'helper1.ts'), + 'import { findUser } from "./core";\nexport const h1 = () => findUser("x");\n'); + writeFileSync(join(root, 'src', 'a', 'helper2.ts'), + 'import { User } from "./core";\nimport { h1 } from "./helper1";\nexport const h2 = () => h1();\n'); + + // Cluster B: 3 tightly-imported files. b/entry.ts has ONE cross-cluster import to a/core.ts. 
+ writeFileSync(join(root, 'src', 'b', 'entry.ts'), + 'import { findUser } from "../a/core";\nexport const entry = () => findUser("y");\n'); + writeFileSync(join(root, 'src', 'b', 'middle.ts'), + 'import { entry } from "./entry";\nexport const middle = () => entry();\n'); + writeFileSync(join(root, 'src', 'b', 'leaf.ts'), + 'import { middle } from "./middle";\nexport const leaf = () => middle();\n'); + + const files = [ + { path: 'src/a/core.ts', language: 'typescript', sizeLines: 2, fileCategory: 'code' }, + { path: 'src/a/helper1.ts', language: 'typescript', sizeLines: 2, fileCategory: 'code' }, + { path: 'src/a/helper2.ts', language: 'typescript', sizeLines: 3, fileCategory: 'code' }, + { path: 'src/b/entry.ts', language: 'typescript', sizeLines: 2, fileCategory: 'code' }, + { path: 'src/b/middle.ts', language: 'typescript', sizeLines: 2, fileCategory: 'code' }, + { path: 'src/b/leaf.ts', language: 'typescript', sizeLines: 2, fileCategory: 'code' }, + ]; + const scan = { + name: 't', description: '', + languages: ['typescript'], frameworks: [], + files, + totalFiles: 6, filteredByIgnore: 0, estimatedComplexity: 'small', + importMap: { + 'src/a/core.ts': [], + 'src/a/helper1.ts': ['src/a/core.ts'], + 'src/a/helper2.ts': ['src/a/core.ts', 'src/a/helper1.ts'], + 'src/b/entry.ts': ['src/a/core.ts'], // CROSS-CLUSTER + 'src/b/middle.ts': ['src/b/entry.ts'], + 'src/b/leaf.ts': ['src/b/middle.ts'], + }, + }; + writeFileSync( + join(root, '.understand-anything', 'intermediate', 'scan-result.json'), + JSON.stringify(scan)); + + const result = runScript(root); + expect(result.status).toBe(0); + const out = readBatches(root); + + // Expect 2 communities (cluster A and cluster B). Verify that some batch's + // neighborMap entry references src/a/core.ts with its symbols. 
+ let sawSymbols = false; + for (const batch of out.batches) { + for (const [, neighbors] of Object.entries(batch.neighborMap)) { + for (const n of neighbors) { + if (n.path === 'src/a/core.ts') { + expect(n.symbols).toEqual(expect.arrayContaining(['findUser', 'User'])); + sawSymbols = true; + } + } + } + } + expect(sawSymbols).toBe(true); + + rmSync(root, { recursive: true, force: true }); + }); +}); + +describe('compute-batches.mjs — neighborMap truncation', () => { + let root; + + afterEach(() => { + if (root) rmSync(root, { recursive: true, force: true }); + }); + + it('truncates and warns when neighbors > 50', () => { + root = mkdtempSync(join(tmpdir(), 'ua-cb-trunc-')); + mkdirSync(join(root, '.understand-anything', 'intermediate'), { recursive: true }); + // hub.ts imported by 60 other files + const files = [{ path: 'src/hub.ts', language: 'typescript', sizeLines: 1, fileCategory: 'code' }]; + const importMap = { 'src/hub.ts': [] }; + for (let i = 0; i < 60; i++) { + const p = `src/leaf${i}.ts`; + files.push({ path: p, language: 'typescript', sizeLines: 1, fileCategory: 'code' }); + importMap[p] = ['src/hub.ts']; + } + const scan = { + name: 't', description: '', languages: ['typescript'], frameworks: [], + files, totalFiles: files.length, filteredByIgnore: 0, + estimatedComplexity: 'moderate', importMap, + }; + writeFileSync( + join(root, '.understand-anything', 'intermediate', 'scan-result.json'), + JSON.stringify(scan)); + const result = runScript(root); + expect(result.status).toBe(0); + expect(result.stderr).toMatch( + /neighborMap for src\/hub\.ts has high 1-hop degree 60 — exceeds soft cap of 50/); + const out = readBatches(root); + // Find hub.ts and confirm its neighbor list capped at 50 (in whichever batch it landed) + for (const b of out.batches) { + const nbrs = b.neighborMap['src/hub.ts']; + if (nbrs) expect(nbrs.length).toBeLessThanOrEqual(50); + } + }); +}); + +describe('compute-batches.mjs — fallback', () => { + let root; + + afterEach(() => 
{ + if (root) rmSync(root, { recursive: true, force: true }); + }); + + it('falls back to count-based when Louvain throws (env-injected mock)', () => { + // We can't easily monkey-patch louvain mid-script in Vitest because the + // script runs in a subprocess. Instead, set an env var the script honors: + // UA_COMPUTE_BATCHES_FORCE_LOUVAIN_THROW=1 → script throws inside its + // Louvain branch, exercising the fallback path. + root = setupProject('scan-result-3-cliques.json'); + const result = spawnSync('node', + [SCRIPT, root], + { encoding: 'utf-8', env: { ...process.env, UA_COMPUTE_BATCHES_FORCE_LOUVAIN_THROW: '1' } }, + ); + expect(result.status).toBe(0); + expect(result.stderr).toMatch( + /Warning: compute-batches: Louvain failed.*falling back to count-based grouping/); + const out = readBatches(root); + expect(out.algorithm).toBe('count-fallback'); + expect(out.totalFiles).toBe(9); + // Count-based: 12 files per batch → all 9 fit in one batch + const codeBatchFileCount = out.batches + .filter(b => b.files.every(f => f.fileCategory === 'code')) + .reduce((sum, b) => sum + b.files.length, 0); + expect(codeBatchFileCount).toBe(9); + }); +}); + +describe('compute-batches.mjs — merge-small', () => { + let projectRoot; + + beforeEach(() => { + projectRoot = setupProject('scan-result-singletons.json'); + }); + + afterEach(() => { + if (projectRoot) rmSync(projectRoot, { recursive: true, force: true }); + }); + + it('merges 100 isolated singletons into a small number of misc batches', () => { + const result = runScript(projectRoot); + expect(result.status).toBe(0); + + const batches = readBatches(projectRoot); + expect(batches.totalFiles).toBe(100); + + // Without merge: 100 singletons → 100 batches. + // With merge-small (MAX_MERGE_TARGET=25): ceil(100 / 25) = exactly 4 misc + // batches. 
Pin the exact count — a loose >=4 && <=8 would mask off-by-one + // regressions in the slice math (e.g., a stride miscalculation that + // splintered the pool into 5-7 underfull buckets). + expect(batches.batches.length).toBe(4); + + // All files accounted for + const totalAssigned = batches.batches.reduce((sum, b) => sum + b.files.length, 0); + expect(totalAssigned).toBe(100); + + // Bucket-fullness check: 100 singletons evenly divisible by + // MAX_MERGE_TARGET=25, so every bucket must be exactly 25 — not just + // ≤ 25. Drift toward [25, 25, 25, 24, 1] etc. would slip past a + // ≤25 bound while indicating a stride bug. + for (const b of batches.batches) { + expect(b.files.length).toBe(25); + } + + // Info: (not Warning:) — merge-small is a routine optimization, not a + // fallback path. See compute-batches.mjs mergeSmallBatches WHY comment. + expect(result.stderr).toMatch( + /Info: compute-batches: merged \d+ small batches \(\d+ files\) into \d+ misc batches/); + expect(result.stderr).not.toMatch(/Warning: compute-batches: merged \d+ small batches/); + }); + + it('preserves non-mergeable batches: Dockerfile cluster not pooled into misc', () => { + // Dedicated fixture: 30 isolated TS singletons + 1 Dockerfile-only cluster. + // Group A marks the Dockerfile batch mergeable=false; even though its size + // (1) is below MIN_BATCH_SIZE=3, mergeSmallBatches must leave it intact. + const altRoot = setupProject('scan-result-merge-respects-non-mergeable.json'); + try { + const result = runScript(altRoot); + expect(result.status).toBe(0); + + const out = readBatches(altRoot); + expect(out.totalFiles).toBe(31); + + const dockerBatch = out.batches.find(b => + b.files.some(f => f.path === 'services/api/Dockerfile')); + expect(dockerBatch).toBeDefined(); + // Standalone: exactly the Dockerfile, nothing pooled in alongside it. 
+ expect(dockerBatch.files.length).toBe(1); + expect(dockerBatch.files[0].path).toBe('services/api/Dockerfile'); + + // The TS singletons must still merge into at least one misc batch — + // and that misc batch must NOT contain the Dockerfile. + const miscBatches = out.batches.filter(b => + b.files.some(f => f.path.startsWith('src/leaf'))); + expect(miscBatches.length).toBeGreaterThanOrEqual(1); + for (const m of miscBatches) { + for (const f of m.files) { + expect(f.path).not.toBe('services/api/Dockerfile'); + } + } + + // Every TS singleton accounted for across the misc bucket(s). + const tsInMisc = miscBatches.flatMap(b => b.files.map(f => f.path)) + .filter(p => p.startsWith('src/leaf')); + expect(tsInMisc.length).toBe(30); + } finally { + rmSync(altRoot, { recursive: true, force: true }); + } + }); +}); + +describe('compute-batches.mjs — --changed-files', () => { + let root; + + afterEach(() => { + if (root) rmSync(root, { recursive: true, force: true }); + }); + + it('emits only batches containing changed files', () => { + root = setupProject('scan-result-3-cliques.json'); + const changedPath = join(root, 'changed.txt'); + // Only the auth clique is changed + writeFileSync(changedPath, ['src/auth/login.ts', 'src/auth/tokens.ts'].join('\n')); + + const result = runScript(root, [`--changed-files=${changedPath}`]); + expect(result.status).toBe(0); + + const out = readBatches(root); + // Auth files are in batches; other cliques' batches must be omitted + const allPaths = out.batches.flatMap(b => b.files.map(f => f.path)); + expect(allPaths).toContain('src/auth/login.ts'); + expect(allPaths).toContain('src/auth/tokens.ts'); + expect(allPaths).not.toContain('src/api/handlers.ts'); + expect(allPaths).not.toContain('src/db/users.ts'); + + // neighborMap may still reference unchanged files (with their full-graph batchIndex) + const loginBatch = out.batches.find(b => + b.files.some(f => f.path === 'src/auth/login.ts')); + expect(loginBatch).toBeDefined(); + }); +}); 
diff --git a/tests/skill/understand/test_extract_import_map.test.mjs b/tests/skill/understand/test_extract_import_map.test.mjs new file mode 100644 index 0000000..ee64136 --- /dev/null +++ b/tests/skill/understand/test_extract_import_map.test.mjs @@ -0,0 +1,1494 @@ +import { describe, it, expect, afterEach } from 'vitest'; +import { mkdtempSync, mkdirSync, writeFileSync, readFileSync, rmSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import { join, dirname, resolve } from 'node:path'; +import { spawnSync } from 'node:child_process'; +import { fileURLToPath } from 'node:url'; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const SCRIPT = resolve(__dirname, '../../../understand-anything-plugin/skills/understand/extract-import-map.mjs'); + +/** + * Helper: write a source tree from a `files` object: { 'a/b.ts': '...', ... }. + * Creates parent dirs as needed. Returns the temp project root. + */ +function setupTree(files) { + const root = mkdtempSync(join(tmpdir(), 'ua-eim-test-')); + for (const [relPath, contents] of Object.entries(files)) { + const abs = join(root, relPath); + mkdirSync(dirname(abs), { recursive: true }); + writeFileSync(abs, contents, 'utf-8'); + } + return root; +} + +/** + * Run the extract-import-map.mjs script. Returns + * { status, stdout, stderr, output } where `output` is the parsed JSON + * written by the script (or null on failure to read). + * + * `extraNodeArgs` is prepended to the node argv before the script path, so + * tests can pass `--import` loader hooks to force specific failure modes. 
+ */ +function runScript(projectRoot, input, extraNodeArgs = []) { + const inputPath = join(projectRoot, 'ua-eim-input.json'); + const outputPath = join(projectRoot, 'ua-eim-output.json'); + writeFileSync(inputPath, JSON.stringify(input), 'utf-8'); + const result = spawnSync( + 'node', + [...extraNodeArgs, SCRIPT, inputPath, outputPath], + { encoding: 'utf-8' }, + ); + let output = null; + try { + output = JSON.parse(readFileSync(outputPath, 'utf-8')); + } catch { + /* output missing on hard failure */ + } + return { status: result.status, stdout: result.stdout, stderr: result.stderr, output }; +} + +describe('extract-import-map.mjs — TypeScript / JavaScript resolver', () => { + let projectRoot; + + afterEach(() => { + if (projectRoot) { + rmSync(projectRoot, { recursive: true, force: true }); + projectRoot = null; + } + }); + + it('resolves typescript relative imports with extension probes', () => { + projectRoot = setupTree({ + 'src/index.ts': `import { foo } from './utils';\nimport cfg from './config';\nfoo(cfg);\n`, + 'src/utils.ts': `export function foo(x: unknown) { return x; }\n`, + 'src/config.ts': `export default { debug: true };\n`, + 'README.md': '# project\n', + }); + + const result = runScript(projectRoot, { + projectRoot, + files: [ + { path: 'src/index.ts', language: 'typescript', fileCategory: 'code' }, + { path: 'src/utils.ts', language: 'typescript', fileCategory: 'code' }, + { path: 'src/config.ts', language: 'typescript', fileCategory: 'code' }, + { path: 'README.md', language: 'markdown', fileCategory: 'docs' }, + ], + }); + + expect(result.status).toBe(0); + expect(result.output.scriptCompleted).toBe(true); + expect(result.output.importMap['src/index.ts']).toEqual([ + 'src/config.ts', + 'src/utils.ts', + ]); + expect(result.output.importMap['src/utils.ts']).toEqual([]); + // Non-code file gets empty array + expect(result.output.importMap['README.md']).toEqual([]); + + expect(result.output.stats.filesScanned).toBe(4); + 
expect(result.output.stats.filesWithImports).toBe(1); + expect(result.output.stats.totalEdges).toBe(2); + }); + + it('resolves tsconfig paths aliases', () => { + projectRoot = setupTree({ + 'tsconfig.json': JSON.stringify({ + compilerOptions: { + baseUrl: '.', + paths: { + '@/*': ['src/*'], + '~lib/*': ['src/lib/*'], + }, + }, + }), + 'src/index.ts': `import { greet } from '@/utils/greet';\nimport { add } from '~lib/math';\n`, + 'src/utils/greet.ts': `export function greet(name: string) { return 'hi ' + name; }\n`, + 'src/lib/math.ts': `export const add = (a: number, b: number) => a + b;\n`, + }); + + const result = runScript(projectRoot, { + projectRoot, + files: [ + { path: 'tsconfig.json', language: 'json', fileCategory: 'config' }, + { path: 'src/index.ts', language: 'typescript', fileCategory: 'code' }, + { path: 'src/utils/greet.ts', language: 'typescript', fileCategory: 'code' }, + { path: 'src/lib/math.ts', language: 'typescript', fileCategory: 'code' }, + ], + }); + + expect(result.status).toBe(0); + expect(result.output.importMap['src/index.ts']).toEqual([ + 'src/lib/math.ts', + 'src/utils/greet.ts', + ]); + }); + + it('resolves /index.ts barrel imports', () => { + projectRoot = setupTree({ + 'src/index.ts': `import { thing } from './stuff';\n`, + 'src/stuff/index.ts': `export const thing = 1;\n`, + }); + + const result = runScript(projectRoot, { + projectRoot, + files: [ + { path: 'src/index.ts', language: 'typescript', fileCategory: 'code' }, + { path: 'src/stuff/index.ts', language: 'typescript', fileCategory: 'code' }, + ], + }); + + expect(result.status).toBe(0); + expect(result.output.importMap['src/index.ts']).toEqual(['src/stuff/index.ts']); + }); + + it('drops external package imports', () => { + projectRoot = setupTree({ + 'src/index.ts': `import express from 'express';\nimport { z } from 'zod';\nimport { foo } from './local';\n`, + 'src/local.ts': `export const foo = 1;\n`, + }); + + const result = runScript(projectRoot, { + projectRoot, + 
files: [ + { path: 'src/index.ts', language: 'typescript', fileCategory: 'code' }, + { path: 'src/local.ts', language: 'typescript', fileCategory: 'code' }, + ], + }); + + expect(result.status).toBe(0); + // Only the local import survives; express/zod are external. + expect(result.output.importMap['src/index.ts']).toEqual(['src/local.ts']); + }); + + it('resolves javascript require() calls', () => { + projectRoot = setupTree({ + 'src/index.js': `const cfg = require('./config');\nconst utils = require('../shared/utils');\n`, + 'src/config.js': `module.exports = { x: 1 };\n`, + 'shared/utils.js': `module.exports = { y: 2 };\n`, + }); + + const result = runScript(projectRoot, { + projectRoot, + files: [ + { path: 'src/index.js', language: 'javascript', fileCategory: 'code' }, + { path: 'src/config.js', language: 'javascript', fileCategory: 'code' }, + { path: 'shared/utils.js', language: 'javascript', fileCategory: 'code' }, + ], + }); + + expect(result.status).toBe(0); + expect(result.output.importMap['src/index.js']).toEqual([ + 'shared/utils.js', + 'src/config.js', + ]); + }); + + it('resolves per-package tsconfig paths in a monorepo without cross-package leakage', () => { + // Two pnpm-workspace packages, each carrying its own tsconfig with its + // own `paths`. The resolver MUST dispatch per-importer to the nearest + // tsconfig — and aliases from one package must NOT resolve files in + // another package (each tsconfig anchors its baseUrl at its own dir). 
+ projectRoot = setupTree({ + 'packages/foo/tsconfig.json': JSON.stringify({ + compilerOptions: { + baseUrl: '.', + paths: { '@foo/*': ['src/*'] }, + }, + }), + 'packages/foo/src/x.ts': `import { y } from '@foo/y';\nexport const x = y;\n`, + 'packages/foo/src/y.ts': `export const y = 1;\n`, + 'packages/bar/tsconfig.json': JSON.stringify({ + compilerOptions: { + baseUrl: '.', + paths: { '@bar/*': ['src/*'] }, + }, + }), + 'packages/bar/src/x.ts': + `import { y } from '@bar/y';\n` + + `import { fy } from '@foo/y';\n` + // must NOT resolve from bar + `export const x = y;\n`, + 'packages/bar/src/y.ts': `export const y = 2;\n`, + }); + + const result = runScript(projectRoot, { + projectRoot, + files: [ + { path: 'packages/foo/tsconfig.json', language: 'json', fileCategory: 'config' }, + { path: 'packages/foo/src/x.ts', language: 'typescript', fileCategory: 'code' }, + { path: 'packages/foo/src/y.ts', language: 'typescript', fileCategory: 'code' }, + { path: 'packages/bar/tsconfig.json', language: 'json', fileCategory: 'config' }, + { path: 'packages/bar/src/x.ts', language: 'typescript', fileCategory: 'code' }, + { path: 'packages/bar/src/y.ts', language: 'typescript', fileCategory: 'code' }, + ], + }); + + expect(result.status).toBe(0); + // foo/x sees its own @foo/y -> foo/src/y.ts only. + expect(result.output.importMap['packages/foo/src/x.ts']).toEqual([ + 'packages/foo/src/y.ts', + ]); + // bar/x sees its own @bar/y -> bar/src/y.ts. The cross-package @foo/y + // import does NOT resolve because bar's tsconfig has no @foo/* alias. 
+ expect(result.output.importMap['packages/bar/src/x.ts']).toEqual([ + 'packages/bar/src/y.ts', + ]); + expect(result.output.importMap['packages/bar/src/x.ts']).not.toContain( + 'packages/foo/src/y.ts', + ); + }); +}); + +describe('extract-import-map.mjs — Python resolver', () => { + let projectRoot; + + afterEach(() => { + if (projectRoot) { + rmSync(projectRoot, { recursive: true, force: true }); + projectRoot = null; + } + }); + + it('resolves python relative imports', () => { + projectRoot = setupTree({ + 'src/app.py': `from . import helpers\nfrom .utils import shout\nfrom ..core import boot\n`, + 'src/helpers.py': `def help(): pass\n`, + 'src/utils.py': `def shout(): pass\n`, + 'core.py': `def boot(): pass\n`, + }); + + const result = runScript(projectRoot, { + projectRoot, + files: [ + { path: 'src/app.py', language: 'python', fileCategory: 'code' }, + { path: 'src/helpers.py', language: 'python', fileCategory: 'code' }, + { path: 'src/utils.py', language: 'python', fileCategory: 'code' }, + { path: 'core.py', language: 'python', fileCategory: 'code' }, + ], + }); + + expect(result.status).toBe(0); + // `from . import helpers` resolves `helpers` as a sibling submodule + // (`src/helpers.py`) even though `src/__init__.py` is absent — PEP 420 + // implicit namespace packages don't require it. `from .utils import shout` + // resolves to `src/utils.py`. `from ..core import boot` -> `core.py`. + expect(result.output.importMap['src/app.py']).toEqual([ + 'core.py', + 'src/helpers.py', + 'src/utils.py', + ]); + }); + + // Regression for Codex review #2 on PR #204: `from . import x` was + // dropped when no `__init__.py` was present at the importer's package + // dir, because resolvePythonProbe gated specifier probing on the package + // marker. Modern Python (PEP 420 namespace packages) commonly omits it. + it('resolves `from . import x` for namespace packages (no __init__.py)', () => { + projectRoot = setupTree({ + 'src/svc/main.py': + `from . 
import helpers, util\nfrom . import nested\n`, + 'src/svc/helpers.py': `def help(): pass\n`, + 'src/svc/util.py': `def u(): pass\n`, + 'src/svc/nested/__init__.py': `# package\n`, + // Crucially: NO src/svc/__init__.py — namespace package + }); + + const result = runScript(projectRoot, { + projectRoot, + files: [ + { path: 'src/svc/main.py', language: 'python', fileCategory: 'code' }, + { path: 'src/svc/helpers.py', language: 'python', fileCategory: 'code' }, + { path: 'src/svc/util.py', language: 'python', fileCategory: 'code' }, + { path: 'src/svc/nested/__init__.py', language: 'python', fileCategory: 'code' }, + ], + }); + + expect(result.status).toBe(0); + // All three siblings should resolve — helpers.py + util.py as direct + // .py modules, nested/ as a package via its __init__.py. + expect(result.output.importMap['src/svc/main.py']).toEqual([ + 'src/svc/helpers.py', + 'src/svc/nested/__init__.py', + 'src/svc/util.py', + ]); + }); + + it('resolves python absolute imports and __init__.py matching', () => { + projectRoot = setupTree({ + 'main.py': `import src.utils.formatter\nfrom src.utils import formatter\nfrom src import config\n`, + 'src/__init__.py': '', + 'src/utils/__init__.py': '', + 'src/utils/formatter.py': `def fmt(): pass\n`, + 'src/config.py': `DEBUG = True\n`, + }); + + const result = runScript(projectRoot, { + projectRoot, + files: [ + { path: 'main.py', language: 'python', fileCategory: 'code' }, + { path: 'src/__init__.py', language: 'python', fileCategory: 'code' }, + { path: 'src/utils/__init__.py', language: 'python', fileCategory: 'code' }, + { path: 'src/utils/formatter.py', language: 'python', fileCategory: 'code' }, + { path: 'src/config.py', language: 'python', fileCategory: 'code' }, + ], + }); + + expect(result.status).toBe(0); + // `import src.utils.formatter` -> src/utils/formatter.py + // `from src.utils import formatter` -> src/utils/__init__.py + src/utils/formatter.py + // `from src import config` -> src/__init__.py + 
src/config.py + expect(result.output.importMap['main.py']).toEqual([ + 'src/__init__.py', + 'src/config.py', + 'src/utils/__init__.py', + 'src/utils/formatter.py', + ]); + }); + + it('drops python external package imports', () => { + projectRoot = setupTree({ + 'app.py': `import os\nimport sys\nimport requests\nfrom datetime import datetime\nfrom .local import thing\n`, + 'local.py': `thing = 1\n`, + }); + + const result = runScript(projectRoot, { + projectRoot, + files: [ + { path: 'app.py', language: 'python', fileCategory: 'code' }, + { path: 'local.py', language: 'python', fileCategory: 'code' }, + ], + }); + + expect(result.status).toBe(0); + // os/sys/requests/datetime are external; only ./local resolves. + expect(result.output.importMap['app.py']).toEqual(['local.py']); + }); + + it('resolves absolute imports against the importers per-service root in multi-service repos', () => { + // Mirrors microservices-demo: each service ships its own sibling files + // under src/<service>/, and uses bare `import helpers` to reach them. + // The probe MUST walk up from the importer's dir (not just probe + // projectRoot). The same module name in two services must NOT cross- + // resolve — importer-dir scope wins. + projectRoot = setupTree({ + 'src/svc_a/main.py': + `import helpers\nfrom helpers import shout\n`, + 'src/svc_a/helpers.py': + `def shout(): pass\n`, + 'src/svc_b/main.py': + `import helpers\nfrom helpers import shout\n`, + 'src/svc_b/helpers.py': + `def shout(): pass\n`, + }); + + const result = runScript(projectRoot, { + projectRoot, + files: [ + { path: 'src/svc_a/main.py', language: 'python', fileCategory: 'code' }, + { path: 'src/svc_a/helpers.py', language: 'python', fileCategory: 'code' }, + { path: 'src/svc_b/main.py', language: 'python', fileCategory: 'code' }, + { path: 'src/svc_b/helpers.py', language: 'python', fileCategory: 'code' }, + ], + }); + + expect(result.status).toBe(0); + // Each service's main.py resolves to its OWN helpers.py — no cross-link. 
+ expect(result.output.importMap['src/svc_a/main.py']).toEqual([ + 'src/svc_a/helpers.py', + ]); + expect(result.output.importMap['src/svc_a/main.py']).not.toContain( + 'src/svc_b/helpers.py', + ); + expect(result.output.importMap['src/svc_b/main.py']).toEqual([ + 'src/svc_b/helpers.py', + ]); + expect(result.output.importMap['src/svc_b/main.py']).not.toContain( + 'src/svc_a/helpers.py', + ); + }); +}); + +describe('extract-import-map.mjs — Go resolver', () => { + let projectRoot; + + afterEach(() => { + if (projectRoot) { + rmSync(projectRoot, { recursive: true, force: true }); + projectRoot = null; + } + }); + + it('resolves go imports by stripping the go.mod module prefix', () => { + projectRoot = setupTree({ + 'go.mod': `module github.com/foo/bar\n\ngo 1.21\n`, + 'main.go': `package main\n\nimport (\n\t"fmt"\n\t"github.com/foo/bar/util"\n\t"github.com/foo/bar/db"\n)\n\nfunc main() {\n\tfmt.Println(util.Hi())\n\tdb.Connect()\n}\n`, + 'util/hello.go': `package util\n\nfunc Hi() string { return "hi" }\n`, + 'util/world.go': `package util\n\nfunc World() string { return "world" }\n`, + 'db/db.go': `package db\n\nfunc Connect() {}\n`, + }); + + const result = runScript(projectRoot, { + projectRoot, + files: [ + { path: 'go.mod', language: 'config', fileCategory: 'config' }, + { path: 'main.go', language: 'go', fileCategory: 'code' }, + { path: 'util/hello.go', language: 'go', fileCategory: 'code' }, + { path: 'util/world.go', language: 'go', fileCategory: 'code' }, + { path: 'db/db.go', language: 'go', fileCategory: 'code' }, + ], + }); + + expect(result.status).toBe(0); + // `github.com/foo/bar/util` -> all .go files under util/ + // `github.com/foo/bar/db` -> all .go files under db/ + // `fmt` is stdlib (no module prefix match) -> dropped + expect(result.output.importMap['main.go']).toEqual([ + 'db/db.go', + 'util/hello.go', + 'util/world.go', + ]); + }); + + it('resolves per-service imports in a multi-go.mod monorepo', () => { + // Mirrors Google's 
microservices-demo layout: every service ships its + // own go.mod, so the resolver MUST dispatch per-importer to the nearest + // ancestor module. Imports of a SIBLING module (a's file importing b's + // package) must be classified as external — from a's perspective, b is + // a third-party dependency. + projectRoot = setupTree({ + 'src/a/go.mod': `module github.com/org/a\n\ngo 1.21\n`, + 'src/a/main.go': + `package main\n\nimport (\n\t"github.com/org/a/sub"\n\t"github.com/org/b/sub"\n)\n\nfunc main() { sub.X() }\n`, + 'src/a/sub/sub.go': + `package sub\n\nfunc X() {}\n`, + 'src/b/go.mod': `module github.com/org/b\n\ngo 1.21\n`, + 'src/b/main.go': + `package main\n\nimport (\n\t"github.com/org/b/sub"\n)\n\nfunc main() { sub.Y() }\n`, + 'src/b/sub/sub.go': + `package sub\n\nfunc Y() {}\n`, + }); + + const result = runScript(projectRoot, { + projectRoot, + files: [ + { path: 'src/a/go.mod', language: 'config', fileCategory: 'config' }, + { path: 'src/a/main.go', language: 'go', fileCategory: 'code' }, + { path: 'src/a/sub/sub.go', language: 'go', fileCategory: 'code' }, + { path: 'src/b/go.mod', language: 'config', fileCategory: 'config' }, + { path: 'src/b/main.go', language: 'go', fileCategory: 'code' }, + { path: 'src/b/sub/sub.go', language: 'go', fileCategory: 'code' }, + ], + }); + + expect(result.status).toBe(0); + // a/main resolves its own a/sub but NOT b/sub (b is external from a's + // module's perspective — different go.mod). + expect(result.output.importMap['src/a/main.go']).toEqual([ + 'src/a/sub/sub.go', + ]); + // b/main resolves its own b/sub. + expect(result.output.importMap['src/b/main.go']).toEqual([ + 'src/b/sub/sub.go', + ]); + }); + + it('emits a one-time Warning: when a .go file has no ancestor go.mod', () => { + // A .go file outside any module. Multiple module-prefix imports should + // produce ONE warning (deduped by importer path), and the importMap + // entry stays empty. 
+ projectRoot = setupTree({ + 'orphan/main.go': + `package main\n\nimport (\n\t"github.com/foo/bar/util"\n\t"github.com/foo/bar/db"\n)\n\nfunc main() {}\n`, + }); + + const result = runScript(projectRoot, { + projectRoot, + files: [ + { path: 'orphan/main.go', language: 'go', fileCategory: 'code' }, + ], + }); + + expect(result.status).toBe(0); + expect(result.output.importMap['orphan/main.go']).toEqual([]); + const goModWarnings = result.stderr + .split('\n') + .filter(l => l.includes('no ancestor go.mod')); + expect(goModWarnings).toHaveLength(1); + expect(goModWarnings[0]).toMatch( + /Warning: extract-import-map: Go file orphan\/main\.go has no ancestor go\.mod/, + ); + expect(goModWarnings[0]).toMatch(/module-prefix imports skipped/); + }); +}); + +describe('extract-import-map.mjs — Java resolver', () => { + let projectRoot; + + afterEach(() => { + if (projectRoot) { + rmSync(projectRoot, { recursive: true, force: true }); + projectRoot = null; + } + }); + + it('resolves java dotted imports via suffix probe', () => { + projectRoot = setupTree({ + 'src/main/java/com/example/App.java': + `package com.example;\n\nimport com.example.foo.Bar;\nimport com.example.util.Helper;\n\npublic class App { }\n`, + 'src/main/java/com/example/foo/Bar.java': + `package com.example.foo;\n\npublic class Bar { }\n`, + 'src/main/java/com/example/util/Helper.java': + `package com.example.util;\n\npublic class Helper { }\n`, + }); + + const result = runScript(projectRoot, { + projectRoot, + files: [ + { path: 'src/main/java/com/example/App.java', language: 'java', fileCategory: 'code' }, + { path: 'src/main/java/com/example/foo/Bar.java', language: 'java', fileCategory: 'code' }, + { path: 'src/main/java/com/example/util/Helper.java', language: 'java', fileCategory: 'code' }, + ], + }); + + expect(result.status).toBe(0); + expect(result.output.importMap['src/main/java/com/example/App.java']).toEqual([ + 'src/main/java/com/example/foo/Bar.java', + 
'src/main/java/com/example/util/Helper.java', + ]); + }); + + it('drops java external imports (java.util, etc.)', () => { + projectRoot = setupTree({ + 'src/x/App.java': + `package x;\nimport java.util.List;\nimport java.io.IOException;\nimport x.Local;\npublic class App { }\n`, + 'src/x/Local.java': + `package x;\npublic class Local { }\n`, + }); + + const result = runScript(projectRoot, { + projectRoot, + files: [ + { path: 'src/x/App.java', language: 'java', fileCategory: 'code' }, + { path: 'src/x/Local.java', language: 'java', fileCategory: 'code' }, + ], + }); + + expect(result.status).toBe(0); + // java.util/java.io are external (no project file matches the suffix); + // x.Local maps via suffix to src/x/Local.java. + expect(result.output.importMap['src/x/App.java']).toEqual(['src/x/Local.java']); + }); +}); + +describe('extract-import-map.mjs — Kotlin resolver', () => { + let projectRoot; + + afterEach(() => { + if (projectRoot) { + rmSync(projectRoot, { recursive: true, force: true }); + projectRoot = null; + } + }); + + it('resolves kotlin dotted imports via suffix probe', () => { + projectRoot = setupTree({ + 'src/main/kotlin/com/example/Main.kt': + `package com.example\n\nimport com.example.foo.Bar\nimport com.example.util.Helper\n\nfun main() { }\n`, + 'src/main/kotlin/com/example/foo/Bar.kt': + `package com.example.foo\n\nclass Bar\n`, + 'src/main/kotlin/com/example/util/Helper.kt': + `package com.example.util\n\nobject Helper\n`, + }); + + const result = runScript(projectRoot, { + projectRoot, + files: [ + { path: 'src/main/kotlin/com/example/Main.kt', language: 'kotlin', fileCategory: 'code' }, + { path: 'src/main/kotlin/com/example/foo/Bar.kt', language: 'kotlin', fileCategory: 'code' }, + { path: 'src/main/kotlin/com/example/util/Helper.kt', language: 'kotlin', fileCategory: 'code' }, + ], + }); + + expect(result.status).toBe(0); + expect(result.output.importMap['src/main/kotlin/com/example/Main.kt']).toEqual([ + 
'src/main/kotlin/com/example/foo/Bar.kt', + 'src/main/kotlin/com/example/util/Helper.kt', + ]); + }); +}); + +describe('extract-import-map.mjs — C# resolver', () => { + let projectRoot; + + afterEach(() => { + if (projectRoot) { + rmSync(projectRoot, { recursive: true, force: true }); + projectRoot = null; + } + }); + + it('resolves c# using directives via dotted-suffix probe', () => { + projectRoot = setupTree({ + 'Program.cs': + `using System;\nusing MyApp.Util.Helper;\nusing MyApp.Models.User;\n\nnamespace MyApp { class Program { } }\n`, + 'MyApp/Util/Helper.cs': + `namespace MyApp.Util { public class Helper { } }\n`, + 'MyApp/Models/User.cs': + `namespace MyApp.Models { public class User { } }\n`, + }); + + const result = runScript(projectRoot, { + projectRoot, + files: [ + { path: 'Program.cs', language: 'csharp', fileCategory: 'code' }, + { path: 'MyApp/Util/Helper.cs', language: 'csharp', fileCategory: 'code' }, + { path: 'MyApp/Models/User.cs', language: 'csharp', fileCategory: 'code' }, + ], + }); + + expect(result.status).toBe(0); + expect(result.output.importMap['Program.cs']).toEqual([ + 'MyApp/Models/User.cs', + 'MyApp/Util/Helper.cs', + ]); + }); +}); + +describe('extract-import-map.mjs — Ruby resolver', () => { + let projectRoot; + + afterEach(() => { + if (projectRoot) { + rmSync(projectRoot, { recursive: true, force: true }); + projectRoot = null; + } + }); + + it('resolves ruby require_relative + require load-path probes', () => { + projectRoot = setupTree({ + 'app/controllers/users_controller.rb': + `require_relative '../helpers/auth'\nrequire 'shared/logger'\nrequire 'json'\n\nclass UsersController\nend\n`, + 'app/helpers/auth.rb': + `module Auth\nend\n`, + 'lib/shared/logger.rb': + `module Shared\n module Logger\n end\nend\n`, + }); + + const result = runScript(projectRoot, { + projectRoot, + files: [ + { path: 'app/controllers/users_controller.rb', language: 'ruby', fileCategory: 'code' }, + { path: 'app/helpers/auth.rb', language: 'ruby', 
fileCategory: 'code' }, + { path: 'lib/shared/logger.rb', language: 'ruby', fileCategory: 'code' }, + ], + }); + + expect(result.status).toBe(0); + // require_relative '../helpers/auth' -> app/helpers/auth.rb + // require 'shared/logger' -> lib/shared/logger.rb (load-path probe) + // require 'json' -> external (no project file) + expect(result.output.importMap['app/controllers/users_controller.rb']).toEqual([ + 'app/helpers/auth.rb', + 'lib/shared/logger.rb', + ]); + }); +}); + +describe('extract-import-map.mjs — PHP resolver', () => { + let projectRoot; + + afterEach(() => { + if (projectRoot) { + rmSync(projectRoot, { recursive: true, force: true }); + projectRoot = null; + } + }); + + it('resolves php use directives via composer.json PSR-4 autoload', () => { + projectRoot = setupTree({ + 'composer.json': JSON.stringify({ + autoload: { + 'psr-4': { + 'App\\': 'src/', + 'App\\Tests\\': 'tests/', + }, + }, + }), + 'src/Http/Controller.php': + ` src/Models/User.php (App\ -> src/) + // App\Util\Logger -> src/Util/Logger.php + // Symfony\... -> external (no autoload entry) + expect(result.output.importMap['src/Http/Controller.php']).toEqual([ + 'src/Models/User.php', + 'src/Util/Logger.php', + ]); + }); + + it('resolves per-package composer.json PSR-4 without cross-package leakage', () => { + // Multi-package Composer layout (think: Symfony or Laravel-style mono + // with package-scoped autoload). Each package's composer.json declares + // its own PSR-4 namespace. Cross-package `use` should NOT resolve via + // a sibling's autoload — that's exactly the silent miscompile the + // single-root assumption would introduce. + projectRoot = setupTree({ + 'packages/foo/composer.json': JSON.stringify({ + autoload: { 'psr-4': { 'App\\Foo\\': 'src/' } }, + }), + 'packages/foo/src/X.php': + ` packages/foo/src/Y.php only. + // The App\Bar\Z `use` is unresolvable from foo's perspective (foo's + // composer.json has no App\Bar entry). 
+ expect(result.output.importMap['packages/foo/src/X.php']).toEqual([ + 'packages/foo/src/Y.php', + ]); + expect(result.output.importMap['packages/foo/src/X.php']).not.toContain( + 'packages/bar/src/Z.php', + ); + }); + + // Regression: Composer's fallback autoload mapping `"psr-4": {"": "src/"}` + // means "any namespace resolves under src/". Earlier code appended `\` to + // every prefix (so `""` became `"\\"`, matching nothing) AND the + // longest-prefix loop initialized bestPrefix to `''` and required + // strict `>` — so even when the empty prefix WAS preserved it could + // never win. Both fixes are required for this test to pass. Caught by + // Codex review on PR #204. + it('resolves PSR-4 empty-prefix fallback ("": "src/")', () => { + projectRoot = setupTree({ + 'composer.json': JSON.stringify({ + autoload: { + 'psr-4': { '': 'src/' }, + }, + }), + 'src/Foo/Bar.php': + ` `src/Foo/Baz.php` directly. + expect(result.output.importMap['src/Foo/Bar.php']).toEqual([ + 'src/Foo/Baz.php', + ]); + }); +}); + +describe('extract-import-map.mjs — Rust resolver', () => { + let projectRoot; + + afterEach(() => { + if (projectRoot) { + rmSync(projectRoot, { recursive: true, force: true }); + projectRoot = null; + } + }); + + it('resolves rust use crate:: and mod declarations', () => { + projectRoot = setupTree({ + 'Cargo.toml': `[package]\nname = "demo"\nversion = "0.1.0"\nedition = "2021"\n`, + 'src/lib.rs': + `pub mod auth;\npub mod db;\n\nuse crate::auth::login;\nuse crate::db::query;\n\nfn boot() { login(); query(); }\n`, + 'src/auth.rs': + `pub fn login() { }\n`, + 'src/db.rs': + `pub fn query() { }\n`, + }); + + const result = runScript(projectRoot, { + projectRoot, + files: [ + { path: 'Cargo.toml', language: 'toml', fileCategory: 'config' }, + { path: 'src/lib.rs', language: 'rust', fileCategory: 'code' }, + { path: 'src/auth.rs', language: 'rust', fileCategory: 'code' }, + { path: 'src/db.rs', language: 'rust', fileCategory: 'code' }, + ], + }); + + 
expect(result.status).toBe(0); + // `pub mod auth;` and `pub mod db;` declare submodules in the same dir. + // `use crate::auth::login;` and `use crate::db::query;` resolve via crate src. + expect(result.output.importMap['src/lib.rs']).toEqual([ + 'src/auth.rs', + 'src/db.rs', + ]); + }); + + it('resolves rust super:: walking up one directory', () => { + projectRoot = setupTree({ + 'Cargo.toml': `[package]\nname = "demo"\nversion = "0.1.0"\n`, + 'src/lib.rs': `pub mod inner;\npub mod sibling;\n`, + 'src/sibling.rs': `pub fn hi() { }\n`, + 'src/inner/mod.rs': `use super::sibling::hi;\nfn boot() { hi(); }\n`, + }); + + const result = runScript(projectRoot, { + projectRoot, + files: [ + { path: 'Cargo.toml', language: 'toml', fileCategory: 'config' }, + { path: 'src/lib.rs', language: 'rust', fileCategory: 'code' }, + { path: 'src/sibling.rs', language: 'rust', fileCategory: 'code' }, + { path: 'src/inner/mod.rs', language: 'rust', fileCategory: 'code' }, + ], + }); + + expect(result.status).toBe(0); + expect(result.output.importMap['src/inner/mod.rs']).toEqual(['src/sibling.rs']); + }); +}); + +describe('extract-import-map.mjs — C/C++ resolver', () => { + let projectRoot; + + afterEach(() => { + if (projectRoot) { + rmSync(projectRoot, { recursive: true, force: true }); + projectRoot = null; + } + }); + + it('resolves c/c++ #include probes (relative + include/ + src/)', () => { + projectRoot = setupTree({ + 'src/main.cpp': + `#include \n#include "util.h"\n#include "helpers/log.h"\n\nint main() { return 0; }\n`, + 'src/util.h': + `#ifndef UTIL_H\n#define UTIL_H\nvoid util();\n#endif\n`, + 'src/helpers/log.h': + `#pragma once\nvoid log_msg(const char*);\n`, + }); + + const result = runScript(projectRoot, { + projectRoot, + files: [ + { path: 'src/main.cpp', language: 'cpp', fileCategory: 'code' }, + { path: 'src/util.h', language: 'cpp', fileCategory: 'code' }, + { path: 'src/helpers/log.h', language: 'cpp', fileCategory: 'code' }, + ], + }); + + 
expect(result.status).toBe(0); + // iostream is external; util.h resolves relative to importer dir; + // helpers/log.h also relative. + expect(result.output.importMap['src/main.cpp']).toEqual([ + 'src/helpers/log.h', + 'src/util.h', + ]); + }); + + it('resolves c #include via project-level include/ fallback', () => { + projectRoot = setupTree({ + 'src/app.c': + `#include "config.h"\n#include "shared.h"\n\nint main() { return 0; }\n`, + 'include/config.h': `#pragma once\n`, + 'src/shared.h': `#pragma once\n`, + }); + + const result = runScript(projectRoot, { + projectRoot, + files: [ + { path: 'src/app.c', language: 'c', fileCategory: 'code' }, + { path: 'include/config.h', language: 'c', fileCategory: 'code' }, + { path: 'src/shared.h', language: 'c', fileCategory: 'code' }, + ], + }); + + expect(result.status).toBe(0); + expect(result.output.importMap['src/app.c']).toEqual([ + 'include/config.h', + 'src/shared.h', + ]); + }); +}); + +describe('extract-import-map.mjs — per-file failure resilience', () => { + let projectRoot; + + afterEach(() => { + if (projectRoot) { + rmSync(projectRoot, { recursive: true, force: true }); + projectRoot = null; + } + }); + + it('continues processing when a file is missing from disk', () => { + // Build a project with one real file and one declared-but-missing file. + // The missing file is still in the input list (the project-scanner + // discovered it before something deleted it), so the resolver must + // emit a Warning: line and set importMap[] = [] without + // aborting the whole script. 
+ projectRoot = setupTree({ + 'src/real.ts': `import { thing } from './other';\nexport const x = 1;\n`, + 'src/other.ts': `export const thing = 1;\n`, + }); + + const result = runScript(projectRoot, { + projectRoot, + files: [ + { path: 'src/real.ts', language: 'typescript', fileCategory: 'code' }, + { path: 'src/other.ts', language: 'typescript', fileCategory: 'code' }, + // Declared but does not exist on disk + { path: 'src/missing.ts', language: 'typescript', fileCategory: 'code' }, + ], + }); + + expect(result.status).toBe(0); + // Script completed cleanly + expect(result.output.scriptCompleted).toBe(true); + // Real files resolved + expect(result.output.importMap['src/real.ts']).toEqual(['src/other.ts']); + expect(result.output.importMap['src/other.ts']).toEqual([]); + // Missing file is in importMap with [] + expect(result.output.importMap['src/missing.ts']).toEqual([]); + // A warning was emitted on stderr for the missing file + expect(result.stderr).toMatch(/Warning: extract-import-map: import resolution failed for src\/missing\.ts/); + expect(result.stderr).toMatch(/importMap\[src\/missing\.ts\]=\[\]/); + }); + + it('emits a stats summary on stderr', () => { + projectRoot = setupTree({ + 'a.ts': `import { b } from './b';\n`, + 'b.ts': `export const b = 1;\n`, + }); + + const result = runScript(projectRoot, { + projectRoot, + files: [ + { path: 'a.ts', language: 'typescript', fileCategory: 'code' }, + { path: 'b.ts', language: 'typescript', fileCategory: 'code' }, + ], + }); + + expect(result.status).toBe(0); + expect(result.stderr).toMatch( + /extract-import-map: filesScanned=2 filesWithImports=1 totalEdges=1/, + ); + }); +}); + +describe('extract-import-map.mjs — output schema invariants', () => { + let projectRoot; + + afterEach(() => { + if (projectRoot) { + rmSync(projectRoot, { recursive: true, force: true }); + projectRoot = null; + } + }); + + it('every input file appears in importMap (even with zero imports)', () => { + projectRoot = setupTree({ + 
'a.ts': `// no imports\nexport const a = 1;\n`, + 'README.md': '# x\n', + 'Dockerfile': 'FROM node:22\n', + 'package.json': '{}\n', + }); + + const result = runScript(projectRoot, { + projectRoot, + files: [ + { path: 'a.ts', language: 'typescript', fileCategory: 'code' }, + { path: 'README.md', language: 'markdown', fileCategory: 'docs' }, + { path: 'Dockerfile', language: 'dockerfile', fileCategory: 'infra' }, + { path: 'package.json', language: 'json', fileCategory: 'config' }, + ], + }); + + expect(result.status).toBe(0); + expect(Object.keys(result.output.importMap).sort()).toEqual([ + 'Dockerfile', 'README.md', 'a.ts', 'package.json', + ]); + for (const arr of Object.values(result.output.importMap)) { + expect(Array.isArray(arr)).toBe(true); + } + }); + + it('produces deterministic output across runs', () => { + projectRoot = setupTree({ + 'src/a.ts': `import { b } from './b';\nimport { c } from './c';\n`, + 'src/b.ts': `export const b = 1;\n`, + 'src/c.ts': `export const c = 2;\n`, + }); + + const input = { + projectRoot, + files: [ + { path: 'src/a.ts', language: 'typescript', fileCategory: 'code' }, + { path: 'src/b.ts', language: 'typescript', fileCategory: 'code' }, + { path: 'src/c.ts', language: 'typescript', fileCategory: 'code' }, + ], + }; + + const r1 = runScript(projectRoot, input); + const r2 = runScript(projectRoot, input); + expect(r1.status).toBe(0); + expect(r2.status).toBe(0); + expect(JSON.stringify(r1.output)).toBe(JSON.stringify(r2.output)); + }); +}); + +// =========================================================================== +// Hardening regression tests +// +// These tests cover the failure modes called out in code review: +// - graceful tree-sitter init failure (IMPORTANT 1) +// - tsconfig parse resilience (IMPORTANT 2) +// - comment-aware import regexes for JS/Ruby/Rust (MINOR 4) +// - tighter Kotlin import grammar (MINOR 5) +// - multi-match Gradle/Maven dotted-FQN behavior (MINOR 6) +// - composer.json malformed warning 
(MINOR 7) +// - Rust 'use crate::' with no crate root — one-time warning (MINOR 9) +// =========================================================================== + +describe('extract-import-map.mjs — regex comment-strip resilience', () => { + let projectRoot; + + afterEach(() => { + if (projectRoot) { + rmSync(projectRoot, { recursive: true, force: true }); + projectRoot = null; + } + }); + + it('JS require() inside a // line comment is NOT picked up', () => { + projectRoot = setupTree({ + 'src/index.js': + `// require('./fake'); <- commented out, must be ignored\n` + + `/* require('./alsofake'); also commented */\n` + + `const real = require('./real');\n`, + 'src/real.js': `module.exports = { x: 1 };\n`, + 'src/fake.js': `module.exports = { fake: true };\n`, + 'src/alsofake.js': `module.exports = { fake: true };\n`, + }); + + const result = runScript(projectRoot, { + projectRoot, + files: [ + { path: 'src/index.js', language: 'javascript', fileCategory: 'code' }, + { path: 'src/real.js', language: 'javascript', fileCategory: 'code' }, + { path: 'src/fake.js', language: 'javascript', fileCategory: 'code' }, + { path: 'src/alsofake.js', language: 'javascript', fileCategory: 'code' }, + ], + }); + + expect(result.status).toBe(0); + // Only the real require survives; both commented-out requires are dropped. 
+ expect(result.output.importMap['src/index.js']).toEqual(['src/real.js']); + expect(result.output.importMap['src/index.js']).not.toContain('src/fake.js'); + expect(result.output.importMap['src/index.js']).not.toContain('src/alsofake.js'); + }); + + it('Ruby require inside a # line comment is NOT picked up', () => { + projectRoot = setupTree({ + 'app.rb': + `# require 'fake' -- commented out, must be ignored\n` + + `require 'real'\n`, + 'lib/real.rb': `module Real; end\n`, + 'lib/fake.rb': `module Fake; end\n`, + }); + + const result = runScript(projectRoot, { + projectRoot, + files: [ + { path: 'app.rb', language: 'ruby', fileCategory: 'code' }, + { path: 'lib/real.rb', language: 'ruby', fileCategory: 'code' }, + { path: 'lib/fake.rb', language: 'ruby', fileCategory: 'code' }, + ], + }); + + expect(result.status).toBe(0); + expect(result.output.importMap['app.rb']).toEqual(['lib/real.rb']); + expect(result.output.importMap['app.rb']).not.toContain('lib/fake.rb'); + }); + + it('Rust mod declarations inside // and /* */ comments are NOT picked up', () => { + projectRoot = setupTree({ + 'Cargo.toml': `[package]\nname = "demo"\nversion = "0.1.0"\n`, + 'src/lib.rs': + `// mod fake_line; <- commented out\n` + + `/* mod fake_block; */\n` + + `pub mod real;\n`, + 'src/real.rs': `pub fn r() { }\n`, + 'src/fake_line.rs': `pub fn f() { }\n`, + 'src/fake_block.rs': `pub fn f() { }\n`, + }); + + const result = runScript(projectRoot, { + projectRoot, + files: [ + { path: 'Cargo.toml', language: 'toml', fileCategory: 'config' }, + { path: 'src/lib.rs', language: 'rust', fileCategory: 'code' }, + { path: 'src/real.rs', language: 'rust', fileCategory: 'code' }, + { path: 'src/fake_line.rs', language: 'rust', fileCategory: 'code' }, + { path: 'src/fake_block.rs', language: 'rust', fileCategory: 'code' }, + ], + }); + + expect(result.status).toBe(0); + expect(result.output.importMap['src/lib.rs']).toEqual(['src/real.rs']); + 
expect(result.output.importMap['src/lib.rs']).not.toContain('src/fake_line.rs'); + expect(result.output.importMap['src/lib.rs']).not.toContain('src/fake_block.rs'); + }); +}); + +describe('extract-import-map.mjs — Kotlin import grammar', () => { + let projectRoot; + + afterEach(() => { + if (projectRoot) { + rmSync(projectRoot, { recursive: true, force: true }); + projectRoot = null; + } + }); + + it('does NOT phantom-resolve `import ...` or `import .foo`', () => { + // Pathological inputs the tightened regex should reject. If they slipped + // through, the dotted resolver would turn '...' into '/.../' lookups + // or '.foo' into '/foo.kt' — both bogus. + projectRoot = setupTree({ + 'src/Main.kt': + `package com.example\n\n` + + `import ...\n` + // garbage line + `import .foo\n` + // leading-dot garbage line + `import com.example.real.Bar\n`, + 'src/com/example/real/Bar.kt': + `package com.example.real\nclass Bar\n`, + }); + + const result = runScript(projectRoot, { + projectRoot, + files: [ + { path: 'src/Main.kt', language: 'kotlin', fileCategory: 'code' }, + { path: 'src/com/example/real/Bar.kt', language: 'kotlin', fileCategory: 'code' }, + ], + }); + + expect(result.status).toBe(0); + // Only the valid import resolves. The garbage lines must not produce + // phantom entries. + expect(result.output.importMap['src/Main.kt']).toEqual([ + 'src/com/example/real/Bar.kt', + ]); + }); +}); + +describe('extract-import-map.mjs — multi-source-root dotted FQN (Gradle/Maven)', () => { + let projectRoot; + + afterEach(() => { + if (projectRoot) { + rmSync(projectRoot, { recursive: true, force: true }); + projectRoot = null; + } + }); + + it('returns BOTH matches when a Java FQN suffix exists in two source roots', () => { + // Multi-module Gradle layout: two `Bar.java` files both at .../com/foo/Bar.java + // but rooted in different source trees. The resolver intentionally returns + // both so the structural graph reflects every plausible target. 
+ projectRoot = setupTree({ + 'src/main/java/com/example/App.java': + `package com.example;\nimport com.foo.Bar;\npublic class App { }\n`, + 'src/main/java/com/foo/Bar.java': + `package com.foo;\npublic class Bar { }\n`, + 'lib/src/main/java/com/foo/Bar.java': + `package com.foo;\npublic class Bar { }\n`, + }); + + const result = runScript(projectRoot, { + projectRoot, + files: [ + { path: 'src/main/java/com/example/App.java', language: 'java', fileCategory: 'code' }, + { path: 'src/main/java/com/foo/Bar.java', language: 'java', fileCategory: 'code' }, + { path: 'lib/src/main/java/com/foo/Bar.java', language: 'java', fileCategory: 'code' }, + ], + }); + + expect(result.status).toBe(0); + // Both source-root candidates appear, sorted via localeCompare. + expect(result.output.importMap['src/main/java/com/example/App.java']).toEqual([ + 'lib/src/main/java/com/foo/Bar.java', + 'src/main/java/com/foo/Bar.java', + ]); + }); +}); + +describe('extract-import-map.mjs — composer.json malformed', () => { + let projectRoot; + + afterEach(() => { + if (projectRoot) { + rmSync(projectRoot, { recursive: true, force: true }); + projectRoot = null; + } + }); + + it('emits a Warning: and PHP imports fall back to empty when composer.json is broken', () => { + projectRoot = setupTree({ + 'composer.json': '{ "autoload": { "psr-4": { "App\\\\": "src/" }, ', // unterminated + 'src/Http/Controller.php': + ` { + let projectRoot; + + afterEach(() => { + if (projectRoot) { + rmSync(projectRoot, { recursive: true, force: true }); + projectRoot = null; + } + }); + + it('emits a Warning: when tsconfig.json is malformed and falls back to no aliases', () => { + projectRoot = setupTree({ + 'tsconfig.json': '{ "compilerOptions": { "baseUrl": ".", ', // unterminated + 'src/index.ts': + `import { foo } from '@/utils';\nimport { bar } from './sibling';\n`, + 'src/sibling.ts': `export const bar = 1;\n`, + 'src/utils.ts': `export const foo = 1;\n`, + }); + + const result = runScript(projectRoot, { + 
projectRoot, + files: [ + { path: 'tsconfig.json', language: 'json', fileCategory: 'config' }, + { path: 'src/index.ts', language: 'typescript', fileCategory: 'code' }, + { path: 'src/sibling.ts', language: 'typescript', fileCategory: 'code' }, + { path: 'src/utils.ts', language: 'typescript', fileCategory: 'code' }, + ], + }); + + expect(result.status).toBe(0); + expect(result.stderr).toMatch( + /Warning: extract-import-map: tsconfig\.json at .* failed to parse/, + ); + // Phrased "from this config" in the plural-tsconfigs implementation + // because per-file walk-up now identifies the specific bad tsconfig. + expect(result.stderr).toMatch(/path aliases.*will not be applied/); + // Aliased import unresolved; relative import still resolves. + expect(result.output.importMap['src/index.ts']).toEqual(['src/sibling.ts']); + }); + + it('falls back to raw-text parse when a paths value contains "//" that the stripper would damage', () => { + // tsconfig with NO comments but a string literal containing "//". The + // naive stripper would chew the second `//` away and break the JSON; + // the raw-text fallback should rescue the parse. + const tsconfigRaw = `{ + "compilerOptions": { + "baseUrl": ".", + "paths": { + "@scheme//foo/*": ["src/foo/*"] + } + } +} +`; + projectRoot = setupTree({ + 'tsconfig.json': tsconfigRaw, + 'src/index.ts': `import { x } from '@scheme//foo/bar';\n`, + 'src/foo/bar.ts': `export const x = 1;\n`, + }); + + const result = runScript(projectRoot, { + projectRoot, + files: [ + { path: 'tsconfig.json', language: 'json', fileCategory: 'config' }, + { path: 'src/index.ts', language: 'typescript', fileCategory: 'code' }, + { path: 'src/foo/bar.ts', language: 'typescript', fileCategory: 'code' }, + ], + }); + + expect(result.status).toBe(0); + // Either path: the stripper damages the string but the raw retry rescues, + // OR the stripper happens not to damage it. Either way, no warning fires + // and the alias must resolve. 
+ expect(result.stderr).not.toMatch(/tsconfig\.json .* failed to parse/); + expect(result.output.importMap['src/index.ts']).toEqual(['src/foo/bar.ts']); + }); +}); + +describe('extract-import-map.mjs — Rust crate root missing', () => { + let projectRoot; + + afterEach(() => { + if (projectRoot) { + rmSync(projectRoot, { recursive: true, force: true }); + projectRoot = null; + } + }); + + it('emits a one-time Warning: per file when use crate:: has no crate root', () => { + // A Rust file that uses `crate::` but has neither src/lib.rs nor + // src/main.rs anywhere up its tree. Two `use crate::` statements should + // produce ONE warning, not two. + projectRoot = setupTree({ + 'Cargo.toml': `[package]\nname = "demo"\nversion = "0.1.0"\n`, + // No src/lib.rs and no src/main.rs — but two `use crate::` calls. + 'app/something.rs': + `use crate::auth::login;\nuse crate::db::query;\nfn boot() { }\n`, + }); + + const result = runScript(projectRoot, { + projectRoot, + files: [ + { path: 'Cargo.toml', language: 'toml', fileCategory: 'config' }, + { path: 'app/something.rs', language: 'rust', fileCategory: 'code' }, + ], + }); + + expect(result.status).toBe(0); + // Importer file gets the warning exactly once even though there are two + // unresolvable `use crate::` statements. + const crateRootWarnings = result.stderr + .split('\n') + .filter(l => l.includes('no crate root')); + expect(crateRootWarnings).toHaveLength(1); + expect(crateRootWarnings[0]).toMatch( + /Warning: extract-import-map: Rust file app\/something\.rs has 'use crate::' but no crate root/, + ); + // And the importMap stays empty for that file. 
+ expect(result.output.importMap['app/something.rs']).toEqual([]); + }); +}); + +describe('extract-import-map.mjs — tree-sitter init graceful failure', () => { + let projectRoot; + + afterEach(() => { + if (projectRoot) { + rmSync(projectRoot, { recursive: true, force: true }); + projectRoot = null; + } + }); + + it('emits a Warning: and produces empty importMap entries when tree-sitter init throws', () => { + // Force tree-sitter init to fail by intercepting the `web-tree-sitter` + // module load with an ESM loader hook. This simulates the real-world + // failure mode where the WASM grammar binaries are missing or + // inaccessible (cache eviction, restricted sandbox, etc.). + projectRoot = setupTree({ + 'src/index.ts': `import { x } from './lib';\nexport const y = x;\n`, + 'src/lib.ts': `export const x = 1;\n`, + }); + + // Write the loader hook + register module to the temp project root. + const hookPath = join(projectRoot, 'ua-eim-fail-hook.mjs'); + const loaderPath = join(projectRoot, 'ua-eim-fail-loader.mjs'); + writeFileSync( + hookPath, + `export async function resolve(specifier, ctx, nextResolve) {\n` + + ` if (specifier === 'web-tree-sitter') {\n` + + ` throw new Error('synthetic: web-tree-sitter unavailable in test');\n` + + ` }\n` + + ` return nextResolve(specifier, ctx);\n` + + `}\n`, + 'utf-8', + ); + writeFileSync( + loaderPath, + `import { register } from 'node:module';\n` + + `import { pathToFileURL } from 'node:url';\n` + + `register(pathToFileURL(${JSON.stringify(hookPath)}).href);\n`, + 'utf-8', + ); + + const result = runScript( + projectRoot, + { + projectRoot, + files: [ + { path: 'src/index.ts', language: 'typescript', fileCategory: 'code' }, + { path: 'src/lib.ts', language: 'typescript', fileCategory: 'code' }, + ], + }, + ['--import', loaderPath], + ); + + expect(result.status).toBe(0); + // Script completed cleanly with the documented degraded output. 
+ expect(result.output.scriptCompleted).toBe(true); + expect(result.stderr).toMatch( + /Warning: extract-import-map: tree-sitter init failed/, + ); + expect(result.stderr).toMatch(/structural graph will have no import edges/); + // Both code files get empty importMap entries. + expect(result.output.importMap['src/index.ts']).toEqual([]); + expect(result.output.importMap['src/lib.ts']).toEqual([]); + // Stats reflect the degraded run: no edges, no files with imports. + expect(result.output.stats.filesScanned).toBe(2); + expect(result.output.stats.filesWithImports).toBe(0); + expect(result.output.stats.totalEdges).toBe(0); + }); +}); diff --git a/understand-anything-plugin/skills/understand/test_merge_batch_graphs.py b/tests/skill/understand/test_merge_batch_graphs.py similarity index 79% rename from understand-anything-plugin/skills/understand/test_merge_batch_graphs.py rename to tests/skill/understand/test_merge_batch_graphs.py index ada2b76..7239bfe 100644 --- a/understand-anything-plugin/skills/understand/test_merge_batch_graphs.py +++ b/tests/skill/understand/test_merge_batch_graphs.py @@ -2,8 +2,8 @@ """ test_merge_batch_graphs.py — Tests for the deterministic tested_by linker. -Run from this directory: - python -m unittest test_merge_batch_graphs.py -v +Run from the repo root: + python -m unittest tests.skill.understand.test_merge_batch_graphs -v """ from __future__ import annotations @@ -20,7 +20,14 @@ from typing import Any # directly. Load it via importlib so we can call its module-level helpers. 
_HERE = Path(__file__).resolve().parent -_MODULE_PATH = _HERE / "merge-batch-graphs.py" +_REPO_ROOT = _HERE.parent.parent.parent +_MODULE_PATH = ( + _REPO_ROOT + / "understand-anything-plugin" + / "skills" + / "understand" + / "merge-batch-graphs.py" +) def _load_module() -> Any: @@ -941,5 +948,240 @@ class MergeEdgeDirectionTests(unittest.TestCase): self.assertEqual(edges[0]["weight"], 0.9) +# ── Multi-part batch handling ───────────────────────────────────────────── + + +class TestMultiPart(unittest.TestCase): + """End-to-end tests for batch--part-.json input handling. + + These tests invoke merge-batch-graphs.py as a subprocess in a temp + directory so we exercise the full path: glob → load → merge → write. + """ + + def setUp(self) -> None: + import tempfile + self.tmp = Path(tempfile.mkdtemp(prefix="ua-mbg-")) + self.intermediate = self.tmp / ".understand-anything" / "intermediate" + self.intermediate.mkdir(parents=True, exist_ok=True) + + def tearDown(self) -> None: + import shutil + shutil.rmtree(self.tmp, ignore_errors=True) + + def _write_batch(self, name: str, nodes: list, edges: list) -> None: + import json as _j + (self.intermediate / name).write_text( + _j.dumps({"nodes": nodes, "edges": edges}), + encoding="utf-8", + ) + + def _run_merge(self) -> tuple[int, str, dict]: + import subprocess + import json as _j + result = subprocess.run( + ["python3", str(_MODULE_PATH), str(self.tmp)], + capture_output=True, text=True, + ) + out_path = self.intermediate / "assembled-graph.json" + assembled = _j.loads(out_path.read_text()) if out_path.exists() else {} + return result.returncode, result.stderr, assembled + + def test_two_parts_of_one_logical_batch_merge(self) -> None: + self._write_batch("batch-1-part-1.json", + [_file_node("src/a.ts")], + [{"source": "file:src/a.ts", "target": "file:src/b.ts", + "type": "imports", "direction": "forward", "weight": 0.7}]) + self._write_batch("batch-1-part-2.json", + [_file_node("src/b.ts")], + []) + rc, _stderr, assembled 
= self._run_merge() + self.assertEqual(rc, 0) + node_ids = {n["id"] for n in assembled["nodes"]} + self.assertEqual(node_ids, {"file:src/a.ts", "file:src/b.ts"}) + # Cross-part edge survived + edge_keys = {(e["source"], e["target"], e["type"]) for e in assembled["edges"]} + self.assertIn( + ("file:src/a.ts", "file:src/b.ts", "imports"), edge_keys) + + def test_three_parts_of_one_logical_batch_merge(self) -> None: + for k, path in enumerate(["src/a.ts", "src/b.ts", "src/c.ts"], start=1): + self._write_batch(f"batch-1-part-{k}.json", + [_file_node(path)], []) + rc, _stderr, assembled = self._run_merge() + self.assertEqual(rc, 0) + node_ids = {n["id"] for n in assembled["nodes"]} + self.assertEqual(node_ids, + {"file:src/a.ts", "file:src/b.ts", "file:src/c.ts"}) + + def test_malformed_part_is_skipped_with_warning(self) -> None: + (self.intermediate / "batch-1-part-1.json").write_text( + "{ this is not valid json", encoding="utf-8") + self._write_batch("batch-1-part-2.json", + [_file_node("src/b.ts")], []) + rc, stderr, assembled = self._run_merge() + self.assertEqual(rc, 0) + # The skip warning is from existing load_batch logic + self.assertIn("skipping batch-1-part-1.json", stderr) + # part-2 content still made it in + node_ids = {n["id"] for n in assembled["nodes"]} + self.assertEqual(node_ids, {"file:src/b.ts"}) + + def test_mixed_single_and_multi_part(self) -> None: + self._write_batch("batch-1.json", + [_file_node("src/single.ts")], []) + self._write_batch("batch-2-part-1.json", + [_file_node("src/multi-a.ts")], []) + self._write_batch("batch-2-part-2.json", + [_file_node("src/multi-b.ts")], []) + self._write_batch("batch-3.json", + [_file_node("src/another-single.ts")], []) + rc, _stderr, assembled = self._run_merge() + self.assertEqual(rc, 0) + node_ids = {n["id"] for n in assembled["nodes"]} + self.assertEqual(node_ids, { + "file:src/single.ts", "file:src/multi-a.ts", + "file:src/multi-b.ts", "file:src/another-single.ts", + }) + + def 
test_missing_part_emits_warning(self) -> None: + # parts {2, 3} present, part-1 missing + self._write_batch("batch-1-part-2.json", + [_file_node("src/b.ts")], []) + self._write_batch("batch-1-part-3.json", + [_file_node("src/c.ts")], []) + rc, stderr, assembled = self._run_merge() + self.assertEqual(rc, 0) + self.assertRegex(stderr, + r"Warning: merge: batch 1 has parts \[2, 3\] but " + r"missing part \[1\] — possible truncated write") + + def test_stderr_report_format(self) -> None: + self._write_batch("batch-1.json", [_file_node("src/a.ts")], []) + self._write_batch("batch-2-part-1.json", [_file_node("src/b.ts")], []) + self._write_batch("batch-2-part-2.json", [_file_node("src/c.ts")], []) + rc, stderr, _assembled = self._run_merge() + self.assertEqual(rc, 0) + # 3 files on disk, 2 logical batches, 1 multi-part + self.assertIn( + "Found 3 batch files (2 logical batches, 1 multi-part)", stderr) + + +# ── Unrecognized batch filename handling ─────────────────────────────────── + + +class TestUnrecognizedBatchFilename(unittest.TestCase): + """File-analyzer fuses multiple batches into one output (e.g., + `batch-fused-8-13.json`, `batch-8-13.json`) — the merge script's regex + requires `batch-.json` or `batch--part-.json` and would + otherwise silently drop the contents. The script must warn loudly and + surface the drop in its report so the downstream review step catches it. 
+ """ + + def setUp(self) -> None: + import tempfile + self.tmp = Path(tempfile.mkdtemp(prefix="ua-mbg-unrec-")) + self.intermediate = self.tmp / ".understand-anything" / "intermediate" + self.intermediate.mkdir(parents=True, exist_ok=True) + + def tearDown(self) -> None: + import shutil + shutil.rmtree(self.tmp, ignore_errors=True) + + def _write_batch(self, name: str, nodes: list, edges: list) -> None: + import json as _j + (self.intermediate / name).write_text( + _j.dumps({"nodes": nodes, "edges": edges}), + encoding="utf-8", + ) + + def _run_merge(self) -> tuple[int, str, dict]: + import subprocess + import json as _j + result = subprocess.run( + ["python3", str(_MODULE_PATH), str(self.tmp)], + capture_output=True, text=True, + ) + out_path = self.intermediate / "assembled-graph.json" + assembled = _j.loads(out_path.read_text()) if out_path.exists() else {} + return result.returncode, result.stderr, assembled + + def test_fused_filename_emits_stderr_warning(self) -> None: + # `batch-fused-3-5.json` does not match the merge regex — + # script must warn on stderr (not silently drop). + self._write_batch("batch-1.json", [_file_node("src/a.ts")], []) + self._write_batch("batch-2.json", [_file_node("src/b.ts")], []) + self._write_batch( + "batch-fused-3-5.json", + [_file_node("src/c.ts"), _file_node("src/d.ts"), _file_node("src/e.ts")], + [], + ) + rc, stderr, _assembled = self._run_merge() + self.assertEqual(rc, 0) + self.assertIn("Warning: merge-batch-graphs:", stderr) + self.assertIn("unrecognized filenames", stderr) + self.assertIn("batch-fused-3-5.json", stderr) + # Remediation hint must be present so users know what to fix. + self.assertIn("file-analyzer", stderr) + self.assertIn("batch-.json", stderr) + + def test_fused_filename_surfaces_in_report(self) -> None: + # The merge report (printed after the per-file load lines) must + # also flag the drop so Phase 3 review picks it up. 
+ self._write_batch("batch-1.json", [_file_node("src/a.ts")], []) + self._write_batch( + "batch-fused-2-4.json", [_file_node("src/x.ts")], [], + ) + rc, stderr, _assembled = self._run_merge() + self.assertEqual(rc, 0) + # "dropped N batch file(s) with unrecognized filenames" appears in the + # report section (printed after "Output: ..." line). + self.assertIn("dropped 1 batch file(s) with unrecognized filenames", stderr) + self.assertIn("batch-fused-2-4.json", stderr) + self.assertIn( + "every node/edge in these files was excluded from the final graph", + stderr, + ) + + def test_recognized_batches_still_loaded(self) -> None: + # With both recognized and unrecognized files present, recognized + # ones must still produce a valid assembled graph. + self._write_batch("batch-1.json", [_file_node("src/a.ts")], []) + self._write_batch("batch-2.json", [_file_node("src/b.ts")], []) + self._write_batch( + "batch-fused-3-5.json", + [_file_node("src/dropped-c.ts")], + [], + ) + rc, _stderr, assembled = self._run_merge() + self.assertEqual(rc, 0) + node_ids = {n["id"] for n in assembled["nodes"]} + # batch-1 + batch-2 survive + self.assertIn("file:src/a.ts", node_ids) + self.assertIn("file:src/b.ts", node_ids) + # batch-fused-3-5.json content is excluded + self.assertNotIn("file:src/dropped-c.ts", node_ids) + self.assertEqual(node_ids, {"file:src/a.ts", "file:src/b.ts"}) + + def test_range_filename_also_unrecognized(self) -> None: + # A bare range like `batch-8-13.json` is just as broken as + # `batch-fused-8-13.json` — both must be flagged. The regex + # `batch-(\d+)(?:-part-(\d+))?\.json` requires the literal + # `-part-` separator before a second number. 
+ self._write_batch("batch-1.json", [_file_node("src/a.ts")], []) + self._write_batch( + "batch-8-13.json", + [_file_node("src/x.ts"), _file_node("src/y.ts")], + [], + ) + rc, stderr, assembled = self._run_merge() + self.assertEqual(rc, 0) + self.assertIn("Warning: merge-batch-graphs:", stderr) + self.assertIn("batch-8-13.json", stderr) + # Content is dropped + node_ids = {n["id"] for n in assembled["nodes"]} + self.assertNotIn("file:src/x.ts", node_ids) + self.assertNotIn("file:src/y.ts", node_ids) + + if __name__ == "__main__": unittest.main() diff --git a/tests/skill/understand/test_scan_project.test.mjs b/tests/skill/understand/test_scan_project.test.mjs new file mode 100644 index 0000000..65d96c8 --- /dev/null +++ b/tests/skill/understand/test_scan_project.test.mjs @@ -0,0 +1,738 @@ +import { describe, it, expect, afterEach } from 'vitest'; +import { + mkdtempSync, + mkdirSync, + writeFileSync, + readFileSync, + rmSync, + chmodSync, + existsSync, +} from 'node:fs'; +import { tmpdir } from 'node:os'; +import { join, dirname, resolve } from 'node:path'; +import { spawnSync } from 'node:child_process'; +import { fileURLToPath } from 'node:url'; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const SCRIPT = resolve( + __dirname, + '../../../understand-anything-plugin/skills/understand/scan-project.mjs', +); + +/** + * Build a project tree from a `{ relPath: contents }` object. Creates parent + * directories as needed. Initializes a real git repo so the script's preferred + * `git ls-files` enumeration path is exercised — tests that need the walker + * fallback can set `gitInit=false`. 
+ */ +function setupTree(files, { gitInit = true } = {}) { + const root = mkdtempSync(join(tmpdir(), 'ua-scan-test-')); + for (const [relPath, contents] of Object.entries(files)) { + const abs = join(root, relPath); + mkdirSync(dirname(abs), { recursive: true }); + writeFileSync(abs, contents, 'utf-8'); + } + if (gitInit) { + // `git ls-files -co --exclude-standard` returns BOTH cached and others + // (modulo gitignore), so an `add` is unnecessary for our tests — the + // bare repo init is enough for ls-files to enumerate. + const init = spawnSync('git', ['init', '-q'], { cwd: root, encoding: 'utf-8' }); + if (init.status !== 0) { + // CI without git: continue without it; the walker fallback will fire. + } + } + return root; +} + +/** + * Tracks every temp output dir created by runScript() so the global + * cleanup can sweep them between tests. The output file must live + * OUTSIDE projectRoot because the project's default ignore patterns + * do NOT exclude `.understand-anything/` (the dir is reserved for + * persistent state, not transient scratch). If we wrote inside + * projectRoot, the second call in the determinism test would + * enumerate the first call's output file and produce drift. + */ +const _runScriptOutputDirs = []; + +/** + * Run scan-project.mjs against `projectRoot`. Returns + * { status, stdout, stderr, output } where `output` is the parsed JSON + * written by the script (or null on failure). 
+ */ +function runScript(projectRoot) { + const outputDir = mkdtempSync(join(tmpdir(), 'ua-scan-out-')); + _runScriptOutputDirs.push(outputDir); + const outputPath = join(outputDir, 'scan-output.json'); + const result = spawnSync('node', [SCRIPT, projectRoot, outputPath], { + encoding: 'utf-8', + }); + let output = null; + try { + output = JSON.parse(readFileSync(outputPath, 'utf-8')); + } catch { + /* output missing on hard failure */ + } + return { status: result.status, stdout: result.stdout, stderr: result.stderr, output }; +} + +/** + * Look up the `files[]` entry for a given path. Returns undefined if not + * present — callers should `expect(byPath('x')).toBeDefined()` first. + */ +function byPath(output, path) { + return output.files.find(f => f.path === path); +} + +// Sweep every output dir created during a test back to disk-empty between +// tests. The top-level afterEach fires after each `it()` regardless of which +// describe block it lives in, so a single hook covers the whole file. 
+afterEach(() => { + while (_runScriptOutputDirs.length) { + const d = _runScriptOutputDirs.pop(); + rmSync(d, { recursive: true, force: true }); + } +}); + +describe('scan-project.mjs — language detection', () => { + let projectRoot; + + afterEach(() => { + if (projectRoot) { + rmSync(projectRoot, { recursive: true, force: true }); + projectRoot = null; + } + }); + + it('maps TypeScript/JavaScript extensions to typescript/javascript', () => { + projectRoot = setupTree({ + 'a.ts': 'export const a = 1;\n', + 'b.tsx': 'export const B = () => null;\n', + 'c.js': 'module.exports = {};\n', + 'd.jsx': 'export default () => null;\n', + 'e.mjs': 'export const e = 1;\n', + 'f.cjs': 'module.exports = 1;\n', + }); + const r = runScript(projectRoot); + expect(r.status).toBe(0); + expect(byPath(r.output, 'a.ts').language).toBe('typescript'); + expect(byPath(r.output, 'b.tsx').language).toBe('typescript'); + expect(byPath(r.output, 'c.js').language).toBe('javascript'); + expect(byPath(r.output, 'd.jsx').language).toBe('javascript'); + expect(byPath(r.output, 'e.mjs').language).toBe('javascript'); + expect(byPath(r.output, 'f.cjs').language).toBe('javascript'); + }); + + it('maps Python, Go, Rust, Java, Kotlin, C# to their language ids', () => { + projectRoot = setupTree({ + 'a.py': 'x = 1\n', + 'b.go': 'package main\n', + 'c.rs': 'fn main() {}\n', + 'd.java': 'class D {}\n', + 'e.kt': 'fun main() {}\n', + 'f.cs': 'class F {}\n', + }); + const r = runScript(projectRoot); + expect(r.status).toBe(0); + expect(byPath(r.output, 'a.py').language).toBe('python'); + expect(byPath(r.output, 'b.go').language).toBe('go'); + expect(byPath(r.output, 'c.rs').language).toBe('rust'); + expect(byPath(r.output, 'd.java').language).toBe('java'); + expect(byPath(r.output, 'e.kt').language).toBe('kotlin'); + expect(byPath(r.output, 'f.cs').language).toBe('csharp'); + }); + + it('maps Ruby, PHP, C, C++ to their language ids', () => { + projectRoot = setupTree({ + 'a.rb': 'puts 1\n', + 'b.php': ' { + 
projectRoot = setupTree({ + 'a.html': '\n', + 'b.htm': '\n', + 'c.css': '.a { }\n', + 'd.scss': '$x: 1;\n', + }); + const r = runScript(projectRoot); + expect(r.status).toBe(0); + expect(byPath(r.output, 'a.html').language).toBe('html'); + expect(byPath(r.output, 'b.htm').language).toBe('html'); + expect(byPath(r.output, 'c.css').language).toBe('css'); + expect(byPath(r.output, 'd.scss').language).toBe('css'); + }); + + it('maps configuration formats (YAML, JSON, JSONC, TOML, XML, Markdown) to their language ids', () => { + projectRoot = setupTree({ + 'a.yaml': 'x: 1\n', + 'b.yml': 'x: 1\n', + 'c.json': '{}\n', + 'd.jsonc': '{ /* c */ }\n', + 'e.toml': 'x = 1\n', + 'f.xml': '\n', + 'g.md': '# h\n', + }); + const r = runScript(projectRoot); + expect(r.status).toBe(0); + expect(byPath(r.output, 'a.yaml').language).toBe('yaml'); + expect(byPath(r.output, 'b.yml').language).toBe('yaml'); + expect(byPath(r.output, 'c.json').language).toBe('json'); + expect(byPath(r.output, 'd.jsonc').language).toBe('jsonc'); + expect(byPath(r.output, 'e.toml').language).toBe('toml'); + expect(byPath(r.output, 'f.xml').language).toBe('xml'); + expect(byPath(r.output, 'g.md').language).toBe('markdown'); + }); + + it('maps shell + batch + Dockerfile (no extension) to their language ids', () => { + projectRoot = setupTree({ + 'a.sh': 'echo 1\n', + 'b.bat': '@echo off\n', + Dockerfile: 'FROM node:22\n', + 'Dockerfile.dev': 'FROM node:22\n', + }); + const r = runScript(projectRoot); + expect(r.status).toBe(0); + expect(byPath(r.output, 'a.sh').language).toBe('shell'); + expect(byPath(r.output, 'b.bat').language).toBe('batch'); + expect(byPath(r.output, 'Dockerfile').language).toBe('dockerfile'); + expect(byPath(r.output, 'Dockerfile.dev').language).toBe('dockerfile'); + }); + + it('falls back to "unknown" for files with no extension and no filename match', () => { + projectRoot = setupTree({ + WEIRD_FILE: 'mystery contents\n', + }); + const r = runScript(projectRoot); + 
expect(r.status).toBe(0); + expect(byPath(r.output, 'WEIRD_FILE').language).toBe('unknown'); + }); + + it('falls back to bare extension (without dot) for unknown extensions', () => { + projectRoot = setupTree({ + 'data.weirdext': 'some data\n', + }); + const r = runScript(projectRoot); + expect(r.status).toBe(0); + expect(byPath(r.output, 'data.weirdext').language).toBe('weirdext'); + }); +}); + +describe('scan-project.mjs — category assignment (project-scanner.md Step 4)', () => { + let projectRoot; + + afterEach(() => { + if (projectRoot) { + rmSync(projectRoot, { recursive: true, force: true }); + projectRoot = null; + } + }); + + it('assigns code to TypeScript, JavaScript, Python, Go, Rust source files', () => { + projectRoot = setupTree({ + 'src/a.ts': 'export const a = 1;\n', + 'src/b.py': 'def b(): pass\n', + 'src/c.go': 'package main\n', + 'src/d.rs': 'fn main() {}\n', + }); + const r = runScript(projectRoot); + expect(r.status).toBe(0); + expect(byPath(r.output, 'src/a.ts').fileCategory).toBe('code'); + expect(byPath(r.output, 'src/b.py').fileCategory).toBe('code'); + expect(byPath(r.output, 'src/c.go').fileCategory).toBe('code'); + expect(byPath(r.output, 'src/d.rs').fileCategory).toBe('code'); + }); + + it('assigns config to JSON/YAML/TOML/INI/XML', () => { + projectRoot = setupTree({ + 'package.json': '{}\n', + 'tsconfig.json': '{}\n', + 'pyproject.toml': '[project]\nname = "p"\n', + 'config.yaml': 'x: 1\n', + 'app.ini': '[s]\nk=v\n', + 'data.xml': '\n', + }); + const r = runScript(projectRoot); + expect(r.status).toBe(0); + expect(byPath(r.output, 'package.json').fileCategory).toBe('config'); + expect(byPath(r.output, 'tsconfig.json').fileCategory).toBe('config'); + expect(byPath(r.output, 'pyproject.toml').fileCategory).toBe('config'); + expect(byPath(r.output, 'config.yaml').fileCategory).toBe('config'); + expect(byPath(r.output, 'app.ini').fileCategory).toBe('config'); + expect(byPath(r.output, 'data.xml').fileCategory).toBe('config'); + }); + + 
it('assigns docs to .md / .rst / .txt (but NOT to LICENSE)', () => { + projectRoot = setupTree({ + 'README.md': '# x\n', + 'docs/guide.rst': 'Guide\n=====\n', + 'NOTES.txt': 'notes\n', + LICENSE: 'Apache-2.0\n', + }); + const r = runScript(projectRoot); + expect(r.status).toBe(0); + expect(byPath(r.output, 'README.md').fileCategory).toBe('docs'); + expect(byPath(r.output, 'docs/guide.rst').fileCategory).toBe('docs'); + expect(byPath(r.output, 'NOTES.txt').fileCategory).toBe('docs'); + // LICENSE exception: must NOT be docs. The default ignore filter + // normally drops LICENSE entirely, so we re-include it via + // `!LICENSE` so the category test can fire. + writeFileSync(join(projectRoot, '.understandignore'), '!LICENSE\n'); + const r2 = runScript(projectRoot); + const license = byPath(r2.output, 'LICENSE'); + expect(license).toBeDefined(); + expect(license.fileCategory).not.toBe('docs'); + }); + + it('assigns infra to Dockerfile, docker-compose, .gitlab-ci.yml, .tf, .github/workflows/, Makefile, Jenkinsfile, k8s paths', () => { + projectRoot = setupTree({ + Dockerfile: 'FROM node:22\n', + 'docker-compose.yml': 'services: {}\n', + '.gitlab-ci.yml': 'stages: []\n', + 'infra/main.tf': 'resource "x" "y" {}\n', + '.github/workflows/ci.yml': 'name: ci\n', + Makefile: 'all:\n\t@echo hi\n', + Jenkinsfile: 'pipeline { }\n', + 'k8s/deploy.yaml': 'kind: Deployment\n', + 'kubernetes/svc.yaml': 'kind: Service\n', + 'foo.k8s.yaml': 'kind: ConfigMap\n', + }); + const r = runScript(projectRoot); + expect(r.status).toBe(0); + expect(byPath(r.output, 'Dockerfile').fileCategory).toBe('infra'); + expect(byPath(r.output, 'docker-compose.yml').fileCategory).toBe('infra'); + expect(byPath(r.output, '.gitlab-ci.yml').fileCategory).toBe('infra'); + expect(byPath(r.output, 'infra/main.tf').fileCategory).toBe('infra'); + expect(byPath(r.output, '.github/workflows/ci.yml').fileCategory).toBe('infra'); + expect(byPath(r.output, 'Makefile').fileCategory).toBe('infra'); + 
expect(byPath(r.output, 'Jenkinsfile').fileCategory).toBe('infra'); + expect(byPath(r.output, 'k8s/deploy.yaml').fileCategory).toBe('infra'); + expect(byPath(r.output, 'kubernetes/svc.yaml').fileCategory).toBe('infra'); + expect(byPath(r.output, 'foo.k8s.yaml').fileCategory).toBe('infra'); + }); + + it('assigns data to SQL, GraphQL, Proto, Prisma, CSV', () => { + projectRoot = setupTree({ + 'db/schema.sql': 'CREATE TABLE x (id INT);\n', + 'api/schema.graphql': 'type X { id: ID! }\n', + 'api/types.proto': 'syntax = "proto3";\n', + 'prisma/schema.prisma': 'model X { id Int @id }\n', + 'data/seed.csv': 'a,b\n1,2\n', + }); + const r = runScript(projectRoot); + expect(r.status).toBe(0); + expect(byPath(r.output, 'db/schema.sql').fileCategory).toBe('data'); + expect(byPath(r.output, 'api/schema.graphql').fileCategory).toBe('data'); + expect(byPath(r.output, 'api/types.proto').fileCategory).toBe('data'); + expect(byPath(r.output, 'prisma/schema.prisma').fileCategory).toBe('data'); + expect(byPath(r.output, 'data/seed.csv').fileCategory).toBe('data'); + }); + + it('assigns script to shell + batch files (.sh, .bash, .ps1, .bat)', () => { + projectRoot = setupTree({ + 'scripts/build.sh': '#!/bin/bash\necho 1\n', + 'scripts/run.bash': '#!/bin/bash\necho run\n', + 'scripts/build.ps1': 'Write-Output 1\n', + 'scripts/setup.bat': '@echo off\n', + }); + const r = runScript(projectRoot); + expect(r.status).toBe(0); + expect(byPath(r.output, 'scripts/build.sh').fileCategory).toBe('script'); + expect(byPath(r.output, 'scripts/run.bash').fileCategory).toBe('script'); + expect(byPath(r.output, 'scripts/build.ps1').fileCategory).toBe('script'); + expect(byPath(r.output, 'scripts/setup.bat').fileCategory).toBe('script'); + }); + + it('assigns markup to HTML + CSS variants', () => { + projectRoot = setupTree({ + 'public/index.html': '\n', + 'public/page.htm': '\n', + 'styles/app.css': 'body { }\n', + 'styles/app.scss': '$x: 1;\n', + 'styles/app.less': '@x: 1;\n', + }); + const r = 
runScript(projectRoot); + expect(r.status).toBe(0); + expect(byPath(r.output, 'public/index.html').fileCategory).toBe('markup'); + expect(byPath(r.output, 'public/page.htm').fileCategory).toBe('markup'); + expect(byPath(r.output, 'styles/app.css').fileCategory).toBe('markup'); + expect(byPath(r.output, 'styles/app.scss').fileCategory).toBe('markup'); + expect(byPath(r.output, 'styles/app.less').fileCategory).toBe('markup'); + }); + + it('priority: docker-compose.yml maps to infra, not config', () => { + // The .yml extension would normally route to `config`, but the + // docker-compose.* filename rule fires first per Step 4 priority. + projectRoot = setupTree({ + 'docker-compose.yml': 'services: {}\n', + }); + const r = runScript(projectRoot); + expect(r.status).toBe(0); + expect(byPath(r.output, 'docker-compose.yml').fileCategory).toBe('infra'); + expect(byPath(r.output, 'docker-compose.yml').language).toBe('yaml'); + }); + + // Regression: path.extname returns '' for `.env` and the second segment + // for `.env.local` — neither hits CATEGORY_BY_EXT['.env']. Dotfile-style + // configs were falling through to `code` / `unknown`. Caught by Codex + // review on PR #204. + it('dotfile configs (.env, .env.local, .env.production) map to config + env language', () => { + projectRoot = setupTree({ + '.env': 'API_KEY=abc\n', + '.env.local': 'LOCAL=1\n', + '.env.production': 'PROD=1\n', + }); + const r = runScript(projectRoot); + expect(r.status).toBe(0); + for (const p of ['.env', '.env.local', '.env.production']) { + expect(byPath(r.output, p).fileCategory).toBe('config'); + // LANGUAGE_BY_EXT['.env'] -> 'config' (the language id itself; not + // a typo — the language for env files is the 'config' bucket). 
+ expect(byPath(r.output, p).language).toBe('config'); + } + }); +}); + +describe('scan-project.mjs — .understandignore handling', () => { + let projectRoot; + + afterEach(() => { + if (projectRoot) { + rmSync(projectRoot, { recursive: true, force: true }); + projectRoot = null; + } + }); + + it('respects .understandignore patterns and increments filteredByIgnore', () => { + // `**/*.log` is NOT in the hardcoded defaults at the recursive level + // — wait, `*.log` is. Use a custom pattern to exercise user-driven drops. + projectRoot = setupTree({ + '.understandignore': 'fixtures/\n', + 'src/index.ts': 'export const x = 1;\n', + 'fixtures/snap1.json': '{ "a": 1 }\n', + 'fixtures/snap2.json': '{ "b": 2 }\n', + }); + const r = runScript(projectRoot); + expect(r.status).toBe(0); + // fixtures/ files dropped + expect(byPath(r.output, 'fixtures/snap1.json')).toBeUndefined(); + expect(byPath(r.output, 'fixtures/snap2.json')).toBeUndefined(); + // Counted as user-driven + expect(r.output.filteredByIgnore).toBe(2); + }); + + it('supports `!pattern` negation to re-include defaults-excluded files', () => { + // `*.log` is in the hardcoded defaults; the user re-includes a + // specific file with `!keep.log`. After the override, keep.log MUST + // appear in the output. It is NOT counted in filteredByIgnore (it + // was re-included, not additionally filtered). + projectRoot = setupTree({ + '.understandignore': '!keep.log\n', + 'src/index.ts': 'export const x = 1;\n', + 'keep.log': 'important diagnostic\n', + 'drop.log': 'noise\n', + }); + const r = runScript(projectRoot); + expect(r.status).toBe(0); + expect(byPath(r.output, 'keep.log')).toBeDefined(); + // drop.log still excluded by defaults (no negation for it) + expect(byPath(r.output, 'drop.log')).toBeUndefined(); + // The defaults dropped drop.log — that's a baseline default drop, + // NOT a user-driven drop. filteredByIgnore should be 0. 
+ expect(r.output.filteredByIgnore).toBe(0); + }); +}); + +describe('scan-project.mjs — special-file recognition', () => { + let projectRoot; + + afterEach(() => { + if (projectRoot) { + rmSync(projectRoot, { recursive: true, force: true }); + projectRoot = null; + } + }); + + it('Dockerfile (no extension) is language=dockerfile, category=infra', () => { + projectRoot = setupTree({ + Dockerfile: 'FROM alpine:3\nCMD ["sh"]\n', + }); + const r = runScript(projectRoot); + expect(r.status).toBe(0); + const entry = byPath(r.output, 'Dockerfile'); + expect(entry).toBeDefined(); + expect(entry.language).toBe('dockerfile'); + expect(entry.fileCategory).toBe('infra'); + }); +}); + +describe('scan-project.mjs — determinism', () => { + let projectRoot; + + afterEach(() => { + if (projectRoot) { + rmSync(projectRoot, { recursive: true, force: true }); + projectRoot = null; + } + }); + + it('produces byte-identical output across runs for the same input tree', () => { + projectRoot = setupTree({ + 'README.md': '# project\n', + 'src/a.ts': 'export const a = 1;\n', + 'src/b.ts': 'export const b = 2;\n', + 'src/lib/c.ts': 'export const c = 3;\n', + 'package.json': '{}\n', + 'tsconfig.json': '{}\n', + }); + const r1 = runScript(projectRoot); + const r2 = runScript(projectRoot); + expect(r1.status).toBe(0); + expect(r2.status).toBe(0); + expect(JSON.stringify(r1.output)).toBe(JSON.stringify(r2.output)); + }); +}); + +describe('scan-project.mjs — empty repo', () => { + let projectRoot; + + afterEach(() => { + if (projectRoot) { + rmSync(projectRoot, { recursive: true, force: true }); + projectRoot = null; + } + }); + + it('handles a project with zero files without crashing', () => { + projectRoot = setupTree({}, { gitInit: true }); + const r = runScript(projectRoot); + expect(r.status).toBe(0); + expect(r.output.scriptCompleted).toBe(true); + expect(r.output.totalFiles).toBe(0); + expect(r.output.files).toEqual([]); + expect(r.output.filteredByIgnore).toBe(0); + 
expect(r.output.estimatedComplexity).toBe('small'); + }); +}); + +describe('scan-project.mjs — per-file failure resilience', () => { + let projectRoot; + + afterEach(() => { + if (projectRoot) { + // Restore permissions on any chmod'd file before delete, so cleanup + // succeeds even when a test left a 000-permission file behind. + try { + const f = join(projectRoot, 'src/unreadable.ts'); + if (existsSync(f)) chmodSync(f, 0o644); + } catch { /* best-effort */ } + rmSync(projectRoot, { recursive: true, force: true }); + projectRoot = null; + } + }); + + it('emits a Warning: and skips a file with unreadable permissions; other files survive', () => { + if (process.platform === 'win32') { + // chmod permission bits don't apply on Windows the same way; skip. + return; + } + if (process.getuid && process.getuid() === 0) { + // Running as root bypasses permission checks; the test cannot exercise + // its failure mode. Skip rather than emit a false pass. + return; + } + projectRoot = setupTree({ + 'src/good.ts': 'export const good = 1;\n', + 'src/unreadable.ts': 'export const bad = 2;\n', + }); + // Strip read permission on the synthetic file. + chmodSync(join(projectRoot, 'src/unreadable.ts'), 0o000); + + const r = runScript(projectRoot); + expect(r.status).toBe(0); + expect(r.output.scriptCompleted).toBe(true); + // The good file is in the output. + expect(byPath(r.output, 'src/good.ts')).toBeDefined(); + // The unreadable file is dropped. + expect(byPath(r.output, 'src/unreadable.ts')).toBeUndefined(); + // A visible warning was emitted with the documented prefix. + expect(r.stderr).toMatch( + /Warning: scan-project: src\/unreadable\.ts — line count failed/, + ); + expect(r.stderr).toMatch(/file skipped from output/); + // Final summary line still fires. 
+ expect(r.stderr).toMatch( + /scan-project: filesScanned=1 filteredByIgnore=0 complexity=small/, + ); + }); +}); + +describe('scan-project.mjs — estimatedComplexity thresholds', () => { + let projectRoot; + + afterEach(() => { + if (projectRoot) { + rmSync(projectRoot, { recursive: true, force: true }); + projectRoot = null; + } + }); + + /** + * Build a tree with exactly N .ts files at the top level. Used to + * lock in the complexity-tier boundary points from project-scanner.md + * Step 7: small (≤30), moderate (31-150), large (151-500), very-large + * (>500). + */ + function setupNFiles(n) { + const tree = {}; + for (let i = 0; i < n; i++) { + // Pad indices so localeCompare gives the natural order for any N. + tree[`f${String(i).padStart(4, '0')}.ts`] = 'export const x = 1;\n'; + } + return setupTree(tree); + } + + it('30 files -> small (upper boundary of small)', () => { + projectRoot = setupNFiles(30); + const r = runScript(projectRoot); + expect(r.status).toBe(0); + expect(r.output.totalFiles).toBe(30); + expect(r.output.estimatedComplexity).toBe('small'); + }); + + it('31 files -> moderate (lower boundary of moderate)', () => { + projectRoot = setupNFiles(31); + const r = runScript(projectRoot); + expect(r.status).toBe(0); + expect(r.output.totalFiles).toBe(31); + expect(r.output.estimatedComplexity).toBe('moderate'); + }); + + it('150 files -> moderate (upper boundary of moderate)', () => { + projectRoot = setupNFiles(150); + const r = runScript(projectRoot); + expect(r.status).toBe(0); + expect(r.output.totalFiles).toBe(150); + expect(r.output.estimatedComplexity).toBe('moderate'); + }); + + it('151 files -> large (lower boundary of large)', () => { + projectRoot = setupNFiles(151); + const r = runScript(projectRoot); + expect(r.status).toBe(0); + expect(r.output.totalFiles).toBe(151); + expect(r.output.estimatedComplexity).toBe('large'); + }); + + it('501 files -> very-large (lower boundary of very-large)', () => { + projectRoot = setupNFiles(501); + 
const r = runScript(projectRoot); + expect(r.status).toBe(0); + expect(r.output.totalFiles).toBe(501); + expect(r.output.estimatedComplexity).toBe('very-large'); + }); +}); + +describe('scan-project.mjs — CLI entry guard + invocation', () => { + let projectRoot; + + afterEach(() => { + if (projectRoot) { + rmSync(projectRoot, { recursive: true, force: true }); + projectRoot = null; + } + }); + + it('invokes successfully via subprocess and produces a parseable output file', () => { + projectRoot = setupTree({ + 'README.md': '# proj\n', + 'src/index.ts': 'export const x = 1;\n', + }); + const r = runScript(projectRoot); + expect(r.status).toBe(0); + expect(r.output).not.toBeNull(); + expect(r.output.scriptCompleted).toBe(true); + // Stats summary line fires on stderr. + expect(r.stderr).toMatch( + /scan-project: filesScanned=2 filteredByIgnore=0 complexity=small/, + ); + // Two files captured. + expect(r.output.totalFiles).toBe(2); + }); + + it('fails fast with usage message when projectRoot is missing', () => { + const result = spawnSync('node', [SCRIPT], { encoding: 'utf-8' }); + expect(result.status).toBe(1); + expect(result.stderr).toMatch(/Usage: node scan-project\.mjs/); + }); +}); + +describe('scan-project.mjs — output schema invariants', () => { + let projectRoot; + + afterEach(() => { + if (projectRoot) { + rmSync(projectRoot, { recursive: true, force: true }); + projectRoot = null; + } + }); + + it('emits the documented top-level fields with correct shapes', () => { + projectRoot = setupTree({ + 'src/a.ts': 'export const a = 1;\n', + 'README.md': '# x\n', + 'package.json': '{}\n', + }); + const r = runScript(projectRoot); + expect(r.status).toBe(0); + const out = r.output; + expect(out.scriptCompleted).toBe(true); + expect(Array.isArray(out.files)).toBe(true); + expect(typeof out.totalFiles).toBe('number'); + expect(out.totalFiles).toBe(out.files.length); + expect(typeof out.filteredByIgnore).toBe('number'); + expect(['small', 'moderate', 'large', 
'very-large']).toContain( + out.estimatedComplexity, + ); + expect(out.stats).toBeDefined(); + expect(out.stats.filesScanned).toBe(out.files.length); + expect(typeof out.stats.byCategory).toBe('object'); + expect(typeof out.stats.byLanguage).toBe('object'); + // Per-file shape + for (const f of out.files) { + expect(typeof f.path).toBe('string'); + expect(typeof f.language).toBe('string'); + expect(typeof f.sizeLines).toBe('number'); + expect([ + 'code', 'config', 'docs', 'infra', 'data', 'script', 'markup', + ]).toContain(f.fileCategory); + } + }); + + it('files[] is sorted by path.localeCompare', () => { + projectRoot = setupTree({ + 'zzz.ts': '\n', + 'aaa.ts': '\n', + 'mmm.ts': '\n', + 'subdir/file.ts': '\n', + }); + const r = runScript(projectRoot); + expect(r.status).toBe(0); + const paths = r.output.files.map(f => f.path); + const sortedPaths = [...paths].sort((a, b) => a.localeCompare(b)); + expect(paths).toEqual(sortedPaths); + }); +}); diff --git a/understand-anything-plugin/.claude-plugin/plugin.json b/understand-anything-plugin/.claude-plugin/plugin.json index 3f1b6b2..5c75bd7 100644 --- a/understand-anything-plugin/.claude-plugin/plugin.json +++ b/understand-anything-plugin/.claude-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "understand-anything", "description": "AI-powered codebase understanding — analyze, visualize, and explain any project", - "version": "2.7.4", + "version": "2.7.5", "author": { "name": "Lum1104" }, diff --git a/understand-anything-plugin/agents/file-analyzer.md b/understand-anything-plugin/agents/file-analyzer.md index 6b2a183..a7cdf12 100644 --- a/understand-anything-plugin/agents/file-analyzer.md +++ b/understand-anything-plugin/agents/file-analyzer.md @@ -52,6 +52,18 @@ cat > $PROJECT_ROOT/.understand-anything/tmp/ua-file-analyzer-input- ENDJSON ``` +### Cross-batch context (neighborMap) + +Your dispatch prompt includes a `neighborMap` — for each file in your batch, it lists project-internal neighbors in OTHER batches (files 
that import yours or that you import), with their exported symbols. + +Use neighborMap as a confidence boost for cross-batch edges (`calls`, `related`, `inherits`, `implements` to nodes outside your batch): + +- If your source clearly references a symbol that appears in some `neighbor.symbols`, emit the edge to `function::` or `class::` with confidence. +- If your source references a cross-batch symbol that is NOT in neighborMap (the project-scanner may not have extracted it), you may still emit the edge if you saw it explicitly in the imported file's surface — but prefer matching neighborMap symbols when available. +- Imports continue to use `batchImportData` (fully resolved), not neighborMap. + +The merge script's dangling-edge dropper is the safety net for genuinely unresolvable targets. + ### Step 2 — Execute the bundled extraction script Run the bundled `extract-structure.mjs` script. The `` path is provided in your dispatch prompt. @@ -464,12 +476,46 @@ Use these hints for common edge patterns: - NEVER create self-referencing edges (where source equals target). - Trust the script's structural extraction. Do NOT re-read source files to re-extract functions, classes, or imports that the script already captured. Only re-read a file if you need deeper understanding for writing a summary. -## Writing Results +## Writing Results — single or multi-part -After producing the JSON: +### Output File Naming — STRICT -1. Write the JSON to: `/.understand-anything/intermediate/batch-.json` -2. The project root and batch index will be provided in your prompt. -3. Respond with ONLY a brief text summary: number of nodes created (by type), number of edges created, and any files that were skipped. +**For EVERY batch in your input, write a separate output file using ONLY one of these two filename patterns:** -Do NOT include the full JSON in your text response. 
+- `batch-{batchIndex}.json` — single-part output for batch `{batchIndex}` +- `batch-{batchIndex}-part-{k}.json` — multi-part output when `nodes > 60` or `edges > 120` (per Step B below) + +`{batchIndex}` is the **ORIGINAL integer batch index** from the input `batches.json`. Even if your dispatch prompt fused multiple batches into one call (e.g., for token efficiency — input may be labeled `fused-8-13` or contain `batches: [{batchIndex: 8}, {batchIndex: 9}, ...]`), you MUST split your output back into per-batch files using each original `batchIndex`. + +**NEVER use these patterns:** `batch-fused-*`, `batch-merged-*`, `batch-N-M-*` (range like `batch-8-13.json`), `batches-*`, or any other variant. The downstream merge script (`merge-batch-graphs.py`) requires the regex `batch-(\d+)(?:-part-(\d+))?\.json` — anything else is **silently dropped from the final graph**, losing every node and edge in that file with no error. + +**Example.** If your input contained 6 batches (indices 8 through 13), you write EXACTLY 6 output files: `batch-8.json`, `batch-9.json`, `batch-10.json`, `batch-11.json`, `batch-12.json`, `batch-13.json`. Not one combined `batch-fused-8-13.json`. Not one `batch-8-13.json`. Six files, one per original `batchIndex`. Run Steps A–F below independently for each batch's nodes/edges. + +**Step A — Compute totals.** +``` +nodeCount = nodes.length +edgeCount = edges.length +``` + +**Step B — Decide split.** +- If `nodeCount ≤ 60` AND `edgeCount ≤ 120`: write ONE file to `.understand-anything/intermediate/batch-{batchIndex}.json`. Done. Skip to Step F. +- Otherwise: `parts = ceil(max(nodeCount / 60, edgeCount / 120))`. + +**Step C — Partition.** +Sort files in your batch alphabetically by path. Chunk them sequentially into `parts` groups of size `ceil(N / parts)`. For each part: +- All nodes whose `filePath` is in this part's files (for non-file nodes like `module`/`concept`, use the file they belong to). 
+- All edges whose `source` is in this part's nodes (target may be anywhere — same part, different part of same batch, different batch). + +**Step D — Write each part.** +Write part `k` (1-indexed) to `.understand-anything/intermediate/batch-<batchIndex>-part-<k>.json`. Each part is a valid GraphFragment: `{ "nodes": [...], "edges": [...] }`. + +**Step E — Self-validate.** +For each file written, verify: +- Valid JSON. +- `nodes` array exists and is well-formed. +- For every edge: `source` and `target` both appear as either (a) a node `id` in this part's nodes, OR (b) a `file:<path>` reference where `<path>` is in `neighborMap` or `batchImportData`, OR (c) a `function:<path>:<name>` / `class:<path>:<name>` reference where `<name>` is in some `neighbor.symbols`. + +If validation fails on a part, do NOT silently rebuild. Respond with an explicit error stating which part failed, which edge(s) failed validation, and why. The dispatching session can then retry. + +**Step F — Respond.** +Respond with ONLY a brief text summary: parts written (1 or more), total nodes/edges across all parts, any files skipped. Do NOT include JSON content in the response. diff --git a/understand-anything-plugin/agents/project-scanner.md b/understand-anything-plugin/agents/project-scanner.md index 2cedacc..17c0121 100644 --- a/understand-anything-plugin/agents/project-scanner.md +++ b/understand-anything-plugin/agents/project-scanner.md @@ -12,246 +12,59 @@ You are a meticulous project inventory specialist. Your job is to scan a codebas ## Task -Scan the project directory provided in the prompt and produce a JSON inventory. You will accomplish this in two phases: first, write and execute a discovery script that performs all deterministic file scanning; second, review the script's results and add a human-readable project description. +Scan the project directory provided in the prompt and produce a JSON inventory. 
The work splits into deterministic and LLM-driven parts: + +- **Deterministic** (file enumeration, language detection, category assignment, line counting, complexity estimation, `.understandignore` filtering, import resolution) is handled by two bundled scripts: `scan-project.mjs` and `extract-import-map.mjs`. Do NOT re-implement any of this logic. +- **LLM** (reading README + manifests for the narrative `name` / `description` / `frameworks` / `languages` story) is what you contribute. **Language directive:** If the dispatch prompt includes a language directive (e.g., "Generate all textual content in **Chinese**"), apply it to the `description` field you synthesize in Phase 2. Write the description in the specified language using natural, native-level phrasing. Keep technical terms in English when no standard translation exists (e.g., "middleware", "hook", "barrel"). --- -## Phase 1 -- Discovery Script +## Phase 1 -- Discovery (bundled scan + LLM narrative) -Write a script that discovers all project files (including non-code files like configs, docs, and infrastructure), detects languages and frameworks, counts lines, and produces structured JSON. Prefer Node.js for the script; fall back to Python if Node.js is unavailable. Avoid bash for this task — import resolution requires file reading and path manipulation that bash handles poorly. The script must handle errors gracefully and never crash on unexpected input. +Phase 1 has three orchestrated steps. Steps **B** and **C** run bundled scripts; step **A** is the only LLM work in this phase. -### Script Requirements +### Step A (LLM) -- Read manifests and README for narrative fields -1. **Accept** the project root directory as `$1` (bash) or `process.argv[2]` (Node.js) or `sys.argv[1]` (Python). -2. **Write** results JSON to the path given as `$2` / `process.argv[3]` / `sys.argv[2]`. -3. **Exit 0** on success. -4. **Exit 1** on fatal error (cannot access directory, etc.). Print the error to stderr. 
+Read the top-level project files to gather narrative metadata. Do NOT walk the file tree or count files yourself — that is Step B's job. -### What the Script Must Do +Read whichever of these exist at the project root: +- `README.md` (or `README.rst`, `README`) — capture the first ~10 lines for narrative grounding +- `package.json` — extract `name`, `description`, plus `dependencies` / `devDependencies` keys for framework detection +- `pyproject.toml`, `setup.py`, `setup.cfg`, `Pipfile`, `requirements.txt` — Python framework signals +- `Cargo.toml` — Rust project name + `[dependencies]` +- `go.mod` — Go module name + `require` block +- `Gemfile` — Ruby framework signals +- `pom.xml`, `build.gradle`, `build.gradle.kts` — JVM project signals +- `composer.json` — PHP project signals -**Step 1 -- File Discovery** +From these, synthesize: -Discover all tracked files. In order of preference: -- Run `git ls-files` in the project root (most reliable for git repos) -- Fall back to a recursive file listing with exclusions if not a git repo +- **`name`** -- in priority order: `package.json` `name`, `Cargo.toml` `[package].name`, `go.mod` module path's last segment, `pyproject.toml` `[project].name` or `[tool.poetry].name`, else the directory name of the project root. +- **`rawDescription`** -- the `description` field from `package.json` (or its equivalent in the matching manifest), or `""` if none. +- **`readmeHead`** -- the first ~10 lines of `README.md` (or equivalent), or `""` if no README exists. 
+- **`frameworks`** -- match dependency names against known frameworks: `react`, `vue`, `svelte`, `@angular/core`, `express`, `fastify`, `koa`, `next`, `nuxt`, `vite`, `vitest`, `jest`, `mocha`, `tailwindcss`, `prisma`, `typeorm`, `sequelize`, `mongoose`, `redux`, `zustand`, `mobx`; Python: `django`, `djangorestframework`, `fastapi`, `flask`, `sqlalchemy`, `alembic`, `celery`, `pydantic`, `uvicorn`, `gunicorn`, `aiohttp`, `tornado`, `starlette`, `pytest`, `hypothesis`, `channels`; Ruby: `rails`, `railties`, `sinatra`, `grape`, `rspec`, `sidekiq`, `activerecord`, `actionpack`, `devise`, `pundit`; Go: `github.com/gin-gonic/gin`, `github.com/labstack/echo`, `github.com/gofiber/fiber`, `github.com/go-chi/chi`, `gorm.io/gorm`; Rust: `actix-web`, `axum`, `rocket`, `diesel`, `tokio`, `serde`, `warp`; JVM: `spring-boot`, `spring-web`, `spring-data`, `quarkus`, `micronaut`, `hibernate`, `jakarta`, `junit`, `ktor`. Also infer infrastructure tools from manifest presence: add `Docker` if `Dockerfile` exists in the file list, `Docker Compose` if `docker-compose.yml`/`docker-compose.yaml` exists, `Terraform` if any `*.tf`, `GitHub Actions` if `.github/workflows/*.yml`, `GitLab CI` if `.gitlab-ci.yml`, `Jenkins` if `Jenkinsfile`. +- **`languages`** -- the deduplicated, alphabetically-sorted top-level language set you observe across the manifests + the bundled script's per-file language tally (you will read this from Step B's output). -**Step 2 -- Exclusion Filtering** +If the manifest is missing or malformed, leave the corresponding field empty rather than guessing. 
-Remove ALL files matching these patterns: -- **Dependency directories:** paths containing `node_modules/`, `.git/`, `vendor/`, `venv/`, `.venv/`, `__pycache__/` -- **Build output:** paths with a directory segment matching `dist/`, `build/`, `out/`, `coverage/`, `.next/`, `.cache/`, `.turbo/`, `target/` (Rust), `obj/` (.NET) — match full directory segments only, not substrings (e.g., `buildSrc/` should NOT be excluded). Note: `bin/` is NOT excluded by default because Node.js and Ruby projects use `bin/` for CLI launchers; .NET users can add `bin/` to `.understandignore`. -- **Lock files:** `*.lock`, `package-lock.json`, `yarn.lock`, `pnpm-lock.yaml` -- **Binary/asset files:** `.png`, `.jpg`, `.jpeg`, `.gif`, `.svg`, `.ico`, `.woff`, `.woff2`, `.ttf`, `.eot`, `.mp3`, `.mp4`, `.pdf`, `.zip`, `.tar`, `.gz` -- **Generated files:** `*.min.js`, `*.min.css`, `*.map`, `*.generated.*` (note: do NOT exclude `*.d.ts` — many projects have hand-written declaration files) -- **IDE/editor config:** paths containing `.idea/`, `.vscode/` -- **Misc non-source:** `LICENSE`, `.gitignore`, `.editorconfig`, `.prettierrc`, `.eslintrc*`, `*.log` +### Step B (bundled `scan-project.mjs`) -- File enumeration + language + category + lines -**IMPORTANT:** Do NOT exclude non-code project files. 
The following MUST be kept: -- Documentation: `*.md`, `*.rst`, `*.txt` (except `LICENSE`) -- Configuration: `*.yaml`, `*.yml`, `*.json`, `*.toml`, `*.xml`, `*.cfg`, `*.ini`, `*.env`, `*.env.example` (include `.env` in the file list but downstream agents should NEVER include `.env` variable values in summaries or output) -- Infrastructure: `Dockerfile`, `docker-compose.*`, `*.tf`, `Makefile`, `Jenkinsfile`, `Procfile`, `Vagrantfile` -- CI/CD: `.github/workflows/*`, `.gitlab-ci.yml`, `.circleci/*`, `Jenkinsfile` -- Data/Schema: `*.sql`, `*.graphql`, `*.gql`, `*.proto`, `*.prisma`, `*.schema.json` -- Web markup: `*.html`, `*.css`, `*.scss`, `*.sass`, `*.less` -- Shell scripts: `*.sh`, `*.bash`, `*.ps1`, `*.bat` -- Kubernetes: `*.k8s.yaml`, `*.k8s.yml`, paths containing `k8s/`, paths containing `kubernetes/` +Invoke the bundled scan script. It walks the project (preferring `git ls-files`, falling back to a recursive walk for non-git directories), applies `.understandignore` filtering (defaults + user patterns), assigns `language` and `fileCategory` per the canonical tables, counts lines, and writes deterministic JSON. You do not see or maintain those tables — they live in the script. -**Note on package manifests:** Config files read for framework detection (`package.json`, `tsconfig.json`, `Cargo.toml`, `go.mod`, `pyproject.toml`, etc.) should also appear in the file list with `fileCategory: "config"`. - -**Step 2.5 -- User-Configured Filtering (.understandignore)** - -When `.understandignore` files exist, **replace** Step 2's hardcoded filtering with a unified filter that combines defaults and user patterns in a single pass. This ensures `!` negation patterns can override defaults. - -1. Check if `$PROJECT_ROOT/.understand-anything/.understandignore` exists. If so, read it. -2. Check if `$PROJECT_ROOT/.understandignore` exists. If so, read it. -3. If neither file exists, skip this step entirely — Step 2's hardcoded filtering is sufficient. -4. 
If at least one file exists, re-filter the **original file list from Step 1** (not the Step 2 output) using the `createIgnoreFilter` function from `@understand-anything/core`, which merges hardcoded defaults and user patterns into a single `.gitignore`-compatible matcher. This ensures `!` negation in user files can override hardcoded defaults (e.g., `!dist/` force-includes dist/ files). -5. Track the count of additional files removed beyond Step 2's baseline as `filteredByIgnore`. - -This filtering must be deterministic (not LLM-based). Use a Node.js script with the `ignore` npm package from `@understand-anything/core`. - -**Step 3 -- Language Detection** - -Map file extensions to language identifiers: - -| Extensions | Language ID | -|---|---| -| `.ts`, `.tsx` | `typescript` | -| `.js`, `.jsx` | `javascript` | -| `.py` | `python` | -| `.go` | `go` | -| `.rs` | `rust` | -| `.java` | `java` | -| `.rb` | `ruby` | -| `.cpp`, `.cc`, `.cxx`, `.h`, `.hpp` | `cpp` | -| `.c` | `c` | -| `.cs` | `csharp` | -| `.swift` | `swift` | -| `.kt` | `kotlin` | -| `.php` | `php` | -| `.vue` | `vue` | -| `.svelte` | `svelte` | -| `.sh`, `.bash` | `shell` | -| `.ps1` | `powershell` | -| `.bat`, `.cmd` | `batch` | -| `.md`, `.rst` | `markdown` | -| `.yaml`, `.yml` | `yaml` | -| `.json` | `json` | -| `.jsonc` | `jsonc` | -| `.toml` | `toml` | -| `.sql` | `sql` | -| `.graphql`, `.gql` | `graphql` | -| `.proto` | `protobuf` | -| `.tf`, `.tfvars` | `terraform` | -| `.html`, `.htm` | `html` | -| `.css`, `.scss`, `.sass`, `.less` | `css` | -| `.xml` | `xml` | -| `.cfg`, `.ini`, `.env` | `config` | -| `Dockerfile` (no extension) | `dockerfile` | -| `Makefile` (no extension) | `makefile` | -| `Jenkinsfile` (no extension) | `jenkinsfile` | - -**Fallback:** If a file's extension is not in the table above, set `language` to the lowercased extension (without the leading dot), or `"unknown"` if there is no extension. Never emit `null` — downstream consumers rely on this field being a string. 
- -Collect unique languages, sorted alphabetically. - -**Step 4 -- File Category Detection** - -Assign a `fileCategory` to each discovered file based on its extension and path: - -| Pattern | Category | -|---|---| -| `.md`, `.rst`, `.txt` (except `LICENSE`) | `docs` | -| `.yaml`, `.yml`, `.json`, `.jsonc`, `.toml`, `.xml`, `.cfg`, `.ini`, `.env`, `tsconfig.json`, `package.json`, `pyproject.toml`, `Cargo.toml`, `go.mod` | `config` | -| `Dockerfile`, `docker-compose.*`, `.tf`, `.tfvars`, `Makefile`, `Jenkinsfile`, `Procfile`, `Vagrantfile`, `.github/workflows/*`, `.gitlab-ci.yml`, `.circleci/*`, `*.k8s.yaml`, `*.k8s.yml`, paths in `k8s/` or `kubernetes/` | `infra` | -| `.sql`, `.graphql`, `.gql`, `.proto`, `.prisma`, `*.schema.json`, `.csv` | `data` | -| `.sh`, `.bash`, `.ps1`, `.bat` | `script` | -| `.html`, `.htm`, `.css`, `.scss`, `.sass`, `.less` | `markup` | -| All other extensions (`.ts`, `.tsx`, `.js`, `.py`, `.go`, `.rs`, etc.) | `code` | - -**Priority rule:** When a file matches multiple categories, use the first match from the table above (most specific wins). For example, `docker-compose.yml` is `infra`, not `config`. - -**Step 5 -- Line Counting** - -For each file, count lines using `wc -l`. For efficiency: -- If fewer than 500 files, count all of them -- If 500+ files, count all of them but batch the `wc -l` calls (pass multiple files per invocation to avoid spawning thousands of processes) - -**Step 6 -- Framework Detection** - -Read config files (if they exist) and extract framework information: -- `package.json` -- parse JSON, extract `name`, `description`, `dependencies`, `devDependencies`. 
Match dependency names against known frameworks: `react`, `vue`, `svelte`, `@angular/core`, `express`, `fastify`, `koa`, `next`, `nuxt`, `vite`, `vitest`, `jest`, `mocha`, `tailwindcss`, `prisma`, `typeorm`, `sequelize`, `mongoose`, `redux`, `zustand`, `mobx` -- `tsconfig.json` -- if present, confirms TypeScript usage -- `Cargo.toml` -- if present, confirms Rust project; extract `[package].name` -- `go.mod` -- if present, confirms Go project; extract module name -- `requirements.txt` -- if present, confirms Python project; read line by line and match package names (strip version specifiers) against known Python frameworks: `django`, `djangorestframework`, `fastapi`, `flask`, `sqlalchemy`, `alembic`, `celery`, `pydantic`, `uvicorn`, `gunicorn`, `aiohttp`, `tornado`, `starlette`, `pytest`, `hypothesis`, `channels` -- `pyproject.toml` -- if present, confirms Python project; parse the `[project].dependencies` or `[tool.poetry.dependencies]` section and apply the same Python framework keyword matching as above. Also check for `[tool.pytest.ini_options]` (confirms pytest) and `[tool.django]` (confirms Django). 
-- `setup.py` / `setup.cfg` / `Pipfile` -- if present, confirms Python project; read and apply Python framework keyword matching -- `Gemfile` -- if present, confirms Ruby project; read and match gem names against known Ruby frameworks: `rails`, `railties`, `sinatra`, `grape`, `rspec`, `sidekiq`, `activerecord`, `actionpack`, `devise`, `pundit` -- `go.mod` dependencies -- if present, read the `require` block and match module paths against known Go frameworks: `github.com/gin-gonic/gin`, `github.com/labstack/echo`, `github.com/gofiber/fiber`, `github.com/go-chi/chi`, `gorm.io/gorm` -- `Cargo.toml` dependencies -- if present, read `[dependencies]` and match crate names against known Rust frameworks: `actix-web`, `axum`, `rocket`, `diesel`, `tokio`, `serde`, `warp` -- `pom.xml` / `build.gradle` / `build.gradle.kts` -- if present, confirms Java/Kotlin project; match dependency names against known JVM frameworks: `spring-boot`, `spring-web`, `spring-data`, `quarkus`, `micronaut`, `hibernate`, `jakarta`, `junit`, `ktor` - -Also detect infrastructure tooling from discovered files: -- Presence of `Dockerfile` -> add `Docker` to frameworks -- Presence of `docker-compose.yml` or `docker-compose.yaml` -> add `Docker Compose` to frameworks -- Presence of `*.tf` files -> add `Terraform` to frameworks -- Presence of `.github/workflows/*.yml` -> add `GitHub Actions` to frameworks -- Presence of `.gitlab-ci.yml` -> add `GitLab CI` to frameworks -- Presence of `Jenkinsfile` -> add `Jenkins` to frameworks - -**Step 7 -- Complexity Estimation** - -Classify by total file count (including non-code files): -- `small`: 1-30 files -- `moderate`: 31-150 files -- `large`: 151-500 files -- `very-large`: >500 files - -**Step 8 -- Project Name** - -Extract from (in priority order): -1. `package.json` `name` field -2. `Cargo.toml` `[package].name` -3. `go.mod` module path (last segment) -4. `pyproject.toml` -- check `[project].name` first, then `[tool.poetry].name` -5. 
Directory name of project root - -**Step 9 -- Import Resolution** - -For each **code-category** file in the discovered list (`fileCategory === "code"`), extract and resolve relative import statements. The goal is to produce a map from each file's path to the list of project-internal files it imports. External package imports are ignored. - -**Non-code files** (config, docs, infra, data, script, markup) should have an empty array `[]` in the import map — they do not participate in code-level import resolution. - -For each code file, read its content and extract import paths using language-appropriate patterns: - -| Language | Import patterns to match | -|---|---| -| TypeScript/JavaScript | Relative: `import ... from './...'` or `'../'`, `require('./...')` or `require('../...')`. **Plus path aliases** from `tsconfig.json` `compilerOptions.paths` and `baseUrl` (e.g. `@/foo` → `/foo`, `~/foo` → `/foo`). Read tsconfig.json (if present) and resolve every alias prefix against the discovered file list with the standard extension probes. | -| Python | Both relative AND absolute. Relative: `from .x import y`, `from ..x import y`, `from . import x`. Absolute: `import a.b.c`, `from a.b.c import x[, y, ...]` — try every dotted path against the discovered file list (see resolution algorithm below) and keep matches; non-matches are external packages and are dropped. | -| Go | Paths in `import (...)` blocks that start with the module path from `go.mod` | -| Rust | `use crate::`, `use super::`, `mod x` (within the same crate) | -| Java | `import com.example.foo.Bar;` — try `**/com/example/foo/Bar.java` against the discovered file list; keep matches | -| Kotlin | `import com.example.foo.Bar` — try `**/com/example/foo/Bar.kt` against the discovered file list; keep matches | -| Ruby | Relative: `require_relative '...'` paths. **Plus** `require 'foo/bar'` (load-path) — try `lib/foo/bar.rb`, `app/foo/bar.rb`, `foo/bar.rb` against the discovered file list. 
| -| PHP | `use Vendor\Pkg\Class;` — read `composer.json` `autoload.psr-4` map (e.g. `"App\\": "src/"`), translate the namespace prefix to its directory, then try `/Pkg/Class.php` against the discovered file list. Skip imports whose namespace prefix isn't in the autoload map. | -| C / C++ | `#include "foo.h"` (relative to the includer's directory) and `#include ` — for both, also probe `include/foo.h`, `src/foo.h`, and the bare path against the discovered file list. Match `.h`, `.hpp`, `.hxx`, `.cuh`. | - -For each extracted import path: -1. Compute the resolved file path relative to project root: - - For relative imports (`./x`, `../x`): resolve from the importing file's directory - - Try these extension variants in order if the import has no extension: `.ts`, `.tsx`, `.js`, `.jsx`, `/index.ts`, `/index.js`, `/index.tsx`, `/index.jsx`, `.py`, `.go`, `.rs`, `.rb` -2. Check if the resolved path exists in the discovered file list -3. If yes: add to this file's resolved imports list -4. If no: skip (external, unresolvable, or dynamic import) - -**Python absolute imports — resolution algorithm.** This is the dominant import style in real Python projects, so it MUST be handled: - -For `import a.b.c`, try (in order, take first match in the discovered file list): -- `a/b/c.py` -- `a/b/c/__init__.py` - -For `from a.b.c import x, y, z`, try (in order, take first match for the module path): -- `a/b/c.py` -- `a/b/c/__init__.py` - -If the module path matched as a package (`__init__.py`), additionally probe each imported name `x`/`y`/`z` against: -- `a/b/c/x.py` -- `a/b/c/x/__init__.py` - -so that `from package import submodule` resolves to the submodule file. Skip names that don't match (they're class/function imports from inside the package, already covered by the `__init__.py` match). - -If NO probe matches, the import is external — drop it. - -**Worked example.** Discovered files include `src/utils/formatter.py`, `src/utils/__init__.py`. 
The line `from src.utils import formatter` resolves to `src/utils/__init__.py` (module match) AND `src/utils/formatter.py` (submodule probe). Both are added to the importer's resolved list. - -Output format in the script result: -```json -"importMap": { - "src/index.ts": ["src/utils.ts", "src/config.ts"], - "src/utils.ts": [], - "README.md": [], - "Dockerfile": [], - "src/components/App.tsx": ["src/hooks/useAuth.ts", "src/store/index.ts"] -} +```bash +mkdir -p $PROJECT_ROOT/.understand-anything/tmp +node $PLUGIN_ROOT/skills/understand/scan-project.mjs \ + "$PROJECT_ROOT" \ + "$PROJECT_ROOT/.understand-anything/tmp/ua-scan-files.json" ``` -Keys are project-relative paths. Values are arrays of resolved project-relative paths. Every key in the file list must appear in `importMap` (use an empty array `[]` if no imports were resolved). External packages and unresolvable imports are omitted entirely. - -### Script Output Format - -The script must write this exact JSON structure to the output file: +Output JSON shape (you will read this verbatim and merge into the final scan-result): ```json { "scriptCompleted": true, - "name": "project-name", - "rawDescription": "Description from package.json or empty string", - "readmeHead": "First 10 lines of README.md or empty string", - "languages": ["javascript", "markdown", "typescript", "yaml"], - "frameworks": ["React", "Vite", "Vitest", "Docker"], "files": [ {"path": "src/index.ts", "language": "typescript", "sizeLines": 150, "fileCategory": "code"}, {"path": "README.md", "language": "markdown", "sizeLines": 45, "fileCategory": "docs"}, @@ -261,50 +74,106 @@ The script must write this exact JSON structure to the output file: "totalFiles": 42, "filteredByIgnore": 0, "estimatedComplexity": "moderate", - "importMap": { - "src/index.ts": ["src/utils.ts", "src/config.ts"], - "src/utils.ts": [], - "README.md": [], - "Dockerfile": [], - "package.json": [] + "stats": { + "filesScanned": 42, + "byCategory": {"code": 28, "config": 6, 
"docs": 4, "infra": 2, "script": 2}, + "byLanguage": {"typescript": 22, "javascript": 6, "json": 5, "markdown": 4, "yaml": 3, "shell": 2} } } ``` -- `scriptCompleted` (boolean) -- always `true` when the script finishes normally -- `name` (string) -- project name extracted from config or directory name -- `rawDescription` (string) -- raw description from `package.json` or empty string -- `readmeHead` (string) -- first 10 lines of `README.md` or empty string if no README exists -- `languages` (string[]) -- deduplicated, sorted alphabetically -- `frameworks` (string[]) -- only confirmed frameworks; empty array if none detected -- `files` (object[]) -- every discovered file, sorted by `path` alphabetically -- `files[].fileCategory` (string) -- one of: `code`, `config`, `docs`, `infra`, `data`, `script`, `markup` -- `totalFiles` (integer) -- must equal `files.length` -- `filteredByIgnore` (integer) -- count of files removed by `.understandignore` patterns in Step 2.5; 0 if no `.understandignore` file exists -- `estimatedComplexity` (string) -- one of `small`, `moderate`, `large`, `very-large` -- `importMap` (object) -- map from every file path to its list of resolved project-internal import paths; empty array for non-code files and files with no resolved imports; external packages excluded +The script: +- sorts `files` by `path.localeCompare` (deterministic) +- emits `fileCategory ∈ {code, config, docs, infra, data, script, markup}` per file (priority-ordered per the rules below) +- emits `language` as a non-null string for every file (canonical id for known extensions, lowercased extension for unknowns, `"unknown"` for no-extension files that don't match `Dockerfile` / `Makefile` / `Jenkinsfile`) +- counts `filteredByIgnore` as the delta beyond hardcoded defaults — `!`-negation in `.understandignore` correctly re-includes files +- emits `Warning: scan-project: — file skipped from output` on stderr for per-file failures (permission denied, malformed unicode, vanished 
file). Capture these and append to phase warnings. +- emits `scan-project: filesScanned=… filteredByIgnore=… complexity=…` as the final stderr summary line; informational only. -### Executing the Script +**Canonical category table** (for the record — the script is authoritative; do NOT re-derive these rules in your prompt): -After writing the script, execute it. `$PROJECT_ROOT` is the project root directory provided in your dispatch prompt: +| Pattern | Category | +|---|---| +| `LICENSE` | `code` (exception — not docs) | +| `Dockerfile`, `Dockerfile.*`, `docker-compose.*`, `compose.yml`/`compose.yaml`, `Makefile`, `Jenkinsfile`, `Procfile`, `Vagrantfile`, `.gitlab-ci.yml`, `.dockerignore`, `.github/workflows/*`, `.circleci/*`, paths in `k8s/` or `kubernetes/`, `*.k8s.yml`/`*.k8s.yaml` | `infra` | +| `.md`, `.mdx`, `.rst`, `.txt`, `.text` (except `LICENSE`) | `docs` | +| `.yaml`, `.yml`, `.json`, `.jsonc`, `.toml`, `.xml`, `.xsl`, `.xsd`, `.plist`, `.cfg`, `.ini`, `.env`, `.properties`, `.csproj`, `.sln`, `.mod`, `.sum`, `.gradle` | `config` | +| `.tf`, `.tfvars` | `infra` | +| `.sql`, `.graphql`, `.gql`, `.proto`, `.prisma`, `.csv`, `.tsv` | `data` | +| `.sh`, `.bash`, `.zsh`, `.ps1`, `.psm1`, `.psd1`, `.bat`, `.cmd` | `script` | +| `.html`, `.htm`, `.css`, `.scss`, `.sass`, `.less` | `markup` | +| Everything else | `code` | + +**Priority rule:** most-specific wins. Filename / path rules fire before extension rules — e.g., `docker-compose.yml` is `infra` (not `config`); `.github/workflows/ci.yml` is `infra` (not `config`); `LICENSE` is `code` (not `docs`). + +**`.understandignore` behavior:** the bundled script reads `.understandignore` and `.understand-anything/.understandignore` if present and merges them with the hardcoded defaults via `createIgnoreFilter`. `!`-negation overrides defaults (`!dist/` would re-include `dist/` files). The `filteredByIgnore` counter measures only user-driven drops, not baseline default drops. 
+ +If the script exits with a non-zero status, read stderr to diagnose. You have up to 2 retry attempts (re-invocations) before failing the phase. Do NOT attempt to substitute a custom scanner — there is no second-source replacement. + +### Step C -- Import Resolution (bundled `extract-import-map.mjs`) + +After Step B has produced the file list, invoke the bundled `extract-import-map.mjs` script for deterministic import extraction across all supported code languages. It uses tree-sitter for parsing and applies language-specific resolution rules in code (see `$PLUGIN_ROOT/skills/understand/extract-import-map.mjs`). + +**Do not** attempt to re-implement import patterns. Step B emits `path`/`language`/`fileCategory` for every file; this script consumes that list and produces the `importMap`. + +Write the input JSON for the bundled script (the `files[]` array is exactly Step B's `files[]` — pass it through verbatim): ```bash -node $PROJECT_ROOT/.understand-anything/tmp/ua-project-scan.js "$PROJECT_ROOT" "$PROJECT_ROOT/.understand-anything/tmp/ua-scan-results.json" +mkdir -p $PROJECT_ROOT/.understand-anything/tmp +cat > $PROJECT_ROOT/.understand-anything/tmp/ua-import-map-input.json << 'ENDJSON' +{ + "projectRoot": "<project-root>", + "files": [ + {"path": "src/index.ts", "language": "typescript", "fileCategory": "code"}, + {"path": "README.md", "language": "markdown", "fileCategory": "docs"} + ] +} +ENDJSON ``` -(Or the equivalent for Python, depending on which language you chose.) +Then run: -If the script exits with a non-zero code, read stderr, diagnose the issue, fix the script, and re-run. You have up to 2 retry attempts. 
+```bash +node $PLUGIN_ROOT/skills/understand/extract-import-map.mjs \ + $PROJECT_ROOT/.understand-anything/tmp/ua-import-map-input.json \ + $PROJECT_ROOT/.understand-anything/tmp/ua-import-map-output.json +``` + +The output JSON has shape: + +```json +{ + "scriptCompleted": true, + "stats": { "filesScanned": 314, "filesWithImports": 142, "totalEdges": 487 }, + "importMap": { + "src/index.ts": ["src/utils.ts", "src/config.ts"], + "src/utils.ts": [], + "README.md": [], + "Dockerfile": [] + } +} +``` + +Read the output JSON and merge the `importMap` field directly into your final scan-result.json (under the same key — `importMap`). The format matches the project-scanner contract: every input file has an entry; non-code files have empty arrays; resolved internal paths only (external packages are dropped). + +**Capture stderr** when you run the bundled script. Any line starting with `Warning:` should be appended to phase warnings — the SKILL.md orchestrator captures these for the final report. The script also writes a one-line summary `extract-import-map: filesScanned=… filesWithImports=… totalEdges=…` on completion; you can ignore that line or surface it as informational. + +**Languages supported.** The bundled script natively handles import resolution for: TypeScript, JavaScript (including CJS `require()`), Python (relative + absolute + `__init__.py`), Go (go.mod prefix stripping), Rust (`use crate::`, `use super::`, `use self::`, and `mod x;` declarations), Java, Kotlin, C#, Ruby (`require` + `require_relative`), PHP (composer.json PSR-4 autoload), C, and C++ (`#include` with relative + include/ + src/ probes). Languages outside this set get empty arrays — there is no LLM-based fallback. --- ## Phase 2 -- Description and Final Assembly -After the script completes, read `$PROJECT_ROOT/.understand-anything/tmp/ua-scan-results.json`. Do NOT re-run file discovery commands or re-count lines -- trust the script's results entirely. 
+After Steps A + B + C have all completed, read: +1. `$PROJECT_ROOT/.understand-anything/tmp/ua-scan-files.json` — output of `scan-project.mjs` (file list with language, sizeLines, fileCategory; plus `totalFiles`, `filteredByIgnore`, `estimatedComplexity`). +2. `$PROJECT_ROOT/.understand-anything/tmp/ua-import-map-output.json` — output of `extract-import-map.mjs` (the `importMap` field). +3. Your Step A in-memory notes (`name`, `rawDescription`, `readmeHead`, `frameworks`, `languages` narrative). -**IMPORTANT:** The final output must NOT contain the `scriptCompleted`, `rawDescription`, or `readmeHead` fields. These are intermediate script fields only. Strip them when assembling the final JSON. All other fields — including `importMap` — MUST be preserved exactly as output by the script. +Do NOT re-walk the file tree, re-count lines, or re-derive categories — trust `scan-project.mjs` entirely. Do NOT re-implement import resolution — trust `extract-import-map.mjs` entirely. -Your only task in this phase is to produce the final `description` field: +**IMPORTANT:** The final output must NOT contain the `scriptCompleted` or `stats` fields from either bundled script, nor your transient `rawDescription` / `readmeHead` work-strings. Strip them when assembling the final JSON. The final `importMap` MUST equal the `importMap` field from `extract-import-map.mjs` verbatim (do not edit, re-sort, or filter it). The final `files` array MUST equal Step B's `files` array verbatim (do not re-order, drop, or augment it). + +Your only synthesis task in this phase is the final `description` field: 1. If `rawDescription` is non-empty, use it as the basis. Clean it up if needed (remove marketing fluff, ensure it is 1-2 sentences). 2. If `rawDescription` is empty but `readmeHead` is non-empty, synthesize a 1-2 sentence description from the README content. 
@@ -334,25 +203,25 @@ Then assemble the final output JSON: ``` **Field requirements:** -- `name` (string): directly from script output +- `name` (string): from your Step A narrative work - `description` (string): your synthesized 1-2 sentence description -- `languages` (string[]): directly from script output -- `frameworks` (string[]): directly from script output -- `files` (object[]): directly from script output, including `fileCategory` per file -- `totalFiles` (integer): directly from script output -- `filteredByIgnore` (integer): directly from script output -- `estimatedComplexity` (string): directly from script output -- `importMap` (object): directly from script output +- `languages` (string[]): from your Step A narrative work (deduplicated, sorted alphabetically; cross-checked against Step B's `stats.byLanguage` keys) +- `frameworks` (string[]): from your Step A narrative work; only confirmed frameworks (empty array if none detected) +- `files` (object[]): directly from Step B's `files[]` (verbatim, including `fileCategory`) +- `totalFiles` (integer): directly from Step B +- `filteredByIgnore` (integer): directly from Step B +- `estimatedComplexity` (string): directly from Step B +- `importMap` (object): directly from Step C's `importMap` field ## Critical Constraints -- NEVER invent or guess file paths. Every `path` in the `files` array must come from the script's file discovery, which in turn comes from `git ls-files` or a real directory listing. +- NEVER invent or guess file paths. Every `path` in the `files` array must come from `scan-project.mjs`'s output (which itself comes from `git ls-files` or a real directory listing). - NEVER include files that do not exist on disk. - ALWAYS validate that `totalFiles` matches the actual length of the `files` array. -- ALWAYS sort `files` by `path` for deterministic output. -- Include ALL discovered project files in `files` -- code, configs, docs, infrastructure, and data files. 
Only exclude binaries, lock files, generated files, and dependency directories. -- Every file MUST have a `fileCategory` field with one of: `code`, `config`, `docs`, `infra`, `data`, `script`, `markup`. -- Trust the script's output for all structural data. Your only contribution is the `description` field. +- Trust Step B for file enumeration + language detection + category assignment + line counts + complexity. Trust Step C for `importMap`. Your only synthesis is the `description` field (plus the Step A narrative fields: `name`, `frameworks`, `languages`). +- Do NOT re-implement file enumeration, language detection, or category assignment in your discovery script. Use the bundled `scan-project.mjs`. If the table doesn't cover your project type, file an issue rather than ad-hoc handling. +- Do NOT attempt to re-implement import resolution. The bundled `extract-import-map.mjs` handles all 12 supported code languages (TS, JS, Python, Go, Rust, Java, Kotlin, C#, Ruby, PHP, C, C++) deterministically via tree-sitter + per-language resolvers. +- Every file MUST have a `fileCategory` field with one of: `code`, `config`, `docs`, `infra`, `data`, `script`, `markup` — `scan-project.mjs` guarantees this; just don't strip it. 
## Writing Results diff --git a/understand-anything-plugin/package.json b/understand-anything-plugin/package.json index b789128..303ff98 100644 --- a/understand-anything-plugin/package.json +++ b/understand-anything-plugin/package.json @@ -1,15 +1,17 @@ { "name": "@understand-anything/skill", - "version": "2.7.4", + "version": "2.7.5", "type": "module", "main": "dist/index.js", "types": "dist/index.d.ts", "scripts": { "build": "tsc", - "test": "vitest run" + "test": "node -e \"console.log('skill tests live at /tests/skill — run via root \\`pnpm test\\`')\"" }, "dependencies": { - "@understand-anything/core": "workspace:*" + "@understand-anything/core": "workspace:*", + "graphology": "~0.26.0", + "graphology-communities-louvain": "^2.0.2" }, "devDependencies": { "@types/node": "^22.0.0", diff --git a/understand-anything-plugin/packages/core/vitest.config.ts b/understand-anything-plugin/packages/core/vitest.config.ts new file mode 100644 index 0000000..ca9d6e7 --- /dev/null +++ b/understand-anything-plugin/packages/core/vitest.config.ts @@ -0,0 +1,7 @@ +import { defineConfig } from 'vitest/config'; + +export default defineConfig({ + test: { + include: ['src/**/*.test.{ts,tsx,mjs}'], + }, +}); diff --git a/understand-anything-plugin/skills/understand/SKILL.md b/understand-anything-plugin/skills/understand/SKILL.md index 28d131b..b66dcc2 100644 --- a/understand-anything-plugin/skills/understand/SKILL.md +++ b/understand-anything-plugin/skills/understand/SKILL.md @@ -275,26 +275,32 @@ If the scan result includes `filteredByIgnore > 0`, report: --- +## Phase 1.5 — BATCH + +Report: `[Phase 1.5/7] Computing semantic batches...` + +Run the bundled batching script: +```bash +node <SKILL_DIR>/compute-batches.mjs $PROJECT_ROOT +``` + +Reads `.understand-anything/intermediate/scan-result.json`, writes `.understand-anything/intermediate/batches.json`. + +Capture stderr. Append any line starting with `Warning:` to `$PHASE_WARNINGS` for the final report.
+ +If the script exits non-zero, the failure is hard — relay the full stderr to the user as a Phase 1.5 failure. Do not attempt to recover; the script's internal fallback (count-based) already handles recoverable issues. A non-zero exit means a fundamental problem (missing input file, malformed JSON, etc.). + +--- + ## Phase 2 — ANALYZE ### Full analysis path -Batch the file list from Phase 1 into groups of **20-30 files each** (aim for ~25 files per batch for balanced sizes). +Load `.understand-anything/intermediate/batches.json` (produced by Phase 1.5). Iterate the `batches[]` array. -**Batching strategy for non-code files:** -- Group related non-code files together in the same batch when possible: - - Dockerfile + docker-compose.yml + .dockerignore → same batch - - SQL migration files → same batch (ordered by filename) - - CI/CD config files (.github/workflows/*) → same batch - - Documentation files (docs/*.md) → same batch -- This allows the file-analyzer to create cross-file edges (e.g., docker-compose `depends_on` Dockerfile) -- Non-code files can be mixed with code files in the same batch if batch sizes are small -- Each file's `fileCategory` from Phase 1 must be included in the batch file list +Report: `[Phase 2/7] Analyzing files — files in batches (up to 5 concurrent)...` -After batching, report the plan to the user: -> `[Phase 2/7] Analyzing files — files in batches (up to 5 concurrent)...` - -For each batch, dispatch a subagent using the `file-analyzer` agent definition (at `agents/file-analyzer.md`). Run up to **5 subagents concurrently** using parallel dispatch. Append the following additional context: +For each batch, dispatch a subagent using the `file-analyzer` agent definition (at `agents/file-analyzer.md`). Run up to **5 subagents concurrently**. 
Append the following additional context: > **Additional context from main session:** > @@ -303,14 +309,7 @@ For each batch, dispatch a subagent using the `file-analyzer` agent definition ( > > $LANGUAGE_DIRECTIVE -Before dispatching each batch, construct `batchImportData` from `$IMPORT_MAP`: -```json -batchImportData = {} -for each file in this batch: - batchImportData[file.path] = $IMPORT_MAP[file.path] ?? [] -``` - -Fill in batch-specific parameters below and dispatch: +Dispatch prompt template (fill in batch-specific values from `batches.json[i]`): > Analyze these files and produce GraphNode and GraphEdge objects. > Project root: `$PROJECT_ROOT` @@ -318,11 +317,16 @@ Fill in batch-specific parameters below and dispatch: > Languages: `` > Batch: `/` > Skill directory (for bundled scripts): `` -> Write output to: `$PROJECT_ROOT/.understand-anything/intermediate/batch-.json` +> Output: write to `$PROJECT_ROOT/.understand-anything/intermediate/batch-.json` (single-file mode) OR `batch--part-.json` (split mode, per Step B of your output protocol). > -> Pre-resolved import data for this batch (use this for all import edge creation — do NOT re-resolve imports from source): +> Pre-resolved import data for this batch (use directly — do NOT re-resolve imports from source): > ```json -> +> +> ``` +> +> Cross-batch neighbors with their exported symbols (confidence boost for cross-batch edges): +> ```json +> > ``` > > Files to analyze in this batch (every entry MUST be passed through to `batchFiles` with all four fields — `path`, `language`, `sizeLines`, `fileCategory`): @@ -330,6 +334,8 @@ Fill in batch-specific parameters below and dispatch: > 2. `` ( lines, language: ``, fileCategory: ``) > ... +**Output naming is per-batchIndex — no fusion.** If you fuse multiple small batches into a single file-analyzer dispatch for token efficiency, the dispatched agent must STILL write one output file per original `batchIndex` using `batch-.json` or `batch--part-.json`. 
The merge script's regex (`batch-(\d+)(?:-part-(\d+))?\.json`) silently drops any other naming (e.g., `batch-fused-8-13.json`, `batch-8-13.json`), losing every node and edge in that file. After each dispatch returns, verify each `batchIndex` in the dispatched input has a corresponding `batch-<batchIndex>.json` (or `batch-<batchIndex>-part-*.json`) on disk before proceeding to the next dispatch. + After ALL batches complete, report to the user: `Phase 2 complete. All batches analyzed.` Run the merge-and-normalize script bundled with this skill (located next to this SKILL.md file — use the skill directory path, not the project root): @@ -337,7 +343,7 @@ Run the merge-and-normalize script bundled with this skill (located next to this python <SKILL_DIR>/merge-batch-graphs.py $PROJECT_ROOT ``` -This script reads all `batch-*.json` files from `$PROJECT_ROOT/.understand-anything/intermediate/`, then in one pass: +This script reads all `batch-*.json` files (including `batch-<batchIndex>-part-<k>.json` produced by file-analyzers that split their output) from `$PROJECT_ROOT/.understand-anything/intermediate/`, then in one pass: - Combines all nodes and edges across batches - Normalizes node IDs (strips double prefixes, project-name prefixes, adds missing prefixes) - Normalizes complexity values (`low`→`simple`, `medium`→`moderate`, `high`→`complex`, etc.) @@ -346,7 +352,7 @@ This script reads all `batch-*.json` files from `$PROJECT_ROOT/.understand-anyth - Drops dangling edges referencing missing nodes - Logs all corrections and dropped items to stderr -The merge script also runs a `tested_by` linker that canonicalizes test-coverage edges in two passes. **Pass 1** walks LLM-emitted `tested_by` edges and flips inverted ones in place (the LLM systematically emits `test → production` because it sees the import only when analyzing the test file); semantically broken edges (test↔test, prod↔prod, orphan endpoints) are dropped.
**Pass 2** supplements with path-convention pairings (`X.ts` ↔ `X.test.ts`, JS/TS `__tests__/` and `/test/` walk-out, Python in-package `tests/`, Go `_test.go` sibling, Maven/Gradle `src/test/...` ↔ `src/main/...`, .NET `/tests/` ↔ `/src/...` and `.Tests/` ↔ `/`). Production nodes that end up sourcing any `tested_by` edge get a `"tested"` tag. All resulting edges run `production → test`. +The merge script also runs a `tested_by` linker that canonicalizes test-coverage edges in two passes. **Pass 1** walks LLM-emitted `tested_by` edges and flips inverted ones in place; semantically broken edges (test↔test, prod↔prod, orphan endpoints) are dropped. **Pass 2** supplements with path-convention pairings. Production nodes that end up sourcing any `tested_by` edge get a `"tested"` tag. All resulting edges run `production → test`. Output: `$PROJECT_ROOT/.understand-anything/intermediate/assembled-graph.json` @@ -354,7 +360,20 @@ Include the script's warnings in `$PHASE_WARNINGS` for the reviewer. ### Incremental update path -Use the changed files list from Phase 0. Batch and dispatch file-analyzer subagents using the same process as above (20-30 files per batch, up to 5 concurrent, with batchImportData constructed from $IMPORT_MAP), but only for changed files. +Write the changed-files list (one path per line) to a temp file: +```bash +git diff <lastCommitHash>..HEAD --name-only > $PROJECT_ROOT/.understand-anything/tmp/changed-files.txt +``` + +Run compute-batches with `--changed-files`: +```bash +node <SKILL_DIR>/compute-batches.mjs $PROJECT_ROOT \ + --changed-files=$PROJECT_ROOT/.understand-anything/tmp/changed-files.txt +``` + +This produces a `batches.json` that contains only the batches that include at least one changed file (each kept batch still lists all of its files), but neighborMap entries still reference unchanged files (with their full-graph batchIndex) so cross-batch edges remain emittable. + +Then dispatch file-analyzer subagents per the same template as the full path. After batches complete: 1.
Remove old nodes whose `filePath` matches any changed file from the existing graph diff --git a/understand-anything-plugin/skills/understand/compute-batches.mjs b/understand-anything-plugin/skills/understand/compute-batches.mjs new file mode 100644 index 0000000..b7cce34 --- /dev/null +++ b/understand-anything-plugin/skills/understand/compute-batches.mjs @@ -0,0 +1,555 @@ +#!/usr/bin/env node +/** + * compute-batches.mjs — Phase 1.5 of /understand + * + * Reads scan-result.json, runs Louvain community detection on the import + * graph, and writes batches.json containing batches + neighborMap. + * + * Usage: + * node compute-batches.mjs [--changed-files=] + * + * Input: /.understand-anything/intermediate/scan-result.json + * Output: /.understand-anything/intermediate/batches.json + */ + +import { readFileSync, writeFileSync, existsSync, realpathSync } from 'node:fs'; +import { dirname, join, resolve } from 'node:path'; +import { fileURLToPath, pathToFileURL } from 'node:url'; +import { createRequire } from 'node:module'; + +const __filename = fileURLToPath(import.meta.url); +const PLUGIN_ROOT = resolve(dirname(__filename), '../..'); +const require = createRequire(resolve(PLUGIN_ROOT, 'package.json')); + +let core; +try { + core = await import(pathToFileURL(require.resolve('@understand-anything/core')).href); +} catch { + core = await import(pathToFileURL(resolve(PLUGIN_ROOT, 'packages/core/dist/index.js')).href); +} +const { TreeSitterPlugin, PluginRegistry, builtinLanguageConfigs, registerAllParsers } = core; + +import Graph from 'graphology'; +import louvain from 'graphology-communities-louvain'; + +/** + * For each code file, returns its top-level exported symbol names (functions, + * classes, exported consts). Per-file errors are swallowed into [] with a + * visible warning so a single bad file does not abort batching. + * + * Returns Map. 
+ */ +async function extractExports(projectRoot, codeFiles) { + let registry; + try { + const tsConfigs = builtinLanguageConfigs.filter(c => c.treeSitter); + const tsPlugin = new TreeSitterPlugin(tsConfigs); + await tsPlugin.init(); + registry = new PluginRegistry(); + registry.register(tsPlugin); + registerAllParsers(registry); + } catch (err) { + process.stderr.write( + `Warning: compute-batches: tree-sitter init failed (${err.message}) ` + + `— all symbols=[] in neighborMap — cross-batch edges limited to file-level\n`, + ); + return new Map(codeFiles.map(f => [f.path, []])); + } + + const exportsByPath = new Map(); + for (const file of codeFiles) { + const abs = join(projectRoot, file.path); + let content; + try { + content = readFileSync(abs, 'utf-8'); + } catch (err) { + process.stderr.write( + `Warning: compute-batches: exports extraction failed for ${file.path} ` + + `(read error: ${err.message}) — symbols=[] in neighborMap — ` + + `cross-batch edges to this file limited to file-level\n`, + ); + exportsByPath.set(file.path, []); + continue; + } + try { + const analysis = registry.analyzeFile(file.path, content); + const names = (analysis?.exports || []).map(e => e.name).filter(Boolean); + exportsByPath.set(file.path, names); + } catch (err) { + process.stderr.write( + `Warning: compute-batches: exports extraction failed for ${file.path} ` + + `(analyze error: ${err.message}) — symbols=[] in neighborMap — ` + + `cross-batch edges to this file limited to file-level\n`, + ); + exportsByPath.set(file.path, []); + } + } + return exportsByPath; +} + +/** + * Build batches for non-code files per Groups A-E in the design spec. + * Returns Array<{ files: FileMeta[], mergeable: boolean }> — caller assigns + * batchIndex. 
`mergeable=false` for semantic Groups A-D (Dockerfile clusters, + * .github/workflows, .gitlab-ci/.circleci, SQL migrations) preserves their + * boundary intent across the merge-small pass; Group E (catch-all parent-dir + * grouping) is `mergeable=true` so its tiny singletons can be pooled. + */ +function buildNonCodeBatches(nonCodeFiles) { + const byPath = new Map(nonCodeFiles.map(f => [f.path, f])); + const consumed = new Set(); + const groups = []; + + const dirOf = p => p.includes('/') ? p.slice(0, p.lastIndexOf('/')) : ''; + const baseOf = p => p.includes('/') ? p.slice(p.lastIndexOf('/') + 1) : p; + + // Group A: per-directory Dockerfile clusters. + const dirsWithDockerfile = new Set( + [...byPath.keys()] + .filter(p => baseOf(p) === 'Dockerfile') + .map(dirOf), + ); + for (const dir of [...dirsWithDockerfile].sort()) { + const inDir = [...byPath.keys()].filter(p => dirOf(p) === dir); + const cluster = inDir.filter(p => { + const b = baseOf(p); + return b === 'Dockerfile' + || b === '.dockerignore' + || b.startsWith('docker-compose.'); + }); + if (cluster.length) { + groups.push({ files: cluster.map(p => byPath.get(p)), mergeable: false }); + cluster.forEach(p => consumed.add(p)); + } + } + + // Group B: .github/workflows/* + const ghWorkflows = [...byPath.keys()].filter( + p => p.startsWith('.github/workflows/') && (p.endsWith('.yml') || p.endsWith('.yaml')), + ).filter(p => !consumed.has(p)); + if (ghWorkflows.length) { + groups.push({ files: ghWorkflows.map(p => byPath.get(p)), mergeable: false }); + ghWorkflows.forEach(p => consumed.add(p)); + } + + // Group C: .gitlab-ci.yml + .circleci/* + const ciFiles = [...byPath.keys()].filter( + p => (p === '.gitlab-ci.yml' || p.startsWith('.circleci/')) + && !consumed.has(p), + ); + if (ciFiles.length) { + groups.push({ files: ciFiles.map(p => byPath.get(p)), mergeable: false }); + ciFiles.forEach(p => consumed.add(p)); + } + + // Group D: SQL migrations per migrations/ or migration/ directory. 
+ // Defensive consumed.has check: no upstream group consumes SQL today, but + // future Group additions could; keep the check for forward-compat. + const migrationDirs = new Set( + [...byPath.keys()] + .filter(p => p.endsWith('.sql')) + .map(dirOf) + .filter(d => /(^|\/)migrations?$/.test(d)), + ); + for (const dir of migrationDirs) { + const sqls = [...byPath.keys()] + .filter(p => dirOf(p) === dir && p.endsWith('.sql') && !consumed.has(p)) + .sort(); + if (sqls.length) { + groups.push({ files: sqls.map(p => byPath.get(p)), mergeable: false }); + sqls.forEach(p => consumed.add(p)); + } + } + + // Group E: all remaining grouped by immediate parent dir, max 20 per batch + const remainingByDir = new Map(); + for (const p of [...byPath.keys()].sort()) { + if (consumed.has(p)) continue; + const dir = dirOf(p); + if (!remainingByDir.has(dir)) remainingByDir.set(dir, []); + remainingByDir.get(dir).push(p); + } + // Per design spec: max files per parent-dir batch for Group E. + const MAX_E = 20; + for (const [, paths] of remainingByDir) { + for (let i = 0; i < paths.length; i += MAX_E) { + const slice = paths.slice(i, i + MAX_E); + groups.push({ files: slice.map(p => byPath.get(p)), mergeable: true }); + } + } + + return groups; +} + +/** + * Build a lookup map from file path → batchIndex across all batches (code + + * non-code). Used to resolve cross-batch neighbor references in neighborMap. + */ +function buildBatchOfMap(allBatches) { + const m = new Map(); + for (const b of allBatches) { + for (const f of b.files) m.set(f.path, b.batchIndex); + } + return m; +} + +/** + * Returns Map via Louvain. May throw — caller must catch + * and fall back if it does. Honors UA_COMPUTE_BATCHES_FORCE_LOUVAIN_THROW=1 + * to allow tests to exercise the fallback path. 
+ */ +function runLouvain(codeFiles, importMap) { + if (process.env.UA_COMPUTE_BATCHES_FORCE_LOUVAIN_THROW === '1') { + throw new Error('forced throw via UA_COMPUTE_BATCHES_FORCE_LOUVAIN_THROW'); + } + const g = new Graph({ type: 'undirected', allowSelfLoops: false }); + for (const f of codeFiles) g.addNode(f.path); + for (const [src, targets] of Object.entries(importMap)) { + if (!g.hasNode(src)) continue; + for (const tgt of targets) { + if (!g.hasNode(tgt) || src === tgt || g.hasEdge(src, tgt)) continue; + g.addEdge(src, tgt); + } + } + const cs = louvain(g); // { nodeId: communityId } + return new Map(Object.entries(cs)); +} + +/** + * Returns Map via alphabetical chunking of `batchSize` + * files per batch. Deterministic, used as fallback when Louvain fails. + */ +function countBasedAssignment(codeFiles, batchSize = 12) { + const out = new Map(); + const sorted = [...codeFiles].map(f => f.path).sort(); + for (let i = 0; i < sorted.length; i++) { + out.set(sorted[i], `count_${Math.floor(i / batchSize)}`); + } + return out; +} + +/** + * Pool small mergeable batches into "misc" batches to reduce dispatch overhead. + * Preserves semantic groupings (non-code Groups A-D, marked `mergeable=false`) + * regardless of size; only merges code Louvain singletons / orphans and + * Group E parent-dir batches that fall below MIN_BATCH_SIZE. + * + * On a 314-file microservices-demo run, vanilla Louvain produced 87 singleton + * communities → 87 dispatch tasks of size 1. This pass collapses them into + * ceil(N / MAX_MERGE_TARGET) misc batches, drastically cutting orchestration + * overhead while leaving the high-modularity communities untouched. + * + * Returns the rewritten batch list with reassigned batchIndex (1-based, + * keepers first preserving their relative order, misc batches appended). 
+ */ +function mergeSmallBatches(bareBatches) { + // MIN_BATCH_SIZE=3: below this, file-analyzer dispatch overhead (subagent + // spin-up, prompt setup) dwarfs the per-file analysis cost — not worth a + // standalone batch. + const MIN_BATCH_SIZE = 3; + // MAX_MERGE_TARGET=25: stays below MAX_COMMUNITY_SIZE=35 so the misc-batch + // agent retains headroom for neighborMap context without overflowing. + const MAX_MERGE_TARGET = 25; + + const keepers = []; + const smallMergeable = []; + for (const b of bareBatches) { + if (b.mergeable && b.files.length < MIN_BATCH_SIZE) { + smallMergeable.push(b); + } else { + keepers.push(b); + } + } + + if (smallMergeable.length === 0) { + // Nothing to merge — strip mergeable flag and renumber for cleanliness. + return keepers.map((b, i) => ({ + batchIndex: i + 1, + files: b.files, + })); + } + + // Pool and sort deterministically by path so repeated runs match byte-for-byte. + const pooledFiles = smallMergeable + .flatMap(b => b.files) + .sort((a, b) => a.path.localeCompare(b.path)); + + const miscBatches = []; + for (let i = 0; i < pooledFiles.length; i += MAX_MERGE_TARGET) { + miscBatches.push({ files: pooledFiles.slice(i, i + MAX_MERGE_TARGET) }); + } + + // Use `Info:` rather than `Warning:` — singleton consolidation is a + // routine optimization, not a fallback/degrade path. Per + // [[feedback_visible_warnings]] only fallbacks should bubble as Warning: + // to the Phase 7 final report. Real warnings would get drowned out if + // every normal Louvain run with singletons (i.e. almost every run) added + // a Warning: line. 
+ process.stderr.write( + `Info: compute-batches: merged ${smallMergeable.length} small batches ` + + `(${pooledFiles.length} files) into ${miscBatches.length} misc batches ` + + `— singletons and orphans consolidated\n`, + ); + + const final = [...keepers, ...miscBatches]; + return final.map((b, i) => ({ + batchIndex: i + 1, + files: b.files, + })); +} + +// ── Main: load → Louvain (or count-fallback) → enrich → write batches.json ─ +async function main() { + const projectRoot = process.argv[2]; + if (!projectRoot) { + process.stderr.write('Usage: node compute-batches.mjs [--changed-files=]\n'); + process.exit(1); + } + + let changedFiles = null; + for (const arg of process.argv.slice(3)) { + const m = arg.match(/^--changed-files=(.+)$/); + if (m) { + const p = m[1]; + let content; + try { + content = readFileSync(p, 'utf-8'); + } catch (err) { + process.stderr.write( + `Error: compute-batches: --changed-files path not readable: ${p} (${err.message})\n`, + ); + process.exit(1); + } + const lines = content + .split('\n') + .map(s => s.trim()) + .filter(Boolean); + changedFiles = new Set(lines); + } + } + + const scanPath = join(projectRoot, '.understand-anything', 'intermediate', 'scan-result.json'); + if (!existsSync(scanPath)) { + process.stderr.write(`Error: scan-result.json not found at ${scanPath}\n`); + process.exit(1); + } + + const scan = JSON.parse(readFileSync(scanPath, 'utf-8')); + const files = scan.files || []; + const codeFiles = files.filter(f => f.fileCategory === 'code'); + const nonCodeFiles = files.filter(f => f.fileCategory !== 'code'); + const importMap = scan.importMap || {}; + + process.stderr.write(`Loaded ${files.length} files (${codeFiles.length} code).\n`); + + const exportsByPath = await extractExports(projectRoot, codeFiles); + + let algorithm = 'louvain'; + let perFileCommunity; + try { + perFileCommunity = runLouvain(codeFiles, importMap); + } catch (err) { + process.stderr.write( + `Warning: compute-batches: Louvain failed 
(${err.message}) ` + + `— falling back to count-based grouping (12 files/batch) ` + + `— module semantic boundaries lost\n`, + ); + perFileCommunity = countBasedAssignment(codeFiles, 12); + algorithm = 'count-fallback'; + } + + // Group files by community id + const filesByCommunity = new Map(); + for (const [path, cid] of perFileCommunity) { + if (!filesByCommunity.has(cid)) filesByCommunity.set(cid, []); + filesByCommunity.get(cid).push(path); + } + + // Size enforcement only on louvain output. count-fallback already chunked. + const MAX_COMMUNITY_SIZE = 35; + const splitCommunities = new Map(); + let nextSyntheticId = 0; + if (algorithm === 'louvain') { + for (const [cid, paths] of filesByCommunity) { + if (paths.length <= MAX_COMMUNITY_SIZE) { + splitCommunities.set(cid, paths); + continue; + } + process.stderr.write( + `Warning: compute-batches: community size ${paths.length} > max ${MAX_COMMUNITY_SIZE} ` + + `— splitting via alphabetical chunking — modularity may decrease\n`, + ); + const sorted = [...paths].sort(); + const parts = Math.ceil(paths.length / MAX_COMMUNITY_SIZE); + const perPart = Math.ceil(paths.length / parts); + for (let i = 0; i < parts; i++) { + const slice = sorted.slice(i * perPart, (i + 1) * perPart); + const synthId = `__split_${cid}_${nextSyntheticId++}`; + splitCommunities.set(synthId, slice); + } + } + } else { + for (const [cid, paths] of filesByCommunity) splitCommunities.set(cid, paths); + } + + // Sort communities by size desc, then by min-path asc for determinism + const sortedCommunities = [...splitCommunities.entries()] + .sort((a, b) => { + if (b[1].length !== a[1].length) return b[1].length - a[1].length; + const minA = [...a[1]].sort()[0]; + const minB = [...b[1]].sort()[0]; + return minA.localeCompare(minB); + }); + + // Build per-batch file list with full file metadata from scan + const fileMetaByPath = new Map(files.map(f => [f.path, f])); + // Safe: every path in a community is a graph node, and graph nodes are a + // 
subset of files (see addNode loop above). fileMetaByPath.get() can + // never return undefined here. + + // First-pass: assemble bare batches (no batchImportData/neighborMap yet). + // All Louvain communities are mergeable=true so the merge-small pass can + // collapse singletons / 2-file orphans. Non-code groups carry per-group + // mergeable flags from buildNonCodeBatches (false for semantic Groups A-D, + // true for Group E catch-all). + const codeBatchObjsBare = sortedCommunities.map(([, paths], idx) => ({ + batchIndex: idx + 1, + files: paths.sort().map(p => fileMetaByPath.get(p)), + mergeable: true, + })); + const nonCodeGroups = buildNonCodeBatches(nonCodeFiles); + const nonCodeBatchObjsBare = nonCodeGroups.map((g, i) => ({ + batchIndex: codeBatchObjsBare.length + i + 1, + files: g.files, + mergeable: g.mergeable, + })); + const bareBatches = [...codeBatchObjsBare, ...nonCodeBatchObjsBare]; + const mergedBareBatches = mergeSmallBatches(bareBatches); + const batchOf = buildBatchOfMap(mergedBareBatches); + + // Build reverse import map: target → [sources that import target] + const reverseImportMap = new Map(); + for (const [src, targets] of Object.entries(importMap)) { + for (const tgt of targets) { + if (!reverseImportMap.has(tgt)) reverseImportMap.set(tgt, []); + reverseImportMap.get(tgt).push(src); + } + } + + // Compute neighbor degree (number of import relations) per path, used for + // truncation when neighborMap[file] has > MAX_NEIGHBORS entries. 
+ const NEIGHBOR_DEGREE = new Map(); + for (const f of codeFiles) { + const outDeg = (importMap[f.path] || []).length; + const inDeg = (reverseImportMap.get(f.path) || []).length; + NEIGHBOR_DEGREE.set(f.path, outDeg + inDeg); + } + + const MAX_NEIGHBORS = 50; + + // Second-pass: enrich each batch with batchImportData + neighborMap + const batches = mergedBareBatches.map(b => { + const batchPaths = new Set(b.files.map(f => f.path)); + const batchImportData = {}; + const neighborMap = {}; + for (const f of b.files) { + batchImportData[f.path] = (importMap[f.path] || []).slice(); + + // 1-hop neighbors: imports out + imported-by in, excluding same batch. + // Note on truncation: we measure "popularity" by total raw 1-hop neighbor + // count (rawCount), not kept.length. A widely-imported hub like a logger + // module may have N>50 inbound imports but, after Louvain + size + // enforcement, only some land in other batches — kept.length can be < 50 + // while the file is still a high-degree hub whose missing relationships + // matter for downstream cross-batch edge confidence. Warning on rawCount + // surfaces this; truncation on kept ensures the JSON stays bounded. 
+ const outNeighbors = importMap[f.path] || []; + const inNeighbors = reverseImportMap.get(f.path) || []; + const all = new Set([...outNeighbors, ...inNeighbors]); + const rawCount = all.size; + const filtered = [...all].filter(p => batchOf.has(p) && !batchPaths.has(p)); + + let kept = filtered.map(p => ({ + path: p, + batchIndex: batchOf.get(p), + symbols: exportsByPath.get(p) || [], + })); + + if (rawCount > MAX_NEIGHBORS) { + kept.sort((a, b2) => (NEIGHBOR_DEGREE.get(b2.path) || 0) + - (NEIGHBOR_DEGREE.get(a.path) || 0) + || a.path.localeCompare(b2.path)); // deterministic tiebreak + const beforeSlice = kept.length; + kept = kept.slice(0, MAX_NEIGHBORS); + process.stderr.write( + `Warning: compute-batches: neighborMap for ${f.path} has high 1-hop degree ${rawCount} ` + + `— exceeds soft cap of ${MAX_NEIGHBORS} — keeping top ${kept.length} cross-batch entries ` + + `(${beforeSlice - kept.length} dropped by degree sort)\n`, + ); + } + + if (kept.length) neighborMap[f.path] = kept; + } + return { batchIndex: b.batchIndex, files: b.files, batchImportData, neighborMap }; + }); + + let finalBatches = batches; + if (changedFiles) { + finalBatches = batches.filter(b => b.files.some(f => changedFiles.has(f.path))); + // batchIndex on filtered batches retains the full-graph assignment + // (the design says neighborMap should still reference unchanged files' + // full-graph batchIndex). No renumbering. + } + + // Note: under --changed-files mode, totalFiles is the FULL project file + // count (unchanged from the input scan) while totalBatches reflects only + // the filtered set written to disk. batchIndex values on the kept batches + // preserve the full-graph assignment so neighborMap references resolve. 
+ const output = { + schemaVersion: 1, + algorithm, + totalFiles: scan.files.length, + totalBatches: finalBatches.length, + exportsByPath: Object.fromEntries(exportsByPath), + batches: finalBatches, + }; + + const outPath = join(projectRoot, '.understand-anything', 'intermediate', 'batches.json'); + writeFileSync(outPath, JSON.stringify(output, null, 2), 'utf-8'); + const batchSizes = finalBatches.map(b => b.files.length); + const maxSize = batchSizes.length ? Math.max(...batchSizes) : 0; + const minSize = batchSizes.length ? Math.min(...batchSizes) : 0; + process.stderr.write( + `Wrote ${finalBatches.length} batches (sizes: max=${maxSize}, min=${minSize}) to ${outPath}\n`, + ); +} + +// --------------------------------------------------------------------------- +// Run only when executed directly as a CLI; importing the module (e.g. from +// tests) must not trigger main(). +// +// Canonicalize both sides through realpathSync. Node ESM resolves +// import.meta.url through symlinks but pathToFileURL(process.argv[1]) preserves +// them, so a raw equality check silently no-ops when the script is invoked via +// a symlinked plugin install path (the default in Claude Code / Copilot CLI +// caches). See GitHub issue #162. 
+// --------------------------------------------------------------------------- +function isCliEntry() { + if (!process.argv[1]) return false; + try { + const modulePath = realpathSync(fileURLToPath(import.meta.url)); + const argvPath = realpathSync(process.argv[1]); + return modulePath === argvPath; + } catch { + return false; + } +} + +if (isCliEntry()) { + try { + await main(); + } catch (err) { + process.stderr.write(`compute-batches.mjs failed: ${err.message}\n${err.stack}\n`); + process.exit(1); + } +} diff --git a/understand-anything-plugin/skills/understand/extract-import-map.mjs b/understand-anything-plugin/skills/understand/extract-import-map.mjs new file mode 100644 index 0000000..6c547d3 --- /dev/null +++ b/understand-anything-plugin/skills/understand/extract-import-map.mjs @@ -0,0 +1,1558 @@ +#!/usr/bin/env node +/** + * extract-import-map.mjs + * + * Deterministic import resolution script for the project-scanner agent. + * Uses PluginRegistry (TreeSitterPlugin + non-code parsers) from + * @understand-anything/core to extract raw import paths via tree-sitter, + * then applies language-specific resolution rules to map them to + * project-internal file paths. + * + * Replaces the LLM-written prose import resolver in agents/project-scanner.md + * (the prose previously described patterns by language; runtime LLMs produced + * inconsistent, regex-only scripts with sparse coverage). + * + * Usage: + * node extract-import-map.mjs + * + * Input JSON: + * { + * projectRoot: , + * files: [{ path, language, fileCategory }, ...] + * } + * + * Output JSON: + * { + * scriptCompleted: true, + * stats: { filesScanned, filesWithImports, totalEdges }, + * importMap: { : [, ...], ... } + * } + * + * Logging: stderr only (stdout reserved for piped tools). + * Per-file resilience: failures emit `Warning: extract-import-map: ...` and + * set importMap[path] = [], they do not abort the script. 
/* Module preamble: stdlib imports, plugin-root discovery, core loading. */

import { createRequire } from 'node:module';
import { dirname, resolve, join, posix } from 'node:path';
import { fileURLToPath, pathToFileURL } from 'node:url';
import { existsSync, readFileSync, realpathSync, writeFileSync } from 'node:fs';

const __dirname = dirname(fileURLToPath(import.meta.url));
// This script lives in skills/understand/, so the plugin root is two
// directories above it.
const pluginRoot = resolve(__dirname, '../..');
const require = createRequire(resolve(pluginRoot, 'package.json'));

// ---------------------------------------------------------------------------
// Resolve @understand-anything/core
//
// On Windows, Node's ESM dynamic import() needs a file:// URL: a raw absolute
// path such as "C:\..." throws ERR_UNSUPPORTED_ESM_URL_SCHEME because the
// loader reads "C:" as a URL scheme. Both resolution attempts therefore go
// through pathToFileURL().
// ---------------------------------------------------------------------------
let core;
try {
  const resolvedCore = require.resolve('@understand-anything/core');
  core = await import(pathToFileURL(resolvedCore).href);
} catch {
  // Fallback: direct path used by installed plugin cache layouts.
  const bundledCore = resolve(pluginRoot, 'packages/core/dist/index.js');
  core = await import(pathToFileURL(bundledCore).href);
}

const { TreeSitterPlugin, PluginRegistry, builtinLanguageConfigs, registerAllParsers } = core;

// ---------------------------------------------------------------------------
// Path helpers
// ---------------------------------------------------------------------------

/**
 * Convert a project-relative path to forward-slash (POSIX) form.
 *
 * Splits on both `/` and `\`, discards empty segments (so leading, trailing
 * and doubled separators disappear) and re-joins with `/`.
 *
 * @param {string} p - Path using either separator style.
 * @returns {string} The same path with `/` separators only.
 */
function toPosix(p) {
  const segments = p.split(/[\\/]/).filter((segment) => segment !== '');
  return segments.join('/');
}
/**
 * Join a directory with a relative segment, normalizing `.` / `..` segments.
 *
 * Both inputs are forward-slash paths anchored at the project root (no
 * leading slash).
 *
 * @param {string} dir - Starting directory ('' for the project root).
 * @param {string} rel - Relative path to apply, e.g. `../foo/bar`.
 * @returns {string} The normalized path, or '' when `rel` climbs above the
 *   project root.
 */
function resolveRelative(dir, rel) {
  const segments = [...(dir ? dir.split('/') : []), ...rel.split('/')];
  const stack = [];
  for (const segment of segments) {
    if (segment === '' || segment === '.') continue;
    if (segment === '..') {
      // Nothing left to pop: the path escapes the project root.
      if (stack.length === 0) return '';
      stack.pop();
      continue;
    }
    stack.push(segment);
  }
  return stack.join('/');
}

/**
 * Return the directory portion of a project-relative path.
 *
 * @param {string} p - Forward-slash path.
 * @returns {string} Everything before the last `/` (no trailing slash), or
 *   '' for a top-level file.
 */
function dirOf(p) {
  const i = p.lastIndexOf('/');
  return i === -1 ? '' : p.slice(0, i);
}

// ---------------------------------------------------------------------------
// Config loading
//
// Cached once at startup. Per-file resolvers consume these values; they MUST
// NOT re-read these files (a 1000-file project would otherwise re-parse the
// same config 1000 times).
// ---------------------------------------------------------------------------

/**
 * Remove JSONC extras (line comments, block comments, trailing commas) from
 * `text` without touching the contents of string literals.
 *
 * String awareness matters for tsconfig files: the ubiquitous alias key
 * `"@/*"` contains `/*`, which a regex-based stripper mistakes for the start
 * of a block comment.
 *
 * @param {string} text - JSON-with-comments source.
 * @returns {string} Text intended for JSON.parse.
 */
function stripJsonc(text) {
  const out = [];
  // Index in `out` of the most recent comma that has only been followed by
  // whitespace/comments so far; blanked if the next token closes a container.
  let pendingComma = -1;
  let i = 0;
  while (i < text.length) {
    const ch = text[i];
    const next = text[i + 1];
    if (ch === '"') {
      // Copy the whole string literal verbatim, honoring backslash escapes.
      let end = i + 1;
      while (end < text.length && text[end] !== '"') {
        end += text[end] === '\\' ? 2 : 1;
      }
      out.push(text.slice(i, end + 1));
      pendingComma = -1;
      i = end + 1;
    } else if (ch === '/' && next === '/') {
      // Line comment: skip up to (not including) the newline.
      while (i < text.length && text[i] !== '\n') i += 1;
    } else if (ch === '/' && next === '*') {
      const close = text.indexOf('*/', i + 2);
      i = close === -1 ? text.length : close + 2;
      out.push(' '); // keep neighbouring tokens separated
    } else {
      if (ch === ',') {
        pendingComma = out.length;
      } else if (!/\s/.test(ch)) {
        if (pendingComma !== -1 && (ch === '}' || ch === ']')) {
          out[pendingComma] = '';
        }
        pendingComma = -1;
      }
      out.push(ch);
      i += 1;
    }
  }
  return out.join('');
}

/**
 * Parse the text of a single tsconfig.json.
 *
 * Parse strategy:
 *   1. Drop a leading byte-order mark, then parse the JSONC-stripped text
 *      (comments and trailing commas removed, string contents preserved).
 *   2. If that fails, retry the unstripped text.
 *   3. If both fail, return `null`.
 *
 * Returning `null` instead of throwing lets the caller emit a warning that
 * names the exact tsconfig path when many configs are being loaded.
 *
 * @param {string} raw - File contents.
 * @returns {{ baseUrl: string, paths: Map<string, string[]> } | null}
 *   `baseUrl` defaults to '.' when absent or not a string. `paths` keeps alias
 *   keys verbatim (wildcards included) and only array-valued entries, with
 *   non-string targets removed.
 */
function parseTsConfigText(raw) {
  const text = raw.charCodeAt(0) === 0xfeff ? raw.slice(1) : raw;
  let parsed;
  try {
    parsed = JSON.parse(stripJsonc(text));
  } catch {
    try {
      parsed = JSON.parse(text);
    } catch {
      return null;
    }
  }
  const compilerOptions = parsed?.compilerOptions ?? {};
  const baseUrl = typeof compilerOptions.baseUrl === 'string'
    ? compilerOptions.baseUrl
    : '.';
  const paths = new Map();
  if (compilerOptions.paths && typeof compilerOptions.paths === 'object') {
    for (const [alias, targets] of Object.entries(compilerOptions.paths)) {
      if (Array.isArray(targets)) {
        paths.set(alias, targets.filter((target) => typeof target === 'string'));
      }
    }
  }
  return { baseUrl, paths };
}
/**
 * Find every `tsconfig.json` in the input file list, read it from disk and
 * parse it.
 *
 * The result is keyed by the project-relative POSIX directory that contains
 * the tsconfig ('' for a root-level file). Values are whatever
 * `parseTsConfigText` returns: `{ baseUrl, paths }`, where `paths` keys keep
 * their `*` wildcards and each value is an array of targets.
 *
 * Why every tsconfig and not just the root one: workspace monorepos carry
 * per-package tsconfigs with package-scoped aliases, so the resolver walks
 * up from each importer to its nearest config.
 *
 * Entries whose file is missing on disk are skipped quietly. Read and parse
 * failures write a `Warning:` line naming the file to stderr and skip only
 * that config. An empty map is a normal result (JS-only projects).
 *
 * @param {string} projectRoot - Absolute path of the project root.
 * @param {Array<{path: string}>} files - Scanner file entries.
 * @returns {Map<string, object>} Directory -> parsed tsconfig.
 */
function loadTsConfigs(projectRoot, files) {
  const configsByDir = new Map();
  for (const entry of files) {
    const relPath = toPosix(entry.path);
    const fileName = relPath.slice(relPath.lastIndexOf('/') + 1);
    if (fileName !== 'tsconfig.json') continue;

    const absPath = join(projectRoot, relPath);
    if (!existsSync(absPath)) continue;

    let raw;
    try {
      raw = readFileSync(absPath, 'utf-8');
    } catch (err) {
      process.stderr.write(
        `Warning: extract-import-map: tsconfig.json at ${absPath} failed ` +
          `to read (${err.message}) — path aliases from this config will ` +
          `not be applied — relative imports unaffected\n`,
      );
      continue;
    }

    const parsed = parseTsConfigText(raw);
    if (parsed) {
      configsByDir.set(dirOf(relPath), parsed);
      continue;
    }
    process.stderr.write(
      `Warning: extract-import-map: tsconfig.json at ${absPath} failed ` +
        `to parse — path aliases from this config will not be applied ` +
        `— relative imports unaffected\n`,
    );
  }
  return configsByDir;
}
/**
 * Extract the module path declared by the `module` directive of a go.mod.
 *
 * Accepts the forms the go.mod grammar allows: any whitespace between the
 * keyword and the path, a quoted path (interpreted or raw string), the
 * single-entry block form `module ( path )`, and trailing `//` comments.
 *
 * @param {string} raw - go.mod contents.
 * @returns {string} The module path, or '' when no directive is found.
 */
function parseGoModulePath(raw) {
  let inBlock = false;
  for (const line of raw.split(/\r?\n/)) {
    const text = line.replace(/\/\/.*$/, '').trim();
    if (!text) continue;
    let candidate;
    if (inBlock) {
      if (text === ')') return '';
      candidate = text;
    } else {
      // Lookahead keeps identifiers such as `modulex` from matching.
      const m = /^module(?=[\s(])\s*(.*)$/.exec(text);
      if (!m) continue;
      if (m[1] === '(') {
        inBlock = true;
        continue;
      }
      candidate = m[1];
    }
    const quoted = /^(["`])(.*)\1$/.exec(candidate);
    return quoted ? quoted[2] : candidate;
  }
  return '';
}

/**
 * Load every `go.mod` present in the input file list and extract its module
 * path.
 *
 * Why every go.mod: multi-module repositories have one go.mod per service,
 * and the Go resolver dispatches per importer by walking up to the nearest
 * one.
 *
 * Only files listed in `files` are considered. A go.mod that is missing on
 * disk, unreadable, or has no `module` directive is skipped without a
 * warning; the per-file resolver reports "no ancestor go.mod" when that
 * matters for an importer.
 *
 * @param {string} projectRoot - Absolute path of the project root.
 * @param {Array<{path: string}>} files - Scanner file entries.
 * @returns {Map<string, string>} Project-relative POSIX directory of the
 *   go.mod ('' for the root) -> module path.
 */
function loadGoModules(projectRoot, files) {
  const out = new Map();
  for (const f of files) {
    const p = toPosix(f.path);
    const base = p.slice(p.lastIndexOf('/') + 1);
    if (base !== 'go.mod') continue;
    const absPath = join(projectRoot, p);
    if (!existsSync(absPath)) continue;
    let raw;
    try {
      raw = readFileSync(absPath, 'utf-8');
    } catch {
      // Deliberate best-effort: see the function doc.
      continue;
    }
    const moduleName = parseGoModulePath(raw);
    if (!moduleName) continue;
    out.set(dirOf(p), moduleName);
  }
  return out;
}

/**
 * Find the deepest ancestor of `startDir` (including `startDir` itself) that
 * is a key of `configMap`.
 *
 * Ancestors are tested from longest prefix to shortest, ending with '' (the
 * project root), so the nearest enclosing config always wins.
 *
 * @param {string} startDir - Project-relative POSIX directory ('' for root).
 * @param {Map<string, unknown>} configMap - Map keyed by directory.
 * @returns {string | undefined} The matching directory key, or undefined.
 */
function findNearestConfigDir(startDir, configMap) {
  if (configMap.size === 0) return undefined;
  const parts = startDir ? startDir.split('/').filter(Boolean) : [];
  for (let i = parts.length; i >= 0; i--) {
    const ancestor = parts.slice(0, i).join('/');
    if (configMap.has(ancestor)) return ancestor;
  }
  return undefined;
}

/**
 * Build the resolution context shared by all per-file resolver calls. Built
 * once and passed everywhere so no resolver re-reads config or re-scans the
 * file list.
 *
 * Fields of the returned object:
 *   - projectRoot: the value passed in.
 *   - fileSet: Set of every input file's POSIX path.
 *   - tsConfigs: result of loadTsConfigs (directory -> parsed tsconfig).
 *   - goModules: result of loadGoModules (directory -> module path).
 *   - goFilesByDir: directory -> sorted list of the .go files directly in it.
 *   - javaIndex / kotlinIndex / csIndex: suffix indices from
 *     buildSuffixIndex for `.java`, `.kt` and `.cs` files.
 *   - phpAutoloads: result of loadPhpAutoloads.
 *   - _warnedNoRustCrateRoot / _warnedNoGoModule: Sets keyed by importer
 *     path, mutated by resolvers to emit each warning once per file.
 *
 * @param {string} projectRoot - Absolute path of the project root.
 * @param {Array<{path: string}>} files - Scanner file entries.
 * @returns {object} The context described above.
 */
function buildResolutionContext(projectRoot, files) {
  const fileSet = new Set(files.map((f) => toPosix(f.path)));
  const tsConfigs = loadTsConfigs(projectRoot, files);
  const goModules = loadGoModules(projectRoot, files);

  // Group .go files by parent directory so a package-level Go import expands
  // to its member files with a single lookup.
  const goFilesByDir = new Map();
  for (const f of files) {
    if (!f.path.endsWith('.go')) continue;
    const p = toPosix(f.path);
    const d = dirOf(p);
    if (!goFilesByDir.has(d)) goFilesByDir.set(d, []);
    goFilesByDir.get(d).push(p);
  }
  for (const arr of goFilesByDir.values()) {
    arr.sort((a, b) => a.localeCompare(b));
  }

  // Per-extension suffix indices for the dotted-FQN resolvers.
  const javaIndex = buildSuffixIndex(files, (p) => p.endsWith('.java'));
  const kotlinIndex = buildSuffixIndex(files, (p) => p.endsWith('.kt'));
  const csIndex = buildSuffixIndex(files, (p) => p.endsWith('.cs'));

  const phpAutoloads = loadPhpAutoloads(projectRoot, files);

  return {
    projectRoot,
    fileSet,
    tsConfigs,
    goModules,
    goFilesByDir,
    javaIndex,
    kotlinIndex,
    csIndex,
    phpAutoloads,
    _warnedNoRustCrateRoot: new Set(),
    _warnedNoGoModule: new Set(),
  };
}

// ---------------------------------------------------------------------------
// TypeScript / JavaScript resolver
//
// Handles:
//   - Relative imports: `import x from './foo'` -> path under the importer's
//     directory, plus extension probes
//   - tsconfig path aliases: `import x from '@/foo'` -> path under the
//     nearest tsconfig's baseUrl and alias target
//
// `imp.source` from tree-sitter is the literal string content of the import
// path (no quotes). We don't need to redo the regex work — we just classify
// the source string and dispatch.
// ---------------------------------------------------------------------------

// Extensions probed when the import has no extension. The order mirrors the
// historical project-scanner prose so behavior matches existing fixtures.
const TS_EXT_PROBES = [
  '.ts', '.tsx', '.js', '.jsx', '.mjs', '.cjs',
  '/index.ts', '/index.tsx', '/index.js', '/index.jsx',
];
/**
 * TypeScript source extensions to try when an import names a JavaScript
 * output extension. ESM-style TypeScript code writes `./foo.js` for a module
 * whose source file is `foo.ts`, so the written extension is not the
 * extension on disk.
 */
const JS_TO_TS_SOURCE_EXTS = new Map([
  ['.js', ['.ts', '.tsx']],
  ['.jsx', ['.tsx']],
  ['.mjs', ['.mts']],
  ['.cjs', ['.cts']],
]);

/**
 * Probe the file set for `basePath`.
 *
 * Order of attempts:
 *   1. `basePath` exactly (the import already carried an extension).
 *   2. `basePath` + each entry of TS_EXT_PROBES (file extensions, then
 *      directory index files).
 *   3. If `basePath` ends in a JavaScript extension, the same stem with the
 *      corresponding TypeScript source extension(s).
 *
 * @param {string} basePath - Project-relative POSIX path without quotes.
 * @param {Set<string>} fileSet - All project file paths.
 * @returns {string | null} First matching project path, or null.
 */
function probeWithExtensions(basePath, fileSet) {
  if (!basePath) return null;
  if (fileSet.has(basePath)) return basePath;
  for (const ext of TS_EXT_PROBES) {
    const candidate = basePath + ext;
    if (fileSet.has(candidate)) return candidate;
  }
  // Only treat the trailing `.xyz` as an extension when it belongs to the
  // final path segment and that segment has a non-empty stem.
  const dot = basePath.lastIndexOf('.');
  if (dot > basePath.lastIndexOf('/') + 1) {
    const stem = basePath.slice(0, dot);
    for (const tsExt of JS_TO_TS_SOURCE_EXTS.get(basePath.slice(dot)) ?? []) {
      const candidate = stem + tsExt;
      if (fileSet.has(candidate)) return candidate;
    }
  }
  return null;
}

/**
 * Resolve a TypeScript / JavaScript import specifier to a project file.
 *
 * - Relative specifiers (`.`, `..`, `./x`, `../x`) resolve against the
 *   importer's directory; tsconfig plays no part.
 * - Anything else is matched against the `paths` aliases of the nearest
 *   enclosing tsconfig (found by walking up from the importer). Alias targets
 *   are anchored at that tsconfig's directory joined with its `baseUrl`, so a
 *   nested package's `baseUrl: '.'` means the package, not the project root.
 * - Specifiers that match no alias are treated as external packages.
 *
 * @param {string} rawImport - Import source text, without quotes.
 * @param {{path: string}} file - The importing file.
 * @param {{fileSet: Set<string>, tsConfigs: Map<string, {baseUrl: string, paths: Map<string, string[]>}>}} ctx
 *   Resolution context.
 * @returns {string | null} Project-relative path, or null when unresolved
 *   or external.
 */
export function resolveTsJsImport(rawImport, file, ctx) {
  if (!rawImport || typeof rawImport !== 'string') return null;
  const src = rawImport.trim();
  if (!src) return null;

  const importerDir = dirOf(toPosix(file.path));

  const isRelative =
    src === '.' || src === '..' || src.startsWith('./') || src.startsWith('../');
  if (isRelative) {
    return probeWithExtensions(resolveRelative(importerDir, src), ctx.fileSet);
  }

  const tsConfigDir = findNearestConfigDir(importerDir, ctx.tsConfigs);
  if (tsConfigDir === undefined) return null;

  const { baseUrl, paths } = ctx.tsConfigs.get(tsConfigDir);
  if (!paths || paths.size === 0) return null;

  // '.', '' and a non-string baseUrl all mean "the tsconfig's own
  // directory". Computed once per import rather than once per alias target.
  const normalizedBase =
    typeof baseUrl !== 'string' || baseUrl === '.' || baseUrl === ''
      ? ''
      : toPosix(baseUrl);

  for (const [alias, targets] of paths) {
    const wildcard = matchTsAlias(alias, src);
    if (wildcard === null) continue;
    for (const target of targets) {
      const mapped = applyTsAlias(target, wildcard);
      const relativeToConfig = normalizedBase
        ? posix.join(normalizedBase, mapped)
        : mapped;
      const candidate = tsConfigDir
        ? posix.join(tsConfigDir, relativeToConfig)
        : relativeToConfig;
      const probed = probeWithExtensions(candidate, ctx.fileSet);
      if (probed) return probed;
    }
  }

  // Bare specifier with no alias match: external package.
  return null;
}

/**
 * Match an import specifier against one tsconfig `paths` alias.
 *
 * An alias may contain a single `*` wildcard (`"@/*"` matches `"@/foo/bar"`
 * with wildcard content `foo/bar`); an alias without `*` must equal the
 * specifier exactly.
 *
 * @param {string} alias - Alias key from tsconfig `paths`.
 * @param {string} src - Import specifier.
 * @returns {string | null} The wildcard content (possibly '') on a match,
 *   null otherwise.
 */
function matchTsAlias(alias, src) {
  const starIdx = alias.indexOf('*');
  if (starIdx === -1) {
    return src === alias ? '' : null;
  }
  const prefix = alias.slice(0, starIdx);
  const suffix = alias.slice(starIdx + 1);
  if (!src.startsWith(prefix) || !src.endsWith(suffix)) return null;
  // Prefix and suffix must not overlap inside `src`.
  if (src.length < prefix.length + suffix.length) return null;
  return src.slice(prefix.length, src.length - suffix.length);
}
/**
 * Substitute wildcard content into a tsconfig `paths` target (the mirror of
 * matchTsAlias).
 *
 * @param {string} target - Target pattern, e.g. `src/*`.
 * @param {string} wildcard - Content captured by the alias wildcard.
 * @returns {string} The target with its first `*` replaced, or the target
 *   unchanged when it has no `*` (rare, but valid).
 */
function applyTsAlias(target, wildcard) {
  const starIdx = target.indexOf('*');
  if (starIdx === -1) return target;
  return target.slice(0, starIdx) + wildcard + target.slice(starIdx + 1);
}

/**
 * Matches CommonJS `require('<literal>')` calls with a single- or
 * double-quoted string argument.
 *
 * Tree-sitter's TS/JS extractor only records ES module `import`
 * declarations; `require('./foo')` is a plain call expression and never
 * reaches `analysis.imports`. This focused text pass recovers those edges.
 *
 * Limitations (intentional):
 *   - Computed requires (`require(name)`) are dynamic and skipped.
 *   - Template-literal requires are unresolved.
 *   - String concatenation in the argument is unresolved.
 */
const REQUIRE_LITERAL_RE = /\brequire\(\s*(['"])([^'"`\n]+?)\1\s*\)/g;

/**
 * Remove JS/TS line and block comments while leaving string and template
 * literals untouched.
 *
 * String awareness is required: a glob such as `'lib/*.js'` or a URL such
 * as `'http://host'` contains a comment opener, and treating it as a
 * comment deletes the code (and the `require()` calls) that follow.
 *
 * Comments are removed outright (not replaced by spaces). An unterminated
 * block comment is left in place. Single- and double-quoted literals end at
 * the closing quote or at the end of the line, whichever comes first;
 * template literals end at the next unescaped backtick. Regex literals are
 * not recognized — this is a heuristic for narrow text matchers, not a
 * tokenizer.
 *
 * @param {string} content - Source text.
 * @returns {string} Source text without comments.
 */
function stripJsLikeComments(content) {
  const n = content.length;
  let out = '';
  let i = 0;
  while (i < n) {
    const ch = content[i];
    const next = content[i + 1];
    if (ch === '/' && next === '/') {
      const eol = content.indexOf('\n', i);
      i = eol === -1 ? n : eol; // keep the newline itself
    } else if (ch === '/' && next === '*' && content.indexOf('*/', i + 2) !== -1) {
      i = content.indexOf('*/', i + 2) + 2;
    } else if (ch === '"' || ch === "'" || ch === '`') {
      let end = i + 1;
      while (end < n) {
        const c = content[end];
        if (c === '\\') {
          end += 2;
          continue;
        }
        if (c === ch) {
          end += 1;
          break;
        }
        // Unterminated quoted string: stop at the line break.
        if (c === '\n' && ch !== '`') break;
        end += 1;
      }
      out += content.slice(i, end);
      i = end;
    } else {
      out += ch;
      i += 1;
    }
  }
  return out;
}

/**
 * Collect the literal arguments of every `require('...')` call in `content`,
 * ignoring calls that appear inside comments.
 *
 * @param {string} content - JS/TS source text.
 * @returns {string[]} Require sources in order of appearance.
 */
function extractRequireSources(content) {
  const stripped = stripJsLikeComments(content);
  // matchAll works on a private copy of the regex, so the shared global
  // regex's lastIndex is never left in a stale state.
  return Array.from(stripped.matchAll(REQUIRE_LITERAL_RE), (m) => m[2]);
}
/**
 * Matches one Kotlin import directive per line.
 *
 * Kotlin has no tree-sitter extractor in this project, so import sources are
 * collected with this focused regex. Accepted shapes: `import x.y.Z`,
 * `import x.y.Z as Alias`, `import x.y.*`, each optionally followed by a
 * `;` and/or a trailing `//` comment.
 *
 * Capture group 1 is a strict qualified name: a leading identifier, zero or
 * more `.identifier` segments, and an optional trailing `.*`. A looser
 * character class would accept malformed input such as `import ...` or
 * `import .foo`.
 */
const KOTLIN_IMPORT_RE =
  /^\s*import\s+(\w+(?:\.\w+)*(?:\.\*)?)(?:\s+as\s+\w+)?\s*;?\s*(?:\/\/[^\n]*)?$/gm;

/**
 * Collect the dotted names of every import directive in a Kotlin file.
 * Star imports keep their trailing `.*`; the dotted resolver decides what to
 * do with them.
 *
 * @param {string} content - Kotlin source text.
 * @returns {string[]} Imported qualified names in order of appearance.
 */
function extractKotlinSources(content) {
  // matchAll works on a private copy of the regex, so the shared global
  // regex's lastIndex never carries over between calls.
  return Array.from(content.matchAll(KOTLIN_IMPORT_RE), (m) => m[1]);
}

// ---------------------------------------------------------------------------
// Python resolver
//
// Tree-sitter's Python extractor emits one entry per import statement:
//   - `import a.b.c`           -> { source: 'a.b.c', specifiers: ['a.b.c'] }
//   - `from a.b.c import x,y`  -> { source: 'a.b.c', specifiers: ['x','y'] }
//   - `from . import x`        -> { source: '', specifiers: ['x'] }
//   - `from .x import y`       -> { source: '.x', specifiers: ['y'] }
//   - `from ..pkg import y`    -> { source: '..pkg', specifiers: ['y'] }
//
// We can't tell relative from absolute by the source string alone — the dots
// could be a leading-dot relative source OR a literal `.` package separator.
// Python's lexical convention disambiguates: leading dots ALWAYS mean
// relative. Tree-sitter preserves leading dots verbatim in the source field,
// so we can dispatch on the prefix.
//
// Resolution rules:
//   1. Relative (starts with `.`): walk up parent dirs by leading-dot count,
//      then descend by the remaining dotted segments.
//   2. Absolute (no leading dot): walk up from the importer's directory,
//      trying EACH ancestor as a candidate Python root.
//      The first ancestor under which probing succeeds wins. This matches
//      how multi-service Python repos work in practice — each service
//      directory acts as its own root for unqualified `import sibling`
//      style imports (e.g. microservices-demo's per-service grpc stubs).
//
//      We don't gate this on setup.py / pyproject.toml detection. The
//      probe itself IS the test of whether the ancestor is a candidate
//      root: an absent module just continues the walk. The closest
//      ancestor where the import resolves wins, which gives importer
//      scope precedence (sibling files override remote candidates).
// ---------------------------------------------------------------------------

/**
 * Resolve one Python import statement to project files.
 *
 * Unlike most resolvers this returns an array: a package import can yield
 * the package's `__init__.py` plus one file per submodule specifier.
 *
 * Relative sources (leading dots) are anchored at the importer's directory:
 * one dot means that directory, each further dot one level up. With no
 * dotted tail (`from . import x, y`) every specifier is probed directly as
 * a sibling module or package, without requiring an `__init__.py` at the
 * anchor, so implicit namespace packages (PEP 420) still resolve.
 *
 * Absolute sources try every ancestor of the importer's directory as a
 * candidate root, deepest first; the first ancestor that resolves anything
 * wins, so a service's own module shadows same-named modules elsewhere.
 *
 * @param {string} rawImport - Import source, leading dots preserved.
 * @param {string[]} specifiers - Imported names.
 * @param {{path: string}} file - The importing file.
 * @param {{fileSet: Set<string>}} ctx - Resolution context.
 * @returns {string[]} Project-relative paths; empty for external or
 *   unresolved imports.
 */
export function resolvePythonImport(rawImport, specifiers, file, ctx) {
  if (typeof rawImport !== 'string') return [];

  const importerDir = dirOf(toPosix(file.path));
  const importerSegments = importerDir
    ? importerDir.split('/').filter((segment) => segment !== '')
    : [];

  // Split the source into its run of leading dots and the dotted module path.
  const leadingDots = /^\.*/.exec(rawImport)[0].length;
  const modulePath = rawImport
    .slice(leadingDots)
    .split('.')
    .filter((segment) => segment !== '');

  if (leadingDots > 0) {
    const levelsUp = leadingDots - 1;
    // More dots than directories: the import escapes the project root.
    if (levelsUp > importerSegments.length) return [];
    const anchor = importerSegments.slice(0, importerSegments.length - levelsUp);

    if (modulePath.length > 0) {
      return resolvePythonProbe([...anchor, ...modulePath], specifiers, ctx);
    }

    // `from .[..] import x, y`: the specifiers themselves are the modules.
    if (!Array.isArray(specifiers) || specifiers.length === 0) return [];
    const prefix = anchor.length > 0 ? `${anchor.join('/')}/` : '';
    const found = [];
    for (const name of specifiers) {
      // Skip wildcards and qualified names; only plain surface names map
      // to a file.
      if (!name || name === '*' || name.includes('.')) continue;
      const hit = [`${prefix}${name}.py`, `${prefix}${name}/__init__.py`]
        .find((candidate) => ctx.fileSet.has(candidate));
      if (hit !== undefined) found.push(hit);
    }
    return found;
  }

  // An empty source with no dots gives nothing to probe.
  if (modulePath.length === 0) return [];

  for (let depth = importerSegments.length; depth >= 0; depth -= 1) {
    const candidateModule = [...importerSegments.slice(0, depth), ...modulePath];
    const hits = resolvePythonProbe(candidateModule, specifiers, ctx);
    if (hits.length > 0) return hits;
  }
  return [];
}

/**
 * Probe the file set for a module given as path segments (e.g.
 * `['src', 'utils']`).
 *
 * `<base>.py` is tried first and, when present, is the only result. Failing
 * that, `<base>/__init__.py` is tried; on a package match each plain
 * specifier is also probed as `<base>/<spec>.py`, then
 * `<base>/<spec>/__init__.py`.
 *
 * @param {string[]} moduleParts - Module path segments from the project root.
 * @param {string[]} specifiers - Imported names.
 * @param {{fileSet: Set<string>}} ctx - Resolution context.
 * @returns {string[]} Resolved project-relative paths; empty when nothing
 *   matches or `moduleParts` is empty.
 */
function resolvePythonProbe(moduleParts, specifiers, ctx) {
  if (moduleParts.length === 0) return [];

  const base = moduleParts.join('/');
  const { fileSet } = ctx;

  const moduleFile = `${base}.py`;
  if (fileSet.has(moduleFile)) return [moduleFile];

  const packageInit = `${base}/__init__.py`;
  if (!fileSet.has(packageInit)) return []; // external package

  const found = [packageInit];
  for (const name of Array.isArray(specifiers) ? specifiers : []) {
    if (!name || name === '*' || name.includes('.')) continue;
    const hit = [`${base}/${name}.py`, `${base}/${name}/__init__.py`]
      .find((candidate) => fileSet.has(candidate));
    if (hit !== undefined) found.push(hit);
  }
  return found;
}

// ---------------------------------------------------------------------------
// Go resolver
//
// Tree-sitter's Go extractor emits the literal import path (without quotes).
// Resolution: walk up from the importer's directory to find the nearest
// enclosing `go.mod` (multi-module monorepos are the norm). Strip that
// module's prefix; the remainder maps to a directory RELATIVE TO THAT
// MODULE'S DIRECTORY in the project. Go imports are package-level (not
// file-level), so a single `import "github.com/foo/bar/util"` produces edges
// to every .go file inside that module's `util/`.
//
// Cross-module imports (`github.com/foo/bar/X` from a file under a module
// that declares `github.com/foo/baz`) are correctly classified as external —
// they refer to a different Go module, which from this module's perspective
// is a third-party dependency.
//
// Inputs:
//   - rawImport: 'github.com/foo/bar/util' (no quotes)
//   - file.path: importer's project-relative path
//   - ctx.goModules: Map (go.mod directory -> module name) of every go.mod
//     discovered.
//
// Result: array of every `<moduleDir>/util/*.go` path in the project
// (deduped by caller).
// ---------------------------------------------------------------------------

/**
 * Resolve a Go import path to the .go files of the imported package.
 *
 * The importer's nearest enclosing go.mod decides which module prefix
 * applies. Imports equal to that module path, or under it at a `/` boundary,
 * map to a directory below the module's own directory; everything else
 * (stdlib, third-party, or a sibling in-tree module) is external.
 *
 * When the importer has no ancestor go.mod, one warning per importer file is
 * written to stderr and the import is unresolved.
 *
 * @param {string} rawImport - Import path without quotes.
 * @param {{path: string}} file - The importing file.
 * @param {{goModules: Map<string, string>, goFilesByDir: Map<string, string[]>, _warnedNoGoModule: Set<string>}} ctx
 *   Resolution context.
 * @returns {string[]} A copy of the package directory's .go file list, or [].
 */
export function resolveGoImport(rawImport, file, ctx) {
  if (!rawImport || typeof rawImport !== 'string') return [];
  const src = rawImport.trim();
  if (!src) return [];

  const importerPath = toPosix(file.path);
  const moduleDir = findNearestConfigDir(dirOf(importerPath), ctx.goModules);

  if (moduleDir === undefined) {
    // A file can import many module-prefixed paths; warn about it only once.
    if (!ctx._warnedNoGoModule.has(importerPath)) {
      ctx._warnedNoGoModule.add(importerPath);
      process.stderr.write(
        `Warning: extract-import-map: Go file ${importerPath} has no ` +
          `ancestor go.mod — import ${src} unresolvable — module-prefix ` +
          `imports skipped\n`,
      );
    }
    return [];
  }

  const moduleName = ctx.goModules.get(moduleDir);
  const importsModuleRoot = src === moduleName;
  // The `/` boundary keeps a module such as `example.com/foo` from claiming
  // `example.com/foobar`.
  if (!importsModuleRoot && !src.startsWith(`${moduleName}/`)) return [];

  const subDir = toPosix(importsModuleRoot ? '' : src.slice(moduleName.length + 1));
  // Anchor at the module's directory so sub-module packages resolve under
  // that module's tree rather than under the project root.
  const targetDir = [moduleDir, subDir].filter((part) => part !== '').join('/');
  return [...(ctx.goFilesByDir.get(targetDir) ?? [])];
}

// ---------------------------------------------------------------------------
// Dotted-package resolver (Java / Kotlin / C#)
//
// Shared logic: an import like `com.example.foo.Bar` maps to a file
// `**/com/example/foo/Bar.<ext>` in the project. Many JVM/CLR projects nest
// sources under `src/main/java/`, `src/main/kotlin/`, etc., so the resolver
// must search for any file whose suffix matches the dotted-path-as-file form.
//
// We pre-build an index: directory-bounded suffix -> matching project paths.
// Indexing once costs one entry per path segment per file; each import is
// then a single Map lookup plus a copy of the bucket.
// ---------------------------------------------------------------------------

/**
 * Index the files accepted by `extPredicate` under every directory-bounded
 * suffix of their path. `src/main/java/com/x/Y.java` is stored under
 * `src/main/java/com/x/Y.java`, `main/java/com/x/Y.java`, …, `com/x/Y.java`,
 * `x/Y.java` and `Y.java`.
 *
 * @param {Array<{path: string}>} files - Scanner file entries.
 * @param {(posixPath: string) => boolean} extPredicate - Selects the files
 *   to index.
 * @returns {Map<string, string[]>} Suffix -> full project paths, each bucket
 *   sorted with `localeCompare`.
 */
function buildSuffixIndex(files, extPredicate) {
  const index = new Map();
  for (const entry of files) {
    const fullPath = toPosix(entry.path);
    if (!extPredicate(fullPath)) continue;
    const segments = fullPath.split('/');
    segments.forEach((_, start) => {
      const suffix = segments.slice(start).join('/');
      const bucket = index.get(suffix);
      if (bucket) {
        bucket.push(fullPath);
      } else {
        index.set(suffix, [fullPath]);
      }
    });
  }
  // Deterministic order within each bucket.
  for (const bucket of index.values()) {
    bucket.sort((left, right) => left.localeCompare(right));
  }
  return index;
}

/**
 * Resolve a dotted qualified name (`com.example.Foo`) to project files via a
 * suffix index.
 *
 * A trailing `.*` is removed first, so a wildcard import is looked up as if
 * its package name were a file; there is no meaningful single-file target
 * for a wildcard.
 *
 * @param {string} fqn - Qualified name from the import.
 * @param {string} ext - File extension to append (`.java`, `.kt`, `.cs`).
 * @param {Map<string, string[]>} suffixIndex - Index from buildSuffixIndex.
 * @returns {string[]} A copy of the matching bucket (several entries when
 *   the same suffix exists under more than one source root), or [].
 */
function resolveDottedFqn(fqn, ext, suffixIndex) {
  if (typeof fqn !== 'string' || fqn === '') return [];
  const packagePath = fqn.endsWith('.*') ? fqn.slice(0, -2) : fqn;
  if (packagePath === '') return [];
  const wanted = `${packagePath.split('.').join('/')}${ext}`;
  const bucket = suffixIndex.get(wanted);
  return bucket ? [...bucket] : [];
}

// ---------------------------------------------------------------------------
// Java resolver
// ---------------------------------------------------------------------------

/**
 * Resolve a Java import through the `.java` suffix index.
 *
 * @param {string} rawImport - Dotted import source.
 * @param {unknown} _file - Unused; kept for a uniform resolver signature.
 * @param {{javaIndex: Map<string, string[]>}} ctx - Resolution context.
 * @returns {string[]} Matching project paths.
 */
export function resolveJavaImport(rawImport, _file, ctx) {
  return resolveDottedFqn(rawImport, '.java', ctx.javaIndex);
}

// ---------------------------------------------------------------------------
// Kotlin resolver
//
// Kotlin has no tree-sitter extractor in this project, so its import sources
// are collected via a focused regex pass in extractExtraImportSources(); the
// resolver itself is identical-shape to Java.
// ---------------------------------------------------------------------------

/**
 * Resolve a Kotlin import through the `.kt` suffix index.
 *
 * @param {string} rawImport - Dotted import source.
 * @param {unknown} _file - Unused; kept for a uniform resolver signature.
 * @param {{kotlinIndex: Map<string, string[]>}} ctx - Resolution context.
 * @returns {string[]} Matching project paths.
 */
export function resolveKotlinImport(rawImport, _file, ctx) {
  return resolveDottedFqn(rawImport, '.kt', ctx.kotlinIndex);
}

// ---------------------------------------------------------------------------
// C# resolver
//
// C# `using Foo.Bar;` declarations are typically NAMESPACES, not files, and
// the C# convention is namespace = directory (loose). Tree-sitter's C#
// extractor captures these as imports with the dotted source. We probe the
// dotted path against the .cs index the same way Java/Kotlin do.
// ---------------------------------------------------------------------------

/** Resolve a C# `using` source against `ctx.csIndex` (`.cs` suffix index). */
export function resolveCSharpImport(rawImport, _file, ctx) {
  return resolveDottedFqn(rawImport, '.cs', ctx.csIndex);
}

// ---------------------------------------------------------------------------
// Ruby resolver
//
// Two distinct Ruby import forms, with different resolution semantics:
//   - `require_relative 'foo'`  -> resolve against the importer's directory,
//                                  append .rb
//   - `require 'foo/bar'`       -> load-path probe: lib/foo/bar.rb,
//                                  app/foo/bar.rb, or foo/bar.rb (whichever
//                                  exists)
//
// Tree-sitter's Ruby extractor uses a single `imports` field for both forms
// and drops the method name, so we cannot tell them apart from the
// extractor output alone. Instead we use a regex pass on the file content,
// which preserves the method name as the discriminator.
//
// The two forms are unambiguous in source — both start with the method name
// followed by a quoted argument — so a focused regex is reliable.
// ---------------------------------------------------------------------------

const RUBY_REQUIRE_RE =
  /\b(require_relative|require)\s*\(?\s*(['"])([^'"`\n]+?)\2/g;

/**
 * Strip Ruby line comments (`# ...` to end of line) before running the
 * require regex. `=begin/=end` blocks are not handled. This is a heuristic:
 * it does not honor string contents, so a `#` inside a string literal also
 * truncates that line.
 */
function stripRubyComments(content) {
  return content.replace(/#[^\n]*/g, '');
}

/**
 * Return [{ kind: 'relative'|'absolute', source }] for every require /
 * require_relative call in a Ruby file, in source order.
 *
 * @param {string} content Raw Ruby source text.
 * @returns {Array<{kind: 'relative'|'absolute', source: string}>}
 */
function parseRubyImports(content) {
  const stripped = stripRubyComments(content);
  // matchAll() iterates over a private copy of the /g regex, so the shared
  // RUBY_REQUIRE_RE never carries `lastIndex` state between calls.
  return Array.from(stripped.matchAll(RUBY_REQUIRE_RE), (m) => ({
    kind: m[1] === 'require_relative' ? 'relative' : 'absolute',
    source: m[3],
  }));
}

/**
 * Resolve a single Ruby require. Returns array (0 or 1 match).
 *
 * For require_relative: append `.rb` if missing, resolve against importer dir.
 * For require: probe lib/<source>.rb, app/<source>.rb, <source>.rb in that
 * order and return the first one present in `ctx.fileSet`.
 */
export function resolveRubyImport({ kind, source }, file, ctx) {
  if (!source) return [];
  const importerDir = dirOf(toPosix(file.path));
  const withExt = source.endsWith('.rb') ? source : source + '.rb';

  if (kind === 'relative') {
    const base = resolveRelative(importerDir, withExt);
    return ctx.fileSet.has(base) ? [base] : [];
  }

  // Load-path probe order
  const probes = [`lib/${withExt}`, `app/${withExt}`, withExt];
  for (const p of probes) {
    if (ctx.fileSet.has(p)) return [p];
  }
  return [];
}

// ---------------------------------------------------------------------------
// PHP resolver
//
// PHP's `use Vendor\Pkg\Class;` is namespace-based. Composer's PSR-4
// autoload map (`composer.json` -> autoload.psr-4) declares which directory
// holds the files for each namespace prefix, e.g.:
//   { "App\\": "src/" }  means  App\Foo\Bar  lives at  src/Foo/Bar.php
//
// Resolution:
//   1. Find the longest matching autoload prefix.
//   2. Strip that prefix from the FQN.
//   3. Translate backslashes to forward slashes.
//   4. Append `.php` and probe the file set.
//
// Imports whose namespace is not declared in any autoload entry are
// external — dropped.
// ---------------------------------------------------------------------------

/**
 * Parse a single composer.json content and return Map<prefix, dirs[]>, or
 * null if the JSON failed to parse. The returned dirs are relative to the
 * composer.json's own directory — NOT projectRoot — matching how PSR-4
 * itself is specified. A dir of `''` means the composer.json directory
 * itself.
 *
 * Returning `null` (rather than throwing) lets the caller emit a Warning:
 * with the exact composer.json path that failed; bubbling the error would
 * conceal which file was at fault when many composer.json files are loaded.
 *
 * @param {string} raw composer.json file content.
 * @returns {Map<string, string[]> | null}
 */
function parseComposerAutoloadText(raw) {
  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch {
    return null;
  }
  const out = new Map();
  const psr4 = parsed?.autoload?.['psr-4'];
  if (!psr4 || typeof psr4 !== 'object' || Array.isArray(psr4)) return out;

  // Normalize a target dir to the form used by the project file set:
  // POSIX separators, no trailing `/`, no leading `./`, and `.` collapsed to
  // '' (the composer.json directory). Without the `./` strip, a mapping such
  // as `"App\\": "./src/"` would produce `./src/...` candidates that never
  // match a project-relative path.
  const normalizeDir = (t) => {
    const dir = toPosix(t).replace(/\/+$/, '').replace(/^(?:\.\/+)+/, '');
    return dir === '.' ? '' : dir;
  };

  for (const [prefix, target] of Object.entries(psr4)) {
    const targets = Array.isArray(target) ? target : [target];
    const normalized = targets
      .filter((t) => typeof t === 'string')
      .map(normalizeDir);
    // Ensure non-empty prefixes end with a backslash so the
    // longest-prefix-match does not accidentally split mid-segment
    // ("App" vs "Application"). Preserve the empty prefix as-is — it's
    // Composer's fallback mapping (`"psr-4": {"": "src/"}`) and means
    // "any namespace resolves under this dir". Appending `\` would
    // convert it into a prefix that matches nothing.
    const normalizedPrefix = prefix === '' || prefix.endsWith('\\') ? prefix : prefix + '\\';
    out.set(normalizedPrefix, normalized);
  }
  return out;
}

/**
 * Load every `composer.json` discovered in the input file list and parse
 * each's `autoload.psr-4` section. Returns
 * Map<composerDir, Map<prefix, dirs[]>> keyed by the project-relative POSIX
 * directory containing the composer.json (empty string for a root-level
 * composer.json).
 *
 * WHY plural: Composer monorepos commonly stack a root composer.json over
 * per-package composer.json files (one of the two formal "monorepo"
 * patterns Composer documents — `wikimedia/composer-merge-plugin` and
 * `symplify/monorepo-builder` both ship this layout). Loading only the
 * root would miss package-scoped PSR-4 entries entirely.
 *
 * On read or parse failure for a specific composer.json, emits a Warning:
 * pointing at the bad file and skips it. The rest of the project's PHP
 * imports keep resolving via whichever composer.json files parsed cleanly.
 */
function loadPhpAutoloads(projectRoot, files) {
  const out = new Map();
  for (const f of files) {
    const p = toPosix(f.path);
    const base = p.includes('/') ? p.slice(p.lastIndexOf('/') + 1) : p;
    if (base !== 'composer.json') continue;
    const absPath = join(projectRoot, p);
    if (!existsSync(absPath)) continue;
    let raw;
    try {
      raw = readFileSync(absPath, 'utf-8');
    } catch (err) {
      process.stderr.write(
        `Warning: extract-import-map: composer.json at ${absPath} failed ` +
        `to read (${err.message}) — PSR-4 namespace mapping from this ` +
        `composer.json unavailable — PHP imports under this package ` +
        `will not resolve\n`,
      );
      continue;
    }
    const parsed = parseComposerAutoloadText(raw);
    if (parsed === null) {
      process.stderr.write(
        `Warning: extract-import-map: composer.json at ${absPath} failed ` +
        `to parse — PSR-4 namespace mapping unavailable — PHP imports ` +
        `under this package will not resolve\n`,
      );
      continue;
    }
    out.set(dirOf(p), parsed);
  }
  return out;
}

/**
 * Resolve a PHP `use` FQN against the autoload map of the importer's
 * nearest enclosing composer.json. Returns array (0 or 1 match — the first
 * dir in the PSR-4 target list that contains the file).
 *
 * Resolved paths are anchored at the composer.json's directory, NOT at
 * projectRoot, so a sub-package's `App\Foo\Bar` resolves to
 * `<package>/src/Foo/Bar.php` rather than `<projectRoot>/src/...`.
 */
export function resolvePhpImport(rawImport, file, ctx) {
  if (!rawImport || typeof rawImport !== 'string') return [];
  // Strip leading backslash if present (PHP allows `use \Foo\Bar;`)
  const fqn = rawImport.startsWith('\\') ? rawImport.slice(1) : rawImport;
  if (!fqn) return [];

  const importerDir = dirOf(toPosix(file.path));
  const composerDir = findNearestConfigDir(importerDir, ctx.phpAutoloads);
  if (composerDir === undefined) return [];
  const autoload = ctx.phpAutoloads.get(composerDir);
  if (!autoload || autoload.size === 0) return [];

  // Longest-prefix match across this composer.json's autoload entries.
  // Walk the map and pick the entry with the longest matching prefix, so
  // `Foo\Bar` does not match a prefix `F\` if `Foo\` is also present.
  // Use `null` as the sentinel rather than 0-length so the empty PSR-4
  // fallback prefix (`""` → `src/`) can win when nothing more specific
  // matches; otherwise `prefix.length > bestPrefix.length` would always
  // be `0 > 0 = false` for the empty prefix.
  let bestPrefix = null;
  let bestDirs = null;
  for (const [prefix, dirs] of autoload) {
    if (fqn.startsWith(prefix) && (bestPrefix === null || prefix.length > bestPrefix.length)) {
      bestPrefix = prefix;
      bestDirs = dirs;
    }
  }
  if (bestDirs === null) return [];

  // Drop the prefix (it covers the directory), translate `\` to `/`.
  const relative = fqn.slice(bestPrefix.length).replace(/\\/g, '/');
  if (!relative) return [];
  for (const dir of bestDirs) {
    // Anchor at the composer.json's own directory — PSR-4 paths are
    // composer-relative, not project-relative.
    const dirUnderComposer = dir
      ? (composerDir ? `${composerDir}/${dir}` : dir)
      : composerDir;
    const candidate = dirUnderComposer
      ? `${dirUnderComposer}/${relative}.php`
      : `${relative}.php`;
    if (ctx.fileSet.has(candidate)) return [candidate];
  }
  return [];
}

// ---------------------------------------------------------------------------
// Rust resolver
//
// Rust's module system is path-based but the import syntax is `use` rather
// than path strings. Tree-sitter emits sources like `crate::a::b::Item`,
// `super::a::Item`, `self::a`, or bare `std::collections::HashMap`.
// We map only paths rooted at `crate::`, `super::` or `self::` — bare paths
// are external crates.
//
// Resolution heuristics:
//   - `crate::a::b::*`  -> probe `<crate-src>/a/b.rs`, then
//                          `<crate-src>/a/b/mod.rs`. The crate root is
//                          `<package>/src/` (Cargo convention).
//   - `self::a::*`      -> children of the importing module. For `mod.rs`,
//                          `lib.rs` and `main.rs` they live next to the
//                          file; for any other `foo.rs` they live in the
//                          sibling `foo/` directory.
//   - `super::a::b::*`  -> one module level above `self`; each additional
//                          leading `super` climbs one more level.
//
// Rust uses won't always land on a file (an import like `crate::Foo` could
// refer to a struct re-exported through `mod.rs`); we accept that limitation.
//
// We also extract `mod x;` declarations via regex — these declare submodules
// to load and are resolved exactly like `self::x`.
// ---------------------------------------------------------------------------

/**
 * Try `<base>.rs` then `<base>/mod.rs` against the file set. Returns the
 * first match or null.
 */
function probeRustModule(base, fileSet) {
  if (!base) return null;
  if (fileSet.has(`${base}.rs`)) return `${base}.rs`;
  if (fileSet.has(`${base}/mod.rs`)) return `${base}/mod.rs`;
  return null;
}

/**
 * Find the "crate root" source directory for a Rust importer: the nearest
 * `<ancestor>/src` directory that contains `lib.rs` or `main.rs`.
 *
 * The loop walks every ancestor directory of the importer (including the
 * project root, represented by the empty string) from deepest to shallowest
 * and probes `<ancestor>/src/lib.rs` and `<ancestor>/src/main.rs`. When the
 * importer is itself inside `src/`, a later iteration reaches the package
 * directory and the same probe catches it.
 *
 * @returns {string|null} Project-relative `src` directory, or null.
 */
function findRustCrateSrc(importerDir, fileSet) {
  const parts = importerDir.split('/').filter(Boolean);
  for (let i = parts.length; i >= 0; i--) {
    const ancestor = parts.slice(0, i).join('/');
    const childSrc = ancestor ? `${ancestor}/src` : 'src';
    if (fileSet.has(`${childSrc}/lib.rs`) || fileSet.has(`${childSrc}/main.rs`)) {
      return childSrc;
    }
  }
  return null;
}

/**
 * Directory that holds the child modules of the module defined by
 * `importerPath`: the file's own directory for `mod.rs` / `lib.rs` /
 * `main.rs`, otherwise the sibling directory named after the file stem
 * (`src/a/b.rs` -> `src/a/b`).
 */
function rustChildModuleDir(importerPath, importerDir) {
  const fileName = importerPath.slice(importerPath.lastIndexOf('/') + 1);
  if (fileName === 'mod.rs' || fileName === 'lib.rs' || fileName === 'main.rs') {
    return importerDir;
  }
  const stem = fileName.endsWith('.rs') ? fileName.slice(0, -3) : fileName;
  return importerDir ? `${importerDir}/${stem}` : stem;
}

/**
 * Drop the last `levels` segments of a POSIX directory. Returns null when
 * that would climb above the project root.
 */
function ascendRustDir(dir, levels) {
  const parts = dir.split('/').filter(Boolean);
  if (levels > parts.length) return null;
  return parts.slice(0, parts.length - levels).join('/');
}

/**
 * Resolve a Rust `use` source (or a synthesized `self::x` from `mod x;`) to
 * a project file.
 *
 * @param {string} rawImport Path such as `crate::a::b::Item`.
 * @param {{path: string}} file Importing file (project-relative path).
 * @param {object} ctx Resolution context; reads `fileSet` and keeps the
 *   warn-once set in `_warnedNoRustCrateRoot`.
 * @returns {string[]} Zero or one resolved path.
 */
export function resolveRustImport(rawImport, file, ctx) {
  if (!rawImport || typeof rawImport !== 'string') return [];
  const src = rawImport.trim();
  if (!src) return [];

  const importerPath = toPosix(file.path);
  const importerDir = dirOf(importerPath);
  const segments = src.split('::').filter(Boolean);
  if (segments.length === 0) return [];
  const head = segments[0];

  // External crates: anything not rooted at crate/super/self.
  if (head !== 'crate' && head !== 'super' && head !== 'self') return [];

  let rest = segments.slice(1);
  let baseDirs;
  if (head === 'crate') {
    const crateSrc = findRustCrateSrc(importerDir, ctx.fileSet);
    if (!crateSrc) {
      // Warn once per importer file (a single .rs file can have many
      // `use crate::...` statements; suppress duplicate warnings). The set
      // is created on demand so a context built without it cannot make the
      // resolver throw.
      ctx._warnedNoRustCrateRoot ??= new Set();
      if (!ctx._warnedNoRustCrateRoot.has(importerPath)) {
        ctx._warnedNoRustCrateRoot.add(importerPath);
        process.stderr.write(
          `Warning: extract-import-map: Rust file ${importerPath} has ` +
          `'use crate::' but no crate root (src/lib.rs or src/main.rs) ` +
          `found — crate-relative imports unresolved\n`,
        );
      }
      return [];
    }
    baseDirs = [crateSrc];
  } else {
    // Count every leading `super` (`super::super::x`, `self::super::x`).
    let chained = 0;
    while (rest[chained] === 'super') chained += 1;
    const levels = (head === 'super' ? 1 : 0) + chained;
    rest = rest.slice(chained);

    // Probe the module-accurate directory first. Then fall back to the
    // importer-directory anchor, which is the right answer for crate-root
    // files that are not named lib.rs/main.rs (e.g. `tests/it.rs` declaring
    // `mod common;` -> `tests/common/mod.rs`).
    const candidates = [
      ascendRustDir(rustChildModuleDir(importerPath, importerDir), levels),
      ascendRustDir(importerDir, levels),
    ];
    baseDirs = [...new Set(candidates.filter((d) => d !== null))];
  }

  // Try each prefix length from longest -> shortest so that
  // `crate::a::b::Item` matches `a/b.rs` (with `Item` being an item inside)
  // rather than failing because `a/b/Item.rs` doesn't exist. The empty rest
  // case (e.g. bare `use crate;`) is unresolvable.
  for (const baseDir of baseDirs) {
    for (let i = rest.length; i > 0; i--) {
      const prefix = rest.slice(0, i).join('/');
      const base = baseDir ? `${baseDir}/${prefix}` : prefix;
      const match = probeRustModule(base, ctx.fileSet);
      if (match) return [match];
    }
  }
  return [];
}

/**
 * Regex for Rust `mod x;` declarations (optionally `pub` / `pub(...)`).
 * These are NOT captured by tree-sitter's import field, but they declare a
 * child module that lives on disk as `x.rs` or `x/mod.rs`.
 */
const RUST_MOD_RE = /^\s*(?:pub(?:\s*\([^)]*\))?\s+)?mod\s+(\w+)\s*;\s*$/gm;

/**
 * Collect `mod x;` declarations as `self::x` sources so the regular Rust
 * resolver handles them.
 *
 * @param {string} content Raw Rust source text.
 * @returns {string[]} One `self::<name>` entry per declaration.
 */
function extractRustModSources(content) {
  // Rust uses the same line + block comment syntax as JS/TS, so we can reuse
  // the same stripper. Without this, `// mod fake;` would phantom-register
  // a submodule that doesn't exist on disk.
  const stripped = stripJsLikeComments(content);
  // matchAll() works on a private copy of the /g regex, so no `lastIndex`
  // state leaks between calls.
  return Array.from(stripped.matchAll(RUST_MOD_RE), (m) => `self::${m[1]}`);
}

// ---------------------------------------------------------------------------
// C / C++ resolver
//
// Tree-sitter's cpp extractor exposes both quoted and angle-bracket includes
// as imports with `source` set to the bare filename (e.g. `foo.h`).
// Quoted includes resolve relative to the importer's directory; angle
// includes look in a system path. We can't tell quoted from angle from
// tree-sitter alone, but the resolution rules overlap enough that probing
// both yields the right answer most of the time:
//   1. <importer-dir>/<src>
//   2. include/<src>
//   3. src/<src>
//   4. <src> (project-root-relative)
//
// We probe in that order and take the first match. Multiple file extensions
// (.h, .hpp, .hxx, .cuh) are NOT auto-appended — #include carries the
// extension explicitly.
// ---------------------------------------------------------------------------

/**
 * Resolve a C/C++ `#include` source to a project file.
 *
 * Probes, in order: the path relative to the importer's directory,
 * `include/<src>`, `src/<src>`, and `<src>` itself, returning the first one
 * present in `ctx.fileSet`.
 *
 * @returns {string[]} Zero or one resolved path.
 */
export function resolveCppImport(rawImport, file, ctx) {
  if (!rawImport || typeof rawImport !== 'string') return [];
  const src = toPosix(rawImport.trim());
  if (!src) return [];
  const importerDir = dirOf(toPosix(file.path));

  const candidates = [
    resolveRelative(importerDir, src),
    `include/${src}`,
    `src/${src}`,
    src,
  ];
  for (const c of candidates) {
    if (c && ctx.fileSet.has(c)) return [c];
  }
  return [];
}

// ---------------------------------------------------------------------------
// Dispatcher
// ---------------------------------------------------------------------------

/**
 * Languages routed to the TS/JS resolver by the dispatcher.
 */
const TS_JS_LANGS = new Set([
  'typescript', 'javascript', 'tsx', 'jsx', 'vue',
]);

/**
 * Dispatch a raw import to the language-specific resolver. Returns an array
 * of resolved project-relative paths (most resolvers produce 0 or 1; Python
 * can produce multiple when a `from pkg import a, b, c` resolves both the
 * package's `__init__.py` and each submodule).
 *
 * Per-resolver contract: never throw, never read disk (read once in main()).
 * Empty array means external/unresolved.
 *
 * @param {{source: string, specifiers?: unknown[]}} imp Import record.
 * @param {{language: string, path: string}} file Importing file.
 * @param {object} ctx Resolution context.
 * @returns {string[]}
 */
function resolveImport(imp, file, ctx) {
  const lang = file.language;
  const src = imp.source;
  if (TS_JS_LANGS.has(lang)) {
    // The TS/JS resolver returns a single path (or a falsy value).
    const out = resolveTsJsImport(src, file, ctx);
    return out ? [out] : [];
  }
  switch (lang) {
    case 'python':
      return resolvePythonImport(src, imp.specifiers, file, ctx);
    case 'go':
      return resolveGoImport(src, file, ctx);
    case 'java':
      return resolveJavaImport(src, file, ctx);
    case 'kotlin':
      return resolveKotlinImport(src, file, ctx);
    case 'csharp':
      return resolveCSharpImport(src, file, ctx);
    case 'php':
      return resolvePhpImport(src, file, ctx);
    case 'rust':
      return resolveRustImport(src, file, ctx);
    case 'c':
    case 'cpp':
      return resolveCppImport(src, file, ctx);
    default:
      // Ruby is handled via a dedicated pathway because its tree-sitter
      // extractor flattens require vs require_relative into a single field,
      // losing the discriminator the resolver needs.
      return [];
  }
}

/**
 * Collect extra raw import sources that tree-sitter doesn't capture:
 * CommonJS require() literals for JS/TS files, Kotlin imports, and Rust
 * `mod x;` declarations. Returns an array of import-source strings to be
 * passed through resolveImport(); `[]` for every other language.
 */
function extractExtraImportSources(file, content) {
  if (TS_JS_LANGS.has(file.language)) {
    return extractRequireSources(content);
  }
  if (file.language === 'kotlin') {
    return extractKotlinSources(content);
  }
  if (file.language === 'rust') {
    // `mod x;` declarations aren't in tree-sitter's `imports` field, but they
    // declare submodules on disk that the rust resolver knows how to find.
    return extractRustModSources(content);
  }
  return [];
}

/**
 * Add every candidate that is a known project file to `target`. Shared by
 * all resolution passes in main() so the "must exist in the file set" rule
 * is applied in exactly one place.
 */
function collectResolved(target, candidates, fileSet) {
  for (const candidate of candidates) {
    if (candidate && fileSet.has(candidate)) target.add(candidate);
  }
}

// ---------------------------------------------------------------------------
// Main
// ---------------------------------------------------------------------------

/**
 * CLI entry point: read the scan input JSON (argv[2]), resolve each code
 * file's imports to project-relative paths, and write the import map plus
 * summary stats as JSON to argv[3].
 *
 * Per-file read/analyze failures are reported on stderr and yield an empty
 * entry for that file; invalid input or a missing output file throws.
 */
async function main() {
  const [,, inputPath, outputPath] = process.argv;
  if (!inputPath || !outputPath) {
    process.stderr.write('Usage: node extract-import-map.mjs <input.json> <output.json>\n');
    process.exit(1);
  }

  const inputRaw = readFileSync(inputPath, 'utf-8');
  const input = JSON.parse(inputRaw);
  const { projectRoot, files } = input;

  if (!projectRoot || !Array.isArray(files)) {
    throw new Error('Invalid input: must contain projectRoot and files array');
  }

  // Create tree-sitter plugin with all configs that have WASM grammars.
  //
  // WHY graceful init: the most likely real-world failure mode is the WASM
  // loader failing to locate or fetch the grammar binaries (cache eviction,
  // restricted sandboxes, transient FS issues). When that happens, we still
  // want the script to complete — producing an empty importMap for every
  // code file — rather than crashing the whole project-scanner pipeline.
  // The structural graph will lose import edges, but all OTHER analysis
  // (file inventory, exports inferred from filenames, etc.) keeps working.
  let registry = null;
  let treeSitterReady = false;
  try {
    const tsConfigs = builtinLanguageConfigs.filter(c => c.treeSitter);
    const tsPlugin = new TreeSitterPlugin(tsConfigs);
    await tsPlugin.init();
    registry = new PluginRegistry();
    registry.register(tsPlugin);
    registerAllParsers(registry);
    treeSitterReady = true;
  } catch (err) {
    process.stderr.write(
      `Warning: extract-import-map: tree-sitter init failed ` +
      `(${err.message}) — all importMap entries will be empty — ` +
      `structural graph will have no import edges\n`,
    );
  }

  // Build resolution context (cached configs)
  const ctx = buildResolutionContext(projectRoot, files);

  // Null-prototype object: keys are arbitrary file paths, and a path such as
  // `__proto__` must become an ordinary entry instead of replacing the
  // object's prototype (which would drop it from the JSON output).
  const importMap = Object.create(null);
  let filesWithImports = 0;
  let totalEdges = 0;

  for (const file of files) {
    const path = toPosix(file.path);

    // Non-code files always get an empty array. So does every code file when
    // tree-sitter init failed earlier; the one-time warning was already
    // emitted at startup.
    if (file.fileCategory !== 'code' || !treeSitterReady) {
      importMap[path] = [];
      continue;
    }

    const absolutePath = join(projectRoot, file.path);

    // Read file content (per-file resilience)
    let content;
    try {
      content = readFileSync(absolutePath, 'utf-8');
    } catch (err) {
      process.stderr.write(
        `Warning: extract-import-map: import resolution failed for ${path} ` +
        `(read error: ${err.message}) — importMap[${path}]=[]\n`,
      );
      importMap[path] = [];
      continue;
    }

    // Analyze + resolve
    let resolved;
    try {
      const resolvedSet = new Set();

      // Ruby is the only language whose tree-sitter import field doesn't
      // preserve the require vs require_relative discriminator, so the
      // resolver needs the regex-parsed shape directly. All other tree-sitter
      // languages get analyzed once and dispatched normally.
      if (file.language === 'ruby') {
        for (const imp of parseRubyImports(content)) {
          collectResolved(resolvedSet, resolveRubyImport(imp, file, ctx), ctx.fileSet);
        }
      } else {
        const analysis = registry.analyzeFile(file.path, content);
        const imports = analysis?.imports ?? [];
        for (const imp of imports) {
          collectResolved(resolvedSet, resolveImport(imp, file, ctx), ctx.fileSet);
        }
        // Supplemental pass for sources tree-sitter doesn't capture (e.g.
        // CJS require() calls, Kotlin imports). Dedup via the same set.
        for (const extra of extractExtraImportSources(file, content)) {
          const outs = resolveImport({ source: extra, specifiers: [] }, file, ctx);
          collectResolved(resolvedSet, outs, ctx.fileSet);
        }
      }
      resolved = [...resolvedSet].sort((a, b) => a.localeCompare(b));
    } catch (err) {
      process.stderr.write(
        `Warning: extract-import-map: import resolution failed for ${path} ` +
        `(analyze error: ${err.message}) — importMap[${path}]=[]\n`,
      );
      importMap[path] = [];
      continue;
    }

    importMap[path] = resolved;
    if (resolved.length > 0) {
      filesWithImports += 1;
      totalEdges += resolved.length;
    }
  }

  const output = {
    scriptCompleted: true,
    stats: {
      filesScanned: files.length,
      filesWithImports,
      totalEdges,
    },
    importMap,
  };

  writeFileSync(outputPath, JSON.stringify(output, null, 2), 'utf-8');

  // Sanity check that the write actually produced a file.
  if (!existsSync(outputPath)) {
    throw new Error(`output file missing after write: ${outputPath}`);
  }

  process.stderr.write(
    `extract-import-map: filesScanned=${files.length} ` +
    `filesWithImports=${filesWithImports} totalEdges=${totalEdges}\n`,
  );
}

// ---------------------------------------------------------------------------
// Run only when executed directly as a CLI; importing the module (e.g. from
// tests) must not trigger main().
+// +// Canonicalize both sides through realpathSync. Node ESM resolves +// import.meta.url through symlinks but pathToFileURL(process.argv[1]) preserves +// them, so a raw equality check silently no-ops when the script is invoked via +// a symlinked plugin install path (the default in Claude Code / Copilot CLI +// caches). See GitHub issue #162. +// --------------------------------------------------------------------------- +/** + * Report whether this module is the script Node was started with. + * Compares the real (symlink-resolved) path of this module with the real + * path of process.argv[1]; returns false when argv[1] is missing or when + * either path cannot be resolved. + */ +function isCliEntry() { + if (!process.argv[1]) return false; + try { + const modulePath = realpathSync(fileURLToPath(import.meta.url)); + const argvPath = realpathSync(process.argv[1]); + return modulePath === argvPath; + } catch { + return false; + } +} + +// CLI guard: run main() only for direct execution; any failure is written to +// stderr with its stack and the process exits with status 1. +if (isCliEntry()) { + try { + await main(); + } catch (err) { + process.stderr.write(`extract-import-map.mjs failed: ${err.message}\n${err.stack}\n`); + process.exit(1); + } +} diff --git a/understand-anything-plugin/skills/understand/merge-batch-graphs.py b/understand-anything-plugin/skills/understand/merge-batch-graphs.py index dba9504..2021f9a 100644 --- a/understand-anything-plugin/skills/understand/merge-batch-graphs.py +++ b/understand-anything-plugin/skills/understand/merge-batch-graphs.py @@ -1023,11 +1023,74 @@ def main() -> None: print("Error: no batch-*.json files found in intermediate/", file=sys.stderr) sys.exit(1) - print(f"Found {len(batch_files)} batch files:", file=sys.stderr) + # Group by logical batch index so the report distinguishes single-batch + # files from multi-part file-analyzer outputs. Files that don't match the + # `batch-.json` / `batch--part-.json` pattern (e.g. fused + # `batch-fused-8-13.json`, range `batch-8-13.json`) would otherwise be + # silently dropped during load — flag them loudly instead so the user + # can fix the file-analyzer agent. 
+ from collections import defaultdict as _dd + by_batch = _dd(list) + unrecognized_batch_files: list[str] = [] + for f in batch_files: + m = re.match(r"batch-(\d+)(?:-part-(\d+))?\.json", f.name) + if m: + by_batch[int(m.group(1))].append((f.name, int(m.group(2)) if m.group(2) else None)) + else: + unrecognized_batch_files.append(f.name) - # Load batches + if unrecognized_batch_files: + preview = ", ".join(unrecognized_batch_files[:5]) + suffix = ( + f" (+{len(unrecognized_batch_files) - 5} more)" + if len(unrecognized_batch_files) > 5 + else "" + ) + print( + f"Warning: merge-batch-graphs: {len(unrecognized_batch_files)} " + f"batch file(s) with unrecognized filenames will be DROPPED — " + f"files: {preview}{suffix} — fix the file-analyzer agent to use " + f"only batch-.json or batch--part-.json patterns", + file=sys.stderr, + ) + + logical_count = len(by_batch) + multi_part = sum(1 for entries in by_batch.values() if len(entries) > 1) + print( + f"Found {len(batch_files)} batch files " + f"({logical_count} logical batches, {multi_part} multi-part):", + file=sys.stderr, + ) + + # Missing-part detection: for any logical batch with parts (len > 1), the + # set of part numbers MUST be contiguous starting at 1. Gaps suggest a + # truncated write — emit a visible warning so the user can investigate. + # Collect into `missing_part_warnings` so they also surface in the final + # phase report; stderr alone gets buried under the per-batch load lines. 
+ missing_part_warnings: list[str] = [] + for idx, entries in by_batch.items(): + part_nums = [p for (_n, p) in entries if p is not None] + if not part_nums: + continue + present = set(part_nums) + expected = set(range(1, max(part_nums) + 1)) + missing = sorted(expected - present) + if missing: + msg = ( + f"batch {idx} has parts {sorted(present)} but " + f"missing part {missing} — possible truncated write — " + f"affected nodes/edges may be lost" + ) + print(f"Warning: merge: {msg}", file=sys.stderr) + missing_part_warnings.append(msg) + + # Load batches — skip unrecognized filenames so they don't pollute the + # merged graph with content the agent labeled incorrectly. + unrecognized_set = set(unrecognized_batch_files) batches: list[dict[str, Any]] = [] for f in batch_files: + if f.name in unrecognized_set: + continue batch = load_batch(f) if batch is not None: batches.append(batch) @@ -1042,6 +1105,38 @@ def main() -> None: # Merge and normalize assembled, report = merge_and_normalize(batches) + # Surface missing multi-part files to the phase report (parallel to + # unrecognized-filename handling below). Stderr lines emitted during + # batch discovery get buried under per-batch load output — re-emitting + # via the report list ensures the Phase 4 review and final summary see + # the data-loss signal. + if missing_part_warnings: + report.append("") + report.append( + f"Warning: {len(missing_part_warnings)} batch(es) with missing parts " + f"— some nodes/edges silently dropped:" + ) + for w in missing_part_warnings: + report.append(f" - {w}") + + # Surface unrecognized-filename drops to the phase report so the + # downstream review step sees them, not just stderr. 
+ if unrecognized_batch_files: + preview = ", ".join(unrecognized_batch_files[:5]) + suffix = ( + f" (+{len(unrecognized_batch_files) - 5} more)" + if len(unrecognized_batch_files) > 5 + else "" + ) + report.append("") + report.append( + f"Warning: dropped {len(unrecognized_batch_files)} batch file(s) " + f"with unrecognized filenames — files: {preview}{suffix} — " + f"fix the file-analyzer agent to use only batch-.json or " + f"batch--part-.json patterns (every node/edge in these " + f"files was excluded from the final graph)" + ) + # Recover any imports edges file-analyzer batches dropped despite # `batchImportData` containing them. The project-scanner's importMap # is the deterministic source of truth. diff --git a/understand-anything-plugin/skills/understand/scan-project.mjs b/understand-anything-plugin/skills/understand/scan-project.mjs new file mode 100644 index 0000000..553a82e --- /dev/null +++ b/understand-anything-plugin/skills/understand/scan-project.mjs @@ -0,0 +1,794 @@ +#!/usr/bin/env node +/** + * scan-project.mjs + * + * Deterministic file enumeration + language/category detection for the + * project-scanner agent. Replaces the LLM-written prose scanner that used to + * (a) author a per-run Node.js script (`tmp/ua-project-scan.js`), (b) walk the + * file tree, and (c) classify each file via lookup tables in LLM context — a + * pure rule-lookup pass that was being billed at LLM rates and adding many + * minutes of per-run latency on mid-sized monorepos. + * + * What the LLM still owns (Step A of project-scanner.md Phase 1): + * - Reading README + top-level manifests to synthesize `name`, + * `rawDescription`, `readmeHead`, `frameworks`, and the high-level + * `languages` narrative. 
+ * + * What this script owns: + * - File enumeration (git ls-files preferred, recursive walk fallback) + * - `.understandignore` filtering (delegated to core's createIgnoreFilter) + * - Per-file language detection (extension + filename table) + * - Per-file category assignment (priority-ordered rules from + * project-scanner.md Step 4) + * - Line counting + * - Complexity estimation (project-scanner.md Step 7 thresholds) + * + * Usage: + * node scan-project.mjs + * + * Output JSON (subset of what project-scanner.md Phase 1 expects — the LLM + * agent merges this with Step A's narrative fields and Step C's importMap to + * produce the final scan-result.json): + * { + * "scriptCompleted": true, + * "files": [{ "path": "...", "language": "...", "sizeLines": N, "fileCategory": "..." }, ...], + * "totalFiles": N, + * "filteredByIgnore": M, + * "estimatedComplexity": "small" | "moderate" | "large" | "very-large", + * "stats": { "filesScanned": N, "byCategory": {...}, "byLanguage": {...} } + * } + * + * Logging: stderr only (stdout reserved for piped tooling). + * Per-file resilience: read/stat failures emit + * `Warning: scan-project: — file skipped from output` + * to stderr and the file is dropped; the rest of the scan completes. + * + * Determinism: files are sorted by `path.localeCompare` before emission, and + * the underlying enumeration is deterministic (git ls-files returns a stable + * order; the fallback walker sorts each directory's entries). 
+ */ + +import { createRequire } from 'node:module'; +import { dirname, resolve, join, basename, extname, relative, sep } from 'node:path'; +import { fileURLToPath, pathToFileURL } from 'node:url'; +import { + existsSync, + readFileSync, + readdirSync, + realpathSync, + statSync, + writeFileSync, +} from 'node:fs'; +import { spawnSync } from 'node:child_process'; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +// skills/understand/ -> plugin root is two dirs up +const pluginRoot = resolve(__dirname, '../..'); +const require = createRequire(resolve(pluginRoot, 'package.json')); + +// --------------------------------------------------------------------------- +// Resolve @understand-anything/core +// +// Two-step resolution: try the workspace-linked package first, fall back to +// the installed plugin cache layout. pathToFileURL() is required on Windows +// because dynamic import() of raw "C:\..." paths throws +// ERR_UNSUPPORTED_ESM_URL_SCHEME (Node parses "C:" as a URL scheme). +// --------------------------------------------------------------------------- +let core; +try { + core = await import(pathToFileURL(require.resolve('@understand-anything/core')).href); +} catch { + core = await import(pathToFileURL(resolve(pluginRoot, 'packages/core/dist/index.js')).href); +} + +const { createIgnoreFilter } = core; + +// --------------------------------------------------------------------------- +// Language detection +// +// Mirrors the canonical extension list from +// understand-anything-plugin/packages/core/src/languages/configs/* and the +// project-scanner.md Step 3 table. Extensions are matched lowercase; +// filenames (Dockerfile, Makefile, etc.) are matched case-sensitively because +// the projects-in-the-wild use canonical capitalizations. +// +// Where the core configs and project-scanner.md diverge (rare), project- +// scanner.md wins because it is the user-facing contract. 
// ---------------------------------------------------------------------------

/**
 * Read `key` from a plain-object lookup table, considering OWN properties
 * only. A bare `table[key]` also resolves inherited members of
 * Object.prototype (`constructor`, `toString`, ...), which would leak a
 * function out of a table that is supposed to hold strings.
 *
 * @param {Record<string, string>} table - Frozen lookup table.
 * @param {string} key - Key to look up.
 * @returns {string | undefined} The mapped value, or undefined if absent.
 */
function ownValue(table, key) {
  return Object.prototype.hasOwnProperty.call(table, key) ? table[key] : undefined;
}

/**
 * Extension -> language id. Lowercase keys; lookup is `.ext.toLowerCase()`.
 * Includes the legacy Step-3 mapping (.cfg/.ini/.env -> `config`) — note
 * that `config` is a language id here, not a category. Category routing
 * for these extensions is handled separately in CATEGORY_BY_EXT.
 */
const LANGUAGE_BY_EXT = Object.freeze({
  // TypeScript / JavaScript
  '.ts': 'typescript',
  '.tsx': 'typescript',
  '.js': 'javascript',
  '.jsx': 'javascript',
  '.mjs': 'javascript',
  '.cjs': 'javascript',
  // Python
  '.py': 'python',
  '.pyi': 'python',
  // Go / Rust / Java / Kotlin / C# / Swift / Lua
  '.go': 'go',
  '.rs': 'rust',
  '.java': 'java',
  '.kt': 'kotlin',
  '.kts': 'kotlin',
  '.cs': 'csharp',
  '.swift': 'swift',
  '.lua': 'lua',
  // Ruby / PHP
  '.rb': 'ruby',
  '.rake': 'ruby',
  '.php': 'php',
  // C / C++
  '.c': 'c',
  '.h': 'c',
  '.cpp': 'cpp',
  '.cc': 'cpp',
  '.cxx': 'cpp',
  '.hpp': 'cpp',
  '.hxx': 'cpp',
  // Vue / Svelte (no tree-sitter extractor, but project-scanner contract
  // lists them as code languages — downstream import map will return [])
  '.vue': 'vue',
  '.svelte': 'svelte',
  // Shell / Batch / PowerShell
  '.sh': 'shell',
  '.bash': 'shell',
  '.zsh': 'shell',
  '.ps1': 'powershell',
  '.psm1': 'powershell',
  '.psd1': 'powershell',
  '.bat': 'batch',
  '.cmd': 'batch',
  // Markup / docs
  '.html': 'html',
  '.htm': 'html',
  '.css': 'css',
  '.scss': 'css',
  '.sass': 'css',
  '.less': 'css',
  '.md': 'markdown',
  '.mdx': 'markdown',
  '.rst': 'markdown',
  // Config / data
  '.yaml': 'yaml',
  '.yml': 'yaml',
  '.json': 'json',
  '.jsonc': 'jsonc',
  '.toml': 'toml',
  '.xml': 'xml',
  '.xsl': 'xml',
  '.xsd': 'xml',
  '.plist': 'xml',
  '.cfg': 'config',
  '.ini': 'config',
  '.env': 'config',
  // Data / schema
  '.sql': 'sql',
  '.graphql': 'graphql',
  '.gql': 'graphql',
  '.proto': 'protobuf',
  '.prisma': 'prisma',
  '.csv': 'csv',
  '.tsv': 'csv',
  // Infra
  '.tf': 'terraform',
  '.tfvars': 'terraform',
  // JVM build files (categorized via filename-or-extension)
  '.gradle': 'gradle',
  // .NET project files (mapped to extension-derived ids; downstream
  // treats them as config — see CATEGORY_BY_EXT)
  '.csproj': 'csproj',
  '.sln': 'sln',
  '.properties': 'properties',
  '.mod': 'mod',
  '.sum': 'sum',
});

/**
 * Filename (no extension) -> language id. Compared case-sensitively against
 * basename(path). Includes the most common no-extension conventions; anything
 * NOT in this table with no extension falls back to `unknown`.
 *
 * Dockerfile.* variants (Dockerfile.dev, Dockerfile.prod) are handled by a
 * startsWith check in `detectLanguage()` so we don't have to enumerate every
 * possible suffix.
 */
const LANGUAGE_BY_FILENAME = Object.freeze({
  Dockerfile: 'dockerfile',
  Makefile: 'makefile',
  GNUmakefile: 'makefile',
  makefile: 'makefile',
  Jenkinsfile: 'jenkinsfile',
  Procfile: 'procfile',
  Vagrantfile: 'vagrantfile',
});

/**
 * Detect the language of a file by its path. Lowercase extension lookup,
 * then no-extension filename lookup. Always returns a non-empty string —
 * falls back to the lowercased extension (without dot) or 'unknown' if there
 * is no usable extension. Downstream consumers rely on this field always
 * being a string (see project-scanner.md Step 3 "Fallback" note).
 *
 * @param {string} filePath - Project-relative (or absolute) file path.
 * @returns {string} Language id.
 */
export function detectLanguage(filePath) {
  const base = basename(filePath);
  const ext = extname(filePath).toLowerCase();

  // Dockerfile.dev, Dockerfile.prod, etc. — common variant form.
  if (base === 'Dockerfile' || base.startsWith('Dockerfile.')) return 'dockerfile';

  // Dotfile names like .env, .env.local — path.extname returns '' for
  // single-segment dotfiles (e.g. '.env') and the SECOND segment for
  // compound dotfiles (e.g. '.local' for '.env.local'). Neither hits the
  // intended LANGUAGE_BY_EXT['.env'] mapping. Try the leading dotfile
  // portion first so `.env`, `.env.local`, `.env.production` all map.
  const dotKey = dotfileKey(base);
  const byDotfile = dotKey ? ownValue(LANGUAGE_BY_EXT, dotKey) : undefined;
  if (byDotfile) return byDotfile;

  // path.extname('name.') is '.', which would otherwise produce an empty
  // language id. Require at least one character after the dot.
  if (ext.length > 1) {
    // Unknown extension → drop the leading dot, lowercase. Never null.
    return ownValue(LANGUAGE_BY_EXT, ext) ?? ext.slice(1);
  }

  // No usable extension — try the filename table. Own-property lookup keeps
  // names such as `constructor` from resolving to Object.prototype members.
  return ownValue(LANGUAGE_BY_FILENAME, base) ?? 'unknown';
}

/**
 * Extract the canonical dotfile "extension" from a basename, or null.
 *
 *   `.env`          -> `.env`
 *   `.env.local`    -> `.env`
 *   `.bashrc`       -> `.bashrc`
 *   `package.json`  -> null (not a dotfile)
 *
 * Used by both detectLanguage and detectCategory so dotfile-style configs
 * (e.g., `.env`, `.env.local`, `.env.production`) get their leading
 * segment treated as the implicit extension instead of falling through
 * to `unknown` / `code`.
 *
 * @param {string} base - File basename.
 * @returns {string | null} Lowercased leading dot-segment, or null.
 */
function dotfileKey(base) {
  if (!base.startsWith('.')) return null;
  const m = base.match(/^(\.[a-z0-9]+)/i);
  return m ? m[1].toLowerCase() : null;
}

// ---------------------------------------------------------------------------
// Category detection
//
// Implements the priority-ordered rules from project-scanner.md Step 4.
// Order matters: more specific rules must run before more general ones
// (e.g. `docker-compose.yml` is infra, not config).
//
// Categories: code | config | docs | infra | data | script | markup
// ---------------------------------------------------------------------------

/**
 * Extension -> category. Used only after the higher-priority path-based
 * checks (infra/docs exclusions) in `detectCategory()`. Plain extension
 * lookup is intentionally last-resort — many configs need their full path
 * inspected first.
 */
const CATEGORY_BY_EXT = Object.freeze({
  // docs
  '.md': 'docs',
  '.mdx': 'docs',
  '.rst': 'docs',
  '.txt': 'docs',
  '.text': 'docs',
  // config
  '.yaml': 'config',
  '.yml': 'config',
  '.json': 'config',
  '.jsonc': 'config',
  '.toml': 'config',
  '.xml': 'config',
  '.xsl': 'config',
  '.xsd': 'config',
  '.plist': 'config',
  '.cfg': 'config',
  '.ini': 'config',
  '.env': 'config',
  '.properties': 'config',
  '.csproj': 'config',
  '.sln': 'config',
  '.mod': 'config',
  '.sum': 'config',
  '.gradle': 'config',
  // infra
  '.tf': 'infra',
  '.tfvars': 'infra',
  // data
  '.sql': 'data',
  '.graphql': 'data',
  '.gql': 'data',
  '.proto': 'data',
  '.prisma': 'data',
  '.csv': 'data',
  '.tsv': 'data',
  // script
  '.sh': 'script',
  '.bash': 'script',
  '.zsh': 'script',
  '.ps1': 'script',
  '.psm1': 'script',
  '.psd1': 'script',
  '.bat': 'script',
  '.cmd': 'script',
  // markup
  '.html': 'markup',
  '.htm': 'markup',
  '.css': 'markup',
  '.scss': 'markup',
  '.sass': 'markup',
  '.less': 'markup',
});

/**
 * Filenames (no extension or full filename with extension) that always
 * map to `infra` regardless of their extension. Compared case-sensitively
 * against basename(path).
 */
const INFRA_FILENAMES = new Set([
  'Dockerfile',
  '.dockerignore',
  'Makefile',
  'GNUmakefile',
  'makefile',
  'Jenkinsfile',
  'Procfile',
  'Vagrantfile',
  '.gitlab-ci.yml',
]);

/**
 * Detect the project-scanner category for a file. Priority order matches
 * project-scanner.md Step 4 "Priority rule" — most specific wins.
 *
 *   1. LICENSE -> code (per the spec note "except LICENSE"). The Step-2
 *      exclusion table normally removes LICENSE, but if a project chooses to
 *      re-include it via `.understandignore` negation, it should NOT land in
 *      docs. We classify as `code` rather than inventing a new bucket.
 *   2. Filename-based infra (Dockerfile, Makefile, Jenkinsfile,
 *      docker-compose.*, Vagrantfile, Procfile, .gitlab-ci.yml,
 *      .dockerignore).
 *   3. Path-based infra (.github/workflows/, .circleci/, k8s/, kubernetes/,
 *      *.k8s.yml, *.k8s.yaml).
 *   4. Extension-based mapping (CATEGORY_BY_EXT), then the dotfile key.
 *   5. Fallback: `code` (matches the spec — "All other extensions").
 *
 * @param {string} filePath - Project-relative file path.
 * @returns {string} One of code | config | docs | infra | data | script | markup.
 */
export function detectCategory(filePath) {
  const base = basename(filePath);
  const ext = extname(filePath).toLowerCase();
  const posix = toPosix(filePath);

  // Rule 1: LICENSE exception (project-scanner.md Step 4 table comment).
  if (base === 'LICENSE') return 'code';

  // Rule 2: infra by filename — Dockerfile + variants, Makefile,
  // Jenkinsfile, docker-compose.*, Procfile, Vagrantfile, .gitlab-ci.yml,
  // .dockerignore.
  if (INFRA_FILENAMES.has(base)) return 'infra';
  if (base.startsWith('Dockerfile.')) return 'infra';
  if (base.startsWith('docker-compose.')) return 'infra';
  if (base === 'compose.yml' || base === 'compose.yaml') return 'infra';

  // Rule 3: infra by path.
  if (posix.startsWith('.github/workflows/')) return 'infra';
  if (posix.startsWith('.circleci/')) return 'infra';
  // Match a `k8s/` or `kubernetes/` segment anywhere in the path.
  if (/(^|\/)(k8s|kubernetes)\//.test(posix)) return 'infra';
  // `*.k8s.yml` and `*.k8s.yaml` — Kubernetes-flavored YAML.
  if (/\.k8s\.(ya?ml)$/i.test(base)) return 'infra';

  // Rule 4: extension-based lookup.
  const byExt = ext ? ownValue(CATEGORY_BY_EXT, ext) : undefined;
  if (byExt) return byExt;

  // Rule 4.5: dotfile-style configs (.env, .env.local, .env.production).
  // path.extname misses these — see dotfileKey docstring.
  const dotKey = dotfileKey(base);
  const byDot = dotKey ? ownValue(CATEGORY_BY_EXT, dotKey) : undefined;
  if (byDot) return byDot;

  // Rule 5: anything not matched above falls through to `code`.
  return 'code';
}

// ---------------------------------------------------------------------------
// Complexity estimation (project-scanner.md Step 7)
// ---------------------------------------------------------------------------

/**
 * Map a total file count to a complexity tier. Thresholds are inclusive on
 * the lower bound:
 *   - small:      1-30
 *   - moderate:   31-150
 *   - large:      151-500
 *   - very-large: >500
 *
 * Edge case: 0 files maps to `small` (the lowest tier) so the field is
 * always set even on empty repos. Downstream consumers treat 0 files as
 * a sentinel for "nothing to analyze" via `totalFiles`, not complexity.
 *
 * @param {number} totalFiles - Number of files kept by the scan.
 * @returns {'small' | 'moderate' | 'large' | 'very-large'} Complexity tier.
 */
export function estimateComplexity(totalFiles) {
  if (totalFiles <= 30) return 'small';
  if (totalFiles <= 150) return 'moderate';
  if (totalFiles <= 500) return 'large';
  return 'very-large';
}

// ---------------------------------------------------------------------------
// File enumeration
// ---------------------------------------------------------------------------

/**
 * Normalize a path to forward-slash POSIX. The project-scanner contract
 * emits POSIX paths; we re-normalize so the output is stable across
 * Windows/macOS/Linux.
 *
 * @param {string} p - Path using the platform separator.
 * @returns {string} The same path with `/` separators.
 */
function toPosix(p) {
  return p.split(sep).join('/');
}

/**
 * Enumerate all files in `projectRoot` via `git ls-files`. Returns an
 * array of project-relative POSIX paths, or null if the command fails or
 * prints nothing (not a git repository, git not installed, empty listing).
 * Caller falls back to the recursive walker.
 *
 * Why git ls-files first: it respects the repo's `.gitignore`, handles
 * submodules sensibly, and gives a fast, deterministic listing. The walker
 * is a strict superset of what git would emit (no .gitignore awareness),
 * so the ignore filter has to do more work in the fallback path.
 *
 * `-z` is used so paths are NUL-terminated and emitted verbatim. Without
 * it git C-quotes paths containing non-ASCII or special characters, and
 * those quoted strings do not name real files. For the same reason entries
 * are not trimmed: leading/trailing whitespace is part of the filename.
 *
 * @param {string} projectRoot - Directory to run git in.
 * @returns {string[] | null} Unique project-relative POSIX paths, or null.
 */
function enumerateViaGit(projectRoot) {
  const result = spawnSync('git', ['ls-files', '-z', '-co', '--exclude-standard'], {
    cwd: projectRoot,
    encoding: 'utf-8',
    maxBuffer: 256 * 1024 * 1024, // 256MB — huge monorepos can produce >10MB of paths
  });
  if (result.status !== 0 || !result.stdout) return null;

  // A Set drops repeated paths (an unmerged index entry is listed once per
  // stage) while preserving git's output order.
  const unique = new Set();
  for (const entry of result.stdout.split('\0')) {
    if (entry) unique.add(toPosix(entry));
  }
  return [...unique];
}

/**
 * Recursive directory walker — fallback when `git ls-files` is unavailable
 * (no git, not a repo, or git refused). Skips hard-coded "obviously bad"
 * directory names BEFORE invoking the ignore filter so we don't waste cycles
 * descending into `node_modules/` etc. on huge trees.
 *
 * Collects project-relative POSIX paths, visiting each directory's entries
 * in name-sorted order.
 *
 * @param {string} projectRoot - Directory to walk.
 * @returns {string[]} Project-relative POSIX paths of regular files.
 */
function enumerateViaWalk(projectRoot) {
  // Hard skip — these directories are universally non-source and skipping
  // at the walker level avoids materializing thousands of node_modules
  // paths before the ignore filter drops them. The ignore filter still
  // runs on everything else.
  const HARD_SKIP_DIRS = new Set([
    'node_modules',
    '.git',
    '.svn',
    '.hg',
    '__pycache__',
  ]);

  const out = [];

  function walk(absDir) {
    let entries;
    try {
      entries = readdirSync(absDir, { withFileTypes: true });
    } catch (err) {
      process.stderr.write(
        `Warning: scan-project: ${toPosix(relative(projectRoot, absDir)) || '.'} `
          + `— directory read failed (${err.message}) — subtree skipped\n`,
      );
      return;
    }
    // Sort deterministically by name; mix files and dirs together so the
    // final output (after the path sort) is identical regardless of
    // OS-specific readdir order.
    entries.sort((a, b) => a.name.localeCompare(b.name));
    for (const ent of entries) {
      if (ent.isDirectory()) {
        if (HARD_SKIP_DIRS.has(ent.name)) continue;
        walk(join(absDir, ent.name));
      } else if (ent.isFile()) {
        const rel = toPosix(relative(projectRoot, join(absDir, ent.name)));
        if (rel) out.push(rel);
      }
      // Symlinks intentionally ignored — git ls-files doesn't follow them
      // either, and following them is a classic recursion-bomb footgun.
    }
  }

  walk(projectRoot);
  return out;
}

/**
 * Enumerate all candidate files in `projectRoot`. Tries git ls-files first;
 * falls back to a recursive walk if git is unavailable or this is not a
 * repo. Returns an array of project-relative POSIX paths in unspecified
 * order — caller is responsible for sorting + filtering.
 *
 * @param {string} projectRoot - Directory to enumerate.
 * @returns {string[]} Project-relative POSIX paths.
 */
function enumerateFiles(projectRoot) {
  const fromGit = enumerateViaGit(projectRoot);
  if (fromGit !== null) return fromGit;
  process.stderr.write(
    `scan-project: git ls-files unavailable — falling back to recursive walk\n`,
  );
  return enumerateViaWalk(projectRoot);
}

// ---------------------------------------------------------------------------
// Filter accounting
//
// The project-scanner.md contract requires `filteredByIgnore` to count files
// dropped *specifically* by user `.understandignore` patterns (the delta
// beyond what the hardcoded defaults would have removed). We accomplish this
// by building TWO filters:
//   - `defaultOnly`: defaults only, no user patterns
//   - `combined`:    defaults + user patterns (createIgnoreFilter)
// and counting paths that the combined filter excludes but the defaults-only
// filter would have kept.
//
// Negation (`!pattern`) is correctly handled by the combined filter — a file
// re-included via `!` won't be in the combined-excluded set, so it WON'T be
// counted in filteredByIgnore (it's "kept", not "additionally filtered").
// ---------------------------------------------------------------------------

/**
 * Build a defaults-only IgnoreFilter — same patterns as createIgnoreFilter
 * would apply, minus any user .understandignore content. We synthesize this
 * via a temp directory with no .understandignore files so the core function
 * still drives the matcher. (Re-implementing the ignore-package wiring here
 * would risk subtle behavior drift from core's matcher.)
 *
 * @returns {object} The filter returned by core's createIgnoreFilter.
 */
function buildDefaultsOnlyFilter() {
  // Use the createIgnoreFilter with a path that we KNOW has no .understandignore.
  // `os.tmpdir()`-based fresh dir guarantees no user patterns leak in.
  // The directory doesn't need to exist on disk because createIgnoreFilter
  // only checks existsSync() before reading.
  const fakeProjectRoot = join(
    require('node:os').tmpdir(),
    `ua-scan-defaults-${process.pid}-${Date.now()}`,
  );
  return createIgnoreFilter(fakeProjectRoot);
}

/**
 * Determine whether `projectRoot` has any user .understandignore files.
 * When neither file exists, the combined and defaults-only filters are
 * identical, so we can skip the dual-filter accounting entirely.
 *
 * @param {string} projectRoot - Project root directory.
 * @returns {boolean} True if either supported .understandignore exists.
 */
function hasUserIgnoreFile(projectRoot) {
  return (
    existsSync(join(projectRoot, '.understandignore'))
    || existsSync(join(projectRoot, '.understand-anything', '.understandignore'))
  );
}

// ---------------------------------------------------------------------------
// Line counting
// ---------------------------------------------------------------------------

/**
 * Count newline-delimited lines in a file. Returns the number of `\n`
 * characters; this matches `wc -l` semantics (which counts newlines, not
 * "lines of content"). Files without a trailing newline therefore report
 * one fewer than the visible line count — same behavior as wc.
 *
 * Per-file failure: emits a Warning: and returns null. Caller decides
 * whether to drop the file or keep it with sizeLines=0.
 *
 * @param {string} absPath - Path used to read the file.
 * @param {string} posixPath - Project-relative path used in the warning.
 * @returns {number | null} Newline count, or null if the read failed.
 */
function countLines(absPath, posixPath) {
  try {
    const buf = readFileSync(absPath);
    // Manual newline count beats split('\n').length on large files — no
    // intermediate array allocation. We count the `\n` byte (0x0a) directly.
    let count = 0;
    for (let i = 0; i < buf.length; i++) {
      if (buf[i] === 0x0a) count++;
    }
    return count;
  } catch (err) {
    process.stderr.write(
      `Warning: scan-project: ${posixPath} — line count failed `
        + `(${err.message}) — file skipped from output\n`,
    );
    return null;
  }
}

// ---------------------------------------------------------------------------
// Main
// ---------------------------------------------------------------------------

/**
 * CLI entry point. Reads `<projectRoot> <outputPath>` from process.argv,
 * enumerates and filters the project's files, classifies each one, and
 * writes the scan result as JSON to `outputPath`. Exits with status 1 on
 * bad arguments; throws if the output file is missing after the write.
 */
async function main() {
  const [, , projectRoot, outputPath] = process.argv;
  if (!projectRoot || !outputPath) {
    process.stderr.write(
      'Usage: node scan-project.mjs <projectRoot> <outputPath>\n',
    );
    process.exit(1);
  }

  if (!existsSync(projectRoot)) {
    process.stderr.write(
      `scan-project.mjs failed: projectRoot does not exist: ${projectRoot}\n`,
    );
    process.exit(1);
  }
  const projectRootStat = statSync(projectRoot);
  if (!projectRootStat.isDirectory()) {
    process.stderr.write(
      `scan-project.mjs failed: projectRoot is not a directory: ${projectRoot}\n`,
    );
    process.exit(1);
  }

  // 1. Enumerate. Either git ls-files or recursive walk.
  const candidates = enumerateFiles(projectRoot);

  // 2. Filter via createIgnoreFilter (defaults + user .understandignore).
  //    Build a defaults-only filter in parallel to count user-driven drops.
  const combined = createIgnoreFilter(projectRoot);
  const userIgnoresPresent = hasUserIgnoreFile(projectRoot);
  const defaultsOnly = userIgnoresPresent ? buildDefaultsOnlyFilter() : combined;

  let filteredByIgnore = 0;
  const kept = [];
  for (const rel of candidates) {
    const isIgnoredCombined = combined.isIgnored(rel);
    if (!isIgnoredCombined) {
      kept.push(rel);
      continue;
    }
    // Dropped by combined filter. If defaults-only would have ALSO dropped
    // it, this is a baseline default drop — not counted. If defaults-only
    // would have KEPT it, this drop is attributable to the user's
    // .understandignore content.
    if (userIgnoresPresent && !defaultsOnly.isIgnored(rel)) {
      filteredByIgnore++;
    }
  }

  // 3. Per-file: language + category + line count.
  //    Drop files that fail line counting (per-file resilience).
  const fileEntries = [];
  for (const rel of kept) {
    const absPath = join(projectRoot, rel);
    // Stat first — git ls-files could include paths that vanished between
    // listing and processing; the walker shouldn't but defensive anyway.
    try {
      const st = statSync(absPath);
      if (!st.isFile()) {
        // Symlinks-to-dir, special files, etc. — skip silently. Not a
        // warning condition because git wouldn't have tracked it as a file.
        continue;
      }
    } catch (err) {
      process.stderr.write(
        `Warning: scan-project: ${rel} — stat failed (${err.message}) `
          + `— file skipped from output\n`,
      );
      continue;
    }
    const sizeLines = countLines(absPath, rel);
    if (sizeLines === null) {
      // countLines already emitted the Warning: line.
      continue;
    }
    fileEntries.push({
      path: rel,
      language: detectLanguage(rel),
      sizeLines,
      fileCategory: detectCategory(rel),
    });
  }

  // 4. Determinism: sort by path.localeCompare.
  fileEntries.sort((a, b) => a.path.localeCompare(b.path));

  // 5. Stats. Counters are prototype-less so a language id that collides
  //    with an Object.prototype member (e.g. `constructor`, derived from an
  //    unknown extension) starts from 0 instead of an inherited function.
  const byCategory = Object.create(null);
  const byLanguage = Object.create(null);
  for (const f of fileEntries) {
    byCategory[f.fileCategory] = (byCategory[f.fileCategory] ?? 0) + 1;
    byLanguage[f.language] = (byLanguage[f.language] ?? 0) + 1;
  }

  const estimatedComplexity = estimateComplexity(fileEntries.length);

  const output = {
    scriptCompleted: true,
    files: fileEntries,
    totalFiles: fileEntries.length,
    filteredByIgnore,
    estimatedComplexity,
    stats: {
      filesScanned: fileEntries.length,
      byCategory,
      byLanguage,
    },
  };

  writeFileSync(outputPath, JSON.stringify(output, null, 2), 'utf-8');

  if (!existsSync(outputPath)) {
    throw new Error(`output file missing after write: ${outputPath}`);
  }

  process.stderr.write(
    `scan-project: filesScanned=${fileEntries.length} `
      + `filteredByIgnore=${filteredByIgnore} `
      + `complexity=${estimatedComplexity}\n`,
  );
}

// ---------------------------------------------------------------------------
// Run only when executed directly as a CLI; importing the module (e.g. from
// tests) must not trigger main().
//
// Canonicalize both sides through realpathSync. Node ESM resolves
// import.meta.url through symlinks but pathToFileURL(process.argv[1]) preserves
// them, so a raw equality check silently no-ops when the script is invoked via
// a symlinked plugin install path (the default in Claude Code / Copilot CLI
// caches). See GitHub issue #162.
// ---------------------------------------------------------------------------

/**
 * Report whether this module is the script Node was launched with.
 *
 * @returns {boolean} True when the real path of this module equals the real
 *   path of process.argv[1]; false if argv[1] is absent or resolution throws.
 */
function isCliEntry() {
  if (!process.argv[1]) return false;
  try {
    const modulePath = realpathSync(fileURLToPath(import.meta.url));
    const argvPath = realpathSync(process.argv[1]);
    return modulePath === argvPath;
  } catch {
    return false;
  }
}

if (isCliEntry()) {
  try {
    await main();
  } catch (err) {
    process.stderr.write(`scan-project.mjs failed: ${err.message}\n${err.stack}\n`);
    process.exit(1);
  }
}

// Default export of helpers for testability.
+export default { + detectLanguage, + detectCategory, + estimateComplexity, +}; diff --git a/understand-anything-plugin/vitest.config.ts b/understand-anything-plugin/vitest.config.ts new file mode 100644 index 0000000..9eedf1e --- /dev/null +++ b/understand-anything-plugin/vitest.config.ts @@ -0,0 +1,14 @@ +import { defineConfig } from 'vitest/config'; + +// The plugin package no longer ships any test files — they were relocated +// to the repo-root `tests/` tree so they no longer ride along with the +// plugin marketplace bundle. This config exists solely to shadow the +// repo-root vitest.config.ts (which would otherwise be inherited via +// upward config discovery from this cwd) and explicitly resolve no tests. +// +// Run skill tests from the repo root with `pnpm test` instead. +export default defineConfig({ + test: { + include: [], + }, +}); diff --git a/vitest.config.ts b/vitest.config.ts new file mode 100644 index 0000000..e009ea3 --- /dev/null +++ b/vitest.config.ts @@ -0,0 +1,25 @@ +import { defineConfig } from 'vitest/config'; + +// Single-config aggregation for the whole monorepo. Picks up: +// - tests/** — relocated skill tests (out-of-plugin so they +// do not ship via the marketplace bundle) +// - understand-anything-plugin/src/** — skill TS source tests +// - understand-anything-plugin/packages/dashboard/** — dashboard utils tests +// +// The `@understand-anything/core` package owns its own vitest.config.ts and is +// invoked separately via `pnpm --filter @understand-anything/core test`; its +// files are excluded here to avoid double-counting. +export default defineConfig({ + test: { + include: [ + 'tests/**/*.test.{js,mjs,ts}', + 'understand-anything-plugin/src/**/*.test.{js,mjs,ts}', + 'understand-anything-plugin/packages/dashboard/**/*.test.{js,mjs,ts,tsx}', + ], + exclude: [ + '**/node_modules/**', + '**/dist/**', + 'understand-anything-plugin/packages/core/**', + ], + }, +});