// doc-map parsing and doc injection for path-scoped design document context in AI code reviews. package review import ( "context" "fmt" "log/slog" "os" "path/filepath" "strings" "unicode/utf8" "github.com/goccy/go-yaml" ) const ( // DefaultDocMapMaxBytes is the default cap on total injected doc content. DefaultDocMapMaxBytes = 100 * 1024 // 100 KB ) // DocMapping maps a set of path glob patterns to governing doc files/directories. type DocMapping struct { Paths []string `yaml:"paths"` // glob patterns matched against changed PR files Docs []string `yaml:"docs"` // doc file paths or directories in the reviewed repo } // DocMapConfig is the top-level structure of a doc-map YAML file. type DocMapConfig struct { Mappings []DocMapping `yaml:"mappings"` } // DocMapOptions configures behavior for doc loading. type DocMapOptions struct { // MaxBytes caps the total size of injected doc content. Default: DefaultDocMapMaxBytes. MaxBytes int } // DocFetcher reads file and directory content from a VCS repository. // It is a subset of vcsClient, defined here to keep the review package free // of cmd-level dependencies. type DocFetcher interface { // GetFileContent returns the content of a single file at default branch. GetFileContent(ctx context.Context, owner, repo, path string) (string, error) // GetAllFilesInPath returns all files (path → content) under a directory. GetAllFilesInPath(ctx context.Context, owner, repo, path string) (map[string]string, error) } // ParseDocMapConfig reads and parses a doc-map YAML file from a local path. // Unknown top-level keys produce a warning but are not fatal. func ParseDocMapConfig(localPath string) (*DocMapConfig, error) { data, err := readFileBytes(localPath) if err != nil { return nil, fmt.Errorf("read doc-map file %q: %w", localPath, err) } return parseDocMapBytes(data, localPath) } // ParseDocMapConfigContent parses a doc-map YAML config from an in-memory // string. The source parameter is used only for error messages and log entries // (e.g. "owner/repo@main:.review-bot/doc-map.yml"). // // Use this when the config content has been fetched from a trusted VCS ref // rather than read from the local workspace. func ParseDocMapConfigContent(content, source string) (*DocMapConfig, error) { data := []byte(content) return parseDocMapBytes(data, source) } // parseDocMapBytes is the shared YAML parse implementation used by // ParseDocMapConfig and ParseDocMapConfigContent. func parseDocMapBytes(data []byte, source string) (*DocMapConfig, error) { var cfg DocMapConfig if err := yaml.UnmarshalWithOptions(data, &cfg, yaml.Strict()); err != nil { // Re-parse without strict mode to log which keys are unknown. var relaxed DocMapConfig if err2 := yaml.Unmarshal(data, &relaxed); err2 != nil { return nil, fmt.Errorf("parse doc-map YAML %q: %w", source, err) } slog.Warn("doc-map YAML contains unknown keys (ignored)", "file", source, "error", err) cfg = relaxed } return &cfg, nil } // FileCoveredByDocMap reports whether at least one paths: glob in any mapping // of cfg matches the given file path. It is used by static validation tooling // (e.g. the validate-docmap subcommand) to check per-file docmap coverage. func FileCoveredByDocMap(cfg *DocMapConfig, file string) bool { for _, mapping := range cfg.Mappings { if mappingMatches(mapping.Paths, []string{file}) { return true } } return false } // MatchDocs returns deduplicated doc paths for the given changed file paths. // A mapping matches if any of its path globs matches any of the changed files. func MatchDocs(cfg *DocMapConfig, changedFiles []string) []string { seen := make(map[string]struct{}) var result []string for _, mapping := range cfg.Mappings { if len(mapping.Paths) == 0 || len(mapping.Docs) == 0 { continue } if mappingMatches(mapping.Paths, changedFiles) { for _, doc := range mapping.Docs { if doc == "" { continue } if _, ok := seen[doc]; !ok { seen[doc] = struct{}{} result = append(result, doc) } } } } return result } // mappingMatches returns true if any glob in patterns matches any file in files. func mappingMatches(patterns, files []string) bool { for _, pat := range patterns { for _, f := range files { if globMatch(pat, f) { return true } } } return false } // globMatch matches a path against a glob pattern that may contain **. // It supports: // - filepath.Match patterns (*, ?, [range]) // - ** as a path segment that matches zero or more segments // - Trailing /** to match a directory and all its contents // // The pattern and path use forward slash as separator. func globMatch(pattern, path string) bool { return globMatchParts(splitPath(pattern), splitPath(path)) } // splitPath splits a slash-separated path into non-empty parts. func splitPath(p string) []string { // Clean and split on "/" parts := strings.Split(p, "/") result := make([]string, 0, len(parts)) for _, part := range parts { if part != "" { result = append(result, part) } } return result } // globMatchParts recursively matches pattern parts against path parts. func globMatchParts(patParts, pathParts []string) bool { for len(patParts) > 0 { pat := patParts[0] if pat == "**" { patParts = patParts[1:] if len(patParts) == 0 { // Trailing **: matches any remaining path (including empty). return true } // ** in the middle: try matching the rest at every position. for i := 0; i <= len(pathParts); i++ { if globMatchParts(patParts, pathParts[i:]) { return true } } return false } // Non-** segment: path must have a segment here. if len(pathParts) == 0 { return false } matched, err := filepath.Match(pat, pathParts[0]) if err != nil || !matched { return false } patParts = patParts[1:] pathParts = pathParts[1:] } // All pattern parts consumed; path must also be consumed. return len(pathParts) == 0 } // LoadMatchingDocs fetches content for the given doc paths via VCS and returns // a formatted string suitable for injection into the system prompt. // // Behavior: // - Paths that look like directories (end with /, or GetAllFilesInPath returns files) // are expanded to all .md files under them. // - Missing files are logged as warnings and skipped. // - Total content is capped at opts.MaxBytes; truncation is noted inline. func LoadMatchingDocs(ctx context.Context, fetcher DocFetcher, owner, repo string, docPaths []string, opts DocMapOptions) (string, error) { if opts.MaxBytes <= 0 { opts.MaxBytes = DefaultDocMapMaxBytes } var sb strings.Builder totalBytes := 0 limitReached := false for _, docPath := range docPaths { if ctx.Err() != nil { break } if limitReached { slog.Warn("doc-map: context size limit reached, skipping remaining docs", "remaining_path", docPath, "limit_bytes", opts.MaxBytes) break } entries, err := loadDocEntries(ctx, fetcher, owner, repo, docPath) if err != nil { slog.Warn("doc-map: could not load doc, skipping", "path", docPath, "error", err) continue } if len(entries) == 0 { slog.Debug("doc-map: no .md files found under path", "path", docPath) continue } for _, entry := range entries { if limitReached { break } available := opts.MaxBytes - totalBytes if available <= 0 { limitReached = true sb.WriteString("\n\n> ⚠️ Design document context truncated — size limit reached.\n") break } content := entry.content truncated := false if len(content) > available { content = truncateUTF8(content, available) truncated = true limitReached = true } sb.WriteString("### ") sb.WriteString(entry.path) sb.WriteString("\n\n") sb.WriteString(content) sb.WriteString("\n") if truncated { sb.WriteString("\n> ⚠️ (truncated — size limit reached)\n") } totalBytes += len(content) slog.Debug("doc-map: injected doc", "path", entry.path, "bytes", len(content)) } } if sb.Len() == 0 { return "", nil } return sb.String(), nil } // docEntry holds a single doc file path and content. type docEntry struct { path string content string } // loadDocEntries returns the doc content for a given path. // If the path is a directory, all .md files under it are returned. // If it's a file, a single entry is returned. func loadDocEntries(ctx context.Context, fetcher DocFetcher, owner, repo, docPath string) ([]docEntry, error) { if err := ValidateDocPath(docPath); err != nil { return nil, fmt.Errorf("doc path %q rejected: %w", docPath, err) } // Try directory expansion first. files, dirErr := fetcher.GetAllFilesInPath(ctx, owner, repo, docPath) if dirErr == nil && len(files) > 0 { // Filter for .md files only. var entries []docEntry for path, content := range files { if isMDFile(path) { entries = append(entries, docEntry{path: path, content: content}) } } // Sort for deterministic output. sortDocEntries(entries) return entries, nil } // Directory expansion returned nothing; log and fall through to single-file fetch. if dirErr != nil { slog.Debug("doc-map: directory expansion failed, trying as single file", "path", docPath, "error", dirErr) } // Try as a single file. content, fileErr := fetcher.GetFileContent(ctx, owner, repo, docPath) if fileErr != nil { // Return the file error (more specific than directory error). return nil, fmt.Errorf("fetch doc %q: %w", docPath, fileErr) } return []docEntry{{path: docPath, content: content}}, nil } // isMDFile returns true if the file has a .md extension. func isMDFile(path string) bool { return strings.HasSuffix(strings.ToLower(path), ".md") } // sortDocEntries sorts entries by path for deterministic output. func sortDocEntries(entries []docEntry) { // Simple insertion sort (doc lists are small). for i := 1; i < len(entries); i++ { for j := i; j > 0 && entries[j].path < entries[j-1].path; j-- { entries[j], entries[j-1] = entries[j-1], entries[j] } } } // readFileBytes reads the contents of a local file. func readFileBytes(path string) ([]byte, error) { return os.ReadFile(path) } // ValidateDocPath rejects doc paths that could cause path traversal // (absolute paths, any ".." segment, backslashes). Defense-in-depth: callers // must also confine the joined path to the repo root via filepath.Rel before // any filesystem access. Backslashes are rejected explicitly to prevent // Windows platform edge cases. func ValidateDocPath(p string) error { if strings.Contains(p, "\\") { return fmt.Errorf("backslashes not allowed in doc paths") } if filepath.IsAbs(p) { return fmt.Errorf("absolute paths not allowed") } for _, segment := range strings.Split(p, "/") { if segment == ".." { return fmt.Errorf("path traversal ('..' segment) not allowed") } } return nil } // truncateUTF8 truncates s to at most maxBytes without splitting multi-byte // UTF-8 characters. Returns a valid UTF-8 string of at most maxBytes bytes. // // Note: an identical implementation exists in budget/budget.go. The two // packages are intentionally separate (review does not import budget), so // the duplication is accepted rather than introducing a shared internal // package for a single small function. func truncateUTF8(s string, maxBytes int) string { if len(s) <= maxBytes { return s } for maxBytes > 0 && !utf8.RuneStart(s[maxBytes]) { maxBytes-- } return s[:maxBytes] }