02dfc12141
PR Ready Gate / clear-labels (pull_request) Successful in 2s
CI / test (pull_request) Successful in 17s
CI / review (anthropic--claude-4.6-sonnet, sonnet, SONNET_REVIEW_TOKEN) (pull_request) Successful in 30s
CI / review (gpt-5, gpt, GPT_REVIEW_TOKEN) (pull_request) Failing after 49s
CI / review (gpt-5, security, ., rodin/security-patterns, SECURITY_REVIEW.md, SECURITY_REVIEW_TOKEN) (pull_request) Failing after 1m11s
When --doc-map-trusted-ref is provided, the --doc-map value is used as a VCS API path parameter, not a local filesystem path. The early call to validateWorkspacePath (which requires the file to exist locally) blocked the trusted-ref code path when the doc-map did not exist in the local checkout — defeating the feature's purpose in sparse checkouts or when the file is only on the default branch. Fix: guard the early validation with `&& *docMapTrustedRef == ""`. Also fixes: - review/docmap.go: correct ParseDocMapConfigContent godoc example to match actual source format "owner/repo@ref:path" - cmd/review-bot/main_test.go: add TestMainSubprocess_DocMapTrustedRefSkipsLocalValidation to prevent regression
365 lines
11 KiB
Go
365 lines
11 KiB
Go
// doc-map parsing and doc injection for path-scoped design document context in AI code reviews.
|
|
package review
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"log/slog"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"unicode/utf8"
|
|
|
|
"github.com/goccy/go-yaml"
|
|
)
|
|
|
|
const (
|
|
// DefaultDocMapMaxBytes is the default cap on total injected doc content.
|
|
DefaultDocMapMaxBytes = 100 * 1024 // 100 KB
|
|
)
|
|
|
|
// DocMapping maps a set of path glob patterns to governing doc files/directories.
|
|
type DocMapping struct {
|
|
Paths []string `yaml:"paths"` // glob patterns matched against changed PR files
|
|
Docs []string `yaml:"docs"` // doc file paths or directories in the reviewed repo
|
|
}
|
|
|
|
// DocMapConfig is the top-level structure of a doc-map YAML file.
|
|
type DocMapConfig struct {
|
|
Mappings []DocMapping `yaml:"mappings"`
|
|
}
|
|
|
|
// DocMapOptions configures behavior for doc loading.
|
|
type DocMapOptions struct {
|
|
// MaxBytes caps the total size of injected doc content. Default: DefaultDocMapMaxBytes.
|
|
MaxBytes int
|
|
}
|
|
|
|
// DocFetcher reads file and directory content from a VCS repository.
|
|
// It is a subset of vcsClient, defined here to keep the review package free
|
|
// of cmd-level dependencies.
|
|
type DocFetcher interface {
|
|
// GetFileContent returns the content of a single file at default branch.
|
|
GetFileContent(ctx context.Context, owner, repo, path string) (string, error)
|
|
// GetAllFilesInPath returns all files (path → content) under a directory.
|
|
GetAllFilesInPath(ctx context.Context, owner, repo, path string) (map[string]string, error)
|
|
}
|
|
|
|
// ParseDocMapConfig reads and parses a doc-map YAML file from a local path.
|
|
// Unknown top-level keys produce a warning but are not fatal.
|
|
func ParseDocMapConfig(localPath string) (*DocMapConfig, error) {
|
|
data, err := readFileBytes(localPath)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("read doc-map file %q: %w", localPath, err)
|
|
}
|
|
return parseDocMapBytes(data, localPath)
|
|
}
|
|
|
|
// ParseDocMapConfigContent parses a doc-map YAML config from an in-memory
|
|
// string. The source parameter is used only for error messages and log entries
|
|
// (e.g. "owner/repo@main:.review-bot/doc-map.yml").
|
|
//
|
|
// Use this when the config content has been fetched from a trusted VCS ref
|
|
// rather than read from the local workspace.
|
|
func ParseDocMapConfigContent(content, source string) (*DocMapConfig, error) {
|
|
data := []byte(content)
|
|
return parseDocMapBytes(data, source)
|
|
}
|
|
|
|
// parseDocMapBytes is the shared YAML parse implementation used by
|
|
// ParseDocMapConfig and ParseDocMapConfigContent.
|
|
func parseDocMapBytes(data []byte, source string) (*DocMapConfig, error) {
|
|
var cfg DocMapConfig
|
|
if err := yaml.UnmarshalWithOptions(data, &cfg, yaml.Strict()); err != nil {
|
|
// Re-parse without strict mode to log which keys are unknown.
|
|
var relaxed DocMapConfig
|
|
if err2 := yaml.Unmarshal(data, &relaxed); err2 != nil {
|
|
return nil, fmt.Errorf("parse doc-map YAML %q: %w", source, err)
|
|
}
|
|
slog.Warn("doc-map YAML contains unknown keys (ignored)", "file", source, "error", err)
|
|
cfg = relaxed
|
|
}
|
|
return &cfg, nil
|
|
}
|
|
|
|
// FileCoveredByDocMap reports whether at least one paths: glob in any mapping
|
|
// of cfg matches the given file path. It is used by static validation tooling
|
|
// (e.g. the validate-docmap subcommand) to check per-file docmap coverage.
|
|
func FileCoveredByDocMap(cfg *DocMapConfig, file string) bool {
|
|
for _, mapping := range cfg.Mappings {
|
|
if mappingMatches(mapping.Paths, []string{file}) {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// MatchDocs returns deduplicated doc paths for the given changed file paths.
|
|
// A mapping matches if any of its path globs matches any of the changed files.
|
|
func MatchDocs(cfg *DocMapConfig, changedFiles []string) []string {
|
|
seen := make(map[string]struct{})
|
|
var result []string
|
|
|
|
for _, mapping := range cfg.Mappings {
|
|
if len(mapping.Paths) == 0 || len(mapping.Docs) == 0 {
|
|
continue
|
|
}
|
|
if mappingMatches(mapping.Paths, changedFiles) {
|
|
for _, doc := range mapping.Docs {
|
|
if doc == "" {
|
|
continue
|
|
}
|
|
if _, ok := seen[doc]; !ok {
|
|
seen[doc] = struct{}{}
|
|
result = append(result, doc)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return result
|
|
}
|
|
|
|
// mappingMatches returns true if any glob in patterns matches any file in files.
|
|
func mappingMatches(patterns, files []string) bool {
|
|
for _, pat := range patterns {
|
|
for _, f := range files {
|
|
if globMatch(pat, f) {
|
|
return true
|
|
}
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// globMatch matches a path against a glob pattern that may contain **.
|
|
// It supports:
|
|
// - filepath.Match patterns (*, ?, [range])
|
|
// - ** as a path segment that matches zero or more segments
|
|
// - Trailing /** to match a directory and all its contents
|
|
//
|
|
// The pattern and path use forward slash as separator.
|
|
func globMatch(pattern, path string) bool {
|
|
return globMatchParts(splitPath(pattern), splitPath(path))
|
|
}
|
|
|
|
// splitPath splits a slash-separated path into non-empty parts.
|
|
func splitPath(p string) []string {
|
|
// Clean and split on "/"
|
|
parts := strings.Split(p, "/")
|
|
result := make([]string, 0, len(parts))
|
|
for _, part := range parts {
|
|
if part != "" {
|
|
result = append(result, part)
|
|
}
|
|
}
|
|
return result
|
|
}
|
|
|
|
// globMatchParts recursively matches pattern parts against path parts.
|
|
func globMatchParts(patParts, pathParts []string) bool {
|
|
for len(patParts) > 0 {
|
|
pat := patParts[0]
|
|
if pat == "**" {
|
|
patParts = patParts[1:]
|
|
if len(patParts) == 0 {
|
|
// Trailing **: matches any remaining path (including empty).
|
|
return true
|
|
}
|
|
// ** in the middle: try matching the rest at every position.
|
|
for i := 0; i <= len(pathParts); i++ {
|
|
if globMatchParts(patParts, pathParts[i:]) {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
// Non-** segment: path must have a segment here.
|
|
if len(pathParts) == 0 {
|
|
return false
|
|
}
|
|
matched, err := filepath.Match(pat, pathParts[0])
|
|
if err != nil || !matched {
|
|
return false
|
|
}
|
|
patParts = patParts[1:]
|
|
pathParts = pathParts[1:]
|
|
}
|
|
// All pattern parts consumed; path must also be consumed.
|
|
return len(pathParts) == 0
|
|
}
|
|
|
|
// LoadMatchingDocs fetches content for the given doc paths via VCS and returns
|
|
// a formatted string suitable for injection into the system prompt.
|
|
//
|
|
// Behavior:
|
|
// - Paths that look like directories (end with /, or GetAllFilesInPath returns files)
|
|
// are expanded to all .md files under them.
|
|
// - Missing files are logged as warnings and skipped.
|
|
// - Total content is capped at opts.MaxBytes; truncation is noted inline.
|
|
func LoadMatchingDocs(ctx context.Context, fetcher DocFetcher, owner, repo string, docPaths []string, opts DocMapOptions) (string, error) {
|
|
if opts.MaxBytes <= 0 {
|
|
opts.MaxBytes = DefaultDocMapMaxBytes
|
|
}
|
|
|
|
var sb strings.Builder
|
|
totalBytes := 0
|
|
limitReached := false
|
|
|
|
for _, docPath := range docPaths {
|
|
if ctx.Err() != nil {
|
|
break
|
|
}
|
|
if limitReached {
|
|
slog.Warn("doc-map: context size limit reached, skipping remaining docs",
|
|
"remaining_path", docPath, "limit_bytes", opts.MaxBytes)
|
|
break
|
|
}
|
|
|
|
entries, err := loadDocEntries(ctx, fetcher, owner, repo, docPath)
|
|
if err != nil {
|
|
slog.Warn("doc-map: could not load doc, skipping", "path", docPath, "error", err)
|
|
continue
|
|
}
|
|
if len(entries) == 0 {
|
|
slog.Debug("doc-map: no .md files found under path", "path", docPath)
|
|
continue
|
|
}
|
|
|
|
for _, entry := range entries {
|
|
if limitReached {
|
|
break
|
|
}
|
|
available := opts.MaxBytes - totalBytes
|
|
if available <= 0 {
|
|
limitReached = true
|
|
sb.WriteString("\n\n> ⚠️ Design document context truncated — size limit reached.\n")
|
|
break
|
|
}
|
|
|
|
content := entry.content
|
|
truncated := false
|
|
if len(content) > available {
|
|
content = truncateUTF8(content, available)
|
|
truncated = true
|
|
limitReached = true
|
|
}
|
|
|
|
sb.WriteString("### ")
|
|
sb.WriteString(entry.path)
|
|
sb.WriteString("\n\n")
|
|
sb.WriteString(content)
|
|
sb.WriteString("\n")
|
|
if truncated {
|
|
sb.WriteString("\n> ⚠️ (truncated — size limit reached)\n")
|
|
}
|
|
totalBytes += len(content)
|
|
slog.Debug("doc-map: injected doc", "path", entry.path, "bytes", len(content))
|
|
}
|
|
}
|
|
|
|
if sb.Len() == 0 {
|
|
return "", nil
|
|
}
|
|
return sb.String(), nil
|
|
}
|
|
|
|
// docEntry holds a single doc file path and content.
|
|
type docEntry struct {
|
|
path string
|
|
content string
|
|
}
|
|
|
|
// loadDocEntries returns the doc content for a given path.
|
|
// If the path is a directory, all .md files under it are returned.
|
|
// If it's a file, a single entry is returned.
|
|
func loadDocEntries(ctx context.Context, fetcher DocFetcher, owner, repo, docPath string) ([]docEntry, error) {
|
|
if err := ValidateDocPath(docPath); err != nil {
|
|
return nil, fmt.Errorf("doc path %q rejected: %w", docPath, err)
|
|
}
|
|
|
|
// Try directory expansion first.
|
|
files, dirErr := fetcher.GetAllFilesInPath(ctx, owner, repo, docPath)
|
|
if dirErr == nil && len(files) > 0 {
|
|
// Filter for .md files only.
|
|
var entries []docEntry
|
|
for path, content := range files {
|
|
if isMDFile(path) {
|
|
entries = append(entries, docEntry{path: path, content: content})
|
|
}
|
|
}
|
|
// Sort for deterministic output.
|
|
sortDocEntries(entries)
|
|
return entries, nil
|
|
}
|
|
|
|
// Directory expansion returned nothing; log and fall through to single-file fetch.
|
|
if dirErr != nil {
|
|
slog.Debug("doc-map: directory expansion failed, trying as single file", "path", docPath, "error", dirErr)
|
|
}
|
|
|
|
// Try as a single file.
|
|
content, fileErr := fetcher.GetFileContent(ctx, owner, repo, docPath)
|
|
if fileErr != nil {
|
|
// Return the file error (more specific than directory error).
|
|
return nil, fmt.Errorf("fetch doc %q: %w", docPath, fileErr)
|
|
}
|
|
return []docEntry{{path: docPath, content: content}}, nil
|
|
}
|
|
|
|
// isMDFile returns true if the file has a .md extension.
|
|
func isMDFile(path string) bool {
|
|
return strings.HasSuffix(strings.ToLower(path), ".md")
|
|
}
|
|
|
|
// sortDocEntries sorts entries by path for deterministic output.
|
|
func sortDocEntries(entries []docEntry) {
|
|
// Simple insertion sort (doc lists are small).
|
|
for i := 1; i < len(entries); i++ {
|
|
for j := i; j > 0 && entries[j].path < entries[j-1].path; j-- {
|
|
entries[j], entries[j-1] = entries[j-1], entries[j]
|
|
}
|
|
}
|
|
}
|
|
|
|
// readFileBytes reads the contents of a local file.
|
|
func readFileBytes(path string) ([]byte, error) {
|
|
return os.ReadFile(path)
|
|
}
|
|
|
|
// ValidateDocPath rejects doc paths that could cause path traversal
|
|
// (absolute paths, any ".." segment, backslashes). Defense-in-depth: callers
|
|
// must also confine the joined path to the repo root via filepath.Rel before
|
|
// any filesystem access. Backslashes are rejected explicitly to prevent
|
|
// Windows platform edge cases.
|
|
func ValidateDocPath(p string) error {
|
|
if strings.Contains(p, "\\") {
|
|
return fmt.Errorf("backslashes not allowed in doc paths")
|
|
}
|
|
if filepath.IsAbs(p) {
|
|
return fmt.Errorf("absolute paths not allowed")
|
|
}
|
|
for _, segment := range strings.Split(p, "/") {
|
|
if segment == ".." {
|
|
return fmt.Errorf("path traversal ('..' segment) not allowed")
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// truncateUTF8 truncates s to at most maxBytes without splitting multi-byte
|
|
// UTF-8 characters. Returns a valid UTF-8 string of at most maxBytes bytes.
|
|
//
|
|
// Note: an identical implementation exists in budget/budget.go. The two
|
|
// packages are intentionally separate (review does not import budget), so
|
|
// the duplication is accepted rather than introducing a shared internal
|
|
// package for a single small function.
|
|
func truncateUTF8(s string, maxBytes int) string {
|
|
if len(s) <= maxBytes {
|
|
return s
|
|
}
|
|
for maxBytes > 0 && !utf8.RuneStart(s[maxBytes]) {
|
|
maxBytes--
|
|
}
|
|
return s[:maxBytes]
|
|
}
|