diff --git a/.gitea/actions/review/action.yml b/.gitea/actions/review/action.yml index e1ff093..17649e3 100644 --- a/.gitea/actions/review/action.yml +++ b/.gitea/actions/review/action.yml @@ -141,6 +141,16 @@ inputs: description: 'Maximum bytes of injected doc content from doc-map (default 102400 = 100KB)' required: false default: '102400' + doc-map-trusted-ref: + description: >- + Git ref (branch, tag, or SHA) from which to fetch the doc-map config file + via VCS API instead of reading it from the local workspace. Recommended + when using doc-map: set this to the default branch (e.g. 'main') so a + malicious PR cannot modify the doc-map config to inject arbitrary design + docs into the LLM prompt. When unset, the config is read from the local + workspace (the PR branch) with a security warning in the logs. + required: false + default: '' runs: using: 'composite' @@ -507,6 +517,7 @@ runs: PERSONA_FILE: ${{ inputs.persona-file }} DOC_MAP_FILE: ${{ inputs.doc-map }} DOC_MAP_MAX_BYTES: ${{ inputs.doc-map-max-bytes }} + DOC_MAP_TRUSTED_REF: ${{ inputs.doc-map-trusted-ref }} AICORE_CLIENT_ID: ${{ inputs.aicore-client-id }} AICORE_CLIENT_SECRET: ${{ inputs.aicore-client-secret }} AICORE_AUTH_URL: ${{ inputs.aicore-auth-url }} diff --git a/CHANGELOG.md b/CHANGELOG.md index e277c63..03e2d18 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ ### Security - **`validateDocmapPath`: add `EvalSymlinks` to close directory-symlink bypass** ([#150](https://gitea.weiker.me/rodin/review-bot/issues/150)): The previous implementation used `os.Lstat` which only avoids following the *final* path component. An intermediate directory symlink (e.g. `.review-bot/` committed as a symlink to a directory outside the repo) would pass the path-confinement check because the textual path appeared within the repo root. `filepath.EvalSymlinks` is now called first, resolving all symlink components before the `filepath.Rel` confinement check. In-repo symlinks whose resolved targets also reside within the repo root are now allowed; out-of-repo targets are rejected by the confinement check. +- **`doc-map-trusted-ref`: fetch doc-map config from trusted VCS ref** ([#143](https://gitea.weiker.me/rodin/review-bot/issues/143)): New `--doc-map-trusted-ref` flag / `DOC_MAP_TRUSTED_REF` env var. When set, the doc-map YAML config is fetched from the specified VCS ref (e.g. `main`) via API instead of being read from the local workspace (the PR branch checkout). This prevents a malicious PR from modifying `.review-bot/doc-map.yml` to inject arbitrary design docs into the LLM prompt. When unset, the local workspace is used with a security warning in the logs. ### Tests @@ -12,6 +13,8 @@ ### Added +- **`doc-map-trusted-ref` input** (`--doc-map-trusted-ref` flag / `DOC_MAP_TRUSTED_REF` env var): Git ref (branch, tag, or SHA) from which to fetch the doc-map config via VCS API. Recommended for all `doc-map` users. Example: `doc-map-trusted-ref: main`. ([#143](https://gitea.weiker.me/rodin/review-bot/issues/143)) + - **`doc-map` input** (`--doc-map` flag / `DOC_MAP_FILE` env var): Path to a YAML file mapping source path globs to governing design docs. review-bot intersects the map with changed PR paths and injects matching docs into the system prompt under a `## Design Documents` heading. ([#137](https://gitea.weiker.me/rodin/review-bot/issues/137)) - **`doc-map-max-bytes` input** (`--doc-map-max-bytes` flag / `DOC_MAP_MAX_BYTES` env var): Cap on total injected design doc content in bytes. Default: 102400 (100 KB). Prevents accidental context overflow when a PR touches many modules. - **`DesignDocs` budget section**: Design docs are included in the context budget and trimmed after conventions, before file context, if the total exceeds the model's context limit. diff --git a/README.md b/README.md index fd2bdd9..2c9d14e 100644 --- a/README.md +++ b/README.md @@ -210,6 +210,7 @@ AI Core handles OAuth token management and deployment discovery automatically. M | `system-prompt-file` | No | `""` | Local file with additional system prompt instructions | | `doc-map` | No | `""` | Path to a YAML file mapping source path globs to governing design docs | | `doc-map-max-bytes` | No | `102400` | Maximum bytes of injected doc content from doc-map (default 100KB) | +| `doc-map-trusted-ref` | No | `""` | Git ref (e.g. `main`) to fetch the doc-map config from via VCS API instead of local workspace. **Recommended for security** — prevents a PR from modifying the doc-map config to inject arbitrary docs. | | `persona` | No | `""` | Built-in persona name (security, architect, docs) | | `persona-file` | No | `""` | Path to persona file (YAML or JSON) with custom review focus | | `temperature` | No | `0` | LLM temperature (0 = server default) | diff --git a/cmd/review-bot/main.go b/cmd/review-bot/main.go index f489e53..9f9fbb5 100644 --- a/cmd/review-bot/main.go +++ b/cmd/review-bot/main.go @@ -101,6 +101,7 @@ func main() { aicoreResourceGroup := flag.String("aicore-resource-group", envOrDefault("AICORE_RESOURCE_GROUP", "default"), "SAP AI Core resource group (for provider=aicore)") docMapFile := flag.String("doc-map", envOrDefault("DOC_MAP_FILE", ""), "Path to YAML file mapping source path globs to governing design docs") docMapMaxBytes := flag.Int("doc-map-max-bytes", envOrDefaultInt("DOC_MAP_MAX_BYTES", review.DefaultDocMapMaxBytes), "Maximum bytes of injected doc content (default 102400)") + docMapTrustedRef := flag.String("doc-map-trusted-ref", envOrDefault("DOC_MAP_TRUSTED_REF", ""), "Git ref (e.g. main) to fetch the doc-map config from via VCS API instead of local workspace. Recommended to prevent PR branch from controlling which docs are injected.") flag.Parse() @@ -173,9 +174,12 @@ func main() { os.Exit(1) } - // Early validation of filesystem-path flags (fail fast before network I/O) + // Early validation of filesystem-path flags (fail fast before network I/O). + // Skip local-path validation when --doc-map-trusted-ref is set: the flag + // value is used as a VCS API path, not a local filesystem path, and the + // file may not exist in the local checkout (sparse, PR-deleted, etc.). var resolvedDocMapFile string - if *docMapFile != "" { + if *docMapFile != "" && *docMapTrustedRef == "" { resolved, err := validateWorkspacePath(*docMapFile, "doc-map") if err != nil { slog.Error("invalid doc-map path", "error", err) @@ -368,10 +372,45 @@ func main() { // Step 6c: Load path-scoped design docs if doc-map specified designDocs := "" if *docMapFile != "" { - docMapCfg, err := review.ParseDocMapConfig(resolvedDocMapFile) - if err != nil { - slog.Error("failed to parse doc-map file", "file", *docMapFile, "error", err) - os.Exit(1) + var docMapCfg *review.DocMapConfig + + if *docMapTrustedRef != "" { + // Fetch doc-map config from a trusted VCS ref (e.g. the default branch). + // This prevents a malicious PR from modifying the doc-map config to + // inject arbitrary docs into the LLM prompt. + slog.Info("doc-map: fetching config from trusted ref", + "path", *docMapFile, + "ref", *docMapTrustedRef) + content, fetchErr := vcs.GetFileContentRef(ctx, owner, repoName, *docMapFile, *docMapTrustedRef) + if fetchErr != nil { + slog.Error("doc-map: failed to fetch config from trusted ref", + "path", *docMapFile, + "ref", *docMapTrustedRef, + "error", fetchErr) + os.Exit(1) + } + source := fmt.Sprintf("%s/%s@%s:%s", owner, repoName, *docMapTrustedRef, *docMapFile) + var parseErr error + docMapCfg, parseErr = review.ParseDocMapConfigContent(content, source) + if parseErr != nil { + slog.Error("doc-map: failed to parse fetched config", + "source", source, + "error", parseErr) + os.Exit(1) + } + } else { + // Local workspace fallback — the doc-map is read from the PR branch checkout. + // SECURITY WARNING: a malicious PR can modify this file to inject arbitrary + // docs. Set --doc-map-trusted-ref (or DOC_MAP_TRUSTED_REF) to a trusted ref + // (e.g. "main") to fetch the config from the default branch instead. + slog.Warn("doc-map: loading config from local workspace (PR branch) — " + + "set --doc-map-trusted-ref to fetch from a trusted ref for security") + var parseErr error + docMapCfg, parseErr = review.ParseDocMapConfig(resolvedDocMapFile) + if parseErr != nil { + slog.Error("failed to parse doc-map file", "file", *docMapFile, "error", parseErr) + os.Exit(1) + } } // Collect changed file paths from the PR for intersection. @@ -385,10 +424,11 @@ func main() { if len(matchedDocs) > 0 { docMapOpts := review.DocMapOptions{MaxBytes: *docMapMaxBytes} - designDocs, err = review.LoadMatchingDocs(ctx, vcs, owner, repoName, matchedDocs, docMapOpts) - if err != nil { + var loadErr error + designDocs, loadErr = review.LoadMatchingDocs(ctx, vcs, owner, repoName, matchedDocs, docMapOpts) + if loadErr != nil { // Non-fatal: individual missing files are already warned; log and continue. - slog.Warn("doc-map: partial failure loading docs", "error", err) + slog.Warn("doc-map: partial failure loading docs", "error", loadErr) } if designDocs != "" { slog.Info("doc-map: injected design docs", "matched", len(matchedDocs), "bytes", len(designDocs)) diff --git a/cmd/review-bot/main_test.go b/cmd/review-bot/main_test.go index e9d32c8..db6d536 100644 --- a/cmd/review-bot/main_test.go +++ b/cmd/review-bot/main_test.go @@ -1578,3 +1578,47 @@ func TestMainSubprocess_InvalidDocMapFile(t *testing.T) { t.Errorf("expected error about failed resolution, got: %s", output) } } + +// TestMainSubprocess_DocMapTrustedRefSkipsLocalValidation confirms that +// --doc-map-trusted-ref bypasses local filesystem validation for --doc-map. +// When the trusted-ref flag is set, the doc-map value is used as a VCS API +// path; a nonexistent local file must not cause an early exit before network I/O. +func TestMainSubprocess_DocMapTrustedRefSkipsLocalValidation(t *testing.T) { + if os.Getenv("TEST_SUBPROCESS_MAIN") == "1" { + flag.CommandLine = flag.NewFlagSet(os.Args[0], flag.ExitOnError) + os.Args = []string{"review-bot", + "--vcs-url", "https://gitea.example.com", + "--repo", "owner/repo", + "--pr", "1", + "--reviewer-token", "tok", + "--llm-base-url", "https://api.example.com", + "--llm-api-key", "key", + "--llm-model", "gpt-4", + "--doc-map", "nonexistent-local.yml", + "--doc-map-trusted-ref", "main", + } + main() + return + } + + cmd := exec.Command(os.Args[0], "-test.run=TestMainSubprocess_DocMapTrustedRefSkipsLocalValidation") + cmd.Env = append(cleanEnv(), + "TEST_SUBPROCESS_MAIN=1", + "GITHUB_WORKSPACE="+t.TempDir(), + ) + out, err := cmd.CombinedOutput() + output := string(out) + + // The test must fail (network I/O or VCS API failure) but must NOT + // fail with the local filesystem validation error. + // "failed to resolve" would indicate the early validateWorkspacePath ran — + // that would be the bug this test is catching. + if strings.Contains(output, "failed to resolve") { + t.Errorf("--doc-map-trusted-ref should skip local path validation, but got filesystem error: %s", output) + } + + // It must still exit non-zero (real VCS call to example.com will fail). + if err == nil { + t.Fatal("expected non-zero exit when VCS API is unreachable, got success") + } +} diff --git a/review/docmap.go b/review/docmap.go index 4381329..1789e63 100644 --- a/review/docmap.go +++ b/review/docmap.go @@ -52,15 +52,31 @@ func ParseDocMapConfig(localPath string) (*DocMapConfig, error) { if err != nil { return nil, fmt.Errorf("read doc-map file %q: %w", localPath, err) } + return parseDocMapBytes(data, localPath) +} +// ParseDocMapConfigContent parses a doc-map YAML config from an in-memory +// string. The source parameter is used only for error messages and log entries +// (e.g. "owner/repo@main:.review-bot/doc-map.yml"). +// +// Use this when the config content has been fetched from a trusted VCS ref +// rather than read from the local workspace. +func ParseDocMapConfigContent(content, source string) (*DocMapConfig, error) { + data := []byte(content) + return parseDocMapBytes(data, source) +} + +// parseDocMapBytes is the shared YAML parse implementation used by +// ParseDocMapConfig and ParseDocMapConfigContent. +func parseDocMapBytes(data []byte, source string) (*DocMapConfig, error) { var cfg DocMapConfig if err := yaml.UnmarshalWithOptions(data, &cfg, yaml.Strict()); err != nil { // Re-parse without strict mode to log which keys are unknown. var relaxed DocMapConfig if err2 := yaml.Unmarshal(data, &relaxed); err2 != nil { - return nil, fmt.Errorf("parse doc-map YAML %q: %w", localPath, err) + return nil, fmt.Errorf("parse doc-map YAML %q: %w", source, err) } - slog.Warn("doc-map YAML contains unknown keys (ignored)", "file", localPath, "error", err) + slog.Warn("doc-map YAML contains unknown keys (ignored)", "file", source, "error", err) cfg = relaxed } return &cfg, nil diff --git a/review/docmap_test.go b/review/docmap_test.go index 2674d15..da2e177 100644 --- a/review/docmap_test.go +++ b/review/docmap_test.go @@ -510,3 +510,63 @@ func TestFileCoveredByDocMap_EmptyConfig(t *testing.T) { t.Error("expected false for empty config, got true") } } + +// ============================================================ +// ParseDocMapConfigContent +// ============================================================ + +func TestParseDocMapConfigContent_Valid(t *testing.T) { + content := ` +mappings: + - paths: + - "lib/foo/**" + docs: + - docs/foo.md +` + cfg, err := ParseDocMapConfigContent(content, "owner/repo@main:.review-bot/doc-map.yml") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(cfg.Mappings) != 1 { + t.Fatalf("expected 1 mapping, got %d", len(cfg.Mappings)) + } + if len(cfg.Mappings[0].Docs) != 1 || cfg.Mappings[0].Docs[0] != "docs/foo.md" { + t.Errorf("unexpected mapping: %+v", cfg.Mappings[0]) + } +} + +func TestParseDocMapConfigContent_EmptyContent(t *testing.T) { + cfg, err := ParseDocMapConfigContent("", "test-source") + if err != nil { + t.Fatalf("unexpected error for empty content: %v", err) + } + if len(cfg.Mappings) != 0 { + t.Errorf("expected 0 mappings for empty content, got %d", len(cfg.Mappings)) + } +} + +func TestParseDocMapConfigContent_InvalidYAML(t *testing.T) { + _, err := ParseDocMapConfigContent("mappings: [{{invalid", "test-source") + if err == nil { + t.Fatal("expected error for invalid YAML, got nil") + } +} + +func TestParseDocMapConfigContent_UnknownKeys(t *testing.T) { + content := ` +mappings: + - paths: + - "lib/**" + docs: + - docs/foo.md +unknown_top_level_key: "should be warned but not fatal" +` + // Unknown top-level keys produce a warning but not an error. + cfg, err := ParseDocMapConfigContent(content, "test-source") + if err != nil { + t.Fatalf("unexpected error for unknown keys: %v", err) + } + if len(cfg.Mappings) == 0 { + t.Error("expected mappings to be parsed despite unknown key") + } +}