feat(#141): validate-docmap subcommand — CI hard-fail on missing docmap coverage #142
@@ -83,6 +83,8 @@ func runValidateDocmap(args []string) int {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// --- Check 2: Stale docs ---
|
// --- Check 2: Stale docs ---
|
||||||
|
// checkStaleDocs validates each path before touching the filesystem; see
|
||||||
|
security-review-bot marked this conversation as resolved
Outdated
|
|||||||
|
// its documentation for the path-traversal hardening applied.
|
||||||
repoRoot := filepath.Clean(*repoRootFlag)
|
repoRoot := filepath.Clean(*repoRootFlag)
|
||||||
staleDocs := checkStaleDocs(cfg, repoRoot)
|
staleDocs := checkStaleDocs(cfg, repoRoot)
|
||||||
|
sonnet-review-bot
commented
[NIT] The error message 'ERROR: stale docmap docs: entries (paths do not exist):' has an awkward colon after 'docs:' — this reads as if 'docs:' is YAML syntax leaking into the message. Consider 'ERROR: stale docmap entries (docs: paths that do not exist on disk):' or simply 'ERROR: docs: entries not found on disk:'. **[NIT]** The error message 'ERROR: stale docmap docs: entries (paths do not exist):' has an awkward colon after 'docs:' — this reads as if 'docs:' is YAML syntax leaking into the message. Consider 'ERROR: stale docmap entries (docs: paths that do not exist on disk):' or simply 'ERROR: docs: entries not found on disk:'.
|
|||||||
if len(staleDocs) > 0 {
|
if len(staleDocs) > 0 {
|
||||||
@@ -101,7 +103,14 @@ func runValidateDocmap(args []string) int {
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
// checkStaleDocs returns deduplicated docs: entries that do not exist under repoRoot.
|
// checkStaleDocs returns deduplicated docs: entries that do not exist under
|
||||||
|
gpt-review-bot
commented
[NIT] Deduplication of docs: entries uses the raw docPath string; consider normalizing (e.g., filepath.Clean and using forward slashes) before inserting into the seen set to avoid duplicate reports for equivalent paths like "docs/./a.md" vs "docs/a.md". **[NIT]** Deduplication of docs: entries uses the raw docPath string; consider normalizing (e.g., filepath.Clean and using forward slashes) before inserting into the seen set to avoid duplicate reports for equivalent paths like "docs/./a.md" vs "docs/a.md".
|
|||||||
|
// repoRoot.
|
||||||
|
//
|
||||||
|
// Path-traversal hardening: each docPath is validated with
|
||||||
|
// review.ValidateDocPath (rejects absolute paths and ".." segments) and then
|
||||||
|
// confined to repoRoot via filepath.Clean + filepath.Rel before os.Stat is
|
||||||
|
// called. Paths that fail either check are treated as invalid (reported as
|
||||||
|
// stale) without touching the host filesystem.
|
||||||
func checkStaleDocs(cfg *review.DocMapConfig, repoRoot string) []string {
|
func checkStaleDocs(cfg *review.DocMapConfig, repoRoot string) []string {
|
||||||
seen := make(map[string]struct{})
|
seen := make(map[string]struct{})
|
||||||
var stale []string
|
var stale []string
|
||||||
@@ -116,7 +125,25 @@ func checkStaleDocs(cfg *review.DocMapConfig, repoRoot string) []string {
|
|||||||
}
|
}
|
||||||
seen[docPath] = struct{}{}
|
seen[docPath] = struct{}{}
|
||||||
|
|
||||||
fullPath := filepath.Join(repoRoot, filepath.FromSlash(docPath))
|
// Guard 1: reject absolute paths and ".." segments sourced from
|
||||||
|
sonnet-review-bot
commented
[NIT] readLines uses bufio.Scanner with the default 64KB token buffer. Very long file paths (unlikely but possible with generated paths) would silently be truncated by scanner.Scan() returning false without error when a line exceeds the buffer. For a CI tool reading git diff output this is practically fine, but a comment noting the 64KB line limit or using scanner.Buffer to increase it would be defensive. **[NIT]** readLines uses bufio.Scanner with the default 64KB token buffer. Very long file paths (unlikely but possible with generated paths) would silently be truncated by scanner.Scan() returning false without error when a line exceeds the buffer. For a CI tool reading git diff output this is practically fine, but a comment noting the 64KB line limit or using scanner.Buffer to increase it would be defensive.
|
|||||||
|
// PR-controlled YAML before joining with repoRoot.
|
||||||
|
if err := review.ValidateDocPath(docPath); err != nil {
|
||||||
|
stale = append(stale, docPath)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Guard 2: verify the cleaned joined path does not escape repoRoot.
|
||||||
|
// filepath.Clean resolves any remaining ".." after the join; the
|
||||||
|
// filepath.Rel check confirms the path is still under repoRoot.
|
||||||
|
security-review-bot marked this conversation as resolved
Outdated
[MAJOR] Symlink traversal allows probing existence of arbitrary host files. checkStaleDocs uses os.Stat on repo-controlled paths after Clean/Rel checks, but does not prevent symlinks under repoRoot from pointing outside the workspace. A malicious PR can add a symlink in the repo and reference it in doc-map to infer presence/absence of sensitive files on the CI runner via pass/fail or logs. Mitigate by using os.Lstat to reject symlinks, or resolve with filepath.EvalSymlinks and re-validate the resolved path remains within repoRoot before stat. **[MAJOR]** Symlink traversal allows probing existence of arbitrary host files. checkStaleDocs uses os.Stat on repo-controlled paths after Clean/Rel checks, but does not prevent symlinks under repoRoot from pointing outside the workspace. A malicious PR can add a symlink in the repo and reference it in doc-map to infer presence/absence of sensitive files on the CI runner via pass/fail or logs. Mitigate by using os.Lstat to reject symlinks, or resolve with filepath.EvalSymlinks and re-validate the resolved path remains within repoRoot before stat.
|
|||||||
|
fullPath := filepath.Clean(filepath.Join(repoRoot, filepath.FromSlash(docPath)))
|
||||||
|
rel, err := filepath.Rel(repoRoot, fullPath)
|
||||||
|
if err != nil || strings.HasPrefix(rel, "..") {
|
||||||
|
stale = append(stale, docPath)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Safe to stat: path is relative, contains no "..", and is
|
||||||
|
// confined within repoRoot.
|
||||||
if _, err := os.Stat(fullPath); err != nil {
|
if _, err := os.Stat(fullPath); err != nil {
|
||||||
stale = append(stale, docPath)
|
stale = append(stale, docPath)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -288,3 +288,50 @@ mappings:
|
|||||||
t.Errorf("expected docs/shared.md to appear exactly once in stderr (deduplicated), got %d occurrences: %q", count, stderr)
|
t.Errorf("expected docs/shared.md to appear exactly once in stderr (deduplicated), got %d occurrences: %q", count, stderr)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestCheckStaleDocs_PathTraversal verifies that checkStaleDocs rejects
|
||||||
|
// traversal and absolute paths without touching the host filesystem.
|
||||||
|
func TestCheckStaleDocs_PathTraversal(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
|
||||||
|
// Baseline: a valid doc that exists.
|
||||||
|
makeDocFile(t, dir, "docs/valid.md")
|
||||||
|
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
docPath string
|
||||||
|
wantStale bool
|
||||||
|
}{
|
||||||
|
{"dot-dot traversal", "../../etc/passwd", true},
|
||||||
|
{"dot-dot single", "../outside", true},
|
||||||
|
{"absolute path", "/etc/passwd", true},
|
||||||
|
{"valid present path", "docs/valid.md", false},
|
||||||
|
{"valid missing path", "docs/missing.md", true},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tc := range tests {
|
||||||
|
t.Run(tc.name, func(t *testing.T) {
|
||||||
|
docmap := makeDocmapYAML(t, `
|
||||||
|
mappings:
|
||||||
|
- paths:
|
||||||
|
- "lib/**"
|
||||||
|
docs:
|
||||||
|
- `+tc.docPath+`
|
||||||
|
`)
|
||||||
|
code, _, stderr := stdinValidateDocmap(t,
|
||||||
|
"",
|
||||||
|
[]string{"--docmap", docmap, "--repo-root", dir},
|
||||||
|
sonnet-review-bot
commented
[NIT] In **[NIT]** In `TestCheckStaleDocs_PathTraversal`, the inner table test creates a docmap using string concatenation with the docPath directly embedded in YAML (`- `+tc.docPath+``). For paths like `/etc/passwd` and `../../etc/passwd`, this creates valid YAML. However, for paths containing special YAML characters (`:`, `#`, `{`), this could produce invalid YAML that would fail at parse time rather than at the stale-docs check, making the test misleading. The current test cases are safe, but using `fmt.Sprintf` with YAML quoting or writing the file programmatically would be more robust.
|
|||||||
|
)
|
||||||
|
|
||||||
|
if tc.wantStale {
|
||||||
|
if code != 1 {
|
||||||
|
t.Errorf("path %q: expected exit 1 (stale/invalid), got %d; stderr: %q", tc.docPath, code, stderr)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if code != 0 {
|
||||||
|
t.Errorf("path %q: expected exit 0 (valid), got %d; stderr: %q", tc.docPath, code, stderr)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -257,7 +257,7 @@ type docEntry struct {
|
|||||||
// If the path is a directory, all .md files under it are returned.
|
// If the path is a directory, all .md files under it are returned.
|
||||||
// If it's a file, a single entry is returned.
|
// If it's a file, a single entry is returned.
|
||||||
func loadDocEntries(ctx context.Context, fetcher DocFetcher, owner, repo, docPath string) ([]docEntry, error) {
|
func loadDocEntries(ctx context.Context, fetcher DocFetcher, owner, repo, docPath string) ([]docEntry, error) {
|
||||||
if err := validateDocPath(docPath); err != nil {
|
if err := ValidateDocPath(docPath); err != nil {
|
||||||
return nil, fmt.Errorf("doc path %q rejected: %w", docPath, err)
|
return nil, fmt.Errorf("doc path %q rejected: %w", docPath, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -310,11 +310,11 @@ func readFileBytes(path string) ([]byte, error) {
|
|||||||
return os.ReadFile(path)
|
return os.ReadFile(path)
|
||||||
|
sonnet-review-bot
commented
[NIT] The comment on ValidateDocPath says 'Backslashes are rejected explicitly to prevent Windows platform edge cases.' — the sentence could be clearer that the tool itself may run on Windows and that backslashes in YAML doc paths could be misinterpreted by filepath.Join on that platform. Minor documentation clarity. **[NIT]** The comment on ValidateDocPath says 'Backslashes are rejected explicitly to prevent Windows platform edge cases.' — the sentence could be clearer that the tool itself may run on Windows and that backslashes in YAML doc paths could be misinterpreted by filepath.Join on that platform. Minor documentation clarity.
|
|||||||
}
|
}
|
||||||
|
|
||||||
// validateDocPath rejects doc paths that could cause path traversal via the
|
// ValidateDocPath rejects doc paths that could cause path traversal via the
|
||||||
// VCS API (absolute paths, any ".." segment). Defense-in-depth: the VCS API
|
// VCS API (absolute paths, any ".." segment). Defense-in-depth: the VCS API
|
||||||
// should already scope paths to the repo, but we validate locally to avoid
|
// should already scope paths to the repo, but we validate locally to avoid
|
||||||
|
sonnet-review-bot
commented
[NIT] The comment on **[NIT]** The comment on `ValidateDocPath` mentions "Finding #3" which appears to be a reference to an internal issue/finding number that won't be meaningful to future readers of this file. This should be reworded to be self-contained (e.g., remove the parenthetical or rephrase as 'to prevent OS-specific path separator normalization issues').
|
|||||||
// any quirk in backend path handling.
|
// any quirk in backend path handling.
|
||||||
func validateDocPath(p string) error {
|
func ValidateDocPath(p string) error {
|
||||||
if filepath.IsAbs(p) {
|
if filepath.IsAbs(p) {
|
||||||
return fmt.Errorf("absolute paths not allowed")
|
return fmt.Errorf("absolute paths not allowed")
|
||||||
|
sonnet-review-bot
commented
[NIT] The comment on **[NIT]** The comment on `ValidateDocPath` says "Defense-in-depth: the VCS API should already scope paths to the repo" — this is accurate for the VCS-fetch path, but `ValidateDocPath` is now also used by the local-filesystem stale-docs check where the VCS API is not involved. The comment is still broadly correct but could be generalized: "Defense-in-depth: callers should also confine the joined path to the repo root via filepath.Rel before filesystem access."
|
|||||||
}
|
}
|
||||||
|
security-review-bot marked this conversation as resolved
Outdated
[NIT] ValidateDocPath splits on '/' only. While later checks use filepath.FromSlash and Rel, consider normalizing or explicitly rejecting backslashes in paths to avoid platform-specific edge cases on Windows. **[NIT]** ValidateDocPath splits on '/' only. While later checks use filepath.FromSlash and Rel, consider normalizing or explicitly rejecting backslashes in paths to avoid platform-specific edge cases on Windows.
|
|||||||
|
|||||||
@@ -11,7 +11,7 @@ import (
|
|||||||
|
|
||||||
// fakeDocFetcher is a mock DocFetcher for tests.
|
// fakeDocFetcher is a mock DocFetcher for tests.
|
||||||
type fakeDocFetcher struct {
|
type fakeDocFetcher struct {
|
||||||
files map[string]string // path -> content
|
files map[string]string // path -> content
|
||||||
dirs map[string]map[string]string // dir path -> (file path -> content)
|
dirs map[string]map[string]string // dir path -> (file path -> content)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -384,7 +384,7 @@ func TestValidateDocPath(t *testing.T) {
|
|||||||
"a/b/c",
|
"a/b/c",
|
||||||
}
|
}
|
||||||
for _, p := range valid {
|
for _, p := range valid {
|
||||||
if err := validateDocPath(p); err != nil {
|
if err := ValidateDocPath(p); err != nil {
|
||||||
t.Errorf("expected valid path %q to pass, got error: %v", p, err)
|
t.Errorf("expected valid path %q to pass, got error: %v", p, err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -397,7 +397,7 @@ func TestValidateDocPath(t *testing.T) {
|
|||||||
"a/b/../c",
|
"a/b/../c",
|
||||||
}
|
}
|
||||||
for _, p := range invalid {
|
for _, p := range invalid {
|
||||||
if err := validateDocPath(p); err == nil {
|
if err := ValidateDocPath(p); err == nil {
|
||||||
t.Errorf("expected path %q to be rejected, but it was accepted", p)
|
t.Errorf("expected path %q to be rejected, but it was accepted", p)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
[MINOR] repoRoot is only cleaned (filepath.Clean) but not resolved; resolving repoRoot with filepath.Abs + filepath.EvalSymlinks before Rel checks would harden against cases where the root itself is a symlink and reduce ambiguity across platforms.
[MINOR] The stale-docs check calls
filepath.EvalSymlinks(absRoot)on the--repo-rootpath, which will fail with an error if the directory does not yet exist (e.g. a new repo checkout where the root hasn't been created). In practice--repo-rootdefaults to.(cwd) which always exists, so this is low-risk, but an explicitos.Statcheck beforeEvalSymlinkswould give a clearer error message when the root is genuinely missing vs. a symlink resolution failure.