feat: add YAML support for persona files #58
@@ -9,7 +9,7 @@ AI-powered code review bot for Gitea pull requests. Fetches diff + context, send
|
||||
- **Smart budget**: Automatically trims context to fit model token limits
|
||||
- **Idempotent reviews**: Posts new review, then cleans up stale ones (one review per bot)
|
||||
- **Custom prompts**: Load additional instructions from a file (e.g. security-focused review)
|
||||
- **Zero dependencies**: Go stdlib only
|
||||
- **Minimal dependencies**: Go stdlib + `gopkg.in/yaml.v3` only
|
||||
|
||||
## Quick Start: Composite Action
|
||||
|
||||
|
||||
@@ -9,7 +9,7 @@ JSON is awkward for persona files that contain multi-line text (identity, severi
|
||||
- Backwards compatibility: existing JSON personas must continue to work
|
||||
- Security: protect against DoS via deeply nested YAML (AIKIDO-2024-10486)
|
||||
- Consistency: use `.yaml` extension (not `.yml`)
|
||||
|
|
||||
- Library: use `github.com/goccy/go-yaml` v1.16.0+ (actively maintained, security fix applied)
|
||||
- Library: use `gopkg.in/yaml.v3` (approved in CONVENTIONS.md) with explicit depth limiting
|
||||
|
gpt-review-bot
commented
[NIT] The design doc checklist references adding a "go-yaml" dependency at v1.16.0+, which doesn't align with the actual **[NIT]** The design doc checklist references adding a "go-yaml" dependency at v1.16.0+, which doesn't align with the actual `gopkg.in/yaml.v3 v3.0.1` dependency used. Consider updating the doc to avoid confusion.
|
||||
|
||||
## Proposed Approach
|
||||
|
sonnet-review-bot
commented
[MINOR] The design document says to use **[MINOR]** The design document says to use `github.com/goccy/go-yaml` but the implementation uses `gopkg.in/yaml.v3`. The design document is now inaccurate/misleading. If the decision was made to use `gopkg.in/yaml.v3` instead, the design document should be updated to reflect this, and the security claim about depth protection should be addressed explicitly.
|
||||
|
||||
@@ -36,8 +36,8 @@ func parsePersona(data []byte, source string) (*Persona, error) {
|
||||
```go
|
||||
func parseYAML(data []byte, source string) (*Persona, error) {
|
||||
var p Persona
|
||||
// go-yaml has built-in protection against deeply nested structures
|
||||
// but we add explicit decoder options for defense in depth
|
||||
// gopkg.in/yaml.v3 does NOT have built-in depth limiting.
|
||||
// Use explicit depth check via yaml.Node API.
|
||||
if err := yaml.Unmarshal(data, &p); err != nil {
|
||||
return nil, fmt.Errorf("parse persona %s: %w", source, err)
|
||||
}
|
||||
@@ -48,7 +48,7 @@ func parseYAML(data []byte, source string) (*Persona, error) {
|
||||
}
|
||||
```
|
||||
|
||||
The `goccy/go-yaml` library since v1.16.0 limits nesting depth by default.
|
||||
The `gopkg.in/yaml.v3` library does not have built-in depth protection, so we implement explicit depth checking by first decoding into a `yaml.Node`, walking the tree to verify depth, then decoding into the target struct.
|
||||
|
||||
## State/Data Model
|
||||
|
||||
|
||||
@@ -1,10 +1,12 @@
|
||||
package review
|
||||
|
gpt-review-bot
commented
[NIT] LoadPersona wraps os.Stat errors with message 'read persona file ...', which can be slightly misleading; consider using a 'stat persona file' prefix for the stat error path to improve specificity. **[NIT]** LoadPersona wraps os.Stat errors with message 'read persona file ...', which can be slightly misleading; consider using a 'stat persona file' prefix for the stat error path to improve specificity.
gpt-review-bot
commented
[NIT] unmarshalYAMLWithDepthLimit accepts multi-document YAML but silently ignores subsequent documents. Consider rejecting multi-document inputs to avoid ambiguity or add a comment in README clarifying this behavior. **[NIT]** unmarshalYAMLWithDepthLimit accepts multi-document YAML but silently ignores subsequent documents. Consider rejecting multi-document inputs to avoid ambiguity or add a comment in README clarifying this behavior.
gpt-review-bot
commented
[NIT] LoadBuiltinPersona falls back to JSON for embedded personas, but only *.yaml files are embedded now; the JSON fallback is effectively dead code. Optional: either embed both formats or remove the fallback to simplify. **[NIT]** LoadBuiltinPersona falls back to JSON for embedded personas, but only *.yaml files are embedded now; the JSON fallback is effectively dead code. Optional: either embed both formats or remove the fallback to simplify.
gpt-review-bot
commented
[NIT] ListBuiltinPersonas filters by case-sensitive suffixes; while embedded filenames are controlled and lowercase now, using a case-insensitive check would be slightly more robust if naming conventions change. **[NIT]** ListBuiltinPersonas filters by case-sensitive suffixes; while embedded filenames are controlled and lowercase now, using a case-insensitive check would be slightly more robust if naming conventions change.
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"embed"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"sort"
|
||||
"strings"
|
||||
"unicode/utf8"
|
||||
|
||||
@@ -14,6 +16,14 @@ import (
|
||||
//go:embed personas/*.yaml
|
||||
|
gpt-review-bot
commented
[NIT] Only YAML files are embedded (//go:embed personas/*.yaml) while the design doc mentions embedding both formats; consider updating the comment/design doc or embedding both if JSON files may ever return. **[NIT]** Only YAML files are embedded (//go:embed personas/*.yaml) while the design doc mentions embedding both formats; consider updating the comment/design doc or embedding both if JSON files may ever return.
|
||||
var embeddedPersonas embed.FS
|
||||
|
||||
// MaxPersonaFileSize is the maximum size for persona files (64 KB).
|
||||
// This prevents denial-of-service via excessively large files.
|
||||
const MaxPersonaFileSize = 64 * 1024
|
||||
|
||||
// MaxYAMLDepth is the maximum nesting depth allowed in YAML persona files.
|
||||
// This prevents stack exhaustion from deeply nested structures.
|
||||
const MaxYAMLDepth = 20
|
||||
|
||||
// Persona defines a specialized review role with focused expertise.
|
||||
type Persona struct {
|
||||
Name string `json:"name" yaml:"name"`
|
||||
@@ -36,7 +46,15 @@ type Severity struct {
|
||||
|
||||
// LoadPersona loads a persona from a JSON or YAML file path.
|
||||
// Format is detected by file extension: .yaml/.yml for YAML, .json or other for JSON.
|
||||
// Files larger than MaxPersonaFileSize are rejected.
|
||||
func LoadPersona(path string) (*Persona, error) {
|
||||
info, err := os.Stat(path)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("read persona file %s: %w", path, err)
|
||||
}
|
||||
if info.Size() > MaxPersonaFileSize {
|
||||
return nil, fmt.Errorf("persona file %s exceeds maximum size (%d bytes)", path, MaxPersonaFileSize)
|
||||
}
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("read persona file %s: %w", path, err)
|
||||
@@ -65,7 +83,7 @@ func LoadBuiltinPersona(name string) (*Persona, error) {
|
||||
return parsePersona(data, "builtin:"+jsonFile)
|
||||
}
|
||||
|
||||
// ListBuiltinPersonas returns the names of all built-in personas.
|
||||
// ListBuiltinPersonas returns the names of all built-in personas in sorted order.
|
||||
// Returns an empty slice if the embedded directory cannot be read.
|
||||
func ListBuiltinPersonas() []string {
|
||||
entries, err := embeddedPersonas.ReadDir("personas")
|
||||
@@ -94,10 +112,11 @@ func ListBuiltinPersonas() []string {
|
||||
seen[personaName] = true
|
||||
|
gpt-review-bot
commented
[NIT] ListBuiltinPersonas handles .yml and .json extensions even though only *.yaml files are embedded. This is harmless but could be simplified or clarified in comments to reflect the embed pattern. **[NIT]** ListBuiltinPersonas handles .yml and .json extensions even though only *.yaml files are embedded. This is harmless but could be simplified or clarified in comments to reflect the embed pattern.
|
||||
}
|
||||
}
|
||||
var names []string
|
||||
names := make([]string, 0, len(seen))
|
||||
for name := range seen {
|
||||
names = append(names, name)
|
||||
}
|
||||
sort.Strings(names)
|
||||
return names
|
||||
}
|
||||
|
||||
@@ -110,8 +129,7 @@ func parsePersona(data []byte, source string) (*Persona, error) {
|
||||
var p Persona
|
||||
var err error
|
||||
if isYAML {
|
||||
// go-yaml v1.16.0+ has built-in protection against deeply nested structures
|
||||
err = yaml.Unmarshal(data, &p)
|
||||
err = unmarshalYAMLWithDepthLimit(data, &p, MaxYAMLDepth)
|
||||
} else {
|
||||
err = json.Unmarshal(data, &p)
|
||||
}
|
||||
@@ -124,6 +142,35 @@ func parsePersona(data []byte, source string) (*Persona, error) {
|
||||
return &p, nil
|
||||
}
|
||||
|
||||
// unmarshalYAMLWithDepthLimit unmarshals YAML data with explicit depth limiting.
|
||||
// This protects against stack exhaustion from deeply nested structures.
|
||||
func unmarshalYAMLWithDepthLimit(data []byte, out interface{}, maxDepth int) error {
|
||||
var node yaml.Node
|
||||
|
gpt-review-bot
commented
[MINOR] JSON parsing does not enforce single-document input. After dec.Decode(&p), any trailing JSON values would be silently ignored. Consider verifying EOF by attempting a second decode and expecting io.EOF to ensure there's no extra data. **[MINOR]** JSON parsing does not enforce single-document input. After dec.Decode(&p), any trailing JSON values would be silently ignored. Consider verifying EOF by attempting a second decode and expecting io.EOF to ensure there's no extra data.
|
||||
dec := yaml.NewDecoder(bytes.NewReader(data))
|
||||
if err := dec.Decode(&node); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := checkYAMLDepth(&node, 0, maxDepth); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return node.Decode(out)
|
||||
}
|
||||
|
||||
// checkYAMLDepth recursively checks that YAML nodes don't exceed the depth limit.
|
||||
func checkYAMLDepth(node *yaml.Node, depth, maxDepth int) error {
|
||||
if depth > maxDepth {
|
||||
return fmt.Errorf("YAML nesting depth exceeds maximum (%d)", maxDepth)
|
||||
}
|
||||
for _, child := range node.Content {
|
||||
if err := checkYAMLDepth(child, depth+1, maxDepth); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func validatePersona(p *Persona, source string) error {
|
||||
if p.Name == "" {
|
||||
return fmt.Errorf("persona %s: name is required", source)
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package review
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
@@ -449,3 +450,65 @@ severity:
|
||||
t.Errorf("Focus[0] = %q, want %q", p.Focus[0], "item")
|
||||
}
|
||||
}
|
||||
|
||||
func TestYAMLDeeplyNestedRejection(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "deeply-nested.yaml")
|
||||
|
||||
// Build a deeply nested YAML structure that exceeds MaxYAMLDepth (20).
|
||||
// Each level adds 2 to the depth count (key + value mapping).
|
||||
var sb strings.Builder
|
||||
sb.WriteString("name: test\nidentity: test\nnested:\n")
|
||||
indent := " "
|
||||
for i := 0; i < 25; i++ {
|
||||
sb.WriteString(strings.Repeat(indent, i+1))
|
||||
sb.WriteString(fmt.Sprintf("level%d:\n", i))
|
||||
}
|
||||
sb.WriteString(strings.Repeat(indent, 26))
|
||||
sb.WriteString("value: too-deep\n")
|
||||
|
||||
if err := os.WriteFile(path, []byte(sb.String()), 0644); err != nil {
|
||||
t.Fatalf("failed to write test file: %v", err)
|
||||
}
|
||||
|
||||
_, err := LoadPersona(path)
|
||||
if err == nil {
|
||||
t.Error("expected error for deeply nested YAML, got nil")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "nesting depth exceeds") {
|
||||
t.Errorf("error = %q, want containing 'nesting depth exceeds'", err.Error())
|
||||
}
|
||||
}
|
||||
|
||||
func TestYAMLFileSizeLimit(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "huge.yaml")
|
||||
|
||||
// Create a file larger than MaxPersonaFileSize (64 KB)
|
||||
content := "name: test\nidentity: " + strings.Repeat("x", MaxPersonaFileSize+1) + "\n"
|
||||
if err := os.WriteFile(path, []byte(content), 0644); err != nil {
|
||||
t.Fatalf("failed to write test file: %v", err)
|
||||
}
|
||||
|
||||
_, err := LoadPersona(path)
|
||||
if err == nil {
|
||||
t.Error("expected error for oversized file, got nil")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "exceeds maximum size") {
|
||||
t.Errorf("error = %q, want containing 'exceeds maximum size'", err.Error())
|
||||
}
|
||||
}
|
||||
|
||||
func TestListBuiltinPersonasSortedOrder(t *testing.T) {
|
||||
names := ListBuiltinPersonas()
|
||||
if len(names) < 2 {
|
||||
t.Skip("need at least 2 personas to test ordering")
|
||||
}
|
||||
|
||||
// Verify the list is sorted
|
||||
for i := 1; i < len(names); i++ {
|
||||
if names[i-1] > names[i] {
|
||||
t.Errorf("ListBuiltinPersonas not sorted: %q > %q", names[i-1], names[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
[NIT] The design constraint says to use '.yaml' (not '.yml') for consistency, but the implementation and README support both. Clarify the recommendation vs. support to avoid confusion.