feat: add JSONC support for comments and trailing commas (#8862)

2025-12-12 15:50:15 -08:00 · 2025-05-13 14:24:11 +04:00
parent e97af9806a
commit 0b0e4061ef
2 changed files with 438 additions and 0 deletions
--- a/pkg/x/json/jsonc.go
+++ b/pkg/x/json/jsonc.go
@@ -0,0 +1,248 @@
+package json
+
+import (
+	"bytes"
+	"errors"
+	"io"
+)
+
+// TokenType identifies the lexical context the JSONC scanner is currently in.
+// The context decides how the next input byte is interpreted.
+type TokenType int
+
+const (
+	// TokenNormal is any position outside of string literals and comments.
+	TokenNormal TokenType = iota
+	// TokenString is the inside of a double-quoted string literal.
+	TokenString
+	// TokenSingleLineComment is the inside of a // comment.
+	TokenSingleLineComment
+	// TokenMultiLineComment is the inside of a /* ... */ comment.
+	TokenMultiLineComment
+)
+
+// jsoncParser holds the state of a single pass over a JSONC document.
+// Source bytes are consumed from reader one at a time while the bytes at the
+// matching positions in dst are blanked out where needed.
+type jsoncParser struct {
+	reader    *bytes.Reader // Source bytes, consumed one at a time
+	dst       []byte        // Output buffer that is edited in place
+	pos       int           // Index in dst of the byte currently being processed
+	tokenType TokenType     // Lexical context of the current byte
+	escaped   bool          // Inside a string: the previous byte was an unescaped backslash
+	lastChar  byte          // Most recently processed source byte
+}
+
+// ToRFC8259 rewrites a JSONC (JSON with Comments) document as plain JSON
+// as defined by RFC 8259.
+//
+// The result has exactly the same length as src. Every byte that belongs to a
+// comment or is a trailing comma is overwritten with a space, while tabs,
+// carriage returns and newlines inside comments are left as they are. Line
+// numbers and byte offsets reported by a JSON parser for the output therefore
+// point at the same location in the original input.
+//
+// Two comment forms are recognized:
+//   - single-line comments, from // up to the end of the line
+//   - multi-line comments, from /* up to the closing */
+//
+// A trailing comma is a comma that is followed, ignoring whitespace and
+// comments, by a closing ']' or '}'.
+//
+// src itself is not modified; the returned slice is a fresh copy.
+func ToRFC8259(src []byte) []byte {
+	// Start from a same-sized copy so that only the offending bytes need touching
+	out := append(make([]byte, 0, len(src)), src...)
+	newJSONCParser(src, out).process()
+	return out
+}
+
+// UnmarshalJSONC decodes JSONC (JSON with Comments) data into v.
+//
+// The input is first normalized with ToRFC8259, which blanks out comments and
+// trailing commas, and the result is then handed to Unmarshal.
+//
+// Because the normalization keeps every byte offset and newline in place,
+// line information gathered while decoding refers to the original JSONC file.
+//
+// Example:
+//
+//	type Config struct {
+//	    Name    string            `json:"name"`
+//	    Version string            `json:"version"`
+//	    xjson.Location            // Embed Location to get line number info
+//	}
+//
+//	var config Config
+//	if err := xjson.UnmarshalJSONC(data, &config); err != nil {
+//	    return err
+//	}
+func UnmarshalJSONC(data []byte, v any) error {
+	return Unmarshal(ToRFC8259(data), v)
+}
+
+// newJSONCParser returns a parser that reads from src and edits dst in place.
+// Scanning starts at offset zero, outside of any string literal or comment.
+func newJSONCParser(src, dst []byte) *jsoncParser {
+	p := new(jsoncParser)
+	p.reader = bytes.NewReader(src)
+	p.dst = dst
+	p.tokenType = TokenNormal
+	return p
+}
+
+// process runs the parser over the whole input, one byte at a time
+func (p *jsoncParser) process() {
+	for {
+		b, err := p.reader.ReadByte()
+		switch {
+		case err == nil:
+			p.processChar(b)
+		case errors.Is(err, io.EOF):
+			// Input exhausted
+			return
+		default:
+			// Ignore other errors (not expected to occur)
+			return
+		}
+	}
+}
+
+// processChar hands a single character to the handler for the current state
+func (p *jsoncParser) processChar(b byte) {
+	// Anything that is not a string or a comment is handled as a normal token
+	handle := p.processNormalToken
+
+	switch p.tokenType {
+	case TokenString:
+		handle = p.processStringToken
+	case TokenSingleLineComment:
+		handle = p.processSingleLineComment
+	case TokenMultiLineComment:
+		handle = p.processMultiLineComment
+	}
+
+	handle(b)
+}
+
+// processStringToken processes a character within a string literal.
+// String contents are never modified; only the parser state is updated.
+func (p *jsoncParser) processStringToken(b byte) {
+	afterBackslash := p.escaped
+
+	// An escape only ever covers the single byte that follows the backslash,
+	// so the flag is set again only by a backslash that is not itself escaped
+	p.escaped = !afterBackslash && b == '\\'
+
+	// An unescaped double quote terminates the string literal
+	if !afterBackslash && b == '"' {
+		p.tokenType = TokenNormal
+	}
+
+	p.lastChar = b
+	p.pos++
+}
+
+// processSingleLineComment processes a character within a single-line comment
+func (p *jsoncParser) processSingleLineComment(b byte) {
+	switch {
+	case b == '\n':
+		// The newline terminates the comment and is kept as is
+		p.tokenType = TokenNormal
+	case isPreservedWhitespace(b):
+		// Tabs and carriage returns stay untouched
+	case p.pos < len(p.dst):
+		// Any other byte of the comment is overwritten with a space
+		p.dst[p.pos] = ' '
+	}
+
+	p.lastChar = b
+	p.pos++
+}
+
+// processMultiLineComment processes a character within a multi-line comment
+func (p *jsoncParser) processMultiLineComment(b byte) {
+	// A '/' directly after a '*' closes the comment
+	if b == '/' && p.lastChar == '*' {
+		p.tokenType = TokenNormal
+	}
+
+	// Everything except tabs, carriage returns and newlines is blanked out,
+	// including the closing '/' itself
+	if !isPreservedWhitespace(b) && p.pos < len(p.dst) {
+		p.dst[p.pos] = ' '
+	}
+
+	p.lastChar = b
+	p.pos++
+}
+
+// processNormalToken processes a character outside of string literals and comments.
+//
+// It recognizes the start of a string literal, the start of a // or /* comment
+// (using one byte of lookahead) and closing brackets, which trigger removal of
+// a preceding trailing comma. All other bytes are left untouched.
+func (p *jsoncParser) processNormalToken(b byte) {
+	switch b {
+	case '"':
+		// Start of string literal
+		p.tokenType = TokenString
+	case '/':
+		// Potential start of comment - look ahead
+		nextByte, err := p.reader.ReadByte()
+		if err != nil {
+			// Lone '/' at the very end of the input: not a comment, so just
+			// account for it like any other byte
+			break
+		}
+
+		if nextByte == '/' || nextByte == '*' {
+			if nextByte == '/' {
+				p.tokenType = TokenSingleLineComment
+			} else {
+				p.tokenType = TokenMultiLineComment
+			}
+
+			// Blank out both bytes of the comment opener
+			for i := p.pos; i < p.pos+2 && i < len(p.dst); i++ {
+				p.dst[i] = ' '
+			}
+
+			// Deliberately do not record the opener's '*' as the last character:
+			// otherwise the '/' in "/*/" would be mistaken for the end of the
+			// comment by processMultiLineComment.
+			p.lastChar = ' '
+			p.pos += 2
+			return
+		}
+
+		// Not a comment, put the byte back so it is processed on its own
+		_ = p.reader.UnreadByte() // cannot fail right after a successful ReadByte
+	case ']', '}':
+		// Handle trailing comma - look backward
+		p.handleTrailingComma()
+	}
+
+	p.lastChar = b
+	p.pos++
+}
+
+// handleTrailingComma blanks out a trailing comma that precedes the closing
+// bracket at the current position.
+//
+// It walks backward through dst from the byte before the bracket, skipping
+// whitespace. Comments seen so far have already been overwritten with spaces,
+// so they are skipped as well. If the first significant byte is a comma it is
+// replaced with a space; at most one comma is removed, so input such as
+// "[1,,]" is left invalid instead of being silently repaired.
+func (p *jsoncParser) handleTrailingComma() {
+	// Start from one position before the current bracket
+	for i := p.pos - 1; i >= 0; i-- {
+		if i >= len(p.dst) {
+			// Defensive: never index beyond the end of the buffer
+			continue
+		}
+
+		switch p.dst[i] {
+		case ' ', '\t', '\n', '\r':
+			// Skip whitespace
+			continue
+		case ',':
+			// Trailing comma found: blank it out and stop looking
+			p.dst[i] = ' '
+			return
+		default:
+			// Any other byte means there is no trailing comma
+			return
+		}
+	}
+}
+
+// isPreservedWhitespace reports whether c is a newline, tab or carriage
+// return, i.e. whitespace that is kept as is when a comment is blanked out
+func isPreservedWhitespace(c byte) bool {
+	switch c {
+	case '\n', '\t', '\r':
+		return true
+	default:
+		return false
+	}
+}
--- a/pkg/x/json/jsonc_test.go
+++ b/pkg/x/json/jsonc_test.go
@@ -0,0 +1,190 @@
+package json_test
+
+import (
+	"bytes"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+
+	xjson "github.com/aquasecurity/trivy/pkg/x/json"
+)
+
+// TestToRFC8259 runs ToRFC8259 over a table of JSONC inputs and checks, for
+// each of them, the exact output, that the length and the number of newlines
+// are unchanged, and that the output can be parsed by xjson.Unmarshal.
+func TestToRFC8259(t *testing.T) {
+	tests := []struct {
+		name  string
+		input string
+		want  string
+	}{
+		{
+			name:  "no comments",
+			input: `{"a": 1, "b": 2}`,
+			want:  `{"a": 1, "b": 2}`,
+		},
+		{
+			name:  "single-line comment",
+			input: "{\n  \"a\": 1, // This is a comment\n  \"b\": 2\n}",
+			want:  "{\n  \"a\": 1,                     \n  \"b\": 2\n}",
+		},
+		{
+			name:  "multi-line comment",
+			input: "{\n  \"a\": 1, /* This is\n     a multi-line\n     comment */ \"b\": 2\n}",
+			want:  "{\n  \"a\": 1,           \n                 \n                \"b\": 2\n}",
+		},
+		{
+			name:  "comment with forward slash in string",
+			input: "{\n  \"url\": \"http://example.com\", // Comment\n  \"value\": 123\n}",
+			want:  "{\n  \"url\": \"http://example.com\",           \n  \"value\": 123\n}",
+		},
+		{
+			name:  "trailing comma in object",
+			input: `{"a": 1, "b": 2,}`,
+			want:  `{"a": 1, "b": 2 }`,
+		},
+		{
+			name:  "trailing comma in array",
+			input: `[1, 2, 3,]`,
+			want:  `[1, 2, 3 ]`,
+		},
+		{
+			name:  "nested trailing commas",
+			input: `{"a": [1, 2,], "b": {"x": 1, "y": 2,},}`,
+			want:  `{"a": [1, 2 ], "b": {"x": 1, "y": 2 } }`,
+		},
+		{
+			name:  "single-line comment at end of file without newline",
+			input: `{"a": 1} // Comment`,
+			want:  `{"a": 1}           `,
+		},
+		{
+			name:  "multi-line comment at end of file",
+			input: `{"a": 1} /* Comment */`,
+			want:  `{"a": 1}              `,
+		},
+		{
+			name:  "comment within string",
+			input: `{"text": "This string has // comment syntax"}`,
+			want:  `{"text": "This string has // comment syntax"}`,
+		},
+		{
+			name:  "quoted comment markers",
+			input: `{"a": "//", "b": "/*", "c": "*/"}`,
+			want:  `{"a": "//", "b": "/*", "c": "*/"}`,
+		},
+		{
+			name:  "escaped quotes in string",
+			input: `{"text": "String with \"escaped quotes\" // not a comment"}`,
+			want:  `{"text": "String with \"escaped quotes\" // not a comment"}`,
+		},
+		{
+			name:  "complex escaped quotes",
+			input: `{"text": "String with \\\"double escaped\\\" quotes"}`,
+			want:  `{"text": "String with \\\"double escaped\\\" quotes"}`,
+		},
+		{
+			name: "real world example",
+			input: `{
+  "name": "my-package", // Package name
+  "version": "1.0.0",   /* Version number */
+  "dependencies": {
+    "lodash": "^4.17.21",
+    "express": "^4.17.1", // Latest express
+  },
+  "scripts": {
+    "start": "node index.js",
+    "test": "jest",
+  }
+}`,
+			want: `{
+  "name": "my-package",                
+  "version": "1.0.0",                       
+  "dependencies": {
+    "lodash": "^4.17.21",
+    "express": "^4.17.1"                   
+  },
+  "scripts": {
+    "start": "node index.js",
+    "test": "jest" 
+  }
+}`,
+		},
+		{
+			name: "preserves newlines in multiline comments",
+			input: `{
+  "name": "test", // Comment
+  /* 
+   * Multi-line
+   * comment
+   */
+  "value": 42
+}`,
+			want: `{
+  "name": "test",           
+     
+               
+            
+     
+  "value": 42
+}`,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			// Convert the input; ToRFC8259 returns a newly allocated buffer
+			got := xjson.ToRFC8259([]byte(tt.input))
+
+			// Byte offsets only stay valid if the length is unchanged
+			require.Len(t, got, len(tt.input), "output length should match input length")
+
+			// Compare against the expected, space-padded output
+			assert.Equal(t, tt.want, string(got))
+
+			// Line numbers only stay valid if no newline is added or removed
+			inputNewlines := bytes.Count([]byte(tt.input), []byte{'\n'})
+			outputNewlines := bytes.Count(got, []byte{'\n'})
+			assert.Equal(t, inputNewlines, outputNewlines, "number of newlines should be preserved")
+
+			// Make sure the output is valid JSON
+			var jsonMap any
+			err := xjson.Unmarshal(got, &jsonMap)
+			require.NoError(t, err, "result should be valid JSON")
+		})
+	}
+}
+
+// TestUnmarshalJSONC decodes a document containing comments and a trailing
+// comma and checks both the decoded values and the recorded line range.
+func TestUnmarshalJSONC(t *testing.T) {
+	type Config struct {
+		Name         string            `json:"name"`
+		Dependencies map[string]string `json:"dependencies"`
+		Version      string            `json:"version"`
+		xjson.Location
+	}
+
+	const input = `{
+  "name": "test", // This is a comment
+  "dependencies": {
+    "lodash": "^4.17.21", /* Another comment */
+    "express": "^4.17.1", // Comment
+  }, // Trailing comment
+  /* Multi-line
+     comment */
+  "version": "1.0.0"
+}`
+
+	var got Config
+	require.NoError(t, xjson.UnmarshalJSONC([]byte(input), &got))
+
+	// Decoded values
+	wantDeps := map[string]string{
+		"lodash":  "^4.17.21",
+		"express": "^4.17.1",
+	}
+	assert.Equal(t, "test", got.Name)
+	assert.Equal(t, "1.0.0", got.Version)
+	assert.Equal(t, wantDeps, got.Dependencies)
+
+	// The top-level object starts on the first and ends on the tenth input line
+	assert.Equal(t, 1, got.StartLine)
+	assert.Equal(t, 10, got.EndLine)
+}