fix(secret): add UTF-8 validation in secret scanner to prevent protobuf marshalling errors (#9253)

Co-authored-by: knqyf263 <knqyf263@users.noreply.github.com>
This commit is contained in:
Teppei Fukuda
2025-07-28 18:25:47 +04:00
committed by GitHub
parent 8f5b56005a
commit 54832a77b5
3 changed files with 62 additions and 6 deletions

View File

@@ -10,6 +10,7 @@ import (
"sort"
"strings"
"sync"
"unicode/utf8"
"github.com/samber/lo"
"golang.org/x/xerrors"
@@ -19,7 +20,12 @@ import (
"github.com/aquasecurity/trivy/pkg/log"
)
var lineSep = []byte{'\n'}
var (
	// lineSep is the single newline byte sequence used when working with
	// file content line by line (for example, counting line breaks with bytes.Count).
	lineSep = []byte{'\n'}

	// warnUTF8Once emits a single warning, no matter how many times it is called,
	// that scanned content contained invalid UTF-8. It is wrapped in sync.OnceFunc
	// so that repeated invalid input does not flood the log.
	// The message states the replacement that sanitizeUTF8String performs:
	// invalid sequences become the Unicode replacement character, not an empty string.
	warnUTF8Once = sync.OnceFunc(func() {
		log.WithPrefix(log.PrefixSecret).Warn("Invalid UTF-8 sequences detected in file content, replacing with the Unicode replacement character (U+FFFD)")
	})
)
type Scanner struct {
logger *log.Logger
@@ -280,7 +286,7 @@ func ParseConfig(configPath string) (*Config, error) {
return nil, nil
}
logger := log.WithPrefix("secret").With("config_path", configPath)
logger := log.WithPrefix(log.PrefixSecret).With("config_path", configPath)
f, err := os.Open(configPath)
if errors.Is(err, os.ErrNotExist) {
// If the specified file doesn't exist, it just uses built-in rules and allow rules.
@@ -318,7 +324,7 @@ func convertSeverity(logger *log.Logger, severity string) string {
}
func NewScanner(config *Config) Scanner {
logger := log.WithPrefix("secret")
logger := log.WithPrefix(log.PrefixSecret)
// Use the default rules
if config == nil {
@@ -512,7 +518,7 @@ func findLocation(start, end int, content []byte) (int, int, types.Code, string)
lineStart = lo.Ternary(start-lineStart-30 < 0, lineStart, start-30)
lineEnd = lo.Ternary(end+20 > lineEnd, lineEnd, end+20)
}
matchLine := string(content[lineStart:lineEnd])
matchLine := sanitizeUTF8String(content[lineStart:lineEnd])
endLineNum := startLineNum + bytes.Count(content[start:end], lineSep)
var code types.Code
@@ -529,9 +535,9 @@ func findLocation(start, end int, content []byte) (int, int, types.Code, string)
var strRawLine string
if len(rawLine) > maxLineLength {
strRawLine = lo.Ternary(inCause, matchLine, string(rawLine[:maxLineLength]))
strRawLine = lo.Ternary(inCause, matchLine, sanitizeUTF8String(rawLine[:maxLineLength]))
} else {
strRawLine = string(rawLine)
strRawLine = sanitizeUTF8String(rawLine)
}
code.Lines = append(code.Lines, types.Line{
@@ -555,3 +561,14 @@ func findLocation(start, end int, content []byte) (int, int, types.Code, string)
return startLineNum + 1, endLineNum + 1, code, matchLine
}
// sanitizeUTF8String returns data as a string that is guaranteed to be valid UTF-8.
// Input that is already valid is returned unchanged. Otherwise warnUTF8Once is
// invoked (it logs at most once) and every run of invalid bytes is replaced with
// the Unicode replacement character (utf8.RuneError, U+FFFD).
func sanitizeUTF8String(data []byte) string {
	text := string(data)
	if !utf8.ValidString(text) {
		warnUTF8Once()
		// strings.ToValidUTF8 collapses each run of invalid bytes into one replacement.
		text = strings.ToValidUTF8(text, string(utf8.RuneError))
	}
	return text
}

View File

@@ -1406,6 +1406,42 @@ func TestSecretScanner(t *testing.T) {
Findings: []types.SecretFinding{wantFindingTokenInsideJs},
},
},
{
name: "invalid UTF-8 sequences in secrets",
configPath: filepath.Join("testdata", "skip-test.yaml"),
inputFilePath: filepath.Join("testdata", "invalid-utf8.txt"),
want: types.Secret{
FilePath: filepath.Join("testdata", "invalid-utf8.txt"),
Findings: []types.SecretFinding{
{
RuleID: "github-pat",
Category: secret.CategoryGitHub,
Title: "GitHub Personal Access Token",
Severity: "CRITICAL",
StartLine: 1,
EndLine: 1,
Match: "token=****************************************",
Code: types.Code{
Lines: []types.Line{
{
Number: 1,
Content: "token=****************************************",
Highlighted: "token=****************************************",
IsCause: true,
FirstCause: true,
LastCause: true,
},
{
Number: 2,
Content: "# Comment with invalid UTF-8: <20>",
Highlighted: "# Comment with invalid UTF-8: <20>",
},
},
},
},
},
},
},
}
for _, tt := range tests {

View File

@@ -0,0 +1,3 @@
token=ghp_abcdef1234567890ABCDEF1234567890abcd
# Comment with invalid UTF-8: <20><><EFBFBD><EFBFBD>
token2=ghp_1234567890abcdef1234567890ABCDEF<45><46>abcd