// complexity/censor.go
package complexity
import (
"io"
"regexp"
"regexp/syntax"
"slices"
"strings"
"unicode/utf8"
"golang.org/x/text/transform"
)
// secretKeyPrefixes lists well-known prefixes of API keys and tokens. A
// candidate starting with one of these is judged against the looser
// keyPrefixThreshold, since the prefix itself is strong evidence of a secret.
// NOTE(review): vendor attributions below are inferred from the prefix
// shapes (sk-/pk-/rk- ≈ Stripe/OpenAI, ghp_… ≈ GitHub, glpat-… ≈ GitLab,
// xox?- ≈ Slack, ya29. ≈ Google OAuth, SG. ≈ SendGrid) — confirm before
// relying on them.
var secretKeyPrefixes = []string{
	`sk-`, `pk-`, `rk-`,
	`ghp_`, `gho_`, `ghu_`, `ghs_`, `github_pat_`,
	`glpat-`, `glptt-`, `gldt-`,
	`xoxb-`, `xoxp-`, `xoxa-`, `xoxr-`,
	`ya29.`, `1//`,
	`SG.`, `key-`, `token-`, `api_`, `pat_`,
}
// makeSecretKeyPrefixPattern returns a non-capturing regexp group that
// matches exactly one of the known secret-key prefixes, with each prefix
// regexp-escaped (e.g. `(?:sk\-|pk\-|…)`).
func makeSecretKeyPrefixPattern() string {
	quoted := make([]string, len(secretKeyPrefixes))
	for i, prefix := range secretKeyPrefixes {
		quoted[i] = regexp.QuoteMeta(prefix)
	}
	return `(?:` + strings.Join(quoted, `|`) + `)`
}
// maxPrefixLength reports the byte length of the longest entry in
// secretKeyPrefixes.
func maxPrefixLength() int {
	longest := 0
	for _, prefix := range secretKeyPrefixes {
		if len(prefix) > longest {
			longest = len(prefix)
		}
	}
	return longest
}
const (
	// minKeyLength is the minimum number of characters (after any known
	// prefix) a token needs before it is considered a possible key.
	// NOTE(review): the `{8,}` repetition counts in rePossibleKey hard-code
	// this value instead of deriving it — keep them in sync.
	minKeyLength = 8
	// sensitiveDataThreshold is the per-character entropy cutoff for tokens
	// with no recognized prefix: a weighted blend of the gap from plain text
	// to leaked passwords (80%) and to random base64 (20%).
	// (The *MeanEntropy constants are defined elsewhere in this package.)
	sensitiveDataThreshold = textCorpusMeanEntropy + (leakedPasswordMeanEntropy-textCorpusMeanEntropy)*0.8 + (randomBase64MeanEntropy-textCorpusMeanEntropy)*0.2
	// keyPrefixThreshold is a looser cutoff — halfway between plain text and
	// sensitiveDataThreshold — applied when a known secret prefix was seen.
	keyPrefixThreshold = textCorpusMeanEntropy + (sensitiveDataThreshold-textCorpusMeanEntropy)*0.5
	// censoredStr is the mask substituted for the censored part of a key.
	censoredStr = "********"
)
var (
	// maxSafeBlindCopySize is the longest chunk tail that could still grow
	// into a key match once more bytes arrive: the longest known prefix plus
	// the minimum key length. Transform holds back this many bytes.
	maxSafeBlindCopySize = maxPrefixLength() + minKeyLength
	// rePossibleKey finds candidate secrets: a known prefix followed by 8+
	// key-alphabet characters, or any word-bounded run of 8+ non-space
	// characters.
	rePossibleKey = regexp.MustCompile(`\b` + makeSecretKeyPrefixPattern() + `[\w=+,\-]{8,}|\b[^\s]{8,}\b`)
	// reSecretKeyPrefix tests whether a candidate starts with a known prefix.
	reSecretKeyPrefix = regexp.MustCompile(`^` + makeSecretKeyPrefixPattern())
	// censoredBytes is censoredStr for []byte call sites.
	censoredBytes = []byte(censoredStr)
)
// replaceLikelyKeyBytes is the ReplaceAllFunc callback deciding the fate of
// one rePossibleKey match: fragments whose per-character entropy marks them
// as probable secrets are censored (keeping any recognized prefix), all
// others are returned untouched.
func replaceLikelyKeyBytes(fragment []byte) []byte {
	prefix := reSecretKeyPrefix.Find(fragment)
	if len(prefix) > 0 {
		// Known vendor prefix: score only the remainder, against the looser
		// cutoff. (A NaN ratio from an empty remainder compares false and
		// falls through to the uncensored return.)
		entropy, cnt := Bytes(fragment[len(prefix):])
		if entropy/float32(cnt) >= keyPrefixThreshold {
			return append(slices.Clip(prefix), censoredStr...)
		}
		return fragment
	}
	// No recognized prefix: demand the stricter threshold before masking.
	entropy, cnt := Bytes(fragment)
	if entropy/float32(cnt) >= sensitiveDataThreshold {
		return censoredBytes
	}
	return fragment
}
// replaceLikelyKey is the string counterpart of replaceLikelyKeyBytes: it
// censors a rePossibleKey match when its entropy marks it as a probable
// secret, otherwise returns it unchanged.
func replaceLikelyKey(fragment string) string {
	prefix := reSecretKeyPrefix.FindString(fragment)
	if prefix != "" {
		// Known vendor prefix: score only the remainder, against the looser
		// cutoff. (A NaN ratio from an empty remainder compares false and
		// falls through to the uncensored return.)
		entropy, cnt := String(fragment[len(prefix):])
		if entropy/float32(cnt) >= keyPrefixThreshold {
			return prefix + censoredStr
		}
		return fragment
	}
	// No recognized prefix: demand the stricter threshold before masking.
	entropy, cnt := String(fragment)
	if entropy/float32(cnt) >= sensitiveDataThreshold {
		return censoredStr
	}
	return fragment
}
// CensorLikelyKeysBytes returns text with every high-entropy, key-looking
// token replaced by the censor mask; a recognized key prefix is preserved
// in front of the mask.
func CensorLikelyKeysBytes(text []byte) []byte {
	return rePossibleKey.ReplaceAllFunc(text, replaceLikelyKeyBytes)
}
// CensorLikelyKeys is the string form of CensorLikelyKeysBytes: it replaces
// every high-entropy, key-looking token in text with the censor mask.
func CensorLikelyKeys(text string) string {
	return rePossibleKey.ReplaceAllStringFunc(text, replaceLikelyKey)
}
type tf struct{}
// trimIncompleteLastRune returns text with any trailing partial UTF-8
// sequence removed, so the result always ends on a complete rune (or is
// empty).
func trimIncompleteLastRune(text []byte) []byte {
	if len(text) == 0 {
		return text
	}
	// Walk back to the start byte of the final rune.
	start := len(text) - 1
	for start > 0 && !utf8.RuneStart(text[start]) {
		start--
	}
	// Keep the final rune only if it decodes cleanly; a (RuneError, 1)
	// result means the trailing bytes are not a valid encoding.
	if r, sz := utf8.DecodeRune(text[start:]); sz != 1 || r != utf8.RuneError {
		start += sz
	}
	return text[:start]
}
// trimLastWordBoundaryNextRuneKnown shortens text so that it ends on a \b
// word boundary, given that the rune immediately following text is last.
// Trailing runes in the same word-char class as last are stripped; the cut
// lands where the class flips, or at the start of text if it never does.
func trimLastWordBoundaryNextRuneKnown(text []byte, last rune) []byte {
	// The loop only continues while classes match, so the class we compare
	// against never changes; hoist it out of the loop.
	tailClass := syntax.IsWordChar(last)
	end := len(text)
	for end > 0 {
		r, sz := utf8.DecodeLastRune(text[:end])
		if syntax.IsWordChar(r) != tailClass {
			break
		}
		end -= sz
	}
	return text[:end]
}
// trimLastWordBoundary trims text back to its last \b word boundary, using
// text's own final rune as the lookahead: that rune is removed and the
// preceding run of same-class runes is stripped with it.
// NOTE(review): not called anywhere in this file — confirm callers exist
// elsewhere before removing.
func trimLastWordBoundary(text []byte) []byte {
	last, sz := utf8.DecodeLastRune(text)
	return trimLastWordBoundaryNextRuneKnown(text[:len(text)-sz], last)
}
// copyCompleteWords copies src into dst and returns the number of bytes
// copied. It assumes src already ends on a valid cut point — we can't
// determine that ourselves without knowing the bytes that follow. If dst is
// shorter than src, src is first truncated to len(dst), then any incomplete
// trailing rune and the final partial word are dropped before copying, so
// the copy never ends mid-rune or mid-word.
func copyCompleteWords(dst, src []byte) int {
	if len(src) <= len(dst) {
		return copy(dst, src)
	}
	whole := trimIncompleteLastRune(src[:len(dst)])
	// The rune just past the cut tells us which word-char class the dropped
	// tail belongs to.
	next, _ := utf8.DecodeRune(src[len(whole):])
	return copy(dst, trimLastWordBoundaryNextRuneKnown(whole, next))
}
// Transform implements [transform.Transformer]: it copies src to dst while
// replacing likely secret keys with the censor mask, mirroring the entropy
// test in replaceLikelyKeyBytes. When atEOF is false it deliberately holds
// back a tail of src (signalled via transform.ErrShortSrc) because a match
// could span the chunk boundary.
//
// NOTE(review): the WARNING on Transformer says this doesn't yet reproduce
// CensorLikelyKeys exactly — treat the boundary handling here as suspect.
func (*tf) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	if !atEOF {
		// make sure the last rune hasn't been cut off.
		src = trimIncompleteLastRune(src)
	}
	matches := rePossibleKey.FindAllIndex(src, -1)
	if !atEOF {
		if len(matches) > 0 {
			// ignore the last match; what matches might change if we had more bytes available.
			// we'll also adjust the length of src so that we don't try to copy past where the last match begins
			last := len(matches) - 1
			src = src[:matches[last][0]]
			matches = matches[:last]
		} else {
			// ignore the last maxSafeBlindCopySize bytes,
			// as they could potentially become a match.
			_src := trimIncompleteLastRune(src[:max(0, len(src)-maxSafeBlindCopySize)])
			last, _ := utf8.DecodeRune(src[len(_src):])
			src = trimLastWordBoundaryNextRuneKnown(_src, last)
		}
	}
	for _, m := range matches {
		// Copy the uncensored bytes ahead of this match.
		sz := copyCompleteWords(dst[nDst:], src[nSrc:m[0]])
		nDst += sz
		nSrc += sz
		if nSrc != m[0] {
			// dst filled up before reaching the match; resume here next call.
			err = transform.ErrShortDst
			return
		}
		// Same decision logic as replaceLikelyKeyBytes, inlined so we can
		// check dst capacity before emitting anything.
		match := reSecretKeyPrefix.Find(src[m[0]:m[1]])
		censor := true
		if len(match) > 0 {
			// Known prefix: score the remainder against the looser cutoff.
			entropy, cnt := Bytes(src[m[0]+len(match) : m[1]])
			censor = entropy/float32(cnt) >= keyPrefixThreshold
		} else {
			entropy, cnt := Bytes(src[m[0]:m[1]])
			censor = entropy/float32(cnt) >= sensitiveDataThreshold
		}
		if censor {
			// Emit the recognized prefix (if any) followed by the mask.
			if len(dst[nDst:]) < len(match)+len(censoredBytes) {
				err = transform.ErrShortDst
				return
			}
			nDst += copy(dst[nDst:], match)
			nDst += copy(dst[nDst:], censoredBytes)
			nSrc = m[1]
		} else {
			// Not secret-looking after all; pass the match through verbatim.
			if len(dst[nDst:]) < m[1]-m[0] {
				err = transform.ErrShortDst
				return
			}
			nDst += copy(dst[nDst:], src[m[0]:m[1]])
			nSrc = m[1]
		}
	}
	// Flush whatever follows the final match.
	sz := copyCompleteWords(dst[nDst:], src[nSrc:])
	nDst += sz
	nSrc += sz
	if nSrc != len(src) {
		err = transform.ErrShortDst
	} else if !atEOF {
		// Bytes were held back above; ask the caller for more input.
		err = transform.ErrShortSrc
	}
	return
}
// Reset implements [transform.Transformer]. tf keeps no state between
// chunks, so there is nothing to reset.
func (*tf) Reset() {
	// NOP
}
// Transformer implements [transform.Transformer], censoring any likely keys it encounters.
// A nil *tf suffices because the implementation is stateless.
//
// WARNING: The current implementation doesn't match the output of [CensorLikelyKeys], so I definitely
// have a bug somewhere.
var Transformer transform.Transformer = (*tf)(nil)
// Copy copies r to w, censoring any likely keys it encounters.
// It streams through Transformer, so memory use is bounded regardless of
// input size. The returned error is whatever [io.Copy] reports.
//
// WARNING: The current implementation doesn't match the output of [CensorLikelyKeys], so I definitely
// have a bug somewhere.
func Copy(w io.Writer, r io.Reader) error {
	_, err := io.Copy(w, transform.NewReader(r, Transformer))
	return err
}