Initial commit.
This commit is contained in:
240
censor.go
Normal file
240
censor.go
Normal file
@ -0,0 +1,240 @@
|
||||
package complexity
|
||||
|
||||
import (
|
||||
"io"
|
||||
"regexp"
|
||||
"regexp/syntax"
|
||||
"slices"
|
||||
"strings"
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
// secretKeyPrefixes are literal prefixes commonly found at the start of API
// keys and access tokens. A token beginning with one of these is judged
// against the lower keyPrefixThreshold rather than sensitiveDataThreshold.
// (They appear to cover sk-/pk-style keys, GitHub ghp_/github_pat_ tokens,
// GitLab glpat- tokens, Slack xox?- tokens, Google OAuth ya29. tokens,
// SendGrid SG. keys, and a few generic forms — vendor mapping inferred from
// the prefixes themselves, not established by this file.)
var secretKeyPrefixes = []string{
	`sk-`, `pk-`, `rk-`,
	`ghp_`, `gho_`, `ghu_`, `ghs_`, `github_pat_`,
	`glpat-`, `glptt-`, `gldt-`,
	`xoxb-`, `xoxp-`, `xoxa-`, `xoxr-`,
	`ya29.`, `1//`,
	`SG.`, `key-`, `token-`, `api_`, `pat_`,
}
|
||||
|
||||
func makeSecretKeyPrefixPattern() string {
|
||||
var b strings.Builder
|
||||
b.WriteString(`(?:`)
|
||||
for i, prefix := range secretKeyPrefixes {
|
||||
if i != 0 {
|
||||
b.WriteRune('|')
|
||||
}
|
||||
b.WriteString(regexp.QuoteMeta(prefix))
|
||||
}
|
||||
b.WriteRune(')')
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func maxPrefixLength() (sz int) {
|
||||
for _, prefix := range secretKeyPrefixes {
|
||||
sz = max(sz, len(prefix))
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
const (
	// minKeyLength is the minimum number of characters (after any recognized
	// prefix) a token needs to be considered as a possible key.
	minKeyLength = 8
	// sensitiveDataThreshold is the per-character entropy cutoff for
	// censoring a token with no recognized prefix: the plain-text baseline
	// plus 80% of the leaked-password delta and 20% of the random-base64
	// delta. (The *MeanEntropy constants are defined elsewhere in this
	// package — presumably measured corpus means; confirm there.)
	sensitiveDataThreshold = textCorpusMeanEntropy + (leakedPasswordMeanEntropy-textCorpusMeanEntropy)*0.8 + (randomBase64MeanEntropy-textCorpusMeanEntropy)*0.2
	// keyPrefixThreshold sits halfway between the plain-text baseline and
	// sensitiveDataThreshold: a recognized secret prefix earns a lower bar.
	keyPrefixThreshold = textCorpusMeanEntropy + (sensitiveDataThreshold-textCorpusMeanEntropy)*0.5
	// censoredStr is what replaces the censored portion of a likely key.
	censoredStr = "********"
)
|
||||
|
||||
var (
	// maxSafeBlindCopySize is the longest trailing run of bytes that could
	// still become a prefixed key match once more input arrives (longest
	// known prefix plus the minimum key length); Transform withholds this
	// much when it is not at EOF.
	maxSafeBlindCopySize = maxPrefixLength() + minKeyLength
	// rePossibleKey matches either a known secret prefix followed by 8+
	// key-ish characters, or any word-bounded run of 8+ non-space characters.
	rePossibleKey = regexp.MustCompile(`\b` + makeSecretKeyPrefixPattern() + `[\w=+,\-]{8,}|\b[^\s]{8,}\b`)
	// reSecretKeyPrefix matches a known secret prefix at the start of a
	// fragment (used on fragments rePossibleKey has already matched).
	reSecretKeyPrefix = regexp.MustCompile(`^` + makeSecretKeyPrefixPattern())
	// censoredBytes is censoredStr as bytes, to avoid per-call conversion.
	censoredBytes = []byte(censoredStr)
)
|
||||
|
||||
func replaceLikelyKeyBytes(fragment []byte) []byte {
|
||||
if match := reSecretKeyPrefix.Find(fragment); len(match) > 0 {
|
||||
entropy, cnt := Bytes(fragment[len(match):])
|
||||
|
||||
if entropy/float32(cnt) >= keyPrefixThreshold {
|
||||
return append(slices.Clip(match), censoredStr...)
|
||||
}
|
||||
} else {
|
||||
entropy, cnt := Bytes(fragment)
|
||||
if entropy/float32(cnt) >= sensitiveDataThreshold {
|
||||
return censoredBytes
|
||||
}
|
||||
}
|
||||
|
||||
return fragment
|
||||
}
|
||||
|
||||
func replaceLikelyKey(fragment string) string {
|
||||
if match := reSecretKeyPrefix.FindString(fragment); match != "" {
|
||||
entropy, cnt := String(fragment[len(match):])
|
||||
|
||||
if entropy/float32(cnt) >= keyPrefixThreshold {
|
||||
return match + censoredStr
|
||||
}
|
||||
} else {
|
||||
entropy, cnt := String(fragment)
|
||||
if entropy/float32(cnt) >= sensitiveDataThreshold {
|
||||
return censoredStr
|
||||
}
|
||||
}
|
||||
|
||||
return fragment
|
||||
}
|
||||
|
||||
// CensorLikelyKeysBytes returns text with every token that looks like a
// secret key replaced by censoredStr. When a token starts with a recognized
// prefix (see secretKeyPrefixes) the prefix is preserved and only the rest
// is masked.
func CensorLikelyKeysBytes(text []byte) []byte {
	return rePossibleKey.ReplaceAllFunc(text, replaceLikelyKeyBytes)
}
|
||||
|
||||
// CensorLikelyKeys returns text with every token that looks like a secret
// key replaced by censoredStr; a recognized prefix is kept and only the
// remainder is masked. String counterpart of CensorLikelyKeysBytes.
func CensorLikelyKeys(text string) string {
	return rePossibleKey.ReplaceAllStringFunc(text, replaceLikelyKey)
}
|
||||
|
||||
// tf is a stateless censoring transformer; its Transform and Reset methods
// implement [transform.Transformer]. Unexported because the implementation
// is known to be buggy (see the note near the commented-out Transformer var).
type tf struct{}
|
||||
|
||||
// trimIncompleteLastRune returns text with any incomplete trailing UTF-8
// sequence removed, so the result always ends on a rune boundary. A complete
// final rune (or a lone invalid byte in the interior position 0) follows the
// same rules as utf8.DecodeRune: only a (RuneError, size 1) decode is
// treated as an incomplete/invalid tail and dropped.
func trimIncompleteLastRune(text []byte) []byte {
	if len(text) == 0 {
		return text
	}

	// Walk back to the start byte of the final rune.
	start := len(text) - 1
	for start > 0 && !utf8.RuneStart(text[start]) {
		start--
	}

	// Keep the final rune only if it decodes cleanly.
	if r, sz := utf8.DecodeRune(text[start:]); sz > 1 || r != utf8.RuneError {
		start += sz
	}

	return text[:start]
}
|
||||
|
||||
// trimLastWordBoundaryNextRuneKnown removes the trailing run of runes that
// belong to the same word-character class (per regexp/syntax.IsWordChar) as
// last, the rune known to follow text. The result therefore ends exactly at
// a \b word boundary relative to last; if all of text is in the same class,
// the empty prefix is returned.
func trimLastWordBoundaryNextRuneKnown(text []byte, last rune) []byte {
	end := len(text)
	sameClass := syntax.IsWordChar(last)

	for end > 0 {
		prev, width := utf8.DecodeLastRune(text[:end])
		if syntax.IsWordChar(prev) != sameClass {
			break
		}
		end -= width
	}

	return text[:end]
}
|
||||
|
||||
func trimLastWordBoundary(text []byte) []byte {
|
||||
last, sz := utf8.DecodeLastRune(text)
|
||||
return trimLastWordBoundaryNextRuneKnown(text[:len(text)-sz], last)
|
||||
}
|
||||
|
||||
// assumes src already ends on a valid cut point
|
||||
// we can't determine that ourselves without knowing the bytes that follow.
|
||||
// if dst is shorter than src, then src is reduced to the same length, and
|
||||
// the possibly incomplete rune and final word boundary is removed prior to copying
|
||||
func copyCompleteWords(dst, src []byte) int {
|
||||
if len(dst) < len(src) {
|
||||
_src := trimIncompleteLastRune(src[:len(dst)])
|
||||
lastRune, _ := utf8.DecodeRune(src[len(_src):])
|
||||
src = trimLastWordBoundaryNextRuneKnown(_src, lastRune)
|
||||
}
|
||||
|
||||
return copy(dst, src)
|
||||
}
|
||||
|
||||
// Transform implements [transform.Transformer]: it copies src to dst while
// censoring likely keys, applying the same entropy tests as
// replaceLikelyKeyBytes to each rePossibleKey match.
//
// When atEOF is false, bytes whose interpretation could change with more
// input are withheld (an incomplete trailing rune, the final regexp match,
// or the last maxSafeBlindCopySize bytes plus a partial word) and
// transform.ErrShortSrc is returned so the caller re-presents them.
//
// NOTE(review): the comments near the commented-out Transformer var and
// Copy say this implementation doesn't always match CensorLikelyKeys;
// treat its boundary handling as suspect until that's resolved.
func (*tf) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	if !atEOF {
		// make sure the last rune hasn't been cut off.
		src = trimIncompleteLastRune(src)
	}

	matches := rePossibleKey.FindAllIndex(src, -1)

	if !atEOF {
		if len(matches) > 0 {
			// ignore the last match; what matches might change if we had more bytes available.
			// we'll also adjust the length of src so that we don't try to copy past where the last match begins
			last := len(matches) - 1
			src = src[:matches[last][0]]
			matches = matches[:last]
		} else {
			// ignore the last maxSafeBlindCopySize bytes,
			// as they could potentially become a match.
			_src := trimIncompleteLastRune(src[:max(0, len(src)-maxSafeBlindCopySize)])
			last, _ := utf8.DecodeRune(src[len(_src):])
			src = trimLastWordBoundaryNextRuneKnown(_src, last)
		}
	}

	for _, m := range matches {
		// Copy the unmatched text leading up to this match; a short copy
		// means dst ran out of room.
		sz := copyCompleteWords(dst[nDst:], src[nSrc:m[0]])
		nDst += sz
		nSrc += sz
		if nSrc != m[0] {
			err = transform.ErrShortDst
			return
		}

		// Same decision logic as replaceLikelyKeyBytes, but writing into dst.
		match := reSecretKeyPrefix.Find(src[m[0]:m[1]])
		censor := true

		if len(match) > 0 {
			// Recognized prefix: judge only the bytes after it, against the
			// lower threshold.
			entropy, cnt := Bytes(src[m[0]+len(match) : m[1]])
			censor = entropy/float32(cnt) >= keyPrefixThreshold
		} else {
			// No recognized prefix: the whole match must clear the higher bar.
			entropy, cnt := Bytes(src[m[0]:m[1]])
			censor = entropy/float32(cnt) >= sensitiveDataThreshold
		}

		if censor {
			if len(dst[nDst:]) < len(match)+len(censoredBytes) {
				err = transform.ErrShortDst
				return
			}

			// Keep the prefix (if any) and mask the rest of the match.
			nDst += copy(dst[nDst:], match)
			nDst += copy(dst[nDst:], censoredBytes)
			nSrc = m[1]
		} else {
			if len(dst[nDst:]) < m[1]-m[0] {
				err = transform.ErrShortDst
				return
			}

			// Below threshold: pass the match through unchanged.
			nDst += copy(dst[nDst:], src[m[0]:m[1]])
			nSrc = m[1]
		}
	}

	// Copy whatever remains after the final match.
	sz := copyCompleteWords(dst[nDst:], src[nSrc:])
	nDst += sz
	nSrc += sz

	if nSrc != len(src) {
		err = transform.ErrShortDst
	} else if !atEOF {
		// We deliberately withheld trailing bytes above; request more input.
		err = transform.ErrShortSrc
	}

	return
}
|
||||
|
||||
// Reset implements [transform.Transformer]. tf carries no state between
// Transform calls, so there is nothing to reset.
func (*tf) Reset() {
	// NOP
}
|
||||
|
||||
// Transformer implements [transform.Transformer], censoring any likely keys it encounters.
|
||||
// Not exporting at this time, as it's bugged and a correct version might require maintaining state.
|
||||
// var Transformer transform.Transformer = (*tf)(nil)
|
||||
|
||||
// Copy copies r to w, censoring any likely keys it encounters.
|
||||
//
|
||||
// WARNING: The current implementation doesn't match the output of [CensorLikelyKeys], so I definitely
|
||||
// have a bug somewhere.
|
||||
func Copy(w io.Writer, r io.Reader) error {
|
||||
_, err := io.Copy(w, transform.NewReader(r, (*tf)(nil)))
|
||||
return err
|
||||
}
|
Reference in New Issue
Block a user