Initial commit.
This commit is contained in:
240
censor.go
Normal file
240
censor.go
Normal file
@ -0,0 +1,240 @@
|
||||
package complexity
|
||||
|
||||
import (
|
||||
"io"
|
||||
"regexp"
|
||||
"regexp/syntax"
|
||||
"slices"
|
||||
"strings"
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
// secretKeyPrefixes are literal prefixes commonly found at the start of API
// keys and access tokens. A token beginning with one of these is judged
// against the lower keyPrefixThreshold rather than sensitiveDataThreshold.
// (They appear to cover sk-/pk-style keys, GitHub ghp_/github_pat_ tokens,
// GitLab glpat- tokens, Slack xox?- tokens, Google OAuth ya29. tokens,
// SendGrid SG. keys, and a few generic forms — vendor mapping inferred from
// the prefixes themselves, not established by this file.)
var secretKeyPrefixes = []string{
	`sk-`, `pk-`, `rk-`,
	`ghp_`, `gho_`, `ghu_`, `ghs_`, `github_pat_`,
	`glpat-`, `glptt-`, `gldt-`,
	`xoxb-`, `xoxp-`, `xoxa-`, `xoxr-`,
	`ya29.`, `1//`,
	`SG.`, `key-`, `token-`, `api_`, `pat_`,
}
|
||||
|
||||
func makeSecretKeyPrefixPattern() string {
|
||||
var b strings.Builder
|
||||
b.WriteString(`(?:`)
|
||||
for i, prefix := range secretKeyPrefixes {
|
||||
if i != 0 {
|
||||
b.WriteRune('|')
|
||||
}
|
||||
b.WriteString(regexp.QuoteMeta(prefix))
|
||||
}
|
||||
b.WriteRune(')')
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func maxPrefixLength() (sz int) {
|
||||
for _, prefix := range secretKeyPrefixes {
|
||||
sz = max(sz, len(prefix))
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
const (
	// minKeyLength is the minimum number of characters (after any recognized
	// prefix) a token needs to be considered as a possible key.
	minKeyLength = 8
	// sensitiveDataThreshold is the per-character entropy cutoff for
	// censoring a token with no recognized prefix: the plain-text baseline
	// plus 80% of the leaked-password delta and 20% of the random-base64
	// delta. (The *MeanEntropy constants are defined elsewhere in this
	// package — presumably measured corpus means; confirm there.)
	sensitiveDataThreshold = textCorpusMeanEntropy + (leakedPasswordMeanEntropy-textCorpusMeanEntropy)*0.8 + (randomBase64MeanEntropy-textCorpusMeanEntropy)*0.2
	// keyPrefixThreshold sits halfway between the plain-text baseline and
	// sensitiveDataThreshold: a recognized secret prefix earns a lower bar.
	keyPrefixThreshold = textCorpusMeanEntropy + (sensitiveDataThreshold-textCorpusMeanEntropy)*0.5
	// censoredStr is what replaces the censored portion of a likely key.
	censoredStr = "********"
)
|
||||
|
||||
var (
	// maxSafeBlindCopySize is the longest trailing run of bytes that could
	// still become a prefixed key match once more input arrives (longest
	// known prefix plus the minimum key length); Transform withholds this
	// much when it is not at EOF.
	maxSafeBlindCopySize = maxPrefixLength() + minKeyLength
	// rePossibleKey matches either a known secret prefix followed by 8+
	// key-ish characters, or any word-bounded run of 8+ non-space characters.
	rePossibleKey = regexp.MustCompile(`\b` + makeSecretKeyPrefixPattern() + `[\w=+,\-]{8,}|\b[^\s]{8,}\b`)
	// reSecretKeyPrefix matches a known secret prefix at the start of a
	// fragment (used on fragments rePossibleKey has already matched).
	reSecretKeyPrefix = regexp.MustCompile(`^` + makeSecretKeyPrefixPattern())
	// censoredBytes is censoredStr as bytes, to avoid per-call conversion.
	censoredBytes = []byte(censoredStr)
)
|
||||
|
||||
func replaceLikelyKeyBytes(fragment []byte) []byte {
|
||||
if match := reSecretKeyPrefix.Find(fragment); len(match) > 0 {
|
||||
entropy, cnt := Bytes(fragment[len(match):])
|
||||
|
||||
if entropy/float32(cnt) >= keyPrefixThreshold {
|
||||
return append(slices.Clip(match), censoredStr...)
|
||||
}
|
||||
} else {
|
||||
entropy, cnt := Bytes(fragment)
|
||||
if entropy/float32(cnt) >= sensitiveDataThreshold {
|
||||
return censoredBytes
|
||||
}
|
||||
}
|
||||
|
||||
return fragment
|
||||
}
|
||||
|
||||
func replaceLikelyKey(fragment string) string {
|
||||
if match := reSecretKeyPrefix.FindString(fragment); match != "" {
|
||||
entropy, cnt := String(fragment[len(match):])
|
||||
|
||||
if entropy/float32(cnt) >= keyPrefixThreshold {
|
||||
return match + censoredStr
|
||||
}
|
||||
} else {
|
||||
entropy, cnt := String(fragment)
|
||||
if entropy/float32(cnt) >= sensitiveDataThreshold {
|
||||
return censoredStr
|
||||
}
|
||||
}
|
||||
|
||||
return fragment
|
||||
}
|
||||
|
||||
// CensorLikelyKeysBytes returns text with every token that looks like a
// secret key replaced by censoredStr. When a token starts with a recognized
// prefix (see secretKeyPrefixes) the prefix is preserved and only the rest
// is masked.
func CensorLikelyKeysBytes(text []byte) []byte {
	return rePossibleKey.ReplaceAllFunc(text, replaceLikelyKeyBytes)
}
|
||||
|
||||
// CensorLikelyKeys returns text with every token that looks like a secret
// key replaced by censoredStr; a recognized prefix is kept and only the
// remainder is masked. String counterpart of CensorLikelyKeysBytes.
func CensorLikelyKeys(text string) string {
	return rePossibleKey.ReplaceAllStringFunc(text, replaceLikelyKey)
}
|
||||
|
||||
// tf is a stateless censoring transformer; its Transform and Reset methods
// implement [transform.Transformer]. Unexported because the implementation
// is known to be buggy (see the note near the commented-out Transformer var).
type tf struct{}
|
||||
|
||||
// trimIncompleteLastRune returns text with any incomplete trailing UTF-8
// sequence removed, so the result always ends on a rune boundary. A complete
// final rune (or a lone invalid byte in the interior position 0) follows the
// same rules as utf8.DecodeRune: only a (RuneError, size 1) decode is
// treated as an incomplete/invalid tail and dropped.
func trimIncompleteLastRune(text []byte) []byte {
	if len(text) == 0 {
		return text
	}

	// Walk back to the start byte of the final rune.
	start := len(text) - 1
	for start > 0 && !utf8.RuneStart(text[start]) {
		start--
	}

	// Keep the final rune only if it decodes cleanly.
	if r, sz := utf8.DecodeRune(text[start:]); sz > 1 || r != utf8.RuneError {
		start += sz
	}

	return text[:start]
}
|
||||
|
||||
// trimLastWordBoundaryNextRuneKnown removes the trailing run of runes that
// belong to the same word-character class (per regexp/syntax.IsWordChar) as
// last, the rune known to follow text. The result therefore ends exactly at
// a \b word boundary relative to last; if all of text is in the same class,
// the empty prefix is returned.
func trimLastWordBoundaryNextRuneKnown(text []byte, last rune) []byte {
	end := len(text)
	sameClass := syntax.IsWordChar(last)

	for end > 0 {
		prev, width := utf8.DecodeLastRune(text[:end])
		if syntax.IsWordChar(prev) != sameClass {
			break
		}
		end -= width
	}

	return text[:end]
}
|
||||
|
||||
func trimLastWordBoundary(text []byte) []byte {
|
||||
last, sz := utf8.DecodeLastRune(text)
|
||||
return trimLastWordBoundaryNextRuneKnown(text[:len(text)-sz], last)
|
||||
}
|
||||
|
||||
// assumes src already ends on a valid cut point
|
||||
// we can't determine that ourselves without knowing the bytes that follow.
|
||||
// if dst is shorter than src, then src is reduced to the same length, and
|
||||
// the possibly incomplete rune and final word boundary is removed prior to copying
|
||||
func copyCompleteWords(dst, src []byte) int {
|
||||
if len(dst) < len(src) {
|
||||
_src := trimIncompleteLastRune(src[:len(dst)])
|
||||
lastRune, _ := utf8.DecodeRune(src[len(_src):])
|
||||
src = trimLastWordBoundaryNextRuneKnown(_src, lastRune)
|
||||
}
|
||||
|
||||
return copy(dst, src)
|
||||
}
|
||||
|
||||
// Transform implements [transform.Transformer]: it copies src to dst while
// censoring likely keys, applying the same entropy tests as
// replaceLikelyKeyBytes to each rePossibleKey match.
//
// When atEOF is false, bytes whose interpretation could change with more
// input are withheld (an incomplete trailing rune, the final regexp match,
// or the last maxSafeBlindCopySize bytes plus a partial word) and
// transform.ErrShortSrc is returned so the caller re-presents them.
//
// NOTE(review): the comments near the commented-out Transformer var and
// Copy say this implementation doesn't always match CensorLikelyKeys;
// treat its boundary handling as suspect until that's resolved.
func (*tf) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	if !atEOF {
		// make sure the last rune hasn't been cut off.
		src = trimIncompleteLastRune(src)
	}

	matches := rePossibleKey.FindAllIndex(src, -1)

	if !atEOF {
		if len(matches) > 0 {
			// ignore the last match; what matches might change if we had more bytes available.
			// we'll also adjust the length of src so that we don't try to copy past where the last match begins
			last := len(matches) - 1
			src = src[:matches[last][0]]
			matches = matches[:last]
		} else {
			// ignore the last maxSafeBlindCopySize bytes,
			// as they could potentially become a match.
			_src := trimIncompleteLastRune(src[:max(0, len(src)-maxSafeBlindCopySize)])
			last, _ := utf8.DecodeRune(src[len(_src):])
			src = trimLastWordBoundaryNextRuneKnown(_src, last)
		}
	}

	for _, m := range matches {
		// Copy the unmatched text leading up to this match; a short copy
		// means dst ran out of room.
		sz := copyCompleteWords(dst[nDst:], src[nSrc:m[0]])
		nDst += sz
		nSrc += sz
		if nSrc != m[0] {
			err = transform.ErrShortDst
			return
		}

		// Same decision logic as replaceLikelyKeyBytes, but writing into dst.
		match := reSecretKeyPrefix.Find(src[m[0]:m[1]])
		censor := true

		if len(match) > 0 {
			// Recognized prefix: judge only the bytes after it, against the
			// lower threshold.
			entropy, cnt := Bytes(src[m[0]+len(match) : m[1]])
			censor = entropy/float32(cnt) >= keyPrefixThreshold
		} else {
			// No recognized prefix: the whole match must clear the higher bar.
			entropy, cnt := Bytes(src[m[0]:m[1]])
			censor = entropy/float32(cnt) >= sensitiveDataThreshold
		}

		if censor {
			if len(dst[nDst:]) < len(match)+len(censoredBytes) {
				err = transform.ErrShortDst
				return
			}

			// Keep the prefix (if any) and mask the rest of the match.
			nDst += copy(dst[nDst:], match)
			nDst += copy(dst[nDst:], censoredBytes)
			nSrc = m[1]
		} else {
			if len(dst[nDst:]) < m[1]-m[0] {
				err = transform.ErrShortDst
				return
			}

			// Below threshold: pass the match through unchanged.
			nDst += copy(dst[nDst:], src[m[0]:m[1]])
			nSrc = m[1]
		}
	}

	// Copy whatever remains after the final match.
	sz := copyCompleteWords(dst[nDst:], src[nSrc:])
	nDst += sz
	nSrc += sz

	if nSrc != len(src) {
		err = transform.ErrShortDst
	} else if !atEOF {
		// We deliberately withheld trailing bytes above; request more input.
		err = transform.ErrShortSrc
	}

	return
}
|
||||
|
||||
// Reset implements [transform.Transformer]. tf carries no state between
// Transform calls, so there is nothing to reset.
func (*tf) Reset() {
	// NOP
}
|
||||
|
||||
// Transformer implements [transform.Transformer], censoring any likely keys it encounters.
|
||||
// Not exporting at this time, as it's bugged and a correct version might require maintaining state.
|
||||
// var Transformer transform.Transformer = (*tf)(nil)
|
||||
|
||||
// Copy copies r to w, censoring any likely keys it encounters.
|
||||
//
|
||||
// WARNING: The current implementation doesn't match the output of [CensorLikelyKeys], so I definitely
|
||||
// have a bug somewhere.
|
||||
func Copy(w io.Writer, r io.Reader) error {
|
||||
_, err := io.Copy(w, transform.NewReader(r, (*tf)(nil)))
|
||||
return err
|
||||
}
|
Reference in New Issue
Block a user