113 lines
2.3 KiB
Go
113 lines
2.3 KiB
Go
|
//go:build ignore
|
||
|
|
||
|
package main
|
||
|
|
||
|
import (
|
||
|
"bufio"
|
||
|
"fmt"
|
||
|
"io"
|
||
|
"log"
|
||
|
"math"
|
||
|
"os"
|
||
|
"regexp/syntax"
|
||
|
)
|
||
|
|
||
|
// Bounds of the rune alphabet tracked by the transition matrix.
const (
	// runeStart is the lowest rune that gets its own row and column
	// (the ASCII space character).
	runeStart rune = ' '

	// runeLast is the highest tracked rune (ASCII DEL). getCounts also uses
	// it as the stand-in for every rune outside [runeStart, runeLast] and as
	// the assumed predecessor of the first rune of the input.
	runeLast rune = 0x7F

	// sz is the number of runes in the inclusive range [runeStart, runeLast],
	// i.e. the side length of the square transition matrix.
	sz = int(runeLast - runeStart + 1)
)
|
||
|
|
||
|
func getCounts(r io.Reader) (matrix [sz * sz]uint64, err error) {
|
||
|
b := bufio.NewReader(r)
|
||
|
prev := runeLast
|
||
|
for {
|
||
|
r, _, err := b.ReadRune()
|
||
|
|
||
|
if err == io.EOF {
|
||
|
return matrix, nil
|
||
|
}
|
||
|
|
||
|
if err != nil {
|
||
|
return matrix, err
|
||
|
}
|
||
|
|
||
|
if r < runeStart || r > runeLast {
|
||
|
r = runeLast
|
||
|
}
|
||
|
|
||
|
matrix[int(prev-runeStart)*sz+int(r-runeStart)]++
|
||
|
|
||
|
if prev != runeLast && !syntax.IsWordChar(prev) && syntax.IsWordChar(r) {
|
||
|
// when transitioning to a non-word to a word, train this as if the previous
|
||
|
// rune were lastRune, as this seems like a sensible way to start a piece of random text.
|
||
|
matrix[(sz-1)*sz+int(r-runeStart)]++
|
||
|
}
|
||
|
|
||
|
prev = r
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func calcEntropy(in [sz * sz]uint64) (out [sz * sz]float64) {
|
||
|
var fallbackCount [sz]uint64
|
||
|
var fallbackSum uint64
|
||
|
|
||
|
for i, cnt := range in {
|
||
|
fallbackCount[i%sz] += cnt
|
||
|
fallbackSum += cnt
|
||
|
}
|
||
|
|
||
|
for row := range sz {
|
||
|
var sum uint64
|
||
|
|
||
|
for _, c := range in[row*sz : row*sz+sz] {
|
||
|
sum += c
|
||
|
}
|
||
|
|
||
|
for column, c := range in[row*sz : row*sz+sz] {
|
||
|
p := (float64(2*(fallbackSum+1))*float64(c) + float64(2*fallbackCount[column]+1)) / float64(float64(2*(fallbackSum+1))*float64(sum+1))
|
||
|
if p <= 0 || p >= 1 {
|
||
|
log.Printf("c=%d fallbackSum=%d fallbackCount[column]=%d sum=%d\n", c, fallbackSum, fallbackCount[column], sum)
|
||
|
log.Fatalf("got p=%f for %c->%c\n", p, runeStart+rune(row), runeStart+rune(column))
|
||
|
}
|
||
|
|
||
|
out[row*sz+column] = -math.Log2(p)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return
|
||
|
}
|
||
|
|
||
|
func calcMeanEntropy(counts [sz * sz]uint64, entropies [sz * sz]float64) float64 {
|
||
|
var sumEntropy float64
|
||
|
var sumCount uint64
|
||
|
for i, c := range counts {
|
||
|
sumCount += c
|
||
|
sumEntropy += entropies[i] * float64(c)
|
||
|
}
|
||
|
return sumEntropy / float64(sumCount)
|
||
|
}
|
||
|
|
||
|
func main() {
|
||
|
cnts, err := getCounts(os.Stdin)
|
||
|
if err != nil {
|
||
|
log.Fatal(err)
|
||
|
}
|
||
|
entropy := calcEntropy(cnts)
|
||
|
|
||
|
fmt.Printf("var matrix = [%d*%d]float32{", sz, sz)
|
||
|
for row := range sz {
|
||
|
for i, v := range entropy[row*sz : row*sz+sz] {
|
||
|
if i == 0 {
|
||
|
fmt.Printf("\n ")
|
||
|
} else {
|
||
|
fmt.Printf(" ")
|
||
|
}
|
||
|
|
||
|
fmt.Printf("%f,", v)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
fmt.Printf("}\n\nconst meanEntropy = %f\n", calcMeanEntropy(cnts, entropy))
|
||
|
}
|