Fumble around with good turing

main
Eric Ihli 3 years ago
parent 6b8352ae30
commit 234b40a2e9

@ -0,0 +1,4 @@
# Run the S Good-Turing analysis (gtanal.S) on one frequency-histogram file.
# Usage: <this script> FREQHIST_FILE
# The input is copied to the fixed name "freqhist" that gtanal.S reads; the
# script then prints the file "gtanal" and removes both temporary files.
: "${1:?usage: $0 FREQHIST_FILE}"
# Quote the argument so paths containing spaces or glob characters survive.
cp "$1" freqhist
S <gtanal.S
cat gtanal
rm freqhist gtanal

@ -0,0 +1,26 @@
#-*- mode: Fundamental; -*-
# Read the two-column table from "freqhist": column 1 becomes xr, column 2
# becomes xnr (presumably the count r and the number of species N_r seen
# exactly r times -- confirm against the file producer).
xm <- matrix(scan("freqhist", 0), ncol = 2, byrow = TRUE)
xr <- xm[, 1]
xnr <- xm[, 2]
xn <- sum(xr * xnr)
# make averaging transform
xnrz <- nrzest(xr, xnr)
# get Linear Good-Turing estimate
xf <- lsfit(log(xr), log(xnrz))
xcoef <- xf$coef
xrst <- rstest(xr, xcoef)
xrstrel <- xrst / xr
# get Turing estimate
# xnrnext[i] is the N_r of the following row (0 after the last row).
xnrnext <- c(xnr[-1], 0)
# xrtry[i] is TRUE only when the next row holds the count xr[i] + 1.
xrtry <- xr == c(xr[-1] - 1, 0)
xrstarel <- rep(0, length(xr))
xrstarel[xrtry] <- (xr[xrtry] + 1) / xr[xrtry] * xnrnext[xrtry] / xnr[xrtry]
# make switch from Turing to LGT estimates
# Standard deviation of the Turing estimate, left at 1 where it is undefined.
# Uses the count xr[i] + 1 rather than the row index i + 1, so the value is
# also right when the table does not start at r = 1. Vectorised, so an empty
# table no longer trips over 1:length(xr).
tursd <- rep(1, length(xr))
tursd[xrtry] <- (xr[xrtry] + 1) / xnr[xrtry] *
  sqrt(xnrnext[xrtry] * (1 + xnrnext[xrtry] / xnr[xrtry]))

@ -0,0 +1,11 @@
# Averaging transform: divide each nr by half the distance between the
# neighbouring r values on either side of it.
#   r  : numeric vector of counts, in increasing order
#   nr : numeric vector, same length as r
# The gap before the first r is taken to be 1, and the last entry is divided
# by the full gap to its predecessor.
nrzest <- function(r, nr) {
  gaps <- c(1, diff(r))
  last <- length(gaps)
  widths <- c((gaps[-last] + gaps[-1]) * 0.5, gaps[last])
  nr / widths
}
# Smoothed r* from a log-log linear fit: r * (1 + 1/r)^(1 + slope).
#   r    : numeric vector of counts
#   coef : coefficient vector whose second element is the fitted slope
#          (the script passes lsfit(...)$coef)
# The source had "1r", which does not parse; the intended term is 1 / r.
rstest <- function(r, coef) {
  r * (1 + 1 / r)^(1 + coef[2])
}

@ -1114,16 +1114,44 @@
;;;; Accuracy ;;;; Accuracy
(defn lookup-with-backoff
  "Looks `lookup` up in `model`, dropping the last element of the key and
  retrying until a node is found.

  Returns a pair:
  - key exhausted: [model, number of children of model]
  - node found:    [node at (butlast key), second element of (get node [])]"
  [model lookup]
  (loop [ks lookup]
    (let [hit (trie/lookup model ks)]
      (if (empty? ks)
        [model (count (trie/children model))]
        (if hit
          [(trie/lookup model (butlast ks))
           (second (get hit []))]
          (recur (butlast ks)))))))
(defn calc-N
  "Sums, over every child of `node`, the second element of (get child [])."
  [node]
  (->> (trie/children node)
       (map (fn [child] (second (get child []))))
       (reduce +)))
(defn trie-frequencies
  "Returns a sorted map from each value found at (second (get child [])) among
  the children of `node` to the number of children carrying that value."
  [node]
  ;; `into (sorted-map)` already orders the entries by key, so the former
  ;; `vec` + `(sort-by first)` pass was redundant work.
  (->> (trie/children node)
       (map (fn [child] (second (get child []))))
       frequencies
       (into (sorted-map))))
(comment
;; REPL scratch: times calc-N and trie-frequencies against markov-tight-trie.
;; NOTE(review): this is the only visible place where `N` is def'd, and it is
;; inside a `comment` form -- confirm `N` is defined elsewhere before relying
;; on it outside the REPL.
(time (def N (calc-N markov-tight-trie)))
(time (trie-frequencies (trie/lookup markov-tight-trie [107])))
)
(defn mle (defn mle
[model lookup] [model lookup]
(let [node (trie/lookup model lookup) (let [[parent freq] (lookup-with-backoff model lookup)
[_ freq] (get node [] [nil 1]) [_ parent-freq] (get parent [] [nil N])]
parent (trie/lookup model (butlast lookup)) [freq parent-freq]))
[_ parent-freq] (get parent [] [nil 1])]
(/ freq parent-freq)))
(comment (comment
(mle markov-tight-trie [795 68 69]) (mle markov-tight-trie [9095 452 27040])
(count (trie/children markov-tight-trie))
) )
(defn perplexity (defn perplexity
@ -1136,5 +1164,18 @@
(comment (comment
(perplexity markov-tight-trie database 3 "hi there eric how are you") (perplexity markov-tight-trie database 3 "hi there eric how are you")
(database "through") ;; 1924
database
(count database)
(get markov-tight-trie [315 1924])
(->>
(map #(second (get % []))
(trie/children (trie/lookup markov-tight-trie [315])))
frequencies
vec
(sort-by first)
(into (sorted-map)))
) )

@ -144,7 +144,7 @@
err-x-sqr (map #(* % %) err-x) err-x-sqr (map #(* % %) err-x)
m (/ (apply + (map #(apply * %) (map vector err-x err-y))) m (/ (apply + (map #(apply * %) (map vector err-x err-y)))
(apply + err-x-sqr)) (apply + err-x-sqr))
b (/ (- sum-y (* m sum-x)) n)] b (- mean-y (* m mean-x))]
(assert (< m -1) (assert (< m -1)
(format (format
(str "See Good-Turing Without Tears" (str "See Good-Turing Without Tears"
@ -249,7 +249,14 @@
(/ nr1 (Math/pow nr 2)) (/ nr1 (Math/pow nr 2))
(inc (/ nr1 nr))))) (inc (/ nr1 nr)))))
(defn turing-estimate
  "Value of r* such that p = r*/N.
  Alternative to MLE so that p never equals 0.

  Computes (r + 1) * lm(r + 1) / lm(r), where `lm` is a one-argument function.
  NOTE(review): `lm` is called with r and r + 1 as-is; if the model was fitted
  on log values the caller must account for that -- confirm."
  [lm r]
  (let [next-r (inc r)
        ratio (/ (lm next-r) (lm r))]
    (* next-r ratio)))
(defn estimator (defn estimator
"Switches between a Turing estimator and a Linear Good Turing estimator."
[lm rs nrs] [lm rs nrs]
(fn (fn
([x lgt?] ([x lgt?]
@ -315,7 +322,12 @@
(float p0) (float p0)
(map #(* (- 1 p0) (/ % N*)) estimations)) (map #(* (- 1 p0) (/ % N*)) estimations))
sum-probs (apply + probs)] sum-probs (apply + probs)]
[lgts [zrs
lgts
estimations
probs
(apply + probs)
rs
(map (map
(fn [r] (fn [r]
(* (inc r) (/ (lm (inc r)) (lm r)))) (* (inc r) (/ (lm (inc r)) (lm r))))
@ -341,27 +353,8 @@
log-zrs (map #(Math/log %) zrs) log-zrs (map #(Math/log %) zrs)
lm (least-squares-linear-regression log-rs log-zrs) lm (least-squares-linear-regression log-rs log-zrs)
lgts (map lm rs) lgts (map lm rs)
estimations (loop [coll rs r* (partial turing-estimate lm)]
lgt? false [p0 rs lgts (map r* rs) (map #(/ (r* %) N) rs) N]))
e (estimator lm rs zrs)
estimations []]
(cond
(empty? coll) estimations
:else
(let [[estimation lgt?] (e (first coll) lgt?)]
(recur
(rest coll)
lgt?
e
(conj estimations estimation)))))
N* (apply + (map #(apply * %) (map vector nrs estimations)))
probs (cons
(float p0)
(map #(* (- 1 p0) (/ % N*)) estimations))
sum-probs (apply + probs)]
[(cons 0 rs)
(map #(/ % sum-probs) probs)
estimations]))
(comment (comment
(let [rs [ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26] (let [rs [ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26]

@ -0,0 +1,245 @@
#!/usr/bin/env python3
# Copyright 2009-2011 by Max Bane
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
This module provides an implementation of Gale and Sampson's (1995/2001) "Simple
Good Turing" algorithm. The main function is simpleGoodTuringProbs(), which
takes a dictionary of species counts and returns the estimated population
frequencies of the species, as estimated by the Simple Good Turing method. To
use this module, you must have scipy and numpy installed.
Also included is a function that uses pylab and matplotlib to draw a useful
scatterplot for comparing the empirical frequencies against the Simple Good
Turing estimates.
Depends on reasonably recent versions of scipy and numpy.
Version 0.3: June 21, 2011
First github version.
Version 0.2: November 12, 2009.
Added __version__ string.
Added check for 0 counts.
Don't pollute namespace with "import *".
Added loglog keyword argument to plotFreqVsGoodTuring().
Version 0.1: November 11, 2009.
REFERENCES:
William Gale and Geoffrey Sampson. 1995. Good-Turing frequency estimation
without tears. Journal of Quantitative Linguistics, vol. 2, pp. 217--37.
See also the corrected reprint of same on Sampson's web site.
"""
__version__ = "0.3"
from scipy import linalg
from numpy import c_, exp, log, inf, NaN, sqrt
def countOfCountsTable(counts, sparse=True):
    """
    Given a dictionary mapping keys (species) to counts, returns a dictionary
    encoding the corresponding table of counts of counts, i.e., a dictionary
    that maps a count to the number of species that have that count. If
    sparse=True (default), counts with zero counts are not included in the
    returned dictionary. If sparse=False, every integer count from 1 up to the
    largest observed count is present, with 0 for counts no species has.
    """
    # Tally in a single pass instead of rescanning `counts` once per count.
    tally = {}
    for speciesCount in counts.values():
        tally[speciesCount] = tally.get(speciesCount, 0) + 1
    if sparse:
        return tally

    # `xrange` does not exist on Python 3; `range` is the equivalent.
    # An empty input yields an empty table instead of max() raising.
    highest = max(counts.values(), default=0)
    return {c: tally.get(c, 0) for c in range(1, highest + 1)}
def simpleGoodTuringProbs(counts, confidenceLevel=1.96):
    """
    Given a dictionary mapping keys (species) to counts, returns a tuple
    (sgtProbs, p0): sgtProbs maps those same species to their smoothed
    probabilities, according to Gale and Sampson's (1995/2001 reprint) "Simple
    Good-Turing" method of smoothing, and p0 is the probability mass reserved
    for unseen species. The optional confidenceLevel argument should be a
    multiplier of the standard deviation of the empirical Turing estimate
    (default 1.96, corresponding to a 95% confidence interval), a parameter of
    the algorithm that controls how many datapoints are smoothed loglinearly
    (see Gale and Sampson 1995).

    Raises ValueError if counts is empty or any species has a count of 0.
    """
    # Gale and Sampson (1995/2001 reprint)
    if not counts:
        raise ValueError('counts must not be empty.')
    if 0 in counts.values():
        raise ValueError('Species must not have 0 count.')
    totalCounts = float(sum(counts.values()))   # N (G&S)
    countsOfCounts = countOfCountsTable(counts) # r -> n (G&S)
    sortedCounts = sorted(countsOfCounts.keys())
    assert(totalCounts == sum([r*n for r,n in countsOfCounts.items()]))

    # No singletons means no mass is reserved for unseen species.
    p0 = countsOfCounts.get(1, 0) / totalCounts
    print('p0 = %f' % p0)

    Z = __sgtZ(sortedCounts, countsOfCounts)
    print(f"Z {Z}")

    # Compute a loglinear regression of Z[r] on r
    rs = list(Z.keys())
    zs = list(Z.values())
    a, b = __loglinregression(rs, zs)
    print(f'{a} {b}')

    # Gale and Sampson's (1995/2001) "simple" loglinear smoothing method.
    rSmoothed = {}
    useY = False
    for r in sortedCounts:
        # y is the loglinear smoothing
        y = float(r+1) * exp(a*log(r+1) + b) / exp(a*log(r) + b)

        # If we've already started using y as the estimate for r, then
        # continue doing so; also start doing so if no species was observed
        # with count r+1.
        if r+1 not in countsOfCounts:
            if not useY:
                print('Warning: reached unobserved count before crossing the '
                      'smoothing threshold.')
            useY = True

        if useY:
            rSmoothed[r] = y
            continue

        # x is the empirical Turing estimate for r
        x = (float(r+1) * countsOfCounts[r+1]) / countsOfCounts[r]

        Nr = float(countsOfCounts[r])
        Nr1 = float(countsOfCounts[r+1])

        # t is the width of the 95% (or whatever) confidence interval of the
        # empirical Turing estimate, assuming independence.
        t = confidenceLevel * \
            sqrt(\
                float(r+1)**2 * (Nr1 / Nr**2) \
                              * (1. + (Nr1 / Nr))\
            )

        # If the difference between x and y is more than t, then the empirical
        # Turing estimate x tends to be more accurate. Otherwise, use the
        # loglinear smoothed value y, and keep using it for all larger r.
        # (Without the `else`, x was never kept.)
        if abs(x - y) > t:
            rSmoothed[r] = x
        else:
            useY = True
            rSmoothed[r] = y

    # normalize and return the resulting smoothed probabilities, less the
    # estimated probability mass of unseen species.
    sgtProbs = {}
    smoothTot = 0.0
    for r, rSmooth in rSmoothed.items():
        smoothTot += countsOfCounts[r] * rSmooth
    print(f"smoothTot {smoothTot} rSmoothed {rSmoothed}")
    for species, spCount in counts.items():
        sgtProbs[species] = (1.0 - p0) * (rSmoothed[spCount] / smoothTot)
    print(f"sgtProbs {sgtProbs}")
    return sgtProbs, p0
def __sgtZ(sortedCounts, countsOfCounts):
    """
    Returns a dict mapping each count j in sortedCounts to
    2 * countsOfCounts[j] / (k - i), where i is the greatest observed count
    less than j (0 for the first entry) and k is the smallest observed count
    greater than j (2*j - i for the last entry).
    """
    Z = {}
    lastIdx = len(sortedCounts) - 1
    for idx, count in enumerate(sortedCounts):
        below = sortedCounts[idx - 1] if idx != 0 else 0
        above = sortedCounts[idx + 1] if idx != lastIdx else 2*count - below
        Z[count] = 2*countsOfCounts[count] / float(above - below)
    return Z
def __loglinregression(rs, zs):
    """
    Least-squares fit of log(z) = a*log(r) + b. Prints the fitted line, warns
    on stdout when the slope a is greater than -1.0, and returns (a, b).
    """
    # Design matrix: one column of log(r), one column of ones for the intercept.
    design = c_[log(rs), (1,)*len(rs)]
    solution = linalg.lstsq(design, log(zs))
    a, b = solution[0]
    print('Regression: log(z) = %f*log(r) + %f' % (a,b))
    if a > -1.0:
        print('Warning: slope is > -1.0')
    return a, b
# Related plotting functions for use in pylab
def setupTexPlots():
    """
    Optional convenience function that configures matplotlib for TeX-based
    output, if possible. Depends on matplotlib.
    """
    from matplotlib import rc
    rc('text', usetex=True)
    try:
        rc('text', dvipnghack=True) # for OSX
    except KeyError:
        # Newer matplotlib releases no longer define text.dvipnghack and
        # reject the unknown key; the workaround is simply skipped there.
        pass
    rc('font', family='serif')
    rc('font', serif=['Computer Modern'])
def plotFreqVsGoodTuring(counts, confidence=1.96, loglog=False):
    """
    Draws a scatterplot of the empirical frequencies of the counted species
    versus their Simple Good Turing smoothed values, in rank order. Depends on
    pylab and matplotlib.

    counts is a dictionary mapping species to counts, confidence is passed to
    simpleGoodTuringProbs() as its confidenceLevel, and loglog=True draws on
    log-log axes instead of linear ones.
    """
    import pylab
    tot = float(sum(counts.values()))
    freqs = {species: cnt/tot for species, cnt in counts.items()}
    # Only the smoothed probabilities are plotted; the unseen mass is unused.
    sgt, _ = simpleGoodTuringProbs(counts, confidence)
    plotFunc = pylab.loglog if loglog else pylab.plot
    plotFunc(sorted(freqs.values(), reverse=True), 'kD', mfc='white',
             label="Observed")
    plotFunc(sorted(sgt.values(), reverse=True), 'k+',
             label="Simple Good-Turing Estimate")
    pylab.xlim(-0.5, len(freqs)+0.5)
    pylab.xlabel("Rank")
    pylab.ylabel("Frequency")
    pylab.legend(numpoints=1)
def test():
    """
    Smoke test: runs simpleGoodTuringProbs() on a small hard-coded table and
    returns its result.
    """
    # NOTE(review): these look like r -> N_r (count-of-counts) pairs, but
    # simpleGoodTuringProbs() treats its argument as species -> count.
    # Confirm which one is intended.
    i = {
        1: 32,
        2: 20,
        3: 10,
        4: 3,
        5: 1,
        6: 2,
        7: 1,
        8: 1,
        9: 1,
        10: 2,
        12: 1,
        26: 1,
    }
    return simpleGoodTuringProbs(i)


# Run the smoke test only when executed as a script, not on every import.
if __name__ == "__main__":
    test()

@ -4,27 +4,7 @@
[clojure.test :as t :refer [deftest is testing use-fixtures]] [clojure.test :as t :refer [deftest is testing use-fixtures]]
[clojure.java.io :as io])) [clojure.java.io :as io]))
(def train-corpus
(with-open [reader (io/reader (io/resource "dark-corpus-train.txt"))]
(->> (line-seq reader) doall)))
(def test-corpus
(with-open [reader (io/reader (io/resource "dark-corpus-test.txt"))]
(->> (line-seq reader) doall)))
(def test-sentence (first test-corpus))
(def test-tokens
(sgt/pad-tokens (sgt/tokenize-line test-sentence) 1))
(def train-trie
(sgt/lines->trie train-corpus 3))
(def sgt-model
(sgt/simple-good-turing train-trie))
(def vocab
(into #{} (remove #{:count} (keys train-trie))))
(def maps-for-sgt (sgt/maps-for-simple-good-turing train-trie)) (def maps-for-sgt (sgt/maps-for-simple-good-turing train-trie))

@ -0,0 +1,103 @@
(ns com.owoga.prhyme.util.math-test
(:require [com.owoga.prhyme.util.math :as math]
[clojure.test :as t]))
(defn approx=
  "True when `a` and `b` differ by no more than the tolerance `e`."
  [a b e]
  (let [distance (Math/abs (- a b))]
    (>= e distance)))
;; Following the work in Church and Gale [1991], we average with each
;; non-zero Nr the zero Nr's that surround it: order the non-zero Nr by r, and
;; let q, r, and t be successive indices of non-zero values. We replace Nr by
;; Zr = Nr / (0.5 * (t - q)). In other words we estimate the expected Nr by the
;; density of Nr for large r. For small r, there is no difference, because the
;; length of the intervals is unity. For large r, the change can make a
;; difference of several orders of magnitude.
(t/deftest averaging-consecutives
  (t/testing "averaging consecutives"
    (let [r-coll [1 2 3 5 10]
          nr-coll [20 10 5 1 2]
          zr-coll (math/average-consecutives r-coll nr-coll)]
      ;; Each row: index into zr-coll, Nr at that index, previous r, next r.
      ;; The last row uses 15 as the next r (2 * 10 - 5). Index 0 is not checked.
      (t/are [idx nr prev-r next-r]
             (approx= (nth zr-coll idx)
                      (/ nr (* 0.5 (- next-r prev-r)))
                      1e-4)
        1 10 1 3
        2 5  2 5
        3 1  3 10
        4 2  5 15))))
;;;; Values from LibreOffice data -> statistics -> regression
;; 18.1311501368169
;; 7.85117996785167
;; 4.81179001426509
;; 2.59672962119497
;; 1.12444006199334
(t/deftest linear-regression
  (t/testing "The results of the linear regression model are accurate"
    (let [r-coll [1 2 3 5 10]
          zr-coll [20 10 5 1 2] ;; not really smoothed, but smoothing isn't under test
          natural-log #(Math/log %)
          log-r (map natural-log r-coll)
          log-zr (map natural-log zr-coll)
          linear-model (math/least-squares-linear-regression log-r log-zr)
          ;; The model is fitted in log space, so map predictions back with e^x.
          predicted (map #(Math/pow Math/E (linear-model %)) log-r)
          expected '(18.13
                     7.85
                     4.81
                     2.59
                     1.12)]
      (t/is (every?
             true?
             (map (fn [want got] (approx= want got 0.01))
                  expected
                  predicted))))))
;; The below passes a sanity check in that each r* is slightly less than r.
;; The whole form is disabled with the #_ reader macro, so it never runs.
;; NOTE(review): the assertion compares against [] and reads as a placeholder;
;; fill in real expected r* values before enabling it.
#_(t/deftest turing-estimation
(t/testing "turing estimation - r*"
(let [r-coll [1 2 3 5 10]
nr-coll [20 10 5 1 2]
zr-coll (math/average-consecutives r-coll nr-coll)
log-r (map #(Math/log %) r-coll)
log-zr (map #(Math/log %) zr-coll)
linear-model (math/least-squares-linear-regression log-r log-zr)]
(t/is (= [] (map
(partial math/turing-estimate linear-model)
r-coll))))))
(t/deftest simple-good-turing-estimator
  (t/testing "The simple good turing estimator switches between linear and turing"
    (let [r-coll [1 2 3 5 10]
          zr-coll [20 10 5 1 2] ;; not smoothed, but smoothing isn't under test
          natural-log #(Math/log %)
          log-r (map natural-log r-coll)
          log-zr (map natural-log zr-coll)
          linear-model (math/least-squares-linear-regression log-r log-zr)
          sgt-estimator (math/estimator linear-model r-coll zr-coll)
          ;; Feed each r to the estimator, passing along the lgt? flag it
          ;; returned for the previous r, and collect the estimates.
          sgt-estimates (loop [pending r-coll
                               lgt? false
                               r* []]
                          (if (empty? pending)
                            r*
                            (let [[y next-lgt?] (sgt-estimator (first pending) lgt?)]
                              (recur (rest pending) next-lgt? (conj r* y)))))
          ;; NOTE(review): same expected values as the linear-regression test;
          ;; confirm they are the intended estimator outputs.
          expected '(18.13
                     7.85
                     4.81
                     2.59
                     1.12)]
      (println zr-coll)
      (println (map linear-model r-coll))
      (println sgt-estimates)
      (t/is (every?
             true?
             (map (fn [want got] (approx= want got 0.01))
                  expected
                  sgt-estimates))))))
Loading…
Cancel
Save