Fumble around with good turing
parent
6b8352ae30
commit
234b40a2e9
@ -0,0 +1,4 @@
|
||||
cp $1 freqhist
|
||||
S <gtanal.S
|
||||
cat gtanal
|
||||
rm freqhist gtanal
|
@ -0,0 +1,26 @@
|
||||
#-*- mode: Fundamental; -*-
|
||||
|
||||
#read in data
|
||||
xm<-matrix(scan("freqhist",0),ncol=2,byrow=T)
|
||||
xr<-xm[,1]
|
||||
xnr<-xm[,2]
|
||||
xn<-sum(xr*xnr)
|
||||
|
||||
# make averaging transform
|
||||
xnrz<-nrzest(xr, xnr)
|
||||
|
||||
# get Linear Good-Turing estimate
|
||||
xf<-lsfit(log(xr), log(xnrz))
|
||||
xcoef<-xf$coef
|
||||
xrst<-rstest(xr,xcoef)
|
||||
xrstrel<-xrst/xr
|
||||
|
||||
# get Turing estimate
|
||||
xrtry<-xr == c(xr[-1] - 1, 0)
|
||||
xrstarel<-rep(0, length(xr))
|
||||
xrstarel[xrtry]<-(xr[xrtry] + 1) / xr[xrtry] * c(xnr[-1], 0) [xrtry] / xnr[xrtry]
|
||||
|
||||
# make switch from Turing to LGT estimates
|
||||
tursd<-rep(1, length(xr))
|
||||
for (i in 1:length(xr)) if (xrtry[i])
|
||||
tursd[i]<-(i+1) / xnr[i] * sqrt(xnr[i+1] * (1 + xnr[i+1] / xnr[i]))
|
@ -0,0 +1,11 @@
|
||||
nrzest<-function(r, nr)
|
||||
{
|
||||
d <- c(1, diff(r))
|
||||
dr <- c(0.5 * (d[-1] + d[ - length(d) ]), d[length(d)])
|
||||
return(nr/dr)
|
||||
}
|
||||
|
||||
rstest<-function(r, coef)
|
||||
{
|
||||
return(r * (1 + 1r)^(1 + coef[2]))
|
||||
}
|
@ -0,0 +1,245 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# Copyright 2009-2011 by Max Bane
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
"""
|
||||
This module provides an implementation of Gale and Sampson's (1995/2001) "Simple
|
||||
Good Turing" algorithm. The main function is simpleGoodTuringProbs(), which
|
||||
takes a dictionary of species counts and returns the estimated population
|
||||
frequencies of the species, as estimated by the Simple Good Turing method. To
|
||||
use this module, you must have scipy and numpy installed.
|
||||
|
||||
Also included is a function that uses pylab and matplotlib to draw a useful
|
||||
scatterplot for comparing the empirical frequencies against the Simple Good
|
||||
Turing estimates.
|
||||
|
||||
Depends on reasonably recent versions of scipy and numpy.
|
||||
|
||||
Version 0.3: June 21, 2011
|
||||
First github version.
|
||||
|
||||
Version 0.2: November 12, 2009.
|
||||
Added __version__ string.
|
||||
Added check for 0 counts.
|
||||
Don't pollute namespace with "import *".
|
||||
Added loglog keyword argument to plotFreqVsGoodTuring().
|
||||
Version 0.1: November 11, 2009.
|
||||
|
||||
REFERENCES:
|
||||
William Gale and Geoffrey Sampson. 1995. Good-Turing frequency estimation
|
||||
without tears. Journal of Quantitative Linguistics, vol. 2, pp. 217--37.
|
||||
|
||||
See also the corrected reprint of same on Sampson's web site.
|
||||
"""
|
||||
|
||||
__version__ = "0.3"
|
||||
|
||||
from scipy import linalg
|
||||
from numpy import c_, exp, log, inf, NaN, sqrt
|
||||
|
||||
def countOfCountsTable(counts, sparse=True):
|
||||
"""
|
||||
Given a dictionary mapping keys (species) to counts, returns a dictionary
|
||||
encoding the corresponding table of counts of counts, i.e., a dictionary
|
||||
that maps a count to the number of species that have that count. If
|
||||
sparse=True (default), counts with zero counts are not included in the
|
||||
returned dictionary.
|
||||
"""
|
||||
if sparse == True:
|
||||
cs = counts.values()
|
||||
else:
|
||||
cs = xrange(1, max(counts.values())+1)
|
||||
|
||||
countsOfCounts = {}
|
||||
for c in cs:
|
||||
countsOfCounts[c] = 0
|
||||
for species, speciesCount in counts.items():
|
||||
if speciesCount == c:
|
||||
countsOfCounts[c] += 1
|
||||
|
||||
return countsOfCounts
|
||||
|
||||
def simpleGoodTuringProbs(counts, confidenceLevel=1.96):
|
||||
"""
|
||||
Given a dictionary mapping keys (species) to counts, returns a dictionary
|
||||
mapping those same species to their smoothed probabilities, according to
|
||||
Gale and Sampson's (1995/2001 reprint) "Simple Good-Turing" method of
|
||||
smoothing. The optional confidenceLevel argument should be a multiplier of
|
||||
the standard deviation of the empirical Turing estimate (default 1.96,
|
||||
corresponding to a 95% confidence interval), a parameter of the algorithm
|
||||
that controls how many datapoints are smoothed loglinearly (see Gale and
|
||||
Sampson 1995).
|
||||
"""
|
||||
# Gale and Sampson (1995/2001 reprint)
|
||||
if 0 in counts.values():
|
||||
raise ValueError('Species must not have 0 count.')
|
||||
totalCounts = float(sum(counts.values())) # N (G&S)
|
||||
countsOfCounts = countOfCountsTable(counts) # r -> n (G&S)
|
||||
sortedCounts = sorted(countsOfCounts.keys())
|
||||
assert(totalCounts == sum([r*n for r,n in countsOfCounts.items()]))
|
||||
|
||||
p0 = countsOfCounts[1] / totalCounts
|
||||
print('p0 = %f' % p0)
|
||||
|
||||
Z = __sgtZ(sortedCounts, countsOfCounts)
|
||||
|
||||
print(f"Z {Z}")
|
||||
# Compute a loglinear regression of Z[r] on r
|
||||
rs = list(Z.keys())
|
||||
zs = list(Z.values())
|
||||
a, b = __loglinregression(rs, zs)
|
||||
|
||||
print(f'{a} {b}')
|
||||
# Gale and Sampson's (1995/2001) "simple" loglinear smoothing method.
|
||||
rSmoothed = {}
|
||||
useY = False
|
||||
for r in sortedCounts:
|
||||
# y is the loglinear smoothing
|
||||
y = float(r+1) * exp(a*log(r+1) + b) / exp(a*log(r) + b)
|
||||
# If we've already started using y as the estimate for r, then
|
||||
# contine doing so; also start doing so if no species was observed
|
||||
# with count r+1.
|
||||
if r+1 not in countsOfCounts:
|
||||
if not useY:
|
||||
print('Warning: reached unobserved count before crossing the '
|
||||
'smoothing threshold.')
|
||||
useY = True
|
||||
|
||||
if useY:
|
||||
rSmoothed[r] = y
|
||||
continue
|
||||
|
||||
# x is the empirical Turing estimate for r
|
||||
x = (float(r+1) * countsOfCounts[r+1]) / countsOfCounts[r]
|
||||
|
||||
Nr = float(countsOfCounts[r])
|
||||
Nr1 = float(countsOfCounts[r+1])
|
||||
|
||||
# t is the width of the 95% (or whatever) confidence interval of the
|
||||
# empirical Turing estimate, assuming independence.
|
||||
t = confidenceLevel * \
|
||||
sqrt(\
|
||||
float(r+1)**2 * (Nr1 / Nr**2) \
|
||||
* (1. + (Nr1 / Nr))\
|
||||
)
|
||||
|
||||
# If the difference between x and y is more than t, then the empirical
|
||||
# Turing estimate x tends to be more accurate. Otherwise, use the
|
||||
# loglinear smoothed value y.
|
||||
if abs(x - y) > t:
|
||||
rSmoothed[r] = x
|
||||
useY = True
|
||||
rSmoothed[r] = y
|
||||
|
||||
# normalize and return the resulting smoothed probabilities, less the
|
||||
# estimated probability mass of unseen species.
|
||||
sgtProbs = {}
|
||||
smoothTot = 0.0
|
||||
for r, rSmooth in rSmoothed.items():
|
||||
smoothTot += countsOfCounts[r] * rSmooth
|
||||
|
||||
print(f"smoothTot {smoothTot} rSmoothed {rSmoothed}")
|
||||
for species, spCount in counts.items():
|
||||
sgtProbs[species] = (1.0 - p0) * (rSmoothed[spCount] / smoothTot)
|
||||
|
||||
print(f"sgtProbs {sgtProbs}")
|
||||
return sgtProbs, p0
|
||||
|
||||
def __sgtZ(sortedCounts, countsOfCounts):
|
||||
# For each count j, set Z[j] to the linear interpolation of i,j,k, where i
|
||||
# is the greatest observed count less than i and k is the smallest observed
|
||||
# count greater than j.
|
||||
Z = {}
|
||||
for (jIdx, j) in enumerate(sortedCounts):
|
||||
if jIdx == 0:
|
||||
i = 0
|
||||
else:
|
||||
i = sortedCounts[jIdx-1]
|
||||
if jIdx == len(sortedCounts)-1:
|
||||
k = 2*j - i
|
||||
else:
|
||||
k = sortedCounts[jIdx+1]
|
||||
Z[j] = 2*countsOfCounts[j] / float(k-i)
|
||||
return Z
|
||||
|
||||
def __loglinregression(rs, zs):
|
||||
coef = linalg.lstsq(c_[log(rs), (1,)*len(rs)], log(zs))[0]
|
||||
a, b = coef
|
||||
print('Regression: log(z) = %f*log(r) + %f' % (a,b))
|
||||
if a > -1.0:
|
||||
print('Warning: slope is > -1.0')
|
||||
return a, b
|
||||
|
||||
|
||||
# Related plotting functions for use in pylab
|
||||
|
||||
def setupTexPlots():
|
||||
"""
|
||||
Optional convenience function that configures matplotlib for TeX-based
|
||||
output, if possible. Depends on matplotlib.
|
||||
"""
|
||||
from matplotlib import rc
|
||||
|
||||
rc('text', usetex=True)
|
||||
rc('text', dvipnghack=True) # for OSX
|
||||
rc('font', family='serif')
|
||||
rc('font', serif=['Computer Modern'])
|
||||
|
||||
def plotFreqVsGoodTuring(counts, confidence=1.96, loglog=False):
|
||||
"""
|
||||
Draws a scatterplot of the empirical frequencies of the counted species
|
||||
versus their Simple Good Turing smoothed values, in rank order. Depends on
|
||||
pylab and matplotlib.
|
||||
"""
|
||||
import pylab
|
||||
from matplotlib import rc
|
||||
|
||||
tot = float(sum(counts.values()))
|
||||
freqs = dict([(species, cnt/tot) for species, cnt in counts.items()])
|
||||
sgt, p0 = simpleGoodTuringProbs(counts, confidence)
|
||||
|
||||
if loglog:
|
||||
plotFunc = pylab.loglog
|
||||
else:
|
||||
plotFunc = pylab.plot
|
||||
plotFunc(sorted(freqs.values(), reverse=True), 'kD', mfc='white',
|
||||
label="Observed")
|
||||
plotFunc(sorted(sgt.values(), reverse=True), 'k+',
|
||||
label="Simple Good-Turing Estimate")
|
||||
pylab.xlim(-0.5, len(freqs)+0.5)
|
||||
pylab.xlabel("Rank")
|
||||
pylab.ylabel("Frequency")
|
||||
pylab.legend(numpoints=1)
|
||||
|
||||
|
||||
def test():
|
||||
i = {
|
||||
1: 32,
|
||||
2: 20,
|
||||
3: 10,
|
||||
4: 3,
|
||||
5: 1,
|
||||
6: 2,
|
||||
7: 1,
|
||||
8: 1,
|
||||
9: 1,
|
||||
10: 2,
|
||||
12: 1,
|
||||
26: 1,
|
||||
}
|
||||
return simpleGoodTuringProbs(i)
|
||||
|
||||
test()
|
Loading…
Reference in New Issue