Fumble around with good turing

4 years ago · 234b40a2e9
parent 6b8352ae30
commit 234b40a2e9
8 changed files with 452 additions and 49 deletions
--- a/sgt/gt.s
+++ b/sgt/gt.s
@ -0,0 +1,4 @@
+    # Driver for the Good-Turing analysis.
+    # Copies the frequency histogram named by the first argument to the fixed
+    # file name "freqhist" (the name gtanal.S scans), feeds gtanal.S to the S
+    # interpreter, prints the file "gtanal", and removes both scratch files.
+    # NOTE(review): nothing in the visible gtanal.S writes a file named
+    # "gtanal" -- confirm where it is produced before relying on cat/rm.
+    # "$1" is quoted so a path containing spaces or glob characters is passed
+    # to cp as a single argument.
+    cp "$1" freqhist
+    S <gtanal.S
+    cat gtanal
+    rm freqhist gtanal
--- a/sgt/gtanal.S
+++ b/sgt/gtanal.S
@ -0,0 +1,26 @@
+#-*- mode: Fundamental; -*-
+
+# Good-Turing analysis script. Reads a two-column table from the file
+# "freqhist", fits a log-log regression to the averaged counts, and computes
+# both the Linear Good-Turing and the raw Turing relative estimates plus a
+# per-row standard-deviation term used to choose between them.
+# NOTE(review): nrzest() and rstest() are not defined in this script; they
+# must already be loaded (gtfunc.S defines functions with these names).
+# NOTE(review): the script stops after computing tursd -- no visible line
+# performs the switch or writes any output.
+
+#read in data
+# Column 1 -> xr (observed frequency r), column 2 -> xnr (count of r);
+# xn is the sum of r * Nr over all rows.
+xm<-matrix(scan("freqhist",0),ncol=2,byrow=T)
+xr<-xm[,1]
+xnr<-xm[,2]
+xn<-sum(xr*xnr)
+
+# make averaging transform
+xnrz<-nrzest(xr, xnr)
+
+# get Linear Good-Turing estimate
+# Least-squares fit of log(xnrz) on log(xr); xrstrel is the smoothed r*
+# returned by rstest() divided by r.
+xf<-lsfit(log(xr), log(xnrz))
+xcoef<-xf$coef
+xrst<-rstest(xr,xcoef)
+xrstrel<-xrst/xr
+
+# get Turing estimate
+# xrtry[i] is TRUE when the next row's r equals xr[i] + 1; the last row is
+# compared against 0. Only those rows get a Turing ratio
+# ((r + 1) / r) * N(r+1) / N(r); all other rows stay 0.
+xrtry<-xr == c(xr[-1] - 1, 0)
+xrstarel<-rep(0, length(xr))
+xrstarel[xrtry]<-(xr[xrtry] + 1) / xr[xrtry] * c(xnr[-1], 0) [xrtry] / xnr[xrtry]
+
+# make switch from Turing to LGT estimates
+# tursd defaults to 1 and is overwritten only where xrtry is TRUE.
+# NOTE(review): the factor is (i+1), i.e. the row index, not (xr[i]+1); the
+# two agree only while xr[i] == i (rows 1, 2, 3, ... with no gap) -- confirm
+# this is intended.
+tursd<-rep(1, length(xr))
+for (i in 1:length(xr)) if (xrtry[i])
+    tursd[i]<-(i+1) / xnr[i] * sqrt(xnr[i+1] * (1 + xnr[i+1] / xnr[i]))
--- a/sgt/gtfunc.S
+++ b/sgt/gtfunc.S
@ -0,0 +1,11 @@
+    # Averaging transform: divide each count nr[j] by the width of the
+    # interval of r values it stands for. The gap in front of the first r is
+    # taken as 1; an interior width is the mean of the gaps on either side
+    # of r[j]; the last width is just the gap in front of the last r.
+    nrzest<-function(r, nr)
+    {
+        gaps <- c(1, diff(r))
+        last <- length(gaps)
+        widths <- c(0.5 * (gaps[-1] + gaps[-last]), gaps[last])
+        nr / widths
+    }
+
+    # Linear Good-Turing estimate of r* from a log-log fit.
+    #   r    : vector of observed frequencies
+    #   coef : fit coefficients; coef[2] is used as the slope b
+    # With S(r) proportional to r^b, r* = (r + 1) * S(r + 1) / S(r)
+    #                                   = r * (1 + 1/r)^(1 + b).
+    # The base must be (1 + 1/r); the previous text "1r" is not a valid
+    # expression.
+    rstest<-function(r, coef)
+    {
+        return(r * (1 + 1/r)^(1 + coef[2]))
+    }
--- a/src/com/owoga/corpus/markov.clj
+++ b/src/com/owoga/corpus/markov.clj
@ -1114,16 +1114,44 @@

 ;;;; Accuracy

+(defn lookup-with-backoff
+  "Looks `lookup` up in `model`, dropping the last element of the key and
+  retrying until a node is found or the key is empty.
+  Returns a pair:
+  - key exhausted: [model (number of children of model)]
+  - node found:    [node at (butlast key), second element of the node's value at []]"
+  [model lookup]
+  (loop [path lookup]
+    (let [hit (trie/lookup model path)
+          shorter (butlast path)]
+      (if (empty? path)
+        [model (count (trie/children model))]
+        (if hit
+          [(trie/lookup model shorter) (second (get hit []))]
+          (recur shorter))))))
+
+(defn calc-N
+  "Sums, over the direct children of `node`, the second element of each
+  child's value at key []."
+  [node]
+  (->> (trie/children node)
+       (map (fn [child] (second (get child []))))
+       (apply +)))
+
+(defn trie-frequencies
+  "Returns a sorted map from a value to the number of direct children of
+  `node` carrying that value, where a child's value is the second element of
+  its entry at key []."
+  [node]
+  (let [child-values (map (fn [child] (second (get child [])))
+                          (trie/children node))]
+    ;; The sorted map orders its own keys, so no explicit sort is needed.
+    (into (sorted-map) (frequencies child-values))))
+
+(comment
+  ;; REPL scratch: forms inside `comment` are not evaluated when the file loads.
+  ;; NOTE(review): `N` is defined only by evaluating the next form by hand,
+  ;; yet `mle` below refers to `N` as a default.
+  (time (def N (calc-N markov-tight-trie)))
+  (time (trie-frequencies (trie/lookup markov-tight-trie [107])))
+  )
+
 (defn mle
   [model lookup]
+  ;; Resolves `lookup` through lookup-with-backoff and returns the pair
+  ;; [freq parent-freq]; the pair is returned as-is, not divided.
+  ;; parent-freq is the second element of the parent's value at key [],
+  ;; falling back to `N` when the parent has no entry there.
+  ;; NOTE(review): in the visible code `N` is only def'd inside a
+  ;; (comment ...) form, so it is unbound unless that form was evaluated
+  ;; manually or N is defined elsewhere in this file -- confirm.
-  (let [node (trie/lookup model lookup)
-        [_ freq] (get node [] [nil 1])
-        parent (trie/lookup model (butlast lookup))
-        [_ parent-freq] (get parent [] [nil 1])]
-    (/ freq parent-freq)))
+  (let [[parent freq] (lookup-with-backoff model lookup)
+        [_ parent-freq] (get parent [] [nil N])]
+    [freq parent-freq]))

 (comment
+  ;; REPL scratch: try `mle` on a three-element key, then count the
+  ;; top-level children of the trie (the value lookup-with-backoff returns
+  ;; as the frequency once the key is exhausted).
-  (mle markov-tight-trie [795 68 69])
+  (mle markov-tight-trie [9095 452 27040])
+
+  (count (trie/children markov-tight-trie))
+
   )

 (defn perplexity
@ -1136,5 +1164,18 @@

 (comment
+  ;; REPL scratch for inspecting perplexity inputs; not evaluated on load.
   (perplexity markov-tight-trie database 3 "hi there eric how are you")
+  (database "through") ;; 1924
+  database
+
+  (count database)
+
+  (get markov-tight-trie [315 1924])
+  ;; Frequency-of-frequencies table for the children of node [315]: the same
+  ;; pipeline as trie-frequencies, written out inline.
+  (->>
+   (map #(second (get % []))
+        (trie/children (trie/lookup markov-tight-trie [315])))
+   frequencies
+   vec
+   (sort-by first)
+   (into (sorted-map)))

   )
--- a/src/com/owoga/prhyme/util/math.clj
+++ b/src/com/owoga/prhyme/util/math.clj
@ -144,7 +144,7 @@
        err-x-sqr (map #(* % %) err-x)
        m (/ (apply + (map #(apply * %) (map vector err-x err-y)))
             (apply + err-x-sqr))
-        b (/ (- sum-y (* m sum-x)) n)]
+        b (- mean-y (* m mean-x))]
    (assert (< m -1)
            (format
             (str "See Good-Turing Without Tears"
@ -249,7 +249,14 @@
      (/ nr1 (Math/pow nr 2))
      (inc (/ nr1 nr)))))

+(defn turing-estimate
+  "Turing estimate r* for the count `r`, computed as
+  (r + 1) * lm(r + 1) / lm(r), where `lm` is called as a function of a count.
+  r* is the value such that pᵣ = r*/N; it is an alternative to the MLE so
+  that pᵣ never equals 0."
+  [lm r]
+  (let [next-r (inc r)
+        ratio (/ (lm next-r) (lm r))]
+    (* next-r ratio)))
+
 (defn estimator
+  "Switches between a Turing estimator and a Linear Good Turing estimator."
  [lm rs nrs]
  (fn
    ([x lgt?]
@ -315,7 +322,12 @@
               (float p0)
               (map #(* (- 1 p0) (/ % N*)) estimations))
        sum-probs (apply + probs)]
-    [lgts
+    [zrs
+     lgts
+     estimations
+     probs
+     (apply + probs)
+     rs
     (map
        (fn [r]
          (* (inc r) (/ (lm (inc r)) (lm r))))
@ -341,27 +353,8 @@
        log-zrs (map #(Math/log %) zrs)
        lm (least-squares-linear-regression log-rs log-zrs)
        lgts (map lm rs)
-        estimations (loop [coll rs
-                           lgt? false
-                           e (estimator lm rs zrs)
-                           estimations []]
-                      (cond
-                        (empty? coll) estimations
-                        :else
-                        (let [[estimation lgt?] (e (first coll) lgt?)]
-                          (recur
-                           (rest coll)
-                           lgt?
-                           e
-                           (conj estimations estimation)))))
-        N* (apply + (map #(apply * %) (map vector nrs estimations)))
-        probs (cons
-               (float p0)
-               (map #(* (- 1 p0) (/ % N*)) estimations))
-        sum-probs (apply + probs)]
-    [(cons 0 rs)
-     (map #(/ % sum-probs) probs)
-     estimations]))
+        r* (partial turing-estimate lm)]
+    [p0 rs lgts (map r* rs) (map #(/ (r* %) N) rs) N]))

 (comment
  (let [rs  [ 1  2  3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26]
--- a/src/com/owoga/prhyme/util/sgt.py
+++ b/src/com/owoga/prhyme/util/sgt.py
@ -0,0 +1,245 @@
+#!/usr/bin/env python3
+
+# Copyright 2009-2011 by Max Bane
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+"""
+This module provides an implementation of Gale and Sampson's (1995/2001) "Simple
+Good Turing" algorithm. The main function is simpleGoodTuringProbs(), which
+takes a dictionary of species counts and returns the estimated population
+frequencies of the species, as estimated by the Simple Good Turing method. To
+use this module, you must have scipy and numpy installed.
+
+Also included is a function that uses pylab and matplotlib to draw a useful
+scatterplot for comparing the empirical frequencies against the Simple Good
+Turing estimates.
+
+Depends on reasonably recent versions of scipy and numpy.
+
+Version 0.3: June 21, 2011
+    First github version.
+
+Version 0.2: November 12, 2009.
+    Added __version__ string.
+    Added check for 0 counts.
+    Don't pollute namespace with "import *".
+    Added loglog keyword argument to plotFreqVsGoodTuring().
+Version 0.1: November 11, 2009.
+
+REFERENCES:
+    William Gale and Geoffrey Sampson. 1995. Good-Turing frequency estimation
+    without tears. Journal of Quantitative Linguistics, vol. 2, pp. 217--37.
+
+    See also the corrected reprint of same on Sampson's web site.
+"""
+
+__version__ = "0.3"
+
+from scipy import linalg
+from numpy import c_, exp, log, inf, NaN, sqrt
+
+def countOfCountsTable(counts, sparse=True):
+    """
+    Given a dictionary mapping keys (species) to counts, returns a dictionary
+    encoding the corresponding table of counts of counts, i.e., a dictionary
+    that maps a count to the number of species that have that count. If
+    sparse=True (default), counts with zero counts are not included in the
+    returned dictionary.
+    """
+    if sparse == True:
+        cs = counts.values()
+    else:
+        cs = xrange(1, max(counts.values())+1)
+
+    countsOfCounts = {}
+    for c in cs:
+        countsOfCounts[c] = 0
+        for species, speciesCount in counts.items():
+            if speciesCount == c:
+                countsOfCounts[c] += 1
+
+    return countsOfCounts
+
+def simpleGoodTuringProbs(counts, confidenceLevel=1.96):
+    """
+    Given a dictionary mapping keys (species) to counts, returns a dictionary
+    mapping those same species to their smoothed probabilities, according to
+    Gale and Sampson's (1995/2001 reprint) "Simple Good-Turing" method of
+    smoothing. The optional confidenceLevel argument should be a multiplier of
+    the standard deviation of the empirical Turing estimate (default 1.96,
+    corresponding to a 95% confidence interval), a parameter of the algorithm
+    that controls how many datapoints are smoothed loglinearly (see Gale and
+    Sampson 1995).
+    """
+    # Gale and Sampson (1995/2001 reprint)
+    if 0 in counts.values():
+        raise ValueError('Species must not have 0 count.')
+    totalCounts = float(sum(counts.values()))   # N (G&S)
+    countsOfCounts = countOfCountsTable(counts) # r -> n (G&S)
+    sortedCounts = sorted(countsOfCounts.keys())
+    assert(totalCounts == sum([r*n for r,n in countsOfCounts.items()]))
+
+    p0 = countsOfCounts[1] / totalCounts
+    print('p0 = %f' % p0)
+
+    Z = __sgtZ(sortedCounts, countsOfCounts)
+
+    print(f"Z {Z}")
+    # Compute a loglinear regression of Z[r] on r
+    rs = list(Z.keys())
+    zs = list(Z.values())
+    a, b = __loglinregression(rs, zs)
+
+    print(f'{a} {b}')
+    # Gale and Sampson's (1995/2001) "simple" loglinear smoothing method.
+    rSmoothed = {}
+    useY = False
+    for r in sortedCounts:
+        # y is the loglinear smoothing
+        y = float(r+1) * exp(a*log(r+1) + b) / exp(a*log(r) + b)
+        # If we've already started using y as the estimate for r, then
+        # contine doing so; also start doing so if no species was observed
+        # with count r+1.
+        if r+1 not in countsOfCounts:
+            if not useY:
+                print('Warning: reached unobserved count before crossing the '
+                      'smoothing threshold.')
+            useY = True
+
+        if useY:
+            rSmoothed[r] = y
+            continue
+
+        # x is the empirical Turing estimate for r
+        x = (float(r+1) * countsOfCounts[r+1]) / countsOfCounts[r]
+
+        Nr = float(countsOfCounts[r])
+        Nr1 = float(countsOfCounts[r+1])
+
+        # t is the width of the 95% (or whatever) confidence interval of the
+        # empirical Turing estimate, assuming independence.
+        t = confidenceLevel * \
+            sqrt(\
+                float(r+1)**2 * (Nr1 / Nr**2) \
+                              * (1. + (Nr1 / Nr))\
+            )
+
+        # If the difference between x and y is more than t, then the empirical
+        # Turing estimate x tends to be more accurate. Otherwise, use the
+        # loglinear smoothed value y.
+        if abs(x - y) > t:
+            rSmoothed[r] = x
+        useY = True
+        rSmoothed[r] = y
+
+    # normalize and return the resulting smoothed probabilities, less the
+    # estimated probability mass of unseen species.
+    sgtProbs = {}
+    smoothTot = 0.0
+    for r, rSmooth in rSmoothed.items():
+        smoothTot += countsOfCounts[r] * rSmooth
+
+    print(f"smoothTot {smoothTot} rSmoothed {rSmoothed}")
+    for species, spCount in counts.items():
+        sgtProbs[species] = (1.0 - p0) * (rSmoothed[spCount] / smoothTot)
+
+    print(f"sgtProbs {sgtProbs}")
+    return sgtProbs, p0
+
+def __sgtZ(sortedCounts, countsOfCounts):
+    # For each count j, set Z[j] to the linear interpolation of i,j,k, where i
+    # is the greatest observed count less than i and k is the smallest observed
+    # count greater than j.
+    Z = {}
+    for (jIdx, j) in enumerate(sortedCounts):
+        if jIdx == 0:
+            i = 0
+        else:
+            i = sortedCounts[jIdx-1]
+        if jIdx == len(sortedCounts)-1:
+            k = 2*j - i
+        else:
+            k = sortedCounts[jIdx+1]
+        Z[j] = 2*countsOfCounts[j] / float(k-i)
+    return Z
+
+def __loglinregression(rs, zs):
+    coef = linalg.lstsq(c_[log(rs), (1,)*len(rs)], log(zs))[0]
+    a, b = coef
+    print('Regression: log(z) = %f*log(r) + %f' % (a,b))
+    if a > -1.0:
+        print('Warning: slope is > -1.0')
+    return a, b
+
+
+# Related plotting functions for use in pylab
+
+def setupTexPlots():
+    """
+    Optional convenience function that configures matplotlib for TeX-based
+    output, if possible. Depends on matplotlib.
+    """
+    from matplotlib import rc
+
+    rc('text', usetex=True)
+    rc('text', dvipnghack=True) # for OSX
+    rc('font', family='serif')
+    rc('font', serif=['Computer Modern'])
+
+def plotFreqVsGoodTuring(counts, confidence=1.96, loglog=False):
+    """
+    Draws a scatterplot of the empirical frequencies of the counted species
+    versus their Simple Good Turing smoothed values, in rank order. Depends on
+    pylab and matplotlib.
+    """
+    import pylab
+    from matplotlib import rc
+
+    tot = float(sum(counts.values()))
+    freqs = dict([(species, cnt/tot) for species, cnt in counts.items()])
+    sgt, p0 = simpleGoodTuringProbs(counts, confidence)
+
+    if loglog:
+        plotFunc = pylab.loglog
+    else:
+        plotFunc = pylab.plot
+    plotFunc(sorted(freqs.values(), reverse=True), 'kD', mfc='white',
+            label="Observed")
+    plotFunc(sorted(sgt.values(), reverse=True), 'k+',
+            label="Simple Good-Turing Estimate")
+    pylab.xlim(-0.5, len(freqs)+0.5)
+    pylab.xlabel("Rank")
+    pylab.ylabel("Frequency")
+    pylab.legend(numpoints=1)
+
+
+def test():
+    i = {
+        1: 32,
+        2: 20,
+        3: 10,
+        4: 3,
+        5: 1,
+        6: 2,
+        7: 1,
+        8: 1,
+        9: 1,
+        10: 2,
+        12: 1,
+        26: 1,
+    }
+    return simpleGoodTuringProbs(i)
+
+test()
--- a/test/com/owoga/prhyme/generation/simple_good_turing_test.clj
+++ b/test/com/owoga/prhyme/generation/simple_good_turing_test.clj
@ -4,27 +4,7 @@
            [clojure.test :as t :refer [deftest is testing use-fixtures]]
            [clojure.java.io :as io]))

-(def train-corpus
-  (with-open [reader (io/reader (io/resource "dark-corpus-train.txt"))]
-    (->> (line-seq reader) doall)))

-(def test-corpus
-  (with-open [reader (io/reader (io/resource "dark-corpus-test.txt"))]
-    (->> (line-seq reader) doall)))
-
-(def test-sentence (first test-corpus))
-
-(def test-tokens
-  (sgt/pad-tokens (sgt/tokenize-line test-sentence) 1))
-
-(def train-trie
-  (sgt/lines->trie train-corpus 3))
-
-(def sgt-model
-  (sgt/simple-good-turing train-trie))
-
-(def vocab
-  (into #{} (remove #{:count} (keys train-trie))))

+;; NOTE(review): `train-trie` is referenced here, but the def of `train-trie`
+;; is among the lines removed directly above; unless it is defined
+;; elsewhere, loading this namespace fails to resolve the symbol.
 (def maps-for-sgt (sgt/maps-for-simple-good-turing train-trie))

--- a/test/com/owoga/prhyme/util/math_test.clj
+++ b/test/com/owoga/prhyme/util/math_test.clj
@ -0,0 +1,103 @@
+(ns com.owoga.prhyme.util.math-test
+  (:require [com.owoga.prhyme.util.math :as math]
+            [clojure.test :as t]))
+
+(defn approx=
+  "True when `a` and `b` differ by at most the tolerance `e`."
+  [a b e]
+  (let [delta (Math/abs (- a b))]
+    (<= delta e)))
+
+;; Following the work in Church and Gale [1991], we average with each
+;; non-zero Nr the zero Nr's that surround it: order the non-zero Nr by r, and
+;; let q, r, and t be successive indices of non-zero values. We replace Nr by
+;; Zr = Nr / (0.5 (t − q)). In other words we estimate the expected Nr by the
+;; density of Nr for large r. For small r, there is no difference, because the
+;; length of the intervals is unity. For large r, the change can make a
+;; difference of several orders of magnitude.
+(t/deftest averaging-consecutives
+  (t/testing "averaging consecutives"
+    (let [r-coll  [1  2  3 5 10]
+          nr-coll [20 10 5 1 2]
+          zr-coll (math/average-consecutives r-coll nr-coll)
+          ;; Each row is [index Nr lower upper]; the expected Zr is
+          ;; Nr / (0.5 * (upper - lower)). The last row uses 15 as the upper
+          ;; bound for the final r of 10.
+          cases [[1 10 1 3]
+                 [2 5  2 5]
+                 [3 1  3 10]
+                 [4 2  5 15]]]
+      (doseq [[idx nr lower upper] cases]
+        (t/is (approx= (nth zr-coll idx)
+                       (/ nr (* 0.5 (- upper lower)))
+                       1e-4))))))
+
+;;;; Values from LibreOffice data -> statistics -> regression
+;; 18.1311501368169
+;; 7.85117996785167
+;; 4.81179001426509
+;; 2.59672962119497
+;; 1.12444006199334
+
+(t/deftest linear-regression
+  (t/testing "The results of the linear regression model are accurate"
+    (let [r-coll  [1  2  3 5 10]
+          zr-coll [20 10 5 1 2] ;; raw counts; smoothing is not what this test checks
+          ln #(Math/log %)
+          log-r (map ln r-coll)
+          log-zr (map ln zr-coll)
+          linear-model (math/least-squares-linear-regression log-r log-zr)
+          ;; The model predicts log(Zr) from log(r); exponentiate to compare
+          ;; against the reference values.
+          predicted (map #(Math/pow Math/E (linear-model %)) log-r)
+          expected '(18.13 7.85 4.81 2.59 1.12)]
+      (t/is (every? true?
+                    (map #(approx= %1 %2 0.01) expected predicted))))))
+
+;; The below passes a sanity check in that each r* is slightly less than r.
+;; The whole deftest is discarded by the reader (#_), so it never runs.
+;; NOTE(review): the assertion compares the five estimates against [], which
+;; cannot hold for a five-element r-coll; it reads as a placeholder for the
+;; real expected values.
+#_(t/deftest turing-estimation
+  (t/testing "turing estimation - r*"
+    (let [r-coll  [1  2  3 5 10]
+          nr-coll [20 10 5 1 2]
+          zr-coll (math/average-consecutives r-coll nr-coll)
+          log-r (map #(Math/log %) r-coll)
+          log-zr (map #(Math/log %) zr-coll)
+          linear-model (math/least-squares-linear-regression log-r log-zr)]
+      (t/is (= [] (map
+                   (partial math/turing-estimate linear-model)
+                   r-coll))))))
+
+(t/deftest simple-good-turing-estimator
+  (t/testing "The simple good turing estimator switches between linear and turing"
+    (let [r-coll  [1  2  3 5 10]
+          zr-coll [20 10 5 1 2] ;; raw counts; smoothing is not what this test checks
+          ln #(Math/log %)
+          log-r (map ln r-coll)
+          log-zr (map ln zr-coll)
+          linear-model (math/least-squares-linear-regression log-r log-zr)
+          sgt-estimator (math/estimator linear-model r-coll zr-coll)
+          ;; Walk r-coll in order, handing each call the lgt? flag returned by
+          ;; the previous call and collecting the estimates.
+          sgt-estimates (loop [pending r-coll
+                               lgt? false
+                               collected []]
+                          (if-let [[x & more] (seq pending)]
+                            (let [[y next-lgt?] (sgt-estimator x lgt?)]
+                              (recur more next-lgt? (conj collected y)))
+                            collected))
+          expected '(18.13 7.85 4.81 2.59 1.12)]
+      (println zr-coll)
+      (println (map linear-model r-coll))
+      (println sgt-estimates)
+      (t/is (every? true?
+                    (map #(approx= %1 %2 0.01) expected sgt-estimates))))))