Fumble around with good turing

4 years ago · 234b40a2e9
parent 6b8352ae30
commit 234b40a2e9
8 changed files with 452 additions and 49 deletions
--- a/sgt/gt.s
+++ b/sgt/gt.s
@ -0,0 +1,4 @@
    # Run the Good-Turing analysis on the frequency table named by the first
    # argument. gtanal.S reads its input from a file literally named "freqhist",
    # so the input is copied to that name first.
    # Quote the argument (and end option parsing) so paths containing spaces,
    # glob characters or a leading dash are copied intact.
    cp -- "$1" freqhist
    # Feed the analysis script to the S interpreter on standard input.
    S <gtanal.S
    # Print the result file (presumably written by gtanal.S), then clean up the
    # temporary input and output files.
    cat gtanal
    rm freqhist gtanal
--- a/sgt/gtanal.S
+++ b/sgt/gtanal.S
@ -0,0 +1,26 @@
 #-*- mode: Fundamental; -*-
 # Good-Turing analysis script: reads a frequency-of-frequencies table from
 # the file "freqhist" and computes Linear Good-Turing (LGT) and Turing
 # estimates. nrzest() and rstest() are called but not defined here; they
 # must already be defined when this script runs.
 #read in data
 # The file is scanned as plain numbers and laid out two per row:
 # column 1 becomes xr (the frequency r), column 2 becomes xnr (the number
 # of items seen with that frequency).
 xm<-matrix(scan("freqhist",0),ncol=2,byrow=T)
 xr<-xm[,1]
 xnr<-xm[,2]
 # Sum of r * n_r over all rows.
 xn<-sum(xr*xnr)
 # make averaging transform
 xnrz<-nrzest(xr, xnr)
 # get Linear Good-Turing estimate
 # Least-squares fit of log(xnrz) against log(xr); xcoef holds the fitted
 # coefficients, and rstest() uses xcoef[2].
 xf<-lsfit(log(xr), log(xnrz))
 xcoef<-xf$coef
 xrst<-rstest(xr,xcoef)
 # LGT estimate relative to the raw frequency.
 xrstrel<-xrst/xr
 # get Turing estimate
 # xrtry[i] is TRUE when the next tabulated frequency is exactly xr[i] + 1.
 # The last entry is compared against 0, so it is FALSE for any positive r.
 xrtry<-xr == c(xr[-1] - 1, 0)
 # Relative Turing estimate (r + 1) / r * n_{r+1} / n_r where xrtry holds;
 # entries stay 0 elsewhere.
 xrstarel<-rep(0, length(xr))
 xrstarel[xrtry]<-(xr[xrtry] + 1) / xr[xrtry] * c(xnr[-1], 0) [xrtry] / xnr[xrtry]
 # make switch from Turing to LGT estimates
 # tursd defaults to 1 and is overwritten only for rows where xrtry holds.
 # NOTE(review): the formula uses the row index (i + 1) rather than
 # (xr[i] + 1); the two agree only when xr[i] == i -- confirm this is intended.
 tursd<-rep(1, length(xr))
 for (i in 1:length(xr)) if (xrtry[i])
    tursd[i]<-(i+1) / xnr[i] * sqrt(xnr[i+1] * (1 + xnr[i+1] / xnr[i]))
--- a/sgt/gtfunc.S
+++ b/sgt/gtfunc.S
@ -0,0 +1,11 @@
    # Averaging transform for a frequency-of-frequencies table.
    #
    # r:  vector of frequencies.
    # nr: vector of counts, one per entry of r.
    #
    # Each count is divided by a width built from the gaps between successive
    # entries of r: the mean of the gap before and the gap after the entry.
    # The gap before the first entry is taken as 1, and the last entry uses
    # its preceding gap alone. Returns nr divided elementwise by those widths.
    nrzest <- function(r, nr)
    {
        gaps <- c(1, diff(r))
        last <- length(gaps)
        gap_before <- gaps[-last]
        gap_after <- gaps[-1]
        widths <- c(0.5 * (gap_after + gap_before), gaps[last])
        nr / widths
    }
    # Linear Good-Turing estimate r* for each frequency in r.
    #
    # r:    vector of frequencies.
    # coef: regression coefficients; only coef[2] is used.
    #
    # Returns r * (1 + 1/r)^(1 + coef[2]), elementwise over r.
    # The original text read "1 + 1r", which does not parse; the division
    # sign is restored here.
    rstest <- function(r, coef)
    {
        r * (1 + 1 / r)^(1 + coef[2])
    }
--- a/src/com/owoga/corpus/markov.clj
+++ b/src/com/owoga/corpus/markov.clj
@ -1114,16 +1114,44 @@
 ;;;; Accuracy
 (defn lookup-with-backoff
  "Looks up `lookup` in `model`, dropping the last element of the key and
  retrying whenever the lookup finds nothing.
  Returns a two-element vector:
  - when the key has become empty: `model` and the number of its children;
  - when a node is found: the node for the key minus its last element, and
    the second element of the found node's value at key []."
  [model lookup]
  (loop [ks lookup]
    ;; The trie lookup happens before the emptiness check, as in the original.
    (let [hit (trie/lookup model ks)]
      (if (empty? ks)
        [model (count (trie/children model))]
        (if hit
          [(trie/lookup model (butlast ks))
           (second (get hit []))]
          (recur (butlast ks)))))))
 (defn calc-N
  "Sums, over the children of `node`, the second element of each child's
  value at key []."
  [node]
  (->> (trie/children node)
       (map (fn [child] (second (get child []))))
       (reduce +)))
 (defn trie-frequencies
  "Returns a sorted map from each distinct value found among the children of
  `node` (the second element of a child's value at key []) to the number of
  children carrying that value."
  [node]
  (let [child-values (map (fn [child] (second (get child [])))
                          (trie/children node))
        tallies (vec (frequencies child-values))]
    (into (sorted-map) (sort-by first tallies))))
 ;; REPL scratch: times calc-N over the whole markov-tight-trie and
 ;; trie-frequencies for the node at key [107]. Forms inside (comment ...)
 ;; are never evaluated when the namespace loads, so N is not defined by
 ;; loading this file.
 (comment
  (time (def N (calc-N markov-tight-trie)))
  (time (trie-frequencies (trie/lookup markov-tight-trie [107])))
  )
 ;; NOTE(review): this span is a rendered diff that interleaves removed (-)
 ;; and added (+) lines, so it is not a single compilable form as shown.
 ;; The added lines take [parent freq] from lookup-with-backoff and return
 ;; the pair [freq parent-freq]; the removed lines returned the ratio
 ;; (/ freq parent-freq). The added default [nil N] refers to N, which in the
 ;; visible code is only def'd inside a (comment ...) form -- confirm that N
 ;; is defined at namespace level before relying on this.
 (defn mle
  [model lookup]
-  (let [node (trie/lookup model lookup)
+  (let [[parent freq] (lookup-with-backoff model lookup)
-        [_ freq] (get node [] [nil 1])
+        [_ parent-freq] (get parent [] [nil N])]
-        parent (trie/lookup model (butlast lookup))
+    [freq parent-freq]))
        [_ parent-freq] (get parent [] [nil 1])]
    (/ freq parent-freq)))
 ;; REPL scratch for mle: the example key changes from [795 68 69] (removed
 ;; line) to [9095 452 27040] (added line); also counts the trie's
 ;; top-level children.
 (comment
-  (mle markov-tight-trie [795 68 69])
+  (mle markov-tight-trie [9095 452 27040])
  (count (trie/children markov-tight-trie))
  )
 (defn perplexity
@ -1136,5 +1164,18 @@
 ;; REPL scratch: a perplexity call on a sample sentence, lookups in
 ;; `database`, and an inline version of the same pipeline that
 ;; trie-frequencies uses, applied to the node at key [315].
 (comment
  (perplexity markov-tight-trie database 3 "hi there eric how are you")
  (database "through") ;; 1924
  database
  (count database)
  (get markov-tight-trie [315 1924])
  ;; Value -> number-of-children tally for the children of node [315].
  (->>
   (map #(second (get % []))
        (trie/children (trie/lookup markov-tight-trie [315])))
   frequencies
   vec
   (sort-by first)
   (into (sorted-map)))
  )
--- a/src/com/owoga/prhyme/util/math.clj
+++ b/src/com/owoga/prhyme/util/math.clj
@ -144,7 +144,7 @@
        err-x-sqr (map #(* % %) err-x)
        m (/ (apply + (map #(apply * %) (map vector err-x err-y)))
             (apply + err-x-sqr))
-        b (/ (- sum-y (* m sum-x)) n)]
+        b (- mean-y (* m mean-x))]
    (assert (< m -1)
            (format
             (str "See Good-Turing Without Tears"
@ -249,7 +249,14 @@
      (/ nr1 (Math/pow nr 2))
      (inc (/ nr1 nr)))))
 (defn turing-estimate
  "Value of r* such that pᵣ = r*/N
  Alternative to MLE so that pᵣ never equals 0.
  Computed as (r + 1) * lm(r + 1) / lm(r), where `lm` is anything callable
  with a frequency."
  [lm r]
  (let [next-r (inc r)
        ratio (/ (lm next-r) (lm r))]
    (* next-r ratio)))
 (defn estimator
  "Switches between a Turing estimator and a Linear Good Turing estimator."
  [lm rs nrs]
  (fn
    ([x lgt?]
@ -315,7 +322,12 @@
               (float p0)
               (map #(* (- 1 p0) (/ % N*)) estimations))
        sum-probs (apply + probs)]
-    [lgts
+    [zrs
     lgts
     estimations
     probs
     (apply + probs)
     rs
     (map
        (fn [r]
          (* (inc r) (/ (lm (inc r)) (lm r))))
@ -341,27 +353,8 @@
        log-zrs (map #(Math/log %) zrs)
        lm (least-squares-linear-regression log-rs log-zrs)
        lgts (map lm rs)
-        estimations (loop [coll rs
+        r* (partial turing-estimate lm)]
-                           lgt? false
+    [p0 rs lgts (map r* rs) (map #(/ (r* %) N) rs) N]))
                           e (estimator lm rs zrs)
                           estimations []]
                      (cond
                        (empty? coll) estimations
                        :else
                        (let [[estimation lgt?] (e (first coll) lgt?)]
                          (recur
                           (rest coll)
                           lgt?
                           e
                           (conj estimations estimation)))))
        N* (apply + (map #(apply * %) (map vector nrs estimations)))
        probs (cons
               (float p0)
               (map #(* (- 1 p0) (/ % N*)) estimations))
        sum-probs (apply + probs)]
    [(cons 0 rs)
     (map #(/ % sum-probs) probs)
     estimations]))
 (comment
  (let [rs  [ 1  2  3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26]
--- a/src/com/owoga/prhyme/util/sgt.py
+++ b/src/com/owoga/prhyme/util/sgt.py
@ -0,0 +1,245 @@
 #!/usr/bin/env python3
 # Copyright 2009-2011 by Max Bane
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 """
 This module provides an implementation of Gale and Sampson's (1995/2001) "Simple
 Good Turing" algorithm. The main function is simpleGoodTuringProbs(), which
 takes a dictionary of species counts and returns the estimated population
 frequencies of the species, as estimated by the Simple Good Turing method. To
 use this module, you must have scipy and numpy installed.
 Also included is a function that uses pylab and matplotlib to draw a useful
 scatterplot for comparing the empirical frequencies against the Simple Good
 Turing estimates.
 Depends on reasonably recent versions of scipy and numpy.
 Version 0.3: June 21, 2011
    First github version.
 Version 0.2: November 12, 2009.
    Added __version__ string.
    Added check for 0 counts.
    Don't pollute namespace with "import *".
    Added loglog keyword argument to plotFreqVsGoodTuring().
 Version 0.1: November 11, 2009.
 REFERENCES:
    William Gale and Geoffrey Sampson. 1995. Good-Turing frequency estimation
    without tears. Journal of Quantitative Linguistics, vol. 2, pp. 217--37.
    See also the corrected reprint of same on Sampson's web site.
 """
 __version__ = "0.3"
 from scipy import linalg
 from numpy import c_, exp, log, inf, NaN, sqrt
 def countOfCountsTable(counts, sparse=True):
    """
    Given a dictionary mapping keys (species) to counts, returns a dictionary
    encoding the corresponding table of counts of counts, i.e., a dictionary
    that maps a count to the number of species that have that count. If
    sparse=True (default), counts with zero counts are not included in the
    returned dictionary.

    With sparse=False the result has one entry for every integer count from
    1 up to the largest observed count, with 0 for counts no species has.
    An empty `counts` yields an empty dictionary in both modes.
    """
    # Tally each observed count in a single pass instead of rescanning all
    # species once per distinct count.
    observed = {}
    for speciesCount in counts.values():
        observed[speciesCount] = observed.get(speciesCount, 0) + 1
    if sparse:
        return observed
    if not observed:
        # max() of an empty sequence would raise; there is nothing to tabulate.
        return {}
    # range() replaces the Python 2-only xrange(), which is undefined under
    # the python3 interpreter this module targets.
    return {c: observed.get(c, 0) for c in range(1, max(observed) + 1)}
 def simpleGoodTuringProbs(counts, confidenceLevel=1.96):
    """
    Given a dictionary mapping keys (species) to counts, returns a dictionary
    mapping those same species to their smoothed probabilities, according to
    Gale and Sampson's (1995/2001 reprint) "Simple Good-Turing" method of
    smoothing. The optional confidenceLevel argument should be a multiplier of
    the standard deviation of the empirical Turing estimate (default 1.96,
    corresponding to a 95% confidence interval), a parameter of the algorithm
    that controls how many datapoints are smoothed loglinearly (see Gale and
    Sampson 1995).

    Returns a pair (sgtProbs, p0): the species -> probability dictionary and
    the probability mass reserved for unseen species.
    Raises ValueError if any species has a count of 0.
    Intermediate values are printed to standard output.
    """
    # Gale and Sampson (1995/2001 reprint)
    if 0 in counts.values():
        raise ValueError('Species must not have 0 count.')
    totalCounts = float(sum(counts.values()))   # N (G&S)
    countsOfCounts = countOfCountsTable(counts) # r -> n (G&S)
    sortedCounts = sorted(countsOfCounts.keys())
    assert(totalCounts == sum([r*n for r,n in countsOfCounts.items()]))
    # Mass for unseen species is N1/N. With no singleton species N1 is 0,
    # so look the count up with a default instead of raising KeyError.
    p0 = countsOfCounts.get(1, 0) / totalCounts
    print('p0 = %f' % p0)
    Z = __sgtZ(sortedCounts, countsOfCounts)
    print(f"Z {Z}")
    # Compute a loglinear regression of Z[r] on r
    rs = list(Z.keys())
    zs = list(Z.values())
    a, b = __loglinregression(rs, zs)
    print(f'{a} {b}')
    # Gale and Sampson's (1995/2001) "simple" loglinear smoothing method.
    rSmoothed = {}
    useY = False
    for r in sortedCounts:
        # y is the loglinear smoothing
        y = float(r+1) * exp(a*log(r+1) + b) / exp(a*log(r) + b)
        # If we've already started using y as the estimate for r, then
        # continue doing so; also start doing so if no species was observed
        # with count r+1.
        if r+1 not in countsOfCounts:
            if not useY:
                print('Warning: reached unobserved count before crossing the '
                      'smoothing threshold.')
            useY = True
        if useY:
            rSmoothed[r] = y
            continue
        # x is the empirical Turing estimate for r
        x = (float(r+1) * countsOfCounts[r+1]) / countsOfCounts[r]
        Nr = float(countsOfCounts[r])
        Nr1 = float(countsOfCounts[r+1])
        # t is the width of the 95% (or whatever) confidence interval of the
        # empirical Turing estimate, assuming independence.
        t = confidenceLevel * \
            sqrt(\
                float(r+1)**2 * (Nr1 / Nr**2) \
                              * (1. + (Nr1 / Nr))\
            )
        # If the difference between x and y is more than t, then the empirical
        # Turing estimate x tends to be more accurate. Otherwise, use the
        # loglinear smoothed value y from here on.
        # The two outcomes must be exclusive: previously the y assignment ran
        # unconditionally and overwrote x, so x was never actually used.
        if abs(x - y) > t:
            rSmoothed[r] = x
        else:
            useY = True
            rSmoothed[r] = y
    # normalize and return the resulting smoothed probabilities, less the
    # estimated probability mass of unseen species.
    sgtProbs = {}
    smoothTot = 0.0
    for r, rSmooth in rSmoothed.items():
        smoothTot += countsOfCounts[r] * rSmooth
    print(f"smoothTot {smoothTot} rSmoothed {rSmoothed}")
    for species, spCount in counts.items():
        sgtProbs[species] = (1.0 - p0) * (rSmoothed[spCount] / smoothTot)
    print(f"sgtProbs {sgtProbs}")
    return sgtProbs, p0
 def __sgtZ(sortedCounts, countsOfCounts):
    # For each observed count j, Z[j] = 2 * n_j / (k - i), where i is the
    # greatest observed count less than j (0 for the first entry) and k is the
    # smallest observed count greater than j. For the last entry there is no
    # such k, so k is taken as 2*j - i.
    lastIdx = len(sortedCounts) - 1
    averaged = {}
    below = 0
    for pos, current in enumerate(sortedCounts):
        above = sortedCounts[pos + 1] if pos < lastIdx else 2*current - below
        averaged[current] = 2*countsOfCounts[current] / float(above - below)
        # The current count is the lower neighbour of the next one.
        below = current
    return averaged
 def __loglinregression(rs, zs):
    # Least-squares fit of log(z) = a*log(r) + b. The design matrix has
    # log(r) in its first column and a column of ones for the intercept.
    # Prints the fitted line, and a warning when the slope exceeds -1.0.
    # Returns the pair (a, b).
    intercept = (1,)*len(rs)
    design = c_[log(rs), intercept]
    solution = linalg.lstsq(design, log(zs))
    a, b = solution[0]
    print('Regression: log(z) = %f*log(r) + %f' % (a,b))
    if a > -1.0:
        print('Warning: slope is > -1.0')
    return a, b
 # Related plotting functions for use in pylab
 def setupTexPlots():
    """
    Optional convenience function that configures matplotlib for TeX-based
    output, if possible. Depends on matplotlib.

    Sets text.usetex, attempts to set text.dvipnghack, and selects a serif
    font family with 'Computer Modern' as the serif face.
    """
    from matplotlib import rc
    rc('text', usetex=True)
    try:
        rc('text', dvipnghack=True) # for OSX
    except KeyError:
        # rc() raises KeyError for an rcParam the installed matplotlib does
        # not recognise; the hack is optional, so carry on without it.
        pass
    rc('font', family='serif')
    rc('font', serif=['Computer Modern'])
 def plotFreqVsGoodTuring(counts, confidence=1.96, loglog=False):
    """
    Plots, in descending (rank) order, the empirical relative frequency of
    each counted species alongside its Simple Good Turing smoothed
    probability. Depends on pylab and matplotlib.

    counts:     dictionary mapping species to counts.
    confidence: passed to simpleGoodTuringProbs as its confidence multiplier.
    loglog:     when true, draw with pylab.loglog instead of pylab.plot.
    """
    import pylab
    from matplotlib import rc
    total = float(sum(counts.values()))
    observed = {species: cnt/total for species, cnt in counts.items()}
    smoothed, p0 = simpleGoodTuringProbs(counts, confidence)
    draw = pylab.loglog if loglog else pylab.plot
    observedRanked = sorted(observed.values(), reverse=True)
    smoothedRanked = sorted(smoothed.values(), reverse=True)
    draw(observedRanked, 'kD', mfc='white',
            label="Observed")
    draw(smoothedRanked, 'k+',
            label="Simple Good-Turing Estimate")
    pylab.xlim(-0.5, len(observed)+0.5)
    pylab.xlabel("Rank")
    pylab.ylabel("Frequency")
    pylab.legend(numpoints=1)
 def test():
    """
    Runs simpleGoodTuringProbs on a small fixed dictionary of twelve integer
    keys and returns its result.
    """
    keys = (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 26)
    values = (32, 20, 10, 3, 1, 2, 1, 1, 1, 2, 1, 1)
    sample = dict(zip(keys, values))
    return simpleGoodTuringProbs(sample)
 # Module-level call: the sample run executes whenever this file is loaded.
 test()
--- a/test/com/owoga/prhyme/generation/simple_good_turing_test.clj
+++ b/test/com/owoga/prhyme/generation/simple_good_turing_test.clj
@ -4,27 +4,7 @@
            [clojure.test :as t :refer [deftest is testing use-fixtures]]
            [clojure.java.io :as io]))
 ;; All lines of the training resource, fully realized before the reader closes.
 (def train-corpus
  (with-open [rdr (io/reader (io/resource "dark-corpus-train.txt"))]
    (doall (line-seq rdr))))
 ;; All lines of the test resource, fully realized before the reader closes.
 (def test-corpus
  (with-open [rdr (io/reader (io/resource "dark-corpus-test.txt"))]
    (doall (line-seq rdr))))
 ;; First line of the test corpus.
 (def test-sentence (first test-corpus))
 ;; The first test line, tokenized and then padded with an argument of 1.
 (def test-tokens
  (-> test-sentence
      sgt/tokenize-line
      (sgt/pad-tokens 1)))
 ;; Trie built from the training lines with an argument of 3.
 (def train-trie
  (sgt/lines->trie train-corpus 3))
 (def sgt-model
  (sgt/simple-good-turing train-trie))
 ;; Every top-level key of the trie except :count, as a set.
 (def vocab
  (set (remove #{:count} (keys train-trie))))
 (def maps-for-sgt (sgt/maps-for-simple-good-turing train-trie))
--- a/test/com/owoga/prhyme/util/math_test.clj
+++ b/test/com/owoga/prhyme/util/math_test.clj
@ -0,0 +1,103 @@
 (ns com.owoga.prhyme.util.math-test
  (:require [com.owoga.prhyme.util.math :as math]
            [clojure.test :as t]))
 (defn approx=
  "True when `a` and `b` differ by no more than the tolerance `e`."
  [a b e]
  (let [delta (Math/abs (- a b))]
    (>= e delta)))
 ;; Following the work in Church and Gale [1991], we average with each
 ;; non-zero Nr the zero Nr's that surround it: order the non-zero Nr by r, and
 ;; let q, r, and t be successive indices of non-zero values. We replace Nr by
 ;; Zr = Nr / (0.5 * (t − q)). In other words, we estimate the expected Nr by
 ;; the density of Nr for large r. For small r, there is no difference, because
 ;; the length of the intervals is unity. For large r, the change can make a
 ;; difference of several orders of magnitude.
 (t/deftest averaging-consecutives
  (t/testing "averaging consecutives"
    (let [r-coll  [1  2  3 5 10]
          nr-coll [20 10 5 1 2]
          zr-coll (math/average-consecutives r-coll nr-coll)
          ;; One row per checked index: [index nr lower-neighbour upper-neighbour].
          ;; The expected value is nr / (0.5 * (upper - lower)); index 0 is not
          ;; checked. The last row uses 15 as its upper bound.
          cases [[1 10 1 3]
                 [2 5 2 5]
                 [3 1 3 10]
                 [4 2 5 15]]]
      (doseq [[idx nr lower upper] cases]
        (t/is (approx= (nth zr-coll idx)
                       (/ nr (* 0.5 (- upper lower)))
                       1e-4))))))
 ;;;; Values from LibreOffice data -> statistics -> regression
 ;; 18.1311501368169
 ;; 7.85117996785167
 ;; 4.81179001426509
 ;; 2.59672962119497
 ;; 1.12444006199334
 (t/deftest linear-regression
  (t/testing "The results of the linear regression model are accurate"
    (let [r-coll  [1  2  3 5 10]
          zr-coll [20 10 5 1 2] ;; not really smoothed, but smoothing isn't under test
          ln #(Math/log %)
          log-r (map ln r-coll)
          log-zr (map ln zr-coll)
          linear-model (math/least-squares-linear-regression log-r log-zr)
          ;; The model is fitted in log space, so predictions are mapped back
          ;; with e^x before comparing them with the reference values.
          predicted (map #(Math/pow Math/E (linear-model %)) log-r)
          expected [18.13 7.85 4.81 2.59 1.12]]
      (t/is (every? true?
                    (map #(approx= %1 %2 0.01) expected predicted))))))
 ;; The below passes a sanity check in that each r* is slightly less than r.
 ;; The whole deftest is prefixed with the #_ reader macro, so the reader
 ;; discards it and it never runs.
 ;; NOTE(review): the assertion compares the five mapped estimates against an
 ;; empty vector, so it cannot pass as written; it looks like a placeholder
 ;; for printing the actual values -- confirm before re-enabling.
 #_(t/deftest turing-estimation
  (t/testing "turing estimation - r*"
    (let [r-coll  [1  2  3 5 10]
          nr-coll [20 10 5 1 2]
          zr-coll (math/average-consecutives r-coll nr-coll)
          log-r (map #(Math/log %) r-coll)
          log-zr (map #(Math/log %) zr-coll)
          linear-model (math/least-squares-linear-regression log-r log-zr)]
      (t/is (= [] (map
                   (partial math/turing-estimate linear-model)
                   r-coll))))))
 (t/deftest simple-good-turing-estimator
  (t/testing "The simple good turing estimator switches between linear and turing"
    (let [r-coll  [1  2  3 5 10]
          zr-coll [20 10 5 1 2] ;; not smoothed, but smoothing isn't under test
          ln #(Math/log %)
          log-r (map ln r-coll)
          log-zr (map ln zr-coll)
          linear-model (math/least-squares-linear-regression log-r log-zr)
          sgt-estimator (math/estimator linear-model r-coll zr-coll)
          ;; Walk r-coll in order, feeding the flag returned by each estimator
          ;; call into the next call and collecting the estimates.
          sgt-estimates (loop [remaining r-coll
                               lgt? false
                               estimates []]
                          (if-let [[x & more] (seq remaining)]
                            (let [[y next-lgt?] (sgt-estimator x lgt?)]
                              (recur more next-lgt? (conj estimates y)))
                            estimates))
          expected [18.13 7.85 4.81 2.59 1.12]]
      (println zr-coll)
      (println (map linear-model r-coll))
      (println sgt-estimates)
      (t/is (every? true?
                    (map #(approx= %1 %2 0.01) expected sgt-estimates))))))