From d89f5da9942614df5cbc42c53dc87155ff088770 Mon Sep 17 00:00:00 2001
From: Eric Ihli <eihli@owoga.com>
Date: Thu, 1 Jul 2021 14:57:00 -0500
Subject: [PATCH] Implement simple good-turing probabilities

---
 src/com/owoga/prhyme/util/math.clj       | 241 ++++++-----------------
 test/com/owoga/prhyme/util/math_test.clj |  93 +++++----
 2 files changed, 121 insertions(+), 213 deletions(-)

diff --git a/src/com/owoga/prhyme/util/math.clj b/src/com/owoga/prhyme/util/math.clj
index 102e305..8a82fdc 100644
--- a/src/com/owoga/prhyme/util/math.clj
+++ b/src/com/owoga/prhyme/util/math.clj
@@ -251,182 +251,65 @@
       (/ nr1 (Math/pow nr 2))
       (inc (/ nr1 nr)))))
 
-(defn turing-estimate
-  "Value of r* such that pᵣ = r*/N
-  Alternative to MLE so that pᵣ never equals 0."
-  [lm r]
-  (* (inc r) (/ (lm (inc r)) (lm r))))
-
-(defn estimator
-  "Switches between a Turing estimator and a Linear Good Turing estimator."
-  [lm rs nrs]
-  (fn
-    ([x lgt?]
-     (let [i (.indexOf rs x)]
-       (if (= (inc i) (count rs))
-         [(/ (* (inc x)
-                (lm (inc x)))
-             (lm x))
-          lgt?]
-         (let [turing-estimate (float
-                                (/ (* (inc x)
-                                      (nth nrs (inc i)))
-                                   (nth nrs i)))
-               r-plus-one-squared
-               (Math/pow (inc x) 2)
-
-               term2
-               (/ (nth nrs (inc i))
-                  (Math/pow (nth nrs i) 2))
-
-               term3
-               (inc (/ (nth nrs (inc i))
-                       (nth nrs i)))
-               
-               stdv (Math/sqrt (* r-plus-one-squared term2 term3))
-               lgt-estimate (/ (* (inc x)
-                                  (lm (inc x)))
-                               (lm x))]
-           (assert (>= i 0) (str x " not found"))
-           (let [diff (Math/abs (- lgt-estimate turing-estimate))
-                 lgt? (or lgt?
-                          (< diff (* 1.95 stdv)))]
-             (if lgt?
-               [lgt-estimate lgt?]
-               [turing-estimate lgt?]))))))))
-
-
-(defn smoothed-frequencies
-  [rs nrs]
-  (let [l (count rs)
-        N (apply + (map #(apply * %) (map vector rs nrs)))
-        p0 (/ (first nrs) N)
-        zrs (average-consecutives rs nrs)
-        log-rs (map #(Math/log %) rs)
-        log-zrs (map #(Math/log %) zrs)
-        lm (least-squares-linear-regression log-rs log-zrs)
-        lgts (map lm rs)
-        estimations (loop [coll rs
-                           lgt? false
-                           e (estimator lm rs zrs)
-                           estimations []]
-                      (cond
-                        (empty? coll) estimations
-                        :else
-                        (let [[estimation lgt?] (e (first coll) lgt?)]
-                          (recur
-                           (rest coll)
-                           lgt?
-                           e
-                           (conj estimations estimation)))))
-        N* (apply + (map #(apply * %) (map vector nrs estimations)))
-        probs (cons
-               (float p0)
-               (map #(* (- 1 p0) (/ % N*)) estimations))
-        sum-probs (apply + probs)]
-    [zrs
-     lgts
-     estimations
-     probs
-     (apply + probs)
-     rs
-     (map
-        (fn [r]
-          (* (inc r) (/ (lm (inc r)) (lm r))))
-        rs)]))
-
-(comment
-  (let [rs  [ 1  2  3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26]
-        nrs [32 20 10 3 1 2 1 1 1  2  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  1]
-        rs  [1 2 3 4 5 6 7 8 9 10 12 26]
-        nrs [32 20 10 3 1 2 1 1 1 2 1 1]]
-    (smoothed-frequencies rs nrs))
-
-  )
-
-(defn sgt [rs nrs]
-  (assert (and (not-empty nrs) (not-empty rs))
-          "frequencies and frequency-of-frequencies can't be empty")
-  (let [l (count rs)
-        N (apply + (map #(apply * %) (map vector rs nrs)))
-        p0 (/ (first nrs) N)
-        zrs (average-consecutives rs nrs)
-        log-rs (map #(Math/log %) rs)
-        log-zrs (map #(Math/log %) zrs)
-        lm (least-squares-linear-regression log-rs log-zrs)
-        lgts (map lm rs)
-        r* (partial turing-estimate lm)]
-    [p0 rs lgts (map r* rs) (map #(/ (r* %) N) rs) N]))
-
-(comment
-  (let [rs  [ 1  2  3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26]
-        nrs [32 20 10 3 1 2 1 1 1  2  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  1]
-        rs  [1 2 3 4 5 6 7 8 9 10 12 26]
-        nrs [32 20 10 3 1 2 1 1 1 2 1 1]]
-    (map #(apply * %) (map vector rs (sgt rs nrs)))
-    (sgt rs nrs))
- 
-  )
-(comment
-  (let [rs [1 2 3 4 5 6 7 8 9 10 12]
-        nrs [120 40 24 13 15 5 11 2 2 1 3]
-        sgts (sgt rs nrs)
-        N0 (apply + nrs)]
-    [(float (/ 120 N0))
-     (apply + sgts)])
-
-  )
-
-(defn sgt-with-counts [rs nrs]
-  (assert (and (not-empty nrs) (not-empty rs))
-          "frequencies and frequency-of-frequencies can't be empty")
-  (let [l (count rs)
-        N (apply + (map #(apply * %) (map vector rs nrs)))
-        p0 (/ (first nrs) N)
-        zrs (average-consecutives rs nrs)
-        log-rs (map #(Math/log %) rs)
-        log-zrs (map #(Math/log %) zrs)
-        lm (least-squares-linear-regression log-rs log-zrs)
-        lgts (map lm rs)
-        estimations (loop [coll rs
-                           lgt? false
-                           e (estimator lm rs zrs)
-                           estimations []]
-                      (cond
-                        (empty? coll) estimations
-                        :else
-                        (let [[estimation lgt?] (e (first coll) lgt?)]
-                          (recur
-                           (rest coll)
-                           lgt?
-                           e
-                           (conj estimations estimation)))))
-        N* (apply + (map #(apply * %) (map vector nrs estimations)))
-        probs (cons
-               (float p0)
-               (map #(* (- 1 p0) (/ % N*)) estimations))
-        sum-probs (apply + probs)]
-    [(cons 0 rs)
-     (map #(/ % sum-probs) probs)
-     estimations
-     lgts]))
-
-
-
-(defn discount-coefficient-map
-  "The probability of an unseen (Nr0) n-gram is Nr1/N.
-  We then have to adjust the probability of Nr1 down from the maximum-likelihood
-  estimate of Nr1 (which was Nr1/N) to something else.
-
-  The size of this adjustment is captured by the discount coefficient."
-  [frequency->frequency-of-frequency]
-  (let [[xs ys] ((juxt keys vals) frequency->frequency-of-frequency)
-        sgt (into (sorted-map) (apply map vector (sgt xs ys)))]
-
-    (into
-     (sorted-map)
-     (map
-      (fn [[r nr nr*]]
-        [r (/ nr* nr)])
-      (map vector xs ys (vals sgt))))))
-
+(defn turing-estimator
+  [r-coll nr-coll]
+  (let [m (into {} (map vector r-coll nr-coll))]
+    (fn [r]
+      (let [nr (m r)]
+        (if nr nr (throw (format "No Nᵣ value for r value of %s" r)))))))
+
+(defn lgt-estimator
+  "The simplest smooth is a line, and a downward sloping log-log line will
+  satisfy the priors onr*so long asthe slope of the linebis less than -1. This
+  is the proposed simple smooth, and we call the associatedGood-Turing estimate
+  the Linear Good Turing (LGT) estimate."
+  [lm]
+  #_(fn [r]
+      (* (inc r) (/ (lm (inc r)) (lm r))))
+  (fn [r]
+    (lm r)))
+
+(defn sgt-estimator
+  "Switches between turing and lgt
+  Simply for now at k=3. Should be based on variance in future."
+  [t lgt]
+  (fn [r]
+    (if (< r 3) (t r) (lgt r))))
+
+(defn sgt-probs
+  [p0 r*-coll N*]
+  (cons
+   p0
+   (map
+    (fn [r*]
+      (* (- 1 p0)
+         (/ r* N*)))
+    r*-coll)))
+
+(defn calc-N [r-coll nr-coll]
+  (apply + (map (fn [a b] (* a b)) r-coll nr-coll)))
+
+(defn frequencies->simple-good-turing-probabilities
+  "Returns a sorted map of frequencies to probability of seeing a word that
+  appeared with that frequency.
+
+  Useful for calculating perplexity."
+  [freqs]
+  (let [[r-coll nr-coll] (apply map vector (sort-by first freqs))
+        zr-coll (average-consecutives r-coll nr-coll)
+        linear-model (least-squares-linear-regression r-coll zr-coll)
+        turing-estimator (turing-estimator r-coll nr-coll)
+        lgt-estimator (lgt-estimator linear-model)
+        sgt-estimator (sgt-estimator turing-estimator lgt-estimator)
+        r*-coll (map
+                 (fn [r]
+                   (if (< r 2)
+                     (* (inc r) (/ (turing-estimator (inc r))
+                                   (turing-estimator r)))
+                     (* (inc r) (/ (lgt-estimator (inc r))
+                                   (lgt-estimator r)))))
+                 r-coll)
+        p0 (/ (first nr-coll) (apply + (map (partial apply *) (map vector r-coll nr-coll))))
+        N* (apply + (map (partial apply *) (map vector r*-coll nr-coll)))
+        probs (sgt-probs p0 r*-coll N*)]
+    (into (sorted-map) (map vector (cons 0 r-coll) probs))))
diff --git a/test/com/owoga/prhyme/util/math_test.clj b/test/com/owoga/prhyme/util/math_test.clj
index 97bd60f..23e3735 100644
--- a/test/com/owoga/prhyme/util/math_test.clj
+++ b/test/com/owoga/prhyme/util/math_test.clj
@@ -55,47 +55,72 @@
                 1.12)
               linear-results))))))
 
-;; The below passes a sanity check in that each r* is slightly less than r.
-#_(t/deftest turing-estimation
-  (t/testing "turing estimation - r*"
+;; Silly test, turing estimation just returns Nr unchanged.
+(t/deftest turing-estimation
+  (t/testing "turing estimation"
+    (let [r-coll  [1  2  3 5 10]
+          nr-coll [20 10 5 1 2]
+          turing-estimator (math/turing-estimator r-coll nr-coll)]
+      (t/is (= nr-coll
+               (map
+                turing-estimator
+                r-coll))))))
+
+;; Hand check looks reasonable but precise assertion would come from careful hand calculation
+;; or comparison to a known-good implementation.
+#_(t/deftest lgt-estimation
+  (t/testing "linear good-turing estimation"
     (let [r-coll  [1  2  3 5 10]
           nr-coll [20 10 5 1 2]
           zr-coll (math/average-consecutives r-coll nr-coll)
-          log-r (map #(Math/log %) r-coll)
-          log-zr (map #(Math/log %) zr-coll)
-          linear-model (math/least-squares-linear-regression log-r log-zr)]
+          _ (println zr-coll)
+          linear-model (math/least-squares-linear-regression r-coll zr-coll)
+          lgt-estimator (math/lgt-estimator linear-model)]
       (t/is (= [] (map
-                   (partial math/turing-estimate linear-model)
+                   lgt-estimator
                    r-coll))))))
 
-(t/deftest simple-good-turing-estimator
+(comment
+  (let [r-coll  [1  2  3 5 10]
+        nr-coll [20 10 5 1 2]
+        zr-coll (math/average-consecutives r-coll nr-coll)
+        linear-model (math/least-squares-linear-regression r-coll zr-coll)
+        lgt-estimator (math/lgt-estimator linear-model)]
+    (map
+     lgt-estimator
+     r-coll))
+  ;; => (23.33291663880418
+  ;;     5.8271897728476425
+  ;;     2.5882932698055106
+  ;;     0.931074517265868
+  ;;     0.23252763418987563)
+  )
+
+;; Hardcoded the expectation received by runing ~sgt/sgt.h~
+;; // Simple Good-Turing estimation
+;; //
+;; // Copyright (c) David Elworthy 2004.
+(t/deftest simple-good-turing-probabilities
   (t/testing "The simple good turing estimator switches between linear and turing"
-    (let [r-coll  [1  2  3 5 10]
-          zr-coll [20 10 5 1 2] ;; not smoothed, but smoothing isn't under test
-          log-r (map #(Math/log %) r-coll)
-          log-zr (map #(Math/log %) zr-coll)
-          linear-model (math/least-squares-linear-regression log-r log-zr)
-          sgt-estimator (math/estimator linear-model r-coll zr-coll)
-          sgt-estimates (:r*
-                         (reduce
-                          (fn [{:keys [lgt? r*] :as acc} x]
-                            (let [[y lgt?] (sgt-estimator x lgt?)]
-                              {:lgt? lgt?
-                               :r* (conj r* y)}))
-                          {:lgt? false
-                           :r* []}
-                          r-coll))]
-      (println zr-coll)
-      (println (map linear-model r-coll))
-      (println sgt-estimates)
+    (let [freqs {7 1, 1 32, 4 3, 6 2, 3 10, 12 1, 2 20, 9 1, 5 1, 26 1, 10 2, 8 1}
+          probs (math/frequencies->simple-good-turing-probabilities freqs)]
       (t/is (every?
-             (fn [[expected predicted]]
-               (approx= expected predicted 0.01))
+             (fn [[a b]]
+               (approx= a b 0.0001))
              (map
               vector
-              '(18.13
-                7.85
-                4.81
-                2.59
-                1.12)
-              sgt-estimates))))))
+              (vals probs)
+              [0.150325
+               0.005617
+               0.006013
+               0.010137
+               0.014408
+               0.018754
+               0.023142
+               0.027557
+               0.031989
+               0.036434
+               0.040887
+               0.049813
+               0.112550]))))))
+