From d89f5da9942614df5cbc42c53dc87155ff088770 Mon Sep 17 00:00:00 2001 From: Eric Ihli Date: Thu, 1 Jul 2021 14:57:00 -0500 Subject: [PATCH] Implement simple good-turing probabilities --- src/com/owoga/prhyme/util/math.clj | 241 ++++++----------------- test/com/owoga/prhyme/util/math_test.clj | 93 +++++---- 2 files changed, 121 insertions(+), 213 deletions(-) diff --git a/src/com/owoga/prhyme/util/math.clj b/src/com/owoga/prhyme/util/math.clj index 102e305..8a82fdc 100644 --- a/src/com/owoga/prhyme/util/math.clj +++ b/src/com/owoga/prhyme/util/math.clj @@ -251,182 +251,65 @@ (/ nr1 (Math/pow nr 2)) (inc (/ nr1 nr))))) -(defn turing-estimate - "Value of r* such that pᵣ = r*/N - Alternative to MLE so that pᵣ never equals 0." - [lm r] - (* (inc r) (/ (lm (inc r)) (lm r)))) - -(defn estimator - "Switches between a Turing estimator and a Linear Good Turing estimator." - [lm rs nrs] - (fn - ([x lgt?] - (let [i (.indexOf rs x)] - (if (= (inc i) (count rs)) - [(/ (* (inc x) - (lm (inc x))) - (lm x)) - lgt?] - (let [turing-estimate (float - (/ (* (inc x) - (nth nrs (inc i))) - (nth nrs i))) - r-plus-one-squared - (Math/pow (inc x) 2) - - term2 - (/ (nth nrs (inc i)) - (Math/pow (nth nrs i) 2)) - - term3 - (inc (/ (nth nrs (inc i)) - (nth nrs i))) - - stdv (Math/sqrt (* r-plus-one-squared term2 term3)) - lgt-estimate (/ (* (inc x) - (lm (inc x))) - (lm x))] - (assert (>= i 0) (str x " not found")) - (let [diff (Math/abs (- lgt-estimate turing-estimate)) - lgt? (or lgt? - (< diff (* 1.95 stdv)))] - (if lgt? - [lgt-estimate lgt?] - [turing-estimate lgt?])))))))) - - -(defn smoothed-frequencies - [rs nrs] - (let [l (count rs) - N (apply + (map #(apply * %) (map vector rs nrs))) - p0 (/ (first nrs) N) - zrs (average-consecutives rs nrs) - log-rs (map #(Math/log %) rs) - log-zrs (map #(Math/log %) zrs) - lm (least-squares-linear-regression log-rs log-zrs) - lgts (map lm rs) - estimations (loop [coll rs - lgt? false - e (estimator lm rs zrs) - estimations []] - (cond - (empty? coll) estimations - :else - (let [[estimation lgt?] (e (first coll) lgt?)] - (recur - (rest coll) - lgt? - e - (conj estimations estimation))))) - N* (apply + (map #(apply * %) (map vector nrs estimations))) - probs (cons - (float p0) - (map #(* (- 1 p0) (/ % N*)) estimations)) - sum-probs (apply + probs)] - [zrs - lgts - estimations - probs - (apply + probs) - rs - (map - (fn [r] - (* (inc r) (/ (lm (inc r)) (lm r)))) - rs)])) - -(comment - (let [rs [ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26] - nrs [32 20 10 3 1 2 1 1 1 2 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1] - rs [1 2 3 4 5 6 7 8 9 10 12 26] - nrs [32 20 10 3 1 2 1 1 1 2 1 1]] - (smoothed-frequencies rs nrs)) - - ) - -(defn sgt [rs nrs] - (assert (and (not-empty nrs) (not-empty rs)) - "frequencies and frequency-of-frequencies can't be empty") - (let [l (count rs) - N (apply + (map #(apply * %) (map vector rs nrs))) - p0 (/ (first nrs) N) - zrs (average-consecutives rs nrs) - log-rs (map #(Math/log %) rs) - log-zrs (map #(Math/log %) zrs) - lm (least-squares-linear-regression log-rs log-zrs) - lgts (map lm rs) - r* (partial turing-estimate lm)] - [p0 rs lgts (map r* rs) (map #(/ (r* %) N) rs) N])) - -(comment - (let [rs [ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26] - nrs [32 20 10 3 1 2 1 1 1 2 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1] - rs [1 2 3 4 5 6 7 8 9 10 12 26] - nrs [32 20 10 3 1 2 1 1 1 2 1 1]] - (map #(apply * %) (map vector rs (sgt rs nrs))) - (sgt rs nrs)) - - ) -(comment - (let [rs [1 2 3 4 5 6 7 8 9 10 12] - nrs [120 40 24 13 15 5 11 2 2 1 3] - sgts (sgt rs nrs) - N0 (apply + nrs)] - [(float (/ 120 N0)) - (apply + sgts)]) - - ) - -(defn sgt-with-counts [rs nrs] - (assert (and (not-empty nrs) (not-empty rs)) - "frequencies and frequency-of-frequencies can't be empty") - (let [l (count rs) - N (apply + (map #(apply * %) (map vector rs nrs))) - p0 (/ (first nrs) N) - zrs (average-consecutives rs nrs) - log-rs (map #(Math/log %) rs) - log-zrs (map #(Math/log %) zrs) - lm (least-squares-linear-regression log-rs log-zrs) - lgts (map lm rs) - estimations (loop [coll rs - lgt? false - e (estimator lm rs zrs) - estimations []] - (cond - (empty? coll) estimations - :else - (let [[estimation lgt?] (e (first coll) lgt?)] - (recur - (rest coll) - lgt? - e - (conj estimations estimation))))) - N* (apply + (map #(apply * %) (map vector nrs estimations))) - probs (cons - (float p0) - (map #(* (- 1 p0) (/ % N*)) estimations)) - sum-probs (apply + probs)] - [(cons 0 rs) - (map #(/ % sum-probs) probs) - estimations - lgts])) - - - -(defn discount-coefficient-map - "The probability of an unseen (Nr0) n-gram is Nr1/N. - We then have to adjust the probability of Nr1 down from the maximum-likelihood - estimate of Nr1 (which was Nr1/N) to something else. - - The size of this adjustment is captured by the discount coefficient." - [frequency->frequency-of-frequency] - (let [[xs ys] ((juxt keys vals) frequency->frequency-of-frequency) - sgt (into (sorted-map) (apply map vector (sgt xs ys)))] - - (into - (sorted-map) - (map - (fn [[r nr nr*]] - [r (/ nr* nr)]) - (map vector xs ys (vals sgt)))))) - +(defn turing-estimator + [r-coll nr-coll] + (let [m (into {} (map vector r-coll nr-coll))] + (fn [r] + (let [nr (m r)] + (if nr nr (throw (format "No Nᵣ value for r value of %s" r))))))) + +(defn lgt-estimator + "The simplest smooth is a line, and a downward sloping log-log line will + satisfy the priors onr*so long asthe slope of the linebis less than -1. This + is the proposed simple smooth, and we call the associatedGood-Turing estimate + the Linear Good Turing (LGT) estimate." + [lm] + #_(fn [r] + (* (inc r) (/ (lm (inc r)) (lm r)))) + (fn [r] + (lm r))) + +(defn sgt-estimator + "Switches between turing and lgt + Simply for now at k=3. Should be based on variance in future." + [t lgt] + (fn [r] + (if (< r 3) (t r) (lgt r)))) + +(defn sgt-probs + [p0 r*-coll N*] + (cons + p0 + (map + (fn [r*] + (* (- 1 p0) + (/ r* N*))) + r*-coll))) + +(defn calc-N [r-coll nr-coll] + (apply + (map (fn [a b] (* a b)) r-coll nr-coll))) + +(defn frequencies->simple-good-turing-probabilities + "Returns a sorted map of frequencies to probability of seeing a word that + appeared with that frequency. + + Useful for calculating perplexity." + [freqs] + (let [[r-coll nr-coll] (apply map vector (sort-by first freqs)) + zr-coll (average-consecutives r-coll nr-coll) + linear-model (least-squares-linear-regression r-coll zr-coll) + turing-estimator (turing-estimator r-coll nr-coll) + lgt-estimator (lgt-estimator linear-model) + sgt-estimator (sgt-estimator turing-estimator lgt-estimator) + r*-coll (map + (fn [r] + (if (< r 2) + (* (inc r) (/ (turing-estimator (inc r)) + (turing-estimator r))) + (* (inc r) (/ (lgt-estimator (inc r)) + (lgt-estimator r))))) + r-coll) + p0 (/ (first nr-coll) (apply + (map (partial apply *) (map vector r-coll nr-coll)))) + N* (apply + (map (partial apply *) (map vector r*-coll nr-coll))) + probs (sgt-probs p0 r*-coll N*)] + (into (sorted-map) (map vector (cons 0 r-coll) probs)))) diff --git a/test/com/owoga/prhyme/util/math_test.clj b/test/com/owoga/prhyme/util/math_test.clj index 97bd60f..23e3735 100644 --- a/test/com/owoga/prhyme/util/math_test.clj +++ b/test/com/owoga/prhyme/util/math_test.clj @@ -55,47 +55,72 @@ 1.12) linear-results)))))) -;; The below passes a sanity check in that each r* is slightly less than r. -#_(t/deftest turing-estimation - (t/testing "turing estimation - r*" +;; Silly test, turing estimation just returns Nr unchanged. +(t/deftest turing-estimation + (t/testing "turing estimation" + (let [r-coll [1 2 3 5 10] + nr-coll [20 10 5 1 2] + turing-estimator (math/turing-estimator r-coll nr-coll)] + (t/is (= nr-coll + (map + turing-estimator + r-coll)))))) + +;; Hand check looks reasonable but precise assertion would come from careful hand calculation +;; or comparison to a known-good implementation. +#_(t/deftest lgt-estimation + (t/testing "linear good-turing estimation" (let [r-coll [1 2 3 5 10] nr-coll [20 10 5 1 2] zr-coll (math/average-consecutives r-coll nr-coll) - log-r (map #(Math/log %) r-coll) - log-zr (map #(Math/log %) zr-coll) - linear-model (math/least-squares-linear-regression log-r log-zr)] + _ (println zr-coll) + linear-model (math/least-squares-linear-regression r-coll zr-coll) + lgt-estimator (math/lgt-estimator linear-model)] (t/is (= [] (map - (partial math/turing-estimate linear-model) + lgt-estimator r-coll)))))) -(t/deftest simple-good-turing-estimator +(comment + (let [r-coll [1 2 3 5 10] + nr-coll [20 10 5 1 2] + zr-coll (math/average-consecutives r-coll nr-coll) + linear-model (math/least-squares-linear-regression r-coll zr-coll) + lgt-estimator (math/lgt-estimator linear-model)] + (map + lgt-estimator + r-coll)) + ;; => (23.33291663880418 + ;; 5.8271897728476425 + ;; 2.5882932698055106 + ;; 0.931074517265868 + ;; 0.23252763418987563) + ) + +;; Hardcoded the expectation received by runing ~sgt/sgt.h~ +;; // Simple Good-Turing estimation +;; // +;; // Copyright (c) David Elworthy 2004. +(t/deftest simple-good-turing-probabilities (t/testing "The simple good turing estimator switches between linear and turing" - (let [r-coll [1 2 3 5 10] - zr-coll [20 10 5 1 2] ;; not smoothed, but smoothing isn't under test - log-r (map #(Math/log %) r-coll) - log-zr (map #(Math/log %) zr-coll) - linear-model (math/least-squares-linear-regression log-r log-zr) - sgt-estimator (math/estimator linear-model r-coll zr-coll) - sgt-estimates (:r* - (reduce - (fn [{:keys [lgt? r*] :as acc} x] - (let [[y lgt?] (sgt-estimator x lgt?)] - {:lgt? lgt? - :r* (conj r* y)})) - {:lgt? false - :r* []} - r-coll))] - (println zr-coll) - (println (map linear-model r-coll)) - (println sgt-estimates) + (let [freqs {7 1, 1 32, 4 3, 6 2, 3 10, 12 1, 2 20, 9 1, 5 1, 26 1, 10 2, 8 1} + probs (math/frequencies->simple-good-turing-probabilities freqs)] (t/is (every? - (fn [[expected predicted]] - (approx= expected predicted 0.01)) + (fn [[a b]] + (approx= a b 0.0001)) (map vector - '(18.13 - 7.85 - 4.81 - 2.59 - 1.12) - sgt-estimates)))))) + (vals probs) + [0.150325 + 0.005617 + 0.006013 + 0.010137 + 0.014408 + 0.018754 + 0.023142 + 0.027557 + 0.031989 + 0.036434 + 0.040887 + 0.049813 + 0.112550])))))) +