Implement simple good-turing probabilities

main
Eric Ihli 3 years ago
parent f54a99258f
commit d89f5da994

@ -251,182 +251,65 @@
(/ nr1 (Math/pow nr 2)) (/ nr1 (Math/pow nr 2))
(inc (/ nr1 nr))))) (inc (/ nr1 nr)))))
(defn turing-estimate (defn turing-estimator
"Value of r* such that p = r*/N [r-coll nr-coll]
Alternative to MLE so that p never equals 0." (let [m (into {} (map vector r-coll nr-coll))]
[lm r]
(* (inc r) (/ (lm (inc r)) (lm r))))
(defn estimator
"Switches between a Turing estimator and a Linear Good Turing estimator."
[lm rs nrs]
(fn
([x lgt?]
(let [i (.indexOf rs x)]
(if (= (inc i) (count rs))
[(/ (* (inc x)
(lm (inc x)))
(lm x))
lgt?]
(let [turing-estimate (float
(/ (* (inc x)
(nth nrs (inc i)))
(nth nrs i)))
r-plus-one-squared
(Math/pow (inc x) 2)
term2
(/ (nth nrs (inc i))
(Math/pow (nth nrs i) 2))
term3
(inc (/ (nth nrs (inc i))
(nth nrs i)))
stdv (Math/sqrt (* r-plus-one-squared term2 term3))
lgt-estimate (/ (* (inc x)
(lm (inc x)))
(lm x))]
(assert (>= i 0) (str x " not found"))
(let [diff (Math/abs (- lgt-estimate turing-estimate))
lgt? (or lgt?
(< diff (* 1.95 stdv)))]
(if lgt?
[lgt-estimate lgt?]
[turing-estimate lgt?]))))))))
(defn smoothed-frequencies
[rs nrs]
(let [l (count rs)
N (apply + (map #(apply * %) (map vector rs nrs)))
p0 (/ (first nrs) N)
zrs (average-consecutives rs nrs)
log-rs (map #(Math/log %) rs)
log-zrs (map #(Math/log %) zrs)
lm (least-squares-linear-regression log-rs log-zrs)
lgts (map lm rs)
estimations (loop [coll rs
lgt? false
e (estimator lm rs zrs)
estimations []]
(cond
(empty? coll) estimations
:else
(let [[estimation lgt?] (e (first coll) lgt?)]
(recur
(rest coll)
lgt?
e
(conj estimations estimation)))))
N* (apply + (map #(apply * %) (map vector nrs estimations)))
probs (cons
(float p0)
(map #(* (- 1 p0) (/ % N*)) estimations))
sum-probs (apply + probs)]
[zrs
lgts
estimations
probs
(apply + probs)
rs
(map
(fn [r] (fn [r]
(let [nr (m r)]
(if nr nr (throw (format "No Nᵣ value for r value of %s" r)))))))
(defn lgt-estimator
"The simplest smooth is a line, and a downward sloping log-log line will
satisfy the priors onr*so long asthe slope of the linebis less than -1. This
is the proposed simple smooth, and we call the associatedGood-Turing estimate
the Linear Good Turing (LGT) estimate."
[lm]
#_(fn [r]
(* (inc r) (/ (lm (inc r)) (lm r)))) (* (inc r) (/ (lm (inc r)) (lm r))))
rs)])) (fn [r]
(lm r)))
(comment
(let [rs [ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26]
nrs [32 20 10 3 1 2 1 1 1 2 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
rs [1 2 3 4 5 6 7 8 9 10 12 26]
nrs [32 20 10 3 1 2 1 1 1 2 1 1]]
(smoothed-frequencies rs nrs))
)
(defn sgt [rs nrs]
(assert (and (not-empty nrs) (not-empty rs))
"frequencies and frequency-of-frequencies can't be empty")
(let [l (count rs)
N (apply + (map #(apply * %) (map vector rs nrs)))
p0 (/ (first nrs) N)
zrs (average-consecutives rs nrs)
log-rs (map #(Math/log %) rs)
log-zrs (map #(Math/log %) zrs)
lm (least-squares-linear-regression log-rs log-zrs)
lgts (map lm rs)
r* (partial turing-estimate lm)]
[p0 rs lgts (map r* rs) (map #(/ (r* %) N) rs) N]))
(comment
(let [rs [ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26]
nrs [32 20 10 3 1 2 1 1 1 2 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
rs [1 2 3 4 5 6 7 8 9 10 12 26]
nrs [32 20 10 3 1 2 1 1 1 2 1 1]]
(map #(apply * %) (map vector rs (sgt rs nrs)))
(sgt rs nrs))
)
(comment
(let [rs [1 2 3 4 5 6 7 8 9 10 12]
nrs [120 40 24 13 15 5 11 2 2 1 3]
sgts (sgt rs nrs)
N0 (apply + nrs)]
[(float (/ 120 N0))
(apply + sgts)])
) (defn sgt-estimator
"Switches between turing and lgt
Simply for now at k=3. Should be based on variance in future."
[t lgt]
(fn [r]
(if (< r 3) (t r) (lgt r))))
(defn sgt-with-counts [rs nrs] (defn sgt-probs
(assert (and (not-empty nrs) (not-empty rs)) [p0 r*-coll N*]
"frequencies and frequency-of-frequencies can't be empty") (cons
(let [l (count rs) p0
N (apply + (map #(apply * %) (map vector rs nrs)))
p0 (/ (first nrs) N)
zrs (average-consecutives rs nrs)
log-rs (map #(Math/log %) rs)
log-zrs (map #(Math/log %) zrs)
lm (least-squares-linear-regression log-rs log-zrs)
lgts (map lm rs)
estimations (loop [coll rs
lgt? false
e (estimator lm rs zrs)
estimations []]
(cond
(empty? coll) estimations
:else
(let [[estimation lgt?] (e (first coll) lgt?)]
(recur
(rest coll)
lgt?
e
(conj estimations estimation)))))
N* (apply + (map #(apply * %) (map vector nrs estimations)))
probs (cons
(float p0)
(map #(* (- 1 p0) (/ % N*)) estimations))
sum-probs (apply + probs)]
[(cons 0 rs)
(map #(/ % sum-probs) probs)
estimations
lgts]))
(defn discount-coefficient-map
"The probability of an unseen (Nr0) n-gram is Nr1/N.
We then have to adjust the probability of Nr1 down from the maximum-likelihood
estimate of Nr1 (which was Nr1/N) to something else.
The size of this adjustment is captured by the discount coefficient."
[frequency->frequency-of-frequency]
(let [[xs ys] ((juxt keys vals) frequency->frequency-of-frequency)
sgt (into (sorted-map) (apply map vector (sgt xs ys)))]
(into
(sorted-map)
(map (map
(fn [[r nr nr*]] (fn [r*]
[r (/ nr* nr)]) (* (- 1 p0)
(map vector xs ys (vals sgt)))))) (/ r* N*)))
r*-coll)))
(defn calc-N [r-coll nr-coll]
(apply + (map (fn [a b] (* a b)) r-coll nr-coll)))
(defn frequencies->simple-good-turing-probabilities
"Returns a sorted map of frequencies to probability of seeing a word that
appeared with that frequency.
Useful for calculating perplexity."
[freqs]
(let [[r-coll nr-coll] (apply map vector (sort-by first freqs))
zr-coll (average-consecutives r-coll nr-coll)
linear-model (least-squares-linear-regression r-coll zr-coll)
turing-estimator (turing-estimator r-coll nr-coll)
lgt-estimator (lgt-estimator linear-model)
sgt-estimator (sgt-estimator turing-estimator lgt-estimator)
r*-coll (map
(fn [r]
(if (< r 2)
(* (inc r) (/ (turing-estimator (inc r))
(turing-estimator r)))
(* (inc r) (/ (lgt-estimator (inc r))
(lgt-estimator r)))))
r-coll)
p0 (/ (first nr-coll) (apply + (map (partial apply *) (map vector r-coll nr-coll))))
N* (apply + (map (partial apply *) (map vector r*-coll nr-coll)))
probs (sgt-probs p0 r*-coll N*)]
(into (sorted-map) (map vector (cons 0 r-coll) probs))))

@ -55,47 +55,72 @@
1.12) 1.12)
linear-results)))))) linear-results))))))
;; The below passes a sanity check in that each r* is slightly less than r. ;; Silly test, turing estimation just returns Nr unchanged.
#_(t/deftest turing-estimation (t/deftest turing-estimation
(t/testing "turing estimation - r*" (t/testing "turing estimation"
(let [r-coll [1 2 3 5 10]
nr-coll [20 10 5 1 2]
turing-estimator (math/turing-estimator r-coll nr-coll)]
(t/is (= nr-coll
(map
turing-estimator
r-coll))))))
;; Hand check looks reasonable but precise assertion would come from careful hand calculation
;; or comparison to a known-good implementation.
#_(t/deftest lgt-estimation
(t/testing "linear good-turing estimation"
(let [r-coll [1 2 3 5 10] (let [r-coll [1 2 3 5 10]
nr-coll [20 10 5 1 2] nr-coll [20 10 5 1 2]
zr-coll (math/average-consecutives r-coll nr-coll) zr-coll (math/average-consecutives r-coll nr-coll)
log-r (map #(Math/log %) r-coll) _ (println zr-coll)
log-zr (map #(Math/log %) zr-coll) linear-model (math/least-squares-linear-regression r-coll zr-coll)
linear-model (math/least-squares-linear-regression log-r log-zr)] lgt-estimator (math/lgt-estimator linear-model)]
(t/is (= [] (map (t/is (= [] (map
(partial math/turing-estimate linear-model) lgt-estimator
r-coll)))))) r-coll))))))
(t/deftest simple-good-turing-estimator (comment
(t/testing "The simple good turing estimator switches between linear and turing"
(let [r-coll [1 2 3 5 10] (let [r-coll [1 2 3 5 10]
zr-coll [20 10 5 1 2] ;; not smoothed, but smoothing isn't under test nr-coll [20 10 5 1 2]
log-r (map #(Math/log %) r-coll) zr-coll (math/average-consecutives r-coll nr-coll)
log-zr (map #(Math/log %) zr-coll) linear-model (math/least-squares-linear-regression r-coll zr-coll)
linear-model (math/least-squares-linear-regression log-r log-zr) lgt-estimator (math/lgt-estimator linear-model)]
sgt-estimator (math/estimator linear-model r-coll zr-coll) (map
sgt-estimates (:r* lgt-estimator
(reduce r-coll))
(fn [{:keys [lgt? r*] :as acc} x] ;; => (23.33291663880418
(let [[y lgt?] (sgt-estimator x lgt?)] ;; 5.8271897728476425
{:lgt? lgt? ;; 2.5882932698055106
:r* (conj r* y)})) ;; 0.931074517265868
{:lgt? false ;; 0.23252763418987563)
:r* []} )
r-coll))]
(println zr-coll) ;; Hardcoded the expectation received by runing ~sgt/sgt.h~
(println (map linear-model r-coll)) ;; // Simple Good-Turing estimation
(println sgt-estimates) ;; //
;; // Copyright (c) David Elworthy 2004.
(t/deftest simple-good-turing-probabilities
(t/testing "The simple good turing estimator switches between linear and turing"
(let [freqs {7 1, 1 32, 4 3, 6 2, 3 10, 12 1, 2 20, 9 1, 5 1, 26 1, 10 2, 8 1}
probs (math/frequencies->simple-good-turing-probabilities freqs)]
(t/is (every? (t/is (every?
(fn [[expected predicted]] (fn [[a b]]
(approx= expected predicted 0.01)) (approx= a b 0.0001))
(map (map
vector vector
'(18.13 (vals probs)
7.85 [0.150325
4.81 0.005617
2.59 0.006013
1.12) 0.010137
sgt-estimates)))))) 0.014408
0.018754
0.023142
0.027557
0.031989
0.036434
0.040887
0.049813
0.112550]))))))

Loading…
Cancel
Save