More SGT exploration

Eric Ihli 4 years ago
parent b63b8d6cf4
commit bf6836bb69

@ -262,6 +262,8 @@
;; => 0.016222893164898698
(sgt/katz-estimator trie r*s 0 ["you're" "my" "baz"])
(get-in trie ["you're" ])
(get-in r*s [1 :N])
(sgt/katz-beta-alpha trie r*s 0 ["you're" "not"])
@ -740,15 +742,22 @@
(map float ys-avg-cons))
;; y = (r[j] + 1) * smoothed(r[j] + 1) / smoothed(r[j]);
(let [xs [1 2 3 4 5 6 7 8 9 10 12 26]
ys [32 20 10 3 1 2 1 1 1 2 1 1]
ys-avg-cons (average-consecutives xs ys)
log-xs (map #(Math/log %) xs)
log-ys (map #(Math/log %) ys-avg-cons)
lm (least-squares-linear-regression log-xs log-ys)
zs (map lm log-xs)]
;; => [32 20 10 3 1 2 1 1 1 2 1/2 1/14]
[log-ys log-xs zs (map #(Math/pow Math/E %) zs)])
(let [rs [1 2 3 4 5 6 7 8 9 10 12 26]
Nrs [32 20 10 3 1 2 1 1 1 2 1 1]
N (apply + (map #(apply * %) (map vector rs Nrs)))
P0 (float (/ (first Nrs) N))
sgt-estimator (sgt/simple-good-turing-estimator rs Nrs)
r*s (map sgt-estimator rs)
new-N (apply + (map #(apply * %) (map vector r*s Nrs)))
pr (fn [r]
(* (- 1 P0)
(/ r new-N)))
sum-pr-unnormalized (apply + (map pr r*s))
pr-normalized (map #(* (- 1 P0)
(/ (pr %) sum-pr-unnormalized))
(sgt/simple-good-turing-probability rs Nrs)
(apply + (map #(/ % N) (sgt/sgt-estimates rs Nrs))))
(Math/log 1)

Binary file not shown.

@ -192,6 +192,9 @@ template <class ObsType> class SGT
Data &d = (*j).second;
ObsType obs1 = obs + 1;
printf("%0.2f smoothed: %0.2f\n",
(double) obs,
(double) smoothed(obs, intercept, slope));
double y = obs1 * smoothed(obs1, intercept, slope)
/ smoothed(obs, intercept, slope);
@ -241,8 +244,14 @@ template <class ObsType> class SGT
printf("%f\n", rStar[i]);
for (j = data.begin(), r = 0; j != data.end(); ++j, ++r)
printf("%f %f %f\n", (float) (1 - PZero), (float) rStar[r], (float) bigNprime);
(*j).second.estimate = (1 - PZero) * rStar[r] / bigNprime;
printf("%f %f %f\n", (float) (1 - PZero), (float) rStar[r], (float) bigNprime);
for (j = data.begin(), r = 0; j != data.end(); ++j, ++r)
printf("%f\n", (*j).second.estimate);
return true;

Binary file not shown.

@ -1,11 +1,26 @@
(ns com.owoga.prhyme.generation.simple-good-turing
(:require [clojure.set]))
(:require [clojure.set]
[clojure.string :as string]
[clojure.set :as set]))
;; Python's NLTK is a great resource for this.
;; Useful to check out commit 3c8a25379 and look at nltk/model/
(def re-word
"Regex for tokenizing a string into words
(including contractions and hyphenations),
commas, periods, and newlines."
(defn tokenize-line
(->> line
(re-seq re-word)
(map second)
(map string/lower-case)))
(defn least-squares-log-log-linear-regression
"Returns a 'Good-Turing Estimator' as defined on page 4 of
@ -127,10 +142,11 @@
The variance for the Turing estimate is approximately
(r + 1)² * N_{r+1} / N_r² * (1 + N_{r+1} / N_r)"
[r Nr Nr1]
(* (Math/pow (inc r) 2)
(/ Nr1 (Math/pow Nr 2))
(inc (/ Nr1 (Math/pow Nr 2))))))
(let [Nr1 (or Nr1 0)]
(* (Math/pow (inc r) 2)
(/ Nr1 (Math/pow Nr 2))
(inc (/ Nr1 Nr))))))
(defn r-stars
"r* = (r + 1) * E(N_{r+1}) / E(N_r)
@ -146,36 +162,93 @@
deviation more than 1.65 times the difference between the Turing estimator
and the Linear Good-Turing estimator."
[rs nrs lm]
(loop [rs rs
nrs nrs
lgt? false
result []]
(empty? rs) result
(if-let [lgt? lgt?]
(rest rs)
(rest nrs)
(* (inc (first rs))
(/ (lm (inc (first rs)))
(lm (first rs))))))
(let [lgt-estimate (lm (first rs))
turing-estimate (first nrs)
stdv (stdv-for-turing-estimate
(first rs)
(first nrs)
(second nrs))
lgt? (or (> (Math/abs (- lgt-estimate turing-estimate))
(let [smoothed (fn [r]
(* (inc r)
(/ (lm (inc r))
(lm r))))
turing (fn [r N N]
(* (inc r)
(/ N N)))]
(loop [rs rs
nrs nrs
lgt? false
result []]
(empty? rs) result
(if-let [lgt? lgt?]
(rest rs)
(rest nrs)
(smoothed (first rs))))
(let [lgt-estimate (lm (first rs))
turing-estimate (first nrs)
stdv (stdv-for-turing-estimate
(first rs)
(first nrs)
(second nrs))
lgt? (or (> (Math/abs (- lgt-estimate turing-estimate))
(* 1.65 stdv))
;; Note possibility for the turing estimate to
;; require an out-of-range Nr+1
;; if we get to the end of nrs and still aren't
;; taking the linear good-turing estimate.
(= 1 (count nrs)))]
(rest rs)
(rest nrs)
(if lgt?
(smoothed (first rs))
(turing (first rs) (first nrs) (second nrs)))))))))))
(defn r*
"r* = (r + 1) * E(N_{r+1}) / E(N_r)
Where E is an 'estimator'.
The Turing estimator is simply the identity function, substituting N_r for E(N_r).
The smoothed Linear Good-Turing estimator is a linear regression model
over the log rs log nrs inputs.
We choose the Turing estimator when it is significantly different from the
smoothed estimator. Significantly different is defined as the difference
between the Turing estimate and the Linear Good-Turing estimate exceeding
1.65 times the standard deviation of the Turing estimate."
[rs nrs lm]
(let [smoothed (fn [r]
(* (inc r)
(/ (lm (inc r))
(lm r))))
turing (fn [r N N]
(* (inc r)
(/ N N)))]
(loop [rs rs
nrs nrs
lgt? false
result []]
(empty? rs) result
(let [r (first rs)
N (first nrs)
N (or (second nrs) 0)
lgt-estimate (lm r)
turing-estimate N
stdv (stdv-for-turing-estimate r N N)
lgt? (or lgt?
(> (Math/abs (- lgt-estimate turing-estimate))
(* 1.65 stdv))
;; Note possibility for the turing estimate to
;; require an out-of-range Nr+1
;; if we get to the end of nrs and still aren't
;; taking the linear good-turing estimate.
(= 1 (count nrs)))]
(nil? (second nrs)))]
(rest rs)
(rest nrs)
@ -183,12 +256,19 @@
(if lgt?
(* (inc (first rs))
(/ (lm (inc (first rs)))
(lm (first rs))))
(* (inc (first rs))
(/ (second nrs)
(first nrs)))))))))))
(smoothed (first rs))
(turing (first rs) (first nrs) (second nrs))))))))))
(defn make-r*
  "Returns a one-argument function mapping an r to its r*.
  The values computed by `(r* rs nrs lm)` are paired with `rs` in a sorted
  map. An r that is not a key of that map falls back to the smoothed
  estimate (r + 1) * lm(r + 1) / lm(r)."
  [rs nrs lm]
  (let [smoothed (fn [r]
                   (* (inc r)
                      (/ (lm (inc r))
                         (lm r))))
        r->r* (into (sorted-map)
                    (map vector rs (r* rs nrs lm)))]
    (fn [r]
      ;; The fallback is computed up front, exactly as a `get` default
      ;; argument would be, so `lm` is consulted on every call.
      (let [fallback (smoothed r)]
        (get r->r* r fallback)))))
(defn simple-good-turing
[rs nrs]
@ -243,6 +323,33 @@
[(conj prefix k) v]))
(defn add-to-trie
[trie n tokens]
(let [pad-n n
tokens (concat (repeat (max 1 (dec pad-n)) "<s>") tokens ["</s>"])
partitions (partition n 1 tokens)]
(fn [acc tokens]
(update-in acc (concat tokens [:count]) (fnil inc 0)))
(defn lines->trie
[lines n]
(->> lines
(map tokenize-line)
(filter #(> (count %) 1))
(fn [acc tokens]
(fn [acc n]
(add-to-trie acc n tokens))
(range 1 (inc n))))
(lines->trie '("hi there" "hi eric" "my name is eric") 2)
(defn filter-trie-to-ngrams [trie n]
(->> trie
@ -348,6 +455,7 @@
(if (> c k)
(P-bar trie r*s words)
(let [alpha (katz-beta-alpha trie r*s k words)]
(println "alpha" alpha)
(* alpha (P-sub-s trie r*s k (rest words)))))))
@ -503,3 +611,193 @@
(* d (/ c-num c-den)))))
(apply +))]
(defn make-r
  "Returns r, the value stored under `:count` at the node reached by
  following `n-gram` through `trie`.
  Returns 0 when the n-gram is absent from the trie, and also when its node
  exists but has no `:count` entry (that case previously returned nil)."
  [trie n-gram]
  ;; Looking up the full path including :count lets get-in's default cover
  ;; both a missing node and a node that carries no count.
  (get-in trie (conj (vec n-gram) :count) 0))
(defn make-n
  "Returns how many of the entries produced by
  `(filter-trie-to-ngrams trie (count n-gram))` have `r` as their second
  element, or nil when none do.
  NOTE(review): the second element of each entry is presumably the n-gram's
  observed count, making this N_r for n-grams of that length; confirm
  against `filter-trie-to-ngrams`."
  [trie n-gram r]
  (let [values (map second (filter-trie-to-ngrams trie (count n-gram)))]
    ;; `get` on a lazy seq always yields nil, so the values are tallied
    ;; into a value->occurrences map before looking up `r`.
    (get (frequencies values) r)))
(defn linear-good-turing-frequency-estimator
  "Fits `least-squares-log-log-linear-regression` to `rs` and the result of
  `(average-consecutives rs Nrs)`, and returns whatever that regression
  function returns."
  [rs Nrs]
  (->> (average-consecutives rs Nrs)
       (least-squares-log-log-linear-regression rs)))
(defn simple-good-turing-estimator
"r* = (r + 1) * E(N_{r+1}) / E(N_r)
Where E is an 'estimator'.
The Turing Estimator is simply the identity function, substituting N_r for E(N_r).
The Linear Good-Turing Estimator is a linear regression model
over the log rs log nrs inputs.
The Simple Good-Turing Estimator switches from the Turing Estimator to the
Linear Good-Turing Estimator whenever the difference between the two
exceeds some value deemed 'significant' (for example, 1.65 times the standard
deviation of the Turing estimate).
Returns a function that takes `r`, a frequency, and returns
the `r*`, the estimated frequency of that frequency."
[rs Nrs]
(let [r->Nr (into (sorted-map) (map vector rs Nrs))
lgt-estimator (linear-good-turing-frequency-estimator rs Nrs)
r*-fn (fn [estimator r]
(* (inc r)
(/ (estimator (inc r))
(estimator r))))
r*s (loop [rs rs
Nrs Nrs
lgt? false
result []]
(if (empty? rs)
(let [r (first rs)
lgt-estimate (r*-fn lgt-estimator r)]
(if (nil? (r->Nr (inc r)))
(rest rs)
(rest Nrs)
(let [turing-estimate (r*-fn r->Nr r)
stdv (stdv-for-turing-estimate r (r->Nr r) (r->Nr (inc r)))
lgt? (or lgt?
(> (* 1.65 stdv)
(Math/abs (- lgt-estimate turing-estimate)))
;; Note possibility for the turing estimate to
;; require an out-of-range Nr+1
;; if we get to the end of Nrs and still aren't
;; taking the linear good-turing estimate.
(nil? (second Nrs)))]
(rest rs)
(rest Nrs)
(if lgt?
r->r*-map (into (sorted-map) (map vector rs r*s))
r->r* (fn [r]
(get r->r*-map r (r*-fn lgt-estimator r)))]
(defn sgt-estimates
  "Returns a lazy sequence of r* values, one per element of `rs`, obtained by
  applying the function returned by `simple-good-turing-estimator`."
  [rs Nrs]
  (map (simple-good-turing-estimator rs Nrs) rs))
(defn normalize-estimates
"Normalizes r*s (and P0) probabilities."
[rs r*s P0]
(let [N (apply + (map #(apply * %) (map vector rs r*s)))
probability (fn [r]
(* (- 1 P0)
(/ r N)))
sum-probabilities (apply + (map probability r*s))]
(fn [r*]
(* (- 1 P0)
(/ (probability r*) sum-probabilities)))
(defn simple-good-turing-probability
"Returns a function that given an `r` returns a probability."
[rs Nrs]
(let [N (apply + (map #(apply * %) (map vector rs Nrs)))
P0 (float (/ (first Nrs) N))
r*s (sgt-estimates rs Nrs)
probs (normalize-estimates rs r*s P0)]
(map vector rs probs))))
(defn simple-good-turing-discount
(* (inc r)
(/ r-plus-one-estimated-frequency
;; Divides the discounted count of an n-gram by the sample text size.
;; Both arguments are passed straight to `/`, so integer inputs yield an
;; exact ratio and a zero integer `sample-text-size` throws.
(defn turing-probablity
[discounted-count-of-ngram sample-text-size]
(/ discounted-count-of-ngram sample-text-size))
(defn maps-for-simple-good-turing
(let [ns (range 1 (inc (depth-of-map trie)))
(fn [d]
(let [flattened (filter-trie-to-ngrams trie d)]
[d (into (sorted-map) (frequencies (map second flattened)))]))
Ns (into
(fn [n]
(apply + (map #(apply * %)
(keys (n->r->nr n))
(vals (n->r->nr n)))))])
P0s (into
(fn [n]
[n (float (/ (get-in n->r->nr [n 1]) (Ns n)))])
(fn [n]
(keys (n->r->nr n))
(vals (n->r->nr n))))])
(defn simple-good-turing
(let [ns (range 1 (inc (depth-of-map trie)))
P0s] (maps-for-simple-good-turing trie)]
(fn [vocab-set ngram]
(let [n (count ngram)
c (get-in
(concat ngram [:count])
(into #{} (remove #{:count} (keys (get-in trie ngram))))
unseen (set/difference vocab-set seen)]
[n c]
(float (/ (P0s n) (count unseen))))))))

@ -0,0 +1,33 @@
(ns com.owoga.prhyme.generation.simple-good-turing-test
(:require [com.owoga.prhyme.generation.simple-good-turing :as sgt]
[ :as dict]
[clojure.test :as t :refer [deftest is testing use-fixtures]]
[ :as io]))
;; Lines of the "dark-corpus-train.txt" resource, fully realized before the
;; reader is closed.
(def train-corpus
  (with-open [rdr (io/reader (io/resource "dark-corpus-train.txt"))]
    (doall (line-seq rdr))))
;; Lines of the "dark-corpus-test.txt" resource, fully realized before the
;; reader is closed.
(def test-corpus
  (with-open [rdr (io/reader (io/resource "dark-corpus-test.txt"))]
    (doall (line-seq rdr))))
;; Trie built from the training lines by sgt/lines->trie with n = 3.
(def train-trie
  (sgt/lines->trie train-corpus 3))

;; Result of applying sgt/simple-good-turing to the training trie.
(def sgt-model
  (sgt/simple-good-turing train-trie))

;; Top-level keys of the training trie, minus the :count key.
(def vocab
  (disj (set (keys train-trie)) :count))

;; The first four elements of what sgt/maps-for-simple-good-turing returns,
;; each bound to its own var.
(def maps-for-sgt
  (sgt/maps-for-simple-good-turing train-trie))

(def n->r->nr
  (first maps-for-sgt))

(def n->r->sgt-prob
  (second maps-for-sgt))

(def Ns
  (nth maps-for-sgt 2))

(def P0s
  (nth maps-for-sgt 3))
;; Placeholder test: the "accuracy" testing block contains no assertions yet.
(deftest simple-good-turing
(testing "accuracy"))