More SGT exploration

main
Eric Ihli 4 years ago
parent b63b8d6cf4
commit bf6836bb69

@ -262,6 +262,8 @@
;; => 0.016222893164898698 ;; => 0.016222893164898698
(sgt/katz-estimator trie r*s 0 ["you're" "my" "baz"]) (sgt/katz-estimator trie r*s 0 ["you're" "my" "baz"])
(get-in trie ["you're" ]) (get-in trie ["you're" ])
(get-in r*s [1 :N]) (get-in r*s [1 :N])
(sgt/katz-beta-alpha trie r*s 0 ["you're" "not"]) (sgt/katz-beta-alpha trie r*s 0 ["you're" "not"])
@ -740,15 +742,22 @@
(map float ys-avg-cons)) (map float ys-avg-cons))
;; y = (r[j] + 1) * smoothed(r[j] + 1) / smoothed(r[j]); ;; y = (r[j] + 1) * smoothed(r[j] + 1) / smoothed(r[j]);
(let [xs [1 2 3 4 5 6 7 8 9 10 12 26] (let [rs [1 2 3 4 5 6 7 8 9 10 12 26]
ys [32 20 10 3 1 2 1 1 1 2 1 1] Nrs [32 20 10 3 1 2 1 1 1 2 1 1]
ys-avg-cons (average-consecutives xs ys) N (apply + (map #(apply * %) (map vector rs Nrs)))
log-xs (map #(Math/log %) xs) P0 (float (/ (first Nrs) N))
log-ys (map #(Math/log %) ys-avg-cons) sgt-estimator (sgt/simple-good-turing-estimator rs Nrs)
lm (least-squares-linear-regression log-xs log-ys) r*s (map sgt-estimator rs)
zs (map lm log-xs)] new-N (apply + (map #(apply * %) (map vector r*s Nrs)))
;; => [32 20 10 3 1 2 1 1 1 2 1/2 1/14] pr (fn [r]
[log-ys log-xs zs (map #(Math/pow Math/E %) zs)]) (* (- 1 P0)
(/ r new-N)))
sum-pr-unnormalized (apply + (map pr r*s))
pr-normalized (map #(* (- 1 P0)
(/ (pr %) sum-pr-unnormalized))
r*s)]
(sgt/simple-good-turing-probability rs Nrs)
(apply + (map #(/ % N) (sgt/sgt-estimates rs Nrs))))
(Math/log 1) (Math/log 1)
) )

Binary file not shown.

@ -192,6 +192,9 @@ template <class ObsType> class SGT
Data &d = (*j).second; Data &d = (*j).second;
ObsType obs1 = obs + 1; ObsType obs1 = obs + 1;
printf("%0.2f smoothed: %0.2f\n",
(double) obs,
(double) smoothed(obs, intercept, slope));
double y = obs1 * smoothed(obs1, intercept, slope) double y = obs1 * smoothed(obs1, intercept, slope)
/ smoothed(obs, intercept, slope); / smoothed(obs, intercept, slope);
@ -241,8 +244,14 @@ template <class ObsType> class SGT
printf("%f\n", rStar[i]); printf("%f\n", rStar[i]);
for (j = data.begin(), r = 0; j != data.end(); ++j, ++r) for (j = data.begin(), r = 0; j != data.end(); ++j, ++r)
{
printf("%f %f %f\n", (float) (1 - PZero), (float) rStar[r], (float) bigNprime);
(*j).second.estimate = (1 - PZero) * rStar[r] / bigNprime; (*j).second.estimate = (1 - PZero) * rStar[r] / bigNprime;
}
printf("%f %f %f\n", (float) (1 - PZero), (float) rStar[r], (float) bigNprime);
for (j = data.begin(), r = 0; j != data.end(); ++j, ++r)
printf("%f\n", (*j).second.estimate);
return true; return true;
} }

Binary file not shown.

@ -1,11 +1,26 @@
(ns com.owoga.prhyme.generation.simple-good-turing (ns com.owoga.prhyme.generation.simple-good-turing
(:require [clojure.set])) (:require [clojure.set]
[clojure.string :as string]
[clojure.set :as set]))
;; Pythons NLTK is a great resource for this. ;; Pythons NLTK is a great resource for this.
;; https://github.com/nltk/nltk/blob/2.0.4/nltk/probability.py ;; https://github.com/nltk/nltk/blob/2.0.4/nltk/probability.py
;; ;;
;; Useful to check out commit 3c8a25379 and look at nltk/model/ngram.py ;; Useful to check out commit 3c8a25379 and look at nltk/model/ngram.py
(def re-word
"Regex for tokenizing a string into words
(including contractions and hyphenations),
commas, periods, and newlines."
#"(?s).*?([a-zA-Z\d]+(?:['\-]?[a-zA-Z]+)?|,|\.|\n)")
(defn tokenize-line
[line]
(->> line
(string/trim)
(re-seq re-word)
(map second)
(map string/lower-case)))
(defn least-squares-log-log-linear-regression (defn least-squares-log-log-linear-regression
"Returns a 'Good-Turing Estimator' as defined on page 4 of "Returns a 'Good-Turing Estimator' as defined on page 4 of
@ -127,10 +142,11 @@
The variance for the Turing estimate is approximately The variance for the Turing estimate is approximately
(r + 1)² * N / N² * (1 + N / N²)" (r + 1)² * N / N² * (1 + N / N²)"
[r Nr Nr1] [r Nr Nr1]
(Math/sqrt (let [Nr1 (or Nr1 0)]
(* (Math/pow (inc r) 2) (Math/sqrt
(/ Nr1 (Math/pow Nr 2)) (* (Math/pow (inc r) 2)
(inc (/ Nr1 (Math/pow Nr 2)))))) (/ Nr1 (Math/pow Nr 2))
(inc (/ Nr1 Nr))))))
(defn r-stars (defn r-stars
"r* = (r + 1) * E(N) / E(N) "r* = (r + 1) * E(N) / E(N)
@ -146,36 +162,93 @@
deviation more than 1.65 times the difference between the Turing estimator deviation more than 1.65 times the difference between the Turing estimator
and the Linear Good-Turing estimator." and the Linear Good-Turing estimator."
[rs nrs lm] [rs nrs lm]
(loop [rs rs (let [smoothed (fn [r]
nrs nrs (* (inc r)
lgt? false (/ (lm (inc r))
result []] (lm r))))
(cond turing (fn [r N N]
(empty? rs) result (* (inc r)
:else (/ N N)))]
(if-let [lgt? lgt?] (loop [rs rs
(recur nrs nrs
(rest rs) lgt? false
(rest nrs) result []]
lgt? (cond
(conj (empty? rs) result
result :else
(* (inc (first rs)) (if-let [lgt? lgt?]
(/ (lm (inc (first rs))) (recur
(lm (first rs)))))) (rest rs)
(let [lgt-estimate (lm (first rs)) (rest nrs)
turing-estimate (first nrs) lgt?
stdv (stdv-for-turing-estimate (conj
(first rs) result
(first nrs) (smoothed (first rs))))
(second nrs)) (let [lgt-estimate (lm (first rs))
lgt? (or (> (Math/abs (- lgt-estimate turing-estimate)) turing-estimate (first nrs)
stdv (stdv-for-turing-estimate
(first rs)
(first nrs)
(second nrs))
lgt? (or (> (Math/abs (- lgt-estimate turing-estimate))
(* 1.65 stdv))
;; Note possibility for the turing estimate to
;; require an out-of-range Nr+1
;; if we get to the end of nrs and still aren't
;; taking the linear good-turing estimate.
(= 1 (count nrs)))]
(recur
(rest rs)
(rest nrs)
lgt?
(conj
result
(if lgt?
(smoothed (first rs))
(turing (first rs) (first nrs) (second nrs)))))))))))
(defn r*
"r* = (r + 1) * E(N) / E(N)
Where E is an 'estimator'.
The Turing estimator is simply the identity function, substituting N for E(N).
The smoothed Linear Good-Turing estimator is a linear regression model
over the log rs log nrs inputs.
We choose the Turing estimator when it is significantly different from the
smoothed estimator. Significantly different defined as having a standard
deviation more than 1.65 times the difference between the Turing estimator
and the Linear Good-Turing estimator."
[rs nrs lm]
(let [smoothed (fn [r]
(* (inc r)
(/ (lm (inc r))
(lm r))))
turing (fn [r N N]
(* (inc r)
(/ N N)))]
(loop [rs rs
nrs nrs
lgt? false
result []]
(cond
(empty? rs) result
:else
(let [r (first rs)
N (first nrs)
N (or (second nrs) 0)
lgt-estimate (lm r)
turing-estimate N
stdv (stdv-for-turing-estimate r N N)
lgt? (or lgt?
(> (Math/abs (- lgt-estimate turing-estimate))
(* 1.65 stdv)) (* 1.65 stdv))
;; Note possibility for the turing estimate to ;; Note possibility for the turing estimate to
;; require an out-of-range Nr+1 ;; require an out-of-range Nr+1
;; if we get to the end of nrs and still aren't ;; if we get to the end of nrs and still aren't
;; taking the linear good-turing estimate. ;; taking the linear good-turing estimate.
(= 1 (count nrs)))] (nil? (second nrs)))]
(recur (recur
(rest rs) (rest rs)
(rest nrs) (rest nrs)
@ -183,12 +256,19 @@
(conj (conj
result result
(if lgt? (if lgt?
(* (inc (first rs)) (smoothed (first rs))
(/ (lm (inc (first rs))) (turing (first rs) (first nrs) (second nrs))))))))))
(lm (first rs))))
(* (inc (first rs)) (defn make-r*
(/ (second nrs) "Returns a function that takes an r and returns an r*."
(first nrs))))))))))) [rs nrs lm]
(let [r*s (->> (r* rs nrs lm)
(map vector rs)
(into (sorted-map)))]
(fn [r]
(get r*s r (* (inc r)
(/ (lm (inc r))
(lm r)))))))
(defn simple-good-turing (defn simple-good-turing
[rs nrs] [rs nrs]
@ -243,6 +323,33 @@
[(conj prefix k) v])) [(conj prefix k) v]))
m))) m)))
(defn add-to-trie
[trie n tokens]
(let [pad-n n
tokens (concat (repeat (max 1 (dec pad-n)) "<s>") tokens ["</s>"])
partitions (partition n 1 tokens)]
(reduce
(fn [acc tokens]
(update-in acc (concat tokens [:count]) (fnil inc 0)))
trie
partitions)))
(defn lines->trie
[lines n]
(->> lines
(map tokenize-line)
(filter #(> (count %) 1))
(reduce
(fn [acc tokens]
(reduce
(fn [acc n]
(add-to-trie acc n tokens))
acc
(range 1 (inc n))))
{})))
(lines->trie '("hi there" "hi eric" "my name is eric") 2)
(defn filter-trie-to-ngrams [trie n] (defn filter-trie-to-ngrams [trie n]
(->> trie (->> trie
(flatmap) (flatmap)
@ -348,6 +455,7 @@
(if (> c k) (if (> c k)
(P-bar trie r*s words) (P-bar trie r*s words)
(let [alpha (katz-beta-alpha trie r*s k words)] (let [alpha (katz-beta-alpha trie r*s k words)]
(println "alpha" alpha)
(* alpha (P-sub-s trie r*s k (rest words))))))) (* alpha (P-sub-s trie r*s k (rest words)))))))
@ -503,3 +611,193 @@
(* d (/ c-num c-den))))) (* d (/ c-num c-den)))))
(apply +))] (apply +))]
b)))) b))))
(defn make-r
[trie n-gram]
(:count (get-in trie n-gram {:count 0})))
(defn make-n
[trie n-gram r]
(->> trie
(#(filter-trie-to-ngrams % (count n-gram)))
(map second)
(frequencies)
(#(get % r))))
(defn linear-good-turing-frequency-estimator
[rs Nrs]
(let [averaged (average-consecutives rs Nrs)]
(least-squares-log-log-linear-regression rs averaged)))
(defn simple-good-turing-estimator
"r* = (r + 1) * E(N) / E(N)
Where E is an 'estimator'.
The Turing Estimator is simply the identity function, substituting N for E(N).
The Linear Good-Turing Estimator is a linear regression model
over the log rs log nrs inputs.
The Simple Good-Turing Estimator switches from the Turing Estimator to the
Linear Good-Turing Estimator whenever the difference between the two
exceeds some value deemed 'significant' (for example, 1.65 times the standard
deviation of the Turing estimate).
Returns a function that takes `r`, a frequency, and returns
the `r*`, the estimated frequency of that frequency."
[rs Nrs]
(let [r->Nr (into (sorted-map) (map vector rs Nrs))
lgt-estimator (linear-good-turing-frequency-estimator rs Nrs)
r*-fn (fn [estimator r]
(* (inc r)
(/ (estimator (inc r))
(estimator r))))
r*s (loop [rs rs
Nrs Nrs
lgt? false
result []]
(if (empty? rs)
result
(let [r (first rs)
lgt-estimate (r*-fn lgt-estimator r)]
(if (nil? (r->Nr (inc r)))
(recur
(rest rs)
(rest Nrs)
lgt?
(conj
result
lgt-estimate))
(let [turing-estimate (r*-fn r->Nr r)
stdv (stdv-for-turing-estimate r (r->Nr r) (r->Nr (inc r)))
lgt? (or lgt?
(> (* 1.65 stdv)
(Math/abs (- lgt-estimate turing-estimate)))
;; Note possibility for the turing estimate to
;; require an out-of-range Nr+1
;; if we get to the end of Nrs and still aren't
;; taking the linear good-turing estimate.
(nil? (second Nrs)))]
(recur
(rest rs)
(rest Nrs)
lgt?
(conj
result
(if lgt?
lgt-estimate
turing-estimate))))))))
r->r*-map (into (sorted-map) (map vector rs r*s))
r->r* (fn [r]
(get r->r*-map r (r*-fn lgt-estimator r)))]
r->r*))
(defn sgt-estimates
"Returns list of r*s using Simple Good-Turing."
[rs Nrs]
(let [sgt-estimator (simple-good-turing-estimator rs Nrs)]
(map sgt-estimator rs)))
(defn normalize-estimates
"Normalizes r*s (and P0) probabilities."
[rs r*s P0]
(let [N (apply + (map #(apply * %) (map vector rs r*s)))
probability (fn [r]
(* (- 1 P0)
(/ r N)))
sum-probabilities (apply + (map probability r*s))]
(map
(fn [r*]
(* (- 1 P0)
(/ (probability r*) sum-probabilities)))
r*s)))
(defn simple-good-turing-probability
"Returns a function that given an `r` returns a probability."
[rs Nrs]
(let [N (apply + (map #(apply * %) (map vector rs Nrs)))
P0 (float (/ (first Nrs) N))
r*s (sgt-estimates rs Nrs)
probs (normalize-estimates rs r*s P0)]
(into
(sorted-map)
(map vector rs probs))))
(defn simple-good-turing-discount
[r
r-plus-one-estimated-frequency
r-estimated-frequency]
(* (inc r)
(/ r-plus-one-estimated-frequency
r-estimated-frequency)))
(defn turing-probablity
[discounted-count-of-ngram sample-text-size]
(/ discounted-count-of-ngram sample-text-size))
(defn maps-for-simple-good-turing
[trie]
(let [ns (range 1 (inc (depth-of-map trie)))
n->r->nr
(into
{}
(map
(fn [d]
(let [flattened (filter-trie-to-ngrams trie d)]
[d (into (sorted-map) (frequencies (map second flattened)))]))
ns))
Ns (into
{}
(map
(fn [n]
[n
(apply + (map #(apply * %)
(map
vector
(keys (n->r->nr n))
(vals (n->r->nr n)))))])
ns))
P0s (into
{}
(map
(fn [n]
[n (float (/ (get-in n->r->nr [n 1]) (Ns n)))])
ns))
n->r->sgt-prob
(into
{}
(map
(fn [n]
[n
(into
(sorted-map)
(simple-good-turing-probability
(keys (n->r->nr n))
(vals (n->r->nr n))))])
ns))]
[n->r->nr
n->r->sgt-prob
Ns
P0s]))
(defn simple-good-turing
[trie]
(let [ns (range 1 (inc (depth-of-map trie)))
[n->r->nr
n->r->sgt-prob
Ns
P0s] (maps-for-simple-good-turing trie)]
(fn [vocab-set ngram]
(let [n (count ngram)
c (get-in
trie
(concat ngram [:count])
0)
seen
(into #{} (remove #{:count} (keys (get-in trie ngram))))
unseen (set/difference vocab-set seen)]
(get-in
n->r->sgt-prob
[n c]
(float (/ (P0s n) (count unseen))))))))

@ -0,0 +1,33 @@
(ns com.owoga.prhyme.generation.simple-good-turing-test
(:require [com.owoga.prhyme.generation.simple-good-turing :as sgt]
[com.owoga.prhyme.data.dictionary :as dict]
[clojure.test :as t :refer [deftest is testing use-fixtures]]
[clojure.java.io :as io]))
(def train-corpus
(with-open [reader (io/reader (io/resource "dark-corpus-train.txt"))]
(->> (line-seq reader) doall)))
(def test-corpus
(with-open [reader (io/reader (io/resource "dark-corpus-test.txt"))]
(->> (line-seq reader) doall)))
(def train-trie
(sgt/lines->trie train-corpus 3))
(def sgt-model
(sgt/simple-good-turing train-trie))
(def vocab
(into #{} (remove #{:count} (keys train-trie))))
(def maps-for-sgt (sgt/maps-for-simple-good-turing train-trie))
(def n->r->nr (first maps-for-sgt))
(def n->r->sgt-prob (second maps-for-sgt))
(def Ns (nth maps-for-sgt 2))
(def P0s (nth maps-for-sgt 3))
(deftest simple-good-turing
(testing "accuracy"))
Loading…
Cancel
Save