Clean up and expand examples SGT

main
Eric Ihli 4 years ago
parent 495ef6c602
commit 72e06612bf

@ -3,6 +3,7 @@
[clojure.string :as string]
[clojure.set]
[com.owoga.prhyme.nlp.core :as nlp]
[com.owoga.prhyme.generation.simple-good-turing :as sgt]
[com.owoga.prhyme.util.math :as math]))
(def re-word
@ -100,7 +101,7 @@
(defn add-to-trie-1
[trie n tokens]
(let [pad-n (dec n)
(let [pad-n n
tokens (concat (repeat pad-n :bol) tokens (repeat pad-n :eol))
partitions (partition n 1 tokens)]
(reduce
@ -145,18 +146,111 @@
i
(recur (inc i) (+ (slices i) sum))))))
(defn depth-of-map
  "Returns how many levels of nested maps lie below the top level of `m`,
  following only the first map-valued entry at each level.
  A map whose values contain no maps yields -1; one nested level yields 0."
  [m]
  (loop [depth 0
         node m]
    (if-let [child (some #(when (map? %) %) (vals node))]
      (recur (inc depth) child)
      (dec depth))))
(defn completions [trie probs words]
(let [n (inc (count words))
(let [n (apply min (concat (keys probs) [(depth-of-map trie) (inc (count words))]))
possibilities (->> (get-in trie words)
(filter #(string? (first %)))
(filter #(or (string? (first %))
(#{:eol :bol} (first %))))
(map (fn [[k v]]
[k (get-in probs [n (:count v)])]))
(into {}))
sum-probs (apply + (vals possibilities))
sum-probs (apply + (or (vals possibilities) '()))
possibilities (into {} (map (fn [[k v]] [k (/ v sum-probs)]) possibilities))]
possibilities))
(defn backoff-completions
  "Completions for the context `words`, backing off when necessary.
  When `completions` finds nothing for the full context, drops the oldest
  context word and retries; returns '() once the context is exhausted.
  Uses `recur` (the self-call is in tail position) so a long context can't
  grow the stack — Clojure does not optimize plain self-calls."
  [trie probs words]
  (if (empty? words)
    '()
    (let [c (completions trie probs words)]
      (if (empty? c)
        (recur trie probs (rest words))
        c))))
(defn generate-lines
  "Generates a line of tokens from `trie` using Simple Good-Turing smoothed
  probabilities for n-gram orders 1..n.
  Returns [words freqs]: the generated tokens (starting with :bol) and the
  smoothed probability associated with each chosen token.
  Generation stops at :eol or when no completions remain even after backoff."
  [trie n]
  ;; Build a map of ngram-order -> {raw-count -> smoothed-probability}.
  ;; math/sgt returns [rs probs] (see its destructuring below); each pair of
  ;; those sequences is zipped into a lookup map.
  (let [probs (->> (range 1 (inc n))
                   (map #(vector % (filter-trie-to-ngrams trie %)))
                   ;; Keep just the node values (which carry :count).
                   (map (fn [[n v]] [n (map #(second %) v)]))
                   ;; Frequency-of-frequencies, sorted by count.
                   (map (fn [[n v]] [n (into (sorted-map) (frequencies v))]))
                   ;; Simple Good-Turing smoothing over (counts, count-freqs).
                   (map (fn [[n v]] [n (math/sgt (keys v) (vals v))]))
                   (map (fn [[n [rs probs]]]
                          [n (into {} (map vector rs probs))]))
                   (into {}))]
    (loop [words [:bol]
           freqs []]
      (if (= :eol (last words))
        [words freqs]
        (let [cs (backoff-completions trie probs words)]
          (if (empty? cs)
            [words freqs]
            ;; cs is a map of token -> probability. weighted-selection with
            ;; `second` presumably picks a [token prob] entry with likelihood
            ;; proportional to prob — TODO confirm against math/weighted-selection.
            (let [word (->> (reverse (sort-by second cs))
                            (math/weighted-selection second))]
              (recur
               (conj words (first word))
               (conj freqs (second word))))))))))
(comment
(def trie
(let [documents (->> "dark-corpus"
io/file
file-seq
(remove #(.isDirectory %)))]
(->> documents
(map slurp)
(mapcat #(string/split % #"\n"))
(map tokenize-line)
(filter #(> (count %) 1))
(take 5000)
(reduce
(fn [acc tokens]
(-> (add-to-trie-1 acc 1 tokens)
(add-to-trie-1 2 tokens)
(add-to-trie-1 3 tokens)))
{}))))
(count trie)
(let [documents (->> "dark-corpus"
io/file
file-seq
(remove #(.isDirectory %))
(drop 500)
(take 50000))
t (->> documents
(map slurp)
(mapcat #(string/split % #"\n"))
(map tokenize-line)
(filter #(> (count %) 1)))
trie (->> documents
(map slurp)
(mapcat #(string/split % #"\n"))
(map tokenize-line)
(filter #(> (count %) 1))
(take 5000)
(reduce
(fn [acc tokens]
(-> (add-to-trie-1 acc 1 tokens)
(add-to-trie-1 2 tokens)
(add-to-trie-1 3 tokens)))
{}))
probs (->> (range 1 4)
(map #(vector % (filter-trie-to-ngrams trie %)))
(map (fn [[n v]] [n (map #(second %) v)]))
(map (fn [[n v]] [n (into (sorted-map) (frequencies v))]))
(map (fn [[n v]] [n (math/sgt (keys v) (vals v))]))
(map (fn [[n [rs probs]]]
[n (into {} (map vector rs probs))]))
(into {}))]
(sgt/stupid-backoff trie probs [:bol "you" "must" "not"])
(count t))
;; Turning corpus into a trie.
(let [documents (->> "dark-corpus"
io/file
@ -169,7 +263,7 @@
(mapcat #(string/split % #"\n"))
(map tokenize-line)
(filter #(> (count %) 1))
(take 500)
(take 5000)
(reduce
(fn [acc tokens]
(-> (add-to-trie-1 acc 1 tokens)
@ -183,8 +277,14 @@
(map (fn [[n v]] [n (math/sgt (keys v) (vals v))]))
(map (fn [[n [rs probs]]]
[n (into {} (map vector rs probs))]))
(into {}))]
(reverse (sort-by second (completions trie probs [:bol "you"]))))
(into {}))
poss (->> (get-in trie ["the" "dungeons"])
(filter #(or (string? (first %))
(#{:eol :bol} (first %))))
(map (fn [[k v]]
[k (get-in probs [3 (:count v)])]))
(into {}))]
poss)
(into {} (map vector [1 2 3] [4 5 6]))
;;
@ -308,8 +408,6 @@
;; Inc to account for :count
(filter #(= (inc n) (count (first %))))))
(apply hash-map '([1 2] 3 [4 5] 6))
(comment
(apply hash-map (flatmap {1 {2 {3 4} 5 {6 7}} 8 {9 10}} []))
@ -439,9 +537,6 @@
(let [N (->> trie vals (map :count) (apply +))]
(/ c N)))
(->> bigram
(filter (fn [[k v]] (= 3 (v :count)))))
;; Good-Turing Smoothing
;;
;; There are 4 steps to perform the GT smoothing, which are:
@ -594,22 +689,5 @@
(defn good-turing-discount [trie c]
)
(->> bigram
(map second))
(count (into #{} (tokenize (slurp "dev/examples/sandman.txt"))))
(->> bigram
(map second)
(map #(dissoc % :count))
(map keys)
flatten
(into #{})
(clojure.set/difference (into #{} (keys bigram))))
(partition 3 1 (repeat :end) (range 6))
(let [documents (->> "dark-corpus"
io/file
file-seq
(remove #(.isDirectory %))
(take 10))]
documents)

@ -0,0 +1,248 @@
(ns com.owoga.prhyme.generation.simple-good-turing)
;; Python's NLTK is a great resource for this.
;; https://github.com/nltk/nltk/blob/2.0.4/nltk/probability.py
(defn least-squares-log-log-linear-regression
  "Fits a least-squares line to (log x, log y) and returns the smoothing
  function x -> e^(b + m*log x) — the 'Linear Good-Turing' estimator
  described on page 4 of
  https://www.csie.ntu.edu.tw/~b92b02053/print/good-turing-smoothing-without.pdf

  The Good-Turing theorem states r* = (r + 1) * E(N_{r+1}) / E(N_r), where E
  is an expectation. Substituting N for E(N) gives the plain Turing estimator,
  which becomes unreliable for large r where counts are noisy; this fitted
  line serves as a more reasonable E. Per the paper, prefer the Turing
  estimate only while it differs from the LGT estimate by more than 1.65
  standard deviations.

  Asserts the fitted slope is below -1, as the method requires."
  [xs ys]
  (let [log-xs (mapv #(Math/log %) xs)
        log-ys (mapv #(Math/log %) ys)
        n (count log-xs)
        sum-x (reduce + log-xs)
        sum-y (reduce + log-ys)
        mean-x (/ sum-x n)
        mean-y (/ sum-y n)
        dx (map #(- % mean-x) log-xs)
        dy (map #(- % mean-y) log-ys)
        ;; Ordinary least squares: m = Σ(dx·dy) / Σ(dx²), b from the means.
        m (/ (reduce + (map * dx dy))
             (reduce + (map #(* % %) dx)))
        b (/ (- sum-y (* m sum-x)) n)]
    (assert (< m -1)
            (format
             (str "See Good-Turing Without Tears"
                  " for why slope must be less than -1."
                  "\nSlope: %.2f Intersect %.2f")
             (float m)
             (float b)))
    (fn [x]
      (Math/pow Math/E (+ b (* m (Math/log x)))))))
(defn average-consecutives
  "Smooths the non-zero frequency-of-frequencies `Nrs` over gaps in the
  observed frequencies `freqs`, using Zr = 2 * Nr / (t - q), where q and t
  are the frequencies observed immediately before and after r (q is taken as
  0 before the first observation; t is extrapolated as 2r - q after the last).

  Intuition: in a typical corpus you may see 1 token 24 times and 1 token 28
  times, with nothing in between. Both frequencies occurred once, but 28 is
  really less likely than 24; averaging over the gap redistributes some of the
  observed mass onto the frequencies that happened not to occur.

  Example:
    observation  frequency  Zr
    1            32         32
    2            20         20
    3            9          6
    5            2          1
    7            1          0.5

  Where observations are consecutive, Zr equals the observed frequency.
  This quantity is known as Z in the Good-Turing literature and is used in
  place of N as soon as possible, since it is more accurate. It assumes the
  observations are independent and roughly linear or logarithmic in shape."
  [freqs Nrs]
  (let [rs (vec freqs)
        counts (vec Nrs)
        last-i (dec (count rs))]
    (mapv (fn [i]
            (let [q (if (zero? i) 0 (rs (dec i)))
                  r (rs i)
                  t (if (= i last-i)
                      (- (* 2 r) q)
                      (rs (inc i)))]
              (/ (* 2 (counts i)) (- t q))))
          (range (count rs)))))
(defn stdv-for-turing-estimate
  "Approximate standard deviation of the Turing estimate for frequency r.
  The Simple Good-Turing paper suggests using a Turing estimator
  for small values of r and switching to a Linear Good-Turing estimator
  once the differences between the two are no longer significantly different.
  Turing estimates are considered significantly different from LGT estimates
  if their difference exceeds 1.65 times the standard deviation of
  the Turing estimate.
  Per Gale & Sampson ('Good-Turing Frequency Estimation Without Tears'),
  the variance of the Turing estimate is approximately
  (r + 1)^2 * (Nr+1 / Nr^2) * (1 + Nr+1 / Nr)"
  [r Nr Nr1]
  (Math/sqrt
   (* (Math/pow (inc r) 2)
      (/ Nr1 (Math/pow Nr 2))
      ;; The last factor is (1 + Nr+1/Nr), NOT (1 + Nr+1/Nr^2); the
      ;; previous code squared Nr here too, understating the variance
      ;; and switching to the LGT estimate too early.
      (inc (/ Nr1 Nr)))))
(defn r-stars
  "Smoothed counts r* = (r + 1) * E(N_{r+1}) / E(N_r) for each r in `rs`,
  where E is an 'estimator'.

  The Turing estimator is the identity (substitute N for E(N)); the Linear
  Good-Turing estimator is the log-log regression model `lm`. The Turing
  estimate is used while it is significantly different from the smoothed
  estimate — 'significant' meaning the difference exceeds 1.65 times the
  Turing estimate's standard deviation. Once the estimates stop differing
  significantly (or nrs runs out, which would make the Turing estimate need
  an out-of-range Nr+1), every remaining r uses the LGT estimate."
  [rs nrs lm]
  (let [lgt-star (fn [r] (* (inc r) (/ (lm (inc r)) (lm r))))]
    (loop [rs rs
           nrs nrs
           lgt? false
           result []]
      (cond
        (empty? rs)
        result

        ;; Once switched to the smoothed estimator, never switch back.
        lgt?
        (recur (rest rs) (rest nrs) true
               (conj result (lgt-star (first rs))))

        :else
        (let [r (first rs)
              turing-estimate (first nrs)
              lgt-estimate (lm r)
              stdv (stdv-for-turing-estimate r turing-estimate (second nrs))
              switch? (or (> (Math/abs (- lgt-estimate turing-estimate))
                             (* 1.65 stdv))
                          ;; Out of Nr+1 values: forced onto the LGT estimate.
                          (= 1 (count nrs)))]
          (recur (rest rs) (rest nrs) switch?
                 (conj result
                       (if switch?
                         (lgt-star r)
                         (* (inc r) (/ (second nrs) (first nrs)))))))))))
(defn simple-good-turing
  "Simple Good-Turing smoothing over frequencies `rs` and their
  frequencies-of-frequencies `nrs`.
  Returns [rs' probs']: rs with 0 prepended, and the normalized smoothed
  probability for each — the head element is the probability mass reserved
  for unseen events.
  (Removed the unused `lgts` binding from the original.)"
  [rs nrs]
  (assert (and (not-empty nrs) (not-empty rs))
          "frequencies and frequency-of-frequencies can't be empty")
  (let [;; N: total number of observations.
        N (apply + (map #(apply * %) (map vector rs nrs)))
        ;; p0: mass reserved for unseen events, N1 / N.
        p0 (/ (first nrs) N)
        zrs (average-consecutives rs nrs)
        lm (least-squares-log-log-linear-regression rs zrs)
        r*s (r-stars rs zrs lm)
        ;; N*: total smoothed count, used to renormalize the seen mass.
        N* (apply + (map #(apply * %) (map vector nrs r*s)))
        probs (cons
               (float p0)
               (map #(* (- 1 p0) (/ % N*)) r*s))
        sum-probs (apply + probs)]
    [(cons 0 rs) (map #(/ % sum-probs) probs)]))
(comment
(let [rs [ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26]
nrs [32 20 10 3 1 2 1 1 1 2 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
rs [1 2 3 4 5 6 7 8 9 10 12 26]
nrs [32 20 10 3 1 2 1 1 1 2 1 1]]
(map #(apply * %) (map vector rs (simple-good-turing rs nrs)))
(simple-good-turing rs nrs))
(let [rs [1 2 3 4 5 6 7 8 9 10 12 26]
nrs [32 20 10 3 1 2 1 1 1 2 1 1]
N (apply + (map #(apply * %) (map vector rs nrs)))
p0 (/ (first nrs) N)
zrs (average-consecutives rs nrs)
lm (least-squares-log-log-linear-regression rs zrs)
lgts (map lm rs)
r*s (r-stars rs nrs lm)
N* (apply + (map #(apply * %) (map vector nrs r*s)))
probs (cons
(float p0)
(map #(* (- 1 p0) (/ % N*)) r*s))
sum-probs (apply + probs)]
(r-stars rs nrs lm))
)
(defn stupid-backoff
  "Scores the n-gram `words` (oldest token first) against `trie`.
  Uses the relative frequency count(words)/count(context) when the full
  n-gram was observed; otherwise multiplies by a fixed 0.4 penalty and backs
  off to the (n-1)-gram. Unigrams are scored via the smoothed probabilities
  in `probs`, keyed by order 1 and the token's raw count."
  [trie probs words]
  (let [seen (get-in trie (concat words [:count]) 0)]
    (cond
      ;; Base case: a single token is looked up in the smoothed unigram probs.
      (= 1 (count words))
      (get-in probs [1 (:count (trie (first words)))])

      ;; Observed n-gram: plain relative frequency against its context.
      (pos? seen)
      (/ seen (get-in trie (concat (butlast words) [:count])))

      ;; Unseen: penalize and back off to the shorter n-gram.
      :else
      (* 0.4 (stupid-backoff trie probs (butlast words))))))
(defn probability-of-sentence
  "TODO: unimplemented stub — currently ignores its arguments and returns '()."
  [trie probs sentence]
  ())

@ -67,12 +67,10 @@
(->Vose N alias prob)))))
(defn from-weights [ws]
(let [N (count ws)
tot (reduce + 0.0 ws)
dist (if (zero? tot)
(repeat N (/ 1 tot))
(map #(/ % tot) ws))]
(make-vose (vec dist))))
(let [tot (reduce + 0.0 ws)]
(assert (> tot 0) "Can't Vose RNG from 0 weights.")
(let [dist (map #(/ % tot) ws)]
(make-vose (vec dist)))))
(comment
(let [ws [1 2 1 3 3]
@ -91,10 +89,12 @@
If given a key function and a collection, uses the key function to get a
collection of weights and returns the value at the randomly selected index."
([coll]
(assert (not-empty coll) "Can't select from empty coll")
(let [rng (from-weights coll)
index (nextr rng nil)]
index))
([key-fn coll]
(assert (not-empty coll) "Can't select from empty coll")
(let [rng (from-weights (map key-fn coll))
index (nextr rng nil)
selection (nth coll index)]
@ -157,18 +157,6 @@
(fn [x]
(Math/pow Math/E (+ b (* m (Math/log x)))))))
(defn averaged-smooth
"Assumes 0 Nrs are included."
[rs Nrs]
(let [rs (concat rs [(inc (last rs))])
Nrs (concat Nrs [(+ (last Nrs) (- (last Nrs)
(last (butlast Nrs))))])]
[rs Nrs]))
(comment
(averaged-smooth [1 2 3 4] [32 10 0 2])
)
(defn average-consecutives
"Average all the non-zero frequency of observations (frequency of frequencies)
using the equation Zr = Nr / 0.5 (t - q)
@ -329,7 +317,8 @@
(float p0)
(map #(* (- 1 p0) (/ % N*)) estimations))
sum-probs (apply + probs)]
[(cons 0 rs) (map #(/ % sum-probs) probs)]))
[(cons 0 rs)
(map #(/ % sum-probs) probs)]))
(comment
(let [rs [ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26]
@ -338,7 +327,7 @@
nrs [32 20 10 3 1 2 1 1 1 2 1 1]]
(map #(apply * %) (map vector rs (sgt rs nrs)))
(sgt rs nrs))
)
(comment
(let [rs [1 2 3 4 5 6 7 8 9 10 12]

Loading…
Cancel
Save