From 72e06612bf6a91e9441f9b32373581ad1adfc580 Mon Sep 17 00:00:00 2001 From: Eric Ihli Date: Tue, 29 Dec 2020 14:42:34 -0800 Subject: [PATCH] Clean up and expand examples SGT --- dev/examples/scratch.clj | 140 +++++++--- .../prhyme/generation/simple_good_turing.clj | 248 ++++++++++++++++++ src/com/owoga/prhyme/util/math.clj | 29 +- 3 files changed, 366 insertions(+), 51 deletions(-) create mode 100644 src/com/owoga/prhyme/generation/simple_good_turing.clj diff --git a/dev/examples/scratch.clj b/dev/examples/scratch.clj index a4bd327..5b4d897 100644 --- a/dev/examples/scratch.clj +++ b/dev/examples/scratch.clj @@ -3,6 +3,7 @@ [clojure.string :as string] [clojure.set] [com.owoga.prhyme.nlp.core :as nlp] + [com.owoga.prhyme.generation.simple-good-turing :as sgt] [com.owoga.prhyme.util.math :as math])) (def re-word @@ -100,7 +101,7 @@ (defn add-to-trie-1 [trie n tokens] - (let [pad-n (dec n) + (let [pad-n n tokens (concat (repeat pad-n :bol) tokens (repeat pad-n :eol)) partitions (partition n 1 tokens)] (reduce @@ -145,18 +146,111 @@ i (recur (inc i) (+ (slices i) sum)))))) +(defn depth-of-map + [m] + (loop [d 0 + m m] + (let [child-maps (filter map? (vals m))] + (if (empty? child-maps) + (dec d) + (recur (inc d) (first child-maps)))))) + (defn completions [trie probs words] - (let [n (inc (count words)) + (let [n (apply min (concat (keys probs) [(depth-of-map trie) (inc (count words))])) possibilities (->> (get-in trie words) - (filter #(string? (first %))) + (filter #(or (string? (first %)) + (#{:eol :bol} (first %)))) (map (fn [[k v]] [k (get-in probs [n (:count v)])])) (into {})) - sum-probs (apply + (vals possibilities)) + sum-probs (apply + (or (vals possibilities) '())) possibilities (into {} (map (fn [[k v]] [k (/ v sum-probs)]) possibilities))] possibilities)) +(defn backoff-completions [trie probs words] + (if (empty? words) + '() + (let [c (completions trie probs words)] + (if (empty? 
c) + (backoff-completions trie probs (rest words)) + c)))) + +(defn generate-lines + [trie n] + (let [probs (->> (range 1 (inc n)) + (map #(vector % (filter-trie-to-ngrams trie %))) + (map (fn [[n v]] [n (map #(second %) v)])) + (map (fn [[n v]] [n (into (sorted-map) (frequencies v))])) + (map (fn [[n v]] [n (math/sgt (keys v) (vals v))])) + (map (fn [[n [rs probs]]] + [n (into {} (map vector rs probs))])) + (into {}))] + (loop [words [:bol] + freqs []] + (if (= :eol (last words)) + [words freqs] + (let [cs (backoff-completions trie probs words)] + (if (empty? cs) + [words freqs] + (let [word (->> (reverse (sort-by second cs)) + (math/weighted-selection second))] + (recur + (conj words (first word)) + (conj freqs (second word)))))))))) + (comment + (def trie + (let [documents (->> "dark-corpus" + io/file + file-seq + (remove #(.isDirectory %)))] + (->> documents + (map slurp) + (mapcat #(string/split % #"\n")) + (map tokenize-line) + (filter #(> (count %) 1)) + (take 5000) + (reduce + (fn [acc tokens] + (-> (add-to-trie-1 acc 1 tokens) + (add-to-trie-1 2 tokens) + (add-to-trie-1 3 tokens))) + {})))) + (count trie) + (let [documents (->> "dark-corpus" + io/file + file-seq + (remove #(.isDirectory %)) + (drop 500) + (take 50000)) + t (->> documents + (map slurp) + (mapcat #(string/split % #"\n")) + (map tokenize-line) + (filter #(> (count %) 1))) + trie (->> documents + (map slurp) + (mapcat #(string/split % #"\n")) + (map tokenize-line) + (filter #(> (count %) 1)) + (take 5000) + (reduce + (fn [acc tokens] + (-> (add-to-trie-1 acc 1 tokens) + (add-to-trie-1 2 tokens) + (add-to-trie-1 3 tokens))) + {})) + probs (->> (range 1 4) + (map #(vector % (filter-trie-to-ngrams trie %))) + (map (fn [[n v]] [n (map #(second %) v)])) + (map (fn [[n v]] [n (into (sorted-map) (frequencies v))])) + (map (fn [[n v]] [n (math/sgt (keys v) (vals v))])) + (map (fn [[n [rs probs]]] + [n (into {} (map vector rs probs))])) + (into {}))] + (sgt/stupid-backoff trie probs [:bol "you" "must" 
"not"]) + (count t)) + ;; Turning corpus into a trie. (let [documents (->> "dark-corpus" io/file @@ -169,7 +263,7 @@ (mapcat #(string/split % #"\n")) (map tokenize-line) (filter #(> (count %) 1)) - (take 500) + (take 5000) (reduce (fn [acc tokens] (-> (add-to-trie-1 acc 1 tokens) @@ -183,8 +277,14 @@ (map (fn [[n v]] [n (math/sgt (keys v) (vals v))])) (map (fn [[n [rs probs]]] [n (into {} (map vector rs probs))])) - (into {}))] - (reverse (sort-by second (completions trie probs [:bol "you"])))) + (into {})) + poss (->> (get-in trie ["the" "dungeons"]) + (filter #(or (string? (first %)) + (#{:eol :bol} (first %)))) + (map (fn [[k v]] + [k (get-in probs [3 (:count v)])])) + (into {}))] + poss) (into {} (map vector [1 2 3] [4 5 6])) ;; @@ -308,8 +408,6 @@ ;; Inc to account for :count (filter #(= (inc n) (count (first %)))))) -(apply hash-map '([1 2] 3 [4 5] 6)) - (comment (apply hash-map (flatmap {1 {2 {3 4} 5 {6 7}} 8 {9 10}} [])) @@ -439,9 +537,6 @@ (let [N (->> trie vals (map :count) (apply +))] (/ c N))) -(->> bigram - (filter (fn [[k v]] (= 3 (v :count))))) - ;; Good-Turing Smoothing ;; ;; There are 4 steps to perform the GT smoothing, which are: @@ -594,22 +689,5 @@ (defn good-turing-discount [trie c] ) -(->> bigram - (map second)) -(count (into #{} (tokenize (slurp "dev/examples/sandman.txt")))) -(->> bigram - (map second) - (map #(dissoc % :count)) - (map keys) - flatten - (into #{}) - (clojure.set/difference (into #{} (keys bigram)))) - -(partition 3 1 (repeat :end) (range 6)) - -(let [documents (->> "dark-corpus" - io/file - file-seq - (remove #(.isDirectory %)) - (take 10))] - documents) + + diff --git a/src/com/owoga/prhyme/generation/simple_good_turing.clj b/src/com/owoga/prhyme/generation/simple_good_turing.clj new file mode 100644 index 0000000..6ac687a --- /dev/null +++ b/src/com/owoga/prhyme/generation/simple_good_turing.clj @@ -0,0 +1,248 @@ +(ns com.owoga.prhyme.generation.simple-good-turing) + +;; Pythons NLTK is a great resource for this. 
+;; https://github.com/nltk/nltk/blob/2.0.4/nltk/probability.py + + +(defn least-squares-log-log-linear-regression + "Returns a 'Good-Turing Estimator' as defined on page 4 of + https://www.csie.ntu.edu.tw/~b92b02053/print/good-turing-smoothing-without.pdf + + A precise statement of the theorem underlying the Good-Turing method is that + r* = (r + 1) * E(Nᵣ + 1) / E(Nᵣ) + Where E(x) represents the expectation of random variable x. + + It's not unreasonable for E to be identity, simply substituting + Nᵣ for E(Nᵣ). In fact, that is known as the Turing Estimator. + + However, the larger r is, the less reasonable this substitution is, due + to how much noise there is in large values of r. + + So, this function acts as a more reasonable E. + + The paper also states that you should use the Turing estimator so long + as the Turing estimate is significantly different from the Linear Good-Turing + estimate. It defines significantly different as exceeding 1.65 times the + standard deviation of the Turing estimate." + [xs ys] + (let [xs (map #(Math/log %) xs) + ys (map #(Math/log %) ys) + n (count xs) + sum-x (apply + xs) + sum-y (apply + ys) + mean-x (/ sum-x n) + mean-y (/ sum-y n) + err-x (map #(- % mean-x) xs) + err-y (map #(- % mean-y) ys) + err-x-sqr (map #(* % %) err-x) + m (/ (apply + (map #(apply * %) (map vector err-x err-y))) + (apply + err-x-sqr)) + b (/ (- sum-y (* m sum-x)) n)] + (assert (< m -1) + (format + (str "See Good-Turing Without Tears" + " for why slope must be less than -1." + "\nSlope: %.2f Intersect %.2f") + (float m) + (float b))) + (fn [x] + (Math/pow Math/E (+ b (* m (Math/log x))))))) + +(defn average-consecutives + "Average all the non-zero frequency of observations (frequency of frequencies) + using the equation Zr = Nr / 0.5 (t - q) + where q, r, and t are consecutive observations. 
+ 
+ An intuitive reason for this is that you can't see something a fraction of a time, 
+ but seeing something a fraction of a time is a truer representation of its 
+ expectation. 
+ 
+ For example, in a typical linguistic corpus, you'll see many tokens once, many twice, 
+ fewer 3 times, fewer 4 times, etc... By the time you get up to tokens that have been 
+ seen 20 times, you might only see 1 token. Then 0 occurrences of 
+ 21, 22, and 23 tokens. Then you might once see a token 24 times. 
+ Then 0 occurrences of 25, 26, or 27 tokens, then 1 occurrence of 28 tokens. 
+ 
+ Even though frequencies of 24 and 28 have both occurred once, that doesn't mean 
+ their expected occurrence is each once. In actuality, 28 is less likely than 24. 
+ 
+ This averaging accomplishes that. 
+ 
+ It's known as Zᵣ in most papers on Good-Turing estimation. It's used in place of 
+ Nᵣ as soon as possible, since it's more accurate. 
+ 
+ Let's say you observed 
+ observation frequency Zr 
+ 1 32 32 
+ 2 20 20 
+ 3 9 6 
+ 5 2 1 
+ 7 1 0.5 
+ 
+ If observations occur consecutively, then Zr is unchanged from the observed frequency. 
+ But if there is a gap in observation occurrence, then this effectively 
+ redistributes some of the observations that we did make into some of the observations that 
+ we didn't make. 
+ 
+ For example, we saw some 3's and 5's, but no 4's. So this algorithm shaves a little off 
+ of the 3's and 5's so that if we fit a line to the frequencies the line will more accurately 
+ go through the `4` observations that we just so happened to miss. 
+ 
+ This assumes some type of distribution amongst the data where that assumption is valid, 
+ that the observations are independent of each other and are either linear or logarithmic 
+ (not polynomial... maybe another word for that is monotonic?) 
+ " + [freqs Nrs] + (let [freqs (vec freqs) + Nrs (vec Nrs)] + (loop [i 0 + result []] + (let [q (if (= i 0) 0 (nth freqs (dec i))) + Nr (nth Nrs i) + r (nth freqs i) + t (if (= (inc i) (count freqs)) + (- (* 2 r) q) + (nth freqs (inc i)))] + (cond + (= (inc i) (count freqs)) + (conj result (/ (* 2 Nr) (- t q))) + + :else + (recur + (inc i) + (conj result (/ (* 2 Nr) (- t q))))))))) + +(defn stdv-for-turing-estimate + "The Simple Good-Turing paper suggests using a Turing estimator + for small values of r and switching to a Linear Good Turing estimator + once the differences between the two are no longer significantly different. + + Turing estimate are considered significantly different from LGT estimates + if their difference exceeds 1.65 times the standard deviation of + the Turing estimate. + + The variance for the Turing estimate is approximately + (r + 1)² * Nᵣ₊₁ / N²ᵣ * (1 + Nᵣ₊₁ / N²ᵣ)" + [r Nr Nr1] + (Math/sqrt + (* (Math/pow (inc r) 2) + (/ Nr1 (Math/pow Nr 2)) + (inc (/ Nr1 (Math/pow Nr 2)))))) + +(defn r-stars + "r* = (r + 1) * E(Nᵣ₊₁) / E(Nᵣ) + Where E is an 'estimator'. + + The Turing estimator is simply the identity function, substituting Nᵧ for E(Nᵧ). + + The smoothed Linear Good-Turing estimator is a linear regression model + over the log rs log nrs inputs. + + We choose the Turing estimator when it is significantly different from the + smoothed estimator. Significantly different defined as having a standard + deviation more than 1.65 times the difference between the Turing estimator + and the Linear Good-Turing estimator." + [rs nrs lm] + (loop [rs rs + nrs nrs + lgt? false + result []] + (cond + (empty? rs) result + :else + (if-let [lgt? lgt?] + (recur + (rest rs) + (rest nrs) + lgt? + (conj + result + (* (inc (first rs)) + (/ (lm (inc (first rs))) + (lm (first rs)))))) + (let [lgt-estimate (lm (first rs)) + turing-estimate (first nrs) + stdv (stdv-for-turing-estimate + (first rs) + (first nrs) + (second nrs)) + lgt? 
(or (> (Math/abs (- lgt-estimate turing-estimate)) + (* 1.65 stdv)) + ;; Note possibility for the turing estimate to + ;; require an out-of-range Nr+1 + ;; if we get to the end of nrs and still aren't + ;; taking the linear good-turing estimate. + (= 1 (count nrs)))] + (recur + (rest rs) + (rest nrs) + lgt? + (conj + result + (if lgt? + (* (inc (first rs)) + (/ (lm (inc (first rs))) + (lm (first rs)))) + (* (inc (first rs)) + (/ (second nrs) + (first nrs))))))))))) + +(defn simple-good-turing + [rs nrs] + (assert (and (not-empty nrs) (not-empty rs)) + "frequencies and frequency-of-frequencies can't be empty") + (let [N (apply + (map #(apply * %) (map vector rs nrs))) + p0 (/ (first nrs) N) + zrs (average-consecutives rs nrs) + lm (least-squares-log-log-linear-regression rs zrs) + lgts (map lm rs) + r*s (r-stars rs zrs lm) + N* (apply + (map #(apply * %) (map vector nrs r*s))) + probs (cons + (float p0) + (map #(* (- 1 p0) (/ % N*)) r*s)) + sum-probs (apply + probs)] + [(cons 0 rs) (map #(/ % sum-probs) probs)])) + +(comment + (let [rs [ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26] + nrs [32 20 10 3 1 2 1 1 1 2 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1] + rs [1 2 3 4 5 6 7 8 9 10 12 26] + nrs [32 20 10 3 1 2 1 1 1 2 1 1]] + (map #(apply * %) (map vector rs (simple-good-turing rs nrs))) + (simple-good-turing rs nrs)) + + (let [rs [1 2 3 4 5 6 7 8 9 10 12 26] + nrs [32 20 10 3 1 2 1 1 1 2 1 1] + N (apply + (map #(apply * %) (map vector rs nrs))) + p0 (/ (first nrs) N) + zrs (average-consecutives rs nrs) + lm (least-squares-log-log-linear-regression rs zrs) + lgts (map lm rs) + r*s (r-stars rs nrs lm) + N* (apply + (map #(apply * %) (map vector nrs r*s))) + probs (cons + (float p0) + (map #(* (- 1 p0) (/ % N*)) r*s)) + sum-probs (apply + probs)] + (r-stars rs nrs lm)) + + ) + + +(defn stupid-backoff + [trie probs words] + (let [times-seen (or (get-in trie (concat words [:count])) 0)] + (cond + (= 1 (count words)) + (get-in probs [1 (:count (trie (first 
words)))]) + + (< 0 times-seen) + (/ times-seen (get-in trie (concat (butlast words) [:count]))) + + :else + (* 0.4 (stupid-backoff trie probs (butlast words)))))) + +(defn probability-of-sentence + [trie probs sentence] + ()) diff --git a/src/com/owoga/prhyme/util/math.clj b/src/com/owoga/prhyme/util/math.clj index 1fa3d2d..4605335 100644 --- a/src/com/owoga/prhyme/util/math.clj +++ b/src/com/owoga/prhyme/util/math.clj @@ -67,12 +67,10 @@ (->Vose N alias prob))))) (defn from-weights [ws] - (let [N (count ws) - tot (reduce + 0.0 ws) - dist (if (zero? tot) - (repeat N (/ 1 tot)) - (map #(/ % tot) ws))] - (make-vose (vec dist)))) + (let [tot (reduce + 0.0 ws)] + (assert (> tot 0) "Can't Vose RNG from 0 weights.") + (let [dist (map #(/ % tot) ws)] + (make-vose (vec dist))))) (comment (let [ws [1 2 1 3 3] @@ -91,10 +89,12 @@ If given a key function and a collection, uses the key function to get a collection of weights and returns the value at the randomly selected index." ([coll] + (assert (not-empty coll) "Can't select from empty coll") (let [rng (from-weights coll) index (nextr rng nil)] index)) ([key-fn coll] + (assert (not-empty coll) "Can't select from empty coll") (let [rng (from-weights (map key-fn coll)) index (nextr rng nil) selection (nth coll index)] @@ -157,18 +157,6 @@ (fn [x] (Math/pow Math/E (+ b (* m (Math/log x))))))) -(defn averaged-smooth - "Assumes 0 Nrs are included." 
- [rs Nrs] - (let [rs (concat rs [(inc (last rs))]) - Nrs (concat Nrs [(+ (last Nrs) (- (last Nrs) - (last (butlast Nrs))))])] - [rs Nrs])) -(comment - (averaged-smooth [1 2 3 4] [32 10 0 2]) - - ) - (defn average-consecutives "Average all the non-zero frequency of observations (frequency of frequencies) using the equation Zr = Nr / 0.5 (t - q) @@ -329,7 +317,8 @@ (float p0) (map #(* (- 1 p0) (/ % N*)) estimations)) sum-probs (apply + probs)] - [(cons 0 rs) (map #(/ % sum-probs) probs)])) + [(cons 0 rs) + (map #(/ % sum-probs) probs)])) (comment (let [rs [ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26] @@ -338,7 +327,7 @@ nrs [32 20 10 3 1 2 1 1 1 2 1 1]] (map #(apply * %) (map vector rs (sgt rs nrs))) (sgt rs nrs)) - + ) (comment (let [rs [1 2 3 4 5 6 7 8 9 10 12]