(ns com.owoga.prhyme.generation.simple-good-turing)

;; Python's NLTK is a great resource for this.
;; https://github.com/nltk/nltk/blob/2.0.4/nltk/probability.py

(defn least-squares-log-log-linear-regression
  "Returns a 'Good-Turing Estimator' as defined on page 4 of
  https://www.csie.ntu.edu.tw/~b92b02053/print/good-turing-smoothing-without.pdf

  A precise statement of the theorem underlying the Good-Turing method is that

  r* = (r + 1) * E(Nᵣ₊₁) / E(Nᵣ)

  Where E(x) represents the expectation of random variable x.

  It's not unreasonable for E to be the identity, simply substituting
  Nᵣ for E(Nᵣ). In fact, that is known as the Turing Estimator.

  However, the larger r is, the less reasonable this substitution is, due
  to how much noise there is in large values of r.

  So, this function acts as a more reasonable E.

  The paper also states that you should use the Turing estimator so long
  as the Turing estimate is significantly different from the Linear Good-Turing
  estimate. It defines significantly different as exceeding 1.65 times the
  standard deviation of the Turing estimate."
  [xs ys]
  (let [xs (map #(Math/log %) xs)
        ys (map #(Math/log %) ys)
        n (count xs)
        sum-x (apply + xs)
        sum-y (apply + ys)
        mean-x (/ sum-x n)
        mean-y (/ sum-y n)
        err-x (map #(- % mean-x) xs)
        err-y (map #(- % mean-y) ys)
        err-x-sqr (map #(* % %) err-x)
        m (/ (apply + (map #(apply * %) (map vector err-x err-y)))
             (apply + err-x-sqr))
        b (/ (- sum-y (* m sum-x)) n)]
    (assert (< m -1)
            (format
             (str "See Good-Turing Without Tears"
                  " for why slope must be less than -1."
                  "\nSlope: %.2f Intercept %.2f")
             (float m)
             (float b)))
    (fn [x]
      (Math/pow Math/E (+ b (* m (Math/log x)))))))
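
(comment
  ;; A small usage sketch (not from the original file): fit the log-log
  ;; regression to some made-up frequency-of-frequency data and use the
  ;; returned function as a smoothed estimate of Nᵣ. The rs/Nᵣ values here
  ;; are illustrative only; in practice you'd pass the smoothed Zᵣs.
  (let [lm (least-squares-log-log-linear-regression
            [1 2 3 4 5]        ;; rs
            [32 20 10 3 1])]   ;; Nᵣs
    ;; (lm 4) is the value of the fitted power-law curve at r = 4,
    ;; a smoothed stand-in for N₄.
    [(lm 1) (lm 4)]))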

(defn average-consecutives
  "Averages the non-zero frequencies of observations (the frequencies of
  frequencies, Nᵣ) using the equation Zᵣ = Nᵣ / (0.5 * (t - q)),
  where q, r, and t are consecutive observed frequencies (r the current one,
  q the previous, t the next).

  An intuitive reason for this is that you can't see something a fraction of a time,
  but seeing something a fraction of a time is a truer representation of its
  expectation.

  For example, in a typical linguistic corpus, you'll see many tokens once, many twice,
  fewer 3 times, fewer 4 times, etc... By the time you get up to tokens that have been
  seen 20 times, you might only see 1 token. Then 0 tokens seen 21, 22, or 23 times.
  Then you might see one token 24 times. Then 0 tokens seen 25, 26, or 27 times,
  then 1 token seen 28 times.

  Even though frequencies of 24 and 28 have both occurred once, that doesn't mean
  their expected occurrence is each once. In actuality, 28 is less likely than 24.

  This averaging accomplishes that.

  The averaged value is known as Zᵣ in most papers on Good-Turing estimation. It's
  used in place of Nᵣ as soon as possible, since it's more accurate.

  Let's say you observed
  observation (r)   frequency (Nᵣ)   Zᵣ
  1                 32               32
  2                 20               20
  3                 9                6
  5                 2                1
  7                 1                0.5

  If observed frequencies occur consecutively, then Zᵣ is unchanged from the observed
  frequency. But if there is a gap in the observed frequencies, then this effectively
  redistributes some of the observations that we did make into some of the observations
  that we didn't make.

  For example, we saw some 3's and 5's, but no 4's. So this algorithm shaves a little
  off of the 3's and 5's so that if we fit a line to the frequencies, the line will
  more accurately pass through the 4's that we just so happened to miss.

  This assumes a distribution for which that kind of smoothing is valid: the
  observations are independent of each other and the relationship is monotonic
  (roughly linear on a log-log scale) rather than polynomial."
  [freqs Nrs]
  (let [freqs (vec freqs)
        Nrs (vec Nrs)]
    (loop [i 0
           result []]
      (let [q (if (= i 0) 0 (nth freqs (dec i)))
            Nr (nth Nrs i)
            r (nth freqs i)
            t (if (= (inc i) (count freqs))
                (- (* 2 r) q)
                (nth freqs (inc i)))]
        (cond
          (= (inc i) (count freqs))
          (conj result (/ (* 2 Nr) (- t q)))

          :else
          (recur
           (inc i)
           (conj result (/ (* 2 Nr) (- t q)))))))))
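
(comment
  ;; The example from the docstring above: observed frequencies 1 2 3 5 7 with
  ;; frequency-of-frequency counts 32 20 9 2 1. The gaps at 4 and 6 pull the
  ;; neighbouring Zᵣ values down.
  (average-consecutives [1 2 3 5 7] [32 20 9 2 1])
  ;; => [32 20 6 1 1/2]
  )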

(defn stdv-for-turing-estimate
  "The Simple Good-Turing paper suggests using the Turing estimator
  for small values of r and switching to the Linear Good-Turing estimator
  once the difference between the two is no longer significant.

  A Turing estimate is considered significantly different from the LGT estimate
  if their difference exceeds 1.65 times the standard deviation of
  the Turing estimate.

  The variance of the Turing estimate is approximately
  (r + 1)² * (Nᵣ₊₁ / Nᵣ²) * (1 + Nᵣ₊₁ / Nᵣ)"
  [r Nr Nr1]
  (Math/sqrt
   (* (Math/pow (inc r) 2)
      (/ Nr1 (Math/pow Nr 2))
      (inc (/ Nr1 Nr)))))
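
(comment
  ;; A rough illustration of the switching rule described above, with made-up
  ;; numbers: r = 1, Nᵣ = 32, Nᵣ₊₁ = 20, and a hypothetical smoothed LGT
  ;; estimate of 1.1 for r*. The Turing estimate of r* is (r + 1) * Nᵣ₊₁ / Nᵣ.
  (let [r 1 nr 32 nr1 20
        turing-r* (* (inc r) (/ nr1 nr))
        lgt-r* 1.1]
    ;; true while the two estimates are "significantly different"
    (> (Math/abs (- turing-r* lgt-r*))
       (* 1.65 (stdv-for-turing-estimate r nr nr1)))))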

(defn r-stars
  "r* = (r + 1) * E(Nᵣ₊₁) / E(Nᵣ)
  Where E is an 'estimator'.

  The Turing estimator is simply the identity function, substituting Nᵣ for E(Nᵣ).

  The smoothed Linear Good-Turing estimator is a linear regression model
  over the log r / log Nᵣ (here, log Zᵣ) inputs.

  We use the Turing estimator for as long as it is significantly different from the
  smoothed estimator, then switch to the smoothed estimator for the remaining rs.
  Significantly different is defined as the difference between the two estimates
  exceeding 1.65 times the standard deviation of the Turing estimate."
  [rs nrs lm]
  (loop [rs rs
         nrs nrs
         lgt? false
         result []]
    (if (empty? rs)
      result
      (let [r (first rs)
            nr (first nrs)
            nr1 (second nrs)
            lgt-estimate (* (inc r) (/ (lm (inc r)) (lm r)))
            ;; Once we've switched to the smoothed estimator we never switch back.
            ;; We're also forced to switch when Nᵣ₊₁ would be out of range, i.e.
            ;; when we reach the end of nrs without having switched yet.
            lgt? (or lgt?
                     (nil? nr1)
                     (let [turing-estimate (* (inc r) (/ nr1 nr))
                           stdv (stdv-for-turing-estimate r nr nr1)]
                       (<= (Math/abs (- turing-estimate lgt-estimate))
                           (* 1.65 stdv))))]
        (recur
         (rest rs)
         (rest nrs)
         lgt?
         (conj
          result
          (if lgt?
            lgt-estimate
            (* (inc r) (/ nr1 nr)))))))))

(defn simple-good-turing
  [rs nrs]
  (assert (and (not-empty nrs) (not-empty rs))
          "frequencies and frequency-of-frequencies can't be empty")
  (let [N (apply + (map #(apply * %) (map vector rs nrs)))
        p0 (/ (first nrs) N)
        zrs (average-consecutives rs nrs)
        lm (least-squares-log-log-linear-regression rs zrs)
        lgts (map lm rs)
        r*s (r-stars rs zrs lm)
        N* (apply + (map #(apply * %) (map vector nrs r*s)))
        probs (cons
               (float p0)
               (map #(* (- 1 p0) (/ % N*)) r*s))
        sum-probs (apply + probs)]
    [(cons 0 rs) (map #(/ % sum-probs) probs)]))

(comment
  (let [rs [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26]
        nrs [32 20 10 3 1 2 1 1 1 2 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
        rs [1 2 3 4 5 6 7 8 9 10 12 26]
        nrs [32 20 10 3 1 2 1 1 1 2 1 1]]
    (map #(apply * %) (map vector rs (simple-good-turing rs nrs)))
    (simple-good-turing rs nrs))

  (let [rs [1 2 3 4 5 6 7 8 9 10 12 26]
        nrs [32 20 10 3 1 2 1 1 1 2 1 1]
        N (apply + (map #(apply * %) (map vector rs nrs)))
        p0 (/ (first nrs) N)
        zrs (average-consecutives rs nrs)
        lm (least-squares-log-log-linear-regression rs zrs)
        lgts (map lm rs)
        r*s (r-stars rs nrs lm)
        N* (apply + (map #(apply * %) (map vector nrs r*s)))
        probs (cons
               (float p0)
               (map #(* (- 1 p0) (/ % N*)) r*s))
        sum-probs (apply + probs)]
    (r-stars rs nrs lm))
  )

(defn stupid-backoff
  "Scores a sequence of words with 'stupid backoff': use the raw conditional
  frequency when the full n-gram was seen, otherwise back off to the shorter
  context, discounted by 0.4."
  [trie probs words]
  (let [times-seen (or (get-in trie (concat words [:count])) 0)]
    (cond
      ;; Down to a single word: look up its smoothed unigram probability
      ;; by its raw count.
      (= 1 (count words))
      (get-in probs [1 (:count (trie (first words)))])

      ;; The full n-gram was seen: relative frequency against its context.
      (< 0 times-seen)
      (/ times-seen (get-in trie (concat (butlast words) [:count])))

      ;; Unseen: drop the earliest context word and back off.
      :else
      (* 0.4 (stupid-backoff trie probs (rest words))))))
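
(comment
  ;; A toy illustration (not from the original file). The trie shape is
  ;; inferred from how stupid-backoff reads it: nested maps keyed by word,
  ;; each node carrying a :count. probs is assumed to map n-gram order 1 to
  ;; {raw-count smoothed-probability}, e.g. the output of simple-good-turing
  ;; reshaped into a lookup table. All numbers here are made up.
  (let [trie {"the" {:count 3
                     "cat" {:count 2
                            "sat" {:count 1}}}
              "cat" {:count 2
                     "sat" {:count 1}}
              "sat" {:count 1}}
        probs {1 {1 0.05, 2 0.08, 3 0.12}}]
    [(stupid-backoff trie probs ["the" "cat"])    ;; seen bigram   => 2/3
     (stupid-backoff trie probs ["cat" "the"])])) ;; unseen bigram => 0.4 * P("the") = 0.4 * 0.12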

(defn probability-of-sentence
  [trie probs sentence]
  ())
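
(comment
  ;; A hedged sketch of how probability-of-sentence might be completed,
  ;; assuming trigram windows and the stupid-backoff scorer above; it is not
  ;; the implementation this namespace settled on. Score each word given up
  ;; to two words of preceding context and multiply the scores. Assumes every
  ;; word of the sentence appears at least once in the trie.
  (defn probability-of-sentence-sketch
    [trie probs sentence]
    (let [sentence (vec sentence)]
      (->> (range 1 (inc (count sentence)))
           (map #(subvec sentence (max 0 (- % 3)) %))
           (map #(stupid-backoff trie probs %))
           (reduce *)))))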