From b63b8d6cf4e84a63e176b86776d0edb0c8c6938f Mon Sep 17 00:00:00 2001 From: Eric Ihli Date: Wed, 30 Dec 2020 20:48:34 -0800 Subject: [PATCH] More exploration of Katz back-off --- dev/examples/scratch.clj | 61 ++++++++++++-- .../prhyme/generation/simple_good_turing.clj | 81 ++++++++++++++----- 2 files changed, 112 insertions(+), 30 deletions(-) diff --git a/dev/examples/scratch.clj b/dev/examples/scratch.clj index bb1897d..9b9ae1b 100644 --- a/dev/examples/scratch.clj +++ b/dev/examples/scratch.clj @@ -236,13 +236,16 @@ (remove (fn [[k _]] (= :count k)))) (def r*s (sgt/trie->r*s trie)) - r*s - (get-in trie ["you're" "my"]) - (get-in r*s [2 :r*s]) + (get-in r*s [1 :N]) + + (get-in trie ["you're" "my"]) + (get-in r*s [1 :r*s 2616]) + (get-in r*s [1 :r0]) + (get-in trie ["you're" :count]) - (get-in trie ["my" "us"]) + (get-in trie [1 :r0]) (get-in {:a 1} '()) (sgt/katz-alpha @@ -254,11 +257,53 @@ (sgt/alpha trie r*s ["eat" "my"] 2) (get-in trie ["you're" "my" "lady"]) (sgt/katz-estimator trie r*s 0 ["you're" "my" "head"]) -;; => 0.1067916992217116 + ;; => 0.1067916992217116 (sgt/katz-estimator trie r*s 0 ["you're" "my" "lady"]) -;; => 0.016222893164898698 - (sgt/katz-estimator trie r*s 0 ["you're" "my" "fooball"]) -;; => 9.223367982725652E-6 + ;; => 0.016222893164898698 + (sgt/katz-estimator trie r*s 0 ["you're" "my" "baz"]) + + (get-in trie ["you're" ]) + (get-in r*s [1 :N]) + (sgt/katz-beta-alpha trie r*s 0 ["you're" "not"]) + ;; => 0.14643662138043667 + ;; => 0.014190462313655283 + (/ 0.14 0.014) + (/ 0.27 0.14) + (sgt/P-sub-s trie r*s 0 ["you're" "tearing" "foo"]) + +;; => 1.739617874207705E-4 + + (let [k 0 + words ["not"]] + (->> (get-in trie (butlast words)) + (remove #(= :count (first %))) + (filter (fn [[_ v]] (> (:count v) k))) + (map first) + (map #(concat (butlast words) [%])) + (map #(sgt/P-bar trie r*s %)) + (apply +))) + + (let [words ["you're" "my"]] + (->> (get-in trie (butlast words)) + (remove #(= :count (first %))) + (filter (fn [[_ v]] (> (:count v) 0))) + (map first) + (map #(concat (butlast words) [%])) + (map #(sgt/katz-estimator trie r*s 0 %)) + (apply +))) + (sgt/P-bar trie r*s ["foo"]) + + (let [words ["my"]] + (->> (get-in trie (butlast words)) + (remove #(= :count (first %))) + (filter (fn [[_ v]] (> (:count v) 0))) + (map first) + (map #(concat (butlast words) [%])) + (map #(sgt/katz-estimator trie r*s 0 %)) + (apply +))) + + + ;; => 9.223367982725652E-6 (float (/ 1 27)) (get-in trie ["eat" "my"]) (sgt/sum-of-betas trie r*s ["you're" "my"]) diff --git a/src/com/owoga/prhyme/generation/simple_good_turing.clj b/src/com/owoga/prhyme/generation/simple_good_turing.clj index c0a863f..92d1a86 100644 --- a/src/com/owoga/prhyme/generation/simple_good_turing.clj +++ b/src/com/owoga/prhyme/generation/simple_good_turing.clj @@ -276,13 +276,19 @@ (fn [[ngram rs-nrs-map]] (let [rs (keys rs-nrs-map) nrs (vals rs-nrs-map) + N (apply + (map #(apply * %) (map vector rs nrs))) + r0 (first nrs) zrs (average-consecutives rs nrs) lm (least-squares-log-log-linear-regression rs zrs)] - [ngram {:rs rs - :nrs nrs - :zrs zrs + [ngram {:N N + :r0 r0 + :rs rs + :nrs (first nrs) nrs + :zrs (first nrs) zrs :lm lm - :r*s (into (sorted-map) (map vector rs (r-stars rs zrs lm)))}])) + :r*s (into + (sorted-map) + (map vector rs (r-stars rs zrs lm)))}])) ngram-rs-nrs-map)))) ;; zrs (average-consecutives rs nrs) @@ -320,23 +326,56 @@ (declare katz-beta-alpha) +(defn theta [x] + (if (zero? x) 1 0)) + +(defn P-bar + [trie r*s words] + (let [n (count words) + c (get-in trie (concat words [:count]) 0) + r* (get-in r*s [n :r*s c]) + N (get-in r*s [n :N])] + (if (= 1 n) + (/ r* N) + (let [c-1 (get-in trie (concat (butlast words) [:count]) 0) + d (/ r* c)] + (println "dr" d r* c) + (* d (/ c c-1)))))) + +(defn P-sub-s + [trie r*s k words] + (let [c (get-in trie (concat (butlast words) [:count]) 0)] + (if (> c k) + (P-bar trie r*s words) + (let [alpha (katz-beta-alpha trie r*s k words)] + (* alpha (P-sub-s trie r*s k (rest words))))))) + + (defn katz-estimator [trie r*s k words] - (let [r (get-in trie (concat words [:count]) 0)] - (if (> r 0) - (let [n (count words) - r* (get-in r*s [n :r*s r]) - r-1 (get-in trie (concat (butlast words) [:count]) 1) - d (/ r* r)] - (* d (/ r r-1))) - (let [alpha (/ (katz-beta-alpha trie r*s k words) - (katz-beta-alpha trie r*s k (rest words)))] - (* alpha - (katz-estimator - trie - r*s - k - (rest words))))))) + (Thread/sleep 100) + (println words) + (if (= 1 (count words)) + (let [c (get-in trie (concat words [:count]))] + (if c + (/ (get-in r*s [1 :r*s c]) (get-in r*s [1 :N])) + (/ (get-in r*s [1 :r0]) + (get-in r*s [1 :N])))) + (let [r (get-in trie (concat words [:count]) 0)] + (if (> r 0) + (let [n (count words) + r* (get-in r*s [n :r*s r]) + r-1 (get-in trie (concat (butlast words) [:count]) 1) + d (/ r* r)] + (* d (/ r r-1))) + (let [alpha (/ (katz-beta-alpha trie r*s k words) + (katz-beta-alpha trie r*s k (rest words)))] + (* alpha + (katz-estimator + trie + r*s + k + (rest words)))))))) (defn katz-beta-alpha [trie r*s k words] @@ -345,7 +384,7 @@ (filter (fn [[_ v]] (> (:count v) k))) (map first) (map #(concat (butlast words) [%])) - (map #(katz-estimator trie r*s k %)) + (map #(P-bar trie r*s %)) (apply +))] (- 1 ngrams))) @@ -439,8 +478,6 @@ r-1 (get-in trie (concat (butlast words) [:count])) r* (get-in r*s [n :r*s r]) d (/ r* r)] - (Thread/sleep 100) - (println r r-1 d k words (* d (/ r r-1))) (if (> r k) (* d (/ r r-1)) (* (alpha trie r*s (butlast words) k)