From f6c30ba948a5b8bbd62650c73e92bbcf62d4498f Mon Sep 17 00:00:00 2001
From: Eric Ihli
Date: Tue, 20 Jul 2021 08:55:58 -0500
Subject: [PATCH] Update markov code

---
Reviewer note: this copy was re-split from a whitespace-collapsed version of
the patch. Line breaks follow the hunk headers (13/15, 7/7 and 13/42 lines);
indentation and the position of blank context lines were reconstructed by
convention and should be confirmed against markov.clj. If context lines do
not match, `git apply --ignore-whitespace` or a manual adjustment may be
needed.

 src/com/owoga/corpus/markov.clj | 49 +++++++++++++++++++++++++++------
 1 file changed, 40 insertions(+), 9 deletions(-)

diff --git a/src/com/owoga/corpus/markov.clj b/src/com/owoga/corpus/markov.clj
index 9835b36..4fb04a6 100644
--- a/src/com/owoga/corpus/markov.clj
+++ b/src/com/owoga/corpus/markov.clj
@@ -1125,13 +1125,15 @@
       :else (recur (butlast lookup))))))
 
 (defn trie-frequencies [node]
-  (->> node
-       trie/children
-       (map #(second (get % [])))
-       frequencies
-       vec
-       (sort-by first)
-       (into (sorted-map))))
+  (if (nil? node)
+    nil
+    (->> node
+         trie/children
+         (map #(second (get % [])))
+         frequencies
+         vec
+         (sort-by first)
+         (into (sorted-map)))))
 
 (comment
   (time (trie-frequencies (trie/lookup markov-tight-trie [107])))
@@ -1140,7 +1142,7 @@
 (defn perplexity
   "TODO:
   - Tests
-  - Katz back-off
+  - *** Katz back-off
   - Performance
   "
   [rank model n-gram]
@@ -1163,13 +1165,42 @@
   (perplexity 2 markov-tight-trie [1 1 7 90]);; => -14.360605720470575
   (perplexity 2 markov-tight-trie [1 1 7 89]);; => -12.98036901624079
   (perplexity 2 markov-tight-trie [1 1 7 174]);; => -11.84336736411312
-
+  (perplexity 4 markov-tight-trie [22 22 22 22 34 34 18])
   (trie/lookup markov-tight-trie [1 1 7 90])
   (trie/lookup markov-tight-trie [1 1 7 89])
   (map database [1 1 7])
   )
 
 
+(defn perplexity-add-one
+  "If you're only using perplexity to compare phrases generated using
+  the same model, this might be a reasonable and simple alternative
+  to Katz Back-Off.
+
+  Just give everything with 0 frequencies a freq of 1."
+  [rank model n-gram]
+  (loop [i 1
+         n-gram n-gram
+         log-prob 0]
+    (if (> i (count n-gram))
+      log-prob
+      (recur (min (inc i) rank)
+             (if (= i rank) (rest n-gram) n-gram)
+             (let [words (take i n-gram)
+                   child (trie/lookup model words)
+                   parent (trie/lookup model (butlast words))
+                   w1-freq (second (get child [] [nil 1]))
+                   freqs (trie-frequencies parent)
+                   sgt (math/frequencies->simple-good-turing-probabilities freqs)]
+               (+ log-prob (Math/log (sgt w1-freq))))))))
+
+(comment
+  (perplexity-add-one 2 markov-tight-trie [1 1 7 90]);; => -14.360605720470575
+  (perplexity-add-one 4 markov-tight-trie [22 22 22 22 34 34 18])
+  (trie/lookup markov-tight-trie [1 1 7 90])
+  (trie/lookup markov-tight-trie [1 1 7 89])
+  (map database [1 1 7])
+  )
 ;;;; WGU
 
 (defn gen-rhyme-tree