@@ -1125,13 +1125,15 @@
       :else (recur (butlast lookup))))))
 
 (defn trie-frequencies [node]
-  (->> node
-       trie/children
-       (map #(second (get % [])))
-       frequencies
-       vec
-       (sort-by first)
-       (into (sorted-map))))
+  (if (nil? node)
+    nil
+    (->> node
+         trie/children
+         (map #(second (get % [])))
+         frequencies
+         vec
+         (sort-by first)
+         (into (sorted-map)))))
 
 (comment
   (time (trie-frequencies (trie/lookup markov-tight-trie [107])))
@@ -1140,7 +1142,7 @@
 (defn perplexity
   "TODO:
   - Tests
-  - Katz back-off
+  - *** Katz back-off
   - Performance
   "
   [rank model n-gram]
@@ -1163,13 +1165,42 @@
   (perplexity 2 markov-tight-trie [1 1 7 90]);; => -14.360605720470575
   (perplexity 2 markov-tight-trie [1 1 7 89]);; => -12.98036901624079
   (perplexity 2 markov-tight-trie [1 1 7 174]);; => -11.84336736411312
+  (perplexity 4 markov-tight-trie [22 22 22 22 34 34 18])
   (trie/lookup markov-tight-trie [1 1 7 90])
   (trie/lookup markov-tight-trie [1 1 7 89])
   (map database [1 1 7])
   )
 
+(defn perplexity-add-one
+  "If you're only using perplexity to compare phrases generated using
+  the same model, this might be a reasonable and simple alternative
+  to Katz Back-Off.
+
+  Just give everything with 0 frequencies a freq of 1."
+  [rank model n-gram]
+  (loop [i 1
+         n-gram n-gram
+         log-prob 0]
+    (if (> i (count n-gram))
+      log-prob
+      (recur (min (inc i) rank)
+             (if (= i rank) (rest n-gram) n-gram)
+             (let [words (take i n-gram)
+                   child (trie/lookup model words)
+                   parent (trie/lookup model (butlast words))
+                   w1-freq (second (get child [] [nil 1]))
+                   freqs (trie-frequencies parent)
+                   sgt (math/frequencies->simple-good-turing-probabilities freqs)]
+               (+ log-prob (Math/log (sgt w1-freq))))))))
+
+(comment
+  (perplexity-add-one 2 markov-tight-trie [1 1 7 90]);; => -14.360605720470575
+  (perplexity-add-one 4 markov-tight-trie [22 22 22 22 34 34 18])
+  (trie/lookup markov-tight-trie [1 1 7 90])
+  (trie/lookup markov-tight-trie [1 1 7 89])
+  (map database [1 1 7])
+  )
+
 ;;;; WGU
 
 (defn gen-rhyme-tree