Move towards Katz backoff

main
Eric Ihli 4 years ago
parent 72e06612bf
commit 33b5f26d0b

@ -2,6 +2,7 @@
(:require [clojure.java.io :as io]
[clojure.string :as string]
[clojure.set]
[com.owoga.prhyme.data.dictionary :as dict]
[com.owoga.prhyme.nlp.core :as nlp]
[com.owoga.prhyme.generation.simple-good-turing :as sgt]
[com.owoga.prhyme.util.math :as math]))
@ -198,6 +199,12 @@
(conj words (first word))
(conj freqs (second word))))))))))
(defn normalize [coll]
(let [s (apply + coll)]
(map #(/ % s) coll)))
(comment
(def trie
(let [documents (->> "dark-corpus"
@ -205,18 +212,53 @@
file-seq
(remove #(.isDirectory %)))]
(->> documents
(take 10000)
(map slurp)
(mapcat #(string/split % #"\n"))
(map tokenize-line)
(filter #(> (count %) 1))
(take 5000)
(reduce
(fn [acc tokens]
(-> (add-to-trie-1 acc 1 tokens)
(add-to-trie-1 2 tokens)
(add-to-trie-1 3 tokens)))
{}))))
(count trie)
(->> (get-in trie ["you're" "my"])
(remove (fn [[k _]] (= :count k))))
(def r*s (sgt/trie->r*s trie))
(def probs
(->> (range 1 4)
(map #(vector % (filter-trie-to-ngrams trie %)))
(map (fn [[n v]] [n (map #(second %) v)]))
(map (fn [[n v]] [n (into (sorted-map) (frequencies v))]))
(map (fn [[n v]] [n (sgt/simple-good-turing (keys v) (vals v))]))
(map (fn [[n [rs probs]]]
[n (into {} (map vector rs probs))]))
(into {})))
(sgt/katz-backoff trie probs r*s)
;; probability of 3-grams
(let [bigram ["eat" "my"]
trigrams (map #(conj bigram %) dict/popular)]
(->> trigrams
(map #(vector % (sgt/stupid-backoff trie probs %)))
(map #(apply vec %))
(sort-by second)
(reverse)
(take 20)))
(repeatedly
10
(fn []
(let [bigram ["eat" "my"]
trigrams (map #(conj bigram %) dict/popular)]
(->> trigrams
(map #(vector % (sgt/stupid-backoff trie probs %)))
(take 10)))))
(let [documents (->> "dark-corpus"
io/file
file-seq
@ -422,7 +464,8 @@
"g" {:count 1}}
"i" {:count 1
"g" {:count 1}}}}]
(filter-trie-to-ngrams trie 3))
(filter-trie-to-ngrams trie 3)
(sgt/trie->r*s trie))
)

@ -229,20 +229,103 @@
)
(defn flatmap
([m]
(flatmap m []))
([m prefix]
(mapcat
(fn [[k v]]
(if (map? v)
(flatmap v (conj prefix k))
[(conj prefix k) v]))
m)))
(defn filter-trie-to-ngrams [trie n]
(->> trie
(flatmap)
(partition 2)
;; Inc to account for :count
(filter #(= (inc n) (count (first %))))))
(defn depth-of-map
[m]
(loop [d 0
m m]
(let [child-maps (filter map? (vals m))]
(if (empty? child-maps)
d
(recur (inc d) (first child-maps))))))
(defn trie->r*s
[trie]
(let [depth (depth-of-map trie)
ngram-rs-nrs-map
(into
{}
(map
(fn [d]
(let [flattened (filter-trie-to-ngrams trie d)]
[d (into (sorted-map) (frequencies (map second flattened)))]))
(range 1 (inc depth))))]
(into
{}
(map
(fn [[ngram rs-nrs-map]]
(let [rs (keys rs-nrs-map)
nrs (vals rs-nrs-map)
zrs (average-consecutives rs nrs)
lm (least-squares-log-log-linear-regression rs zrs)]
[ngram {:rs rs
:nrs nrs
:zrs zrs
:lm lm
:r*s (r-stars rs zrs lm)}]))
ngram-rs-nrs-map))))
;; zrs (average-consecutives rs nrs)
;; lm (least-squares-log-log-linear-regression rs zrs)
(defn stupid-backoff
[trie probs words]
(let [times-seen (or (get-in trie (concat words [:count])) 0)]
#_(when (and (> times-seen 0)
(> (count words) 2))
(Thread/sleep 100)
(println "Seen" words times-seen (get-in trie (concat (butlast words) [:count]))))
(cond
(= 1 (count words))
(get-in probs [1 (:count (trie (first words)))])
(let [r (get-in probs [1 (:count (get trie (first words) {:count 0}))])]
(if (nil? r)
(println "1" words)
r))
(< 0 times-seen)
(/ times-seen (get-in trie (concat (butlast words) [:count])))
(let [r (/ times-seen (get-in trie (concat words [:count])))]
(if (nil? r)
(println "0" words)
r))
:else
(* 0.4 (stupid-backoff trie probs (butlast words))))))
(defn probability-of-sentence
[trie probs sentence]
())
(* 0.4 (stupid-backoff trie probs (rest words))))))
(defn katz-backoff
[trie probs r*s words]
(let [k 0
n (count words)
c (get-in trie (concat words [:count]) 0)
d (/ (r*s c) c)
v (* d (/ c (get-in trie (concat (butlast words) [:count]))))]
(if (> c k)
v
(let [words (butlast words)
b (->> (get-in trie words)
(remove (fn [[k _]] (= :count k)))
(filter (fn [[k v]] (> (:count v) k)))
(map (fn [[k v]]
(let [c-den (:count v)
c-num (get-in trie (concat words [k :count]))
d (/ (r*s c) )]
(* d (/ c-num c-den)))))
(apply +))]
b))))

Loading…
Cancel
Save