More exploration of Katz back-off

main
Eric Ihli 4 years ago
parent f0e03d76a2
commit b63b8d6cf4

@ -236,13 +236,16 @@
(remove (fn [[k _]] (= :count k))))
(def r*s (sgt/trie->r*s trie))
r*s
(get-in trie ["you're" "my"])
(get-in r*s [2 :r*s])
(get-in r*s [1 :N])
(get-in trie ["you're" "my"])
(get-in trie ["my" "us"])
(get-in r*s [1 :r*s 2616])
(get-in r*s [1 :r0])
(get-in trie ["you're" :count])
(get-in trie [1 :r0])
(get-in {:a 1} '())
(sgt/katz-alpha
@ -257,7 +260,49 @@
;; => 0.1067916992217116
(sgt/katz-estimator trie r*s 0 ["you're" "my" "lady"])
;; => 0.016222893164898698
(sgt/katz-estimator trie r*s 0 ["you're" "my" "fooball"])
(sgt/katz-estimator trie r*s 0 ["you're" "my" "baz"])
(get-in trie ["you're" ])
(get-in r*s [1 :N])
(sgt/katz-beta-alpha trie r*s 0 ["you're" "not"])
;; => 0.14643662138043667
;; => 0.014190462313655283
(/ 0.14 0.014)
(/ 0.27 0.14)
(sgt/P-sub-s trie r*s 0 ["you're" "tearing" "foo"])
;; => 1.739617874207705E-4
;; Scratch check: add up sgt/P-bar for every child of the trie node found at
;; (butlast words) whose :count is greater than k.
;; words has a single element here, so (butlast words) is nil and get-in
;; returns trie itself; the children scanned are trie's top-level entries.
(let [k 0
words ["not"]]
(->> (get-in trie (butlast words))
;; drop the node's own :count entry so only child entries remain
(remove #(= :count (first %)))
;; keep children whose :count is greater than k
(filter (fn [[_ v]] (> (:count v) k)))
(map first)
;; rebuild each full key path: prefix followed by the child key
(map #(concat (butlast words) [%]))
(map #(sgt/P-bar trie r*s %))
(apply +)))
;; Scratch check: add up sgt/katz-estimator (with k = 0) for every child of
;; the trie node at (butlast words) -- here the "you're" node -- whose :count
;; is greater than 0.
(let [words ["you're" "my"]]
(->> (get-in trie (butlast words))
;; drop the node's own :count entry so only child entries remain
(remove #(= :count (first %)))
(filter (fn [[_ v]] (> (:count v) 0)))
(map first)
;; rebuild each full key path: prefix followed by the child key
(map #(concat (butlast words) [%]))
(map #(sgt/katz-estimator trie r*s 0 %))
(apply +)))
(sgt/P-bar trie r*s ["foo"])
;; Scratch check: add up sgt/katz-estimator (with k = 0) for every top-level
;; entry of trie whose :count is greater than 0.
;; words has a single element, so (butlast words) is nil and get-in returns
;; trie itself.
(let [words ["my"]]
(->> (get-in trie (butlast words))
;; drop the node's own :count entry so only child entries remain
(remove #(= :count (first %)))
(filter (fn [[_ v]] (> (:count v) 0)))
(map first)
(map #(concat (butlast words) [%]))
(map #(sgt/katz-estimator trie r*s 0 %))
(apply +)))
;; => 9.223367982725652E-6
(float (/ 1 27))
(get-in trie ["eat" "my"])

@ -276,13 +276,19 @@
(fn [[ngram rs-nrs-map]]
(let [rs (keys rs-nrs-map)
nrs (vals rs-nrs-map)
N (apply + (map #(apply * %) (map vector rs nrs)))
r0 (first nrs)
zrs (average-consecutives rs nrs)
lm (least-squares-log-log-linear-regression rs zrs)]
[ngram {:rs rs
:nrs nrs
:zrs zrs
[ngram {:N N
:r0 r0
:rs rs
:nrs (first nrs) nrs
:zrs (first nrs) zrs
:lm lm
:r*s (into (sorted-map) (map vector rs (r-stars rs zrs lm)))}]))
:r*s (into
(sorted-map)
(map vector rs (r-stars rs zrs lm)))}]))
ngram-rs-nrs-map))))
;; zrs (average-consecutives rs nrs)
@ -320,8 +326,41 @@
(declare katz-beta-alpha)
(defn theta
  "Indicator for zero: returns 1 when x is zero, otherwise 0."
  [x]
  (cond
    (zero? x) 1
    :else     0))
(defn P-bar
  "Discounted probability estimate for the n-gram `words`.

  trie  - nested map keyed by word; each node stores its frequency under :count.
  r*s   - map from n-gram order to a map holding :N, :r0 and :r*s, where :r*s
          maps an observed count to its adjusted count.
  words - sequence of words forming the n-gram.

  For a unigram the result is r*/N. For a higher-order n-gram it is
  d * c / c-1, where c is the count of `words`, c-1 the count of its prefix
  and d = r*/c the discount ratio.

  Unseen n-grams (count 0) have no adjusted count to look up, so instead of
  throwing on the division: an unseen unigram falls back to :r0 / :N (the same
  fallback katz-estimator applies to unseen unigrams), and an unseen
  higher-order n-gram yields 0."
  [trie r*s words]
  (let [n (count words)
        c (get-in trie (concat words [:count]) 0)
        N (get-in r*s [n :N])]
    (cond
      ;; seen unigram: adjusted count over the order-1 total
      (and (= 1 n) (pos? c))
      (/ (get-in r*s [n :r*s c]) N)

      ;; unseen unigram: mirror katz-estimator's r0/N fallback
      (= 1 n)
      (/ (get-in r*s [n :r0]) N)

      ;; seen higher-order n-gram: discount ratio times relative frequency.
      ;; c > 0 implies the prefix was seen too, so c-1 is non-zero.
      (pos? c)
      (let [r*  (get-in r*s [n :r*s c])
            c-1 (get-in trie (concat (butlast words) [:count]) 0)
            d   (/ r* c)]
        (* d (/ c c-1)))

      ;; unseen higher-order n-gram: no discounted mass of its own
      :else 0)))
(defn P-sub-s
  "Back-off probability estimate for the n-gram `words`.

  trie  - nested map keyed by word; each node stores its frequency under :count.
  r*s   - per-order adjusted-count data, passed through to P-bar and
          katz-beta-alpha.
  k     - count threshold; the prefix must have been seen more than k times
          for the estimate of `words` to be used directly.
  words - non-empty sequence of words forming the n-gram.

  When the count of the prefix (butlast words) exceeds k, returns
  (P-bar trie r*s words). Otherwise backs off: multiplies
  (katz-beta-alpha trie r*s k words) by the estimate for (rest words).

  The recursion stops once a single word remains, which is evaluated with
  P-bar. Without that base case a low-count prefix made the function recurse
  on ever-shorter (eventually empty) sequences until the stack overflowed."
  [trie r*s k words]
  ;; NOTE(review): the threshold is applied to the count of the prefix, not to
  ;; the count of `words` itself -- confirm this is the intended condition.
  (let [prefix-count (get-in trie (concat (butlast words) [:count]) 0)]
    (if (or (> prefix-count k)
            ;; base case: nothing shorter to back off to
            (empty? (rest words)))
      (P-bar trie r*s words)
      (* (katz-beta-alpha trie r*s k words)
         (P-sub-s trie r*s k (rest words))))))
(defn katz-estimator
[trie r*s k words]
(Thread/sleep 100)
(println words)
(if (= 1 (count words))
(let [c (get-in trie (concat words [:count]))]
(if c
(/ (get-in r*s [1 :r*s c]) (get-in r*s [1 :N]))
(/ (get-in r*s [1 :r0])
(get-in r*s [1 :N]))))
(let [r (get-in trie (concat words [:count]) 0)]
(if (> r 0)
(let [n (count words)
@ -336,7 +375,7 @@
trie
r*s
k
(rest words)))))))
(rest words))))))))
(defn katz-beta-alpha
[trie r*s k words]
@ -345,7 +384,7 @@
(filter (fn [[_ v]] (> (:count v) k)))
(map first)
(map #(concat (butlast words) [%]))
(map #(katz-estimator trie r*s k %))
(map #(P-bar trie r*s %))
(apply +))]
(- 1 ngrams)))
@ -439,8 +478,6 @@
r-1 (get-in trie (concat (butlast words) [:count]))
r* (get-in r*s [n :r*s r])
d (/ r* r)]
(Thread/sleep 100)
(println r r-1 d k words (* d (/ r r-1)))
(if (> r k)
(* d (/ r r-1))
(* (alpha trie r*s (butlast words) k)

Loading…
Cancel
Save