More exploration of Katz back-off

main
Eric Ihli 4 years ago
parent f0e03d76a2
commit b63b8d6cf4

@ -236,13 +236,16 @@
(remove (fn [[k _]] (= :count k)))) (remove (fn [[k _]] (= :count k))))
(def r*s (sgt/trie->r*s trie)) (def r*s (sgt/trie->r*s trie))
r*s
(get-in trie ["you're" "my"])
(get-in r*s [2 :r*s]) (get-in r*s [1 :N])
(get-in trie ["you're" "my"])
(get-in r*s [1 :r*s 2616])
(get-in r*s [1 :r0])
(get-in trie ["you're" :count])
(get-in trie ["my" "us"]) (get-in trie [1 :r0])
(get-in {:a 1} '()) (get-in {:a 1} '())
(sgt/katz-alpha (sgt/katz-alpha
@ -254,11 +257,53 @@
(sgt/alpha trie r*s ["eat" "my"] 2) (sgt/alpha trie r*s ["eat" "my"] 2)
(get-in trie ["you're" "my" "lady"]) (get-in trie ["you're" "my" "lady"])
(sgt/katz-estimator trie r*s 0 ["you're" "my" "head"]) (sgt/katz-estimator trie r*s 0 ["you're" "my" "head"])
;; => 0.1067916992217116 ;; => 0.1067916992217116
(sgt/katz-estimator trie r*s 0 ["you're" "my" "lady"]) (sgt/katz-estimator trie r*s 0 ["you're" "my" "lady"])
;; => 0.016222893164898698 ;; => 0.016222893164898698
(sgt/katz-estimator trie r*s 0 ["you're" "my" "fooball"]) (sgt/katz-estimator trie r*s 0 ["you're" "my" "baz"])
;; => 9.223367982725652E-6
(get-in trie ["you're" ])
(get-in r*s [1 :N])
(sgt/katz-beta-alpha trie r*s 0 ["you're" "not"])
;; => 0.14643662138043667
;; => 0.014190462313655283
(/ 0.14 0.014)
(/ 0.27 0.14)
(sgt/P-sub-s trie r*s 0 ["you're" "tearing" "foo"])
;; => 1.739617874207705E-4
(let [k 0
words ["not"]]
(->> (get-in trie (butlast words))
(remove #(= :count (first %)))
(filter (fn [[_ v]] (> (:count v) k)))
(map first)
(map #(concat (butlast words) [%]))
(map #(sgt/P-bar trie r*s %))
(apply +)))
(let [words ["you're" "my"]]
(->> (get-in trie (butlast words))
(remove #(= :count (first %)))
(filter (fn [[_ v]] (> (:count v) 0)))
(map first)
(map #(concat (butlast words) [%]))
(map #(sgt/katz-estimator trie r*s 0 %))
(apply +)))
(sgt/P-bar trie r*s ["foo"])
(let [words ["my"]]
(->> (get-in trie (butlast words))
(remove #(= :count (first %)))
(filter (fn [[_ v]] (> (:count v) 0)))
(map first)
(map #(concat (butlast words) [%]))
(map #(sgt/katz-estimator trie r*s 0 %))
(apply +)))
;; => 9.223367982725652E-6
(float (/ 1 27)) (float (/ 1 27))
(get-in trie ["eat" "my"]) (get-in trie ["eat" "my"])
(sgt/sum-of-betas trie r*s ["you're" "my"]) (sgt/sum-of-betas trie r*s ["you're" "my"])

@ -276,13 +276,19 @@
(fn [[ngram rs-nrs-map]] (fn [[ngram rs-nrs-map]]
(let [rs (keys rs-nrs-map) (let [rs (keys rs-nrs-map)
nrs (vals rs-nrs-map) nrs (vals rs-nrs-map)
N (apply + (map #(apply * %) (map vector rs nrs)))
r0 (first nrs)
zrs (average-consecutives rs nrs) zrs (average-consecutives rs nrs)
lm (least-squares-log-log-linear-regression rs zrs)] lm (least-squares-log-log-linear-regression rs zrs)]
[ngram {:rs rs [ngram {:N N
:nrs nrs :r0 r0
:zrs zrs :rs rs
:nrs (first nrs) nrs
:zrs (first nrs) zrs
:lm lm :lm lm
:r*s (into (sorted-map) (map vector rs (r-stars rs zrs lm)))}])) :r*s (into
(sorted-map)
(map vector rs (r-stars rs zrs lm)))}]))
ngram-rs-nrs-map)))) ngram-rs-nrs-map))))
;; zrs (average-consecutives rs nrs) ;; zrs (average-consecutives rs nrs)
@ -320,23 +326,56 @@
(declare katz-beta-alpha) (declare katz-beta-alpha)
(defn theta [x]
(if (zero? x) 1 0))
(defn P-bar
[trie r*s words]
(let [n (count words)
c (get-in trie (concat words [:count]) 0)
r* (get-in r*s [n :r*s c])
N (get-in r*s [n :N])]
(if (= 1 n)
(/ r* N)
(let [c-1 (get-in trie (concat (butlast words) [:count]) 0)
d (/ r* c)]
(println "dr" d r* c)
(* d (/ c c-1))))))
(defn P-sub-s
[trie r*s k words]
(let [c (get-in trie (concat (butlast words) [:count]) 0)]
(if (> c k)
(P-bar trie r*s words)
(let [alpha (katz-beta-alpha trie r*s k words)]
(* alpha (P-sub-s trie r*s k (rest words)))))))
(defn katz-estimator (defn katz-estimator
[trie r*s k words] [trie r*s k words]
(let [r (get-in trie (concat words [:count]) 0)] (Thread/sleep 100)
(if (> r 0) (println words)
(let [n (count words) (if (= 1 (count words))
r* (get-in r*s [n :r*s r]) (let [c (get-in trie (concat words [:count]))]
r-1 (get-in trie (concat (butlast words) [:count]) 1) (if c
d (/ r* r)] (/ (get-in r*s [1 :r*s c]) (get-in r*s [1 :N]))
(* d (/ r r-1))) (/ (get-in r*s [1 :r0])
(let [alpha (/ (katz-beta-alpha trie r*s k words) (get-in r*s [1 :N]))))
(katz-beta-alpha trie r*s k (rest words)))] (let [r (get-in trie (concat words [:count]) 0)]
(* alpha (if (> r 0)
(katz-estimator (let [n (count words)
trie r* (get-in r*s [n :r*s r])
r*s r-1 (get-in trie (concat (butlast words) [:count]) 1)
k d (/ r* r)]
(rest words))))))) (* d (/ r r-1)))
(let [alpha (/ (katz-beta-alpha trie r*s k words)
(katz-beta-alpha trie r*s k (rest words)))]
(* alpha
(katz-estimator
trie
r*s
k
(rest words))))))))
(defn katz-beta-alpha (defn katz-beta-alpha
[trie r*s k words] [trie r*s k words]
@ -345,7 +384,7 @@
(filter (fn [[_ v]] (> (:count v) k))) (filter (fn [[_ v]] (> (:count v) k)))
(map first) (map first)
(map #(concat (butlast words) [%])) (map #(concat (butlast words) [%]))
(map #(katz-estimator trie r*s k %)) (map #(P-bar trie r*s %))
(apply +))] (apply +))]
(- 1 ngrams))) (- 1 ngrams)))
@ -439,8 +478,6 @@
r-1 (get-in trie (concat (butlast words) [:count])) r-1 (get-in trie (concat (butlast words) [:count]))
r* (get-in r*s [n :r*s r]) r* (get-in r*s [n :r*s r])
d (/ r* r)] d (/ r* r)]
(Thread/sleep 100)
(println r r-1 d k words (* d (/ r r-1)))
(if (> r k) (if (> r k)
(* d (/ r r-1)) (* d (/ r r-1))
(* (alpha trie r*s (butlast words) k) (* (alpha trie r*s (butlast words) k)

Loading…
Cancel
Save