From 33b5f26d0b60a4ec7a547a857fef5ab3ff7f56c7 Mon Sep 17 00:00:00 2001 From: Eric Ihli Date: Tue, 29 Dec 2020 21:08:59 -0800 Subject: [PATCH] Move towards Katz backoff --- dev/examples/scratch.clj | 49 +++++++++- .../prhyme/generation/simple_good_turing.clj | 97 +++++++++++++++++-- 2 files changed, 136 insertions(+), 10 deletions(-) diff --git a/dev/examples/scratch.clj b/dev/examples/scratch.clj index 5b4d897..4fdead2 100644 --- a/dev/examples/scratch.clj +++ b/dev/examples/scratch.clj @@ -2,6 +2,7 @@ (:require [clojure.java.io :as io] [clojure.string :as string] [clojure.set] + [com.owoga.prhyme.data.dictionary :as dict] [com.owoga.prhyme.nlp.core :as nlp] [com.owoga.prhyme.generation.simple-good-turing :as sgt] [com.owoga.prhyme.util.math :as math])) @@ -198,6 +199,12 @@ (conj words (first word)) (conj freqs (second word)))))))))) + +(defn normalize [coll] + (let [s (apply + coll)] + (map #(/ % s) coll))) + + (comment (def trie (let [documents (->> "dark-corpus" @@ -205,18 +212,53 @@ file-seq (remove #(.isDirectory %)))] (->> documents + (take 10000) (map slurp) (mapcat #(string/split % #"\n")) (map tokenize-line) (filter #(> (count %) 1)) - (take 5000) (reduce (fn [acc tokens] (-> (add-to-trie-1 acc 1 tokens) (add-to-trie-1 2 tokens) (add-to-trie-1 3 tokens))) {})))) - (count trie) + + (->> (get-in trie ["you're" "my"]) + (remove (fn [[k _]] (= :count k)))) + + (def r*s (sgt/trie->r*s trie)) + + (def probs + (->> (range 1 4) + (map #(vector % (filter-trie-to-ngrams trie %))) + (map (fn [[n v]] [n (map #(second %) v)])) + (map (fn [[n v]] [n (into (sorted-map) (frequencies v))])) + (map (fn [[n v]] [n (sgt/simple-good-turing (keys v) (vals v))])) + (map (fn [[n [rs probs]]] + [n (into {} (map vector rs probs))])) + (into {}))) + + (sgt/katz-backoff trie probs r*s) + ;; probability of 3-grams + (let [bigram ["eat" "my"] + trigrams (map #(conj bigram %) dict/popular)] + (->> trigrams + (map #(vector % (sgt/stupid-backoff trie probs %))) + (map #(apply vec %)) + (sort-by second) + (reverse) + (take 20))) + + (repeatedly + 10 + (fn [] + (let [bigram ["eat" "my"] + trigrams (map #(conj bigram %) dict/popular)] + (->> trigrams + (map #(vector % (sgt/stupid-backoff trie probs %))) + (take 10))))) + (let [documents (->> "dark-corpus" io/file file-seq @@ -422,7 +464,8 @@ "g" {:count 1}} "i" {:count 1 "g" {:count 1}}}}] - (filter-trie-to-ngrams trie 3)) + (filter-trie-to-ngrams trie 3) + (sgt/trie->r*s trie)) ) diff --git a/src/com/owoga/prhyme/generation/simple_good_turing.clj b/src/com/owoga/prhyme/generation/simple_good_turing.clj index 6ac687a..a6a81fc 100644 --- a/src/com/owoga/prhyme/generation/simple_good_turing.clj +++ b/src/com/owoga/prhyme/generation/simple_good_turing.clj @@ -229,20 +229,103 @@ ) +(defn flatmap + ([m] + (flatmap m [])) + ([m prefix] + (mapcat + (fn [[k v]] + (if (map? v) + (flatmap v (conj prefix k)) + [(conj prefix k) v])) + m))) + +(defn filter-trie-to-ngrams [trie n] + (->> trie + (flatmap) + (partition 2) + ;; Inc to account for :count + (filter #(= (inc n) (count (first %)))))) + +(defn depth-of-map + [m] + (loop [d 0 + m m] + (let [child-maps (filter map? (vals m))] + (if (empty? child-maps) + d + (recur (inc d) (first child-maps)))))) + +(defn trie->r*s + [trie] + (let [depth (depth-of-map trie) + ngram-rs-nrs-map + (into + {} + (map + (fn [d] + (let [flattened (filter-trie-to-ngrams trie d)] + [d (into (sorted-map) (frequencies (map second flattened)))])) + (range 1 (inc depth))))] + (into + {} + (map + (fn [[ngram rs-nrs-map]] + (let [rs (keys rs-nrs-map) + nrs (vals rs-nrs-map) + zrs (average-consecutives rs nrs) + lm (least-squares-log-log-linear-regression rs zrs)] + [ngram {:rs rs + :nrs nrs + :zrs zrs + :lm lm + :r*s (r-stars rs zrs lm)}])) + ngram-rs-nrs-map)))) + + ;; zrs (average-consecutives rs nrs) + ;; lm (least-squares-log-log-linear-regression rs zrs) + (defn stupid-backoff [trie probs words] (let [times-seen (or (get-in trie (concat words [:count])) 0)] + #_(when (and (> times-seen 0) + (> (count words) 2)) + (Thread/sleep 100) + (println "Seen" words times-seen (get-in trie (concat (butlast words) [:count])))) (cond (= 1 (count words)) - (get-in probs [1 (:count (trie (first words)))]) + (let [r (get-in probs [1 (:count (get trie (first words) {:count 0}))])] + (if (nil? r) + (println "1" words) + r)) (< 0 times-seen) - (/ times-seen (get-in trie (concat (butlast words) [:count]))) + (let [r (/ times-seen (get-in trie (concat words [:count])))] + (if (nil? r) + (println "0" words) + r)) :else - (* 0.4 (stupid-backoff trie probs (butlast words)))))) - -(defn probability-of-sentence - [trie probs sentence] - ()) + (* 0.4 (stupid-backoff trie probs (rest words)))))) + +(defn katz-backoff + [trie probs r*s words] + (let [k 0 + n (count words) + c (get-in trie (concat words [:count]) 0) + d (/ (r*s c) c) + v (* d (/ c (get-in trie (concat (butlast words) [:count]))))] + (if (> c k) + v + (let [words (butlast words) + b (->> (get-in trie words) + (remove (fn [[k _]] (= :count k))) + (filter (fn [[k v]] (> (:count v) k))) + (map (fn [[k v]] + (let [c-den (:count v) + c-num (get-in trie (concat words [k :count])) + d (/ (r*s c) )] + (* d (/ c-num c-den))))) + (apply +))] + b))))