(ns examples.scratch (:require [clojure.java.io :as io] [clojure.string :as string] [clojure.set] [com.owoga.prhyme.nlp.core :as nlp])) (def re-word "Regex for tokenizing a string into words (including contractions and hyphenations), commas, periods, and newlines." #"(?s).*?([a-zA-Z\d]+(?:['\-]?[a-zA-Z]+)?|,|\.|\n)") (defn tokenize "Tokenizes for suffix trie. First token is end of document." [text] (->> text (re-seq re-word) (map second) (map string/lower-case) (reverse) (cons :end))) (comment (-> (slurp "dev/examples/sandman.txt") tokenize)) (defn zero-to-n-seq ([coll] (zero-to-n-seq coll 1)) ([coll i] (let [l (count coll)] (if (> i l) nil (cons (take i coll) (lazy-seq (zero-to-n-seq coll (inc i)))))))) (comment (zero-to-n-seq '(1 2 3 4)) ;; => ((1) (1 2) (1 2 3) (1 2 3 4)) ) (defn i-to-j-seq ([coll i j] (zero-to-n-seq (->> coll (drop i) (take (- j i)))))) (defn n-to-zero-seq ([coll] (n-to-zero-seq coll 0)) ([coll i] (if (= i (count coll)) nil (cons (drop i coll) (lazy-seq (n-to-zero-seq coll (inc i))))))) (comment (n-to-zero-seq '(1 2 3 4)) ;; => ((1 2 3 4) (2 3 4) (3 4) (4)) ) (defn add-to-trie [trie coll] (update-in trie (concat coll [:count]) (fnil inc 0))) (defn add-multiple-to-trie [trie colls] (loop [colls colls trie trie] (cond (empty? colls) trie :else (recur (rest colls) (add-to-trie trie (first colls)))))) (defn n-gram-suffix-trie "Creates a suffix trie of 1-gram to n-gram. Useful for backoff language model (I think)." [n tokens] (let [trie {} windows (partition (inc n) 1 tokens)] (loop [trie trie windows windows] (cond (= 1 (count windows)) (add-multiple-to-trie trie (concat (zero-to-n-seq (first windows)) (rest (n-to-zero-seq (first windows))))) :else (recur (add-multiple-to-trie trie (zero-to-n-seq (first windows))) (rest windows)))))) (comment (let [last-window '("in" "the" "frat")] (concat (zero-to-n-seq last-window) (rest (n-to-zero-seq last-window)))) ;; => (("in") ("in" "the") ("in" "the" "frat") ("the" "frat") ("frat")) (n-gram-suffix-trie 2 (string/split "the cat in the hat is the rat in the frat" #" ")) ;; => {"the" ;; {:count 3, ;; "cat" {:count 1, "in" {:count 1}}, ;; "hat" {:count 1, "is" {:count 1}}, ;; "rat" {:count 1, "in" {:count 1}}, ;; "frat" {:count 1}}, ;; "cat" {:count 1, "in" {:count 1, "the" {:count 1}}}, ;; "in" {:count 2, "the" {:count 2, "hat" {:count 1}, "frat" {:count 1}}}, ;; "hat" {:count 1, "is" {:count 1, "the" {:count 1}}}, ;; "is" {:count 1, "the" {:count 1, "rat" {:count 1}}}, ;; "rat" {:count 1, "in" {:count 1, "the" {:count 1}}}, ;; "frat" {:count 1}} ) (comment (def unigram (n-gram-suffix-trie 1 (tokenize (slurp "dev/examples/sandman.txt")))) unigram (->> unigram (map (fn [[k v]] (vector k (:count v)))) (map second) (apply +)) (def bigram (n-gram-suffix-trie 2 (tokenize (slurp "dev/examples/sandman.txt")))) (->> bigram (map (fn [[k v]] (vector k (:count v)))) (map second) (apply +)) (count bigram) (->> bigram (take 4) (into {})) ;; => {"cutest" {:count 2, "the" {:count 2, "him" {:count 2}}}, ;; "us" {:count 3, "bring" {:count 3, "," {:count 2}, "yeesss" {:count 1}}}, ;; "his" {:count 2, "that" {:count 2, "him" {:count 2}}}, ;; "him" ;; {:count 8, ;; "give" {:count 4, "\n" {:count 4}}, ;; "tell" {:count 2, "then" {:count 2}}, ;; "make" {:count 2, "\n" {:count 2}}}} (->> bigram vals (map :count) frequencies (into []) sort (map #(apply * %)) (apply +)) (count (tokenize (slurp "dev/examples/sandman.txt"))) ;; => ([1 32] [2 20] [3 10] [4 3] [5 1] [6 2] [7 1] [8 2] [9 1] [10 1] [12 1] [26 1]) ) (defn P [trie w] (let [ws (trie w) c (get-in trie [w :count])] (->> ws (#(dissoc % :count)) (map (fn [[k v]] [k (/ (:count v) c)]))))) (defn vals-or-seconds [m] (cond (empty? m) m (map? m) (apply concat (vals m)) :else (apply concat (map second m)))) (defn flat-at-depth "Convenience way of getting frequencies of n-grams. Given a trie with a depth of 0, it will return all 1-grams key/value pairs. That collection can be filtered for keys that hold the freqs." [m depth] (let [m (if (map? m) (into [] m) m)] (cond (<= depth 0) m :else (flat-at-depth (->> m (mapcat second) (remove #(= :count (first %)))) (dec depth))))) (comment (let [trie {"d" {:count 3 "o" {:count 3 "g" {:count 2} "t" {:count 1}} "a" {:count 1 "y" {:count 1}}} "f" {:count 2 "o" {:count 1 "g" {:count 1}} "i" {:count 1 "g" {:count 1}}}}] (->> (flat-at-depth trie 2))) ) ;; Let Nc be the number of N-grams that occur c times. ;; Good-turing discounting: ;; c* = (c + 1) * Nc+1 / Nc (defn n-gram-frequencies [trie n] (if (< n 0) {} (->> trie (#(flat-at-depth % (dec n))) (map second) (map :count) frequencies (into (sorted-map))))) (defn n-gram->occurence-count-frequencies [trie n] (n-gram-frequencies trie n)) (comment (def tokens ["d" "o" "g" "\n" "d" "a" "y" "\n" "d" "o" "g" "\n" "d" "o" "t"]) (def trie (n-gram-suffix-trie 2 tokens)) trie ;; => {"d" ;; {:count 4, ;; "o" {:count 3, "g" {:count 2}, "t" {:count 1}}, ;; "a" {:count 1, "y" {:count 1}}}, ;; "o" {:count 2, "g" {:count 2, "\n" {:count 2}}}, ;; "g" {:count 2, "\n" {:count 2, "d" {:count 2}}}, ;; "\n" {:count 3, "d" {:count 3, "a" {:count 1}, "o" {:count 2}}}, ;; "a" {:count 1, "y" {:count 1, "\n" {:count 1}}}, ;; "y" {:count 1, "\n" {:count 1, "d" {:count 1}}}} (count bigram) (count (flat-at-depth bigram 0)) (->> bigram (#(flat-at-depth % 0)) (filter #(= :count (first %))) (map second) frequencies (into (sorted-map)) (map #(apply * %)) (apply +)) (n-gram-frequencies trie 2) ;; => {3 2, 1 3, 2 2} ;; for bigrams ;; of frequency 3 occurs 2 times ;; of frequency 2 occurs 2 times ;; of frequency 1 occurs 3 times (n-gram-frequencies trie 1) ;; => {4 1, 2 2, 3 1, 1 2} ) (defn num-seen-n-grams [trie n] (->> trie (#(flat-at-depth % (dec n))) (remove #(= :count (first %))) count)) (defn n-gram-frequency-map "Map of n-gram to frequency of frequencies." [trie n] (into {} (map #(vector % (n-gram-frequencies trie %)) (range 1 (inc n))))) (comment (n-gram-frequencies bigram 1) (n-gram-frequency-map bigram 2) ) (defn number-of-n-grams [trie n] (->> trie (#(flat-at-depth % (dec n))) (remove #(= :count (first %))) count)) (defn number-of-possible-n-grams [dict n] (int (Math/pow (count dict) n))) (defn number-of-n-grams-that-occur-c-times [trie n c] (if (zero? c) (- (number-of-possible-n-grams trie n) (count (flat-at-depth trie (dec n)))) (let [frequencies-map (->> (n-gram-frequency-map trie n) (#(get % n)))] (get frequencies-map c 0)))) (comment (number-of-possible-n-grams bigram 2) (count (flat-at-depth bigram 1)) (count bigram) (->> (number-of-n-grams-that-occur-c-times bigram 1 1)) (->> (number-of-n-grams-that-occur-c-times bigram 0 3) (filter #(= :count (first %))) (map second) frequencies sort) ) (defn mle [trie c] (let [N (->> trie vals (map :count) (apply +))] (/ c N))) (->> bigram (filter (fn [[k v]] (= 3 (v :count))))) ;; Good-Turing Smoothing ;; ;; There are 4 steps to perform the GT smoothing, which are: ;; 1. Count the frequency of frequency Nr ;; 2. Average all the non-zero counts using Zr = Nr / 0.5 (t - q) ;; 3. Fit a linear regression model log(Zr) = a + b log(r) ;; 4. Update r with r* using Katz equation and constant k, with ;; updated Zr corresponding to specific r read out from the linear ;; regression model. (defn least-squares-linear-regression [xs ys] (let [n (count xs) sum-x (apply + xs) sum-y (apply + ys) sum-xy (apply + (map #(apply * %) (map vector xs ys))) sum-x-sqr (apply + (map #(* % %) xs)) m (/ (- (* n sum-xy) (* sum-x sum-y)) (- (* n sum-x-sqr) (* sum-x sum-x))) b (/ (- sum-y (* m sum-x)) n)] (fn [x] (+ (* m x) b)))) (defn average-consecutives "Average all the non-zero counts using the equation Zr = Nr / 0.5 (t - q)" [freqs Nrs] (let [freqs (vec freqs) Nrs (vec Nrs)] (loop [i 0 result []] (let [q (nth freqs (max (dec i) 0)) Nr (nth Nrs (min (dec (count freqs)) i)) r (nth freqs (min (dec (count freqs)) i)) t (nth freqs (min (dec (count freqs)) (inc i)))] (cond (= i (count freqs)) result (zero? i) (recur (inc i) (conj result (/ (* 2 Nr) t))) (= (dec i) (count freqs)) (recur (inc i) (conj result (/ (* 2 Nr (- t q))))) :else (recur (inc i) (conj result (/ Nr (- r q))))))))) (comment (let [xs [1 2 3 4 5 6 7 8 9 10 12 26] ys [32 20 10 3 1 2 1 1 1 2 1 1] smoothed (average-consecutives xs ys) logged (map #(Math/log %) smoothed) lm (least-squares-linear-regression xs ys) log-lm (map lm xs) log-ys (map #(Math/pow % Math/E) log-lm)] ;; => [32 20 10 3 1 2 1 1 1 2 1/2 1/14] [log-lm log-ys]) (Math/log 1) ) (defn turings-estimate [trie n r] (/ (* (inc r) (number-of-n-grams-that-occur-c-times trie n (inc r))) (number-of-n-grams-that-occur-c-times trie n r))) (defn good-turing [trie n r] (let [nr (number-of-n-grams-that-occur-c-times trie n r) nr1 (number-of-n-grams-that-occur-c-times trie n (inc r))] (println (format "cx %d nc %d ncx1 %d - %f" r nr nr1 (float (/ (* (inc r) nr1) nr)))) (/ (* (inc r) nr1) nr))) (comment (number-of-n-grams-that-occur-c-times bigram 1 1) ;; unigram counts (def unigram-counts (->> bigram vals (map :count) frequencies (into (sorted-map)))) ;; => {1 32, 2 20, 3 10, 4 3, 5 1, 6 2, 7 1, 8 1, 9 1, 10 2, 12 1, 26 1} ;; revised good-turing counts (->> unigram-counts (map (fn [[freq freq']] [freq (good-turing bigram 1 freq)])) (into (sorted-map))) ;; => {1 5/4, 2 3/2, 3 6/5, 4 5/3, 5 12, 6 7/2, 7 8, 8 9, 9 20, 10 0, 12 0, 26 0} (map (fn [[r nr]] (good-turing bigram 1 r)) unigram-counts) ;; => (5/4 3/2 6/5 5/3 12 7/2 8 9 20 0 0 0) (turings-estimate bigram 1 7) ) (defn revise-frequencies [frequencies N] (let [m (reverse (sort (keys frequencies)))] (loop [revised {} m m] (cond (empty? m) revised :else (recur (assoc revised (first m) (good-turing (get frequencies (first m) 0) (get frequencies (second m) 0) N)) (rest m)))))) (comment (get (n-gram-frequency-map trie 3) 1) ;; => {4 1, 2 2, 3 1, 1 2} (revise-frequencies (get (n-gram-frequency-map trie 3) 1) (apply + (map :count (vals trie)))) ;; => {4 2/13, 3 4/13, 2 3/13, 1 0} (def n-gram-freq-map (n-gram-frequency-map trie 3)) (def unigram-frequencies (n-gram-freq-map 1)) unigram-frequencies ) (defn number-of-n-grams-that-occur-with-count [trie n c] ) (defn good-turing-discount [trie c] ) (->> bigram (map second)) (count (into #{} (tokenize (slurp "dev/examples/sandman.txt")))) (->> bigram (map second) (map #(dissoc % :count)) (map keys) flatten (into #{}) (clojure.set/difference (into #{} (keys bigram)))) (partition 3 1 (repeat :end) (range 6)) (let [documents (->> "dark-corpus" io/file file-seq (remove #(.isDirectory %)) (take 10))] documents)