(ns examples.tpt (:require [clojure.string :as string] [clojure.java.io :as io] [taoensso.tufte :as tufte :refer (defnp p profiled profile)] [com.owoga.trie :as trie] [com.owoga.tightly-packed-trie :as tpt] [com.owoga.trie.math :as math] [com.owoga.tightly-packed-trie.encoding :as encoding] [com.owoga.prhyme.util :as util] [com.owoga.prhyme.data.dictionary :as dict] [clojure.zip :as zip] [cljol.dig9 :as d])) (tufte/add-basic-println-handler! {}) (defn dark-corpus-file-seq [start end] (let [documents (->> "dark-corpus" io/file file-seq (remove #(.isDirectory %)) (drop start) (take end))] documents)) (defn remove-sentences-with-words-not-in-dictionary "This gets rid of a lot of good words. All contractions for example... I'll, They'll... possessives like morning's... Might not end up using it." [dictionary] (let [dictionary (into #{} dictionary)] (fn [sentences] (->> sentences (map #(string/split % #" ")) (remove #(some (complement dictionary) %)) (remove #(some string/blank? %)) (map #(string/join " " %)))))) (def re-word "Regex for tokenizing a string into words (including contractions and hyphenations), commas, periods, and newlines." #"(?s).*?([a-zA-Z\d]+(?:['\-]?[a-zA-Z]+)?|,|\.|\n)") (defn pad-tokens "Pads the beginning with n - 1 tokens and the end with 1 token." [tokens n] (concat (repeat (max 1 (dec n)) "") tokens [""])) (defn tokenize-line [line] (->> line (string/trim) (re-seq re-word) (map second) (map string/lower-case))) (defn process-files-for-trie "Expects an entire song, lines seperated by \n." [files] (->> files (map slurp) (filter dict/english?) (map util/clean-text) (map #(string/split % #"\n+")) (map (remove-sentences-with-words-not-in-dictionary dict/popular)) (remove empty?) (remove #(some empty? %)) (map (fn [lines] (map tokenize-line lines))) (map (fn [lines] (map #(pad-tokens % 1) lines))) (map (fn [lines] (map #(partition 2 1 %) lines))))) (defn text->ngrams "Takes text from a file, including newlines. Pads lines with and for start/end of line. Pads beginning with n - 1 s" [text n] (let [words-not-in-dict-filter (remove-sentences-with-words-not-in-dictionary dict/popular)] (->> text util/clean-text (#(string/split % #"\n+")) (remove empty?) (map tokenize-line) (map #(pad-tokens % n)) (map #(partition n 1 %)) (apply concat)))) (defn n-to-m-grams "Exclusive of m, similar to range." [n m text] (loop [i n r '()] (cond (= i m) (apply concat r) :else (recur (inc i) (cons (text->ngrams text i) r))))) (defn prep-ngram-for-trie "The tpt/trie expects values conjed into an ngram to be of format '(k1 k2 k3 value)." [ngram] (clojure.lang.MapEntry. (vec ngram) ngram)) (defn create-trie-from-texts [texts] (->> texts (map #(n-to-m-grams 1 5 %)) (apply concat) (map prep-ngram-for-trie) (reduce (fn [[trie i db] [k v]] (let [[db i] (reduce (fn [[db i] k] (let [k' (get db k i) i' (if (= i k') (inc i) i) db' (-> db (assoc k' k) (assoc k k'))] [db' i'])) [db i] k)] (let [k' (map #(get db %) k)] (if-let [existing (get trie k')] (let [[val count] existing trie (assoc trie k' [val (inc count)])] [trie i db]) [(assoc trie k' [i 1]) i db])))) [(trie/make-trie) 1 {}]))) (defn seq-of-nodes->sorted-by-count "Sorted first by the rank of the ngram, lowest ranks first. Sorted second by the frequency of the ngram, highest frequencies first. This is the order that you'd populate a mapping of keys to IDs." [trie] (->> trie trie/children (map #(get % [])) (sort-by :count) reverse)) (time (let [texts (->> (dark-corpus-file-seq 0 250000) (map slurp)) [trie _ db] (create-trie-from-texts texts)] (def trie trie) (def trie-database db))) (defn encode-fn [v] (let [[value count] (if (seqable? v) v [nil nil])] (if (nil? value) (encoding/encode 0) (byte-array (concat (encoding/encode value) (encoding/encode count)))))) (defn decode-fn [byte-buffer] (let [value (encoding/decode byte-buffer)] (if (zero? value) [nil nil] [value (encoding/decode byte-buffer)]))) (time (def tightly-packed-trie (tpt/tightly-packed-trie trie encode-fn decode-fn))) (take 20 tightly-packed-trie) (defn key-get-in-tpt [tpt db ks] (let [id (map #(get-in db [(list %) :id]) ks) v (get tpt id)] {id v})) (defn id-get-in-tpt [tpt db ids] (let [ks (apply concat (map #(get db %) ids)) v (get tpt ids) id (get-in db [ks :id])] {ks (assoc v :value (get db id))})) (time (count (map #(get % []) (trie/children (trie/lookup tightly-packed-trie [1]))))) (time (count (trie/children (trie/lookup tightly-packed-trie [1])))) (comment (profile {} (def example-story (loop [generated-text [(get trie-database "")] i 0] (if (> i 10) generated-text (let [children (loop [i 4] (let [node (p :lookup (trie/lookup tightly-packed-trie (vec (take-last i generated-text)))) children (p :seq-children (doall (seq (and node (trie/children node)))))] (cond (nil? node) (recur (dec i)) (< i 0) (throw (Exception. "Error")) children children :else (recur (dec i)))))] (recur (conj generated-text (->> children (map #(get % [])) (remove nil?) (#(p :weighted-selection (math/weighted-selection :count %))) :value (get trie-database))) (inc i))))))) (->> example-story (map #(get-in trie-database [% :value])) (concat) (string/join " ") (#(string/replace % #" ([\.,\?])" "$1")) ((fn [txt] (string/replace txt #"(^|\. |\? )([a-z])" (fn [[a b c]] (str b (.toUpperCase c))))))) (key-get-in-tpt tightly-packed-trie trie-database '("" "" "the")) ;; => {(2 2 3) {:value 3263, :count 462}} (id-get-in-tpt tightly-packed-trie trie-database '(2 2 3)) ;; => {("" "" "the") {:value ("" "" "the"), :count 462}} ) (comment (let [texts (->> (dark-corpus-file-seq 500 2) (map slurp)) trie (create-trie-from-texts texts)] (tpt/as-map (transform-trie->ids trie))) (let [texts (->> (dark-corpus-file-seq 500 2) (map slurp)) trie (create-trie-from-texts texts) tightly-packed-trie (tpt/tightly-packed-trie (transform-trie->ids trie))] (get tightly-packed-trie '(2 2 3))) (let [texts (->> (dark-corpus-file-seq 500 2) (map slurp)) trie (create-trie-from-texts texts)] (tpt/as-map trie)) (let [text (slurp (first (dark-corpus-file-seq 500 1)))] (->> text util/clean-text (#(string/split % #"\n+")))) )