(ns markov-language-model (:require [clojure.string :as string] [clojure.java.io :as io] [com.owoga.trie.math :as math] [com.owoga.tightly-packed-trie :as tpt] [com.owoga.tightly-packed-trie.encoding :as encoding] [com.owoga.trie :as tr] [cljol.dig9 :as d] [clojure.zip :as zip] [com.owoga.tightly-packed-trie.bit-manip :as bm])) (def corpus (slurp (io/resource "cask_of_amontillado.txt"))) (defn prep-punctuation-for-tokenization "Puts spaces around punctuation so that they aren't tokenized with the words they are attached to. Might add extraneous whitespace, but presumedly that will be ignored/removed during tokenization." [text] (string/replace text #"([\.,!?])" " $1 ")) ;; For better generation of text, you'll probably want to pad the starts ;; of sentences with n-1 "start-of-sentence" tokens. (defn add-bol-and-eol-tokens [text] (-> text (string/replace #"(\.)" " . ") (#(str " " %)))) (defn remove-quotes [text] (string/replace text #"\"" "")) (defn remove-formatting-characters "Input has underscores, presumably because the text might be rendered by something that can italicize or bold text. We'll just ignore them for now." [text] (string/replace text #"[_*]" "")) (defn tokenize [text] (-> text remove-formatting-characters prep-punctuation-for-tokenization remove-quotes add-bol-and-eol-tokens string/lower-case (string/split #"[\n ]+"))) (defn interleave-all "Like interleave, but instead of ending the interleave when the shortest collection has been consumed, continues to interleave the remaining collections." {:added "1.0" :static true} ([] ()) ([c1] (lazy-seq c1)) ([c1 c2] (lazy-seq (let [s1 (seq c1) s2 (seq c2)] (if (and s1 s2) (cons (first s1) (cons (first s2) (interleave-all (rest s1) (rest s2)))) (lazy-seq (or s1 s2)))))) ([c1 c2 & colls] (lazy-seq (let [ss (->> (map seq (conj colls c2 c1)) (remove nil?))] (when ss (concat (map first ss) (apply interleave-all (map rest ss)))))))) (comment (let [tokens [1 2 3 4 5] p1 (partition 1 1 tokens) p2 (partition 2 1 tokens) p3 (partition 3 1 tokens)] [p1 p2 p3 (interleave-all p1 p2 p3)]) ;; => [((1) (2) (3) (4) (5)) ;; ((1 2) (2 3) (3 4) (4 5)) ;; ((1 2 3) (2 3 4) (3 4 5)) ;; ((1) (1 2) (1 2 3) (2) (2 3) (2 3 4) (3) (3 4) (3 4 5) (4) (4 5) (5))] ) (defn ngramify-tokens [n m tokens] (let [partition-colls (map #(partition % 1 tokens) (range n m)) ngrams (apply interleave-all partition-colls)] ngrams)) (comment (->> (tokenize corpus) (take 5) (ngramify-tokens 1 4)) ;; => (("the") ;; ("the" "thousand") ;; ("the" "thousand" "injuries") ;; ("thousand") ;; ("thousand" "injuries") ;; ("thousand" "injuries" "of") ;; ("injuries") ;; ("injuries" "of") ;; ("injuries" "of" "fortunato") ;; ("of") ;; ("of" "fortunato") ;; ("fortunato")) ) (defn add-terminal-value-to-ngram "The Trie expects entries to be of the form '(k1 k2 k3 value). The ngrams generated above are just '(k1 k2 k3). This adds a value that is simply the ngram itself: '(k1 k2 k3 '(k1 k2 k3))." [ngram] (concat ngram (list ngram))) (comment (let [ngrams (->> corpus tokenize (take 200) (ngramify-tokens 1 4) (map add-terminal-value-to-ngram))])) (defn trie->database "It's convenient to work with a trie that has keys and values as human-readable strings, as pulled straight from a corpus in the case of a markov trie. But to tightly pack the trie into a byte array, we need every value to be an integer that we can variable-length-encode. This creates a database for conveniently converting the human-readable entries to ids and back from ids to human-readable entries. Ids will start at 1 so that 0 can be reserved for the root node." [trie] (let [sorted-keys (->> (seq trie) (sort-by (fn [[k v]] (:count v))) (reverse))] (loop [sorted-keys sorted-keys database {} i 1] (if (empty? sorted-keys) database (recur (rest sorted-keys) (-> database (assoc (first (first sorted-keys)) (assoc (second (first sorted-keys)) :id i)) (assoc i (first (first sorted-keys)))) (inc i)))))) (comment (take 10 (trie->database trie)) (let [ngrams (->> corpus tokenize (take 200) (ngramify-tokens 1 4) (map add-terminal-value-to-ngram)) trie (apply make-trie ngrams)] (trie->database trie)) ;; {("at") {:count 1, :id 39}, ;; 453 ("revenge" "." "You"), ;; ("The") {:count 1, :id 37}, ;; ("resolved" ",") {:count 1, :id 256}, ;; 487 ("very" "definitiveness" "with"), ;; ("be" "respected") {:count 1, :id 170}, ;; ("a" "point") {:count 1, :id 158}, ;; 357 ("and" "he" "did"), ;; 275 ("the" "very"), ;; ("doubt" "my" "good") {:count 1, :id 381}, ;; ,,,} ) (defn transform-trie->ids "Once we have a database to convert from string-keys to integer-ids and back, we can traverse the trie replacing the string-keys with their integer-ids." [trie database] (->> trie (map (fn [[k v]] [(vec (map #(get (get database [%]) :id) k)) {:id (get-in database [k :id]) :count (get-in database [k :count])}])) (into (tr/make-trie)))) (def trie (let [ngrams (->> corpus tokenize (ngramify-tokens 1 4) (map add-terminal-value-to-ngram) (map (fn [entry] (list (butlast entry) (last entry)))))] (->> ngrams (reduce (fn [acc [k v]] (update acc k (fnil (fn [existing] (update existing :count inc)) {:value v :count 0}))) (tr/make-trie))))) (comment (take 10 (drop 1000 trie)) ;; => ([["be" "awaiting"] {:value ("be" "awaiting"), :count 1}] ;; [["be" "cautious" "as"] {:value ("be" "cautious" "as"), :count 1}] ;; [["be" "gone"] {:value ("be" "gone"), :count 2}] ;; [["be" "ill" ","] {:value ("be" "ill" ","), :count 1}]) ) (def trie-database (trie->database trie)) (comment (take 4 trie-database) ;; => ([0 ["."]] ;; [["to" "your" "long"] {:value ("to" "your" "long"), :count 1, :id 1119}] ;; [["an" "instant" "he"] {:value ("an" "instant" "he"), :count 1, :id 4800}] ;; [["fifth" "," "the"] {:value ("fifth" "," "the"), :count 1, :id 3919}]) ) (def tpt-ready-trie (transform-trie->ids trie trie-database)) (comment (take 4 tpt-ready-trie) ;; => ([[0 1 27] {:id 5082, :count 1}] ;; [[0 1 104] {:id 5072, :count 1}] ;; [[0 1 112] {:id 5075, :count 1}] ;; [[0 1 146] {:id 5077, :count 1}]) ) (defn value-encode-fn [v] (if (and (number? v) (zero? v)) (encoding/encode 0) (byte-array (concat (encoding/encode (:id v)) (encoding/encode (:count v)))))) (defn value-decode-fn [byte-buffer] (let [id (encoding/decode byte-buffer)] (if (zero? id) {:id id} {:id id :count (encoding/decode byte-buffer)}))) (def tightly-packed-trie (tpt/tightly-packed-trie tpt-ready-trie value-encode-fn value-decode-fn)) ;;;; DEMO ;;;; ** Out of date since new TrieAgain code (comment ;;;; Size comparisons ;; ;; Original trie, keys and values are lists and strings. ;; ~1,900 kb (d/sum [trie]) ;; 61112 objects ;; 103249 references between them ;; 1901656 bytes total in all objects ;; no cycles ;; 8421 leaf objects (no references to other objects) ;; Original trie, keys and values numbers ;; ~900 kb (d/sum [tpt-ready-trie]) ;; 30008 objects ;; 62543 references between them ;; 907992 bytes total in all objects ;; no cycles ;; 5438 leaf objects (no references to other objects) ;; Tightly-packed-trie, keys and values numbers (backed by var-len encoded ints) ;; ~36 kb (d/sum [tightly-packed-trie]) ;; 6 objects ;; 5 references between them ;; 36736 bytes total in all objects ;; no cycles ;; 4 leaf objects (no references to other objects) ;;;; Size comparison summary ;; ;; Condensed original: 900 kb ;; Tightly packed: 36 kb ;; Compression: ~96% !!! ;;;; Getting value from each type of trie ;; (get trie ["" "i" "was"]) ;; => {:value ("" "i" "was"), :count 1} (get tpt-ready-trie [0 8 21]) ;; => {:id 5116, :count 1} (get tightly-packed-trie [0 8 21]) ;; => {:id 5116, :count 1} ;; And then to get back to a string version, use the database. (->> [0 8 21] (get tightly-packed-trie) :id (get trie-database) (get trie-database)) ;; => {:value ("" "i" "was"), :count 1, :id 5116} ;;;; Our "database" (just a hash-map) serves a dual purpose. ;; ;; It maps n-grams to their frequency counts and to an integer ID. ;; It also maps that integer ID back to the n-gram. (take 10 trie-database) ;; => ([("Then" "I") {:count 1, :id 1475}] ;; [("to" "your" "long") {:count 1, :id 5371}] ;; [("an" "instant" "he") {:count 1, :id 3842}] ;; [("fifth" "," "the") {:count 1, :id 4209}] ;; [("from" "the" "depth") {:count 1, :id 4270}] ;; [2721 ("the" "more")] ;; [("during" "which" ",") {:count 1, :id 4144}] ;; [("nodded") {:count 1, :id 674}] ;; [("the" "feeble") {:count 1, :id 2693}] ;; [("intermingling" "," "into") {:count 1, :id 4488}]) ;;;; Database ;; Each n-gram has its own integer ID. The integer IDs should be handed ;; out to n-grams in order of frequency. Therefore, you're 1-grams will probably ;; have lower IDs than the higher-order n-grams. ;; ;; Here we see "," is the 2nd most-common n-gram. (get-in trie-database ['(",") :id]) ;; => 7 (get-in trie-database ['("i") :id]) ;; => 8 ;; The ID of a 2-gram is not related in any way to ;; the two 1-grams that make it up. Every n-gram is unique ;; and gets its own unique ID. ;; ;; BUT... Every node is referenced by a 1-gram key. ;; So the 2-gram '("," "I") is referenced from ;; the :root key's children by the 1-gram key '(",") ;; and then by that 1-gram key's children by the 1-gram '("I"). ;; The VALUE of that node though is the 2-gram '("," "I"). ;; ;; To re-iterate: The keys are all 1-grams at every nesting level. ;; The values are the higher-order n-grams the lower in the nesting ;; that you go. (get-in trie-database ['("," "i") :id]) ;; => 23 ;;;; Trie vs Tightly Packed Trie ;;;; ;; The interface is *almost* the same between the two. ;; Tightly packed tries can't be updated or written to. ;; They can only be read. ;; And to get from integer IDs to human-readable strings, ;; you need to go through the database. ;; ;; Other than that though, let's see it in action! ;; ;;;; Here is the Trie (get trie '("i")) ;; => {:value ("i"), :count 107} (->> (tr/lookup trie '("i")) (take 5)) ;; => ([["," "i"] {:value ("i" "," "i"), :count 1}] ;; [[","] {:value ("i" ","), :count 1}] ;; [["again" "paused"] {:value ("i" "again" "paused"), :count 1}] ;; [["again"] {:value ("i" "again"), :count 1}] ;; [["am" "on"] {:value ("i" "am" "on"), :count 1}]) (->> (tr/lookup trie '("i")) (tr/children) (map #(get % [])) (take 5)) ;; => ({:value ("i" ","), :count 1} ;; {:value ("i" "again"), :count 1} ;; {:value ("i" "am"), :count 1} ;; {:value ("i" "answered"), :count 1} ;; {:value ("i" "began"), :count 2}) ;;;; And here is the tightly-packed trie (->> (tr/lookup tightly-packed-trie '(8)) (tr/children) (map #(get % [])) (take 5)) ;; => ({:id 3392, :count 1} ;; {:id 3270, :count 1} ;; {:id 129, :count 5} ;; {:id 70, :count 9} ;; {:id 69, :count 9}) (->> (tr/lookup tightly-packed-trie '(8)) (tr/children) (map #(get % [])) (take 5) (map #(get trie-database (:id %)))) ;; => (["i" ","] ["i" "to"] ["i" "was"] ["i" "had"] ["i" "said"]) ;;;; Ta-da! ) (defn key-get-in-tpt [tpt db ks] (let [id (map #(get-in db [(list %) :id]) ks) v (get tpt id)] {id v})) (defn id-get-in-tpt [tpt db ids] (let [ks (apply concat (map #(get db %) ids)) v (get tpt ids) id (get-in db [ks :id])] {ks (assoc v :value (get db id))})) (comment (key-get-in-tpt tightly-packed-trie trie-database '("i" "will")) ;; => {(8 49) {:id 3257, :count 1}} (id-get-in-tpt tightly-packed-trie trie-database '(8 49)) ;; => {("i" "will") {:id 3257, :count 1, :value ["i" "will"]}} ) ;;;; Markov-generating text from trie (comment (def example-story (loop [generated-text [(:id (get trie-database [""]))] i 0] (if (> i 100) generated-text (recur (conj generated-text (tpt/.key (math/weighted-selection #(:count (get % [])) (loop [i 3 children (tr/children (tr/lookup tightly-packed-trie (vec (take-last i generated-text))))] (if (empty? children) (recur (dec i) (tr/children (tr/lookup tightly-packed-trie (vec (take-last i generated-text))))) children))))) (inc i))))) (->> example-story (map #(get trie-database %)) (apply concat) (remove #{"" ""}) (string/join " ") (#(string/replace % #" ([\.,\?])" "$1")) ((fn [txt] (string/replace txt #"(^|\. |\? )([a-z])" (fn [[a b c]] (str b (.toUpperCase c))))))) ;; => "I broke and reached him a flagon of de grave. We came at length. He ;; again took my arm, and holding the flambeaux over the wall; i replied, were ;; a great and numerous family. Whither? to your long life. Putting on a ;; tight-fitting parti-striped dress, and descending again, and had given them ;; explicit orders not to be found, and this time i made bold to seize ;; fortunato by an arm above the elbow. In its destined position." )