(ns markov-language-model (:require [clojure.string :as string] [clojure.java.io :as io] [com.owoga.tightly-packed-trie.core :as tpt] [com.owoga.tightly-packed-trie.encoding :as encoding] [cljol.dig9 :as d] [clojure.zip :as zip] [com.owoga.tightly-packed-trie.bit-manip :as bm])) (def corpus (slurp (io/resource "cask_of_amontillado.txt"))) ;; For better generation of text, you'll probably want to pad the starts ;; of sentences with n-1 "start-of-sentence" tokens. (defn prep-punctuation-for-tokenization "Puts spaces around punctuation so that they aren't tokenized with the words they are attached to. Might add extraneous whitespace, but presumedly that will be ignored/removed during tokenization." [text] (string/replace text #"([\.,!?])" " $1 ")) (defn remove-formatting-characters "Input has underscores, presumably because the text might be rendered by something that can italicize or bold text. We'll just ignore them for now." [text] (string/replace text #"[_*]" "")) (defn tokenize [text] (-> text remove-formatting-characters prep-punctuation-for-tokenization (string/split #"[\n ]+"))) (defn interleave-all "Like interleave, but instead of ending the interleave when the shortest collection has been consumed, continues to interleave the remaining collections." {:added "1.0" :static true} ([] ()) ([c1] (lazy-seq c1)) ([c1 c2] (lazy-seq (let [s1 (seq c1) s2 (seq c2)] (if (and s1 s2) (cons (first s1) (cons (first s2) (interleave-all (rest s1) (rest s2)))) (lazy-seq (or s1 s2)))))) ([c1 c2 & colls] (lazy-seq (let [ss (->> (map seq (conj colls c2 c1)) (remove nil?))] (when ss (concat (map first ss) (apply interleave-all (map rest ss)))))))) (comment (let [tokens [1 2 3 4 5] p1 (partition 1 1 tokens) p2 (partition 2 1 tokens) p3 (partition 3 1 tokens)] (interleave-all p1 p2 p3))) (defn ngramify-tokens [n m tokens] (let [partition-colls (map #(partition % 1 tokens) (range n m)) ngrams (apply interleave-all partition-colls)] ngrams)) (comment (->> (tokenize corpus) (take 5) (ngramify-tokens 1 4)) ;; => (("The") ;; ("The" "thousand") ;; ("The" "thousand" "injuries") ;; ("thousand") ;; ("thousand" "injuries") ;; ("thousand" "injuries" "of") ;; ("injuries") ;; ("injuries" "of") ;; ("injuries" "of" "Fortunato") ;; ("of") ;; ("of" "Fortunato") ;; ("Fortunato")) ) (defn make-trie ([] (tpt/->Trie (fn update-fn [prev cur] (if (nil? prev) (sorted-map :value (last cur) :count 1) (-> prev (update :count (fnil inc 0)) (assoc :value (last cur))))) (sorted-map))) ([& ks] (reduce (fn [t k] (conj t k)) (make-trie) ks))) (defn add-terminal-value-to-ngram "The Trie expects entries to be of the form '(k1 k2 k3 value). The ngrams generated above are just '(k1 k2 k3). This adds a value that is simply the ngram itself: '(k1 k2 k3 '(k1 k2 k3))." [ngram] (concat ngram (list ngram))) (comment (let [ngrams (->> corpus tokenize (take 200) (ngramify-tokens 1 4) (map add-terminal-value-to-ngram)) trie (apply make-trie ngrams)] (tpt/as-map trie)) ;; {:root ;; {:children ;; {"," ;; {:children ;; {"I" ;; {:children {"vowed" {:count 1, :value ("," "I" "vowed")}}, ;; :count 1, ;; :value ("," "I")}, ;; "and" ;; {:children {"he" {:count 1, :value ("," "and" "he")}}, ;; :count 1, ;; :value ("," "and")}, ;; ,,,}}}}}} ) ;; TODO: Move this to ITrie? (defn trie->seq-of-nodes "Returns a seq of every terminal node. Useful for things like doing aggregation calculations." [trie] (->> trie tpt/as-vec zip/vector-zip (iterate zip/next) (take-while (complement zip/end?)) (map zip/node) (filter map?))) ;; The tightly packed trie uses an encoding where integers are encoded with ;; variable lengths. To maximize memory efficiency, the most commonly used values ;; should have the smallest integer IDs. That way the values that most commonly appear ;; are encoded with the fewest bytes. (defn seq-of-nodes->sorted-by-count "Sorted first by the rank of the ngram, lowest ranks first. Sorted second by the frequency of the ngram, highest frequencies first. This is the order that you'd populate a mapping of keys to IDs." [nodes] (->> nodes (map (comp first seq)) (map (fn [[k v]] (vector (:value v) (:count v)))) ;; root node and padded starts (remove (comp nil? second)) (sort-by #(vector (count (first %)) (- (second %)))))) (comment (let [ngrams (->> corpus tokenize (take 200) (ngramify-tokens 1 4) (map add-terminal-value-to-ngram)) trie (apply make-trie ngrams)] (->> trie trie->seq-of-nodes seq-of-nodes->sorted-by-count (take 10))) ;; => ([(",") 11] ;; [(".") 9] ;; [("I") 8] ;; [("the") 6] ;; [("to") 6] ;; [("was") 5] ;; [("a") 4] ;; [("my") 4] ;; [("of") 4] ;; [("as") 3]) ) (defn trie->database "It's convenient to work with a trie that has keys and values as human-readable strings, as pulled straight from a corpus in the case of a markov trie. But to tightly pack the trie into a byte array, we need every value to be an integer that we can variable-length-encode. This creates a database for conveniently converting the human-readable entries to ids and back from ids to human-readable entries." [trie] (let [sorted-keys (->> (trie->seq-of-nodes trie) seq-of-nodes->sorted-by-count)] (loop [sorted-keys sorted-keys database {} i 1] (if (empty? sorted-keys) database (recur (rest sorted-keys) (-> database (assoc (first (first sorted-keys)) {:count (second (first sorted-keys)) :id i}) (assoc i (first (first sorted-keys)))) (inc i)))))) (comment (let [ngrams (->> corpus tokenize (take 200) (ngramify-tokens 1 4) (map add-terminal-value-to-ngram)) trie (apply make-trie ngrams)] (trie->database trie)) ;; {("at") {:count 1, :id 39}, ;; 453 ("revenge" "." "You"), ;; ("The") {:count 1, :id 37}, ;; ("resolved" ",") {:count 1, :id 256}, ;; 487 ("very" "definitiveness" "with"), ;; ("be" "respected") {:count 1, :id 170}, ;; ("a" "point") {:count 1, :id 158}, ;; 357 ("and" "he" "did"), ;; 275 ("the" "very"), ;; ("doubt" "my" "good") {:count 1, :id 381}, ;; ,,,} ) (seq {"and" {:count 1 :value '("foo")}});; => (["and" {:count 1, :value ("foo")}]) (defn transform-trie->ids "Once we have a database to convert from string-keys to integer-ids and back, we can traverse the trie using its `transform` zipper and `zip/edit` each node replacing the string-keys with their integer-ids." [trie database] (let [transform-p #(map? (zip/node %)) transform-f (fn tf [loc] (zip/edit loc (fn [node] ;; {"And {:count 1, :value (! " "And)}} ;; <- Node (let [[k v] (first (seq node))] {(get-in database [(list k) :id] (if (= k :root) :root)) (assoc v :value (get-in database [(:value v) :id] 0))}))))] (tpt/transform trie (tpt/visitor-filter transform-p transform-f)))) (def trie (let [ngrams (->> corpus tokenize (ngramify-tokens 1 4) (map add-terminal-value-to-ngram))] (apply make-trie ngrams))) (def trie-database (trie->database trie)) (def tpt-ready-trie (transform-trie->ids trie trie-database)) (def tightly-packed-trie (tpt/tightly-packed-trie tpt-ready-trie)) ;;;; DEMO ;; (comment ;;;; Our "database" (just a hash-map) serves a dual purpose. ;; ;; It maps n-grams to their frequency counts and to an integer ID. ;; It also maps that integer ID back to the n-gram. (take 10 trie-database) ;; => ([("Then" "I") {:count 1, :id 1475}] ;; [("to" "your" "long") {:count 1, :id 5371}] ;; [("an" "instant" "he") {:count 1, :id 3842}] ;; [("fifth" "," "the") {:count 1, :id 4209}] ;; [("from" "the" "depth") {:count 1, :id 4270}] ;; [2721 ("the" "more")] ;; [("during" "which" ",") {:count 1, :id 4144}] ;; [("nodded") {:count 1, :id 674}] ;; [("the" "feeble") {:count 1, :id 2693}] ;; [("intermingling" "," "into") {:count 1, :id 4488}]) ;;;; We can `get` the value of an n-gram from a Trie. ;; The value returned will be a Trie that has as its root node the ;; value at the n-gram. This gives you access to all of the descendants. ;; ;; Having access to the descendants is useful for something like ;; auto-complete. You can get in the trie the input to the completion, the prefix. ;; Then you can get the completions by simple seq-ing over the child nodes. (tpt/as-map (get trie '("," "I"))) ;; => {"I" ;; {:count 10, ;; :value ("," "I"), ;; :children ;; {"am" {:count 1, :value ("," "I" "am")}, ;; "began" {:count 2, :value ("," "I" "began")}, ;; ,,, ;; "well" {:count 1, :value ("," "I" "well")}}}} ;;;; Database ;; Each n-gram has its own integer ID. The integer IDs should be handed ;; out to n-grams in order of frequency. Therefore, you're 1-grams will probably ;; have lower IDs than the higher-order n-grams. ;; ;; Here we see "," is the 2nd most-common n-gram. (get-in trie-database ['(",") :id]) ;; => 2 (get-in trie-database ['("I") :id]) ;; => 4 ;; The ID of a 2-gram is not related in any way to ;; the two 1-grams that make it up. Every n-gram is unique ;; and gets its own unique ID. ;; ;; BUT... Every node is referenced by a 1-gram key. ;; So the 2-gram '("," "I") is referenced from ;; the :root key's children by the 1-gram key '(",") ;; and then by that 1-gram key's children by the 1-gram '("I"). ;; The VALUE of that node though is the 2-gram '("," "I"). ;; ;; To re-iterate: The keys are all 1-grams at every nesting level. ;; The values are the higher-order n-grams the lower in the nesting ;; that you go. (get-in trie-database ['("," "I") :id]) ;; => 911 ;;;; Map-based Trie vs Tightly Packed Trie ;;;; ;; The interface is *almost* the same between the two. ;; Tightly packed tries can't be updated or written to. ;; They can only be read. ;; And to get from integer IDs to human-readable strings, ;; you need to go through the database. ;; ;; Other than that though, let's see it in action! ;; ;;;; Here is the map-based trie. (->> (tpt/as-map (get trie '("," "I"))) (#(get-in % ["I" :children])) (map seq) (map first)) ;; => ("am" "began" "ceased" "had" "resumed" "soon" "suffered" "vowed" "well") ;;;; And here is the tightly-packed trie (->> (get tightly-packed-trie '(2 4)) tpt/children (map tpt/value) (map :value) (map #(get trie-database %)) (map last) sort) ;; => ("am" "began" "ceased" "had" "resumed" "soon" "suffered" "vowed" "well") ;;;; Ta-da! ;;;; Let's check the size difference ;; Memory footprint comparison ;; 2.2mb -> 37kb. ;; 1.7% of its original Clojure map size!!! (->> trie (.trie) vector d/sum) ;; 65485 objects ;; 109687 references between them ;; 2179088 bytes total in all objects ;; no cycles ;; 8413 leaf objects (no references to other objects) ;; 1 root nodes (no reference to them from other objects _in this graph_) (->> tightly-packed-trie (.byte-buffer) vector d/sum) ;; 2 objects ;; 1 references between them ;; 37680 bytes total in all objects ;; no cycles ;; 1 leaf objects (no references to other objects) ;; 1 root nodes (no reference to them from other objects _in this graph_) (let [trie-at-2 (get tightly-packed-trie '(2)) address (.address trie-at-2) byte-buffer (.byte-buffer trie-at-2) limit (.limit trie-at-2)] (tpt/wrap-byte-buffer byte-buffer (.position byte-buffer address) (.limit byte-buffer limit) (println "Address of node at 2" address) (encoding/decode byte-buffer) (encoding/decode byte-buffer) (println "Size of index at 2" (encoding/decode byte-buffer)) (println "position of first key in index" (.position byte-buffer)) (encoding/decode-number-from-tightly-packed-trie-index byte-buffer) (encoding/decode-number-from-tightly-packed-trie-index byte-buffer) (.position byte-buffer (- address 1037)) ;; Position of '("," "the") (encoding/decode byte-buffer) (encoding/decode byte-buffer) (encoding/decode byte-buffer) ;; 11 size of index ;; 1462 position of buffer ;; max-address of index = 1473 (or 1472?) (.position byte-buffer) (.position byte-buffer 2618) ;; First mid of broken binary search (tpt/rewind-to-key byte-buffer 2500) (println (.position byte-buffer)) (println (bm/to-binary-string (.get byte-buffer (.position byte-buffer)))) (println (bm/to-binary-string (.get byte-buffer (dec (.position byte-buffer))))) )) (let [trie-at-2 (get tightly-packed-trie '(2)) address (.address trie-at-2) byte-buffer (.byte-buffer trie-at-2) limit (.limit trie-at-2)] (tpt/wrap-byte-buffer byte-buffer (.position byte-buffer address) (.limit byte-buffer limit) (encoding/decode byte-buffer) (encoding/decode byte-buffer) (encoding/decode byte-buffer) (encoding/decode-number-from-tightly-packed-trie-index byte-buffer) (encoding/decode-number-from-tightly-packed-trie-index byte-buffer) (.position byte-buffer (- address 1037)) (encoding/decode byte-buffer) (encoding/decode byte-buffer) (encoding/decode byte-buffer))) ;; I's offset, 986. ;; See below. ID at offset 986 is 2! Same as at above offset! ;; And the count is 4? The count coincidentally is the ID we expect? (let [trie-at-2 (get tightly-packed-trie '(2)) address 986 byte-buffer (.byte-buffer trie-at-2) limit (.limit trie-at-2)] (tpt/wrap-byte-buffer byte-buffer (.position byte-buffer address) (.limit byte-buffer limit) (encoding/decode byte-buffer))) (get trie-database 4) (let [byte-buffer (.byte-buffer tightly-packed-trie)] (.position byte-buffer))) ;; Memory footprint comparison ;; 2.2mb -> 32kb. ;; 1.5% of its original Clojure map size! (comment (->> trie (.trie) vector d/sum) ;; 65485 objects ;; 109687 references between them ;; 2179088 bytes total in all objects ;; no cycles ;; 8413 leaf objects (no references to other objects) ;; 1 root nodes (no reference to them from other objects _in this graph_) (->> tightly-packed-trie (.byte-buffer) vector d/sum) ;; 2 objects ;; 1 references between them ;; 32896 bytes total in all objects ;; no cycles ;; 1 leaf objects (no references to other objects) ;; 1 root nodes (no reference to them from other objects _in this graph_) ) (defn key-get-in-tpt [tpt db ks] (let [id (map #(get-in db [(list %) :id]) ks) v (get tpt id)] {id v})) (defn id-get-in-tpt [tpt db ids] (let [ks (apply concat (map #(get db %) ids)) v (get tpt ids) id (get-in db [ks :id])] {ks (assoc v :value (get db id))})) (comment (key-get-in-tpt tightly-packed-trie trie-database '("another")) ;; => {(2 2 3) {:value 3263, :count 462}} (id-get-in-tpt tightly-packed-trie trie-database '(2 2 3)) ;; => {("" "" "the") {:value ("" "" "the"), :count 462}} ) (comment ;; database (let [texts (->> (dark-corpus-file-seq 500 2) (map slurp)) trie (create-trie-from-texts texts)] (->> (trie->database trie) (#(get % 3)))) (let [texts (->> (dark-corpus-file-seq 500 2) (map slurp)) trie (create-trie-from-texts texts)] (tpt/as-map (transform-trie->ids trie))) (let [texts (->> (dark-corpus-file-seq 500 2) (map slurp)) trie (create-trie-from-texts texts) tightly-packed-trie (tpt/tightly-packed-trie (transform-trie->ids trie))] (get tightly-packed-trie '(2 2 3))) (let [texts (->> (dark-corpus-file-seq 500 2) (map slurp)) trie (create-trie-from-texts texts)] (tpt/as-map trie)) (let [text (slurp (first (dark-corpus-file-seq 500 1)))] (->> text util/clean-text (#(string/split % #"\n+")))) )