You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

475 lines
14 KiB
Clojure

(ns markov-language-model
(:require [clojure.string :as string]
[clojure.java.io :as io]
[com.owoga.trie.math :as math]
[com.owoga.tightly-packed-trie :as tpt]
[com.owoga.tightly-packed-trie.encoding :as encoding]
[com.owoga.trie :as tr]
[cljol.dig9 :as d]
[clojure.zip :as zip]
[com.owoga.tightly-packed-trie.bit-manip :as bm]))
(def corpus (slurp (io/resource "cask_of_amontillado.txt")))
(defn prep-punctuation-for-tokenization
"Puts spaces around punctuation so that they aren't
tokenized with the words they are attached to.
Might add extraneous whitespace, but presumedly that will be ignored/removed
during tokenization."
[text]
(string/replace text #"([\.,!?])" " $1 "))
;; For better generation of text, you'll probably want to pad the starts
;; of sentences with n-1 "start-of-sentence" tokens.
(defn add-bol-and-eol-tokens [text]
(-> text
(string/replace #"(\.)" "</s> . <s>")
(#(str "<s> " %))))
(defn remove-quotes
[text]
(string/replace text #"\"" ""))
(defn remove-formatting-characters
"Input has underscores, presumably because the text
might be rendered by something that can italicize or bold text.
We'll just ignore them for now."
[text]
(string/replace text #"[_*]" ""))
(defn tokenize [text]
(-> text
remove-formatting-characters
prep-punctuation-for-tokenization
remove-quotes
add-bol-and-eol-tokens
string/lower-case
(string/split #"[\n ]+")))
(defn interleave-all
"Like interleave, but instead of ending the interleave when the shortest collection
has been consumed, continues to interleave the remaining collections."
{:added "1.0"
:static true}
([] ())
([c1] (lazy-seq c1))
([c1 c2]
(lazy-seq
(let [s1 (seq c1) s2 (seq c2)]
(if (and s1 s2)
(cons (first s1) (cons (first s2)
(interleave-all (rest s1) (rest s2))))
(lazy-seq (or s1 s2))))))
([c1 c2 & colls]
(lazy-seq
(let [ss (->> (map seq (conj colls c2 c1))
(remove nil?))]
(when ss
(concat (map first ss) (apply interleave-all (map rest ss))))))))
(comment
(let [tokens [1 2 3 4 5]
p1 (partition 1 1 tokens)
p2 (partition 2 1 tokens)
p3 (partition 3 1 tokens)]
[p1
p2
p3
(interleave-all p1 p2 p3)])
;; => [((1) (2) (3) (4) (5))
;; ((1 2) (2 3) (3 4) (4 5))
;; ((1 2 3) (2 3 4) (3 4 5))
;; ((1) (1 2) (1 2 3) (2) (2 3) (2 3 4) (3) (3 4) (3 4 5) (4) (4 5) (5))]
)
(defn ngramify-tokens [n m tokens]
(let [partition-colls (map #(partition % 1 tokens) (range n m))
ngrams (apply interleave-all partition-colls)]
ngrams))
(comment
(->> (tokenize corpus)
(take 5)
(ngramify-tokens 1 4))
;; => (("the")
;; ("the" "thousand")
;; ("the" "thousand" "injuries")
;; ("thousand")
;; ("thousand" "injuries")
;; ("thousand" "injuries" "of")
;; ("injuries")
;; ("injuries" "of")
;; ("injuries" "of" "fortunato")
;; ("of")
;; ("of" "fortunato")
;; ("fortunato"))
)
(defn add-terminal-value-to-ngram
"The Trie expects entries to be of the form '(k1 k2 k3 value).
The ngrams generated above are just '(k1 k2 k3).
This adds a value that is simply the ngram itself:
'(k1 k2 k3 '(k1 k2 k3))."
[ngram]
(concat ngram (list ngram)))
(comment
(let [ngrams (->> corpus
tokenize
(take 200)
(ngramify-tokens 1 4)
(map add-terminal-value-to-ngram))]))
(defn trie->database
"It's convenient to work with a trie that has keys and values as
human-readable strings, as pulled straight from a corpus in the case
of a markov trie. But to tightly pack the trie into a byte array,
we need every value to be an integer that we can variable-length-encode.
This creates a database for conveniently converting the human-readable
entries to ids and back from ids to human-readable entries.
Ids will start at 1 so that 0 can be reserved for the root node."
[trie]
(let [sorted-keys (->> (seq trie)
(sort-by (fn [[k v]]
(:count v)))
(reverse))]
(loop [sorted-keys sorted-keys
database {}
i 1]
(if (empty? sorted-keys)
database
(recur
(rest sorted-keys)
(-> database
(assoc (first (first sorted-keys))
(assoc (second (first sorted-keys)) :id i))
(assoc i (first (first sorted-keys))))
(inc i))))))
(comment
(take 10 (trie->database trie))
(let [ngrams (->> corpus
tokenize
(take 200)
(ngramify-tokens 1 4)
(map add-terminal-value-to-ngram))
trie (apply make-trie ngrams)]
(trie->database trie))
;; {("at") {:count 1, :id 39},
;; 453 ("revenge" "." "You"),
;; ("The") {:count 1, :id 37},
;; ("resolved" ",") {:count 1, :id 256},
;; 487 ("very" "definitiveness" "with"),
;; ("be" "respected") {:count 1, :id 170},
;; ("a" "point") {:count 1, :id 158},
;; 357 ("and" "he" "did"),
;; 275 ("the" "very"),
;; ("doubt" "my" "good") {:count 1, :id 381},
;; ,,,}
)
(defn transform-trie->ids
"Once we have a database to convert from string-keys to integer-ids and back,
we can traverse the trie replacing the string-keys with their integer-ids."
[trie database]
(->> trie
(map
(fn [[k v]]
[(vec (map #(get (get database [%]) :id) k))
{:id (get-in database [k :id])
:count (get-in database [k :count])}]))
(into (tr/make-trie))))
(def trie
(let [ngrams (->> corpus
tokenize
(ngramify-tokens 1 4)
(map add-terminal-value-to-ngram)
(map (fn [entry]
(list (butlast entry)
(last entry)))))]
(->> ngrams
(reduce
(fn [acc [k v]]
(update
acc
k
(fnil
(fn [existing]
(update existing :count inc))
{:value v
:count 0})))
(tr/make-trie)))))
(comment
(take 10 (drop 1000 trie))
;; => ([["be" "awaiting"] {:value ("be" "awaiting"), :count 1}]
;; [["be" "cautious" "as"] {:value ("be" "cautious" "as"), :count 1}]
;; [["be" "gone"] {:value ("be" "gone"), :count 2}]
;; [["be" "ill" ","] {:value ("be" "ill" ","), :count 1}])
)
(def trie-database
(trie->database trie))
(comment
(take 4 trie-database)
;; => ([0 ["."]]
;; [["to" "your" "long"] {:value ("to" "your" "long"), :count 1, :id 1119}]
;; [["an" "instant" "he"] {:value ("an" "instant" "he"), :count 1, :id 4800}]
;; [["fifth" "," "the"] {:value ("fifth" "," "the"), :count 1, :id 3919}])
)
(def tpt-ready-trie (transform-trie->ids trie trie-database))
(comment
(take 4 tpt-ready-trie)
;; => ([[0 1 27] {:id 5082, :count 1}]
;; [[0 1 104] {:id 5072, :count 1}]
;; [[0 1 112] {:id 5075, :count 1}]
;; [[0 1 146] {:id 5077, :count 1}])
)
(defn value-encode-fn [v]
(if (and (number? v) (zero? v))
(encoding/encode 0)
(byte-array
(concat (encoding/encode (:id v))
(encoding/encode (:count v))))))
(defn value-decode-fn [byte-buffer]
(let [id (encoding/decode byte-buffer)]
(if (zero? id)
{:id id}
{:id id
:count (encoding/decode byte-buffer)})))
(def tightly-packed-trie
(tpt/tightly-packed-trie tpt-ready-trie value-encode-fn value-decode-fn))
;;;; DEMO
;;;; ** Out of date since new TrieAgain code
(comment
;;;; Size comparisons
;;
;; Original trie, keys and values are lists and strings.
;; ~1,900 kb
(d/sum [trie])
;; 61112 objects
;; 103249 references between them
;; 1901656 bytes total in all objects
;; no cycles
;; 8421 leaf objects (no references to other objects)
;; Original trie, keys and values numbers
;; ~900 kb
(d/sum [tpt-ready-trie])
;; 30008 objects
;; 62543 references between them
;; 907992 bytes total in all objects
;; no cycles
;; 5438 leaf objects (no references to other objects)
;; Tightly-packed-trie, keys and values numbers (backed by var-len encoded ints)
;; ~36 kb
(d/sum [tightly-packed-trie])
;; 6 objects
;; 5 references between them
;; 36736 bytes total in all objects
;; no cycles
;; 4 leaf objects (no references to other objects)
;;;; Size comparison summary
;;
;; Condensed original: 900 kb
;; Tightly packed: 36 kb
;; Compression: ~96% !!!
;;;; Getting value from each type of trie
;;
(get trie ["<s>" "i" "was"])
;; => {:value ("<s>" "i" "was"), :count 1}
(get tpt-ready-trie [0 8 21])
;; => {:id 5116, :count 1}
(get tightly-packed-trie [0 8 21])
;; => {:id 5116, :count 1}
;; And then to get back to a string version, use the database.
(->> [0 8 21]
(get tightly-packed-trie)
:id
(get trie-database)
(get trie-database))
;; => {:value ("<s>" "i" "was"), :count 1, :id 5116}
;;;; Our "database" (just a hash-map) serves a dual purpose.
;;
;; It maps n-grams to their frequency counts and to an integer ID.
;; It also maps that integer ID back to the n-gram.
(take 10 trie-database)
;; => ([("Then" "I") {:count 1, :id 1475}]
;; [("to" "your" "long") {:count 1, :id 5371}]
;; [("an" "instant" "he") {:count 1, :id 3842}]
;; [("fifth" "," "the") {:count 1, :id 4209}]
;; [("from" "the" "depth") {:count 1, :id 4270}]
;; [2721 ("the" "more")]
;; [("during" "which" ",") {:count 1, :id 4144}]
;; [("nodded") {:count 1, :id 674}]
;; [("the" "feeble") {:count 1, :id 2693}]
;; [("intermingling" "," "into") {:count 1, :id 4488}])
;;;; Database
;; Each n-gram has its own integer ID. The integer IDs should be handed
;; out to n-grams in order of frequency. Therefore, you're 1-grams will probably
;; have lower IDs than the higher-order n-grams.
;;
;; Here we see "," is the 2nd most-common n-gram.
(get-in trie-database ['(",") :id]) ;; => 7
(get-in trie-database ['("i") :id]) ;; => 8
;; The ID of a 2-gram is not related in any way to
;; the two 1-grams that make it up. Every n-gram is unique
;; and gets its own unique ID.
;;
;; BUT... Every node is referenced by a 1-gram key.
;; So the 2-gram '("," "I") is referenced from
;; the :root key's children by the 1-gram key '(",")
;; and then by that 1-gram key's children by the 1-gram '("I").
;; The VALUE of that node though is the 2-gram '("," "I").
;;
;; To re-iterate: The keys are all 1-grams at every nesting level.
;; The values are the higher-order n-grams the lower in the nesting
;; that you go.
(get-in trie-database ['("," "i") :id]) ;; => 23
;;;; Trie vs Tightly Packed Trie
;;;;
;; The interface is *almost* the same between the two.
;; Tightly packed tries can't be updated or written to.
;; They can only be read.
;; And to get from integer IDs to human-readable strings,
;; you need to go through the database.
;;
;; Other than that though, let's see it in action!
;;
;;;; Here is the Trie
(get trie '("i"))
;; => {:value ("i"), :count 107}
(->> (tr/lookup trie '("i"))
(take 5))
;; => ([["," "i"] {:value ("i" "," "i"), :count 1}]
;; [[","] {:value ("i" ","), :count 1}]
;; [["again" "paused"] {:value ("i" "again" "paused"), :count 1}]
;; [["again"] {:value ("i" "again"), :count 1}]
;; [["am" "on"] {:value ("i" "am" "on"), :count 1}])
(->> (tr/lookup trie '("i"))
(tr/children)
(map #(get % []))
(take 5))
;; => ({:value ("i" ","), :count 1}
;; {:value ("i" "again"), :count 1}
;; {:value ("i" "am"), :count 1}
;; {:value ("i" "answered"), :count 1}
;; {:value ("i" "began"), :count 2})
;;;; And here is the tightly-packed trie
(->> (tr/lookup tightly-packed-trie '(8))
(tr/children)
(map #(get % []))
(take 5))
;; => ({:id 3392, :count 1}
;; {:id 3270, :count 1}
;; {:id 129, :count 5}
;; {:id 70, :count 9}
;; {:id 69, :count 9})
(->> (tr/lookup tightly-packed-trie '(8))
(tr/children)
(map #(get % []))
(take 5)
(map #(get trie-database (:id %))))
;; => (["i" ","] ["i" "to"] ["i" "was"] ["i" "had"] ["i" "said"])
;;;; Ta-da!
)
(defn key-get-in-tpt [tpt db ks]
(let [id (map #(get-in db [(list %) :id]) ks)
v (get tpt id)]
{id v}))
(defn id-get-in-tpt [tpt db ids]
(let [ks (apply concat (map #(get db %) ids))
v (get tpt ids)
id (get-in db [ks :id])]
{ks (assoc v :value (get db id))}))
(comment
(key-get-in-tpt
tightly-packed-trie
trie-database
'("i" "will"))
;; => {(8 49) {:id 3257, :count 1}}
(id-get-in-tpt
tightly-packed-trie
trie-database
'(8 49))
;; => {("i" "will") {:id 3257, :count 1, :value ["i" "will"]}}
)
;;;; Markov-generating text from trie
(comment
(def example-story
(loop [generated-text [(:id (get trie-database ["<s>"]))]
i 0]
(if (> i 100)
generated-text
(recur
(conj
generated-text
(tpt/.key
(math/weighted-selection
#(:count (get % []))
(loop [i 3
children
(tr/children
(tr/lookup
tightly-packed-trie
(vec (take-last i generated-text))))]
(if (empty? children)
(recur (dec i)
(tr/children
(tr/lookup
tightly-packed-trie
(vec (take-last i generated-text)))))
children)))))
(inc i)))))
(->> example-story
(map #(get trie-database %))
(apply concat)
(remove #{"<s>" "</s>"})
(string/join " ")
(#(string/replace % #" ([\.,\?])" "$1"))
((fn [txt]
(string/replace txt #"(^|\. |\? )([a-z])" (fn [[a b c]]
(str b (.toUpperCase c)))))))
;; => "I broke and reached him a flagon of de grave. We came at length. He
;; again took my arm, and holding the flambeaux over the wall; i replied, were
;; a great and numerous family. Whither? to your long life. Putting on a
;; tight-fitting parti-striped dress, and descending again, and had given them
;; explicit orders not to be found, and this time i made bold to seize
;; fortunato by an arm above the elbow. In its destined position."
)