You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1183 lines
38 KiB
Clojure

(ns com.owoga.corpus.markov
(:require [com.owoga.prhyme.core :as prhyme]
[com.owoga.prhyme.util :as util]
[com.owoga.prhyme.data-transform :as data-transform]
[com.owoga.prhyme.util.math :as math]
[com.owoga.trie :as trie]
[com.owoga.tightly-packed-trie :as tpt]
[com.owoga.tightly-packed-trie.encoding :as encoding]
[clojure.set :as set]
[clojure.string :as string]
[clojure.java.io :as io]
[com.owoga.phonetics :as phonetics]
[com.owoga.phonetics.syllabify :as syllabify]
[taoensso.nippy :as nippy]
[clojure.math.combinatorics :as combinatorics]
[com.owoga.prhyme.nlp.core :as nlp]))
(defrecord RhymeSet [phones words])
; Since we're dealing with phonetics, a word consists of the spelling as well as all possible phonetic pronunciations.
(defrecord UnpronouncedWord [word pronunciations])
(defrecord PronouncedWord [word pronunciation])
(defn all-pronunciations
[words]
(let [pronunciations (apply combinatorics/cartesian-product (map :pronunciations words))]
(map
(fn [pronunciation]
(map ->PronouncedWord (map :word words) pronunciation))
pronunciations)))
(let [input-words ["bog" "hog"]
words (map (fn [word] (->UnpronouncedWord word (phonetics/get-phones word))) input-words)
pronunciations (all-pronunciations words)]
pronunciations)
(defn clean-text [text]
(string/lower-case (string/replace text #"[^a-zA-Z'\-\s]" "")))
(defn xf-file-seq [start end]
(comp (remove #(.isDirectory %))
(drop start)
(take end)))
;;;; Efficient Tries with Database
;; To make a more memory-efficient trie, and
;; to more easily support the conversion of a trie
;; to a tightly packed trie, convert all keys and values
;; to integers.
;;
;; Also, create a database to map integer IDs back to
;; their string values and string values to integer IDs.
;;;; The difference between a forwards and a backwards
;; markov is that the backwards markov has its tokens
;; reversed and has the </s> tokens padded by a number
;; equal to the markov rank (rather than the <s> padded).
(defn file-seq->markov-trie
"For forwards markov."
[database files n m]
(transduce
(comp
(map slurp)
(map #(string/split % #"[\n+\?\.]"))
(map (partial transduce data-transform/xf-tokenize conj))
(map (partial transduce data-transform/xf-filter-english conj))
(map (partial remove empty?))
(map (partial into [] (data-transform/xf-pad-tokens (dec m) "<s>" 1 "</s>")))
(map (partial mapcat (partial data-transform/n-to-m-partitions n (inc m))))
(mapcat (partial mapv (data-transform/make-database-processor database))))
(completing
(fn [trie lookup]
(update trie lookup (fnil #(update % 1 inc) [lookup 0]))))
(trie/make-trie)
files))
(comment
(let [files (->> "dark-corpus"
io/file
file-seq
(eduction (xf-file-seq 501 2)))
database (atom {:next-id 1})
trie (file-seq->markov-trie database files 1 3)]
[(take 5 trie)
(map (comp (partial map @database) first) (take 20 (drop 105 trie)))
(take 10 @database)])
)
(defn file-seq->backwards-markov-trie
"For backwards markov."
[database files n m]
(transduce
(comp
(map slurp)
(map #(string/split % #"[\n+\?\.]"))
(map (partial transduce data-transform/xf-tokenize conj))
(map (partial transduce data-transform/xf-filter-english conj))
(map (partial remove empty?))
(map (partial map (comp vec reverse)))
;; xf-pad-tokens needs vectors to properly pad due to `into`
(map (partial into [] (data-transform/xf-pad-tokens (dec m) "</s>" 1 "<s>")))
(map (partial mapcat (partial data-transform/n-to-m-partitions n (inc m))))
(mapcat (partial mapv (data-transform/make-database-processor database))))
(completing
(fn [trie lookup]
(update trie lookup (fnil #(update % 1 inc) [(peek lookup) 0]))))
(trie/make-trie)
files))
(comment
(let [files (->> "dark-corpus"
io/file
file-seq
(eduction (xf-file-seq 501 2)))
database (atom {:next-id 1})
trie (file-seq->backwards-markov-trie database files 1 3)]
[(take 5 trie)
(->> (trie/children-at-depth trie 0 1)
(map
(fn [[k v]]
[(map @database k) v]))
(sort-by (comp - second second))
(take 5))])
;; => [([(1 1 2) [[1 1 2] 55]]
;; [(1 1) [[1 1] 55]]
;; [(1 2 3) [[1 2 3] 1]]
;; [(1 2 7) [[1 2 7] 1]]
;; [(1 2 12) [[1 2 12] 1]])
;; ([("</s>") [[1] 110]]
;; [("<s>") [[2] 55]]
;; [(",") [[19] 14]]
;; [("you") [[63] 11]]
;; [("to") [[15] 7]])]
)
;;;; Packing the trie into a small memory footprint
(defn encode-fn [v]
(let [[value count] (if (seqable? v) v [nil nil])]
(if (nil? value)
(encoding/encode 0)
(byte-array
(concat (encoding/encode value)
(encoding/encode count))))))
(defn decode-fn [db]
(fn [byte-buffer]
(let [value (encoding/decode byte-buffer)]
(if (zero? value)
nil
[value (encoding/decode byte-buffer)]))))
(defn save-tightly-packed-trie
[trie database filepath]
(let [tightly-packed-trie
(tpt/tightly-packed-trie
trie
encode-fn
(decode-fn @database))]
(tpt/save-tightly-packed-trie-to-file
filepath
tightly-packed-trie)))
(defn load-tightly-packed-trie
[filepath database]
(tpt/load-tightly-packed-trie-from-file
filepath
(decode-fn @database)))
;;;; Training
(defn train-backwards
"For building lines backwards so they can be seeded with a target rhyme."
[files n m trie-filepath database-filepath tightly-packed-trie-filepath]
(let [database (atom {:next-id 1})
trie (file-seq->backwards-markov-trie database files n m)]
(nippy/freeze-to-file trie-filepath (seq trie))
(println "Froze" trie-filepath)
(nippy/freeze-to-file database-filepath @database)
(println "Froze" database-filepath)
(save-tightly-packed-trie trie database tightly-packed-trie-filepath)
(let [loaded-trie (->> trie-filepath
nippy/thaw-from-file
(into (trie/make-trie)))
loaded-db (->> database-filepath
nippy/thaw-from-file)
loaded-tightly-packed-trie (tpt/load-tightly-packed-trie-from-file
tightly-packed-trie-filepath
(decode-fn loaded-db))]
(println "Loaded trie:" (take 5 loaded-trie))
(println "Loaded database:" (take 5 loaded-db))
(println "Loaded tightly-packed-trie:" (take 5 loaded-tightly-packed-trie))
(println "Successfully loaded trie and database."))))
(comment
(time
(let [files (->> "dark-corpus"
io/file
file-seq
(eduction (xf-file-seq 0 250000)))
[trie database] (train-backwards
files
1
5
"/home/eihli/.models/markov-trie-4-gram-backwards.bin"
"/home/eihli/.models/markov-database-4-gram-backwards.bin"
"/home/eihli/.models/markov-tightly-packed-trie-4-gram-backwards.bin")]))
(time
(def markov-trie (into (trie/make-trie) (nippy/thaw-from-file "/home/eihli/.models/markov-trie-4-gram-backwards.bin"))))
(time
(def database (nippy/thaw-from-file "/home/eihli/.models/markov-database-4-gram-backwards.bin")))
(time
(def markov-tight-trie
(tpt/load-tightly-packed-trie-from-file
"/home/eihli/.models/markov-tightly-packed-trie-4-gram-backwards.bin"
(decode-fn database))))
(take 20 markov-tight-trie)
)
(defn gen-rhyme-model
[rhyme-type-fn database database-filepath]
(let [words (filter string? (keys @database))
rhyme-trie (prhyme/words->rhyme-trie rhyme-type-fn words)]
(nippy/freeze-to-file database-filepath (seq rhyme-trie))
(let [loaded-trie (->> (nippy/thaw-from-file database-filepath)
(into (trie/make-trie)))]
(println "Successfully loaded rhyme model")
(println (take 5 loaded-trie)))))
(comment
(time
(let [database (atom (nippy/thaw-from-file "/home/eihli/.models/markov-database-4-gram-backwards.bin"))]
(gen-rhyme-model
prhyme/phrase->all-flex-rhyme-tailing-consonants-phones
database
"/home/eihli/.models/rhyme-trie-primary-stressed-vowels-and-trailing-consonants.bin")
(gen-rhyme-model
prhyme/phrase->unstressed-vowels-and-tailing-consonants
database
"/home/eihli/.models/rhyme-trie-unstressed-vowels-and-trailing-consonants.bin")))
(def rhyme-trie
(into
(trie/make-trie)
(nippy/thaw-from-file
"/home/eihli/.models/rhyme-trie-unstressed-vowels-and-trailing-consonants.bin")))
)
(defn choice->n-gram
[{:keys [database]} choice]
(map database (first choice)))
(defn weighted-selection-from-choices
[choices]
(math/weighted-selection
(comp second second)
choices))
(ns-unmap (find-ns 'com.owoga.corpus.markov) 'rhyme-choices)
(defmulti rhyme-choices
"Returns a list of words that end with the same phones
as the target. If the target is a string, converts the string to phones."
(fn [trie target] (class target)))
(defmethod rhyme-choices String
[trie phrase]
(let [phones (phonetics/get-phones phrase)]
(->> phones
(map reverse)
(mapcat (partial rhyme-choices trie))
(remove empty?))))
(defmethod rhyme-choices :default
[trie phones]
(->> (trie/lookup trie phones)
(remove (comp nil? second))
(map #(update % 0 into (reverse phones)))
(map #(update % 0 vec))))
(comment
(->> (prhyme/phrase->all-flex-rhyme-tailing-consonants-phones "bog")
(map first)
(map reverse)
(mapcat (partial rhyme-choices rhyme-trie)))
(let [rhyme-trie (trie/make-trie ["G" "AA1" "B"] "bog" ["G" "AO1" "B"] "bog"
["T" "AA1" "H"] "hot" ["G" "AO1" "F"] "fog")]
[(rhyme-choices rhyme-trie ["G" "AO1"])
(rhyme-choices rhyme-trie "fog")
(rhyme-choices rhyme-trie "bog")])
;; => [([("G" "AO1" "B") "bog"] [("G" "AO1" "F") "fog"])
;; ([("G" "AO1" "F") "fog"])
;; ([("G" "AA1" "B") "bog"] [("G" "AO1" "B") "bog"])]
(->> (rhyme-choices-walking-target-rhyme
rhyme-trie
["N" "AH1" "F"]
identity)
rand-nth
((fn [[phones words]]
[[phones] (rand-nth (vec words))])))
)
(defn rhyme-choices-walking-target-rhyme-with-stop
"All target rhymes need to be in phone form.
`target-rhyme`: [N UH1 F]
If we try to turn string form into phone form,
we'd sometimes be forced to deal with multiple pronunciations.
By only handling phone form here, the caller can handle multiple pronunciations.
Makes for a cleaner API.
`words-fn` gets passed the result of `rhyme-choices` which has this structures
([(G AO1 B) bog] [(G AO1 F) fog])
`stop?` gets passed the remaining target-rhyme phones and the current choices.
"
([trie stop? target-rhyme]
(rhyme-choices-walking-target-rhyme-with-stop
trie
stop?
target-rhyme
identity))
([trie stop? target-rhyme words-fn]
(loop [target-rhyme target-rhyme
result []]
(let [choices (words-fn (rhyme-choices trie target-rhyme))]
(if (stop? target-rhyme choices)
(into result choices)
(recur (butlast target-rhyme)
(into result choices)))))))
(defn rhyme-choices-walking-target-rhyme
"All target rhymes need to be in phone form.
`target-rhyme`: [N UH1 F]
If we try to turn string form into phone form,
we'd sometimes be forced to deal with multiple pronunciations.
By only handling phone form here, the caller can handle multiple pronunciations.
Makes for a cleaner API.
`words-fn` gets passed the result of `rhyme-choices` which has this structures
([(G AO1 B) bog] [(G AO1 F) fog])
"
([trie target-rhyme]
(rhyme-choices-walking-target-rhyme
trie
target-rhyme
identity))
([trie target-rhyme words-fn]
(loop [target-rhyme target-rhyme
result []]
(let [choices (words-fn (rhyme-choices trie target-rhyme))]
(if (or (empty? target-rhyme)
(and (not-empty choices)
(prhyme/last-primary-stress? (reverse target-rhyme))))
(into result choices)
(recur (butlast target-rhyme)
(into result choices)))))))
(comment
(let [words ["bloodclot" "woodrot" "moonshot" "dot" "bog" "pat" "pot" "lot"]
phones (mapcat prhyme/phrase->all-flex-rhyme-tailing-consonants-phones words)
rhyme-trie (reduce
(fn [trie [phones word]]
(update trie phones (fnil conj #{}) [phones word]))
(trie/make-trie)
(map #(update % 0 reverse) phones))]
(rhyme-choices-walking-target-rhyme
rhyme-trie
(reverse (first (first (prhyme/phrase->all-flex-rhyme-tailing-consonants-phones "tight knot"))))))
;; => [[("T" "AA1" "AH1") #{[("T" "AA1" "AH1") "bloodclot"]}]
;; [("T" "AA1" "UH1") #{[("T" "AA1" "UH1") "woodrot"]}]
;; [("T" "AA1" "UW1") #{[("T" "AA1" "UW1") "moonshot"]}]
;; [("T" "AA1")
;; #{[("T" "AA1") "dot"] [("T" "AA1") "pot"] [("T" "AA1") "lot"]}]]
(rhyme-choices-walking-target-rhyme
rhyme-trie
(reverse '("UH1" "AA1" "R" "T")))
)
(defn get-next-markov
"Weighted selection from markov model with backoff.
Expects markov key/values to be [k1 k2 k3] [<value> freq]."
([markov-trie seed]
(get-next-markov markov-trie seed identity))
([markov-trie seed process-children-fn]
(let [seed (take-last 3 seed)
node (trie/lookup markov-trie seed)
children (and node
(->> node
trie/children
process-children-fn
(map (fn [^com.owoga.trie.ITrie child]
; Get key and frequency of each child
[(.key child)
(get child [])]))
(remove (comp nil? second))))]
(cond
; If we've never seen this n-gram, fallback to n-1-gram
(nil? node) (recur markov-trie (rest seed) process-children-fn)
(seq children)
(if (< (rand) (/ (apply max (map (comp second second) children))
(apply + (map (comp second second) children))))
(recur markov-trie (rest seed) process-children-fn)
(first (math/weighted-selection (comp second second) children)))
(> (count seed) 0)
(recur markov-trie (rest seed) process-children-fn)
; If we have a node but no children, or if we don't have a seed,
; we don't know how to handle that situation.
:else (throw (Exception. "Error"))))))
(defn normalized-frequencies
[coll]
(let [freqs (frequencies coll)
total (apply + (vals freqs))]
(reduce
(fn [freqs [k v]]
(assoc freqs k (float (/ v total))))
{}
freqs)))
(comment
(let [markov-trie (trie/make-trie ["see" "dog" "run"] [["see" "dog" "run"] 1]
["see" "cat" "eat"] [["see" "cat" "eat"] 1]
["see" "dog"] [["see" "dog"] 1]
["dog" "eat"] [["dog" "eat"] 1]
["see" "cat"] [["see" "cat"] 1]
["cat" "eat"] [["cat" "eat"] 1]
["see"] [["see"] 2]
["dog"] [["dog"] 1]
["run"] [["run"] 1]
["cat"] [["cat"] 1]
["eat"] [["eat"] 1])
seed ["see"]
node (trie/lookup markov-trie seed)]
[(normalized-frequencies
(repeatedly 1000 #(get-next-markov markov-trie ["see"])))
(normalized-frequencies
(repeatedly 1000 #(get-next-markov markov-trie ["see dog"])))])
;; => [{"cat" 0.336, "dog" 0.308, "eat" 0.088, "see" 0.178, "run" 0.09}
;; {"cat" 0.141, "dog" 0.176, "see" 0.32, "eat" 0.187, "run" 0.176}]
)
(defn generate-n-syllable-sentence-rhyming-with
[markov-trie
rhyme-trie
target-rhyme
n-gram-rank
target-rhyme-syllable-count
target-sentence-syllable-count]
(let [rhyme (->> (rhyme-choices-walking-target-rhyme rhyme-trie target-rhyme)
rand-nth
((fn [[phones words]]
[[phones] (rand-nth (vec words))])))]
(loop [phrase [rhyme]]
(if (or (= prhyme/BOS (second (peek phrase)))
(<= target-sentence-syllable-count
(prhyme/count-syllables-of-phrase
(string/join " " (map second phrase)))))
phrase
(recur
(conj
phrase
(let [word (get-next-markov
markov-trie
(into (mapv second phrase)
(vec (repeat (dec n-gram-rank) prhyme/EOS))))]
[(phonetics/get-phones word) word])))))))
(comment
(let [words [["see" "dog" "run"] [["see" "dog" "run"] 1]
["see" "cat" "eat"] [["see" "cat" "eat"] 1]
["dog" "has" "fun"] [["dog" "has" "fun"] 1]
["see" "dog"] [["see" "dog"] 1]
["dog" "eat"] [["dog" "eat"] 1]
["see" "cat"] [["see" "cat"] 1]
["cat" "eat"] [["cat" "eat"] 1]
["has" "fun"] [["has" "fun"] 1]
["see"] [["see"] 2]
["dog"] [["dog"] 2]
["run"] [["run"] 1]
["cat"] [["cat"] 1]
["eat"] [["eat"] 1]
["has"] [["has"] 1]
["fun"] [["fun"] 1]]
words (map
(fn [[k [v f]]]
[(reverse k) [(reverse v) f]])
(partition 2 words))
markov-trie (into (trie/make-trie) words)
words ["see" "dog" "run" "cat" "eat" "has" "fun"]
rhyme-trie (prhyme/words->rhyme-trie
prhyme/phrase->all-flex-rhyme-tailing-consonants-phones
words)
target-rhyme ["N" "AH1" "F"]]
(sort-by
(comp - second)
(normalized-frequencies
(repeatedly
10
#(map
second
(generate-n-syllable-sentence-rhyming-with
markov-trie
rhyme-trie
target-rhyme
3
1
3))))))
;; => ([("fun" "see" "see") 0.027]
;; [("fun" "dog" "see") 0.026]
;; [("fun" "see" "dog") 0.026]
;; ,,,
;; [("fun" "run" "has") 0.001])
)
(defn tightly-generate-n-syllable-sentence
"It's difficult to mix a tight trie with rhymes. You need
to convert ids using the database.
This is going to generate sentences backwards.
Generates the following structure:
[[[[S K AY1]] sky]
[[[DH AH0] [DH AH1] [DH IY0]] the]
[[[K R AE1 K S]] cracks]
[[[G R AW1 N D]] ground]
[[[DH AH0] [DH AH1] [DH IY0]] the]
[[[T UW1] [T IH0] [T AH0]] to]
[[[K IH1 NG D AH0 M]] kingdom]
[[[DH AH0] [DH AH1] [DH IY0]] the]
[[[D IH0 S T R OY1]] destroy]]
"
([database
markov-trie
n-gram-rank
target-sentence-syllable-count]
(tightly-generate-n-syllable-sentence
database
markov-trie
n-gram-rank
target-sentence-syllable-count
identity))
([database
markov-trie
n-gram-rank
target-sentence-syllable-count
process-markov-children]
(let [eos (database prhyme/EOS)
bos (database prhyme/BOS)]
(loop [phrase []]
(if (<= target-sentence-syllable-count
(prhyme/count-syllables-of-phrase
(string/join " " (map second phrase))))
phrase
(recur
(conj
phrase
(let [word (database
(get-next-markov
markov-trie
; Pad sentence with eos markers since we're working backwards
(into (vec (repeat (dec n-gram-rank) eos))
(mapv (comp database second) phrase))
process-markov-children))]
[(phonetics/get-phones word) word]))))))))
(comment
(tightly-generate-n-syllable-sentence
database
markov-trie
3
10
(fn [children]
(remove
(fn [child]
(let [lookup (.key child)
[word freq] (get child [])]
(#{(database prhyme/EOS) (database prhyme/BOS)} word)))
children)))
;; [[[["HH" "ER1" "T" "S"]] "hurts"]
;; [[["IH1" "T"] ["IH0" "T"]] "it"]
;; [[["AH0" "N" "D"] ["AE1" "N" "D"]] "and"]
;; [[["F" "EY1" "S"]] "face"]
;; [[["M" "AY1"]] "my"]
;; [[["AH0" "G" "EH1" "N" "S" "T"] ["AH0" "G" "EY1" "N" "S" "T"]] "against"]
;; [[["L" "AY1" "F"]] "life"]
;; [[["M" "AY1"]] "my"]
;; [[["L" "AY1" "V"] ["L" "IH1" "V"]] "live"]]
)
(defn tightly-generate-n-syllable-sentence-rhyming-with
"It's difficult to mix a tight trie with rhymes. You need
to convert ids using the database.
`rhyme-wordset-fn` will take something that looks like
([(G AO1 B) bog] [(G AO1 F) fog])
"
([database
markov-trie
rhyme-trie
target-rhyme
n-gram-rank
target-rhyme-syllable-count
target-sentence-syllable-count]
(tightly-generate-n-syllable-sentence-rhyming-with
database
markov-trie
rhyme-trie
target-rhyme
n-gram-rank
target-rhyme-syllable-count
target-sentence-syllable-count
identity
identity))
([database
markov-trie
rhyme-trie
target-rhyme
n-gram-rank
target-rhyme-syllable-count
target-sentence-syllable-count
markov-process-children
rhyme-process-words]
(let [eos (database prhyme/EOS)
bos (database prhyme/BOS)
choices (rhyme-choices-walking-target-rhyme
rhyme-trie
target-rhyme
rhyme-process-words)
[rhyming-phones rhyming-word] (update (rand-nth choices) 1 (comp rand-nth vec))
;; The rhyme only has the rhyming phones. Grab full pronunciation.
rhyming-word-phones (rand-nth (phonetics/get-phones rhyming-word))]
(loop [phrase [[rhyming-word-phones rhyming-word]]]
(if (<= target-sentence-syllable-count
(prhyme/count-syllables-of-phrase
(string/join " " (map second phrase))))
phrase
(recur
(conj
phrase
(let [word (database
(get-next-markov
markov-trie
(into (vec (repeat (dec n-gram-rank) eos))
(mapv (comp database second) phrase))
markov-process-children))]
[(rand-nth (phonetics/get-phones word)) word]))))))))
(defn make-markov-filter
"Specifically works with markovs with entries of the format:
[lookup [index freq]]
"
[words-to-remove]
(let [words-to-remove (into #{} words-to-remove)]
(fn [children]
(remove
(fn [child]
(let [[word _] (get child [])]
(words-to-remove word)))
children))))
(defn make-rhyme-filter
[words-to-remove]
(let [words-to-remove (into #{} words-to-remove)]
(fn [rhyming-words]
(->> (map (fn [[phones wordset]]
[phones (set/difference wordset words-to-remove)])
rhyming-words)
(remove (fn [[phones wordset]]
(empty? wordset)))))))
(defn tightly-generate-n-syllable-sentence-v2
"
If you want to generate a sentence targeting a rhyme, generate the rhyming tail out-of-band
and then pass it as a seed to this function.
"
([database
markov-trie
n-gram-rank
target-sentence-syllable-count
seed]
(tightly-generate-n-syllable-sentence-v2
database
markov-trie
n-gram-rank
target-sentence-syllable-count
identity
seed))
([database
markov-trie
n-gram-rank
target-sentence-syllable-count
markov-process-children
seed]
(let [[eos bos] (map database [prhyme/EOS prhyme/BOS])]
(loop [phrase seed]
(if (<= target-sentence-syllable-count
(prhyme/count-syllables-of-phrase
(string/join " " (map second phrase))))
phrase
(recur
(conj
phrase
(let [word (database
(get-next-markov
markov-trie
(into (vec (repeat (dec n-gram-rank) eos))
(mapv (comp database second) phrase))
markov-process-children))]
[(rand-nth (phonetics/get-phones word)) word]))))))))
;;;; Demo
;;;;
(comment
(let [target-rhyme ["N" "AH1" "F"]]
(->> (repeatedly
10
#(->> (tightly-generate-n-syllable-sentence-rhyming-with
database
markov-trie
rhyme-trie
target-rhyme
3
3
7
(make-markov-filter (map database ["overdone" "outdone" "rerun" "undone" "</s>" "<s>"]))
(make-rhyme-filter ["begun"]))))))
;; => ("darkness the lost souls will run"
;; "predominant thunder , scream comes undone"
;; "only day to come undone"
;; "i denounce the bad undone"
;; "me before i will outrun"
;; "convictions of the just will outrun"
;; "you mean stained are overdone"
;; "fight has begun overdone"
;; "are being skinned keep are one"
;; "demise , lift you up overdone")
(let [target-rhyme (->(prhyme/phrase->all-flex-rhyme-tailing-consonants-phones
"filling")
first
first
reverse)]
(->> (repeatedly
10
#(->> (tightly-generate-n-syllable-sentence-rhyming-with
database
markov-trie
rhyme-trie
target-rhyme
3
3
7
)
(map second)
reverse))
(map (partial remove #{prhyme/BOS}))
(map data-transform/untokenize)))
)
(defn take-until
"Repeately calls some stateful function until predicate is met."
[take? f]
(loop [result (f)]
(if (take? result)
(first (take? result))
(recur (f)))))
(comment
(let [val (atom 0)]
(letfn [(foo []
(swap! val inc)
@val)]
(take-until even? foo)))
;; => 2
)
(defn valid-or-best-sentence?
[max-iterations]
(fn []
(let [context (atom {:current-best nil :iteration 0})]
(fn [text]
(let [sentence (string/join " " (map (comp map second) text))]
(swap! context update :iteration inc)
(let [current-best (:current-best @context)
log-prob (second (nlp/most-likely-parse sentence))]
(when (or (nil? current-best)
(> log-prob (nth current-best 1)))
(swap! context assoc :current-best [text log-prob]))
(if (or (> log-prob -1)
(>= (:iteration @context) max-iterations))
(:current-best @context)
false)))))))
(def best-of-20 (valid-or-best-sentence? 20))
(comment
(take-until (best-of-20) (constantly "my name sky does eat"))
)
(comment
(take-until
(best-of-10)
#(->> (tightly-generate-n-syllable-sentence
database
markov-trie
3
7
(make-markov-filter (map database [prhyme/BOS prhyme/EOS])))
(map second)
reverse
(string/join " ")))
)
(defn sentence->phones
"Sentence is of the format
[[[[F L OW1]] flow]
[[[AH0 N D] [AE1 N D]] and]
[[[S IY1 K]] seek]
[[[F IH1 NG G ER0 Z]] fingers]
[[[Y AO1 R] [Y UH1 R]] your]
[[[TH R UW1]] through]
[[[S T R EH1 NG K TH] [S T R EH1 NG TH]]
strength]
[[[F AY1 N D]] find]
[[[K AE1 N] [K AH0 N]] can]]
Returns the concatenated list of phones so you can pluck some off and find
rhymes.
Note that each word in the sentence can have more than one pronunciation.
This function chooses one randomly.
"
[sentence]
(->> sentence
(remove (comp empty? first)) ; Commas have no phones so rand-nth breaks
(map #(update % 0 rand-nth))
(apply map vector)
((fn [[phones words]]
[(reduce into [] phones) (string/join " " words)]))
(first)))
(comment
(let [sentence '[[[[F L OW1]] flow]
[[[AH0 N D] [AE1 N D]] and]
[[[S IY1 K]] seek]
[[[F IH1 NG G ER0 Z]] fingers]
[[[Y AO1 R] [Y UH1 R]] your]
[[[TH R UW1]] through]
[[[S T R EH1 NG K TH] [S T R EH1 NG TH]]
strength]
[[[F AY1 N D]] find]
[[[K AE1 N] [K AH0 N]] can]]]
(sentence->phones sentence))
)
(defn rhyme-from-scheme
"scheme of format [[A 9] [A 9] [B 5] [B 5] [A 9]]
Will include as many syllables as possible in finding rhymes
and will choose randomly with equal chance from all possible rhymes."
[scheme database markov-trie rhyme-trie]
(loop [scheme scheme
line-phones {}
result []]
(cond
(empty? scheme) result
:else
(let [[pattern syllable-count] (first scheme)
banned-words (into #{} (->> result
(map (comp last first))))
line (if (nil? (get line-phones pattern))
; Here, we need to make a choice about which pronunciation
; we want to use to build line-phones. Choose randomly.
(take-until
(best-of-20)
#(tightly-generate-n-syllable-sentence
database
markov-trie
3
syllable-count
(make-markov-filter (map database [prhyme/BOS prhyme/EOS]))))
(take-until
(best-of-20)
#(tightly-generate-n-syllable-sentence-rhyming-with
database
markov-trie
rhyme-trie
(reverse
(take-last 4 (prhyme/phones->all-flex-rhyme-tailing-consonants-phones
(get line-phones pattern))))
3
3
syllable-count
(make-markov-filter (map database [prhyme/BOS prhyme/EOS]))
(make-rhyme-filter banned-words))))
rhyme (sentence->phones (reverse line))]
(recur (rest scheme)
(if (nil? (get line-phones pattern))
(assoc line-phones pattern rhyme)
line-phones)
(conj result line))))))
(comment
(tightly-generate-n-syllable-sentence
database
markov-trie
3
10)
(repeatedly
2
#(->> (rhyme-from-scheme
'[[A 8] [A 8] [B 5] [B 5] [A 8]]
database
markov-tight-trie
rhyme-trie)
(map reverse)
(map (partial map second))
(map data-transform/untokenize)))
(->> "overdrive"
(prhyme/phrase->unstressed-vowels-and-tailing-consonants)
(map first)
(map reverse)
(map (partial
rhyme-choices-walking-target-rhyme-with-stop
rhyme-trie
(fn [phones choices]
(every? phonetics/consonant (butlast phones))))))
(trie/lookup rhyme-trie ["V" "AY1"])
(trie/lookup markov-tight-trie nil)
(tightly-generate-n-syllable-sentence-rhyming-with
database
markov-trie
rhyme-trie
(first
(first
(prhyme/phrase->all-flex-rhyme-tailing-consonants-phones
"bother me")))
3
3
9
(constantly false)
(fn [[phones wordset]]
(set/difference wordset (into #{} []))))
)
#_(ns-unmap (find-ns 'com.owoga.corpus.markov) 'RhymeTrie)
(defprotocol IRhymeTrie
(rhymes [this phones] [this phones preprocess-rhymes]))
(deftype RhymeTrie [trie prep-phones end-walk]
IRhymeTrie
(rhymes [this phones]
(rhymes this phones identity))
(rhymes [this phones preprocess-rhymes]
(let [prepped-phones (reverse (prep-phones phones))]
(rhyme-choices-walking-target-rhyme-with-stop
trie
end-walk
prepped-phones
preprocess-rhymes))))
(comment
(def rhymetrie
(->RhymeTrie
rhyme-trie
(fn [phones]
(->> phones
prhyme/take-vowels-and-tail-consonants
prhyme/remove-all-stress))
(fn [phones choices]
(every? phonetics/consonant (butlast phones)))))
(rhymes rhymetrie ["AY" "V"])
(time
(count
(trie/children-at-depth markov-tight-trie 2 3)))
)
(defn rhyme-from-scheme-v2
"scheme of format [[A 9] [A 9] [B 5] [B 5] [A 9]]
Will include as many syllables as possible in finding rhymes
and will choose randomly with equal chance from all possible rhymes.
Result will be a map of schemes to vectors of lines.
{[A 9] [[[you [Y UW]] [are [AA R]] [so [S OH]]]
[[and [AE N D]] [we [W IY]] [go [G OH]]]]
[B 5] [[[hey [H AY]]]
[[bay [B AY]]]]}
Currently hard-coded to work with 4-gram.
"
[scheme database markov-trie rhyme-trie]
(let [[eos bos] (map database [prhyme/EOS prhyme/BOS])]
(loop [scheme scheme
result {}]
(if (empty? scheme)
result
(let [[pattern syllable-count] (first scheme)
existing-lines (result (first scheme))
banned-words
(into #{} (->> existing-lines
(map (comp last last))))
line
(take-until
(best-of-20)
(fn []
(let [seed (if existing-lines
(->> existing-lines
rand-nth
reverse
(map first)
(apply concat)
(#(rhymes
rhyme-trie
%
(fn [choices]
(->> choices
(map (fn [[phones wordset]]
[phones
(set/difference
wordset
banned-words)]))
(remove (comp empty? second))))))
rand-nth
((fn [[phones wordset]]
(let [word (rand-nth (vec wordset))]
[(rand-nth (phonetics/get-phones word))
word])))
vector)
(->> (get-next-markov
markov-trie
[eos eos eos]
(fn [children]
(remove
#(#{eos bos} (.key %)) children)))
database
(#(vector (rand-nth (phonetics/get-phones %)) %))
vector))
line (tightly-generate-n-syllable-sentence-v2
database
markov-trie
4
syllable-count
(make-markov-filter [eos bos])
seed)]
line)))]
(recur (rest scheme)
(update result (first scheme) (fnil conj []) line)))))))
(comment
(let [scheme '[[a 8] [a 8] [b 5] [b 5] [a 8]]]
(rhyme-from-scheme-v2
scheme database markov-tight-trie rhymetrie))
(phonetics/get-phones "unleashed")
(rhymes
rhymetrie
["IY" "SH" "T"]
(fn [choices]
(->> choices
(map (fn [[phones wordset]]
[phones
(set/difference
wordset
#{"unleashed"})]))
(remove (comp empty? second)))))
)
(comment
(let [existing-lines '([[["K" "AA" "AH"] "unlock"]
[["M" "EH1" "M" "ER0" "IY0" "Z"] "memories"]
[["D" "IH0" "Z" "AO1" "L" "V" "IH0" "NG"] "dissolving"]])]
(->> existing-lines
rand-nth
reverse
(map first)
(mapcat reverse)))
)
;;;; Accuracy
(defn lookup-with-backoff
[model lookup]
(loop [lookup lookup]
(let [node (trie/lookup model lookup)]
(cond
(empty? lookup) [model (count (trie/children model))]
node [(trie/lookup model (butlast lookup))
(second (get node []))]
:else (recur (butlast lookup))))))
(defn calc-N [node]
(apply + (map #(second (get % [])) (trie/children node))))
(defn trie-frequencies [node]
(->> node
trie/children
(map #(second (get % [])))
frequencies
vec
(sort-by first)
(into (sorted-map))))
(comment
(time (def N (calc-N markov-tight-trie)))
(time (trie-frequencies (trie/lookup markov-tight-trie [107])))
)
(defn mle
[model lookup]
(let [N (calc-N model)
[parent freq] (lookup-with-backoff model lookup)
[_ parent-freq] (get parent [] [nil N])]
[freq parent-freq]))
(comment
(mle markov-tight-trie [9095 452 27040])
(count (trie/children markov-tight-trie))
)
(defn perplexity
[model database rank line]
(let [tokens (into [] data-transform/xf-tokenize [line])
token-ids (map database (first tokens))
n-grams (data-transform/n-to-m-partitions rank (inc rank) token-ids)]
[(map (partial mle model) n-grams)
n-grams]))
(comment
(perplexity markov-tight-trie database 3 "hi there eric how are you")
(database "through") ;; 1924
database
(count database)
(get markov-tight-trie [315 1924])
(->>
(map #(second (get % []))
(trie/children (trie/lookup markov-tight-trie [315])))
frequencies
vec
(sort-by first)
(into (sorted-map)))
)