Rhyme by scheme

TODO: Consider stress. Multiprocess
main
Eric Ihli 3 years ago
parent 1544680f55
commit 79a35adb43

@ -11,7 +11,29 @@
[clojure.java.io :as io]
[com.owoga.phonetics :as phonetics]
[com.owoga.phonetics.syllabify :as syllabify]
[taoensso.nippy :as nippy]))
[taoensso.nippy :as nippy]
[clojure.math.combinatorics :as combinatorics]
[com.owoga.prhyme.nlp.core :as nlp]))
(defrecord RhymeSet [phones words])
; Since we're dealing with phonetics, a word consists of the spelling as well as all possible phonetic pronunciations.
(defrecord UnpronouncedWord [word pronunciations])
(defrecord PronouncedWord [word pronunciation])
(defn all-pronunciations
[words]
(let [pronunciations (apply combinatorics/cartesian-product (map :pronunciations words))]
(map
(fn [pronunciation]
(map ->PronouncedWord (map :word words) pronunciation))
pronunciations)))
(let [input-words ["bog" "hog"]
words (map (fn [word] (->UnpronouncedWord word (phonetics/get-phones word))) input-words)
pronunciations (all-pronunciations words)]
pronunciations)
(defn clean-text [text]
(string/lower-case (string/replace text #"[^a-zA-Z'\-\s]" "")))
@ -204,6 +226,7 @@
(def rhyme-trie (into (trie/make-trie) (nippy/thaw-from-file "/tmp/rhyme-trie.bin")))
)
(defn choice->n-gram
[{:keys [database]} choice]
(map database (first choice)))
@ -300,35 +323,40 @@
;; [("T" "AA1" "UW1") #{[("T" "AA1" "UW1") "moonshot"]}]
;; [("T" "AA1")
;; #{[("T" "AA1") "dot"] [("T" "AA1") "pot"] [("T" "AA1") "lot"]}]]
(rhyme-choices-walking-target-rhyme
rhyme-trie
(reverse '("UH1" "AA1" "R" "T")))
)
(defn get-next-markov
"Weighted selection from markov model with backoff.
Expects markov key/values to be [k1 k2 k3] [<value> freq]."
([markov-trie seed]
(get-next-markov markov-trie seed (constantly false)))
([markov-trie seed remove-fn]
(get-next-markov markov-trie seed identity))
([markov-trie seed process-children-fn]
(let [seed (take-last 3 seed)
node (trie/lookup markov-trie seed)
children (and node
(->> node
trie/children
process-children-fn
(map (fn [^com.owoga.trie.ITrie child]
; Get key and frequency of each child
[(.key child)
(get child [])]))
(remove (comp nil? second))
(remove remove-fn)))]
(remove (comp nil? second))))]
(cond
; If we've never seen this n-gram, fallback to n-1-gram
(nil? node) (recur markov-trie (rest seed) remove-fn)
(nil? node) (recur markov-trie (rest seed) process-children-fn)
(seq children)
(if (< (rand) (/ (apply max (map (comp second second) children))
(apply + (map (comp second second) children))))
(recur markov-trie (rest seed) remove-fn)
(recur markov-trie (rest seed) process-children-fn)
(first (math/weighted-selection (comp second second) children)))
(> (count seed) 0)
(recur markov-trie (rest seed) remove-fn)
(recur markov-trie (rest seed) process-children-fn)
; If we have a node but no children, or if we don't have a seed,
; we don't know how to handle that situation.
:else (throw (Exception. "Error"))))))
@ -466,12 +494,12 @@
markov-trie
n-gram-rank
target-sentence-syllable-count
(constantly false)))
identity))
([database
markov-trie
n-gram-rank
target-sentence-syllable-count
markov-remove-fn]
process-markov-children]
(let [eos (database prhyme/EOS)
bos (database prhyme/BOS)]
(loop [phrase []]
@ -488,10 +516,7 @@
; Pad sentence with eos markers since we're working backwards
(into (vec (repeat (dec n-gram-rank) eos))
(mapv (comp database second) phrase))
; Remove eos, bos, and forbidden words
(fn [[lookup [word frequency]]]
(or (markov-remove-fn [lookup [word frequency]])
(#{eos bos} word)))))]
process-markov-children))]
[(phonetics/get-phones word) word]))))))))
(comment
@ -499,7 +524,23 @@
database
markov-trie
3
10)
10
(fn [children]
(remove
(fn [child]
(let [lookup (.key child)
[word freq] (get child [])]
(#{(database prhyme/EOS) (database prhyme/BOS)} word)))
children)))
;; [[[["HH" "ER1" "T" "S"]] "hurts"]
;; [[["IH1" "T"] ["IH0" "T"]] "it"]
;; [[["AH0" "N" "D"] ["AE1" "N" "D"]] "and"]
;; [[["F" "EY1" "S"]] "face"]
;; [[["M" "AY1"]] "my"]
;; [[["AH0" "G" "EH1" "N" "S" "T"] ["AH0" "G" "EY1" "N" "S" "T"]] "against"]
;; [[["L" "AY1" "F"]] "life"]
;; [[["M" "AY1"]] "my"]
;; [[["L" "AY1" "V"] ["L" "IH1" "V"]] "live"]]
)
@ -525,7 +566,7 @@
n-gram-rank
target-rhyme-syllable-count
target-sentence-syllable-count
(constantly false)
identity
identity))
([database
markov-trie
@ -534,16 +575,18 @@
n-gram-rank
target-rhyme-syllable-count
target-sentence-syllable-count
markov-remove-fn
rhyme-wordset-fn]
markov-process-children
rhyme-process-words]
(let [eos (database prhyme/EOS)
bos (database prhyme/BOS)
choices (rhyme-choices-walking-target-rhyme
rhyme-trie
target-rhyme
rhyme-wordset-fn)
rhyme (update (rand-nth choices) 1 (comp rand-nth vec))]
(loop [phrase [rhyme]]
rhyme-process-words)
[rhyming-phones rhyming-word] (update (rand-nth choices) 1 (comp rand-nth vec))
;; The rhyme only has the rhyming phones. Grab full pronunciation.
rhyming-word-phones (rand-nth (phonetics/get-phones rhyming-word))]
(loop [phrase [[rhyming-word-phones rhyming-word]]]
(if (<= target-sentence-syllable-count
(prhyme/count-syllables-of-phrase
(string/join " " (map second phrase))))
@ -556,11 +599,32 @@
markov-trie
(into (vec (repeat (dec n-gram-rank) eos))
(mapv (comp database second) phrase))
(fn [[lookup [word frequency]]]
(or (markov-remove-fn [lookup [word frequency]])
(#{eos bos} word)))))]
markov-process-children))]
[(rand-nth (phonetics/get-phones word)) word]))))))))
(defn make-markov-filter
"Specifically works with markovs with entries of the format:
[lookup [index freq]]
"
[words-to-remove]
(let [words-to-remove (into #{} words-to-remove)]
(fn [children]
(remove
(fn [child]
(let [[word _] (get child [])]
(words-to-remove word)))
children))))
(defn make-rhyme-filter
[words-to-remove]
(let [words-to-remove (into #{} words-to-remove)]
(fn [rhyming-words]
(->> (map (fn [[phones wordset]]
[phones (set/difference wordset words-to-remove)])
rhyming-words)
(remove (fn [[phones wordset]]
(empty? wordset)))))))
;;;; Demo
;;;;
(comment
@ -575,27 +639,19 @@
3
3
7
(fn [[lookup [word freq]]]
(= (database "begun") word))
(fn [rhyming-words]
(->> (map (fn [[phones wordset]]
[phones (set/difference wordset #{"begun"})])
rhyming-words)
(remove (fn [[phones wordset]]
(empty? wordset))))))
(map second)
reverse))
(map (partial string/join " "))))
;; => ("funeral has just begun"
;; "dead illusion overdone"
;; "all shout boy for then , outrun"
;; "and those that turn till rerun"
;; "heading for a pack of rerun"
;; "furnace of end outrun"
;; "the deaths outrun"
;; "our funeral has begun"
;; "paper that looking dark rerun"
;; "walk overdone")
(make-markov-filter (map database ["overdone" "outdone" "rerun" "undone" "</s>" "<s>"]))
(make-rhyme-filter ["begun"]))))))
;; => ("darkness the lost souls will run"
;; "predominant thunder , scream comes undone"
;; "only day to come undone"
;; "i denounce the bad undone"
;; "me before i will outrun"
;; "convictions of the just will outrun"
;; "you mean stained are overdone"
;; "fight has begun overdone"
;; "are being skinned keep are one"
;; "demise , lift you up overdone")
(let [target-rhyme (->(prhyme/phrase->all-flex-rhyme-tailing-consonants-phones
"filling")
@ -611,13 +667,68 @@
target-rhyme
3
3
7)
7
)
(map second)
reverse))
(map (partial remove #{prhyme/BOS}))
(map data-transform/untokenize)))
)
(defn take-until
"Repeately calls some stateful function until predicate is met."
[take? f]
(loop [result (f)]
(if (take? result)
(first (take? result))
(recur (f)))))
(comment
(let [val (atom 0)]
(letfn [(foo []
(swap! val inc)
@val)]
(take-until even? foo)))
;; => 2
)
(defn valid-or-best-sentence?
[max-iterations]
(fn []
(let [context (atom {:current-best nil :iteration 0})]
(fn [text]
(let [sentence (string/join " " (map (comp map second) text))]
(swap! context update :iteration inc)
(let [current-best (:current-best @context)
log-prob (second (nlp/most-likely-parse sentence))]
(when (or (nil? current-best)
(> log-prob (nth current-best 1)))
(swap! context assoc :current-best [text log-prob]))
(if (or (> log-prob -1)
(>= (:iteration @context) max-iterations))
(:current-best @context)
false)))))))
(def best-of-10 (valid-or-best-sentence? 10))
(comment
(take-until (best-of-10) (constantly "my name sky does eat"))
)
(comment
(take-until
(best-of-10)
#(->> (tightly-generate-n-syllable-sentence
database
markov-trie
3
7
(make-markov-filter (map database [prhyme/BOS prhyme/EOS])))
(map second)
reverse
(string/join " ")))
)
(defn sentence->phones
"Sentence is of the format
@ -640,6 +751,7 @@
"
[sentence]
(->> sentence
(remove (comp empty? first)) ; Commas have no phones so rand-nth breaks
(map #(update % 0 rand-nth))
(apply map vector)
((fn [[phones words]]
@ -675,39 +787,38 @@
:else
(let [[pattern syllable-count] (first scheme)
banned-words (into #{} (->> result
(map (comp last last))))
(map (comp last first))))
line (if (nil? (get line-phones pattern))
; Here, we need to make a choice about which pronunciation
; we want to use to build line-phones. Choose randomly.
(tightly-generate-n-syllable-sentence
database
markov-trie
3
syllable-count)
(tightly-generate-n-syllable-sentence-rhyming-with
database
markov-trie
rhyme-trie
(take 4 (get line-phones pattern))
3
3
syllable-count
(constantly false)
;; words-fn
;; ([("G" "AO1" "B") #{"bog"}] [("G" "AO1" "F") #{"fog"}])
(fn [rhyming-words]
(->> (map (fn [[phones wordset]]
[phones (->> (set/difference
wordset
banned-words))])
rhyming-words)
(remove (fn [[phones wordset]]
(empty? wordset)))))))
rhyme (reverse (sentence->phones line))]
(println line)
(take-until
(best-of-10)
#(tightly-generate-n-syllable-sentence
database
markov-trie
3
syllable-count
(make-markov-filter (map database [prhyme/BOS prhyme/EOS]))))
(take-until
(best-of-10)
#(tightly-generate-n-syllable-sentence-rhyming-with
database
markov-trie
rhyme-trie
(reverse
(take-last 4 (prhyme/phones->all-flex-rhyme-tailing-consonants-phones
(get line-phones pattern))))
3
3
syllable-count
(make-markov-filter (map database [prhyme/BOS prhyme/EOS]))
(make-rhyme-filter banned-words))))
rhyme (sentence->phones (reverse line))]
(recur (rest scheme)
(assoc line-phones pattern rhyme)
(conj result (reverse line)))))))
(if (nil? (get line-phones pattern))
(assoc line-phones pattern rhyme)
line-phones)
(conj result line))))))
(comment
(tightly-generate-n-syllable-sentence
@ -716,11 +827,16 @@
3
10)
(rhyme-from-scheme
'[[A 9] [A 9] [B 5] [B 5]]
database
markov-tight-trie
rhyme-trie)
(repeatedly
5
#(->> (rhyme-from-scheme
'[[A 9] [A 9] [B 5] [B 5] [A 9]]
database
markov-tight-trie
rhyme-trie)
(map reverse)
(map (partial map second))
(map data-transform/untokenize)))
(trie/lookup markov-tight-trie nil)
(tightly-generate-n-syllable-sentence-rhyming-with

@ -268,6 +268,12 @@
#(string/replace % #"[02-9]" "")
phones))
(defn phones->all-flex-rhyme-tailing-consonants-phones
[phones]
(->> phones
take-vowels-and-tail-consonants
remove-non-primary-stress))
(defn phrase->all-flex-rhyme-tailing-consonants-phones
"Takes a space-seperated string of words
and returns the concatenation of the words

@ -45,8 +45,9 @@
(defn untokenize
[coll]
(->> coll
(map #(string/join " " %))
(map #(string/replace % #" (['\-,\?\.] ?)" "$1"))))
(string/join " ")
(#(string/replace % #" (['\-,\?\.] ?)" "$1"))))
(def xf-untokenize
(comp
(map #(string/join " " %))

@ -87,12 +87,8 @@
(string/split results #"\n")))
(comment
(- (Math/log 0.001) (Math/log 0.01))
(Math/E)
(tokenize "Eric's testing.")
(Math/log 0.9999)
(Math/pow Math/E -0.5)
(let [results (StringBuffer.)
parses (ParserTool/parseLine "The dog ran fast ." custom-parser 1)]
((juxt parse-probs parse-strs) parses))
@ -122,19 +118,31 @@
)
(Math/log (Math/pow Math/E -4.1))
(defn parse-top-n [tokenized n]
(let [results (StringBuffer.)
parses (ParserTool/parseLine tokenized custom-parser n)]
(apply map vector ((juxt parse-strs parse-probs) parses))))
(defn most-likely-parse
[tokenized]
(let [results (StringBuffer.)
parses (ParserTool/parseLine tokenized custom-parser 1)]
(->> ((juxt parse-strs parse-probs) parses)
(map first))))
(comment
(let [phrase "The feeling hurts."]
(->> phrase
tokenize
(string/join " ")
(#(parse-top-n % 10))))
(Math/pow Math/E -0.96)
tokenize
(string/join " ")
(#(parse-top-n % 100))))
(let [phrase "The feeling hurts."]
(->> phrase
tokenize
(string/join " ")
(#(most-likely-parse %))))
)
(defn deep-merge-with [f & maps]
@ -151,12 +159,19 @@
;; => {:a 1, :b {:b 7}, :c 3}
)
(defn likely-sentence?
[text]
(->> text
tokenize
(string/join " ")
(#(parse-top-n % 100)))
)
(defn valid-sentence?
"Tokenizes and parses the phrase using OpenNLP models from
http://opennlp.sourceforge.net/models-1.5/
If the parse tree has an clause as the top-level tag, then
If the parse tree has a clause as the top-level tag, then
we consider it a valid English sentence."
[phrase]
(->> phrase
@ -176,8 +191,8 @@
(->> "the lazy fox"
vector
parse)
)
(defn unmake-tree
"Tokenizing and then parsing a sentence returns a string
representation of the parse tree. This is a helper function

Loading…
Cancel
Save