Rhyme by scheme

TODO: Consider stress. Multiprocess
main
Eric Ihli 3 years ago
parent 1544680f55
commit 79a35adb43

@ -11,7 +11,29 @@
[clojure.java.io :as io] [clojure.java.io :as io]
[com.owoga.phonetics :as phonetics] [com.owoga.phonetics :as phonetics]
[com.owoga.phonetics.syllabify :as syllabify] [com.owoga.phonetics.syllabify :as syllabify]
[taoensso.nippy :as nippy])) [taoensso.nippy :as nippy]
[clojure.math.combinatorics :as combinatorics]
[com.owoga.prhyme.nlp.core :as nlp]))
(defrecord RhymeSet [phones words])
; Since we're dealing with phonetics, a word consists of the spelling as well as all possible phonetic pronunciations.
(defrecord UnpronouncedWord [word pronunciations])
(defrecord PronouncedWord [word pronunciation])
(defn all-pronunciations
[words]
(let [pronunciations (apply combinatorics/cartesian-product (map :pronunciations words))]
(map
(fn [pronunciation]
(map ->PronouncedWord (map :word words) pronunciation))
pronunciations)))
(let [input-words ["bog" "hog"]
words (map (fn [word] (->UnpronouncedWord word (phonetics/get-phones word))) input-words)
pronunciations (all-pronunciations words)]
pronunciations)
(defn clean-text [text] (defn clean-text [text]
(string/lower-case (string/replace text #"[^a-zA-Z'\-\s]" ""))) (string/lower-case (string/replace text #"[^a-zA-Z'\-\s]" "")))
@ -204,6 +226,7 @@
(def rhyme-trie (into (trie/make-trie) (nippy/thaw-from-file "/tmp/rhyme-trie.bin"))) (def rhyme-trie (into (trie/make-trie) (nippy/thaw-from-file "/tmp/rhyme-trie.bin")))
) )
(defn choice->n-gram (defn choice->n-gram
[{:keys [database]} choice] [{:keys [database]} choice]
(map database (first choice))) (map database (first choice)))
@ -300,35 +323,40 @@
;; [("T" "AA1" "UW1") #{[("T" "AA1" "UW1") "moonshot"]}] ;; [("T" "AA1" "UW1") #{[("T" "AA1" "UW1") "moonshot"]}]
;; [("T" "AA1") ;; [("T" "AA1")
;; #{[("T" "AA1") "dot"] [("T" "AA1") "pot"] [("T" "AA1") "lot"]}]] ;; #{[("T" "AA1") "dot"] [("T" "AA1") "pot"] [("T" "AA1") "lot"]}]]
(rhyme-choices-walking-target-rhyme
rhyme-trie
(reverse '("UH1" "AA1" "R" "T")))
) )
(defn get-next-markov (defn get-next-markov
"Weighted selection from markov model with backoff. "Weighted selection from markov model with backoff.
Expects markov key/values to be [k1 k2 k3] [<value> freq]." Expects markov key/values to be [k1 k2 k3] [<value> freq]."
([markov-trie seed] ([markov-trie seed]
(get-next-markov markov-trie seed (constantly false))) (get-next-markov markov-trie seed identity))
([markov-trie seed remove-fn] ([markov-trie seed process-children-fn]
(let [seed (take-last 3 seed) (let [seed (take-last 3 seed)
node (trie/lookup markov-trie seed) node (trie/lookup markov-trie seed)
children (and node children (and node
(->> node (->> node
trie/children trie/children
process-children-fn
(map (fn [^com.owoga.trie.ITrie child] (map (fn [^com.owoga.trie.ITrie child]
; Get key and frequency of each child ; Get key and frequency of each child
[(.key child) [(.key child)
(get child [])])) (get child [])]))
(remove (comp nil? second)) (remove (comp nil? second))))]
(remove remove-fn)))]
(cond (cond
; If we've never seen this n-gram, fallback to n-1-gram ; If we've never seen this n-gram, fallback to n-1-gram
(nil? node) (recur markov-trie (rest seed) remove-fn) (nil? node) (recur markov-trie (rest seed) process-children-fn)
(seq children) (seq children)
(if (< (rand) (/ (apply max (map (comp second second) children)) (if (< (rand) (/ (apply max (map (comp second second) children))
(apply + (map (comp second second) children)))) (apply + (map (comp second second) children))))
(recur markov-trie (rest seed) remove-fn) (recur markov-trie (rest seed) process-children-fn)
(first (math/weighted-selection (comp second second) children))) (first (math/weighted-selection (comp second second) children)))
(> (count seed) 0) (> (count seed) 0)
(recur markov-trie (rest seed) remove-fn) (recur markov-trie (rest seed) process-children-fn)
; If we have a node but no children, or if we don't have a seed, ; If we have a node but no children, or if we don't have a seed,
; we don't know how to handle that situation. ; we don't know how to handle that situation.
:else (throw (Exception. "Error")))))) :else (throw (Exception. "Error"))))))
@ -466,12 +494,12 @@
markov-trie markov-trie
n-gram-rank n-gram-rank
target-sentence-syllable-count target-sentence-syllable-count
(constantly false))) identity))
([database ([database
markov-trie markov-trie
n-gram-rank n-gram-rank
target-sentence-syllable-count target-sentence-syllable-count
markov-remove-fn] process-markov-children]
(let [eos (database prhyme/EOS) (let [eos (database prhyme/EOS)
bos (database prhyme/BOS)] bos (database prhyme/BOS)]
(loop [phrase []] (loop [phrase []]
@ -488,10 +516,7 @@
; Pad sentence with eos markers since we're working backwards ; Pad sentence with eos markers since we're working backwards
(into (vec (repeat (dec n-gram-rank) eos)) (into (vec (repeat (dec n-gram-rank) eos))
(mapv (comp database second) phrase)) (mapv (comp database second) phrase))
; Remove eos, bos, and forbidden words process-markov-children))]
(fn [[lookup [word frequency]]]
(or (markov-remove-fn [lookup [word frequency]])
(#{eos bos} word)))))]
[(phonetics/get-phones word) word])))))))) [(phonetics/get-phones word) word]))))))))
(comment (comment
@ -499,7 +524,23 @@
database database
markov-trie markov-trie
3 3
10) 10
(fn [children]
(remove
(fn [child]
(let [lookup (.key child)
[word freq] (get child [])]
(#{(database prhyme/EOS) (database prhyme/BOS)} word)))
children)))
;; [[[["HH" "ER1" "T" "S"]] "hurts"]
;; [[["IH1" "T"] ["IH0" "T"]] "it"]
;; [[["AH0" "N" "D"] ["AE1" "N" "D"]] "and"]
;; [[["F" "EY1" "S"]] "face"]
;; [[["M" "AY1"]] "my"]
;; [[["AH0" "G" "EH1" "N" "S" "T"] ["AH0" "G" "EY1" "N" "S" "T"]] "against"]
;; [[["L" "AY1" "F"]] "life"]
;; [[["M" "AY1"]] "my"]
;; [[["L" "AY1" "V"] ["L" "IH1" "V"]] "live"]]
) )
@ -525,7 +566,7 @@
n-gram-rank n-gram-rank
target-rhyme-syllable-count target-rhyme-syllable-count
target-sentence-syllable-count target-sentence-syllable-count
(constantly false) identity
identity)) identity))
([database ([database
markov-trie markov-trie
@ -534,16 +575,18 @@
n-gram-rank n-gram-rank
target-rhyme-syllable-count target-rhyme-syllable-count
target-sentence-syllable-count target-sentence-syllable-count
markov-remove-fn markov-process-children
rhyme-wordset-fn] rhyme-process-words]
(let [eos (database prhyme/EOS) (let [eos (database prhyme/EOS)
bos (database prhyme/BOS) bos (database prhyme/BOS)
choices (rhyme-choices-walking-target-rhyme choices (rhyme-choices-walking-target-rhyme
rhyme-trie rhyme-trie
target-rhyme target-rhyme
rhyme-wordset-fn) rhyme-process-words)
rhyme (update (rand-nth choices) 1 (comp rand-nth vec))] [rhyming-phones rhyming-word] (update (rand-nth choices) 1 (comp rand-nth vec))
(loop [phrase [rhyme]] ;; The rhyme only has the rhyming phones. Grab full pronunciation.
rhyming-word-phones (rand-nth (phonetics/get-phones rhyming-word))]
(loop [phrase [[rhyming-word-phones rhyming-word]]]
(if (<= target-sentence-syllable-count (if (<= target-sentence-syllable-count
(prhyme/count-syllables-of-phrase (prhyme/count-syllables-of-phrase
(string/join " " (map second phrase)))) (string/join " " (map second phrase))))
@ -556,11 +599,32 @@
markov-trie markov-trie
(into (vec (repeat (dec n-gram-rank) eos)) (into (vec (repeat (dec n-gram-rank) eos))
(mapv (comp database second) phrase)) (mapv (comp database second) phrase))
(fn [[lookup [word frequency]]] markov-process-children))]
(or (markov-remove-fn [lookup [word frequency]])
(#{eos bos} word)))))]
[(rand-nth (phonetics/get-phones word)) word])))))))) [(rand-nth (phonetics/get-phones word)) word]))))))))
(defn make-markov-filter
"Specifically works with markovs with entries of the format:
[lookup [index freq]]
"
[words-to-remove]
(let [words-to-remove (into #{} words-to-remove)]
(fn [children]
(remove
(fn [child]
(let [[word _] (get child [])]
(words-to-remove word)))
children))))
(defn make-rhyme-filter
[words-to-remove]
(let [words-to-remove (into #{} words-to-remove)]
(fn [rhyming-words]
(->> (map (fn [[phones wordset]]
[phones (set/difference wordset words-to-remove)])
rhyming-words)
(remove (fn [[phones wordset]]
(empty? wordset)))))))
;;;; Demo ;;;; Demo
;;;; ;;;;
(comment (comment
@ -575,27 +639,19 @@
3 3
3 3
7 7
(fn [[lookup [word freq]]] (make-markov-filter (map database ["overdone" "outdone" "rerun" "undone" "</s>" "<s>"]))
(= (database "begun") word)) (make-rhyme-filter ["begun"]))))))
(fn [rhyming-words] ;; => ("darkness the lost souls will run"
(->> (map (fn [[phones wordset]] ;; "predominant thunder , scream comes undone"
[phones (set/difference wordset #{"begun"})]) ;; "only day to come undone"
rhyming-words) ;; "i denounce the bad undone"
(remove (fn [[phones wordset]] ;; "me before i will outrun"
(empty? wordset)))))) ;; "convictions of the just will outrun"
(map second) ;; "you mean stained are overdone"
reverse)) ;; "fight has begun overdone"
(map (partial string/join " ")))) ;; "are being skinned keep are one"
;; => ("funeral has just begun" ;; "demise , lift you up overdone")
;; "dead illusion overdone"
;; "all shout boy for then , outrun"
;; "and those that turn till rerun"
;; "heading for a pack of rerun"
;; "furnace of end outrun"
;; "the deaths outrun"
;; "our funeral has begun"
;; "paper that looking dark rerun"
;; "walk overdone")
(let [target-rhyme (->(prhyme/phrase->all-flex-rhyme-tailing-consonants-phones (let [target-rhyme (->(prhyme/phrase->all-flex-rhyme-tailing-consonants-phones
"filling") "filling")
@ -611,13 +667,68 @@
target-rhyme target-rhyme
3 3
3 3
7) 7
)
(map second) (map second)
reverse)) reverse))
(map (partial remove #{prhyme/BOS})) (map (partial remove #{prhyme/BOS}))
(map data-transform/untokenize))) (map data-transform/untokenize)))
) )
(defn take-until
"Repeately calls some stateful function until predicate is met."
[take? f]
(loop [result (f)]
(if (take? result)
(first (take? result))
(recur (f)))))
(comment
(let [val (atom 0)]
(letfn [(foo []
(swap! val inc)
@val)]
(take-until even? foo)))
;; => 2
)
(defn valid-or-best-sentence?
[max-iterations]
(fn []
(let [context (atom {:current-best nil :iteration 0})]
(fn [text]
(let [sentence (string/join " " (map (comp map second) text))]
(swap! context update :iteration inc)
(let [current-best (:current-best @context)
log-prob (second (nlp/most-likely-parse sentence))]
(when (or (nil? current-best)
(> log-prob (nth current-best 1)))
(swap! context assoc :current-best [text log-prob]))
(if (or (> log-prob -1)
(>= (:iteration @context) max-iterations))
(:current-best @context)
false)))))))
(def best-of-10 (valid-or-best-sentence? 10))
(comment
(take-until (best-of-10) (constantly "my name sky does eat"))
)
(comment
(take-until
(best-of-10)
#(->> (tightly-generate-n-syllable-sentence
database
markov-trie
3
7
(make-markov-filter (map database [prhyme/BOS prhyme/EOS])))
(map second)
reverse
(string/join " ")))
)
(defn sentence->phones (defn sentence->phones
"Sentence is of the format "Sentence is of the format
@ -640,6 +751,7 @@
" "
[sentence] [sentence]
(->> sentence (->> sentence
(remove (comp empty? first)) ; Commas have no phones so rand-nth breaks
(map #(update % 0 rand-nth)) (map #(update % 0 rand-nth))
(apply map vector) (apply map vector)
((fn [[phones words]] ((fn [[phones words]]
@ -675,39 +787,38 @@
:else :else
(let [[pattern syllable-count] (first scheme) (let [[pattern syllable-count] (first scheme)
banned-words (into #{} (->> result banned-words (into #{} (->> result
(map (comp last last)))) (map (comp last first))))
line (if (nil? (get line-phones pattern)) line (if (nil? (get line-phones pattern))
; Here, we need to make a choice about which pronunciation ; Here, we need to make a choice about which pronunciation
; we want to use to build line-phones. Choose randomly. ; we want to use to build line-phones. Choose randomly.
(tightly-generate-n-syllable-sentence (take-until
database (best-of-10)
markov-trie #(tightly-generate-n-syllable-sentence
3 database
syllable-count) markov-trie
(tightly-generate-n-syllable-sentence-rhyming-with 3
database syllable-count
markov-trie (make-markov-filter (map database [prhyme/BOS prhyme/EOS]))))
rhyme-trie (take-until
(take 4 (get line-phones pattern)) (best-of-10)
3 #(tightly-generate-n-syllable-sentence-rhyming-with
3 database
syllable-count markov-trie
(constantly false) rhyme-trie
;; words-fn (reverse
;; ([("G" "AO1" "B") #{"bog"}] [("G" "AO1" "F") #{"fog"}]) (take-last 4 (prhyme/phones->all-flex-rhyme-tailing-consonants-phones
(fn [rhyming-words] (get line-phones pattern))))
(->> (map (fn [[phones wordset]] 3
[phones (->> (set/difference 3
wordset syllable-count
banned-words))]) (make-markov-filter (map database [prhyme/BOS prhyme/EOS]))
rhyming-words) (make-rhyme-filter banned-words))))
(remove (fn [[phones wordset]] rhyme (sentence->phones (reverse line))]
(empty? wordset)))))))
rhyme (reverse (sentence->phones line))]
(println line)
(recur (rest scheme) (recur (rest scheme)
(assoc line-phones pattern rhyme) (if (nil? (get line-phones pattern))
(conj result (reverse line))))))) (assoc line-phones pattern rhyme)
line-phones)
(conj result line))))))
(comment (comment
(tightly-generate-n-syllable-sentence (tightly-generate-n-syllable-sentence
@ -716,11 +827,16 @@
3 3
10) 10)
(rhyme-from-scheme (repeatedly
'[[A 9] [A 9] [B 5] [B 5]] 5
database #(->> (rhyme-from-scheme
markov-tight-trie '[[A 9] [A 9] [B 5] [B 5] [A 9]]
rhyme-trie) database
markov-tight-trie
rhyme-trie)
(map reverse)
(map (partial map second))
(map data-transform/untokenize)))
(trie/lookup markov-tight-trie nil) (trie/lookup markov-tight-trie nil)
(tightly-generate-n-syllable-sentence-rhyming-with (tightly-generate-n-syllable-sentence-rhyming-with

@ -268,6 +268,12 @@
#(string/replace % #"[02-9]" "") #(string/replace % #"[02-9]" "")
phones)) phones))
(defn phones->all-flex-rhyme-tailing-consonants-phones
[phones]
(->> phones
take-vowels-and-tail-consonants
remove-non-primary-stress))
(defn phrase->all-flex-rhyme-tailing-consonants-phones (defn phrase->all-flex-rhyme-tailing-consonants-phones
"Takes a space-seperated string of words "Takes a space-seperated string of words
and returns the concatenation of the words and returns the concatenation of the words

@ -45,8 +45,9 @@
(defn untokenize (defn untokenize
[coll] [coll]
(->> coll (->> coll
(map #(string/join " " %)) (string/join " ")
(map #(string/replace % #" (['\-,\?\.] ?)" "$1")))) (#(string/replace % #" (['\-,\?\.] ?)" "$1"))))
(def xf-untokenize (def xf-untokenize
(comp (comp
(map #(string/join " " %)) (map #(string/join " " %))

@ -87,12 +87,8 @@
(string/split results #"\n"))) (string/split results #"\n")))
(comment (comment
(- (Math/log 0.001) (Math/log 0.01))
(Math/E)
(tokenize "Eric's testing.") (tokenize "Eric's testing.")
(Math/log 0.9999)
(Math/pow Math/E -0.5)
(let [results (StringBuffer.) (let [results (StringBuffer.)
parses (ParserTool/parseLine "The dog ran fast ." custom-parser 1)] parses (ParserTool/parseLine "The dog ran fast ." custom-parser 1)]
((juxt parse-probs parse-strs) parses)) ((juxt parse-probs parse-strs) parses))
@ -122,19 +118,31 @@
) )
(Math/log (Math/pow Math/E -4.1))
(defn parse-top-n [tokenized n] (defn parse-top-n [tokenized n]
(let [results (StringBuffer.) (let [results (StringBuffer.)
parses (ParserTool/parseLine tokenized custom-parser n)] parses (ParserTool/parseLine tokenized custom-parser n)]
(apply map vector ((juxt parse-strs parse-probs) parses)))) (apply map vector ((juxt parse-strs parse-probs) parses))))
(defn most-likely-parse
[tokenized]
(let [results (StringBuffer.)
parses (ParserTool/parseLine tokenized custom-parser 1)]
(->> ((juxt parse-strs parse-probs) parses)
(map first))))
(comment (comment
(let [phrase "The feeling hurts."] (let [phrase "The feeling hurts."]
(->> phrase (->> phrase
tokenize tokenize
(string/join " ") (string/join " ")
(#(parse-top-n % 10)))) (#(parse-top-n % 100))))
(Math/pow Math/E -0.96)
(let [phrase "The feeling hurts."]
(->> phrase
tokenize
(string/join " ")
(#(most-likely-parse %))))
) )
(defn deep-merge-with [f & maps] (defn deep-merge-with [f & maps]
@ -151,12 +159,19 @@
;; => {:a 1, :b {:b 7}, :c 3} ;; => {:a 1, :b {:b 7}, :c 3}
) )
(defn likely-sentence?
[text]
(->> text
tokenize
(string/join " ")
(#(parse-top-n % 100)))
)
(defn valid-sentence? (defn valid-sentence?
"Tokenizes and parses the phrase using OpenNLP models from "Tokenizes and parses the phrase using OpenNLP models from
http://opennlp.sourceforge.net/models-1.5/ http://opennlp.sourceforge.net/models-1.5/
If the parse tree has an clause as the top-level tag, then If the parse tree has a clause as the top-level tag, then
we consider it a valid English sentence." we consider it a valid English sentence."
[phrase] [phrase]
(->> phrase (->> phrase
@ -176,8 +191,8 @@
(->> "the lazy fox" (->> "the lazy fox"
vector vector
parse) parse)
) )
(defn unmake-tree (defn unmake-tree
"Tokenizing and then parsing a sentence returns a string "Tokenizing and then parsing a sentence returns a string
representation of the parse tree. This is a helper function representation of the parse tree. This is a helper function

Loading…
Cancel
Save