diff --git a/src/com/owoga/corpus/markov.clj b/src/com/owoga/corpus/markov.clj index 2c611c4..ca2abab 100644 --- a/src/com/owoga/corpus/markov.clj +++ b/src/com/owoga/corpus/markov.clj @@ -11,7 +11,29 @@ [clojure.java.io :as io] [com.owoga.phonetics :as phonetics] [com.owoga.phonetics.syllabify :as syllabify] - [taoensso.nippy :as nippy])) + [taoensso.nippy :as nippy] + [clojure.math.combinatorics :as combinatorics] + [com.owoga.prhyme.nlp.core :as nlp])) + +(defrecord RhymeSet [phones words]) + +; Since we're dealing with phonetics, a word consists of the spelling as well as all possible phonetic pronunciations. +(defrecord UnpronouncedWord [word pronunciations]) + +(defrecord PronouncedWord [word pronunciation]) + +(defn all-pronunciations + [words] + (let [pronunciations (apply combinatorics/cartesian-product (map :pronunciations words))] + (map + (fn [pronunciation] + (map ->PronouncedWord (map :word words) pronunciation)) + pronunciations))) + +(let [input-words ["bog" "hog"] + words (map (fn [word] (->UnpronouncedWord word (phonetics/get-phones word))) input-words) + pronunciations (all-pronunciations words)] + pronunciations) (defn clean-text [text] (string/lower-case (string/replace text #"[^a-zA-Z'\-\s]" ""))) @@ -204,6 +226,7 @@ (def rhyme-trie (into (trie/make-trie) (nippy/thaw-from-file "/tmp/rhyme-trie.bin"))) ) + (defn choice->n-gram [{:keys [database]} choice] (map database (first choice))) @@ -300,35 +323,40 @@ ;; [("T" "AA1" "UW1") #{[("T" "AA1" "UW1") "moonshot"]}] ;; [("T" "AA1") ;; #{[("T" "AA1") "dot"] [("T" "AA1") "pot"] [("T" "AA1") "lot"]}]] + + (rhyme-choices-walking-target-rhyme + rhyme-trie + (reverse '("UH1" "AA1" "R" "T"))) + ) (defn get-next-markov "Weighted selection from markov model with backoff. Expects markov key/values to be [k1 k2 k3] [ freq]." ([markov-trie seed] - (get-next-markov markov-trie seed (constantly false))) - ([markov-trie seed remove-fn] + (get-next-markov markov-trie seed identity)) + ([markov-trie seed process-children-fn] (let [seed (take-last 3 seed) node (trie/lookup markov-trie seed) children (and node (->> node trie/children + process-children-fn (map (fn [^com.owoga.trie.ITrie child] ; Get key and frequency of each child [(.key child) (get child [])])) - (remove (comp nil? second)) - (remove remove-fn)))] + (remove (comp nil? second))))] (cond ; If we've never seen this n-gram, fallback to n-1-gram - (nil? node) (recur markov-trie (rest seed) remove-fn) + (nil? node) (recur markov-trie (rest seed) process-children-fn) (seq children) (if (< (rand) (/ (apply max (map (comp second second) children)) (apply + (map (comp second second) children)))) - (recur markov-trie (rest seed) remove-fn) + (recur markov-trie (rest seed) process-children-fn) (first (math/weighted-selection (comp second second) children))) (> (count seed) 0) - (recur markov-trie (rest seed) remove-fn) + (recur markov-trie (rest seed) process-children-fn) ; If we have a node but no children, or if we don't have a seed, ; we don't know how to handle that situation. :else (throw (Exception. "Error")))))) @@ -466,12 +494,12 @@ markov-trie n-gram-rank target-sentence-syllable-count - (constantly false))) + identity)) ([database markov-trie n-gram-rank target-sentence-syllable-count - markov-remove-fn] + process-markov-children] (let [eos (database prhyme/EOS) bos (database prhyme/BOS)] (loop [phrase []] @@ -488,10 +516,7 @@ ; Pad sentence with eos markers since we're working backwards (into (vec (repeat (dec n-gram-rank) eos)) (mapv (comp database second) phrase)) - ; Remove eos, bos, and forbidden words - (fn [[lookup [word frequency]]] - (or (markov-remove-fn [lookup [word frequency]]) - (#{eos bos} word)))))] + process-markov-children))] [(phonetics/get-phones word) word])))))))) (comment @@ -499,7 +524,23 @@ database markov-trie 3 - 10) + 10 + (fn [children] + (remove + (fn [child] + (let [lookup (.key child) + [word freq] (get child [])] + (#{(database prhyme/EOS) (database prhyme/BOS)} word))) + children))) + ;; [[[["HH" "ER1" "T" "S"]] "hurts"] + ;; [[["IH1" "T"] ["IH0" "T"]] "it"] + ;; [[["AH0" "N" "D"] ["AE1" "N" "D"]] "and"] + ;; [[["F" "EY1" "S"]] "face"] + ;; [[["M" "AY1"]] "my"] + ;; [[["AH0" "G" "EH1" "N" "S" "T"] ["AH0" "G" "EY1" "N" "S" "T"]] "against"] + ;; [[["L" "AY1" "F"]] "life"] + ;; [[["M" "AY1"]] "my"] + ;; [[["L" "AY1" "V"] ["L" "IH1" "V"]] "live"]] ) @@ -525,7 +566,7 @@ n-gram-rank target-rhyme-syllable-count target-sentence-syllable-count - (constantly false) + identity identity)) ([database markov-trie @@ -534,16 +575,18 @@ n-gram-rank target-rhyme-syllable-count target-sentence-syllable-count - markov-remove-fn - rhyme-wordset-fn] + markov-process-children + rhyme-process-words] (let [eos (database prhyme/EOS) bos (database prhyme/BOS) choices (rhyme-choices-walking-target-rhyme rhyme-trie target-rhyme - rhyme-wordset-fn) - rhyme (update (rand-nth choices) 1 (comp rand-nth vec))] - (loop [phrase [rhyme]] + rhyme-process-words) + [rhyming-phones rhyming-word] (update (rand-nth choices) 1 (comp rand-nth vec)) + ;; The rhyme only has the rhyming phones. Grab full pronunciation. + rhyming-word-phones (rand-nth (phonetics/get-phones rhyming-word))] + (loop [phrase [[rhyming-word-phones rhyming-word]]] (if (<= target-sentence-syllable-count (prhyme/count-syllables-of-phrase (string/join " " (map second phrase)))) @@ -556,11 +599,32 @@ markov-trie (into (vec (repeat (dec n-gram-rank) eos)) (mapv (comp database second) phrase)) - (fn [[lookup [word frequency]]] - (or (markov-remove-fn [lookup [word frequency]]) - (#{eos bos} word)))))] + markov-process-children))] [(rand-nth (phonetics/get-phones word)) word])))))))) +(defn make-markov-filter + "Specifically works with markovs with entries of the format: + [lookup [index freq]] + " + [words-to-remove] + (let [words-to-remove (into #{} words-to-remove)] + (fn [children] + (remove + (fn [child] + (let [[word _] (get child [])] + (words-to-remove word))) + children)))) + +(defn make-rhyme-filter + [words-to-remove] + (let [words-to-remove (into #{} words-to-remove)] + (fn [rhyming-words] + (->> (map (fn [[phones wordset]] + [phones (set/difference wordset words-to-remove)]) + rhyming-words) + (remove (fn [[phones wordset]] + (empty? wordset))))))) + ;;;; Demo ;;;; (comment @@ -575,27 +639,19 @@ 3 3 7 - (fn [[lookup [word freq]]] - (= (database "begun") word)) - (fn [rhyming-words] - (->> (map (fn [[phones wordset]] - [phones (set/difference wordset #{"begun"})]) - rhyming-words) - (remove (fn [[phones wordset]] - (empty? wordset)))))) - (map second) - reverse)) - (map (partial string/join " ")))) - ;; => ("funeral has just begun" - ;; "dead illusion overdone" - ;; "all shout boy for then , outrun" - ;; "and those that turn till rerun" - ;; "heading for a pack of rerun" - ;; "furnace of end outrun" - ;; "the deaths outrun" - ;; "our funeral has begun" - ;; "paper that looking dark rerun" - ;; "walk overdone") + (make-markov-filter (map database ["overdone" "outdone" "rerun" "undone" "" ""])) + (make-rhyme-filter ["begun"])))))) + ;; => ("darkness the lost souls will run" + ;; "predominant thunder , scream comes undone" + ;; "only day to come undone" + ;; "i denounce the bad undone" + ;; "me before i will outrun" + ;; "convictions of the just will outrun" + ;; "you mean stained are overdone" + ;; "fight has begun overdone" + ;; "are being skinned keep are one" + ;; "demise , lift you up overdone") + (let [target-rhyme (->(prhyme/phrase->all-flex-rhyme-tailing-consonants-phones "filling") @@ -611,13 +667,68 @@ target-rhyme 3 3 - 7) + 7 + ) (map second) reverse)) (map (partial remove #{prhyme/BOS})) (map data-transform/untokenize))) ) +(defn take-until + "Repeately calls some stateful function until predicate is met." + [take? f] + (loop [result (f)] + (if (take? result) + (first (take? result)) + (recur (f))))) + +(comment + (let [val (atom 0)] + (letfn [(foo [] + (swap! val inc) + @val)] + (take-until even? foo))) + ;; => 2 + ) + +(defn valid-or-best-sentence? + [max-iterations] + (fn [] + (let [context (atom {:current-best nil :iteration 0})] + (fn [text] + (let [sentence (string/join " " (map (comp map second) text))] + (swap! context update :iteration inc) + (let [current-best (:current-best @context) + log-prob (second (nlp/most-likely-parse sentence))] + (when (or (nil? current-best) + (> log-prob (nth current-best 1))) + (swap! context assoc :current-best [text log-prob])) + (if (or (> log-prob -1) + (>= (:iteration @context) max-iterations)) + (:current-best @context) + false))))))) + +(def best-of-10 (valid-or-best-sentence? 10)) + +(comment + (take-until (best-of-10) (constantly "my name sky does eat")) + ) + +(comment + (take-until + (best-of-10) + #(->> (tightly-generate-n-syllable-sentence + database + markov-trie + 3 + 7 + (make-markov-filter (map database [prhyme/BOS prhyme/EOS]))) + (map second) + reverse + (string/join " "))) + + ) (defn sentence->phones "Sentence is of the format @@ -640,6 +751,7 @@ " [sentence] (->> sentence + (remove (comp empty? first)) ; Commas have no phones so rand-nth breaks (map #(update % 0 rand-nth)) (apply map vector) ((fn [[phones words]] @@ -675,39 +787,38 @@ :else (let [[pattern syllable-count] (first scheme) banned-words (into #{} (->> result - (map (comp last last)))) + (map (comp last first)))) line (if (nil? (get line-phones pattern)) ; Here, we need to make a choice about which pronunciation ; we want to use to build line-phones. Choose randomly. - (tightly-generate-n-syllable-sentence - database - markov-trie - 3 - syllable-count) - (tightly-generate-n-syllable-sentence-rhyming-with - database - markov-trie - rhyme-trie - (take 4 (get line-phones pattern)) - 3 - 3 - syllable-count - (constantly false) - ;; words-fn - ;; ([("G" "AO1" "B") #{"bog"}] [("G" "AO1" "F") #{"fog"}]) - (fn [rhyming-words] - (->> (map (fn [[phones wordset]] - [phones (->> (set/difference - wordset - banned-words))]) - rhyming-words) - (remove (fn [[phones wordset]] - (empty? wordset))))))) - rhyme (reverse (sentence->phones line))] - (println line) + (take-until + (best-of-10) + #(tightly-generate-n-syllable-sentence + database + markov-trie + 3 + syllable-count + (make-markov-filter (map database [prhyme/BOS prhyme/EOS])))) + (take-until + (best-of-10) + #(tightly-generate-n-syllable-sentence-rhyming-with + database + markov-trie + rhyme-trie + (reverse + (take-last 4 (prhyme/phones->all-flex-rhyme-tailing-consonants-phones + (get line-phones pattern)))) + 3 + 3 + syllable-count + (make-markov-filter (map database [prhyme/BOS prhyme/EOS])) + (make-rhyme-filter banned-words)))) + rhyme (sentence->phones (reverse line))] (recur (rest scheme) - (assoc line-phones pattern rhyme) - (conj result (reverse line))))))) + (if (nil? (get line-phones pattern)) + (assoc line-phones pattern rhyme) + line-phones) + (conj result line)))))) (comment (tightly-generate-n-syllable-sentence @@ -716,11 +827,16 @@ 3 10) - (rhyme-from-scheme - '[[A 9] [A 9] [B 5] [B 5]] - database - markov-tight-trie - rhyme-trie) + (repeatedly + 5 + #(->> (rhyme-from-scheme + '[[A 9] [A 9] [B 5] [B 5] [A 9]] + database + markov-tight-trie + rhyme-trie) + (map reverse) + (map (partial map second)) + (map data-transform/untokenize))) (trie/lookup markov-tight-trie nil) (tightly-generate-n-syllable-sentence-rhyming-with diff --git a/src/com/owoga/prhyme/core.clj b/src/com/owoga/prhyme/core.clj index 5808773..412ec86 100644 --- a/src/com/owoga/prhyme/core.clj +++ b/src/com/owoga/prhyme/core.clj @@ -268,6 +268,12 @@ #(string/replace % #"[02-9]" "") phones)) +(defn phones->all-flex-rhyme-tailing-consonants-phones + [phones] + (->> phones + take-vowels-and-tail-consonants + remove-non-primary-stress)) + (defn phrase->all-flex-rhyme-tailing-consonants-phones "Takes a space-seperated string of words and returns the concatenation of the words diff --git a/src/com/owoga/prhyme/data_transform.clj b/src/com/owoga/prhyme/data_transform.clj index 61e0b15..622460a 100644 --- a/src/com/owoga/prhyme/data_transform.clj +++ b/src/com/owoga/prhyme/data_transform.clj @@ -45,8 +45,9 @@ (defn untokenize [coll] (->> coll - (map #(string/join " " %)) - (map #(string/replace % #" (['\-,\?\.] ?)" "$1")))) + (string/join " ") + (#(string/replace % #" (['\-,\?\.] ?)" "$1")))) + (def xf-untokenize (comp (map #(string/join " " %)) diff --git a/src/com/owoga/prhyme/nlp/core.clj b/src/com/owoga/prhyme/nlp/core.clj index 99deb31..7517d1f 100644 --- a/src/com/owoga/prhyme/nlp/core.clj +++ b/src/com/owoga/prhyme/nlp/core.clj @@ -87,12 +87,8 @@ (string/split results #"\n"))) (comment - (- (Math/log 0.001) (Math/log 0.01)) - (Math/E) (tokenize "Eric's testing.") - (Math/log 0.9999) - (Math/pow Math/E -0.5) (let [results (StringBuffer.) parses (ParserTool/parseLine "The dog ran fast ." custom-parser 1)] ((juxt parse-probs parse-strs) parses)) @@ -122,19 +118,31 @@ ) -(Math/log (Math/pow Math/E -4.1)) (defn parse-top-n [tokenized n] (let [results (StringBuffer.) parses (ParserTool/parseLine tokenized custom-parser n)] (apply map vector ((juxt parse-strs parse-probs) parses)))) +(defn most-likely-parse + [tokenized] + (let [results (StringBuffer.) + parses (ParserTool/parseLine tokenized custom-parser 1)] + (->> ((juxt parse-strs parse-probs) parses) + (map first)))) + (comment (let [phrase "The feeling hurts."] (->> phrase - tokenize - (string/join " ") - (#(parse-top-n % 10)))) - (Math/pow Math/E -0.96) + tokenize + (string/join " ") + (#(parse-top-n % 100)))) + + (let [phrase "The feeling hurts."] + (->> phrase + tokenize + (string/join " ") + (#(most-likely-parse %)))) + ) (defn deep-merge-with [f & maps] @@ -151,12 +159,19 @@ ;; => {:a 1, :b {:b 7}, :c 3} ) +(defn likely-sentence? + [text] + (->> text + tokenize + (string/join " ") + (#(parse-top-n % 100))) + ) (defn valid-sentence? "Tokenizes and parses the phrase using OpenNLP models from http://opennlp.sourceforge.net/models-1.5/ - If the parse tree has an clause as the top-level tag, then + If the parse tree has a clause as the top-level tag, then we consider it a valid English sentence." [phrase] (->> phrase @@ -176,8 +191,8 @@ (->> "the lazy fox" vector parse) - ) + (defn unmake-tree "Tokenizing and then parsing a sentence returns a string representation of the parse tree. This is a helper function