Rhyme by scheme

TODO: Consider stress. Multiprocess
3 years ago · 79a35adb43
parent 1544680f55
commit 79a35adb43
4 changed files with 231 additions and 93 deletions
--- a/src/com/owoga/corpus/markov.clj
+++ b/src/com/owoga/corpus/markov.clj
@ -11,7 +11,29 @@
            [clojure.java.io :as io]
            [com.owoga.phonetics :as phonetics]
            [com.owoga.phonetics.syllabify :as syllabify]
-            [taoensso.nippy :as nippy]))
+            [taoensso.nippy :as nippy]
+            [clojure.math.combinatorics :as combinatorics]
+            [com.owoga.prhyme.nlp.core :as nlp]))
+
+(defrecord RhymeSet [phones words])
+
+; Since we're dealing with phonetics, a word consists of the spelling as well as all possible phonetic pronunciations.
+(defrecord UnpronouncedWord [word pronunciations])
+
+(defrecord PronouncedWord [word pronunciation])
+
+(defn all-pronunciations
+  [words]
+  (let [pronunciations (apply combinatorics/cartesian-product (map :pronunciations words))]
+    (map
+     (fn [pronunciation]
+       (map ->PronouncedWord (map :word words) pronunciation))
+     pronunciations)))
+
+(let [input-words ["bog" "hog"]
+      words (map (fn [word] (->UnpronouncedWord word (phonetics/get-phones word))) input-words)
+      pronunciations (all-pronunciations words)]
+  pronunciations)

 (defn clean-text [text]
  (string/lower-case (string/replace text #"[^a-zA-Z'\-\s]" "")))
@ -204,6 +226,7 @@
  (def rhyme-trie (into (trie/make-trie) (nippy/thaw-from-file "/tmp/rhyme-trie.bin")))
  )

+
 (defn choice->n-gram
  [{:keys [database]} choice]
  (map database (first choice)))
@ -300,35 +323,40 @@
  ;;     [("T" "AA1" "UW1") #{[("T" "AA1" "UW1") "moonshot"]}]
  ;;     [("T" "AA1")
  ;;      #{[("T" "AA1") "dot"] [("T" "AA1") "pot"] [("T" "AA1") "lot"]}]]
+
+  (rhyme-choices-walking-target-rhyme
+   rhyme-trie
+   (reverse '("UH1" "AA1" "R" "T")))
+
  )

 (defn get-next-markov
  "Weighted selection from markov model with backoff.
  Expects markov key/values to be [k1 k2 k3] [<value> freq]."
  ([markov-trie seed]
-   (get-next-markov markov-trie seed (constantly false)))
-  ([markov-trie seed remove-fn]
+   (get-next-markov markov-trie seed identity))
+  ([markov-trie seed process-children-fn]
   (let [seed (take-last 3 seed)
         node (trie/lookup markov-trie seed)
         children (and node
                       (->> node
                            trie/children
+                            process-children-fn
                            (map (fn [^com.owoga.trie.ITrie child]
                                        ; Get key and frequency of each child
                                   [(.key child)
                                    (get child [])]))
-                            (remove (comp nil? second))
-                            (remove remove-fn)))]
+                            (remove (comp nil? second))))]
     (cond
                                        ; If we've never seen this n-gram, fallback to n-1-gram
-       (nil? node) (recur markov-trie (rest seed) remove-fn)
+       (nil? node) (recur markov-trie (rest seed) process-children-fn)
       (seq children)
       (if (< (rand) (/ (apply max (map (comp second second) children))
                        (apply + (map (comp second second) children))))
-         (recur markov-trie (rest seed) remove-fn)
+         (recur markov-trie (rest seed) process-children-fn)
         (first (math/weighted-selection (comp second second) children)))
       (> (count seed) 0)
-       (recur markov-trie (rest seed) remove-fn)
+       (recur markov-trie (rest seed) process-children-fn)
                                        ; If we have a node but no children, or if we don't have a seed,
                                        ; we don't know how to handle that situation.
       :else (throw (Exception. "Error"))))))
@ -466,12 +494,12 @@
    markov-trie
    n-gram-rank
    target-sentence-syllable-count
-    (constantly false)))
+    identity))
  ([database
    markov-trie
    n-gram-rank
    target-sentence-syllable-count
-    markov-remove-fn]
+    process-markov-children]
   (let [eos (database prhyme/EOS)
         bos (database prhyme/BOS)]
     (loop [phrase []]
@ -488,10 +516,7 @@
                        ; Pad sentence with eos markers since we're working backwards
                        (into (vec (repeat (dec n-gram-rank) eos))
                              (mapv (comp database second) phrase))
-                        ; Remove eos, bos, and forbidden words
-                        (fn [[lookup [word frequency]]]
-                          (or (markov-remove-fn [lookup [word frequency]])
-                              (#{eos bos} word)))))]
+                        process-markov-children))]
             [(phonetics/get-phones word) word]))))))))

 (comment
@ -499,7 +524,23 @@
   database
   markov-trie
   3
-   10)
+   10
+   (fn [children]
+     (remove
+      (fn [child]
+        (let [lookup (.key child)
+              [word freq] (get child [])]
+          (#{(database prhyme/EOS) (database prhyme/BOS)} word)))
+      children)))
+  ;; [[[["HH" "ER1" "T" "S"]] "hurts"]
+  ;;  [[["IH1" "T"] ["IH0" "T"]] "it"]
+  ;;  [[["AH0" "N" "D"] ["AE1" "N" "D"]] "and"]
+  ;;  [[["F" "EY1" "S"]] "face"]
+  ;;  [[["M" "AY1"]] "my"]
+  ;;  [[["AH0" "G" "EH1" "N" "S" "T"] ["AH0" "G" "EY1" "N" "S" "T"]] "against"]
+  ;;  [[["L" "AY1" "F"]] "life"]
+  ;;  [[["M" "AY1"]] "my"]
+  ;;  [[["L" "AY1" "V"] ["L" "IH1" "V"]] "live"]]

  )

@ -525,7 +566,7 @@
    n-gram-rank
    target-rhyme-syllable-count
    target-sentence-syllable-count
-    (constantly false)
+    identity
    identity))
  ([database
    markov-trie
@ -534,16 +575,18 @@
    n-gram-rank
    target-rhyme-syllable-count
    target-sentence-syllable-count
-    markov-remove-fn
-    rhyme-wordset-fn]
+    markov-process-children
+    rhyme-process-words]
   (let [eos (database prhyme/EOS)
         bos (database prhyme/BOS)
         choices (rhyme-choices-walking-target-rhyme
                  rhyme-trie
                  target-rhyme
-                  rhyme-wordset-fn)
-         rhyme (update (rand-nth choices) 1 (comp rand-nth vec))]
-     (loop [phrase [rhyme]]
+                  rhyme-process-words)
+         [rhyming-phones rhyming-word] (update (rand-nth choices) 1 (comp rand-nth vec))
+         ;; The rhyme only has the rhyming phones. Grab full pronunciation.
+         rhyming-word-phones (rand-nth (phonetics/get-phones rhyming-word))]
+     (loop [phrase [[rhyming-word-phones rhyming-word]]]
       (if (<= target-sentence-syllable-count
               (prhyme/count-syllables-of-phrase
                (string/join " " (map second phrase))))
@ -556,11 +599,32 @@
                        markov-trie
                        (into (vec (repeat (dec n-gram-rank) eos))
                              (mapv (comp database second) phrase))
-                        (fn [[lookup [word frequency]]]
-                          (or (markov-remove-fn [lookup [word frequency]])
-                              (#{eos bos} word)))))]
+                        markov-process-children))]
             [(rand-nth (phonetics/get-phones word)) word]))))))))

+(defn make-markov-filter
+  "Specifically works with markovs with entries of the format:
+  [lookup [index freq]]
+  "
+  [words-to-remove]
+  (let [words-to-remove (into #{} words-to-remove)]
+    (fn [children]
+      (remove
+       (fn [child]
+         (let [[word _] (get child [])]
+           (words-to-remove word)))
+       children))))
+
+(defn make-rhyme-filter
+  [words-to-remove]
+  (let [words-to-remove (into #{} words-to-remove)]
+    (fn [rhyming-words]
+      (->> (map (fn [[phones wordset]]
+                  [phones (set/difference wordset words-to-remove)])
+                rhyming-words)
+           (remove (fn [[phones wordset]]
+                     (empty? wordset)))))))
+
 ;;;; Demo
 ;;;;
 (comment
@ -575,27 +639,19 @@
                 3
                 3
                 7
-                 (fn [[lookup [word freq]]]
-                   (= (database "begun") word))
-                 (fn [rhyming-words]
-                   (->> (map (fn [[phones wordset]]
-                               [phones (set/difference wordset #{"begun"})])
-                             rhyming-words)
-                        (remove (fn [[phones wordset]]
-                                  (empty? wordset))))))
-                (map second)
-                reverse))
-         (map (partial string/join " "))))
-  ;; => ("funeral has just begun"
-  ;;     "dead illusion overdone"
-  ;;     "all shout boy for then , outrun"
-  ;;     "and those that turn till rerun"
-  ;;     "heading for a pack of rerun"
-  ;;     "furnace of end outrun"
-  ;;     "the deaths outrun"
-  ;;     "our funeral has begun"
-  ;;     "paper that looking dark rerun"
-  ;;     "walk overdone")
+                 (make-markov-filter (map database ["overdone" "outdone" "rerun" "undone" "</s>" "<s>"]))
+                 (make-rhyme-filter ["begun"]))))))
+  ;; => ("darkness the lost souls will run"
+  ;;     "predominant thunder , scream comes undone"
+  ;;     "only day to come undone"
+  ;;     "i denounce the bad undone"
+  ;;     "me before i will outrun"
+  ;;     "convictions of the just will outrun"
+  ;;     "you mean stained are overdone"
+  ;;     "fight has begun overdone"
+  ;;     "are being skinned keep are one"
+  ;;     "demise , lift you up overdone")
+

  (let [target-rhyme (->(prhyme/phrase->all-flex-rhyme-tailing-consonants-phones
                         "filling")
@ -611,13 +667,68 @@
                 target-rhyme
                 3
                 3
-                 7)
+                 7
+                 )
                (map second)
                reverse))
         (map (partial remove #{prhyme/BOS}))
         (map data-transform/untokenize)))
  )

+(defn take-until
+  "Repeately calls some stateful function until predicate is met."
+  [take? f]
+  (loop [result (f)]
+    (if (take? result)
+      (first (take? result))
+      (recur (f)))))
+
+(comment
+  (let [val (atom 0)]
+    (letfn [(foo []
+              (swap! val inc)
+              @val)]
+      (take-until even? foo)))
+  ;; => 2
+  )
+
+(defn valid-or-best-sentence?
+  [max-iterations]
+  (fn []
+    (let [context (atom {:current-best nil :iteration 0})]
+      (fn [text]
+        (let [sentence (string/join " " (map (comp map second) text))]
+          (swap! context update :iteration inc)
+          (let [current-best (:current-best @context)
+                log-prob (second (nlp/most-likely-parse sentence))]
+            (when (or (nil? current-best)
+                      (> log-prob (nth current-best 1)))
+              (swap! context assoc :current-best [text log-prob]))
+            (if (or (> log-prob -1)
+                    (>= (:iteration @context) max-iterations))
+              (:current-best @context)
+              false)))))))
+
+(def best-of-10 (valid-or-best-sentence? 10))
+
+(comment
+  (take-until (best-of-10) (constantly "my name sky does eat"))
+  )
+
+(comment
+  (take-until
+   (best-of-10)
+   #(->> (tightly-generate-n-syllable-sentence
+          database
+          markov-trie
+          3
+          7
+          (make-markov-filter (map database [prhyme/BOS prhyme/EOS])))
+         (map second)
+         reverse
+         (string/join " ")))
+
+  )
 (defn sentence->phones
  "Sentence is of the format

@ -640,6 +751,7 @@
  "
  [sentence]
  (->> sentence
+       (remove (comp empty? first))  ; Commas have no phones so rand-nth breaks
       (map #(update % 0 rand-nth))
       (apply map vector)
       ((fn [[phones words]]
@ -675,39 +787,38 @@
      :else
      (let [[pattern syllable-count] (first scheme)
            banned-words (into #{} (->> result
-                                        (map (comp last last))))
+                                        (map (comp last first))))
            line (if (nil? (get line-phones pattern))
                                        ; Here, we need to make a choice about which pronunciation
                                        ; we want to use to build line-phones. Choose randomly.
-                   (tightly-generate-n-syllable-sentence
-                    database
-                    markov-trie
-                    3
-                    syllable-count)
-                   (tightly-generate-n-syllable-sentence-rhyming-with
-                    database
-                    markov-trie
-                    rhyme-trie
-                    (take 4 (get line-phones pattern))
-                    3
-                    3
-                    syllable-count
-                    (constantly false)
-                    ;; words-fn
-                    ;; ([("G" "AO1" "B") #{"bog"}] [("G" "AO1" "F") #{"fog"}])
-                    (fn [rhyming-words]
-                      (->> (map (fn [[phones wordset]]
-                                  [phones (->> (set/difference
-                                                wordset
-                                                banned-words))])
-                                rhyming-words)
-                           (remove (fn [[phones wordset]]
-                                     (empty? wordset)))))))
-            rhyme (reverse (sentence->phones line))]
-        (println line)
+                   (take-until
+                    (best-of-10)
+                    #(tightly-generate-n-syllable-sentence
+                      database
+                      markov-trie
+                      3
+                      syllable-count
+                      (make-markov-filter (map database [prhyme/BOS prhyme/EOS]))))
+                   (take-until
+                    (best-of-10)
+                    #(tightly-generate-n-syllable-sentence-rhyming-with
+                      database
+                      markov-trie
+                      rhyme-trie
+                      (reverse
+                       (take-last 4 (prhyme/phones->all-flex-rhyme-tailing-consonants-phones
+                                     (get line-phones pattern))))
+                      3
+                      3
+                      syllable-count
+                      (make-markov-filter (map database [prhyme/BOS prhyme/EOS]))
+                      (make-rhyme-filter banned-words))))
+            rhyme (sentence->phones (reverse line))]
        (recur (rest scheme)
-               (assoc line-phones pattern rhyme)
-               (conj result (reverse line)))))))
+               (if (nil? (get line-phones pattern))
+                 (assoc line-phones pattern rhyme)
+                 line-phones)
+               (conj result line))))))

 (comment
  (tightly-generate-n-syllable-sentence
@ -716,11 +827,16 @@
   3
   10)

-  (rhyme-from-scheme
-   '[[A 9] [A 9] [B 5] [B 5]]
-   database
-   markov-tight-trie
-   rhyme-trie)
+  (repeatedly
+   5
+   #(->> (rhyme-from-scheme
+          '[[A 9] [A 9] [B 5] [B 5] [A 9]]
+          database
+          markov-tight-trie
+          rhyme-trie)
+         (map reverse)
+         (map (partial map second))
+         (map data-transform/untokenize)))

  (trie/lookup markov-tight-trie nil)
  (tightly-generate-n-syllable-sentence-rhyming-with
--- a/src/com/owoga/prhyme/core.clj
+++ b/src/com/owoga/prhyme/core.clj
@ -268,6 +268,12 @@
   #(string/replace % #"[02-9]" "")
   phones))

+(defn phones->all-flex-rhyme-tailing-consonants-phones
+  [phones]
+  (->> phones
+       take-vowels-and-tail-consonants
+       remove-non-primary-stress))
+
 (defn phrase->all-flex-rhyme-tailing-consonants-phones
  "Takes a space-seperated string of words
  and returns the concatenation of the words
--- a/src/com/owoga/prhyme/data_transform.clj
+++ b/src/com/owoga/prhyme/data_transform.clj
@ -45,8 +45,9 @@
 (defn untokenize
  [coll]
  (->> coll
-       (map #(string/join " " %))
-       (map #(string/replace % #" (['\-,\?\.] ?)" "$1"))))
+       (string/join " ")
+       (#(string/replace % #" (['\-,\?\.] ?)" "$1"))))
+
 (def xf-untokenize
  (comp
   (map #(string/join " " %))
--- a/src/com/owoga/prhyme/nlp/core.clj
+++ b/src/com/owoga/prhyme/nlp/core.clj
@ -87,12 +87,8 @@
    (string/split results #"\n")))

 (comment
-  (- (Math/log 0.001) (Math/log 0.01))
-  (Math/E)
  (tokenize "Eric's testing.")

-  (Math/log 0.9999)
-  (Math/pow Math/E -0.5)
  (let [results (StringBuffer.)
        parses (ParserTool/parseLine "The dog ran fast ." custom-parser 1)]
    ((juxt parse-probs parse-strs) parses))
@ -122,19 +118,31 @@

  )

-(Math/log (Math/pow Math/E -4.1))
 (defn parse-top-n [tokenized n]
  (let [results (StringBuffer.)
        parses (ParserTool/parseLine tokenized custom-parser n)]
    (apply map vector ((juxt parse-strs parse-probs) parses))))

+(defn most-likely-parse
+  [tokenized]
+  (let [results (StringBuffer.)
+        parses (ParserTool/parseLine tokenized custom-parser 1)]
+    (->> ((juxt parse-strs parse-probs) parses)
+         (map first))))
+
 (comment
  (let [phrase "The feeling hurts."]
    (->> phrase
-        tokenize
-        (string/join " ")
-        (#(parse-top-n % 10))))
-  (Math/pow Math/E -0.96)
+         tokenize
+         (string/join " ")
+         (#(parse-top-n % 100))))
+
+  (let [phrase "The feeling hurts."]
+    (->> phrase
+         tokenize
+         (string/join " ")
+         (#(most-likely-parse %))))
+
  )

 (defn deep-merge-with [f & maps]
@ -151,12 +159,19 @@
  ;; => {:a 1, :b {:b 7}, :c 3}
  )

+(defn likely-sentence?
+  [text]
+  (->> text
+       tokenize
+       (string/join " ")
+       (#(parse-top-n % 100)))
+  )

 (defn valid-sentence?
  "Tokenizes and parses the phrase using OpenNLP models from
  http://opennlp.sourceforge.net/models-1.5/

-  If the parse tree has an clause as the top-level tag, then
+  If the parse tree has a clause as the top-level tag, then
  we consider it a valid English sentence."
  [phrase]
  (->> phrase
@ -176,8 +191,8 @@
  (->> "the lazy fox"
       vector
       parse)
-
  )
+
 (defn unmake-tree
  "Tokenizing and then parsing a sentence returns a string
  representation of the parse tree. This is a helper function