Add training functions

4 years ago · a00f30fabe
parent 69fb02db65
commit a00f30fabe
3 changed files with 145 additions and 59 deletions
--- a/src/com/owoga/corpus/markov.clj
+++ b/src/com/owoga/corpus/markov.clj
@ -8,7 +8,8 @@
            [com.owoga.tightly-packed-trie :as tpt]
            [clojure.string :as string]
            [clojure.java.io :as io]
-            [com.owoga.phonetics :as phonetics]))
+            [com.owoga.phonetics :as phonetics]
            [taoensso.nippy :as nippy]))
 (defn clean-text [text]
  (string/lower-case (string/replace text #"[^a-zA-Z'\-\s]" "")))
@ -294,7 +295,8 @@
    (map (partial transduce data-transform/xf-tokenize conj))
    (map (partial transduce data-transform/xf-filter-english conj))
    (map (partial remove empty?))
-    (map (partial map reverse))
+    (map (partial map (comp vec reverse)))
    ;; xf-pad-tokens works on vectors due to `into`
    (map (partial into [] (data-transform/xf-pad-tokens (dec m) "</s>" 1 "<s>")))
    (map (partial mapcat (partial data-transform/n-to-m-partitions n (inc m))))
    (mapcat (partial mapv (data-transform/make-database-processor database))))
@ -332,43 +334,49 @@
  )
 (defn train-backwards
  "For building lines backwards so they can be seeded with a target rhyme."
  [files n m trie-filepath database-filepath]
  (let [database (atom {:next-id 0})
        trie (file-seq->backwards-markov-trie database files n m)]
    (nippy/freeze-to-file trie-filepath (seq trie))
    (nippy/freeze-to-file database-filepath @database)
    (let [loaded-trie (->> trie-filepath
                           nippy/thaw-from-file
                           (into (trie/make-trie)))
          loaded-db (->> database-filepath
                         nippy/thaw-from-file)]
      (println "Successfully loaded trie and database.")
      (println (take 5 loaded-trie))
      (println (take 5 loaded-db)))))
 (comment
  (time
   (let [files (->> "dark-corpus"
                    io/file
                    file-seq
                    (eduction (xf-file-seq 0 4000)))
         [trie database] (train-backwards files 1 4 "/tmp/trie.bin" "/tmp/database.bin")]))
  )
-(defn initialize
+(defn gen-rhyme-model
-  "Takes an atom as a context. Swaps in :database, :trie, :rhyme-trie"
+  [rhyme-type-fn database database-filepath]
-  [context]
+  (let [words (filter string? (keys @database))
        rhyme-trie (prhyme/words->rhyme-trie rhyme-type-fn words)]
    (nippy/freeze-to-file database-filepath (seq rhyme-trie))
    (let [loaded-trie (->> (nippy/thaw-from-file database-filepath)
                           (into (trie/make-trie)))]
      (println "Successfully loaded rhyme model")
      (println (take 5 loaded-trie)))))
-  (swap!
+(comment
-   context
+  (let [database (atom (nippy/thaw-from-file "/tmp/database.edn"))]
-   assoc
+    (gen-rhyme-model prhyme/phrase->all-flex-rhyme-tailing-consonants-phones database "/tmp/rhyme-trie.bin"))
-   :rhyme-trie
+  (def rt (into (trie/make-trie) (nippy/thaw-from-file "/tmp/rhyme-trie.bin")))
   (transduce
    (comp
     (map first)
     (filter string?)
     (map (fn [word]
            (let [phones-coll (phonetics/get-phones)]
              (map
               #(vector (reverse (phonetics/get-phones %)) word)
               phones-coll)))))
    (completing
     (fn [trie [k v]]
       (update trie k (fnil #(update % 1 inc) [v 0]))))
    (trie/make-trie)
    (@context :database)))
-  (swap!
+  (take 5 rt)
-   context
+
-   assoc
+  (prhyme/phrase->all-flex-rhyme-tailing-consonants-phones "brasilia")
-   :flex-rhyme-trie
+  (phonetics/get-phones "brasilia")
-   (transduce
+
-    (comp
+  )
     (map first)
     (filter string?)
     (map #(vector (reverse (prhyme/phrase->flex-rhyme-phones %)) %)))
    (completing
     (fn [trie [k v]]
       (update trie k (fnil conj [v]) v)))
    (trie/make-trie)
    (@context :database)))
  nil)
--- a/src/com/owoga/prhyme/core.clj
+++ b/src/com/owoga/prhyme/core.clj
@ -159,6 +159,9 @@
  ;;     ("P") nil}
  )
 ;;;; Flex rhymes maintain primary stress and ignore all other stress.
 ;;;;
 (defn phrase->all-flex-rhyme-phones
  "Takes a space-seperated string of words
  and returns the concatenation of the words
@ -167,7 +170,9 @@
  Returns them in reversed order so they
  are ready to be used in a lookup of a rhyme trie.
-  Returns all possible pronunciations. For hog -> haog, haag."
+  Returns all possible pronunciations. For hog -> haog, haag.
  ROBOT -> '(OW1 AA)"
  [phrase]
  (->> phrase
       (#(string/split % #" "))
@ -216,27 +221,100 @@
  (let [result (map phrase->all-flex-rhyme-phones ["dog" "hog" "hop" "bog hopscotch"])]
    result)
-  (let [phrase '(([["B" "AA1" "G"] "bog"] [["B" "AO1" "G"] "bog"])
+  )
-                 ([["S" "K" "AA1" "CH"] "scotch"]))]
+
-    (apply combinatorics/cartesian-product phrase))
+
 (defn vowel?-ignoring-stress
  [phone]
  (phonetics/vowel (string/replace phone #"\d" "")))
 (defn take-vowels-and-tail-consonants
  "HOPSCOTCH -> AA1 AA2 CH
  Useful for finding rhymes where the last vowel and tailing consonants
  are the same and preceding vowels are the same."
  ([phones]
   (take-vowels-and-tail-consonants phones '() false))
  ([phones result taken-vowel?]
   (cond
     (empty? phones) result
     (and taken-vowel? (not (vowel?-ignoring-stress (last phones))))
     (recur (butlast phones) result taken-vowel?)
     (vowel?-ignoring-stress (last phones))
     (recur (butlast phones) (cons (last phones) result) true)
     :else (recur (butlast phones) (cons (last phones) result) taken-vowel?))))
-  (phonetics/get-phones "bog")
+(comment
  (take-vowels-and-tail-consonants (first (phonetics/get-phones "robot")))
  ;; => ("OW1" "AA2" "T")
  )
-#_(defn phrase->flex-rhyme-trie
+(defn remove-non-primary-stress
  [phones]
  (map
   #(string/replace % #"[02-9]" "")
   phones))
 (defn phrase->all-flex-rhyme-tailing-consonants-phones
  "Takes a space-seperated string of words
  and returns the concatenation of the words
  vowel phones.
  Returns them in reversed order so they
  are ready to be used in a lookup of a rhyme trie.
  Returns all possible pronunciations. For hog -> haog, haag.
  ROBOT -> '(OW1 AA2 T)"
  [phrase]
-  (transduce
+  (->> phrase
-   (comp
+       (#(string/split % #" "))
-    (map #(vector (map reverse (phonetics/get-phones %)) %)))
+       (map (fn [word]
-   (completing
+              (let [phones (phonetics/get-phones word)]
-    (fn [trie [lookups v]]
+                (map #(vector % word) phones))))
       ;; Lots of nesting here.
       ;; We have phrase -> word pronunciations -> word pronunciation -> [phones word]
       ;; The rest will be easier if we get rid of a level of nesting
       ;; by mapcatting the cross product of pronunciations.
       (apply combinatorics/cartesian-product)
       ;; Now we have [phrases [pronunciations [[phones] word]]]
       (map (partial apply map vector))
       (map (fn [[phones words]]
              [(apply concat phones)
               (string/join " " words)]))
       (map #(update % 0 take-vowels-and-tail-consonants))
       (map #(update % 0 remove-non-primary-stress))))
 (comment
  (phrase->all-flex-rhyme-tailing-consonants-phones "robot")
  ;; => ([("OW1" "AA" "T") "robot"] [("OW1" "AH" "T") "robot"])
  )
 (defn words->rhyme-trie
  [rhyme-type-fn words]
  (->> words
       (mapcat rhyme-type-fn)
       (map #(update % 0 reverse))
       (reduce
-       (fn [trie lookup]
+        (fn [trie [phones word]]
-         (update trie lookup (fnil #(update % 1 inc) [v 0])))
+          (update trie phones conj word))
-       trie
+        (trie/make-trie))))
-       lookups)))
+
-   (trie/make-trie)
+
-   words))
+(comment
  (let [words ["tightnit" "tarpit"]
        trie (words->rhyme-trie phrase->all-flex-rhyme-tailing-consonants-phones words)]
    (->> (trie/lookup trie ["AA"]))
    trie)
  ;; => {("T" "IH1" "AA") ("tarpit"),
  ;;     ("T" "IH1" "AY1") ("tightnit"),
  ;;     ("T" "IH1") nil,
  ;;     ("T") nil}
  )
 (defn phrase->flex-rhyme-phones
  "Takes a space-seperated string of words
--- a/src/com/owoga/prhyme/data_transform.clj
+++ b/src/com/owoga/prhyme/data_transform.clj
@ -116,7 +116,7 @@
                            id
                            (new-key database key)))
                        lookup)]
-      [lookup' v])))
+      lookup')))
 (comment
  ;; TODO: Move to nlp.core