From a00f30fabe4b642b61362ca8b316e3b342add7f6 Mon Sep 17 00:00:00 2001 From: Eric Ihli Date: Mon, 21 Jun 2021 09:03:39 -0500 Subject: [PATCH] Add training functions --- src/com/owoga/corpus/markov.clj | 88 +++++++++--------- src/com/owoga/prhyme/core.clj | 114 ++++++++++++++++++++---- src/com/owoga/prhyme/data_transform.clj | 2 +- 3 files changed, 145 insertions(+), 59 deletions(-) diff --git a/src/com/owoga/corpus/markov.clj b/src/com/owoga/corpus/markov.clj index 1d11720..35a1fa8 100644 --- a/src/com/owoga/corpus/markov.clj +++ b/src/com/owoga/corpus/markov.clj @@ -8,7 +8,8 @@ [com.owoga.tightly-packed-trie :as tpt] [clojure.string :as string] [clojure.java.io :as io] - [com.owoga.phonetics :as phonetics])) + [com.owoga.phonetics :as phonetics] + [taoensso.nippy :as nippy])) (defn clean-text [text] (string/lower-case (string/replace text #"[^a-zA-Z'\-\s]" ""))) @@ -294,7 +295,8 @@ (map (partial transduce data-transform/xf-tokenize conj)) (map (partial transduce data-transform/xf-filter-english conj)) (map (partial remove empty?)) - (map (partial map reverse)) + (map (partial map (comp vec reverse))) + ;; xf-pad-tokens works on vectors due to `into` (map (partial into [] (data-transform/xf-pad-tokens (dec m) "" 1 ""))) (map (partial mapcat (partial data-transform/n-to-m-partitions n (inc m)))) (mapcat (partial mapv (data-transform/make-database-processor database)))) @@ -332,43 +334,49 @@ ) +(defn train-backwards + "For building lines backwards so they can be seeded with a target rhyme." + [files n m trie-filepath database-filepath] + (let [database (atom {:next-id 0}) + trie (file-seq->backwards-markov-trie database files n m)] + (nippy/freeze-to-file trie-filepath (seq trie)) + (nippy/freeze-to-file database-filepath @database) + (let [loaded-trie (->> trie-filepath + nippy/thaw-from-file + (into (trie/make-trie))) + loaded-db (->> database-filepath + nippy/thaw-from-file)] + (println "Successfully loaded trie and database.") + (println (take 5 loaded-trie)) + (println (take 5 loaded-db))))) +(comment + (time + (let [files (->> "dark-corpus" + io/file + file-seq + (eduction (xf-file-seq 0 4000))) + [trie database] (train-backwards files 1 4 "/tmp/trie.bin" "/tmp/database.bin")])) + ) + +(defn gen-rhyme-model + [rhyme-type-fn database database-filepath] + (let [words (filter string? (keys @database)) + rhyme-trie (prhyme/words->rhyme-trie rhyme-type-fn words)] + (nippy/freeze-to-file database-filepath (seq rhyme-trie)) + (let [loaded-trie (->> (nippy/thaw-from-file database-filepath) + (into (trie/make-trie)))] + (println "Successfully loaded rhyme model") + (println (take 5 loaded-trie))))) + +(comment + (let [database (atom (nippy/thaw-from-file "/tmp/database.edn"))] + (gen-rhyme-model prhyme/phrase->all-flex-rhyme-tailing-consonants-phones database "/tmp/rhyme-trie.bin")) + (def rt (into (trie/make-trie) (nippy/thaw-from-file "/tmp/rhyme-trie.bin"))) + + (take 5 rt) -(defn initialize - "Takes an atom as a context. Swaps in :database, :trie, :rhyme-trie" - [context] - - (swap! - context - assoc - :rhyme-trie - (transduce - (comp - (map first) - (filter string?) - (map (fn [word] - (let [phones-coll (phonetics/get-phones)] - (map - #(vector (reverse (phonetics/get-phones %)) word) - phones-coll))))) - (completing - (fn [trie [k v]] - (update trie k (fnil #(update % 1 inc) [v 0])))) - (trie/make-trie) - (@context :database))) - - (swap! - context - assoc - :flex-rhyme-trie - (transduce - (comp - (map first) - (filter string?) - (map #(vector (reverse (prhyme/phrase->flex-rhyme-phones %)) %))) - (completing - (fn [trie [k v]] - (update trie k (fnil conj [v]) v))) - (trie/make-trie) - (@context :database))) - nil) + (prhyme/phrase->all-flex-rhyme-tailing-consonants-phones "brasilia") + (phonetics/get-phones "brasilia") + + ) diff --git a/src/com/owoga/prhyme/core.clj b/src/com/owoga/prhyme/core.clj index 600727f..4e5b641 100644 --- a/src/com/owoga/prhyme/core.clj +++ b/src/com/owoga/prhyme/core.clj @@ -159,6 +159,9 @@ ;; ("P") nil} ) +;;;; Flex rhymes maintain primary stress and ignore all other stress. +;;;; + (defn phrase->all-flex-rhyme-phones "Takes a space-seperated string of words and returns the concatenation of the words @@ -167,7 +170,9 @@ Returns them in reversed order so they are ready to be used in a lookup of a rhyme trie. - Returns all possible pronunciations. For hog -> haog, haag." + Returns all possible pronunciations. For hog -> haog, haag. + + ROBOT -> '(OW1 AA)" [phrase] (->> phrase (#(string/split % #" ")) @@ -216,27 +221,100 @@ (let [result (map phrase->all-flex-rhyme-phones ["dog" "hog" "hop" "bog hopscotch"])] result) - (let [phrase '(([["B" "AA1" "G"] "bog"] [["B" "AO1" "G"] "bog"]) - ([["S" "K" "AA1" "CH"] "scotch"]))] - (apply combinatorics/cartesian-product phrase)) + ) + + +(defn vowel?-ignoring-stress + [phone] + (phonetics/vowel (string/replace phone #"\d" ""))) + +(defn take-vowels-and-tail-consonants + "HOPSCOTCH -> AA1 AA2 CH + + Useful for finding rhymes where the last vowel and tailing consonants + are the same and preceding vowels are the same." + ([phones] + (take-vowels-and-tail-consonants phones '() false)) + ([phones result taken-vowel?] + (cond + (empty? phones) result + (and taken-vowel? (not (vowel?-ignoring-stress (last phones)))) + (recur (butlast phones) result taken-vowel?) + (vowel?-ignoring-stress (last phones)) + (recur (butlast phones) (cons (last phones) result) true) + :else (recur (butlast phones) (cons (last phones) result) taken-vowel?)))) - (phonetics/get-phones "bog") +(comment + (take-vowels-and-tail-consonants (first (phonetics/get-phones "robot"))) + ;; => ("OW1" "AA2" "T") ) -#_(defn phrase->flex-rhyme-trie +(defn remove-non-primary-stress + [phones] + (map + #(string/replace % #"[02-9]" "") + phones)) + +(defn phrase->all-flex-rhyme-tailing-consonants-phones + "Takes a space-seperated string of words + and returns the concatenation of the words + vowel phones. + + Returns them in reversed order so they + are ready to be used in a lookup of a rhyme trie. + + Returns all possible pronunciations. For hog -> haog, haag. + + ROBOT -> '(OW1 AA2 T)" [phrase] - (transduce - (comp - (map #(vector (map reverse (phonetics/get-phones %)) %))) - (completing - (fn [trie [lookups v]] - (reduce - (fn [trie lookup] - (update trie lookup (fnil #(update % 1 inc) [v 0]))) - trie - lookups))) - (trie/make-trie) - words)) + (->> phrase + (#(string/split % #" ")) + (map (fn [word] + (let [phones (phonetics/get-phones word)] + (map #(vector % word) phones)))) + ;; Lots of nesting here. + ;; We have phrase -> word pronunciations -> word pronunciation -> [phones word] + ;; The rest will be easier if we get rid of a level of nesting + ;; by mapcatting the cross product of pronunciations. + (apply combinatorics/cartesian-product) + ;; Now we have [phrases [pronunciations [[phones] word]]] + (map (partial apply map vector)) + (map (fn [[phones words]] + [(apply concat phones) + (string/join " " words)])) + (map #(update % 0 take-vowels-and-tail-consonants)) + (map #(update % 0 remove-non-primary-stress)))) + +(comment + (phrase->all-flex-rhyme-tailing-consonants-phones "robot") + ;; => ([("OW1" "AA" "T") "robot"] [("OW1" "AH" "T") "robot"]) + ) + + +(defn words->rhyme-trie + [rhyme-type-fn words] + (->> words + (mapcat rhyme-type-fn) + (map #(update % 0 reverse)) + (reduce + (fn [trie [phones word]] + (update trie phones conj word)) + (trie/make-trie)))) + + +(comment + (let [words ["tightnit" "tarpit"] + trie (words->rhyme-trie phrase->all-flex-rhyme-tailing-consonants-phones words)] + (->> (trie/lookup trie ["AA"])) + trie) + ;; => {("T" "IH1" "AA") ("tarpit"), + ;; ("T" "IH1" "AY1") ("tightnit"), + ;; ("T" "IH1") nil, + ;; ("T") nil} + + ) + + (defn phrase->flex-rhyme-phones "Takes a space-seperated string of words diff --git a/src/com/owoga/prhyme/data_transform.clj b/src/com/owoga/prhyme/data_transform.clj index 3ff5a60..b99ccfa 100644 --- a/src/com/owoga/prhyme/data_transform.clj +++ b/src/com/owoga/prhyme/data_transform.clj @@ -116,7 +116,7 @@ id (new-key database key))) lookup)] - [lookup' v]))) + lookup'))) (comment ;; TODO: Move to nlp.core