Add training functions

main
Eric Ihli 3 years ago
parent 69fb02db65
commit a00f30fabe

@ -8,7 +8,8 @@
[com.owoga.tightly-packed-trie :as tpt] [com.owoga.tightly-packed-trie :as tpt]
[clojure.string :as string] [clojure.string :as string]
[clojure.java.io :as io] [clojure.java.io :as io]
[com.owoga.phonetics :as phonetics])) [com.owoga.phonetics :as phonetics]
[taoensso.nippy :as nippy]))
(defn clean-text [text] (defn clean-text [text]
(string/lower-case (string/replace text #"[^a-zA-Z'\-\s]" ""))) (string/lower-case (string/replace text #"[^a-zA-Z'\-\s]" "")))
@ -294,7 +295,8 @@
(map (partial transduce data-transform/xf-tokenize conj)) (map (partial transduce data-transform/xf-tokenize conj))
(map (partial transduce data-transform/xf-filter-english conj)) (map (partial transduce data-transform/xf-filter-english conj))
(map (partial remove empty?)) (map (partial remove empty?))
(map (partial map reverse)) (map (partial map (comp vec reverse)))
;; xf-pad-tokens works on vectors due to `into`
(map (partial into [] (data-transform/xf-pad-tokens (dec m) "</s>" 1 "<s>"))) (map (partial into [] (data-transform/xf-pad-tokens (dec m) "</s>" 1 "<s>")))
(map (partial mapcat (partial data-transform/n-to-m-partitions n (inc m)))) (map (partial mapcat (partial data-transform/n-to-m-partitions n (inc m))))
(mapcat (partial mapv (data-transform/make-database-processor database)))) (mapcat (partial mapv (data-transform/make-database-processor database))))
@ -332,43 +334,49 @@
) )
(defn train-backwards
"For building lines backwards so they can be seeded with a target rhyme."
[files n m trie-filepath database-filepath]
(let [database (atom {:next-id 0})
trie (file-seq->backwards-markov-trie database files n m)]
(nippy/freeze-to-file trie-filepath (seq trie))
(nippy/freeze-to-file database-filepath @database)
(let [loaded-trie (->> trie-filepath
nippy/thaw-from-file
(into (trie/make-trie)))
loaded-db (->> database-filepath
nippy/thaw-from-file)]
(println "Successfully loaded trie and database.")
(println (take 5 loaded-trie))
(println (take 5 loaded-db)))))
(comment
(time
(let [files (->> "dark-corpus"
io/file
file-seq
(eduction (xf-file-seq 0 4000)))
[trie database] (train-backwards files 1 4 "/tmp/trie.bin" "/tmp/database.bin")]))
)
(defn initialize (defn gen-rhyme-model
"Takes an atom as a context. Swaps in :database, :trie, :rhyme-trie" [rhyme-type-fn database database-filepath]
[context] (let [words (filter string? (keys @database))
rhyme-trie (prhyme/words->rhyme-trie rhyme-type-fn words)]
(nippy/freeze-to-file database-filepath (seq rhyme-trie))
(let [loaded-trie (->> (nippy/thaw-from-file database-filepath)
(into (trie/make-trie)))]
(println "Successfully loaded rhyme model")
(println (take 5 loaded-trie)))))
(swap! (comment
context (let [database (atom (nippy/thaw-from-file "/tmp/database.edn"))]
assoc (gen-rhyme-model prhyme/phrase->all-flex-rhyme-tailing-consonants-phones database "/tmp/rhyme-trie.bin"))
:rhyme-trie (def rt (into (trie/make-trie) (nippy/thaw-from-file "/tmp/rhyme-trie.bin")))
(transduce
(comp
(map first)
(filter string?)
(map (fn [word]
(let [phones-coll (phonetics/get-phones)]
(map
#(vector (reverse (phonetics/get-phones %)) word)
phones-coll)))))
(completing
(fn [trie [k v]]
(update trie k (fnil #(update % 1 inc) [v 0]))))
(trie/make-trie)
(@context :database)))
(swap! (take 5 rt)
context
assoc (prhyme/phrase->all-flex-rhyme-tailing-consonants-phones "brasilia")
:flex-rhyme-trie (phonetics/get-phones "brasilia")
(transduce
(comp )
(map first)
(filter string?)
(map #(vector (reverse (prhyme/phrase->flex-rhyme-phones %)) %)))
(completing
(fn [trie [k v]]
(update trie k (fnil conj [v]) v)))
(trie/make-trie)
(@context :database)))
nil)

@ -159,6 +159,9 @@
;; ("P") nil} ;; ("P") nil}
) )
;;;; Flex rhymes maintain primary stress and ignore all other stress.
;;;;
(defn phrase->all-flex-rhyme-phones (defn phrase->all-flex-rhyme-phones
"Takes a space-seperated string of words "Takes a space-seperated string of words
and returns the concatenation of the words and returns the concatenation of the words
@ -167,7 +170,9 @@
Returns them in reversed order so they Returns them in reversed order so they
are ready to be used in a lookup of a rhyme trie. are ready to be used in a lookup of a rhyme trie.
Returns all possible pronunciations. For hog -> haog, haag." Returns all possible pronunciations. For hog -> haog, haag.
ROBOT -> '(OW1 AA)"
[phrase] [phrase]
(->> phrase (->> phrase
(#(string/split % #" ")) (#(string/split % #" "))
@ -216,27 +221,100 @@
(let [result (map phrase->all-flex-rhyme-phones ["dog" "hog" "hop" "bog hopscotch"])] (let [result (map phrase->all-flex-rhyme-phones ["dog" "hog" "hop" "bog hopscotch"])]
result) result)
(let [phrase '(([["B" "AA1" "G"] "bog"] [["B" "AO1" "G"] "bog"]) )
([["S" "K" "AA1" "CH"] "scotch"]))]
(apply combinatorics/cartesian-product phrase))
(defn vowel?-ignoring-stress
[phone]
(phonetics/vowel (string/replace phone #"\d" "")))
(defn take-vowels-and-tail-consonants
"HOPSCOTCH -> AA1 AA2 CH
Useful for finding rhymes where the last vowel and tailing consonants
are the same and preceding vowels are the same."
([phones]
(take-vowels-and-tail-consonants phones '() false))
([phones result taken-vowel?]
(cond
(empty? phones) result
(and taken-vowel? (not (vowel?-ignoring-stress (last phones))))
(recur (butlast phones) result taken-vowel?)
(vowel?-ignoring-stress (last phones))
(recur (butlast phones) (cons (last phones) result) true)
:else (recur (butlast phones) (cons (last phones) result) taken-vowel?))))
(phonetics/get-phones "bog") (comment
(take-vowels-and-tail-consonants (first (phonetics/get-phones "robot")))
;; => ("OW1" "AA2" "T")
) )
#_(defn phrase->flex-rhyme-trie (defn remove-non-primary-stress
[phones]
(map
#(string/replace % #"[02-9]" "")
phones))
(defn phrase->all-flex-rhyme-tailing-consonants-phones
"Takes a space-seperated string of words
and returns the concatenation of the words
vowel phones.
Returns them in reversed order so they
are ready to be used in a lookup of a rhyme trie.
Returns all possible pronunciations. For hog -> haog, haag.
ROBOT -> '(OW1 AA2 T)"
[phrase] [phrase]
(transduce (->> phrase
(comp (#(string/split % #" "))
(map #(vector (map reverse (phonetics/get-phones %)) %))) (map (fn [word]
(completing (let [phones (phonetics/get-phones word)]
(fn [trie [lookups v]] (map #(vector % word) phones))))
;; Lots of nesting here.
;; We have phrase -> word pronunciations -> word pronunciation -> [phones word]
;; The rest will be easier if we get rid of a level of nesting
;; by mapcatting the cross product of pronunciations.
(apply combinatorics/cartesian-product)
;; Now we have [phrases [pronunciations [[phones] word]]]
(map (partial apply map vector))
(map (fn [[phones words]]
[(apply concat phones)
(string/join " " words)]))
(map #(update % 0 take-vowels-and-tail-consonants))
(map #(update % 0 remove-non-primary-stress))))
(comment
(phrase->all-flex-rhyme-tailing-consonants-phones "robot")
;; => ([("OW1" "AA" "T") "robot"] [("OW1" "AH" "T") "robot"])
)
(defn words->rhyme-trie
[rhyme-type-fn words]
(->> words
(mapcat rhyme-type-fn)
(map #(update % 0 reverse))
(reduce (reduce
(fn [trie lookup] (fn [trie [phones word]]
(update trie lookup (fnil #(update % 1 inc) [v 0]))) (update trie phones conj word))
trie (trie/make-trie))))
lookups)))
(trie/make-trie)
words)) (comment
(let [words ["tightnit" "tarpit"]
trie (words->rhyme-trie phrase->all-flex-rhyme-tailing-consonants-phones words)]
(->> (trie/lookup trie ["AA"]))
trie)
;; => {("T" "IH1" "AA") ("tarpit"),
;; ("T" "IH1" "AY1") ("tightnit"),
;; ("T" "IH1") nil,
;; ("T") nil}
)
(defn phrase->flex-rhyme-phones (defn phrase->flex-rhyme-phones
"Takes a space-seperated string of words "Takes a space-seperated string of words

@ -116,7 +116,7 @@
id id
(new-key database key))) (new-key database key)))
lookup)] lookup)]
[lookup' v]))) lookup')))
(comment (comment
;; TODO: Move to nlp.core ;; TODO: Move to nlp.core

Loading…
Cancel
Save