Better rhymetrie API

main
Eric Ihli 3 years ago
parent 3fe69d679b
commit c6881b361d

@ -76,12 +76,6 @@
(string/join "\n") (string/join "\n")
(string/trim))])))) (string/trim))]))))
(defn english? [text]
(let [words (string/split text #"\s+")
english-words
(->> words (filter #(util/words-map (string/lower-case %))))]
(< 0.7 (/ (count english-words) (count words)))))
(defn scrape (defn scrape
([base-url] ([base-url]
(scrape (drop 10 (parse-letters-urls (fetch-url base-url))) '() '())) (scrape (drop 10 (parse-letters-urls (fetch-url base-url))) '() '()))

@ -5,7 +5,6 @@
[com.owoga.prhyme.data.dictionary :as dict] [com.owoga.prhyme.data.dictionary :as dict]
[com.owoga.prhyme.util :as util] [com.owoga.prhyme.util :as util]
[com.owoga.prhyme.core :as prhyme] [com.owoga.prhyme.core :as prhyme]
[com.owoga.prhyme.util.nlp :as nlp]
[com.owoga.prhyme.generation.weighted-selection :as weighted-selection] [com.owoga.prhyme.generation.weighted-selection :as weighted-selection]
[taoensso.tufte :as tufte :refer [defnp p profiled profile]] [taoensso.tufte :as tufte :refer [defnp p profiled profile]]
[clojure.java.io :as io])) [clojure.java.io :as io]))

@ -180,7 +180,9 @@
(let [database (atom {:next-id 1}) (let [database (atom {:next-id 1})
trie (file-seq->backwards-markov-trie database files n m)] trie (file-seq->backwards-markov-trie database files n m)]
(nippy/freeze-to-file trie-filepath (seq trie)) (nippy/freeze-to-file trie-filepath (seq trie))
(println "Froze" trie-filepath)
(nippy/freeze-to-file database-filepath @database) (nippy/freeze-to-file database-filepath @database)
(println "Froze" database-filepath)
(save-tightly-packed-trie trie database tightly-packed-trie-filepath) (save-tightly-packed-trie trie database tightly-packed-trie-filepath)
(let [loaded-trie (->> trie-filepath (let [loaded-trie (->> trie-filepath
nippy/thaw-from-file nippy/thaw-from-file
@ -200,18 +202,25 @@
(let [files (->> "dark-corpus" (let [files (->> "dark-corpus"
io/file io/file
file-seq file-seq
(eduction (xf-file-seq 0 1000))) (eduction (xf-file-seq 0 250000)))
[trie database] (train-backwards [trie database] (train-backwards
files files
1 1
5 5
"/tmp/markov-trie-4-gram-backwards.bin" "/home/eihli/.models/markov-trie-4-gram-backwards.bin"
"/tmp/markov-database-4-gram-backwards.bin" "/home/eihli/.models/markov-database-4-gram-backwards.bin"
"/tmp/markov-tightly-packed-trie-4-gram-backwards.bin")])) "/home/eihli/.models/markov-tightly-packed-trie-4-gram-backwards.bin")]))
(def markov-trie (into (trie/make-trie) (nippy/thaw-from-file "/tmp/trie.bin"))) (time
(def database (nippy/thaw-from-file "/tmp/database.bin")) (def markov-trie (into (trie/make-trie) (nippy/thaw-from-file "/home/eihli/.models/markov-trie-4-gram-backwards.bin"))))
(def markov-tight-trie (tpt/tightly-packed-trie markov-trie encode-fn (decode-fn database))) (time
(def database (nippy/thaw-from-file "/home/eihli/.models/markov-database-4-gram-backwards.bin")))
(time
(def markov-tight-trie
(tpt/load-tightly-packed-trie-from-file
"/home/eihli/.models/markov-tightly-packed-trie-4-gram-backwards.bin"
(decode-fn database))))
(take 20 markov-tight-trie)
) )
@ -227,21 +236,22 @@
(comment (comment
(time (time
(let [database (atom (nippy/thaw-from-file "/tmp/database.bin"))] (let [database (atom (nippy/thaw-from-file "/home/eihli/.models/markov-database-4-gram-backwards.bin"))]
(gen-rhyme-model (gen-rhyme-model
prhyme/phrase->all-flex-rhyme-tailing-consonants-phones prhyme/phrase->all-flex-rhyme-tailing-consonants-phones
database database
"/tmp/rhyme-trie-primary-stressed-vowels-and-trailing-consonants.bin") "/home/eihli/.models/rhyme-trie-primary-stressed-vowels-and-trailing-consonants.bin")
(gen-rhyme-model (gen-rhyme-model
prhyme/phrase->unstressed-vowels-and-tailing-consonants prhyme/phrase->unstressed-vowels-and-tailing-consonants
database database
"/tmp/rhyme-trie-unstressed-vowels-and-trailing-consonants.bin"))) "/home/eihli/.models/rhyme-trie-unstressed-vowels-and-trailing-consonants.bin")))
(def rhyme-trie (def rhyme-trie
(into (into
(trie/make-trie) (trie/make-trie)
(nippy/thaw-from-file (nippy/thaw-from-file
"/tmp/rhyme-trie-primary-stressed-vowels-and-trailing-consonants.bin"))) "/home/eihli/.models/rhyme-trie-unstressed-vowels-and-trailing-consonants.bin")))
) )
@ -302,6 +312,35 @@
) )
(defn rhyme-choices-walking-target-rhyme-with-stop
"All target rhymes need to be in phone form.
`target-rhyme`: [N UH1 F]
If we try to turn string form into phone form,
we'd sometimes be forced to deal with multiple pronunciations.
By only handling phone form here, the caller can handle multiple pronunciations.
Makes for a cleaner API.
`words-fn` gets passed the result of `rhyme-choices` which has this structures
([(G AO1 B) bog] [(G AO1 F) fog])
`stop?` gets passed the remaining target-rhyme phones and the current choices.
"
([trie stop? target-rhyme]
(rhyme-choices-walking-target-rhyme-with-stop
trie
stop?
target-rhyme
identity))
([trie stop? target-rhyme words-fn]
(loop [target-rhyme target-rhyme
result []]
(let [choices (words-fn (rhyme-choices trie target-rhyme))]
(if (stop? target-rhyme choices)
(into result choices)
(recur (butlast target-rhyme)
(into result choices)))))))
(defn rhyme-choices-walking-target-rhyme (defn rhyme-choices-walking-target-rhyme
"All target rhymes need to be in phone form. "All target rhymes need to be in phone form.
@ -648,6 +687,47 @@
(remove (fn [[phones wordset]] (remove (fn [[phones wordset]]
(empty? wordset))))))) (empty? wordset)))))))
(defn tightly-generate-n-syllable-sentence-v2
"
If you want to generate a sentence targeting a rhyme, generate the rhyming tail out-of-band
and then pass it as a seed to this function.
"
([database
markov-trie
n-gram-rank
target-sentence-syllable-count
seed]
(tightly-generate-n-syllable-sentence-v2
database
markov-trie
n-gram-rank
target-sentence-syllable-count
identity
seed))
([database
markov-trie
n-gram-rank
target-sentence-syllable-count
markov-process-children
seed]
(let [[eos bos] (map database [prhyme/EOS prhyme/BOS])]
(loop [phrase seed]
(if (<= target-sentence-syllable-count
(prhyme/count-syllables-of-phrase
(string/join " " (map second phrase))))
phrase
(recur
(conj
phrase
(let [word (database
(get-next-markov
markov-trie
(into (vec (repeat (dec n-gram-rank) eos))
(mapv (comp database second) phrase))
markov-process-children))]
[(rand-nth (phonetics/get-phones word)) word]))))))))
;;;; Demo ;;;; Demo
;;;; ;;;;
(comment (comment
@ -853,18 +933,26 @@
(repeatedly (repeatedly
2 2
#(->> (rhyme-from-scheme #(->> (rhyme-from-scheme
'[[A 9] [A 9] [B 5] [B 5] [A 9]] '[[A 8] [A 8] [B 5] [B 5] [A 8]]
database database
markov-tight-trie markov-tight-trie
rhyme-trie) rhyme-trie)
(map reverse) (map reverse)
(map (partial map second)) (map (partial map second))
(map data-transform/untokenize))) (map data-transform/untokenize)))
(->> "overdrive" (->> "overdrive"
(prhyme/phrase->all-flex-rhyme-tailing-consonants-phones) (prhyme/phrase->unstressed-vowels-and-tailing-consonants)
(map first) (map first)
(map reverse) (map reverse)
(map (partial rhyme-choices-walking-target-rhyme rhyme-trie))) (map (partial
rhyme-choices-walking-target-rhyme-with-stop
rhyme-trie
(fn [phones choices]
(every? phonetics/consonant (butlast phones))))))
(trie/lookup rhyme-trie ["V" "AY1"]) (trie/lookup rhyme-trie ["V" "AY1"])
(trie/lookup markov-tight-trie nil) (trie/lookup markov-tight-trie nil)
(tightly-generate-n-syllable-sentence-rhyming-with (tightly-generate-n-syllable-sentence-rhyming-with
@ -883,3 +971,129 @@
(set/difference wordset (into #{} [])))) (set/difference wordset (into #{} []))))
) )
#_(ns-unmap (find-ns 'com.owoga.corpus.markov) 'RhymeTrie)
(defprotocol IRhymeTrie
(rhymes [this phones] [this phones preprocess-rhymes]))
(deftype RhymeTrie [trie prep-phones end-walk]
IRhymeTrie
(rhymes [this phones]
(rhymes this phones identity))
(rhymes [this phones preprocess-rhymes]
(let [prepped-phones (reverse (prep-phones phones))]
(rhyme-choices-walking-target-rhyme-with-stop
trie
end-walk
prepped-phones
preprocess-rhymes))))
(comment
(def rhymetrie
(->RhymeTrie
rhyme-trie
(fn [phones]
(->> phones
prhyme/take-vowels-and-tail-consonants
prhyme/remove-all-stress))
(fn [phones choices]
(every? phonetics/consonant (butlast phones)))))
(rhymes rhymetrie ["AY" "V"])
(time
(count
(trie/children-at-depth markov-tight-trie 2 3)))
)
(defn rhyme-from-scheme-v2
"scheme of format [[A 9] [A 9] [B 5] [B 5] [A 9]]
Will include as many syllables as possible in finding rhymes
and will choose randomly with equal chance from all possible rhymes.
Result will be a map of schemes to vectors of lines.
{[A 9] [[[you [Y UW]] [are [AA R]] [so [S OH]]]
[[and [AE N D]] [we [W IY]] [go [G OH]]]]
[B 5] [[[hey [H AY]]]
[[bay [B AY]]]]}
Currently hard-coded to work with 4-gram.
"
[scheme database markov-trie rhyme-trie]
(let [[eos bos] (map database [prhyme/EOS prhyme/BOS])]
(loop [scheme scheme
result {}]
(if (empty? scheme)
result
(let [[pattern syllable-count] (first scheme)
existing-lines (result (first scheme))
banned-words
(into #{} (->> existing-lines
(map (comp last last))))
seed (if existing-lines
(->> existing-lines
rand-nth
reverse
(map first)
(apply concat)
(#(rhymes
rhyme-trie
%
(fn [choices]
(->> choices
(map (fn [[phones wordset]]
[phones
(set/difference
wordset
banned-words)]))
(remove (comp empty? second))))))
rand-nth
((fn [[phones wordset]]
(let [word (rand-nth (vec wordset))]
[(rand-nth (phonetics/get-phones word))
word])))
vector)
(->> (get-next-markov
markov-trie
[eos eos eos]
(fn [children]
(let [banned-ids (->> banned-words
(map database)
(into #{eos bos}))]
(remove
#(banned-ids (.key %)) children))))
database
(#(vector (rand-nth (phonetics/get-phones %)) %))
vector))
line (take-until
(best-of-20)
#(tightly-generate-n-syllable-sentence-v2
database
markov-trie
4
syllable-count
(make-markov-filter [eos bos])
seed))]
(recur (rest scheme)
(update result (first scheme) (fnil conj []) line)))))))
(comment
(let [scheme '[[a 8] [a 8] [b 5] [b 5] [a 8]]]
(rhyme-from-scheme-v2
scheme database markov-tight-trie rhymetrie))
)
(comment
(let [existing-lines '([[["K" "AA" "AH"] "unlock"]
[["M" "EH1" "M" "ER0" "IY0" "Z"] "memories"]
[["D" "IH0" "Z" "AO1" "L" "V" "IH0" "NG"] "dissolving"]])]
(->> existing-lines
rand-nth
reverse
(map first)
(mapcat reverse)))
)

@ -279,6 +279,8 @@
(remove-all-stress phones)) (remove-all-stress phones))
(defn phones->all-flex-rhyme-tailing-consonants-phones (defn phones->all-flex-rhyme-tailing-consonants-phones
"Removes all but the tail consonants.
Removes all non-primary stress from vowels."
[phones] [phones]
(->> phones (->> phones
take-vowels-and-tail-consonants take-vowels-and-tail-consonants

Loading…
Cancel
Save