More markov gen testing

main
Eric Ihli 4 years ago
parent 3c57418a61
commit b349e7af2c

@ -15,7 +15,8 @@
[clojure.zip :as zip]
[cljol.dig9 :as d]
[com.owoga.prhyme.data.phonetics :as phonetics]
[com.owoga.prhyme.syllabify :as syllabify]))
[com.owoga.prhyme.syllabify :as syllabify]
[taoensso.nippy :as nippy]))
(tufte/add-basic-println-handler! {})
@ -410,26 +411,48 @@
(conj (peek result) (first phones))))))))
(defn syllabify-phrase-with-stress [phrase]
(map syllabify-with-stress (string/split phrase #"[ -]")))
(reduce
into
[]
(map
(comp owoga.syllabify/syllabify
first
owoga.phonetics/get-phones)
(string/split phrase #"[ -]"))))
(comment
(syllabify-phrase-with-stress "bother me")
(word->phones "bother me")
(map (comp owoga.syllabify/syllabify first owoga.phonetics/get-phones) ["bother" "me"])
[(syllabify-phrase-with-stress "on poverty")
(syllabify-phrase-with-stress "can bother me")]
)
(defn phrase->flex-rhyme-phones [phrase]
(let [syllables (syllabify-phrase-with-stress phrase)]
(->> (seq (reduce into [] syllables))
(map #(filter (partial re-find #"\d") %))
(flatten)
(map #(string/replace % #"\d" "")))))
(defn phrase->flex-rhyme-phones
"Takes a space-seperated string of words
and returns the concatenation of the words
vowel phones.
Returns them in reversed order so they
are ready to be used in a lookup of a rhyme trie.
"
[phrase]
(->> phrase
(#(string/split % #" "))
(map (comp owoga.syllabify/syllabify first owoga.phonetics/get-phones))
(map (partial reduce into []))
(map #(filter (partial re-find #"\d") %))
(flatten)
(map #(string/replace % #"\d" ""))
(reverse)))
(comment
(phrase->flex-rhyme-phones "bother me")
(phrase->flex-rhyme-phones "bother hello")
;; => ("OW" "AH" "ER" "AA")
)
(defonce context (atom {}))
@ -506,25 +529,316 @@
(swap!
context
assoc
:flex-rhyme-trie3'
:flex-rhyme-trie
(transduce
(comp
(map (fn [[k v]]
(println (string/join " " (map (@context :database) k)))
[(string/join " " (map (@context :database) k))
[k v]]))
(map (fn [[phrase [k v]]]
[(reverse (phrase->flex-rhyme-phones phrase))
[(phrase->flex-rhyme-phones phrase)
[k v]])))
(completing
(fn [trie [k v]]
(update trie k (fnil conj [v]) v)))
(trie/make-trie)
(trie/children-at-depth (@context :trie) 0 3))))
(->> (trie/children-at-depth (@context :trie) 1 2)
(drop 500050)
(take 20)))))
nil)
)
(comment
(->> (get (@context :flex-rhyme-trie) ["EH" "OW" "IY" "EH"])
(take 20)
(map first)
(map (partial map (@context :database))))
(trie/children (trie/lookup (@context :trie) [13393]))
((@context :database) "desk") ;; => 13393
((@context :database) "wobbly") ;; => 152750
(get (@context :trie) [13393 152750]))
(defn rhyme-choices
[{:keys [flex-rhyme-trie database] :as context} phrase]
(let [phones (phrase->flex-rhyme-phones phrase)]
(get flex-rhyme-trie phones)))
(defn exclude-non-rhymes-from-choices
"Removes any choice that includes the last
word of the rhyming phrase as the last word of the choice.
Also removes beginning and end of sentence markers (1 and 38 in the database)."
[{:keys [database]} phrase choices]
(let [word-id (database (last (string/split phrase #" ")))]
(remove
(fn [child]
(or (= ((comp first second) child) word-id)
(#{1 38} ((comp first first) child))))
choices)))
(defn exclude-non-english-phrases-from-choices
[{:keys [database]} choices]
(filter
(fn [choice]
(->> (first choice)
(map database)
(every? dict/cmu-with-stress-map)))
choices))
(defn weighted-selection-from-choices
[choices]
(math/weighted-selection
(comp second second)
choices))
(defn choice->n-gram
[{:keys [database]} choice]
(map database (first choice)))
(defn generate-rhyming-n-gram
[phrase]
(->> (rhyme-choices @context phrase)
(exclude-non-rhymes-from-choices @context phrase)
(weighted-selection-from-choices)
(choice->n-gram @context)))
(defn get-flex-rhyme
"Gets from a rhyme-trie a rhyming n-gram based on the
weighted selection from their frequencies."
[{:keys [flex-rhyme-trie database] :as context} phrase]
(let [phones (phrase->flex-rhyme-phones phrase)
;; Exclude the last word. Don't rhyme kodak with kodak.
word-id (database (first (string/split phrase #" ")))
choices (remove
(fn [child]
(= (first child) word-id))
(get flex-rhyme-trie phones))
choice (math/weighted-selection
(comp second second)
choices)]
(map database (first choice))))
(comment
(get-flex-rhyme @context "bother me")
)
(defn get-next-markov
[{:keys [trie database] :as context} seed]
(let [seed (take-last 3 seed)
node (trie/lookup trie seed)
children (and node
(->> node
trie/children
(map #(vector (.key %) (get % [])))
(remove (comp nil? second))
(remove
(fn [[k v]]
(#{1 38} k)))))]
(cond
(nil? node) (recur context (rest seed))
(seq children)
(if (< (rand) (/ (apply max (map (comp second second) children))
(apply + (map (comp second second) children))))
(recur context (rest seed))
(first (math/weighted-selection (comp second second) children)))
(> (count seed) 0)
(recur context (rest seed))
:else (throw (Exception. "Error")))))
(defn get-next-markov-from-phrase-backwards
[{:keys [database trie] :as context} phrase n]
(let [word-ids (->> phrase
(#(string/split % #" "))
(take n)
(reverse)
(map database))]
(database (get-next-markov context word-ids))))
(comment
(get-next-markov @context [222])
(get-next-markov-from-phrase-backwards @context "will strike you down" 3)
(get (@context :database) 7982)
)
(defn ids->words
[{:keys [database] :as context} ids]
(map database ids))
(defn words->syllables
[words]
(->> words
(string/join " ")
(reverse (phrase->flex-rhyme-phones))))
(defn generate-sentence-with-n-words
[{:keys [database] :as context} seed n]
(loop [seed seed]
(if (>= (dec n) (count seed))
(recur (conj seed (get-next-markov context seed)))
(map database seed))))
(defn take-words-amounting-to-more-at-least-n-syllables
[phrase n]
(letfn [(phones [word]
[word (first (owoga.phonetics/get-phones word))])
(syllables [[word phones]]
[word (owoga.syllabify/syllabify phones)])]
(->> phrase
(#(string/split % #" "))
(map phones)
(map syllables)
(reduce
(fn [result [word syllables]]
(if (<= n (count (mapcat second result)))
(reduced result)
(conj result [word syllables])))
[])
(map first)
(string/join " "))))
(defn valid-english-sentence?
[phrase]
(let [words (string/split #" " phrase)]
(and (nlp/valid-sentence? phrase)
(every? dict/cmu-with-stress-map words))))
(defn sha256 [text]
(let [digest (java.security.MessageDigest/getInstance "SHA-256")]
(->> (.digest digest (.getBytes text "UTF-8"))
(#(BigInteger. 1 %))
(#(.toString % 16)))))
(defn syllable-count-phrase
[phrase]
(->> phrase
(#(string/split % #" "))
(map owoga.phonetics/get-phones)
(map first)
(mapcat owoga.syllabify/syllabify)
count))
(defn rhyming-n-gram-choices
[context target-rhyme]
(loop [target-rhyme target-rhyme]
(let [choices (->> target-rhyme
(rhyme-choices context)
(exclude-non-rhymes-from-choices context target-rhyme)
(exclude-non-english-phrases-from-choices context))]
(if (empty? choices)
(recur (string/join " " (rest (string/split target-rhyme #" "))))
choices))))
(defn generate-n-syllable-sentence-rhyming-with
[context target-phrase n]
(let [target-phrase-words (string/split target-phrase #" ")
reversed-target-phrase (string/join " " (reverse target-phrase-words))
target-rhyme
(->> (take-words-amounting-to-more-at-least-n-syllables
reversed-target-phrase
5)
(#(string/split % #" "))
reverse
(string/join " "))
rhyming-n-gram (->> (rhyming-n-gram-choices context target-rhyme)
(weighted-selection-from-choices)
(choice->n-gram context)
(string/join " "))]
(loop [phrase rhyming-n-gram]
(if (<= n (syllable-count-phrase phrase))
phrase
(recur
(str (get-next-markov-from-phrase-backwards context phrase 5)
" "
phrase))))))
(generate-n-syllable-sentence-rhyming-with @context "instead of war on poverty" 8)
((@context :database) "poverty")
(defn amul8
([sentence]
(->> (amulate (string/split sentence #" "))
(map reverse)
(map (partial string/join " "))))
([sentence n]
(loop [result [sentence]
n n]
(if (zero? n)
result
(recur (conj result (amul8 (peek result)))
(dec n))))))
(defn amulate?
[text]
(let [digest (sha256 text)]
(re-matches #"8{4}" digest)))
(defn continuously-amulate
[seed]
(let [next-sentence (amul8 seed)
next-seed (->> next-sentence
(#(string/split % #" "))
(reverse)
(map
(fn [word]
[word (phrase->flex-rhyme-phones word)]))
((fn [word-phones]
(loop [word-phones word-phones
seed []]
(println (mapcat second seed))
(if (< 2 (count (mapcat second seed)))
(string/join
" "
(reverse (map first seed)))
(recur (rest word-phones)
(conj seed (first word-phones))))))))]
(println next-seed)
(lazy-seq
(cons next-sentence (continuously-amulate next-seed)))))
(comment
(take 5 (continuously-amulate "technology"))
(->> (amul8 "technology" 1)
(map second)
(partition 2 1)
(map
(fn [pair]
(string/join "\n" pair)))
(map #(vector % (sha256 %)))
(map
(fn [[text sha]]
[text sha (re-matches #"8{4}" sha)]))
(map println))
(dict/cmu-with-stress-map )
(repeatedly
3
#(amulate (reverse ["pleasure" "of" "the" "arcane" "technology"])))
(phrase->flex-rhyme-phones "bother hello")
(phrase->flex-rhyme-phones "snow-covered on")
(get-flex-rhyme @context (reverse ["AA" "ER" "AH" "OW"]))
((@context :database) "<s>")
(get-next-markov @context [1 503])
(take 20
(repeatedly #(reverse (get-flex-rhyme @context
(reverse (phrase->flex-rhyme-phones "technology"))
"technology"))))
(amulate)
(get (@context :database) "</s>")
(get (@context :database) "technology")
(phrase->flex-rhyme-phones "able") ;; => ("EY" "AH")
(phrase->flex-rhyme-phones "away") ;; => ("AH" "EY")
(take 20 (@context :flex-rhyme-trie))
(get-flex-rhyme @context '("AA" "IY" "AE"))
(map #(get (@context :database) %) [1 503])
(time (count (tpt/children-at-depth (@context :trie) 0 2)))
(->> (trie/children-at-depth (@context :flex-rhyme-trie') 0 5)
@ -547,7 +861,7 @@
(first (@context :trie))
;; 448351
;; 4388527
(initialize)
(time (initialize))
)
@ -807,6 +1121,9 @@
(def rhyme-database (atom {}))
(def db
(nippy/thaw-from-file (io/resource "dark-corpus-4-gram-backwards-db.bin")))
(def perfect-rhyme-trie
(transduce
(comp
@ -838,7 +1155,8 @@
(fn [trie [k v]]
(update trie k (fnil #(update % 1 inc) [v 0]))))
(trie/make-trie)
@loaded-backwards-database))
(take 1000 db)))
(take 20 vowel-rhyme-trie)
)
#_(with-open [wtr (clojure.java.io/writer "database.bin")]

@ -7,7 +7,8 @@
[com.owoga.tightly-packed-trie :as tpt]
[com.owoga.tightly-packed-trie.encoding :as encoding]
[taoensso.nippy :as nippy]
[com.owoga.prhyme.nlp.tag-sets.treebank-ii :as tb2]))
[com.owoga.prhyme.nlp.tag-sets.treebank-ii :as tb2]
[clojure.zip :as zip]))
(def re-word
"Regex for tokenizing a string into words
@ -165,7 +166,7 @@
(map (make-database-processor database)))
(completing
(fn [trie [k v]]
(update trie k (fnil inc 0))))
(update trie k (fnil #(update % 1 inc) [k 0]))))
(trie/make-trie)
files))
@ -173,7 +174,103 @@
[trie encode-fn decode-fn]
(tpt/tightly-packed-trie trie encode-fn decode-fn))
(def texts (eduction
(comp (xf-file-seq 0 10)
(map slurp))
(file-seq (io/file "dark-corpus"))))
(defn split-text-into-sentences
[text]
(->> text
(#(string/replace % #"([\.\?\!])" "$1\n"))
(string/split-lines)))
(defn mapmap
[fn & body]
(apply map (partial map fn) body))
(defn treebank-zipper->trie-map-entries
[treebank-zipper]
(let [leaf-paths (nlp/leaf-pos-paths treebank-zipper)]
leaf-paths))
(comment
(treebank-zipper->trie-map-entries
(zip/seq-zip
'(TOP
((S
((NP
((NP ((NN ("Everything")))) (PP ((IN ("of")) (NP ((NN ("today"))))))))
(VP ((VBZ ("is")) (VP ((VBG ("falling"))))))
(. ("."))))))))
(defn breadth-first-search [z]
(letfn [(zip-children [loc]
(when-let [first-child (zip/down loc)]
(take-while (comp not nil?)
(iterate zip/right first-child))))]
(loop [ret []
queue (conj clojure.lang.PersistentQueue/EMPTY z)]
(if (seq queue)
(let [[node children] ((juxt zip/node zip-children) (peek queue))]
(recur (conj ret node) (into (pop queue) children)))
ret))))
(filter
symbol?
(breadth-first-search
(zip/seq-zip
'(TOP
((S
((NP
((NP ((NN ("Everything")))) (PP ((IN ("of")) (NP ((NN ("today"))))))))
(VP ((VBZ ("is")) (VP ((VBG ("falling"))))))
(. (".")))))))))
(->> (zip/seq-zip
'(TOP
((S
((NP
((NP ((NN ("Everything")))) (PP ((IN ("of")) (NP ((NN ("today"))))))))
(VP ((VBZ ("is")) (VP ((VBG ("falling"))))))
(. (".")))))))
(iterate zip/next)
(take 10)
last
(zip/path)
(map first)
(filter symbol?))
)
(defn process-text
[text]
(->> text
(split-text-into-sentences)
(map string/trim)
(map nlp/treebank-zipper)
(map nlp/leaf-pos-paths)))
(comment
(into
#_(trie/make-trie)
[]
(map process-text)
texts)
)
(comment
(let [database (atom {:next-id 1})
trie (file-seq->trie
database
(transduce
(xf-file-seq 0 2)
conj
(file-seq (io/file "dark-corpus")))
1 4)]
trie)
(time
(let [database (atom {:next-id 1})
trie (transduce

@ -356,7 +356,9 @@
(comp
(map (fn [child]
(vector generated-text child)))
xf-filter) conj children)]
xf-filter)
conj
children)]
(cond
(nil? node) (recur trie generated-text (butlast k) xf-filter)
@ -589,7 +591,7 @@
;; Turning a word frequency into a phoneme trie
(transduce
(comp
(drop 10)
(drop 100000)
(take 20)
(map first)
(map (partial remove #{1 7})) ;; </s> and <s>
@ -617,7 +619,8 @@
conj
(trie/children-at-depth tpt 0 2))
rhyme-trie
(take 20 (drop 100 @rhyme-trie))
(take 20 (trie/children-at-depth tpt 0 2))
(let [words [[[["DH" "IH1" "S"] ["DH" "IH0" "S"]] [["IH1" "Z"] ["IH1" "S"]]] '(11 77)]]

@ -1091,3 +1091,97 @@
(top-k-sequences prhyme-pos-tagger (tokenize "")))
)
(defn loc-path
[loc]
(->> loc
zip/path
(map first)
(filter symbol?)))
(defn breadth-first
[zipper]
(letfn [(zip-children [loc]
(when-let [first-child (zip/down loc)]
(take-while
(comp not nil?)
(iterate zip/right first-child))))]
(loop [result []
queue (conj clojure.lang.PersistentQueue/EMPTY zipper)]
(if (seq queue)
(let [[zipper children] ((juxt identity zip-children) (peek queue))]
(recur (conj result zipper) (into (pop queue) children)))
result))))
(defn loc-children
[loc]
(when-let [first-child (zip/down loc)]
(->> (take-while
(complement nil?)
(iterate zip/right first-child))
(map first))))
(defn part-of-speech-children
[loc]
(->> loc
(iterate zip/next)
(take-while (complement zip/end?))
(map (fn [loc]
(when (symbol? (zip/node loc))
[(->> (zip/path loc)
(map first))
(->> (zip/right loc)
(zip/node)
(map first))])))
(remove (comp nil? second))))
(comment
(->> (zip/vector-zip [1 [2 [3]]])
(iterate zip/next)
(take 6)
last
zip/path
(map first))
(->> (breadth-first
(zip/seq-zip
'(TOP
((S
((NP
((NP ((NN ("Everything")))) (PP ((IN ("of")) (NP ((NN ("today"))))))))
(VP ((VBZ ("is")) (VP ((VBG ("falling"))))))
(. ("."))))))))
(map loc-children)
(filter seq?)
)
(part-of-speech-children
(zip/seq-zip
'(TOP
((S
((NP
((NP ((NN ("Everything")))) (PP ((IN ("of")) (NP ((NN ("today"))))))))
(VP ((VBZ ("is")) (VP ((VBG ("falling"))))))
(. ("."))))))))
(->> (zip/seq-zip
'(TOP
((S
((NP
((NP ((NN ("Everything")))) (PP ((IN ("of")) (NP ((NN ("today"))))))))
(VP ((VBZ ("is")) (VP ((VBG ("falling"))))))
(. (".")))))))
(zip/next)
(zip/next)
(zip/next)
(zip/next)
(zip/next)
(zip/node)
#_#_(loc-children)
(map first))
)
(comment
(defn part-of-speech-n-grams
[zipper]
(letfn [(fn step [path []])])))

Loading…
Cancel
Save