diff --git a/dev/examples/tpt.clj b/dev/examples/tpt.clj
index 5cfffcf..76ce6b4 100644
--- a/dev/examples/tpt.clj
+++ b/dev/examples/tpt.clj
@@ -15,7 +15,8 @@
[clojure.zip :as zip]
[cljol.dig9 :as d]
[com.owoga.prhyme.data.phonetics :as phonetics]
- [com.owoga.prhyme.syllabify :as syllabify]))
+ [com.owoga.prhyme.syllabify :as syllabify]
+ [taoensso.nippy :as nippy]))
(tufte/add-basic-println-handler! {})
@@ -410,26 +411,48 @@
(conj (peek result) (first phones))))))))
(defn syllabify-phrase-with-stress [phrase]
- (map syllabify-with-stress (string/split phrase #"[ -]")))
+ (reduce
+ into
+ []
+ (map
+ (comp owoga.syllabify/syllabify
+ first
+ owoga.phonetics/get-phones)
+ (string/split phrase #"[ -]"))))
(comment
(syllabify-phrase-with-stress "bother me")
+
(word->phones "bother me")
+
(map (comp owoga.syllabify/syllabify first owoga.phonetics/get-phones) ["bother" "me"])
[(syllabify-phrase-with-stress "on poverty")
(syllabify-phrase-with-stress "can bother me")]
+
)
-(defn phrase->flex-rhyme-phones [phrase]
- (let [syllables (syllabify-phrase-with-stress phrase)]
- (->> (seq (reduce into [] syllables))
- (map #(filter (partial re-find #"\d") %))
- (flatten)
- (map #(string/replace % #"\d" "")))))
+(defn phrase->flex-rhyme-phones
+ "Takes a space-seperated string of words
+ and returns the concatenation of the words
+ vowel phones.
+
+ Returns them in reversed order so they
+ are ready to be used in a lookup of a rhyme trie.
+ "
+ [phrase]
+ (->> phrase
+ (#(string/split % #" "))
+ (map (comp owoga.syllabify/syllabify first owoga.phonetics/get-phones))
+ (map (partial reduce into []))
+ (map #(filter (partial re-find #"\d") %))
+ (flatten)
+ (map #(string/replace % #"\d" ""))
+ (reverse)))
(comment
- (phrase->flex-rhyme-phones "bother me")
+ (phrase->flex-rhyme-phones "bother hello")
+ ;; => ("OW" "AH" "ER" "AA")
)
(defonce context (atom {}))
@@ -506,25 +529,316 @@
(swap!
context
assoc
- :flex-rhyme-trie3'
+ :flex-rhyme-trie
(transduce
(comp
(map (fn [[k v]]
+ (println (string/join " " (map (@context :database) k)))
[(string/join " " (map (@context :database) k))
[k v]]))
(map (fn [[phrase [k v]]]
- [(reverse (phrase->flex-rhyme-phones phrase))
+ [(phrase->flex-rhyme-phones phrase)
[k v]])))
(completing
(fn [trie [k v]]
(update trie k (fnil conj [v]) v)))
(trie/make-trie)
- (trie/children-at-depth (@context :trie) 0 3))))
+ (->> (trie/children-at-depth (@context :trie) 1 2)
+ (drop 500050)
+ (take 20)))))
nil)
)
(comment
+ (->> (get (@context :flex-rhyme-trie) ["EH" "OW" "IY" "EH"])
+ (take 20)
+ (map first)
+ (map (partial map (@context :database))))
+ (trie/children (trie/lookup (@context :trie) [13393]))
+ ((@context :database) "desk") ;; => 13393
+ ((@context :database) "wobbly") ;; => 152750
+ (get (@context :trie) [13393 152750]))
+
+(defn rhyme-choices
+ [{:keys [flex-rhyme-trie database] :as context} phrase]
+ (let [phones (phrase->flex-rhyme-phones phrase)]
+ (get flex-rhyme-trie phones)))
+
+(defn exclude-non-rhymes-from-choices
+ "Removes any choice that includes the last
+ word of the rhyming phrase as the last word of the choice.
+
+ Also removes beginning and end of sentence markers (1 and 38 in the database)."
+ [{:keys [database]} phrase choices]
+ (let [word-id (database (last (string/split phrase #" ")))]
+ (remove
+ (fn [child]
+ (or (= ((comp first second) child) word-id)
+ (#{1 38} ((comp first first) child))))
+ choices)))
+
+(defn exclude-non-english-phrases-from-choices
+ [{:keys [database]} choices]
+ (filter
+ (fn [choice]
+ (->> (first choice)
+ (map database)
+ (every? dict/cmu-with-stress-map)))
+ choices))
+
+(defn weighted-selection-from-choices
+ [choices]
+ (math/weighted-selection
+ (comp second second)
+ choices))
+
+(defn choice->n-gram
+ [{:keys [database]} choice]
+ (map database (first choice)))
+
+(defn generate-rhyming-n-gram
+ [phrase]
+ (->> (rhyme-choices @context phrase)
+ (exclude-non-rhymes-from-choices @context phrase)
+ (weighted-selection-from-choices)
+ (choice->n-gram @context)))
+
+(defn get-flex-rhyme
+ "Gets from a rhyme-trie a rhyming n-gram based on the
+ weighted selection from their frequencies."
+ [{:keys [flex-rhyme-trie database] :as context} phrase]
+ (let [phones (phrase->flex-rhyme-phones phrase)
+ ;; Exclude the last word. Don't rhyme kodak with kodak.
+ word-id (database (first (string/split phrase #" ")))
+ choices (remove
+ (fn [child]
+ (= (first child) word-id))
+ (get flex-rhyme-trie phones))
+ choice (math/weighted-selection
+ (comp second second)
+ choices)]
+ (map database (first choice))))
+
+(comment
+ (get-flex-rhyme @context "bother me")
+
+ )
+(defn get-next-markov
+ [{:keys [trie database] :as context} seed]
+ (let [seed (take-last 3 seed)
+ node (trie/lookup trie seed)
+ children (and node
+ (->> node
+ trie/children
+ (map #(vector (.key %) (get % [])))
+ (remove (comp nil? second))
+ (remove
+ (fn [[k v]]
+ (#{1 38} k)))))]
+ (cond
+ (nil? node) (recur context (rest seed))
+ (seq children)
+ (if (< (rand) (/ (apply max (map (comp second second) children))
+ (apply + (map (comp second second) children))))
+ (recur context (rest seed))
+ (first (math/weighted-selection (comp second second) children)))
+ (> (count seed) 0)
+ (recur context (rest seed))
+ :else (throw (Exception. "Error")))))
+
+(defn get-next-markov-from-phrase-backwards
+ [{:keys [database trie] :as context} phrase n]
+ (let [word-ids (->> phrase
+ (#(string/split % #" "))
+ (take n)
+ (reverse)
+ (map database))]
+ (database (get-next-markov context word-ids))))
+
+(comment
+ (get-next-markov @context [222])
+ (get-next-markov-from-phrase-backwards @context "will strike you down" 3)
+
+ (get (@context :database) 7982)
+ )
+(defn ids->words
+ [{:keys [database] :as context} ids]
+ (map database ids))
+
+(defn words->syllables
+ [words]
+ (->> words
+ (string/join " ")
+ (reverse (phrase->flex-rhyme-phones))))
+
+(defn generate-sentence-with-n-words
+ [{:keys [database] :as context} seed n]
+ (loop [seed seed]
+ (if (>= (dec n) (count seed))
+ (recur (conj seed (get-next-markov context seed)))
+ (map database seed))))
+
+(defn take-words-amounting-to-more-at-least-n-syllables
+ [phrase n]
+ (letfn [(phones [word]
+ [word (first (owoga.phonetics/get-phones word))])
+ (syllables [[word phones]]
+ [word (owoga.syllabify/syllabify phones)])]
+ (->> phrase
+ (#(string/split % #" "))
+ (map phones)
+ (map syllables)
+ (reduce
+ (fn [result [word syllables]]
+ (if (<= n (count (mapcat second result)))
+ (reduced result)
+ (conj result [word syllables])))
+ [])
+ (map first)
+ (string/join " "))))
+
+(defn valid-english-sentence?
+ [phrase]
+ (let [words (string/split #" " phrase)]
+ (and (nlp/valid-sentence? phrase)
+ (every? dict/cmu-with-stress-map words))))
+
+
+(defn sha256 [text]
+ (let [digest (java.security.MessageDigest/getInstance "SHA-256")]
+ (->> (.digest digest (.getBytes text "UTF-8"))
+ (#(BigInteger. 1 %))
+ (#(.toString % 16)))))
+
+(defn syllable-count-phrase
+ [phrase]
+ (->> phrase
+ (#(string/split % #" "))
+ (map owoga.phonetics/get-phones)
+ (map first)
+ (mapcat owoga.syllabify/syllabify)
+ count))
+
+(defn rhyming-n-gram-choices
+ [context target-rhyme]
+ (loop [target-rhyme target-rhyme]
+ (let [choices (->> target-rhyme
+ (rhyme-choices context)
+ (exclude-non-rhymes-from-choices context target-rhyme)
+ (exclude-non-english-phrases-from-choices context))]
+ (if (empty? choices)
+ (recur (string/join " " (rest (string/split target-rhyme #" "))))
+ choices))))
+
+(defn generate-n-syllable-sentence-rhyming-with
+ [context target-phrase n]
+ (let [target-phrase-words (string/split target-phrase #" ")
+ reversed-target-phrase (string/join " " (reverse target-phrase-words))
+ target-rhyme
+ (->> (take-words-amounting-to-more-at-least-n-syllables
+ reversed-target-phrase
+ 5)
+ (#(string/split % #" "))
+ reverse
+ (string/join " "))
+ rhyming-n-gram (->> (rhyming-n-gram-choices context target-rhyme)
+ (weighted-selection-from-choices)
+ (choice->n-gram context)
+ (string/join " "))]
+ (loop [phrase rhyming-n-gram]
+ (if (<= n (syllable-count-phrase phrase))
+ phrase
+ (recur
+ (str (get-next-markov-from-phrase-backwards context phrase 5)
+ " "
+ phrase))))))
+
+(generate-n-syllable-sentence-rhyming-with @context "instead of war on poverty" 8)
+
+((@context :database) "poverty")
+(defn amul8
+ ([sentence]
+ (->> (amulate (string/split sentence #" "))
+ (map reverse)
+ (map (partial string/join " "))))
+ ([sentence n]
+ (loop [result [sentence]
+ n n]
+ (if (zero? n)
+ result
+ (recur (conj result (amul8 (peek result)))
+ (dec n))))))
+
+(defn amulate?
+ [text]
+ (let [digest (sha256 text)]
+ (re-matches #"8{4}" digest)))
+
+(defn continuously-amulate
+ [seed]
+ (let [next-sentence (amul8 seed)
+ next-seed (->> next-sentence
+ (#(string/split % #" "))
+ (reverse)
+ (map
+ (fn [word]
+ [word (phrase->flex-rhyme-phones word)]))
+ ((fn [word-phones]
+ (loop [word-phones word-phones
+ seed []]
+ (println (mapcat second seed))
+ (if (< 2 (count (mapcat second seed)))
+ (string/join
+ " "
+ (reverse (map first seed)))
+ (recur (rest word-phones)
+ (conj seed (first word-phones))))))))]
+ (println next-seed)
+ (lazy-seq
+ (cons next-sentence (continuously-amulate next-seed)))))
+
+(comment
+ (take 5 (continuously-amulate "technology"))
+
+ (->> (amul8 "technology" 1)
+ (map second)
+ (partition 2 1)
+ (map
+ (fn [pair]
+ (string/join "\n" pair)))
+ (map #(vector % (sha256 %)))
+ (map
+ (fn [[text sha]]
+ [text sha (re-matches #"8{4}" sha)]))
+ (map println))
+
+ (dict/cmu-with-stress-map )
+ (repeatedly
+ 3
+ #(amulate (reverse ["pleasure" "of" "the" "arcane" "technology"])))
+
+ (phrase->flex-rhyme-phones "bother hello")
+ (phrase->flex-rhyme-phones "snow-covered on")
+ (get-flex-rhyme @context (reverse ["AA" "ER" "AH" "OW"]))
+ ((@context :database) "")
+ (get-next-markov @context [1 503])
+
+ (take 20
+ (repeatedly #(reverse (get-flex-rhyme @context
+ (reverse (phrase->flex-rhyme-phones "technology"))
+ "technology"))))
+
+ (amulate)
+
+ (get (@context :database) "")
+ (get (@context :database) "technology")
+ (phrase->flex-rhyme-phones "able") ;; => ("EY" "AH")
+ (phrase->flex-rhyme-phones "away") ;; => ("AH" "EY")
+ (take 20 (@context :flex-rhyme-trie))
+ (get-flex-rhyme @context '("AA" "IY" "AE"))
+
+ (map #(get (@context :database) %) [1 503])
(time (count (tpt/children-at-depth (@context :trie) 0 2)))
(->> (trie/children-at-depth (@context :flex-rhyme-trie') 0 5)
@@ -547,7 +861,7 @@
(first (@context :trie))
;; 448351
;; 4388527
- (initialize)
+ (time (initialize))
)
@@ -807,6 +1121,9 @@
(def rhyme-database (atom {}))
+ (def db
+ (nippy/thaw-from-file (io/resource "dark-corpus-4-gram-backwards-db.bin")))
+
(def perfect-rhyme-trie
(transduce
(comp
@@ -838,7 +1155,8 @@
(fn [trie [k v]]
(update trie k (fnil #(update % 1 inc) [v 0]))))
(trie/make-trie)
- @loaded-backwards-database))
+ (take 1000 db)))
+ (take 20 vowel-rhyme-trie)
)
#_(with-open [wtr (clojure.java.io/writer "database.bin")]
diff --git a/src/com/owoga/prhyme/data_transform.clj b/src/com/owoga/prhyme/data_transform.clj
index a45e2fb..b90f34e 100644
--- a/src/com/owoga/prhyme/data_transform.clj
+++ b/src/com/owoga/prhyme/data_transform.clj
@@ -7,7 +7,8 @@
[com.owoga.tightly-packed-trie :as tpt]
[com.owoga.tightly-packed-trie.encoding :as encoding]
[taoensso.nippy :as nippy]
- [com.owoga.prhyme.nlp.tag-sets.treebank-ii :as tb2]))
+ [com.owoga.prhyme.nlp.tag-sets.treebank-ii :as tb2]
+ [clojure.zip :as zip]))
(def re-word
"Regex for tokenizing a string into words
@@ -165,7 +166,7 @@
(map (make-database-processor database)))
(completing
(fn [trie [k v]]
- (update trie k (fnil inc 0))))
+ (update trie k (fnil #(update % 1 inc) [k 0]))))
(trie/make-trie)
files))
@@ -173,7 +174,103 @@
[trie encode-fn decode-fn]
(tpt/tightly-packed-trie trie encode-fn decode-fn))
+(def texts (eduction
+ (comp (xf-file-seq 0 10)
+ (map slurp))
+ (file-seq (io/file "dark-corpus"))))
+
+(defn split-text-into-sentences
+ [text]
+ (->> text
+ (#(string/replace % #"([\.\?\!])" "$1\n"))
+ (string/split-lines)))
+
+(defn mapmap
+ [fn & body]
+ (apply map (partial map fn) body))
+
+(defn treebank-zipper->trie-map-entries
+ [treebank-zipper]
+ (let [leaf-paths (nlp/leaf-pos-paths treebank-zipper)]
+ leaf-paths))
+
(comment
+ (treebank-zipper->trie-map-entries
+ (zip/seq-zip
+ '(TOP
+ ((S
+ ((NP
+ ((NP ((NN ("Everything")))) (PP ((IN ("of")) (NP ((NN ("today"))))))))
+ (VP ((VBZ ("is")) (VP ((VBG ("falling"))))))
+ (. ("."))))))))
+
+ (defn breadth-first-search [z]
+ (letfn [(zip-children [loc]
+ (when-let [first-child (zip/down loc)]
+ (take-while (comp not nil?)
+ (iterate zip/right first-child))))]
+ (loop [ret []
+ queue (conj clojure.lang.PersistentQueue/EMPTY z)]
+ (if (seq queue)
+ (let [[node children] ((juxt zip/node zip-children) (peek queue))]
+ (recur (conj ret node) (into (pop queue) children)))
+ ret))))
+
+ (filter
+ symbol?
+ (breadth-first-search
+ (zip/seq-zip
+ '(TOP
+ ((S
+ ((NP
+ ((NP ((NN ("Everything")))) (PP ((IN ("of")) (NP ((NN ("today"))))))))
+ (VP ((VBZ ("is")) (VP ((VBG ("falling"))))))
+ (. (".")))))))))
+
+ (->> (zip/seq-zip
+ '(TOP
+ ((S
+ ((NP
+ ((NP ((NN ("Everything")))) (PP ((IN ("of")) (NP ((NN ("today"))))))))
+ (VP ((VBZ ("is")) (VP ((VBG ("falling"))))))
+ (. (".")))))))
+ (iterate zip/next)
+ (take 10)
+ last
+ (zip/path)
+ (map first)
+ (filter symbol?))
+
+ )
+
+(defn process-text
+ [text]
+ (->> text
+ (split-text-into-sentences)
+ (map string/trim)
+ (map nlp/treebank-zipper)
+ (map nlp/leaf-pos-paths)))
+
+(comment
+ (into
+ #_(trie/make-trie)
+ []
+ (map process-text)
+ texts)
+
+ )
+
+(comment
+ (let [database (atom {:next-id 1})
+ trie (file-seq->trie
+ database
+ (transduce
+ (xf-file-seq 0 2)
+ conj
+ (file-seq (io/file "dark-corpus")))
+ 1 4)]
+ trie)
+
(time
(let [database (atom {:next-id 1})
trie (transduce
diff --git a/src/com/owoga/prhyme/generation/markov_example.clj b/src/com/owoga/prhyme/generation/markov_example.clj
index 5502ae7..5e5f772 100644
--- a/src/com/owoga/prhyme/generation/markov_example.clj
+++ b/src/com/owoga/prhyme/generation/markov_example.clj
@@ -356,7 +356,9 @@
(comp
(map (fn [child]
(vector generated-text child)))
- xf-filter) conj children)]
+ xf-filter)
+ conj
+ children)]
(cond
(nil? node) (recur trie generated-text (butlast k) xf-filter)
@@ -589,7 +591,7 @@
;; Turning a word frequency into a phoneme trie
(transduce
(comp
- (drop 10)
+ (drop 100000)
(take 20)
(map first)
(map (partial remove #{1 7})) ;; and
@@ -617,7 +619,8 @@
conj
(trie/children-at-depth tpt 0 2))
- rhyme-trie
+ (take 20 (drop 100 @rhyme-trie))
+
(take 20 (trie/children-at-depth tpt 0 2))
(let [words [[[["DH" "IH1" "S"] ["DH" "IH0" "S"]] [["IH1" "Z"] ["IH1" "S"]]] '(11 77)]]
diff --git a/src/com/owoga/prhyme/nlp/core.clj b/src/com/owoga/prhyme/nlp/core.clj
index a7d0557..cc30c8a 100644
--- a/src/com/owoga/prhyme/nlp/core.clj
+++ b/src/com/owoga/prhyme/nlp/core.clj
@@ -1091,3 +1091,97 @@
(top-k-sequences prhyme-pos-tagger (tokenize "")))
)
+
+(defn loc-path
+ [loc]
+ (->> loc
+ zip/path
+ (map first)
+ (filter symbol?)))
+
+(defn breadth-first
+ [zipper]
+ (letfn [(zip-children [loc]
+ (when-let [first-child (zip/down loc)]
+ (take-while
+ (comp not nil?)
+ (iterate zip/right first-child))))]
+ (loop [result []
+ queue (conj clojure.lang.PersistentQueue/EMPTY zipper)]
+ (if (seq queue)
+ (let [[zipper children] ((juxt identity zip-children) (peek queue))]
+ (recur (conj result zipper) (into (pop queue) children)))
+ result))))
+
+(defn loc-children
+ [loc]
+ (when-let [first-child (zip/down loc)]
+ (->> (take-while
+ (complement nil?)
+ (iterate zip/right first-child))
+ (map first))))
+
+(defn part-of-speech-children
+ [loc]
+ (->> loc
+ (iterate zip/next)
+ (take-while (complement zip/end?))
+ (map (fn [loc]
+ (when (symbol? (zip/node loc))
+ [(->> (zip/path loc)
+ (map first))
+ (->> (zip/right loc)
+ (zip/node)
+ (map first))])))
+ (remove (comp nil? second))))
+
+(comment
+ (->> (zip/vector-zip [1 [2 [3]]])
+ (iterate zip/next)
+ (take 6)
+ last
+ zip/path
+ (map first))
+
+ (->> (breadth-first
+ (zip/seq-zip
+ '(TOP
+ ((S
+ ((NP
+ ((NP ((NN ("Everything")))) (PP ((IN ("of")) (NP ((NN ("today"))))))))
+ (VP ((VBZ ("is")) (VP ((VBG ("falling"))))))
+ (. ("."))))))))
+ (map loc-children)
+ (filter seq?)
+ )
+
+ (part-of-speech-children
+ (zip/seq-zip
+ '(TOP
+ ((S
+ ((NP
+ ((NP ((NN ("Everything")))) (PP ((IN ("of")) (NP ((NN ("today"))))))))
+ (VP ((VBZ ("is")) (VP ((VBG ("falling"))))))
+ (. ("."))))))))
+
+ (->> (zip/seq-zip
+ '(TOP
+ ((S
+ ((NP
+ ((NP ((NN ("Everything")))) (PP ((IN ("of")) (NP ((NN ("today"))))))))
+ (VP ((VBZ ("is")) (VP ((VBG ("falling"))))))
+ (. (".")))))))
+ (zip/next)
+ (zip/next)
+ (zip/next)
+ (zip/next)
+ (zip/next)
+ (zip/node)
+ #_#_(loc-children)
+ (map first))
+
+ )
+(comment
+ (defn part-of-speech-n-grams
+ [zipper]
+ (letfn [(fn step [path []])])))