diff --git a/dev/examples/tpt.clj b/dev/examples/tpt.clj index 5cfffcf..76ce6b4 100644 --- a/dev/examples/tpt.clj +++ b/dev/examples/tpt.clj @@ -15,7 +15,8 @@ [clojure.zip :as zip] [cljol.dig9 :as d] [com.owoga.prhyme.data.phonetics :as phonetics] - [com.owoga.prhyme.syllabify :as syllabify])) + [com.owoga.prhyme.syllabify :as syllabify] + [taoensso.nippy :as nippy])) (tufte/add-basic-println-handler! {}) @@ -410,26 +411,48 @@ (conj (peek result) (first phones)))))))) (defn syllabify-phrase-with-stress [phrase] - (map syllabify-with-stress (string/split phrase #"[ -]"))) + (reduce + into + [] + (map + (comp owoga.syllabify/syllabify + first + owoga.phonetics/get-phones) + (string/split phrase #"[ -]")))) (comment (syllabify-phrase-with-stress "bother me") + (word->phones "bother me") + (map (comp owoga.syllabify/syllabify first owoga.phonetics/get-phones) ["bother" "me"]) [(syllabify-phrase-with-stress "on poverty") (syllabify-phrase-with-stress "can bother me")] + ) -(defn phrase->flex-rhyme-phones [phrase] - (let [syllables (syllabify-phrase-with-stress phrase)] - (->> (seq (reduce into [] syllables)) - (map #(filter (partial re-find #"\d") %)) - (flatten) - (map #(string/replace % #"\d" ""))))) +(defn phrase->flex-rhyme-phones + "Takes a space-seperated string of words + and returns the concatenation of the words + vowel phones. + + Returns them in reversed order so they + are ready to be used in a lookup of a rhyme trie. + " + [phrase] + (->> phrase + (#(string/split % #" ")) + (map (comp owoga.syllabify/syllabify first owoga.phonetics/get-phones)) + (map (partial reduce into [])) + (map #(filter (partial re-find #"\d") %)) + (flatten) + (map #(string/replace % #"\d" "")) + (reverse))) (comment - (phrase->flex-rhyme-phones "bother me") + (phrase->flex-rhyme-phones "bother hello") + ;; => ("OW" "AH" "ER" "AA") ) (defonce context (atom {})) @@ -506,25 +529,316 @@ (swap! context assoc - :flex-rhyme-trie3' + :flex-rhyme-trie (transduce (comp (map (fn [[k v]] + (println (string/join " " (map (@context :database) k))) [(string/join " " (map (@context :database) k)) [k v]])) (map (fn [[phrase [k v]]] - [(reverse (phrase->flex-rhyme-phones phrase)) + [(phrase->flex-rhyme-phones phrase) [k v]]))) (completing (fn [trie [k v]] (update trie k (fnil conj [v]) v))) (trie/make-trie) - (trie/children-at-depth (@context :trie) 0 3)))) + (->> (trie/children-at-depth (@context :trie) 1 2) + (drop 500050) + (take 20))))) nil) ) (comment + (->> (get (@context :flex-rhyme-trie) ["EH" "OW" "IY" "EH"]) + (take 20) + (map first) + (map (partial map (@context :database)))) + (trie/children (trie/lookup (@context :trie) [13393])) + ((@context :database) "desk") ;; => 13393 + ((@context :database) "wobbly") ;; => 152750 + (get (@context :trie) [13393 152750])) + +(defn rhyme-choices + [{:keys [flex-rhyme-trie database] :as context} phrase] + (let [phones (phrase->flex-rhyme-phones phrase)] + (get flex-rhyme-trie phones))) + +(defn exclude-non-rhymes-from-choices + "Removes any choice that includes the last + word of the rhyming phrase as the last word of the choice. + + Also removes beginning and end of sentence markers (1 and 38 in the database)." + [{:keys [database]} phrase choices] + (let [word-id (database (last (string/split phrase #" ")))] + (remove + (fn [child] + (or (= ((comp first second) child) word-id) + (#{1 38} ((comp first first) child)))) + choices))) + +(defn exclude-non-english-phrases-from-choices + [{:keys [database]} choices] + (filter + (fn [choice] + (->> (first choice) + (map database) + (every? dict/cmu-with-stress-map))) + choices)) + +(defn weighted-selection-from-choices + [choices] + (math/weighted-selection + (comp second second) + choices)) + +(defn choice->n-gram + [{:keys [database]} choice] + (map database (first choice))) + +(defn generate-rhyming-n-gram + [phrase] + (->> (rhyme-choices @context phrase) + (exclude-non-rhymes-from-choices @context phrase) + (weighted-selection-from-choices) + (choice->n-gram @context))) + +(defn get-flex-rhyme + "Gets from a rhyme-trie a rhyming n-gram based on the + weighted selection from their frequencies." + [{:keys [flex-rhyme-trie database] :as context} phrase] + (let [phones (phrase->flex-rhyme-phones phrase) + ;; Exclude the last word. Don't rhyme kodak with kodak. + word-id (database (first (string/split phrase #" "))) + choices (remove + (fn [child] + (= (first child) word-id)) + (get flex-rhyme-trie phones)) + choice (math/weighted-selection + (comp second second) + choices)] + (map database (first choice)))) + +(comment + (get-flex-rhyme @context "bother me") + + ) +(defn get-next-markov + [{:keys [trie database] :as context} seed] + (let [seed (take-last 3 seed) + node (trie/lookup trie seed) + children (and node + (->> node + trie/children + (map #(vector (.key %) (get % []))) + (remove (comp nil? second)) + (remove + (fn [[k v]] + (#{1 38} k)))))] + (cond + (nil? node) (recur context (rest seed)) + (seq children) + (if (< (rand) (/ (apply max (map (comp second second) children)) + (apply + (map (comp second second) children)))) + (recur context (rest seed)) + (first (math/weighted-selection (comp second second) children))) + (> (count seed) 0) + (recur context (rest seed)) + :else (throw (Exception. "Error"))))) + +(defn get-next-markov-from-phrase-backwards + [{:keys [database trie] :as context} phrase n] + (let [word-ids (->> phrase + (#(string/split % #" ")) + (take n) + (reverse) + (map database))] + (database (get-next-markov context word-ids)))) + +(comment + (get-next-markov @context [222]) + (get-next-markov-from-phrase-backwards @context "will strike you down" 3) + + (get (@context :database) 7982) + ) +(defn ids->words + [{:keys [database] :as context} ids] + (map database ids)) + +(defn words->syllables + [words] + (->> words + (string/join " ") + (reverse (phrase->flex-rhyme-phones)))) + +(defn generate-sentence-with-n-words + [{:keys [database] :as context} seed n] + (loop [seed seed] + (if (>= (dec n) (count seed)) + (recur (conj seed (get-next-markov context seed))) + (map database seed)))) + +(defn take-words-amounting-to-more-at-least-n-syllables + [phrase n] + (letfn [(phones [word] + [word (first (owoga.phonetics/get-phones word))]) + (syllables [[word phones]] + [word (owoga.syllabify/syllabify phones)])] + (->> phrase + (#(string/split % #" ")) + (map phones) + (map syllables) + (reduce + (fn [result [word syllables]] + (if (<= n (count (mapcat second result))) + (reduced result) + (conj result [word syllables]))) + []) + (map first) + (string/join " ")))) + +(defn valid-english-sentence? + [phrase] + (let [words (string/split #" " phrase)] + (and (nlp/valid-sentence? phrase) + (every? dict/cmu-with-stress-map words)))) + + +(defn sha256 [text] + (let [digest (java.security.MessageDigest/getInstance "SHA-256")] + (->> (.digest digest (.getBytes text "UTF-8")) + (#(BigInteger. 1 %)) + (#(.toString % 16))))) + +(defn syllable-count-phrase + [phrase] + (->> phrase + (#(string/split % #" ")) + (map owoga.phonetics/get-phones) + (map first) + (mapcat owoga.syllabify/syllabify) + count)) + +(defn rhyming-n-gram-choices + [context target-rhyme] + (loop [target-rhyme target-rhyme] + (let [choices (->> target-rhyme + (rhyme-choices context) + (exclude-non-rhymes-from-choices context target-rhyme) + (exclude-non-english-phrases-from-choices context))] + (if (empty? choices) + (recur (string/join " " (rest (string/split target-rhyme #" ")))) + choices)))) + +(defn generate-n-syllable-sentence-rhyming-with + [context target-phrase n] + (let [target-phrase-words (string/split target-phrase #" ") + reversed-target-phrase (string/join " " (reverse target-phrase-words)) + target-rhyme + (->> (take-words-amounting-to-more-at-least-n-syllables + reversed-target-phrase + 5) + (#(string/split % #" ")) + reverse + (string/join " ")) + rhyming-n-gram (->> (rhyming-n-gram-choices context target-rhyme) + (weighted-selection-from-choices) + (choice->n-gram context) + (string/join " "))] + (loop [phrase rhyming-n-gram] + (if (<= n (syllable-count-phrase phrase)) + phrase + (recur + (str (get-next-markov-from-phrase-backwards context phrase 5) + " " + phrase)))))) + +(generate-n-syllable-sentence-rhyming-with @context "instead of war on poverty" 8) + +((@context :database) "poverty") +(defn amul8 + ([sentence] + (->> (amulate (string/split sentence #" ")) + (map reverse) + (map (partial string/join " ")))) + ([sentence n] + (loop [result [sentence] + n n] + (if (zero? n) + result + (recur (conj result (amul8 (peek result))) + (dec n)))))) + +(defn amulate? + [text] + (let [digest (sha256 text)] + (re-matches #"8{4}" digest))) + +(defn continuously-amulate + [seed] + (let [next-sentence (amul8 seed) + next-seed (->> next-sentence + (#(string/split % #" ")) + (reverse) + (map + (fn [word] + [word (phrase->flex-rhyme-phones word)])) + ((fn [word-phones] + (loop [word-phones word-phones + seed []] + (println (mapcat second seed)) + (if (< 2 (count (mapcat second seed))) + (string/join + " " + (reverse (map first seed))) + (recur (rest word-phones) + (conj seed (first word-phones))))))))] + (println next-seed) + (lazy-seq + (cons next-sentence (continuously-amulate next-seed))))) + +(comment + (take 5 (continuously-amulate "technology")) + + (->> (amul8 "technology" 1) + (map second) + (partition 2 1) + (map + (fn [pair] + (string/join "\n" pair))) + (map #(vector % (sha256 %))) + (map + (fn [[text sha]] + [text sha (re-matches #"8{4}" sha)])) + (map println)) + + (dict/cmu-with-stress-map ) + (repeatedly + 3 + #(amulate (reverse ["pleasure" "of" "the" "arcane" "technology"]))) + + (phrase->flex-rhyme-phones "bother hello") + (phrase->flex-rhyme-phones "snow-covered on") + (get-flex-rhyme @context (reverse ["AA" "ER" "AH" "OW"])) + ((@context :database) "") + (get-next-markov @context [1 503]) + + (take 20 + (repeatedly #(reverse (get-flex-rhyme @context + (reverse (phrase->flex-rhyme-phones "technology")) + "technology")))) + + (amulate) + + (get (@context :database) "") + (get (@context :database) "technology") + (phrase->flex-rhyme-phones "able") ;; => ("EY" "AH") + (phrase->flex-rhyme-phones "away") ;; => ("AH" "EY") + (take 20 (@context :flex-rhyme-trie)) + (get-flex-rhyme @context '("AA" "IY" "AE")) + + (map #(get (@context :database) %) [1 503]) (time (count (tpt/children-at-depth (@context :trie) 0 2))) (->> (trie/children-at-depth (@context :flex-rhyme-trie') 0 5) @@ -547,7 +861,7 @@ (first (@context :trie)) ;; 448351 ;; 4388527 - (initialize) + (time (initialize)) ) @@ -807,6 +1121,9 @@ (def rhyme-database (atom {})) + (def db + (nippy/thaw-from-file (io/resource "dark-corpus-4-gram-backwards-db.bin"))) + (def perfect-rhyme-trie (transduce (comp @@ -838,7 +1155,8 @@ (fn [trie [k v]] (update trie k (fnil #(update % 1 inc) [v 0])))) (trie/make-trie) - @loaded-backwards-database)) + (take 1000 db))) + (take 20 vowel-rhyme-trie) ) #_(with-open [wtr (clojure.java.io/writer "database.bin")] diff --git a/src/com/owoga/prhyme/data_transform.clj b/src/com/owoga/prhyme/data_transform.clj index a45e2fb..b90f34e 100644 --- a/src/com/owoga/prhyme/data_transform.clj +++ b/src/com/owoga/prhyme/data_transform.clj @@ -7,7 +7,8 @@ [com.owoga.tightly-packed-trie :as tpt] [com.owoga.tightly-packed-trie.encoding :as encoding] [taoensso.nippy :as nippy] - [com.owoga.prhyme.nlp.tag-sets.treebank-ii :as tb2])) + [com.owoga.prhyme.nlp.tag-sets.treebank-ii :as tb2] + [clojure.zip :as zip])) (def re-word "Regex for tokenizing a string into words @@ -165,7 +166,7 @@ (map (make-database-processor database))) (completing (fn [trie [k v]] - (update trie k (fnil inc 0)))) + (update trie k (fnil #(update % 1 inc) [k 0])))) (trie/make-trie) files)) @@ -173,7 +174,103 @@ [trie encode-fn decode-fn] (tpt/tightly-packed-trie trie encode-fn decode-fn)) +(def texts (eduction + (comp (xf-file-seq 0 10) + (map slurp)) + (file-seq (io/file "dark-corpus")))) + +(defn split-text-into-sentences + [text] + (->> text + (#(string/replace % #"([\.\?\!])" "$1\n")) + (string/split-lines))) + +(defn mapmap + [fn & body] + (apply map (partial map fn) body)) + +(defn treebank-zipper->trie-map-entries + [treebank-zipper] + (let [leaf-paths (nlp/leaf-pos-paths treebank-zipper)] + leaf-paths)) + (comment + (treebank-zipper->trie-map-entries + (zip/seq-zip + '(TOP + ((S + ((NP + ((NP ((NN ("Everything")))) (PP ((IN ("of")) (NP ((NN ("today")))))))) + (VP ((VBZ ("is")) (VP ((VBG ("falling")))))) + (. (".")))))))) + + (defn breadth-first-search [z] + (letfn [(zip-children [loc] + (when-let [first-child (zip/down loc)] + (take-while (comp not nil?) + (iterate zip/right first-child))))] + (loop [ret [] + queue (conj clojure.lang.PersistentQueue/EMPTY z)] + (if (seq queue) + (let [[node children] ((juxt zip/node zip-children) (peek queue))] + (recur (conj ret node) (into (pop queue) children))) + ret)))) + + (filter + symbol? + (breadth-first-search + (zip/seq-zip + '(TOP + ((S + ((NP + ((NP ((NN ("Everything")))) (PP ((IN ("of")) (NP ((NN ("today")))))))) + (VP ((VBZ ("is")) (VP ((VBG ("falling")))))) + (. ("."))))))))) + + (->> (zip/seq-zip + '(TOP + ((S + ((NP + ((NP ((NN ("Everything")))) (PP ((IN ("of")) (NP ((NN ("today")))))))) + (VP ((VBZ ("is")) (VP ((VBG ("falling")))))) + (. ("."))))))) + (iterate zip/next) + (take 10) + last + (zip/path) + (map first) + (filter symbol?)) + + ) + +(defn process-text + [text] + (->> text + (split-text-into-sentences) + (map string/trim) + (map nlp/treebank-zipper) + (map nlp/leaf-pos-paths))) + +(comment + (into + #_(trie/make-trie) + [] + (map process-text) + texts) + + ) + +(comment + (let [database (atom {:next-id 1}) + trie (file-seq->trie + database + (transduce + (xf-file-seq 0 2) + conj + (file-seq (io/file "dark-corpus"))) + 1 4)] + trie) + (time (let [database (atom {:next-id 1}) trie (transduce diff --git a/src/com/owoga/prhyme/generation/markov_example.clj b/src/com/owoga/prhyme/generation/markov_example.clj index 5502ae7..5e5f772 100644 --- a/src/com/owoga/prhyme/generation/markov_example.clj +++ b/src/com/owoga/prhyme/generation/markov_example.clj @@ -356,7 +356,9 @@ (comp (map (fn [child] (vector generated-text child))) - xf-filter) conj children)] + xf-filter) + conj + children)] (cond (nil? node) (recur trie generated-text (butlast k) xf-filter) @@ -589,7 +591,7 @@ ;; Turning a word frequency into a phoneme trie (transduce (comp - (drop 10) + (drop 100000) (take 20) (map first) (map (partial remove #{1 7})) ;; and @@ -617,7 +619,8 @@ conj (trie/children-at-depth tpt 0 2)) - rhyme-trie + (take 20 (drop 100 @rhyme-trie)) + (take 20 (trie/children-at-depth tpt 0 2)) (let [words [[[["DH" "IH1" "S"] ["DH" "IH0" "S"]] [["IH1" "Z"] ["IH1" "S"]]] '(11 77)]] diff --git a/src/com/owoga/prhyme/nlp/core.clj b/src/com/owoga/prhyme/nlp/core.clj index a7d0557..cc30c8a 100644 --- a/src/com/owoga/prhyme/nlp/core.clj +++ b/src/com/owoga/prhyme/nlp/core.clj @@ -1091,3 +1091,97 @@ (top-k-sequences prhyme-pos-tagger (tokenize ""))) ) + +(defn loc-path + [loc] + (->> loc + zip/path + (map first) + (filter symbol?))) + +(defn breadth-first + [zipper] + (letfn [(zip-children [loc] + (when-let [first-child (zip/down loc)] + (take-while + (comp not nil?) + (iterate zip/right first-child))))] + (loop [result [] + queue (conj clojure.lang.PersistentQueue/EMPTY zipper)] + (if (seq queue) + (let [[zipper children] ((juxt identity zip-children) (peek queue))] + (recur (conj result zipper) (into (pop queue) children))) + result)))) + +(defn loc-children + [loc] + (when-let [first-child (zip/down loc)] + (->> (take-while + (complement nil?) + (iterate zip/right first-child)) + (map first)))) + +(defn part-of-speech-children + [loc] + (->> loc + (iterate zip/next) + (take-while (complement zip/end?)) + (map (fn [loc] + (when (symbol? (zip/node loc)) + [(->> (zip/path loc) + (map first)) + (->> (zip/right loc) + (zip/node) + (map first))]))) + (remove (comp nil? second)))) + +(comment + (->> (zip/vector-zip [1 [2 [3]]]) + (iterate zip/next) + (take 6) + last + zip/path + (map first)) + + (->> (breadth-first + (zip/seq-zip + '(TOP + ((S + ((NP + ((NP ((NN ("Everything")))) (PP ((IN ("of")) (NP ((NN ("today")))))))) + (VP ((VBZ ("is")) (VP ((VBG ("falling")))))) + (. (".")))))))) + (map loc-children) + (filter seq?) + ) + + (part-of-speech-children + (zip/seq-zip + '(TOP + ((S + ((NP + ((NP ((NN ("Everything")))) (PP ((IN ("of")) (NP ((NN ("today")))))))) + (VP ((VBZ ("is")) (VP ((VBG ("falling")))))) + (. (".")))))))) + + (->> (zip/seq-zip + '(TOP + ((S + ((NP + ((NP ((NN ("Everything")))) (PP ((IN ("of")) (NP ((NN ("today")))))))) + (VP ((VBZ ("is")) (VP ((VBG ("falling")))))) + (. ("."))))))) + (zip/next) + (zip/next) + (zip/next) + (zip/next) + (zip/next) + (zip/node) + #_#_(loc-children) + (map first)) + + ) +(comment + (defn part-of-speech-n-grams + [zipper] + (letfn [(fn step [path []])])))