From f408778c9b8c157b74d9aeb1c69841a49c352b1a Mon Sep 17 00:00:00 2001 From: Eric Ihli Date: Sun, 2 May 2021 13:04:33 -0500 Subject: [PATCH] Generation misc --- dev/examples/tpt.clj | 35 ++- src/com/owoga/prhyme/nlg/prhyme_nlg.clj | 301 ++++++++++++++++++++++-- src/com/owoga/prhyme/nlp/core.clj | 14 +- 3 files changed, 308 insertions(+), 42 deletions(-) diff --git a/dev/examples/tpt.clj b/dev/examples/tpt.clj index 77811f0..7244cd9 100644 --- a/dev/examples/tpt.clj +++ b/dev/examples/tpt.clj @@ -533,7 +533,6 @@ (transduce (comp (map (fn [[k v]] - (println (string/join " " (map (@context :database) k))) [(string/join " " (map (@context :database) k)) [k v]])) (map (fn [[phrase [k v]]] @@ -543,11 +542,10 @@ (fn [trie [k v]] (update trie k (fnil conj [v]) v))) (trie/make-trie) - (->> (trie/children-at-depth (@context :trie) 1 2) - (drop 500050) - (take 20))))) + (->> (trie/children-at-depth (@context :trie) 0 1))))) nil) + (take 5 (@context :flex-rhyme-trie)) ) (comment @@ -755,10 +753,17 @@ (exclude-non-english-phrases-from-choices context))] (if (empty? choices) (recur (if (string? target-rhyme) - (rest (phrase->flex-rhyme-phones context target-rhyme)) - (rest target-rhyme))) + (butlast (phrase->flex-rhyme-phones target-rhyme)) + (butlast target-rhyme))) choices)))) +(comment + (->> (rhyming-n-gram-choices @context "fall") + (map (comp (@context :database) first first))) + + ) + + (defn generate-n-syllable-sentence-rhyming-with [context target-phrase n-gram-rank target-rhyme-syllable-count target-sentence-syllable-count] (if (string? target-phrase) @@ -824,14 +829,15 @@ (map last) (apply distinct?)))) - (->> (generate-haiku "technology") + (->> (generate-haiku "football") (filter valid-haiku) (map (partial string/join "\n")) (map #(vector % (sha256 %))) (map (fn [[haiku sha]] (println haiku) (println sha) - (println)))) + (println))) + (take 10)) ) @@ -865,19 +871,6 @@ witness sky is blackened now " -(defn amul8 - ([sentence] - (->> (amulate (string/split sentence #" ")) - (map reverse) - (map (partial string/join " ")))) - ([sentence n] - (loop [result [sentence] - n n] - (if (zero? n) - result - (recur (conj result (amul8 (peek result))) - (dec n)))))) - (defn amulate? [text] (let [digest (sha256 text)] diff --git a/src/com/owoga/prhyme/nlg/prhyme_nlg.clj b/src/com/owoga/prhyme/nlg/prhyme_nlg.clj index 2b93bb9..80acd2e 100644 --- a/src/com/owoga/prhyme/nlg/prhyme_nlg.clj +++ b/src/com/owoga/prhyme/nlg/prhyme_nlg.clj @@ -3,16 +3,20 @@ [clojure.string :as string] [taoensso.timbre :as timbre] [com.owoga.prhyme.util.math :as math] + [com.owoga.phonetics.syllabify :as owoga.syllabify] + [com.owoga.phonetics :as owoga.phonetics] [com.owoga.tightly-packed-trie.encoding :as encoding] [examples.core :as examples] [taoensso.nippy :as nippy] [com.owoga.prhyme.nlp.core :as nlp] + [examples.tpt :as examples.tpt] [clojure.java.io :as io] [com.owoga.prhyme.data-transform :as df] [com.owoga.trie :as trie] [com.owoga.prhyme.util.weighted-rand :as weighted-rand] [clojure.set :as set] - [com.owoga.tightly-packed-trie :as tpt])) + [com.owoga.tightly-packed-trie :as tpt] + [com.owoga.prhyme.nlp.tag-sets.treebank-ii :as tb2])) (defn update-values [m f & args] (reduce @@ -438,8 +442,11 @@ (update trie k (fnil inc 0))) trie entries))) - (trie/make-trie) - (take 300 texts)))) + #_(trie/make-trie) + test-trie + (->> texts + (drop 4000) + (take 4000))))) ) @@ -520,8 +527,61 @@ (map zip/node) ()) + (nippy/freeze-to-file "resources/grammar-trie-take-8000.bin" (seq test-trie)) + + (nippy/freeze-to-file "resources/grammar-database-take-8000.bin" @test-database) + ) +(defn phrase->flex-rhyme-phones + "Takes a space-seperated string of words + and returns the concatenation of the words + vowel phones. + + Returns them in reversed order so they + are ready to be used in a lookup of a rhyme trie. + " + [phrase] + (->> phrase + (#(string/split % #" ")) + (map (comp owoga.syllabify/syllabify first owoga.phonetics/get-phones)) + (map (partial reduce into [])) + (map #(filter (partial re-find #"\d") %)) + (flatten) + (map #(string/replace % #"\d" "")) + (reverse))) + +(defn markov-generate-grammar-with-rhyming-tail + [grammar-trie grammar-database rhyme-trie rhyme-database rhyme-target zipper] + (let [rhyme-phones (phrase->flex-rhyme-phones rhyme-target) + rhyme-options (examples.tpt/rhyming-n-gram-choices + {:database rhyme-database + :flex-rhyme-trie rhyme-trie} + rhyme-target) + rhyme-option-words (map (comp rhyme-database first first) rhyme-options) + grammar (markov-generate-grammar + grammar-trie + grammar-database + zipper) + tail [(grammar-database (zip/node (zipper-last (zip/vector-zip grammar))))] + choices (map (comp grammar-database first) + (children grammar-trie grammar-database tail)) + intersection (set/intersection + (into #{} rhyme-option-words) + (into #{} choices))] + (if (empty? intersection) + (do + (println (take 5 choices) + (take 5 rhyme-option-words)) + (markov-generate-grammar-with-rhyming-tail + grammar-trie + grammar-database + rhyme-trie + rhyme-database + rhyme-target + zipper)) + intersection))) + (defn markov-generate-sentence [trie database zipper] (cond @@ -580,7 +640,10 @@ (zip/vector-zip))))) (comment - (generate test-trie @test-database (zip/vector-zip [1])) + (markov-generate-sentence + test-trie + @test-database + (zip/vector-zip [1])) (repeatedly 20 @@ -626,6 +689,7 @@ (map zip/node))) (defn choose-with-n-gram-markov + "Hard-coded to work with 4-gram. That's the at the end." [zipper grammar-trie grammar-database @@ -633,7 +697,7 @@ n-gram-database] (let [prev-pos (previous-leaf-part-of-speech zipper) prev-pos' (map grammar-database prev-pos) - n-gram (filter string? (rest-leafs zipper)) + n-gram (filter string? (concat (rest-leafs zipper) ["" "" ""])) n-gram' (mapv tpt-db n-gram) part-of-speech-children (->> (children grammar-trie grammar-database (take-last 1 prev-pos')) (map #(vector (grammar-database (first %)) @@ -641,15 +705,14 @@ grammar-children (->> (children grammar-trie grammar-database prev-pos') (map #(vector (grammar-database (first %)) (second %)))) - n-gram-children (->> n-gram' - (take 2) - (reverse) - (trie/lookup n-gram-trie) - (trie/children) - (map #(vector (n-gram-database (.key %)) (get % [])))) + n-gram-children (loop [n-gram' (reverse (remove nil? (take 4 n-gram')))] + (if-let [node (trie/lookup n-gram-trie n-gram')] + (->> (trie/children node) + (map #(vector (n-gram-database (.key %)) (get % [])))) + (recur (rest n-gram')))) combined-choices (reduce (fn [acc [k v]] - (update acc k (fnil * v))) + (update acc k (fnil + v))) (into {} grammar-children) n-gram-children) intersection (set/intersection @@ -670,9 +733,9 @@ n-gram-children combined-choices choice] - )) + (first choice))) -(defn previous-leaf-part-of-speech +(defn previous-leaf-loc [zipper] (->> zipper (iterate zip/prev) @@ -680,11 +743,32 @@ (filter #(and (symbol? (zip/node %)) (zip/up %) (= 1 (count (zip/node (zip/up %)))))) - (first) + (first))) + +(defn previous-leaf-part-of-speech + [zipper] + (->> zipper + previous-leaf-loc (zip/path) (map first) (filter symbol?))) +(defn nearest-ancestor-phrase + [loc] + (->> loc + (iterate zip/prev) + (take-while (complement nil?)) + (filter (comp tb2/phrases zip/node)) + (first))) + +(comment + (nearest-ancestor-phrase + (->> (zip/vector-zip + '[NP [NN]]) + zip/down + zip/right + zip/down))) + (comment ;; Working backwards from a completed grammar tree that has ;; been partially filled in with words, choose the next likely word @@ -742,13 +826,17 @@ n-gram' prev-pos prev-pos' + part-of-speech-children grammar-children n-gram-children combined-choices choice] (choose-with-n-gram-markov loc test-trie @test-database tpt tpt-db)] - [ + [prev-pos + (take 5 grammar-children) + (take 5 n-gram-children) + (take 5 combined-choices) choice])) (trie/lookup test-trie [1 59 3 5 5 17]) @@ -761,6 +849,30 @@ (def tpt-db (nippy/thaw-from-file (io/resource "dark-corpus-4-gram-backwards-db.bin"))) (markov-generate-grammar test-trie @test-database (zip/vector-zip [1])) + (-> (markov-generate-grammar test-trie @test-database (zip/vector-zip [1])) + (zip/vector-zip) + (zipper-last) + (visitor + zip/prev + (fn [loc] + (let [k (filter symbol? (map first (zip/path loc)))] + (if (and (symbol? (zip/node loc)) + (zip/up loc) + (= 1 (count (zip/node (zip/up loc)))) + (not-empty k)) + (let [k' (map @test-database k) + choice (choose-with-n-gram-markov + loc + test-trie + @test-database + tpt + tpt-db)] + (zip/replace + loc + [(zip/node loc) + [choice]])) + loc))))) + (-> (markov-generate-grammar test-trie @test-database (zip/vector-zip [1])) zip/vector-zip (zipper-last) @@ -773,8 +885,8 @@ (= 1 (count (zip/node (zip/up loc)))) (not-empty k)) (let [k' (map @test-database k) - choice (@test-database (first (choose test-trie @test-database k')))] - (println k') + choice (@test-database (first (choose + test-trie @test-database k')))] (zip/replace loc [(zip/node loc) @@ -783,3 +895,156 @@ (@test-database 497) ) + + +(defn markov-choose-words-for-grammar + [grammar-trie + grammar-database + n-gram-trie + n-gram-database + grammar] + (-> grammar + (visitor + zip/prev + (fn [loc] + (println (zip/node loc)) + (if (and (tb2/words (zip/node loc)) + (nil? (zip/right loc))) + (do + (println "inserting right" (zip/node loc)) + (zip/insert-right + loc + [(choose-with-n-gram-markov + loc + grammar-trie + grammar-database + n-gram-trie + n-gram-database)])) + loc))))) + +(comment + (let [grammar '[[TOP [[S [[NP [[NNS]]] [VP [[VBP]]] [NP [[NNS ["taylor"]]]]]]]]] + sentence (markov-choose-words-for-grammar + test-trie + @test-database + tpt + tpt-db + (->> (zip/vector-zip grammar) + zipper-last + previous-leaf-loc)) + grammar2 `[[TOP [[S [[NP [[NNS]] [VP [[VBP]]] [NP [[NNS]]]]]]]]] + rhyme (random-sample + (markov-generate-grammar-with-rhyming-tail + grammar2 + grammar-database + rhyme-trie + rhyme-database + "taylor" + ))] + sentence) + + (markov-generate-grammar + test-trie + @test-database + (zip/vector-zip [1])) + + (map @test-database '[TOP S NP NNS VP VBP]) ;; => (1 3 5 74 7 53) + + + (markov-generate-grammar-with-rhyming-tail + test-trie + @test-database + rhyme-trie + rhyme-database + "taylor" + (zip/vector-zip [1])) + + ) + +(defn markov-complete-grammar-with-rhyming-tail + [grammar-trie grammar-database rhyme-trie rhyme-database grammar rhyme-target] + (let [rhyme-phones (phrase->flex-rhyme-phones rhyme-target) + rhyme-options (examples.tpt/rhyming-n-gram-choices + {:database rhyme-database + :flex-rhyme-trie rhyme-trie} + rhyme-target) + rhyme-option-words (map (comp rhyme-database first first) rhyme-options) + tail [(grammar-database (zip/node (zipper-last (zip/vector-zip grammar))))] + choices (map (comp grammar-database first) + (children grammar-trie grammar-database tail)) + intersection (set/intersection + (into #{} rhyme-option-words) + (into #{} choices))] + (if (empty? intersection) + (markov-complete-grammar-with-rhyming-tail + grammar-trie + grammar-database + rhyme-trie + rhyme-database + rhyme-target) + intersection))) + +(defn markov-generate-sentence + [trie database zipper] + (cond + (zip/end? zipper) + (zip/root zipper) + + (seqable? (zip/node zipper)) + (recur trie database (zip/next zipper)) + + (symbol? (zip/node zipper)) + (recur trie database (zip/next zipper)) + + (symbol? (database (zip/node zipper))) + (let [sym (database (zip/node zipper)) + sym-path (->> (map first (zip/path zipper)) + butlast + (filter symbol?) + (#(concat % (list sym)))) + path (map database sym-path) + choice (first (choose trie database path))] + (recur + trie + database + (-> zipper + (zip/replace + [sym choice]) + (zip/root) + (zip/vector-zip)))) + + (string? (database (zip/node zipper))) + (let [terminal (database (zip/node zipper)) + path (->> (map first (zip/path zipper)) + butlast + (filter symbol?))] + (recur + trie + database + (-> zipper + (zip/replace + terminal) + (zip/next) + (zip/root) + (zip/vector-zip)))) + + :else + (recur + trie + database + (-> zipper + (zip/replace + (mapv + database + (database (zip/node zipper)))) + (zip/next) + (zip/root) + (zip/vector-zip))))) + + +(comment + (generate-grammar-from [[NN ["taylor"]]]) + (map (comp (partial map @test-database) first) (take 5 test-trie)) + + (@test-database 1) + ) diff --git a/src/com/owoga/prhyme/nlp/core.clj b/src/com/owoga/prhyme/nlp/core.clj index e2e60cb..f3b7589 100644 --- a/src/com/owoga/prhyme/nlp/core.clj +++ b/src/com/owoga/prhyme/nlp/core.clj @@ -68,7 +68,7 @@ (ParserFactory/create (ParserModel. (io/input-stream (io/resource "models/en-parser-chunking.bin"))) - 3 + 5 0.95)) (defn parse-probs [parses] @@ -83,13 +83,20 @@ (string/split results #"\n"))) (comment + (- (Math/log 0.001) (Math/log 0.01)) + (Math/E) (tokenize "Eric's testing.") (let [results (StringBuffer.) - parses (ParserTool/parseLine "Eric 's testing ." custom-parser 3)] + parses (ParserTool/parseLine "The dog ran fast ." custom-parser 1)] ((juxt parse-probs parse-strs) parses)) + (let [results (StringBuffer.) + parses (ParserTool/parseLine "Eric 's testing ." custom-parser 1)] + (meta parses)) + ) +(Math/log (Math/pow Math/E -4.1)) (defn parse-top-n [tokenized n] (let [results (StringBuffer.) parses (ParserTool/parseLine tokenized custom-parser n)] @@ -1276,6 +1283,7 @@ (map string/trim) (filter english?) (remove empty?) + (filter valid-sentence?) (mapv treebank-zipper) (remove nil?) (map parts-of-speech-trie-entries) @@ -1287,7 +1295,7 @@ (clojure.lang.MapEntry. (into (vec k) [v]) v))))) (comment - (let [text "Hi my name. Is Eric? \n What is yours? Fooaba brosaet"] + (let [text "Hi my name. Is Eric? \n What is yours? Fooaba brosaet. The run dog go for the."] (text->grammar-trie-map-entry text) #_(->> text -split-text-into-sentences