diff --git a/README.rst b/README.rst index bc4c034..1561486 100644 --- a/README.rst +++ b/README.rst @@ -61,6 +61,24 @@ words, then you're stuck trying to rhyme with the single syllable Implementation -------------- +2021-06-09 +++++++++++ + +Most generation tasks are going to require some big data structures, like a Trie of n-grams. + +A ``context`` is an atom that gets updated with those data structures. + +Loading some of these data structures can take a long time, so only load what you need. + +An example of the different data structures you might load: + +Alliterations - From the database of n-grams, convert each n-gram to syllables then create a trie of the alliterations. +Perfect rhymes - Again, from the database of n-grams, convert n-gram to syllables and create trie of reverse of syllables. +Imperfect rhymes - Perform some manipulation of the syllables so that you can be more flexible with your rhymes. + +One key that is probably always required is the ``database``. This maps words to their IDs and IDs to their words. The integer +IDs are necessary for tightly packed tries. + 2020-10-20 ++++++++++ diff --git a/deps.edn b/deps.edn index e3828bc..65152f4 100644 --- a/deps.edn +++ b/deps.edn @@ -21,7 +21,7 @@ com.taoensso/nippy {:mvn/version "3.0.0"} com.taoensso/timbre {:mvn/version "4.10.0"} com.owoga/tightly-packed-trie - {:local/root "/home/eihli/code/tightly-packed-trie"} + {:local/root "/home/eihli/src/clj-tightly-packed-trie"} com.owoga/phonetics {:mvn/version "0.1.1"}} :aliases {:dev {:extra-paths ["test" "examples" "dev"] :extra-deps {}}}} diff --git a/dev/examples/tpt.clj b/dev/examples/tpt.clj index 7244cd9..30a5dff 100644 --- a/dev/examples/tpt.clj +++ b/dev/examples/tpt.clj @@ -18,6 +18,7 @@ [com.owoga.prhyme.syllabify :as syllabify] [taoensso.nippy :as nippy])) +(set! *warn-on-reflection* true) (tufte/add-basic-println-handler! {}) (defn xf-file-seq [start end] @@ -505,7 +506,7 @@ (trie/make-trie) (@context :database))) - #_(swap! + (swap! context assoc :flex-rhyme-trie @@ -546,6 +547,7 @@ nil) (take 5 (@context :flex-rhyme-trie)) + ) (comment @@ -553,17 +555,31 @@ (take 20) (map first) (map (partial map (@context :database)))) + (trie/children (trie/lookup (@context :trie) [13393])) ((@context :database) "desk") ;; => 13393 ((@context :database) "wobbly") ;; => 152750 (get (@context :trie) [13393 152750])) (defn rhyme-choices - [{:keys [flex-rhyme-trie database] :as context} phrase] + [{:keys [rhyme-trie database] :as context} phrase] (if (string? phrase) - (let [phones (phrase->flex-rhyme-phones phrase)] - (get flex-rhyme-trie phones)) - (get flex-rhyme-trie phrase))) + (let [phones (phrase->phones phrase)] + (get rhyme-trie phones)) + (get rhyme-trie phrase))) + +(comment + (get (:rhyme-trie @context) (phrase->phones "fall")) + + (->> (rhyme-choices + @context + "fall") + #_(map (comp (:database @context) first first))) + + (take 20 (:flex-rhyme-trie @context)) + (take 20 (:rhyme-trie @context)) + + ) (defn exclude-non-rhymes-from-choices "Removes any choice that includes the last @@ -633,8 +649,11 @@ (comment (get-flex-rhyme @context "bother me") + (phrase->flex-rhyme-phones "bother me") + (get-flex-rhyme @context ["IY" "ER" "AA"]) + ) (defn get-next-markov @@ -644,7 +663,9 @@ children (and node (->> node trie/children - (map #(vector (.key %) (get % []))) + (map (fn [^com.owoga.trie.ITrie child] + [(.key child) + (get child [])])) (remove (comp nil? second)) (remove (fn [[k v]] @@ -670,7 +691,14 @@ (database (get-next-markov context word-ids)))) (comment + (initialize) + + (get (@context :database) "") + (keys @context) + ((@context :database) (get-next-markov @context [1])) + (get-next-markov @context [222]) + (get-next-markov-from-phrase-backwards @context "will strike you down" 3) (get (@context :database) 7982) @@ -693,6 +721,12 @@ (map database seed)))) (defn take-words-amounting-to-at-least-n-syllables + "This function is nice to grab the tail end of a sentence for making a good rhyme. + If the sentence ends with a single-syllable word, like 'me', but a more + interesting n-gram like 'bother me', then you might want to explore the rhymes + available for the last N syllables. Sure, a word like 'poverty' would show up if you + got all rhymes for 'me'. But you'd have to filter through a lot more less-than-great + rhymes before you see it." [phrase n] (letfn [(phones [word] [word (first (owoga.phonetics/get-phones word))]) @@ -711,7 +745,16 @@ (map first) (string/join " ")))) +(comment + (take-words-amounting-to-at-least-n-syllables + "police can bother me" 3);; => "police can" + (take-words-amounting-to-at-least-n-syllables + "police can bother me" 4);; => "police can bother" + ) + (defn take-n-syllables + "Returns the vowel sounds that make up the last n syllables. + Doesn't return stress." [phrase n] (if (string? phrase) (->> phrase @@ -720,7 +763,9 @@ (reverse)) (take-last n phrase))) -(take-n-syllables "bother me" 2) +(comment + (take-n-syllables "bother me" 2);; => ("ER" "IY") + ) (defn valid-english-sentence? [phrase] @@ -747,7 +792,8 @@ (defn rhyming-n-gram-choices [context target-rhyme] (loop [target-rhyme target-rhyme] - (let [choices (->> target-rhyme + (let [context @context + choices (->> target-rhyme (rhyme-choices context) (exclude-non-rhymes-from-choices context target-rhyme) (exclude-non-english-phrases-from-choices context))] @@ -758,11 +804,12 @@ choices)))) (comment - (->> (rhyming-n-gram-choices @context "fall") + (->> (rhyming-n-gram-choices context "fall") (map (comp (@context :database) first first))) - ) + (rhyme-choices @context "tall") + ) (defn generate-n-syllable-sentence-rhyming-with [context target-phrase n-gram-rank target-rhyme-syllable-count target-sentence-syllable-count] @@ -801,6 +848,16 @@ " " phrase))))))) +(comment + (generate-n-syllable-sentence-rhyming-with + context + "war on poverty" + 3 + 3 + 8) + + ) + (defn generate-haiku [seed] (let [haiku (cons @@ -829,7 +886,9 @@ (map last) (apply distinct?)))) - (->> (generate-haiku "football") + (println (first (generate-haiku "fall"))) + + (->> (generate-haiku "") (filter valid-haiku) (map (partial string/join "\n")) (map #(vector % (sha256 %))) @@ -837,7 +896,7 @@ (println haiku) (println sha) (println))) - (take 10)) + (take 1)) ) @@ -850,7 +909,9 @@ @context (take 3 (phrase->flex-rhyme-phones "player unknown battlegrounds")) 3 4 %) - [9 6 6 9 6 6 9 6 6])))) + [9 6 6 9 6 6 9 6 6]))) + + ) " another day a battleground @@ -878,7 +939,7 @@ witness sky is blackened now (defn continuously-amulate [seed] - (let [next-sentence (amul8 seed) + (let [next-sentence (generate-haiku seed) next-seed (->> next-sentence (#(string/split % #" ")) (reverse) @@ -898,6 +959,8 @@ witness sky is blackened now (cons next-sentence (continuously-amulate next-seed))))) (comment + (generate-haiku "technology") + (take 5 (continuously-amulate "technology")) (->> (amul8 "technology" 1) @@ -1034,7 +1097,16 @@ witness sky is blackened now (defn choose-next-word "Given an n-gram of [[word1 freq1] [word2 freq2]] chooses - the next word based on markove data in trie." + the next word based on markov data in trie. + + Could be improved by taking into account grammar and/or bidirectional context. + + The n-gram parameter is a list of trie entries + For trie entries that are word/frequency pairs, it might look something like this. + `[[sunshine 38] [ 509]]` + + But note that nothing in this function uses the frequency count from the passed in n-gram. + It's just easier for the calling functions to pass them in like that." [{:keys [database trie] :as context} n-gram] (let [n-gram-ids (->> n-gram (map first) (map database)) node (trie/lookup trie n-gram-ids)] @@ -1053,6 +1125,7 @@ witness sky is blackened now n-minus-1-gram-odds (/ (second (first children-freqs)) (+ (second (get node [])) (second (first children-freqs)))) + ;; Good-turing smoothing, take unseen ngram? take-n-minus-1-gram? (and (< 1 (count n-gram-ids)) (< (rand) n-minus-1-gram-odds))] (if take-n-minus-1-gram? @@ -1137,22 +1210,19 @@ witness sky is blackened now (get trie ids)) (choose-next-word @context (take 3 [["" 509]])) - (generate-sentence-backwards @context [""]) + (generate-sentence-backwards @context ["kill" ""]) (valid-sentences? (generate-phrase @context '(["bitter" 41]))) - - (choose-next-word @context (take 3 [["theology" 41]])) (choose-next-word @context [["and" 5] ["theology" 41]]) (find-rhymes (@context :perfect-rhyme-trie) "theology") - (trie/chil(trie/lookup (@context :trie) '(57 2477))) + (trie/lookup (@context :trie) '(57 2477)) (take 5 (@context :trie)) - (->> (find-rhymes (@context :perfect-rhyme-trie) "technology") (map (fn [[word frq]] (let [n+1grams (word->n+1grams @@ -1162,101 +1232,55 @@ witness sky is blackened now (map vector n+1grams (repeat [word frq]))))) (reduce into [])) + (def loaded-backwards-trie + (tpt/load-tightly-packed-trie-from-file + "resources/dark-corpus-backwards-tpt.bin" + (decode-fn @trie-database))) + + (def loaded-backwards-database + (with-open [rdr (clojure.java.io/reader "resources/backwards-database.bin")] + (into {} (map read-string (line-seq rdr))))) + + (def rhyme-database (atom {})) + + (def db + (nippy/thaw-from-file (io/resource "dark-corpus-4-gram-backwards-db.bin"))) + + (def perfect-rhyme-trie + (transduce + (comp + (map first) + (filter string?) + (map #(vector % (reverse (word->phones %)))) + (map reverse)) + (completing + (fn [trie [k v]] + (update trie k (fnil #(update % 1 inc) [v 0])))) + (trie/make-trie) + @loaded-backwards-database)) + + (def vowel-rhyme-trie + (transduce + (comp + (map first) + (filter string?) + (map #(vector % (reverse (word->phones %)))) + (map reverse) + (map (fn [[phones v]] + [(map #(if (owoga.phonetics/vowel + (string/replace % #"\d" "")) + % + "?") + phones) + v]))) + (completing + (fn [trie [k v]] + (update trie k (fnil #(update % 1 inc) [v 0])))) + (trie/make-trie) + (take 1000 db))) + (take 20 vowel-rhyme-trie) - - - - - - - - - - - - - - - - - (do - #_(time - (def backwards-trie - (transduce (comp (xf-file-seq 0 250000) - (map slurp) - (map (partial n-to-m-backwards-grams 1 4)) - (map (fn [ngrams] (map #(prep-ngram-for-trie %) ngrams))) - stateful-transducer) - conj - (file-seq (io/file "dark-corpus"))))) - - #_(time - (def tightly-packed-backwards-trie - (tpt/tightly-packed-trie - backwards-trie - encode-fn - (decode-fn @trie-database)))) - - #_(tpt/save-tightly-packed-trie-to-file - "resources/dark-corpus-backwards-tpt.bin" - tightly-packed-backwards-trie) - #_(with-open [wtr (clojure.java.io/writer "resources/backwards-database.bin")] - (let [lines (->> (seq @trie-database) - (map pr-str) - (map #(str % "\n")))] - (doseq [line lines] - (.write wtr line)))) - - (def loaded-backwards-trie - (tpt/load-tightly-packed-trie-from-file - "resources/dark-corpus-backwards-tpt.bin" - (decode-fn @trie-database))) - - (def loaded-backwards-database - (with-open [rdr (clojure.java.io/reader "resources/backwards-database.bin")] - (into {} (map read-string (line-seq rdr))))) - - (def rhyme-database (atom {})) - - (def db - (nippy/thaw-from-file (io/resource "dark-corpus-4-gram-backwards-db.bin"))) - - (def perfect-rhyme-trie - (transduce - (comp - (map first) - (filter string?) - (map #(vector % (reverse (word->phones %)))) - (map reverse)) - (completing - (fn [trie [k v]] - (update trie k (fnil #(update % 1 inc) [v 0])))) - (trie/make-trie) - @loaded-backwards-database)) - - (def vowel-rhyme-trie - (transduce - (comp - (map first) - (filter string?) - (map #(vector % (reverse (word->phones %)))) - (map reverse) - (map (fn [[phones v]] - [(map #(if (owoga.phonetics/vowel - (string/replace % #"\d" "")) - % - "?") - phones) - v]))) - (completing - (fn [trie [k v]] - (update trie k (fnil #(update % 1 inc) [v 0])))) - (trie/make-trie) - (take 1000 db))) - (take 20 vowel-rhyme-trie) - ) - #_(with-open [wtr (clojure.java.io/writer "database.bin")] (let [lines (->> (seq @trie-database) (map pr-str) @@ -1310,9 +1334,9 @@ witness sky is blackened now (id-get-in-tpt tightly-packed-trie trie-database - '(2 2 3))) + '(2 2 3)) ;; => {("" "" "the") {:value ("" "" "the"), :count 462}} - + ) (comment (->> (perfect-rhymes perfect-rhyme-trie @@ -1417,3 +1441,9 @@ witness sky is blackened now #(= (last %) \2) phones))] (trie/lookup trie rhyme-suffix))) + +(comment + (keys @context) + ;; => (:flex-rhyme-trie :database :trie :perfect-rhyme-trie :rhyme-trie) + + ) diff --git a/src/com/owoga/corpus/util.clj b/src/com/owoga/corpus/util.clj new file mode 100644 index 0000000..66beef2 --- /dev/null +++ b/src/com/owoga/corpus/util.clj @@ -0,0 +1,81 @@ +(ns com.owoga.corpus.util + (:require [taoensso.tufte :as tufte :refer (defnp p profiled profile)] + [clojure.string :as string])) + +(set! *warn-on-reflection* true) +(tufte/add-basic-println-handler! {}) + +(defn clean-text + "Removes all non-alphabetical characters and lowercases everything. + Very spartan way of cleaning." + [text] + (string/lower-case (string/replace text #"[^a-zA-Z'\-\s]+" ""))) + +(defn xf-file-seq [start end] + (comp (remove #(.isDirectory %)) + (drop start) + (take end))) + +(def re-word + "Regex for tokenizing a string into words + (including contractions and hyphenations), + commas, periods, and newlines." + #"(?s).*?([a-zA-Z\d]+(?:['\-]?[a-zA-Z]+)?|,|\.|\n)") + +(defn pad-tokens + "Pads the beginning with n - 1 tokens and + the end with 1 token." + [beginning-pad + number-of-beginning-pad + ending-pad + number-of-ending-pad + tokens] + (vec + (concat + (vec (repeat number-of-beginning-pad beginning-pad)) + tokens + (vec (repeat number-of-ending-pad ending-pad))))) + +(defn padder + [beg beg-n end end-n] + (partial pad-tokens beg beg-n end end-n)) + +(comment + (let [p (padder "" 1 "" 3)] + (p [1 2 3]));; => ["" 1 2 3 "" "" ""] + ) + +(defn tokenize-line + [line] + (->> line + (string/trim) + (re-seq re-word) + (mapv second) + (mapv string/lower-case))) + +(comment + (tokenize-line "The lazy fox jumps over the moon.") + ;; => ["the" "lazy" "fox" "jumps" "over" "the" "moon" "."] + ) + +(defn text->ngrams + "Takes text from a file, including newlines." + [text n] + (->> text + clean-text + (#(string/split % #"\n+")) + (remove empty?) + (mapv tokenize-line) + (mapv #(partition n 1 %)) + (mapv #(mapv vec %)) + (reduce #(into %1 %2) []))) + +(comment + (text->ngrams "The lazy fox jumps.\nOver the 5th full moon." 3) + ;; => [["the" "lazy" "fox"] + ;; ["lazy" "fox" "jumps"] + ;; ["over" "the" "th"] + ;; ["the" "th" "full"] + ;; ["th" "full" "moon"]] + + ) diff --git a/src/com/owoga/prhyme/data_transform.clj b/src/com/owoga/prhyme/data_transform.clj index cd08aa9..628f139 100644 --- a/src/com/owoga/prhyme/data_transform.clj +++ b/src/com/owoga/prhyme/data_transform.clj @@ -237,6 +237,8 @@ ;; => [[(TOP S NP) (NP PP)] [(S NP) (NP PP)] [(NP) (NP PP)]] ) + + (comment (->> (first texts) (split-text-into-sentences) @@ -406,11 +408,11 @@ (recur (zip/next zipper))))) (defn grammar-children - [k] + [database trie k] (sort-by (comp - last) - (map #(vector (.key %) (@test-database (.key %)) (get % [])) - (remove (comp nil? #(get % [])) (trie/children (trie/lookup test-trie k)))))) + (map #(vector (.key %) (database (.key %)) (get % [])) + (remove (comp nil? #(get % [])) (trie/children (trie/lookup trie k)))))) (defn grammar-branch? [trie database k] diff --git a/src/com/owoga/prhyme/limerick.clj b/src/com/owoga/prhyme/limerick.clj index 78fbc6e..ac212b1 100644 --- a/src/com/owoga/prhyme/limerick.clj +++ b/src/com/owoga/prhyme/limerick.clj @@ -1,9 +1,18 @@ (ns com.owoga.prhyme.limerick (:require [com.owoga.prhyme.gen :as gen] [com.owoga.prhyme.generation.weighted-selection :as weighted-selection] + [com.owoga.prhyme.util.math :as math] + [com.owoga.prhyme.nlp.core :as nlp] [clojure.string :as string] + [com.owoga.phonetics :as phonetics] + [com.owoga.phonetics.syllabify :as syllabify] [com.owoga.prhyme.core :as prhyme] - [com.owoga.prhyme.util :as util])) + [com.owoga.prhyme.util :as util] + [com.owoga.prhyme.data.dictionary :as dict] + [com.owoga.trie :as trie] + [com.owoga.tightly-packed-trie :as tpt] + [com.owoga.tightly-packed-trie.encoding :as encoding] + [clojure.java.io :as io])) (defn rhyme-from-scheme "scheme of format [[A 9] [A 9] [B 5] [B 5] [A 9]]" @@ -42,6 +51,7 @@ (require '[com.owoga.prhyme.data.dictionary :as dict] '[com.owoga.prhyme.data.darklyrics :refer [darklyrics-markov-2]] '[clojure.java.io :as io]) + (rhyme-from-scheme dict/prhyme-dict darklyrics-markov-2 '((A 8) (A 8) (B 5) (B 5) (A 8))) ) @@ -74,3 +84,204 @@ "war we await the afterlife"]) +;;;; Generating limericks with a markov model + +(defn phrase->flex-rhyme-phones + "Takes a space-seperated string of words + and returns the concatenation of the words + vowel phones. + + Returns them in reversed order so they + are ready to be used in a lookup of a rhyme trie. + " + [phrase] + (->> phrase + (#(string/split % #" ")) + (map (comp syllabify/syllabify first phonetics/get-phones)) + (map (partial reduce into [])) + (map #(filter (partial re-find #"\d") %)) + (flatten) + (map #(string/replace % #"\d" "")) + (reverse))) + +(defn word->phones [word] + (or (dict/word->cmu-phones word) + (util/get-phones-with-stress word))) + +(defonce context (atom {})) + +(defn decode-fn [db] + (fn [byte-buffer] + (let [value (encoding/decode byte-buffer)] + (if (zero? value) + nil + [value (encoding/decode byte-buffer)])))) + +(defn initialize [] + (swap! + context + assoc + :database + (with-open [rdr (clojure.java.io/reader "resources/backwards-database.bin")] + (into {} (map read-string (line-seq rdr))))) + + (swap! + context + assoc + :trie + (tpt/load-tightly-packed-trie-from-file + "resources/dark-corpus-backwards-tpt.bin" + (decode-fn (@context :database)))) + + (swap! + context + assoc + :perfect-rhyme-trie + (transduce + (comp + (map first) + (filter string?) + (map #(vector % (reverse (word->phones %)))) + (map reverse)) + (completing + (fn [trie [k v]] + (update trie k (fnil #(update % 1 inc) [v 0])))) + (trie/make-trie) + (@context :database))) + + (swap! + context + assoc + :rhyme-trie + (transduce + (comp + (map first) + (filter string?) + (map #(vector % (reverse (word->phones %)))) + (map reverse)) + (completing + (fn [trie [k v]] + (update trie k (fnil #(update % 1 inc) [v 0])))) + (trie/make-trie) + (@context :database))) + + (swap! + context + assoc + :flex-rhyme-trie + (transduce + (comp + (map (fn [[k v]] + [(string/join " " (map (@context :database) k)) + [k v]])) + (map (fn [[phrase [k v]]] + [(phrase->flex-rhyme-phones phrase) + [k v]]))) + (completing + (fn [trie [k v]] + (update trie k (fnil conj [v]) v))) + (trie/make-trie) + (->> (trie/children-at-depth (@context :trie) 0 1)))) + nil) + + +(comment + (time (initialize)) + + (println 2) + + (take 5 (:flex-rhyme-trie @context)) + + ) + +(defn choose-next-word + "Given an n-gram of [[word1 freq1] [word2 freq2]] chooses + the next word based on markov data in trie. + + Could be improved by taking into account grammar and/or bidirectional context. + + The n-gram parameter is a list of trie entries + For trie entries that are word/frequency pairs, it might look something like this. + `[[sunshine 38] [ 509]]` + + But note that nothing in this function uses the frequency count from the passed in n-gram. + It's just easier for the calling functions to pass them in like that." + [{:keys [database trie] :as context} n-gram] + (let [n-gram-ids (->> n-gram (map first) (map database)) + node (trie/lookup trie n-gram-ids)] + (cond + (= 0 (count n-gram-ids)) + (let [children (->> (trie/children trie) + (map #(get % []))) + choice (math/weighted-selection second children)] + [(database (first choice)) (second choice)]) + node + (let [children (->> (trie/children node) + (map #(get % [])) + (remove (fn [[id f]] (= id (first n-gram-ids)))))] + (if (seq children) + (let [children-freqs (into (sorted-map) (frequencies (map second children))) + n-minus-1-gram-odds (/ (second (first children-freqs)) + (+ (second (get node [])) + (second (first children-freqs)))) + ;; Good-turing smoothing, take unseen ngram? + take-n-minus-1-gram? (and (< 1 (count n-gram-ids)) + (< (rand) n-minus-1-gram-odds))] + (if take-n-minus-1-gram? + (choose-next-word context (butlast n-gram)) + (let [choice (math/weighted-selection second children)] + [(database (first choice)) (second choice)]))) + (choose-next-word context (butlast n-gram)))) + :else + (choose-next-word context (butlast n-gram))))) + +(defn valid-sentence? [phrase] + (->> phrase + (map first) + (string/join " ") + (#(string/replace % #"(|)" "")) + (nlp/valid-sentence?))) + +(defn generate-sentence-backwards + "Given a phrase of [w1 w2 w3] generates a sentence + using a backwards markov." + ([{:keys [database trie] :as context} phrase] + (let [phrase (map (fn [w] + (let [id (database w)] + [w (second (get trie [id]))])) + phrase)] + (loop [phrase' (loop [phrase phrase] + (if (= "" (first (first phrase))) + phrase + (recur (cons (choose-next-word context (take 3 phrase)) + phrase))))] + (if (valid-sentence? phrase') + phrase' + (recur (loop [phrase phrase] + (if (= "" (first (first phrase))) + phrase + (recur (cons (choose-next-word context (take 3 phrase)) + phrase))))))))) + ) + +(comment + (take 5 (:database @context)) + + (map (:database @context) ["me" "bother"]) + (map (:database @context) ["bother me"]) + (first + (filter + valid-sentence? + (repeatedly + (fn [] + (generate-sentence-backwards + @context + ["bother" "me" ""]))))) + + (keys @context) + (time (initialize)) + ) + +(defn rhyme-from-scheme-2 + "Generate rhyme without the use of `weighted-selection/adjust-for-markov`." + []) diff --git a/src/com/owoga/prhyme/nlg/prhyme_nlg.clj b/src/com/owoga/prhyme/nlg/prhyme_nlg.clj index 80acd2e..a8c04a7 100644 --- a/src/com/owoga/prhyme/nlg/prhyme_nlg.clj +++ b/src/com/owoga/prhyme/nlg/prhyme_nlg.clj @@ -6,7 +6,6 @@ [com.owoga.phonetics.syllabify :as owoga.syllabify] [com.owoga.phonetics :as owoga.phonetics] [com.owoga.tightly-packed-trie.encoding :as encoding] - [examples.core :as examples] [taoensso.nippy :as nippy] [com.owoga.prhyme.nlp.core :as nlp] [examples.tpt :as examples.tpt] @@ -18,6 +17,78 @@ [com.owoga.tightly-packed-trie :as tpt] [com.owoga.prhyme.nlp.tag-sets.treebank-ii :as tb2])) +;;;; Utilities +;; +;; + +(defn phrase->flex-rhyme-phones + "Takes a space-seperated string of words + and returns the concatenation of the words + vowel phones. + + Returns them in reversed order so they + are ready to be used in a lookup of a rhyme trie. + " + [phrase] + (->> phrase + (#(string/split % #" ")) + (map (comp owoga.syllabify/syllabify first owoga.phonetics/get-phones)) + (map (partial reduce into [])) + (map #(filter (partial re-find #"\d") %)) + (flatten) + (map #(string/replace % #"\d" "")) + (reverse))) + +(defn take-words-amounting-to-at-least-n-syllables + "This function is nice to grab the tail end of a sentence for making a good rhyme. + If the sentence ends with a single-syllable word, like 'me', but a more + interesting n-gram like 'bother me', then you might want to explore the rhymes + available for the last N syllables. Sure, a word like 'poverty' would show up if you + got all rhymes for 'me'. But you'd have to filter through a lot more less-than-great + rhymes before you see it." + [phrase n] + (letfn [(phones [word] + [word (first (owoga.phonetics/get-phones word))]) + (syllables [[word phones]] + [word (owoga.syllabify/syllabify phones)])] + (->> phrase + (#(string/split % #" ")) + (map phones) + (map syllables) + (reduce + (fn [result [word syllables]] + (if (<= n (count (mapcat second result))) + (reduced result) + (conj result [word syllables]))) + []) + (map first) + (string/join " ")))) + +(comment + (take-words-amounting-to-at-least-n-syllables + "police can bother me" 3);; => "police can" + (take-words-amounting-to-at-least-n-syllables + "police can bother me" 4);; => "police can bother" + ) + +(defn take-n-syllables + "Returns the vowel sounds that make up the last n syllables. + Doesn't return stress." + [phrase n] + (if (string? phrase) + (->> phrase + (phrase->flex-rhyme-phones) + (take n) + (reverse)) + (take-last n phrase))) + +(comment + (take-n-syllables "bother me" 2);; => ("ER" "IY") + ) + + +;;;; Much of the code below is related to grammar generation. + (defn update-values [m f & args] (reduce (fn [acc [k v]] @@ -298,27 +369,6 @@ (recur (next parse-zipper))))) (comment - (let [structure '(TOP (S (NP (DT) (JJ) (NN)) - (VP (RB) (VBZ)) - (NP (DT) (JJ) (NN)))) - structure (-> structure - zip/seq-zip - nlp/iter-zip - last) - pos-freqs (examples/pos-paths->pos-freqs - examples/t1)] - (repeatedly - 10 - (fn [] - (->> (generate-with-markov-with-custom-progression-n-2-pos-freqs - zip/prev - zip/next - nil? - zip/end? - examples/pos-freqs-data-2 - structure - examples/darkov-2))))) - (timbre/set-level! :info) (timbre/set-level! :error) @@ -334,67 +384,6 @@ pos-path->word-freqs pos->word-freqs target-parse-tree))) - (time (def example-pos-freqs examples/example-pos-freqs)) - (nippy/thaw) - (nippy/freeze-to-file "resources/1000-pos-path-freqs.nip" example-pos-freqs) - - (time (def example-structures examples/example-structures)) - (weighted-rand/weighted-selection-from-map - example-structures) - - - - (take 5 examples/t2) - (let [structure (weighted-rand/weighted-selection-from-map - examples/popular-structure-freq-data) - structure (-> structure - zip/seq-zip - nlp/iter-zip - last) - pos-freqs examples/pos-freqs-data-2] - (repeatedly - 10 - (fn [] - (->> (generate-with-markov-with-custom-progression-n-2-pos-freqs - zip/prev - zip/next - nil? - zip/end? - pos-freqs - structure - examples/darkov-2) - nlp/leaf-nodes - (string/join " "))))) - - (repeatedly - 10 - (fn [] - (let [structure (weighted-rand/weighted-selection-from-map - (->> examples/t2 - (sort-by second) - (reverse) - (take 20))) - structure (-> structure - zip/seq-zip - nlp/iter-zip - last) - pos-freqs (examples/pos-paths->pos-freqs - examples/t1)] - (repeatedly - 10 - (fn [] - (->> (generate-with-markov-with-custom-progression - zip/prev - zip/next - nil? - zip/end? - examples/t1 - pos-freqs - structure - examples/darkov-2) - nlp/leaf-nodes - (string/join " "))))))) - ) @@ -442,12 +431,15 @@ (update trie k (fnil inc 0))) trie entries))) - #_(trie/make-trie) - test-trie + (trie/make-trie) (->> texts (drop 4000) (take 4000))))) + (def test-trie (into (trie/make-trie) (nippy/thaw-from-file "resources/grammar-trie-take-8000.bin"))) + + (def test-database (atom (nippy/thaw-from-file "resources/grammar-database-take-8000.bin"))) + ) (defn children @@ -531,25 +523,63 @@ (nippy/freeze-to-file "resources/grammar-database-take-8000.bin" @test-database) + (->> (take 20 test-trie) + (map (comp (partial map @test-database) first))) + + (->> (take 20 (reverse (sort-by second test-trie))) + (map (fn [[a b]] + [(map @test-database a) b]))) + + + ;; A sampling of the words that have been seen in the [TOP S NP NN] position. + (->> (trie/lookup test-trie (map @test-database '[TOP S NP NN])) + (map (comp @test-database first first)) + (drop 100) + (take 5)) + ;; => ("sink" "lose" "deep" "well" "help") + ) -(defn phrase->flex-rhyme-phones - "Takes a space-seperated string of words - and returns the concatenation of the words - vowel phones. +(defn zipper-last + [zipper] + (->> zipper + (iterate zip/next) + (take-while (complement zip/end?)) + last)) - Returns them in reversed order so they - are ready to be used in a lookup of a rhyme trie. - " - [phrase] - (->> phrase - (#(string/split % #" ")) - (map (comp owoga.syllabify/syllabify first owoga.phonetics/get-phones)) - (map (partial reduce into [])) - (map #(filter (partial re-find #"\d") %)) - (flatten) - (map #(string/replace % #"\d" "")) - (reverse))) +(defn previous-leaf-loc + [zipper] + (->> zipper + (iterate zip/prev) + (take-while (complement nil?)) + (filter #(and (symbol? (zip/node %)) + (zip/up %) + (= 1 (count (zip/node (zip/up %)))))) + (first))) + +(defn previous-leaf-part-of-speech + [zipper] + (->> zipper + previous-leaf-loc + (zip/path) + (map first) + (filter symbol?))) + +(defn nearest-ancestor-phrase + [loc] + (->> loc + (iterate zip/prev) + (take-while (complement nil?)) + (filter (comp tb2/phrases zip/node)) + (first))) + +(comment + (nearest-ancestor-phrase + (->> (zip/vector-zip + '[NP [NN]]) + zip/down + zip/right + zip/down))) (defn markov-generate-grammar-with-rhyming-tail [grammar-trie grammar-database rhyme-trie rhyme-database rhyme-target zipper] @@ -645,6 +675,11 @@ @test-database (zip/vector-zip [1])) + (def test-sentence (markov-generate-sentence + test-trie + @test-database + (zip/vector-zip [1]))) + (repeatedly 20 #(->> (generate test-trie @test-database (zip/vector-zip [1])) @@ -666,13 +701,6 @@ (zip/root (apply-fn loc))) (recur (next-fn (apply-fn loc)))))) -(defn zipper-last - [zipper] - (->> zipper - (iterate zip/next) - (take-while (complement zip/end?)) - last)) - (defn decode-fn "Decodes a variable-length encoded number from a byte-buffer. Zero gets decoded to nil." @@ -688,6 +716,13 @@ (filter (complement zip/branch?)) (map zip/node))) + +(def tpt (tpt/load-tightly-packed-trie-from-file + (io/resource "dark-corpus-4-gram-backwards-tpt.bin") + decode-fn)) + +(def tpt-db (nippy/thaw-from-file (io/resource "dark-corpus-4-gram-backwards-db.bin"))) + (defn choose-with-n-gram-markov "Hard-coded to work with 4-gram. That's the at the end." [zipper @@ -735,39 +770,7 @@ choice] (first choice))) -(defn previous-leaf-loc - [zipper] - (->> zipper - (iterate zip/prev) - (take-while (complement nil?)) - (filter #(and (symbol? (zip/node %)) - (zip/up %) - (= 1 (count (zip/node (zip/up %)))))) - (first))) -(defn previous-leaf-part-of-speech - [zipper] - (->> zipper - previous-leaf-loc - (zip/path) - (map first) - (filter symbol?))) - -(defn nearest-ancestor-phrase - [loc] - (->> loc - (iterate zip/prev) - (take-while (complement nil?)) - (filter (comp tb2/phrases zip/node)) - (first))) - -(comment - (nearest-ancestor-phrase - (->> (zip/vector-zip - '[NP [NN]]) - zip/down - zip/right - zip/down))) (comment ;; Working backwards from a completed grammar tree that has @@ -796,8 +799,45 @@ (trie/lookup tpt) (trie/children) (map #(vector (tpt-db (.key %)) (get % []))))] - (choose-with-n-gram-markov - loc test-trie @test-database tpt tpt-db)) + [(zip/node loc) + prev-pos + prev-pos' + n-gram + n-gram' + (choose-with-n-gram-markov + loc test-trie @test-database tpt tpt-db)]) + + (let [zipper (zip/vector-zip + '[[TOP + [[VP + [[[VBN]] + [PP [[[TO]] [NP [[[NN]]]]]] + [PP [[[IN ["into"]]] [NP [[[PRP$ ["my"]]] [[NNS ["answers"]]]]]]]]]]]]) + loc (->> zipper + (iterate zip/next) + (filter #(= "into" (zip/node %))) + (first)) + prev-pos (previous-leaf-part-of-speech loc) + prev-pos' (map @test-database prev-pos) + n-gram (filter string? (rest-leafs loc)) + n-gram' (mapv tpt-db n-gram) + grammar-children (->> (children test-trie @test-database prev-pos') + (map first) + (map @test-database)) + n-gram-children (->> n-gram' + (take 2) + (reverse) + (trie/lookup tpt) + (trie/children) + (map #(vector (tpt-db (.key %)) (get % []))))] + [(zip/node loc) + prev-pos + prev-pos' + n-gram + n-gram' + (choose-with-n-gram-markov + loc test-trie @test-database tpt tpt-db)]) + (let [zipper (zip/vector-zip '[[TOP @@ -840,6 +880,7 @@ choice])) (trie/lookup test-trie [1 59 3 5 5 17]) + (@test-database 1911) (def tpt (tpt/load-tightly-packed-trie-from-file @@ -847,6 +888,7 @@ decode-fn)) (def tpt-db (nippy/thaw-from-file (io/resource "dark-corpus-4-gram-backwards-db.bin"))) + (markov-generate-grammar test-trie @test-database (zip/vector-zip [1])) (-> (markov-generate-grammar test-trie @test-database (zip/vector-zip [1])) @@ -1044,6 +1086,7 @@ (comment (generate-grammar-from [[NN ["taylor"]]]) + (map (comp (partial map @test-database) first) (take 5 test-trie)) (@test-database 1) diff --git a/src/com/owoga/prhyme/nlp/core.clj b/src/com/owoga/prhyme/nlp/core.clj index f3b7589..99deb31 100644 --- a/src/com/owoga/prhyme/nlp/core.clj +++ b/src/com/owoga/prhyme/nlp/core.clj @@ -15,7 +15,11 @@ ParserFactory) (opennlp.tools.cmdline.parser ParserTool))) -(comment tb2/phrases) +(comment + tb2/phrases + (.exists (io/file (io/resource "models/en-token.bin"))) + + ) (def tokenize (nlp/make-tokenizer (io/resource "models/en-token.bin"))) (def get-sentences (nlp/make-sentence-detector (io/resource "models/en-sent.bin"))) (def parse (tb/make-treebank-parser (io/resource "models/en-parser-chunking.bin"))) @@ -68,8 +72,8 @@ (ParserFactory/create (ParserModel. (io/input-stream (io/resource "models/en-parser-chunking.bin"))) - 5 - 0.95)) + 50 + 0.90)) (defn parse-probs [parses] (map #(.getProb %) parses)) @@ -86,10 +90,32 @@ (- (Math/log 0.001) (Math/log 0.01)) (Math/E) (tokenize "Eric's testing.") + + (Math/log 0.9999) + (Math/pow Math/E -0.5) (let [results (StringBuffer.) parses (ParserTool/parseLine "The dog ran fast ." custom-parser 1)] ((juxt parse-probs parse-strs) parses)) + (let [results (StringBuffer.) + parses (ParserTool/parseLine "Eric is testing." custom-parser 1)] + [((juxt parse-probs parse-strs) parses) + (count parses)]) + + (let [results (StringBuffer.) + parses (ParserTool/parseLine "Eric is testing." custom-parser 2)] + [((juxt parse-probs parse-strs) parses) + (count parses)]) + + (let [results (StringBuffer.) + parses (ParserTool/parseLine "This is a good day." custom-parser 1)] + [((juxt parse-probs parse-strs) parses) + (count parses)]) + + (let [results (StringBuffer.) + parses (ParserTool/parseLine "The do dog run drive a." custom-parser 1)] + ((juxt parse-probs parse-strs) parses)) + (let [results (StringBuffer.) parses (ParserTool/parseLine "Eric 's testing ." custom-parser 1)] (meta parses)) @@ -146,6 +172,12 @@ tb2/clauses boolean)) +(comment + (->> "the lazy fox" + vector + parse) + + ) (defn unmake-tree "Tokenizing and then parsing a sentence returns a string representation of the parse tree. This is a helper function diff --git a/src/com/owoga/prhyme/util.clj b/src/com/owoga/prhyme/util.clj index 61b73f4..7978bf0 100644 --- a/src/com/owoga/prhyme/util.clj +++ b/src/com/owoga/prhyme/util.clj @@ -27,7 +27,10 @@ (map str (.getPhones cmu-lexicon "two" nil))) (defn get-phones - "String must be lowercase." + "Gets phones for known or unknonwn words from the CMULexicon, + removes the stress, and converts them to a format that matches the CMU Sphinx + dictionary (capitalizes and replaces 'ax' with 'ah') String must be + lowercase." [word] (->> (map str (.getPhones cmu-lexicon word nil)) (map remove-stress) @@ -35,9 +38,9 @@ (map string/upper-case))) (defn get-phones-with-stress - "String must be lowercase. - .getPhones only. - Might be different from stress in cmu-dict" + "Same as `get-phones` but leaves stress. + Note that this might not be the same stress that you'd see + in the CMU pronouncing dictionary." [word] (->> (map str (.getPhones cmu-lexicon word nil)) (map convert-to-sphinx)