Remove stale code, add comments, cleanup

5 years ago · 62a1ff3a3b
parent 9b8fec9fc7
commit 62a1ff3a3b
9 changed files with 689 additions and 269 deletions
--- a/README.rst
+++ b/README.rst
@ -61,6 +61,24 @@ words, then you're stuck trying to rhyme with the single syllable
 Implementation
 --------------

+2021-06-09
++++++++++
+
+Most generation tasks are going to require some big data structures, like a Trie of n-grams.
+
+A ``context`` is an atom that gets updated with those data structures.
+
+Loading some of these data structures can take a long time, so only load what you need.
+
+An example of the different data structures you might load:
+
+Alliterations - From the database of n-grams, convert each n-gram to syllables then create a trie of the alliterations.
+Perfect rhymes - Again, from the database of n-grams, convert n-gram to syllables and create trie of reverse of syllables.
+Imperfect rhymes - Perform some manipulation of the syllables so that you can be more flexible with your rhymes.
+
+One key that is probably always required is the ``database``. This maps words to their IDs and IDs to their words. The integer
+IDs are necessary for tightly packed tries.
+
 2020-10-20
 ++++++++++

--- a/deps.edn
+++ b/deps.edn
@ -21,7 +21,7 @@
        com.taoensso/nippy {:mvn/version "3.0.0"}
        com.taoensso/timbre {:mvn/version "4.10.0"}
        com.owoga/tightly-packed-trie
-        {:local/root "/home/eihli/code/tightly-packed-trie"}
+        {:local/root "/home/eihli/src/clj-tightly-packed-trie"}
        com.owoga/phonetics {:mvn/version "0.1.1"}}
 :aliases {:dev {:extra-paths ["test" "examples" "dev"]
                 :extra-deps {}}}}
--- a/dev/examples/tpt.clj
+++ b/dev/examples/tpt.clj
@ -18,6 +18,7 @@
            [com.owoga.prhyme.syllabify :as syllabify]
            [taoensso.nippy :as nippy]))

+(set! *warn-on-reflection* true)
 (tufte/add-basic-println-handler! {})

 (defn xf-file-seq [start end]
@ -505,7 +506,7 @@
    (trie/make-trie)
    (@context :database)))

-  #_(swap!
+  (swap!
   context
   assoc
   :flex-rhyme-trie
@ -546,6 +547,7 @@
    nil)

  (take 5 (@context :flex-rhyme-trie))
+
  )

 (comment
@ -553,17 +555,31 @@
       (take 20)
       (map first)
       (map (partial map (@context :database))))
+
  (trie/children (trie/lookup (@context :trie) [13393]))
  ((@context :database) "desk")   ;; => 13393
  ((@context :database) "wobbly") ;; => 152750
  (get (@context :trie) [13393 152750]))

 (defn rhyme-choices
-  [{:keys [flex-rhyme-trie database] :as context} phrase]
+  [{:keys [rhyme-trie database] :as context} phrase]
  (if (string? phrase)
-    (let [phones (phrase->flex-rhyme-phones phrase)]
-      (get flex-rhyme-trie phones))
-    (get flex-rhyme-trie phrase)))
+    (let [phones (phrase->phones phrase)]
+      (get rhyme-trie phones))
+    (get rhyme-trie phrase)))
+
+(comment
+  (get (:rhyme-trie @context) (phrase->phones "fall"))
+
+  (->> (rhyme-choices
+        @context
+        "fall")
+       #_(map (comp (:database @context) first first)))
+
+  (take 20 (:flex-rhyme-trie @context))
+  (take 20 (:rhyme-trie @context))
+
+  )

 (defn exclude-non-rhymes-from-choices
  "Removes any choice that includes the last
@ -633,8 +649,11 @@

 (comment
  (get-flex-rhyme @context "bother me")
+
  (phrase->flex-rhyme-phones "bother me")
+
  (get-flex-rhyme @context ["IY" "ER" "AA"])
+
  )

 (defn get-next-markov
@ -644,7 +663,9 @@
        children (and node
                      (->> node
                           trie/children
-                           (map #(vector (.key %) (get % [])))
+                           (map (fn [^com.owoga.trie.ITrie child]
+                                  [(.key child)
+                                   (get child [])]))
                           (remove (comp nil? second))
                           (remove
                            (fn [[k v]]
@ -670,7 +691,14 @@
    (database (get-next-markov context word-ids))))

 (comment
+  (initialize)
+
+  (get (@context :database) "</s>")
+  (keys @context)
+  ((@context :database) (get-next-markov @context [1]))
+
  (get-next-markov @context [222])
+
  (get-next-markov-from-phrase-backwards @context "will strike you down" 3)

  (get (@context :database) 7982)
@ -693,6 +721,12 @@
      (map database seed))))

 (defn take-words-amounting-to-at-least-n-syllables
+  "This function is nice to grab the tail end of a sentence for making a good rhyme.
+  If the sentence ends with a single-syllable word, like 'me', but a more
+  interesting n-gram like 'bother me', then you might want to explore the rhymes
+  available for the last N syllables. Sure, a word like 'poverty' would show up if you
+  got all rhymes for 'me'. But you'd have to filter through a lot more less-than-great
+  rhymes before you see it."
  [phrase n]
  (letfn [(phones [word]
            [word (first (owoga.phonetics/get-phones word))])
@ -711,7 +745,16 @@
         (map first)
         (string/join " "))))

+(comment
+  (take-words-amounting-to-at-least-n-syllables
+   "police can bother me" 3);; => "police can"
+  (take-words-amounting-to-at-least-n-syllables
+   "police can bother me" 4);; => "police can bother"
+  )
+
 (defn take-n-syllables
+  "Returns the vowel sounds that make up the last n syllables.
+  Doesn't return stress."
  [phrase n]
  (if (string? phrase)
    (->> phrase
@ -720,7 +763,9 @@
         (reverse))
    (take-last n phrase)))

-(take-n-syllables "bother me" 2)
+(comment
+  (take-n-syllables "bother me" 2);; => ("ER" "IY")
+  )

 (defn valid-english-sentence?
  [phrase]
@ -747,7 +792,8 @@
 (defn rhyming-n-gram-choices
  [context target-rhyme]
  (loop [target-rhyme target-rhyme]
-    (let [choices (->> target-rhyme
+    (let [context @context
+          choices (->> target-rhyme
                       (rhyme-choices context)
                       (exclude-non-rhymes-from-choices context target-rhyme)
                       (exclude-non-english-phrases-from-choices context))]
@ -758,11 +804,12 @@
        choices))))

 (comment
-  (->> (rhyming-n-gram-choices @context "fall")
+  (->> (rhyming-n-gram-choices context "fall")
       (map (comp (@context :database) first first)))

-  )
+  (rhyme-choices @context "tall")

+  )

 (defn generate-n-syllable-sentence-rhyming-with
  [context target-phrase n-gram-rank target-rhyme-syllable-count target-sentence-syllable-count]
@ -801,6 +848,16 @@
                " "
                phrase)))))))

+(comment
+  (generate-n-syllable-sentence-rhyming-with
+   context
+   "war on poverty"
+   3
+   3
+   8)
+
+  )
+
 (defn generate-haiku
  [seed]
  (let [haiku (cons
@ -829,7 +886,9 @@
          (map last)
          (apply distinct?))))

-  (->> (generate-haiku "football")
+  (println (first (generate-haiku "fall")))
+
+  (->> (generate-haiku "</s>")
       (filter valid-haiku)
       (map (partial string/join "\n"))
       (map #(vector % (sha256 %)))
@ -837,7 +896,7 @@
              (println haiku)
              (println sha)
              (println)))
-       (take 10))
+       (take 1))

  )

@ -850,7 +909,9 @@
       @context
       (take 3 (phrase->flex-rhyme-phones "player unknown battlegrounds"))
       3 4 %)
-     [9 6 6 9 6 6 9 6 6]))))
+     [9 6 6 9 6 6 9 6 6])))
+
+  )

 "
 another day a battleground
@ -878,7 +939,7 @@ witness sky is blackened now

 (defn continuously-amulate
  [seed]
-  (let [next-sentence (amul8 seed)
+  (let [next-sentence (generate-haiku seed)
        next-seed (->> next-sentence
                       (#(string/split % #" "))
                       (reverse)
@ -898,6 +959,8 @@ witness sky is blackened now
     (cons next-sentence (continuously-amulate next-seed)))))

 (comment
+  (generate-haiku "technology")
+
  (take 5 (continuously-amulate "technology"))

  (->> (amul8 "technology" 1)
@ -1034,7 +1097,16 @@ witness sky is blackened now

 (defn choose-next-word
  "Given an n-gram of [[word1 freq1] [word2 freq2]] chooses
-  the next word based on markove data in trie."
+  the next word based on markov data in trie.
+
+  Could be improved by taking into account grammar and/or bidirectional context.
+
+  The n-gram parameter is a list of trie entries
+  For trie entries that are word/frequency pairs, it might look something like this.
+  `[[sunshine 38] [</s> 509]]`
+
+  But note that nothing in this function uses the frequency count from the passed in n-gram.
+  It's just easier for the calling functions to pass them in like that."
  [{:keys [database trie] :as context} n-gram]
  (let [n-gram-ids (->> n-gram (map first) (map database))
        node (trie/lookup trie n-gram-ids)]
@ -1053,6 +1125,7 @@ witness sky is blackened now
                n-minus-1-gram-odds (/ (second (first children-freqs))
                                       (+ (second (get node []))
                                          (second (first children-freqs))))
+                ;; Good-turing smoothing, take unseen ngram?
                take-n-minus-1-gram? (and (< 1 (count n-gram-ids))
                                          (< (rand) n-minus-1-gram-odds))]
            (if take-n-minus-1-gram?
@ -1137,22 +1210,19 @@ witness sky is blackened now
    (get trie ids))
  (choose-next-word @context (take 3 [["</s>" 509]]))

-  (generate-sentence-backwards @context ["</s>"])
+  (generate-sentence-backwards @context ["kill" "</s>"])

  (valid-sentences? (generate-phrase @context '(["bitter" 41])))

-
-
  (choose-next-word @context (take 3 [["theology" 41]]))

  (choose-next-word @context [["and" 5] ["theology" 41]])

  (find-rhymes (@context :perfect-rhyme-trie) "theology")

-  (trie/chil(trie/lookup (@context :trie) '(57 2477)))
+  (trie/lookup (@context :trie) '(57 2477))
  (take 5 (@context :trie))

-
  (->> (find-rhymes (@context :perfect-rhyme-trie) "technology")
       (map (fn [[word frq]]
              (let [n+1grams (word->n+1grams
@ -1162,101 +1232,55 @@ witness sky is blackened now
                (map vector n+1grams (repeat [word frq])))))
       (reduce into []))

+  (def loaded-backwards-trie
+    (tpt/load-tightly-packed-trie-from-file
+     "resources/dark-corpus-backwards-tpt.bin"
+     (decode-fn @trie-database)))
+
+  (def loaded-backwards-database
+    (with-open [rdr (clojure.java.io/reader "resources/backwards-database.bin")]
+      (into {} (map read-string (line-seq rdr)))))
+
+  (def rhyme-database (atom {}))
+
+  (def db
+    (nippy/thaw-from-file (io/resource "dark-corpus-4-gram-backwards-db.bin")))
+
+  (def perfect-rhyme-trie
+    (transduce
+     (comp
+      (map first)
+      (filter string?)
+      (map #(vector % (reverse (word->phones %))))
+      (map reverse))
+     (completing
+      (fn [trie [k v]]
+        (update trie k (fnil #(update % 1 inc) [v 0]))))
+     (trie/make-trie)
+     @loaded-backwards-database))
+
+  (def vowel-rhyme-trie
+    (transduce
+     (comp
+      (map first)
+      (filter string?)
+      (map #(vector % (reverse (word->phones %))))
+      (map reverse)
+      (map (fn [[phones v]]
+             [(map #(if (owoga.phonetics/vowel
+                         (string/replace % #"\d" ""))
+                      %
+                      "?")
+                   phones)
+              v])))
+     (completing
+      (fn [trie [k v]]
+        (update trie k (fnil #(update % 1 inc) [v 0]))))
+     (trie/make-trie)
+     (take 1000 db)))
+  (take 20 vowel-rhyme-trie)


-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-  (do
-    #_(time
-       (def backwards-trie
-         (transduce (comp (xf-file-seq 0 250000)
-                          (map slurp)
-                          (map (partial n-to-m-backwards-grams 1 4))
-                          (map (fn [ngrams] (map #(prep-ngram-for-trie %) ngrams)))
-                          stateful-transducer)
-                    conj
-                    (file-seq (io/file "dark-corpus")))))
-
-    #_(time
-       (def tightly-packed-backwards-trie
-         (tpt/tightly-packed-trie
-          backwards-trie
-          encode-fn
-          (decode-fn @trie-database))))
-
-    #_(tpt/save-tightly-packed-trie-to-file
-       "resources/dark-corpus-backwards-tpt.bin"
-       tightly-packed-backwards-trie)
-    #_(with-open [wtr (clojure.java.io/writer "resources/backwards-database.bin")]
-        (let [lines (->> (seq @trie-database)
-                         (map pr-str)
-                         (map #(str % "\n")))]
-          (doseq [line lines]
-            (.write wtr line))))
-
-    (def loaded-backwards-trie
-      (tpt/load-tightly-packed-trie-from-file
-       "resources/dark-corpus-backwards-tpt.bin"
-       (decode-fn @trie-database)))
-
-    (def loaded-backwards-database
-      (with-open [rdr (clojure.java.io/reader "resources/backwards-database.bin")]
-        (into {} (map read-string (line-seq rdr)))))
-
-    (def rhyme-database (atom {}))
-
-    (def db
-      (nippy/thaw-from-file (io/resource "dark-corpus-4-gram-backwards-db.bin")))
-
-    (def perfect-rhyme-trie
-      (transduce
-       (comp
-        (map first)
-        (filter string?)
-        (map #(vector % (reverse (word->phones %))))
-        (map reverse))
-       (completing
-        (fn [trie [k v]]
-          (update trie k (fnil #(update % 1 inc) [v 0]))))
-       (trie/make-trie)
-       @loaded-backwards-database))
-
-    (def vowel-rhyme-trie
-      (transduce
-       (comp
-        (map first)
-        (filter string?)
-        (map #(vector % (reverse (word->phones %))))
-        (map reverse)
-        (map (fn [[phones v]]
-               [(map #(if (owoga.phonetics/vowel
-                           (string/replace % #"\d" ""))
-                        %
-                        "?")
-                     phones)
-                v])))
-       (completing
-        (fn [trie [k v]]
-          (update trie k (fnil #(update % 1 inc) [v 0]))))
-       (trie/make-trie)
-       (take 1000 db)))
-    (take 20 vowel-rhyme-trie)
-    )
-
  #_(with-open [wtr (clojure.java.io/writer "database.bin")]
      (let [lines (->> (seq @trie-database)
                       (map pr-str)
@ -1310,9 +1334,9 @@ witness sky is blackened now
  (id-get-in-tpt
   tightly-packed-trie
   trie-database
-   '(2 2 3)))
+   '(2 2 3))
  ;; => {("<s>" "<s>" "the") {:value ("<s>" "<s>" "the"), :count 462}}
-
+  )

 (comment
  (->> (perfect-rhymes perfect-rhyme-trie
@ -1417,3 +1441,9 @@ witness sky is blackened now
                       #(= (last %) \2)
                       phones))]
    (trie/lookup trie rhyme-suffix)))
+
+(comment
+  (keys @context)
+  ;; => (:flex-rhyme-trie :database :trie :perfect-rhyme-trie :rhyme-trie)
+
+  )
--- a/src/com/owoga/corpus/util.clj
+++ b/src/com/owoga/corpus/util.clj
@ -0,0 +1,81 @@
+(ns com.owoga.corpus.util
+  (:require [taoensso.tufte :as tufte :refer (defnp p profiled profile)]
+            [clojure.string :as string]))
+
+(set! *warn-on-reflection* true)
+(tufte/add-basic-println-handler! {})
+
+(defn clean-text
+  "Removes all non-alphabetical characters and lowercases everything.
+  Very spartan way of cleaning."
+  [text]
+  (string/lower-case (string/replace text #"[^a-zA-Z'\-\s]+" "")))
+
+(defn xf-file-seq [start end]
+  (comp (remove #(.isDirectory %))
+        (drop start)
+        (take end)))
+
+(def re-word
+  "Regex for tokenizing a string into words
+  (including contractions and hyphenations),
+  commas, periods, and newlines."
+  #"(?s).*?([a-zA-Z\d]+(?:['\-]?[a-zA-Z]+)?|,|\.|\n)")
+
+(defn pad-tokens
+  "Pads the beginning with n - 1 <s> tokens and
+  the end with 1 </s> token."
+  [beginning-pad
+   number-of-beginning-pad
+   ending-pad
+   number-of-ending-pad
+   tokens]
+  (vec
+   (concat
+    (vec (repeat number-of-beginning-pad beginning-pad))
+    tokens
+    (vec (repeat number-of-ending-pad ending-pad)))))
+
+(defn padder
+  [beg beg-n end end-n]
+  (partial pad-tokens beg beg-n end end-n))
+
+(comment
+  (let [p (padder "<s>" 1 "</s>" 3)]
+    (p [1 2 3]));; => ["<s>" 1 2 3 "</s>" "</s>" "</s>"]
+  )
+
+(defn tokenize-line
+  [line]
+  (->> line
+       (string/trim)
+       (re-seq re-word)
+       (mapv second)
+       (mapv string/lower-case)))
+
+(comment
+  (tokenize-line "The lazy fox jumps over the moon.")
+  ;; => ["the" "lazy" "fox" "jumps" "over" "the" "moon" "."]
+  )
+
+(defn text->ngrams
+  "Takes text from a file, including newlines."
+  [text n]
+  (->> text
+       clean-text
+       (#(string/split % #"\n+"))
+       (remove empty?)
+       (mapv tokenize-line)
+       (mapv #(partition n 1 %))
+       (mapv #(mapv vec %))
+       (reduce #(into %1 %2) [])))
+
+(comment
+  (text->ngrams "The lazy fox jumps.\nOver the 5th full moon." 3)
+  ;; => [["the" "lazy" "fox"]
+  ;;     ["lazy" "fox" "jumps"]
+  ;;     ["over" "the" "th"]
+  ;;     ["the" "th" "full"]
+  ;;     ["th" "full" "moon"]]
+
+  )
--- a/src/com/owoga/prhyme/data_transform.clj
+++ b/src/com/owoga/prhyme/data_transform.clj
@ -237,6 +237,8 @@
  ;; => [[(TOP S NP) (NP PP)] [(S NP) (NP PP)] [(NP) (NP PP)]]
  )

+
+
 (comment
  (->> (first texts)
       (split-text-into-sentences)
@ -406,11 +408,11 @@
      (recur (zip/next zipper)))))

 (defn grammar-children
-  [k]
+  [database trie k]
  (sort-by
   (comp - last)
-   (map #(vector (.key %) (@test-database (.key %)) (get % []))
-        (remove (comp nil? #(get % [])) (trie/children (trie/lookup test-trie k))))))
+   (map #(vector (.key %) (database (.key %)) (get % []))
+        (remove (comp nil? #(get % [])) (trie/children (trie/lookup trie k))))))

 (defn grammar-branch?
  [trie database k]
--- a/src/com/owoga/prhyme/limerick.clj
+++ b/src/com/owoga/prhyme/limerick.clj
@ -1,9 +1,18 @@
 (ns com.owoga.prhyme.limerick
  (:require [com.owoga.prhyme.gen :as gen]
            [com.owoga.prhyme.generation.weighted-selection :as weighted-selection]
+            [com.owoga.prhyme.util.math :as math]
+            [com.owoga.prhyme.nlp.core :as nlp]
            [clojure.string :as string]
+            [com.owoga.phonetics :as phonetics]
+            [com.owoga.phonetics.syllabify :as syllabify]
            [com.owoga.prhyme.core :as prhyme]
-            [com.owoga.prhyme.util :as util]))
+            [com.owoga.prhyme.util :as util]
+            [com.owoga.prhyme.data.dictionary :as dict]
+            [com.owoga.trie :as trie]
+            [com.owoga.tightly-packed-trie :as tpt]
+            [com.owoga.tightly-packed-trie.encoding :as encoding]
+            [clojure.java.io :as io]))

 (defn rhyme-from-scheme
  "scheme of format [[A 9] [A 9] [B 5] [B 5] [A 9]]"
@ -42,6 +51,7 @@
  (require '[com.owoga.prhyme.data.dictionary :as dict]
           '[com.owoga.prhyme.data.darklyrics :refer [darklyrics-markov-2]]
           '[clojure.java.io :as io])
+
  (rhyme-from-scheme dict/prhyme-dict darklyrics-markov-2 '((A 8) (A 8) (B 5) (B 5) (A 8)))

  )
@ -74,3 +84,204 @@
   "war we await the afterlife"])


+;;;; Generating limericks with a markov model
+
+(defn phrase->flex-rhyme-phones
+  "Takes a space-seperated string of words
+  and returns the concatenation of the words
+  vowel phones.
+
+  Returns them in reversed order so they
+  are ready to be used in a lookup of a rhyme trie.
+  "
+  [phrase]
+  (->> phrase
+       (#(string/split % #" "))
+       (map (comp syllabify/syllabify first phonetics/get-phones))
+       (map (partial reduce into []))
+       (map #(filter (partial re-find #"\d") %))
+       (flatten)
+       (map #(string/replace % #"\d" ""))
+       (reverse)))
+
+(defn word->phones [word]
+  (or (dict/word->cmu-phones word)
+      (util/get-phones-with-stress word)))
+
+(defonce context (atom {}))
+
+(defn decode-fn [db]
+  (fn [byte-buffer]
+    (let [value (encoding/decode byte-buffer)]
+      (if (zero? value)
+        nil
+        [value (encoding/decode byte-buffer)]))))
+
+(defn initialize []
+  (swap!
+   context
+   assoc
+   :database
+   (with-open [rdr (clojure.java.io/reader "resources/backwards-database.bin")]
+     (into {} (map read-string (line-seq rdr)))))
+
+  (swap!
+   context
+   assoc
+   :trie
+   (tpt/load-tightly-packed-trie-from-file
+    "resources/dark-corpus-backwards-tpt.bin"
+    (decode-fn (@context :database))))
+
+  (swap!
+   context
+   assoc
+   :perfect-rhyme-trie
+   (transduce
+    (comp
+     (map first)
+     (filter string?)
+     (map #(vector % (reverse (word->phones %))))
+     (map reverse))
+    (completing
+     (fn [trie [k v]]
+       (update trie k (fnil #(update % 1 inc) [v 0]))))
+    (trie/make-trie)
+    (@context :database)))
+
+  (swap!
+   context
+   assoc
+   :rhyme-trie
+   (transduce
+    (comp
+     (map first)
+     (filter string?)
+     (map #(vector % (reverse (word->phones %))))
+     (map reverse))
+    (completing
+     (fn [trie [k v]]
+       (update trie k (fnil #(update % 1 inc) [v 0]))))
+    (trie/make-trie)
+    (@context :database)))
+
+  (swap!
+   context
+   assoc
+   :flex-rhyme-trie
+   (transduce
+    (comp
+     (map (fn [[k v]]
+            [(string/join " " (map (@context :database) k))
+             [k v]]))
+     (map (fn [[phrase [k v]]]
+            [(phrase->flex-rhyme-phones phrase)
+             [k v]])))
+    (completing
+     (fn [trie [k v]]
+       (update trie k (fnil conj [v]) v)))
+    (trie/make-trie)
+    (->> (trie/children-at-depth (@context :trie) 0 1))))
+  nil)
+
+
+(comment
+  (time (initialize))
+
+  (println 2)
+
+  (take 5 (:flex-rhyme-trie @context))
+
+  )
+
+(defn choose-next-word
+  "Given an n-gram of [[word1 freq1] [word2 freq2]] chooses
+  the next word based on markov data in trie.
+
+  Could be improved by taking into account grammar and/or bidirectional context.
+
+  The n-gram parameter is a list of trie entries
+  For trie entries that are word/frequency pairs, it might look something like this.
+  `[[sunshine 38] [</s> 509]]`
+
+  But note that nothing in this function uses the frequency count from the passed in n-gram.
+  It's just easier for the calling functions to pass them in like that."
+  [{:keys [database trie] :as context} n-gram]
+  (let [n-gram-ids (->> n-gram (map first) (map database))
+        node (trie/lookup trie n-gram-ids)]
+    (cond
+      (= 0 (count n-gram-ids))
+      (let [children (->> (trie/children trie)
+                          (map #(get % [])))
+            choice (math/weighted-selection second children)]
+        [(database (first choice)) (second choice)])
+      node
+      (let [children (->> (trie/children node)
+                          (map #(get % []))
+                          (remove (fn [[id f]] (= id (first n-gram-ids)))))]
+        (if (seq children)
+          (let [children-freqs (into (sorted-map) (frequencies (map second children)))
+                n-minus-1-gram-odds (/ (second (first children-freqs))
+                                       (+ (second (get node []))
+                                          (second (first children-freqs))))
+                ;; Good-turing smoothing, take unseen ngram?
+                take-n-minus-1-gram? (and (< 1 (count n-gram-ids))
+                                          (< (rand) n-minus-1-gram-odds))]
+            (if take-n-minus-1-gram?
+              (choose-next-word context (butlast n-gram))
+              (let [choice (math/weighted-selection second children)]
+                [(database (first choice)) (second choice)])))
+          (choose-next-word context (butlast n-gram))))
+      :else
+      (choose-next-word context (butlast n-gram)))))
+
+(defn valid-sentence? [phrase]
+  (->> phrase
+       (map first)
+       (string/join " ")
+       (#(string/replace % #"(<s>|</s>)" ""))
+       (nlp/valid-sentence?)))
+
+(defn generate-sentence-backwards
+  "Given a phrase of [w1 w2 w3] generates a sentence
+  using a backwards markov."
+  ([{:keys [database trie] :as context} phrase]
+   (let [phrase (map (fn [w]
+                       (let [id (database w)]
+                         [w (second (get trie [id]))]))
+                     phrase)]
+     (loop [phrase' (loop [phrase phrase]
+                      (if (= "<s>" (first (first phrase)))
+                        phrase
+                        (recur (cons (choose-next-word context (take 3 phrase))
+                                     phrase))))]
+       (if (valid-sentence? phrase')
+         phrase'
+         (recur (loop [phrase phrase]
+                  (if (= "<s>" (first (first phrase)))
+                    phrase
+                    (recur (cons (choose-next-word context (take 3 phrase))
+                                 phrase)))))))))
+  )
+
+(comment
+  (take 5 (:database @context))
+
+  (map (:database @context) ["me" "bother"])
+  (map (:database @context) ["bother me"])
+  (first
+   (filter
+    valid-sentence?
+    (repeatedly
+     (fn []
+       (generate-sentence-backwards
+        @context
+        ["bother" "me" "</s>"])))))
+
+  (keys @context)
+  (time (initialize))
+  )
+
+(defn rhyme-from-scheme-2
+  "Generate rhyme without the use of `weighted-selection/adjust-for-markov`."
+  [])
--- a/src/com/owoga/prhyme/nlg/prhyme_nlg.clj
+++ b/src/com/owoga/prhyme/nlg/prhyme_nlg.clj
@ -6,7 +6,6 @@
            [com.owoga.phonetics.syllabify :as owoga.syllabify]
            [com.owoga.phonetics :as owoga.phonetics]
            [com.owoga.tightly-packed-trie.encoding :as encoding]
-            [examples.core :as examples]
            [taoensso.nippy :as nippy]
            [com.owoga.prhyme.nlp.core :as nlp]
            [examples.tpt :as examples.tpt]
@ -18,6 +17,78 @@
            [com.owoga.tightly-packed-trie :as tpt]
            [com.owoga.prhyme.nlp.tag-sets.treebank-ii :as tb2]))

+;;;; Utilities
+;;
+;;
+
+(defn phrase->flex-rhyme-phones
+  "Takes a space-seperated string of words
+  and returns the concatenation of the words
+  vowel phones.
+
+  Returns them in reversed order so they
+  are ready to be used in a lookup of a rhyme trie.
+  "
+  [phrase]
+  (->> phrase
+       (#(string/split % #" "))
+       (map (comp owoga.syllabify/syllabify first owoga.phonetics/get-phones))
+       (map (partial reduce into []))
+       (map #(filter (partial re-find #"\d") %))
+       (flatten)
+       (map #(string/replace % #"\d" ""))
+       (reverse)))
+
+(defn take-words-amounting-to-at-least-n-syllables
+  "This function is nice to grab the tail end of a sentence for making a good rhyme.
+  If the sentence ends with a single-syllable word, like 'me', but a more
+  interesting n-gram like 'bother me', then you might want to explore the rhymes
+  available for the last N syllables. Sure, a word like 'poverty' would show up if you
+  got all rhymes for 'me'. But you'd have to filter through a lot more less-than-great
+  rhymes before you see it."
+  [phrase n]
+  (letfn [(phones [word]
+            [word (first (owoga.phonetics/get-phones word))])
+          (syllables [[word phones]]
+            [word (owoga.syllabify/syllabify phones)])]
+    (->> phrase
+         (#(string/split % #" "))
+         (map phones)
+         (map syllables)
+         (reduce
+          (fn [result [word syllables]]
+            (if (<= n (count (mapcat second result)))
+              (reduced result)
+              (conj result [word syllables])))
+          [])
+         (map first)
+         (string/join " "))))
+
+(comment
+  (take-words-amounting-to-at-least-n-syllables
+   "police can bother me" 3);; => "police can"
+  (take-words-amounting-to-at-least-n-syllables
+   "police can bother me" 4);; => "police can bother"
+  )
+
+(defn take-n-syllables
+  "Returns the vowel sounds that make up the last n syllables.
+  Doesn't return stress."
+  [phrase n]
+  (if (string? phrase)
+    (->> phrase
+         (phrase->flex-rhyme-phones)
+         (take n)
+         (reverse))
+    (take-last n phrase)))
+
+(comment
+  (take-n-syllables "bother me" 2);; => ("ER" "IY")
+  )
+
+
+;;;; Much of the code below is related to grammar generation.
+
 (defn update-values [m f & args]
  (reduce
   (fn [acc [k v]]
@ -298,27 +369,6 @@
      (recur (next parse-zipper)))))

 (comment
-  (let [structure '(TOP (S (NP (DT) (JJ) (NN))
-                           (VP (RB) (VBZ))
-                           (NP (DT) (JJ) (NN))))
-        structure (-> structure
-                      zip/seq-zip
-                      nlp/iter-zip
-                      last)
-        pos-freqs (examples/pos-paths->pos-freqs
-                   examples/t1)]
-    (repeatedly
-     10
-     (fn []
-       (->> (generate-with-markov-with-custom-progression-n-2-pos-freqs
-             zip/prev
-             zip/next
-             nil?
-             zip/end?
-             examples/pos-freqs-data-2
-             structure
-             examples/darkov-2)))))
-
  (timbre/set-level! :info)
  (timbre/set-level! :error)

@ -334,67 +384,6 @@
         pos-path->word-freqs
         pos->word-freqs
         target-parse-tree)))
-  (time (def example-pos-freqs examples/example-pos-freqs))
-  (nippy/thaw)
-  (nippy/freeze-to-file "resources/1000-pos-path-freqs.nip" example-pos-freqs)
-
-  (time (def example-structures examples/example-structures))
-  (weighted-rand/weighted-selection-from-map
-   example-structures)
-
-
-
-  (take 5 examples/t2)
-  (let [structure (weighted-rand/weighted-selection-from-map
-                   examples/popular-structure-freq-data)
-        structure (-> structure
-                      zip/seq-zip
-                      nlp/iter-zip
-                      last)
-        pos-freqs examples/pos-freqs-data-2]
-    (repeatedly
-     10
-     (fn []
-       (->> (generate-with-markov-with-custom-progression-n-2-pos-freqs
-             zip/prev
-             zip/next
-             nil?
-             zip/end?
-             pos-freqs
-             structure
-             examples/darkov-2)
-            nlp/leaf-nodes
-            (string/join " ")))))
-
-  (repeatedly
-   10
-   (fn []
-     (let [structure (weighted-rand/weighted-selection-from-map
-                      (->> examples/t2
-                           (sort-by second)
-                           (reverse)
-                           (take 20)))
-           structure (-> structure
-                         zip/seq-zip
-                         nlp/iter-zip
-                         last)
-           pos-freqs (examples/pos-paths->pos-freqs
-                      examples/t1)]
-       (repeatedly
-        10
-        (fn []
-          (->> (generate-with-markov-with-custom-progression
-                zip/prev
-                zip/next
-                nil?
-                zip/end?
-                examples/t1
-                pos-freqs
-                structure
-                examples/darkov-2)
-               nlp/leaf-nodes
-               (string/join " ")))))))
-
  )


@ -442,12 +431,15 @@
            (update trie k (fnil inc 0)))
          trie
          entries)))
-      #_(trie/make-trie)
-      test-trie
+      (trie/make-trie)
      (->> texts
           (drop 4000)
           (take 4000)))))

+  (def test-trie (into (trie/make-trie) (nippy/thaw-from-file "resources/grammar-trie-take-8000.bin")))
+
+  (def test-database (atom (nippy/thaw-from-file "resources/grammar-database-take-8000.bin")))
+
  )

 (defn children
@ -531,25 +523,63 @@

  (nippy/freeze-to-file "resources/grammar-database-take-8000.bin" @test-database)

+  (->> (take 20 test-trie)
+       (map (comp (partial map @test-database) first)))
+
+  (->> (take 20 (reverse (sort-by second test-trie)))
+       (map (fn [[a b]]
+              [(map @test-database a) b])))
+
+
+  ;; A sampling of the words that have been seen in the [TOP S NP NN] position.
+  (->> (trie/lookup test-trie (map @test-database '[TOP S NP NN]))
+       (map (comp @test-database first first))
+       (drop 100)
+       (take 5))
+  ;; => ("sink" "lose" "deep" "well" "help")
+
  )

-(defn phrase->flex-rhyme-phones
-  "Takes a space-seperated string of words
-  and returns the concatenation of the words
-  vowel phones.
+(defn zipper-last
+  [zipper]
+  (->> zipper
+       (iterate zip/next)
+       (take-while (complement zip/end?))
+       last))

-  Returns them in reversed order so they
-  are ready to be used in a lookup of a rhyme trie.
-  "
-  [phrase]
-  (->> phrase
-       (#(string/split % #" "))
-       (map (comp owoga.syllabify/syllabify first owoga.phonetics/get-phones))
-       (map (partial reduce into []))
-       (map #(filter (partial re-find #"\d") %))
-       (flatten)
-       (map #(string/replace % #"\d" ""))
-       (reverse)))
+(defn previous-leaf-loc
+  [zipper]
+  (->> zipper
+       (iterate zip/prev)
+       (take-while (complement nil?))
+       (filter #(and (symbol? (zip/node %))
+                     (zip/up %)
+                     (= 1 (count (zip/node (zip/up %))))))
+       (first)))
+
+(defn previous-leaf-part-of-speech
+  [zipper]
+  (->> zipper
+       previous-leaf-loc
+       (zip/path)
+       (map first)
+       (filter symbol?)))
+
+(defn nearest-ancestor-phrase
+  [loc]
+  (->> loc
+       (iterate zip/prev)
+       (take-while (complement nil?))
+       (filter (comp tb2/phrases zip/node))
+       (first)))
+
+(comment
+  (nearest-ancestor-phrase
+   (->> (zip/vector-zip
+         '[NP [NN]])
+        zip/down
+        zip/right
+        zip/down)))

 (defn markov-generate-grammar-with-rhyming-tail
  [grammar-trie grammar-database rhyme-trie rhyme-database rhyme-target zipper]
@ -645,6 +675,11 @@
   @test-database
   (zip/vector-zip [1]))

+  (def test-sentence (markov-generate-sentence
+                      test-trie
+                      @test-database
+                      (zip/vector-zip [1])))
+
  (repeatedly
   20
   #(->> (generate test-trie @test-database (zip/vector-zip [1]))
@ -666,13 +701,6 @@
       (zip/root (apply-fn loc)))
      (recur (next-fn (apply-fn loc))))))

-(defn zipper-last
-  [zipper]
-  (->> zipper
-       (iterate zip/next)
-       (take-while (complement zip/end?))
-       last))
-
 (defn decode-fn
    "Decodes a variable-length encoded number from a byte-buffer.
  Zero gets decoded to nil."
@ -688,6 +716,13 @@
       (filter (complement zip/branch?))
       (map zip/node)))

+
+(def tpt (tpt/load-tightly-packed-trie-from-file
+        (io/resource "dark-corpus-4-gram-backwards-tpt.bin")
+        decode-fn))
+
+(def tpt-db (nippy/thaw-from-file (io/resource "dark-corpus-4-gram-backwards-db.bin")))
+
 (defn choose-with-n-gram-markov
  "Hard-coded to work with 4-gram. That's the </s> at the end."
  [zipper
@ -735,39 +770,7 @@
     choice]
    (first choice)))

-(defn previous-leaf-loc
-  [zipper]
-  (->> zipper
-       (iterate zip/prev)
-       (take-while (complement nil?))
-       (filter #(and (symbol? (zip/node %))
-                     (zip/up %)
-                     (= 1 (count (zip/node (zip/up %))))))
-       (first)))

-(defn previous-leaf-part-of-speech
-  [zipper]
-  (->> zipper
-       previous-leaf-loc
-       (zip/path)
-       (map first)
-       (filter symbol?)))
-
-(defn nearest-ancestor-phrase
-  [loc]
-  (->> loc
-       (iterate zip/prev)
-       (take-while (complement nil?))
-       (filter (comp tb2/phrases zip/node))
-       (first)))
-
-(comment
-  (nearest-ancestor-phrase
-   (->> (zip/vector-zip
-         '[NP [NN]])
-        zip/down
-        zip/right
-        zip/down)))

 (comment
  ;; Working backwards from a completed grammar tree that has
@ -796,8 +799,45 @@
                             (trie/lookup tpt)
                             (trie/children)
                             (map #(vector (tpt-db (.key %)) (get % []))))]
-    (choose-with-n-gram-markov
-     loc test-trie @test-database tpt tpt-db))
+    [(zip/node loc)
+     prev-pos
+     prev-pos'
+     n-gram
+     n-gram'
+     (choose-with-n-gram-markov
+      loc test-trie @test-database tpt tpt-db)])
+
+  (let [zipper (zip/vector-zip
+                '[[TOP
+                   [[VP
+                     [[[VBN]]
+                      [PP [[[TO]] [NP [[[NN]]]]]]
+                      [PP [[[IN ["into"]]] [NP [[[PRP$ ["my"]]] [[NNS ["answers"]]]]]]]]]]]])
+        loc (->> zipper
+                 (iterate zip/next)
+                 (filter #(= "into" (zip/node %)))
+                 (first))
+        prev-pos (previous-leaf-part-of-speech loc)
+        prev-pos' (map @test-database prev-pos)
+        n-gram (filter string? (rest-leafs loc))
+        n-gram' (mapv tpt-db n-gram)
+        grammar-children (->> (children test-trie @test-database prev-pos')
+                              (map first)
+                              (map @test-database))
+        n-gram-children (->> n-gram'
+                             (take 2)
+                             (reverse)
+                             (trie/lookup tpt)
+                             (trie/children)
+                             (map #(vector (tpt-db (.key %)) (get % []))))]
+    [(zip/node loc)
+     prev-pos
+     prev-pos'
+     n-gram
+     n-gram'
+     (choose-with-n-gram-markov
+      loc test-trie @test-database tpt tpt-db)])
+

  (let [zipper (zip/vector-zip
                '[[TOP
@ -840,6 +880,7 @@
       choice]))

  (trie/lookup test-trie [1 59 3 5 5 17])
+
  (@test-database 1911)

  (def tpt (tpt/load-tightly-packed-trie-from-file
@ -847,6 +888,7 @@
            decode-fn))

  (def tpt-db (nippy/thaw-from-file (io/resource "dark-corpus-4-gram-backwards-db.bin")))
+
  (markov-generate-grammar test-trie @test-database (zip/vector-zip [1]))

  (-> (markov-generate-grammar test-trie @test-database (zip/vector-zip [1]))
@ -1044,6 +1086,7 @@

 (comment
  (generate-grammar-from [[NN ["taylor"]]])
+
  (map (comp (partial map @test-database) first) (take 5 test-trie))

  (@test-database 1)
--- a/src/com/owoga/prhyme/nlp/core.clj
+++ b/src/com/owoga/prhyme/nlp/core.clj
@ -15,7 +15,11 @@
                                 ParserFactory)
           (opennlp.tools.cmdline.parser ParserTool)))

-(comment tb2/phrases)
+(comment
+  tb2/phrases
+  (.exists (io/file (io/resource "models/en-token.bin")))
+
+  )
 (def tokenize (nlp/make-tokenizer (io/resource "models/en-token.bin")))
 (def get-sentences (nlp/make-sentence-detector (io/resource "models/en-sent.bin")))
 (def parse (tb/make-treebank-parser (io/resource "models/en-parser-chunking.bin")))
@ -68,8 +72,8 @@
  (ParserFactory/create
   (ParserModel.
    (io/input-stream (io/resource "models/en-parser-chunking.bin")))
-   5
-   0.95))
+   50
+   0.90))

 (defn parse-probs [parses]
  (map #(.getProb %) parses))
@ -86,10 +90,32 @@
  (- (Math/log 0.001) (Math/log 0.01))
  (Math/E)
  (tokenize "Eric's testing.")
+
+  (Math/log 0.9999)
+  (Math/pow Math/E -0.5)
  (let [results (StringBuffer.)
        parses (ParserTool/parseLine "The dog ran fast ." custom-parser 1)]
    ((juxt parse-probs parse-strs) parses))

+  (let [results (StringBuffer.)
+        parses (ParserTool/parseLine "Eric is testing." custom-parser 1)]
+    [((juxt parse-probs parse-strs) parses)
+     (count parses)])
+
+  (let [results (StringBuffer.)
+        parses (ParserTool/parseLine "Eric is testing." custom-parser 2)]
+    [((juxt parse-probs parse-strs) parses)
+     (count parses)])
+
+  (let [results (StringBuffer.)
+        parses (ParserTool/parseLine "This is a good day." custom-parser 1)]
+    [((juxt parse-probs parse-strs) parses)
+     (count parses)])
+
+  (let [results (StringBuffer.)
+        parses (ParserTool/parseLine "The do dog run drive a." custom-parser 1)]
+    ((juxt parse-probs parse-strs) parses))
+
  (let [results (StringBuffer.)
        parses (ParserTool/parseLine "Eric 's testing ." custom-parser 1)]
    (meta parses))
@ -146,6 +172,12 @@
       tb2/clauses
       boolean))

+(comment
+  (->> "the lazy fox"
+       vector
+       parse)
+
+  )
 (defn unmake-tree
  "Tokenizing and then parsing a sentence returns a string
  representation of the parse tree. This is a helper function
--- a/src/com/owoga/prhyme/util.clj
+++ b/src/com/owoga/prhyme/util.clj
@ -27,7 +27,10 @@
  (map str (.getPhones cmu-lexicon "two" nil)))

 (defn get-phones
-  "String must be lowercase."
+  "Gets phones for known or unknonwn words from the CMULexicon,
+  removes the stress, and converts them to a format that matches the CMU Sphinx
+  dictionary (capitalizes and replaces 'ax' with 'ah') String must be
+  lowercase."
  [word]
  (->> (map str (.getPhones cmu-lexicon word nil))
       (map remove-stress)
@ -35,9 +38,9 @@
       (map string/upper-case)))

 (defn get-phones-with-stress
-  "String must be lowercase.
-  .getPhones only.
-  Might be different from stress in cmu-dict"
+  "Same as `get-phones` but leaves stress.
+  Note that this might not be the same stress that you'd see
+  in the CMU pronouncing dictionary."
  [word]
  (->> (map str (.getPhones cmu-lexicon word nil))
       (map convert-to-sphinx)