Remove stale code, add comments, cleanup

5 years ago · 62a1ff3a3b
parent 9b8fec9fc7
commit 62a1ff3a3b
9 changed files with 689 additions and 269 deletions
--- a/README.rst
+++ b/README.rst
@ -61,6 +61,24 @@ words, then you're stuck trying to rhyme with the single syllable
 Implementation
 --------------
 2021-06-09
 ++++++++++
 Most generation tasks are going to require some big data structures, like a Trie of n-grams.
 A ``context`` is an atom that gets updated with those data structures.
 Loading some of these data structures can take a long time, so only load what you need.
 An example of the different data structures you might load:
 Alliterations - From the database of n-grams, convert each n-gram to syllables then create a trie of the alliterations.
 Perfect rhymes - Again, from the database of n-grams, convert n-gram to syllables and create trie of reverse of syllables.
 Imperfect rhymes - Perform some manipulation of the syllables so that you can be more flexible with your rhymes.
 One key that is probably always required is the ``database``. This maps words to their IDs and IDs to their words. The integer
 IDs are necessary for tightly packed tries.
 2020-10-20
 ++++++++++
--- a/deps.edn
+++ b/deps.edn
@ -21,7 +21,7 @@
        com.taoensso/nippy {:mvn/version "3.0.0"}
        com.taoensso/timbre {:mvn/version "4.10.0"}
        com.owoga/tightly-packed-trie
-        {:local/root "/home/eihli/code/tightly-packed-trie"}
+        {:local/root "/home/eihli/src/clj-tightly-packed-trie"}
        com.owoga/phonetics {:mvn/version "0.1.1"}}
 :aliases {:dev {:extra-paths ["test" "examples" "dev"]
                 :extra-deps {}}}}
--- a/dev/examples/tpt.clj
+++ b/dev/examples/tpt.clj
@ -18,6 +18,7 @@
            [com.owoga.prhyme.syllabify :as syllabify]
            [taoensso.nippy :as nippy]))
 (set! *warn-on-reflection* true)
 (tufte/add-basic-println-handler! {})
 (defn xf-file-seq [start end]
@ -505,7 +506,7 @@
    (trie/make-trie)
    (@context :database)))
-  #_(swap!
+  (swap!
   context
   assoc
   :flex-rhyme-trie
@ -546,6 +547,7 @@
    nil)
  (take 5 (@context :flex-rhyme-trie))
  )
 (comment
@ -553,17 +555,31 @@
       (take 20)
       (map first)
       (map (partial map (@context :database))))
  (trie/children (trie/lookup (@context :trie) [13393]))
  ((@context :database) "desk")   ;; => 13393
  ((@context :database) "wobbly") ;; => 152750
  (get (@context :trie) [13393 152750]))
 (defn rhyme-choices
-  [{:keys [flex-rhyme-trie database] :as context} phrase]
+  [{:keys [rhyme-trie database] :as context} phrase]
  (if (string? phrase)
-    (let [phones (phrase->flex-rhyme-phones phrase)]
+    (let [phones (phrase->phones phrase)]
-      (get flex-rhyme-trie phones))
+      (get rhyme-trie phones))
-    (get flex-rhyme-trie phrase)))
+    (get rhyme-trie phrase)))
 (comment
  (get (:rhyme-trie @context) (phrase->phones "fall"))
  (->> (rhyme-choices
        @context
        "fall")
       #_(map (comp (:database @context) first first)))
  (take 20 (:flex-rhyme-trie @context))
  (take 20 (:rhyme-trie @context))
  )
 (defn exclude-non-rhymes-from-choices
  "Removes any choice that includes the last
@ -633,8 +649,11 @@
 (comment
  (get-flex-rhyme @context "bother me")
  (phrase->flex-rhyme-phones "bother me")
  (get-flex-rhyme @context ["IY" "ER" "AA"])
  )
 (defn get-next-markov
@ -644,7 +663,9 @@
        children (and node
                      (->> node
                           trie/children
-                           (map #(vector (.key %) (get % [])))
+                           (map (fn [^com.owoga.trie.ITrie child]
                                  [(.key child)
                                   (get child [])]))
                           (remove (comp nil? second))
                           (remove
                            (fn [[k v]]
@ -670,7 +691,14 @@
    (database (get-next-markov context word-ids))))
 (comment
  (initialize)
  (get (@context :database) "</s>")
  (keys @context)
  ((@context :database) (get-next-markov @context [1]))
  (get-next-markov @context [222])
  (get-next-markov-from-phrase-backwards @context "will strike you down" 3)
  (get (@context :database) 7982)
@ -693,6 +721,12 @@
      (map database seed))))
 (defn take-words-amounting-to-at-least-n-syllables
  "This function is nice to grab the tail end of a sentence for making a good rhyme.
  If the sentence ends with a single-syllable word, like 'me', but a more
  interesting n-gram like 'bother me', then you might want to explore the rhymes
  available for the last N syllables. Sure, a word like 'poverty' would show up if you
  got all rhymes for 'me'. But you'd have to filter through a lot more less-than-great
  rhymes before you see it."
  [phrase n]
  (letfn [(phones [word]
            [word (first (owoga.phonetics/get-phones word))])
@ -711,7 +745,16 @@
         (map first)
         (string/join " "))))
 (comment
  (take-words-amounting-to-at-least-n-syllables
   "police can bother me" 3);; => "police can"
  (take-words-amounting-to-at-least-n-syllables
   "police can bother me" 4);; => "police can bother"
  )
 (defn take-n-syllables
  "Returns the vowel sounds that make up the last n syllables.
  Doesn't return stress."
  [phrase n]
  (if (string? phrase)
    (->> phrase
@ -720,7 +763,9 @@
         (reverse))
    (take-last n phrase)))
-(take-n-syllables "bother me" 2)
+(comment
  (take-n-syllables "bother me" 2);; => ("ER" "IY")
  )
 (defn valid-english-sentence?
  [phrase]
@ -747,7 +792,8 @@
 (defn rhyming-n-gram-choices
  [context target-rhyme]
  (loop [target-rhyme target-rhyme]
-    (let [choices (->> target-rhyme
+    (let [context @context
          choices (->> target-rhyme
                       (rhyme-choices context)
                       (exclude-non-rhymes-from-choices context target-rhyme)
                       (exclude-non-english-phrases-from-choices context))]
@ -758,11 +804,12 @@
        choices))))
 (comment
-  (->> (rhyming-n-gram-choices @context "fall")
+  (->> (rhyming-n-gram-choices context "fall")
       (map (comp (@context :database) first first)))
-  )
+  (rhyme-choices @context "tall")
  )
 (defn generate-n-syllable-sentence-rhyming-with
  [context target-phrase n-gram-rank target-rhyme-syllable-count target-sentence-syllable-count]
@ -801,6 +848,16 @@
                " "
                phrase)))))))
 (comment
  (generate-n-syllable-sentence-rhyming-with
   context
   "war on poverty"
   3
   3
   8)
  )
 (defn generate-haiku
  [seed]
  (let [haiku (cons
@ -829,7 +886,9 @@
          (map last)
          (apply distinct?))))
-  (->> (generate-haiku "football")
+  (println (first (generate-haiku "fall")))
  (->> (generate-haiku "</s>")
       (filter valid-haiku)
       (map (partial string/join "\n"))
       (map #(vector % (sha256 %)))
@ -837,7 +896,7 @@
              (println haiku)
              (println sha)
              (println)))
-       (take 10))
+       (take 1))
  )
@ -850,7 +909,9 @@
       @context
       (take 3 (phrase->flex-rhyme-phones "player unknown battlegrounds"))
       3 4 %)
-     [9 6 6 9 6 6 9 6 6]))))
+     [9 6 6 9 6 6 9 6 6])))
  )
 "
 another day a battleground
@ -878,7 +939,7 @@ witness sky is blackened now
 (defn continuously-amulate
  [seed]
-  (let [next-sentence (amul8 seed)
+  (let [next-sentence (generate-haiku seed)
        next-seed (->> next-sentence
                       (#(string/split % #" "))
                       (reverse)
@ -898,6 +959,8 @@ witness sky is blackened now
     (cons next-sentence (continuously-amulate next-seed)))))
 (comment
  (generate-haiku "technology")
  (take 5 (continuously-amulate "technology"))
  (->> (amul8 "technology" 1)
@ -1034,7 +1097,16 @@ witness sky is blackened now
 (defn choose-next-word
  "Given an n-gram of [[word1 freq1] [word2 freq2]] chooses
-  the next word based on markove data in trie."
+  the next word based on markov data in trie.
  Could be improved by taking into account grammar and/or bidirectional context.
  The n-gram parameter is a list of trie entries
  For trie entries that are word/frequency pairs, it might look something like this.
  `[[sunshine 38] [</s> 509]]`
  But note that nothing in this function uses the frequency count from the passed in n-gram.
  It's just easier for the calling functions to pass them in like that."
  [{:keys [database trie] :as context} n-gram]
  (let [n-gram-ids (->> n-gram (map first) (map database))
        node (trie/lookup trie n-gram-ids)]
@ -1053,6 +1125,7 @@ witness sky is blackened now
                n-minus-1-gram-odds (/ (second (first children-freqs))
                                       (+ (second (get node []))
                                          (second (first children-freqs))))
                ;; Good-turing smoothing, take unseen ngram?
                take-n-minus-1-gram? (and (< 1 (count n-gram-ids))
                                          (< (rand) n-minus-1-gram-odds))]
            (if take-n-minus-1-gram?
@ -1137,22 +1210,19 @@ witness sky is blackened now
    (get trie ids))
  (choose-next-word @context (take 3 [["</s>" 509]]))
-  (generate-sentence-backwards @context ["</s>"])
+  (generate-sentence-backwards @context ["kill" "</s>"])
  (valid-sentences? (generate-phrase @context '(["bitter" 41])))
  (choose-next-word @context (take 3 [["theology" 41]]))
  (choose-next-word @context [["and" 5] ["theology" 41]])
  (find-rhymes (@context :perfect-rhyme-trie) "theology")
-  (trie/chil(trie/lookup (@context :trie) '(57 2477)))
+  (trie/lookup (@context :trie) '(57 2477))
  (take 5 (@context :trie))
  (->> (find-rhymes (@context :perfect-rhyme-trie) "technology")
       (map (fn [[word frq]]
              (let [n+1grams (word->n+1grams
@ -1162,101 +1232,55 @@ witness sky is blackened now
                (map vector n+1grams (repeat [word frq])))))
       (reduce into []))
  (def loaded-backwards-trie
    (tpt/load-tightly-packed-trie-from-file
     "resources/dark-corpus-backwards-tpt.bin"
     (decode-fn @trie-database)))
  (def loaded-backwards-database
    (with-open [rdr (clojure.java.io/reader "resources/backwards-database.bin")]
      (into {} (map read-string (line-seq rdr)))))
  (def rhyme-database (atom {}))
  (def db
    (nippy/thaw-from-file (io/resource "dark-corpus-4-gram-backwards-db.bin")))
  (def perfect-rhyme-trie
    (transduce
     (comp
      (map first)
      (filter string?)
      (map #(vector % (reverse (word->phones %))))
      (map reverse))
     (completing
      (fn [trie [k v]]
        (update trie k (fnil #(update % 1 inc) [v 0]))))
     (trie/make-trie)
     @loaded-backwards-database))
  (def vowel-rhyme-trie
    (transduce
     (comp
      (map first)
      (filter string?)
      (map #(vector % (reverse (word->phones %))))
      (map reverse)
      (map (fn [[phones v]]
             [(map #(if (owoga.phonetics/vowel
                         (string/replace % #"\d" ""))
                      %
                      "?")
                   phones)
              v])))
     (completing
      (fn [trie [k v]]
        (update trie k (fnil #(update % 1 inc) [v 0]))))
     (trie/make-trie)
     (take 1000 db)))
  (take 20 vowel-rhyme-trie)
  (do
    #_(time
       (def backwards-trie
         (transduce (comp (xf-file-seq 0 250000)
                          (map slurp)
                          (map (partial n-to-m-backwards-grams 1 4))
                          (map (fn [ngrams] (map #(prep-ngram-for-trie %) ngrams)))
                          stateful-transducer)
                    conj
                    (file-seq (io/file "dark-corpus")))))
    #_(time
       (def tightly-packed-backwards-trie
         (tpt/tightly-packed-trie
          backwards-trie
          encode-fn
          (decode-fn @trie-database))))
    #_(tpt/save-tightly-packed-trie-to-file
       "resources/dark-corpus-backwards-tpt.bin"
       tightly-packed-backwards-trie)
    #_(with-open [wtr (clojure.java.io/writer "resources/backwards-database.bin")]
        (let [lines (->> (seq @trie-database)
                         (map pr-str)
                         (map #(str % "\n")))]
          (doseq [line lines]
            (.write wtr line))))
    (def loaded-backwards-trie
      (tpt/load-tightly-packed-trie-from-file
       "resources/dark-corpus-backwards-tpt.bin"
       (decode-fn @trie-database)))
    (def loaded-backwards-database
      (with-open [rdr (clojure.java.io/reader "resources/backwards-database.bin")]
        (into {} (map read-string (line-seq rdr)))))
    (def rhyme-database (atom {}))
    (def db
      (nippy/thaw-from-file (io/resource "dark-corpus-4-gram-backwards-db.bin")))
    (def perfect-rhyme-trie
      (transduce
       (comp
        (map first)
        (filter string?)
        (map #(vector % (reverse (word->phones %))))
        (map reverse))
       (completing
        (fn [trie [k v]]
          (update trie k (fnil #(update % 1 inc) [v 0]))))
       (trie/make-trie)
       @loaded-backwards-database))
    (def vowel-rhyme-trie
      (transduce
       (comp
        (map first)
        (filter string?)
        (map #(vector % (reverse (word->phones %))))
        (map reverse)
        (map (fn [[phones v]]
               [(map #(if (owoga.phonetics/vowel
                           (string/replace % #"\d" ""))
                        %
                        "?")
                     phones)
                v])))
       (completing
        (fn [trie [k v]]
          (update trie k (fnil #(update % 1 inc) [v 0]))))
       (trie/make-trie)
       (take 1000 db)))
    (take 20 vowel-rhyme-trie)
    )
  #_(with-open [wtr (clojure.java.io/writer "database.bin")]
      (let [lines (->> (seq @trie-database)
                       (map pr-str)
@ -1310,9 +1334,9 @@ witness sky is blackened now
  (id-get-in-tpt
   tightly-packed-trie
   trie-database
-   '(2 2 3)))
+   '(2 2 3))
  ;; => {("<s>" "<s>" "the") {:value ("<s>" "<s>" "the"), :count 462}}
-
+  )
 (comment
  (->> (perfect-rhymes perfect-rhyme-trie
@ -1417,3 +1441,9 @@ witness sky is blackened now
                       #(= (last %) \2)
                       phones))]
    (trie/lookup trie rhyme-suffix)))
 (comment
  (keys @context)
  ;; => (:flex-rhyme-trie :database :trie :perfect-rhyme-trie :rhyme-trie)
  )
--- a/src/com/owoga/corpus/util.clj
+++ b/src/com/owoga/corpus/util.clj
@ -0,0 +1,81 @@
 (ns com.owoga.corpus.util
  (:require [taoensso.tufte :as tufte :refer (defnp p profiled profile)]
            [clojure.string :as string]))
 (set! *warn-on-reflection* true)
 (tufte/add-basic-println-handler! {})
 (defn clean-text
  "Removes all non-alphabetical characters and lowercases everything.
  Very spartan way of cleaning."
  [text]
  (string/lower-case (string/replace text #"[^a-zA-Z'\-\s]+" "")))
 (defn xf-file-seq [start end]
  (comp (remove #(.isDirectory %))
        (drop start)
        (take end)))
 (def re-word
  "Regex for tokenizing a string into words
  (including contractions and hyphenations),
  commas, periods, and newlines."
  #"(?s).*?([a-zA-Z\d]+(?:['\-]?[a-zA-Z]+)?|,|\.|\n)")
 (defn pad-tokens
  "Pads the beginning with n - 1 <s> tokens and
  the end with 1 </s> token."
  [beginning-pad
   number-of-beginning-pad
   ending-pad
   number-of-ending-pad
   tokens]
  (vec
   (concat
    (vec (repeat number-of-beginning-pad beginning-pad))
    tokens
    (vec (repeat number-of-ending-pad ending-pad)))))
 (defn padder
  [beg beg-n end end-n]
  (partial pad-tokens beg beg-n end end-n))
 (comment
  (let [p (padder "<s>" 1 "</s>" 3)]
    (p [1 2 3]));; => ["<s>" 1 2 3 "</s>" "</s>" "</s>"]
  )
 (defn tokenize-line
  [line]
  (->> line
       (string/trim)
       (re-seq re-word)
       (mapv second)
       (mapv string/lower-case)))
 (comment
  (tokenize-line "The lazy fox jumps over the moon.")
  ;; => ["the" "lazy" "fox" "jumps" "over" "the" "moon" "."]
  )
 (defn text->ngrams
  "Takes text from a file, including newlines."
  [text n]
  (->> text
       clean-text
       (#(string/split % #"\n+"))
       (remove empty?)
       (mapv tokenize-line)
       (mapv #(partition n 1 %))
       (mapv #(mapv vec %))
       (reduce #(into %1 %2) [])))
 (comment
  (text->ngrams "The lazy fox jumps.\nOver the 5th full moon." 3)
  ;; => [["the" "lazy" "fox"]
  ;;     ["lazy" "fox" "jumps"]
  ;;     ["over" "the" "th"]
  ;;     ["the" "th" "full"]
  ;;     ["th" "full" "moon"]]
  )
--- a/src/com/owoga/prhyme/data_transform.clj
+++ b/src/com/owoga/prhyme/data_transform.clj
@ -237,6 +237,8 @@
  ;; => [[(TOP S NP) (NP PP)] [(S NP) (NP PP)] [(NP) (NP PP)]]
  )
 (comment
  (->> (first texts)
       (split-text-into-sentences)
@ -406,11 +408,11 @@
      (recur (zip/next zipper)))))
 (defn grammar-children
-  [k]
+  [database trie k]
  (sort-by
   (comp - last)
-   (map #(vector (.key %) (@test-database (.key %)) (get % []))
+   (map #(vector (.key %) (database (.key %)) (get % []))
-        (remove (comp nil? #(get % [])) (trie/children (trie/lookup test-trie k))))))
+        (remove (comp nil? #(get % [])) (trie/children (trie/lookup trie k))))))
 (defn grammar-branch?
  [trie database k]
--- a/src/com/owoga/prhyme/limerick.clj
+++ b/src/com/owoga/prhyme/limerick.clj
@ -1,9 +1,18 @@
 (ns com.owoga.prhyme.limerick
  (:require [com.owoga.prhyme.gen :as gen]
            [com.owoga.prhyme.generation.weighted-selection :as weighted-selection]
            [com.owoga.prhyme.util.math :as math]
            [com.owoga.prhyme.nlp.core :as nlp]
            [clojure.string :as string]
            [com.owoga.phonetics :as phonetics]
            [com.owoga.phonetics.syllabify :as syllabify]
            [com.owoga.prhyme.core :as prhyme]
-            [com.owoga.prhyme.util :as util]))
+            [com.owoga.prhyme.util :as util]
            [com.owoga.prhyme.data.dictionary :as dict]
            [com.owoga.trie :as trie]
            [com.owoga.tightly-packed-trie :as tpt]
            [com.owoga.tightly-packed-trie.encoding :as encoding]
            [clojure.java.io :as io]))
 (defn rhyme-from-scheme
  "scheme of format [[A 9] [A 9] [B 5] [B 5] [A 9]]"
@ -42,6 +51,7 @@
  (require '[com.owoga.prhyme.data.dictionary :as dict]
           '[com.owoga.prhyme.data.darklyrics :refer [darklyrics-markov-2]]
           '[clojure.java.io :as io])
  (rhyme-from-scheme dict/prhyme-dict darklyrics-markov-2 '((A 8) (A 8) (B 5) (B 5) (A 8)))
  )
@ -74,3 +84,204 @@
   "war we await the afterlife"])
 ;;;; Generating limericks with a markov model
 (defn phrase->flex-rhyme-phones
  "Takes a space-seperated string of words
  and returns the concatenation of the words
  vowel phones.
  Returns them in reversed order so they
  are ready to be used in a lookup of a rhyme trie.
  "
  [phrase]
  (->> phrase
       (#(string/split % #" "))
       (map (comp syllabify/syllabify first phonetics/get-phones))
       (map (partial reduce into []))
       (map #(filter (partial re-find #"\d") %))
       (flatten)
       (map #(string/replace % #"\d" ""))
       (reverse)))
 (defn word->phones [word]
  (or (dict/word->cmu-phones word)
      (util/get-phones-with-stress word)))
 (defonce context (atom {}))
 (defn decode-fn [db]
  (fn [byte-buffer]
    (let [value (encoding/decode byte-buffer)]
      (if (zero? value)
        nil
        [value (encoding/decode byte-buffer)]))))
 (defn initialize []
  (swap!
   context
   assoc
   :database
   (with-open [rdr (clojure.java.io/reader "resources/backwards-database.bin")]
     (into {} (map read-string (line-seq rdr)))))
  (swap!
   context
   assoc
   :trie
   (tpt/load-tightly-packed-trie-from-file
    "resources/dark-corpus-backwards-tpt.bin"
    (decode-fn (@context :database))))
  (swap!
   context
   assoc
   :perfect-rhyme-trie
   (transduce
    (comp
     (map first)
     (filter string?)
     (map #(vector % (reverse (word->phones %))))
     (map reverse))
    (completing
     (fn [trie [k v]]
       (update trie k (fnil #(update % 1 inc) [v 0]))))
    (trie/make-trie)
    (@context :database)))
  (swap!
   context
   assoc
   :rhyme-trie
   (transduce
    (comp
     (map first)
     (filter string?)
     (map #(vector % (reverse (word->phones %))))
     (map reverse))
    (completing
     (fn [trie [k v]]
       (update trie k (fnil #(update % 1 inc) [v 0]))))
    (trie/make-trie)
    (@context :database)))
  (swap!
   context
   assoc
   :flex-rhyme-trie
   (transduce
    (comp
     (map (fn [[k v]]
            [(string/join " " (map (@context :database) k))
             [k v]]))
     (map (fn [[phrase [k v]]]
            [(phrase->flex-rhyme-phones phrase)
             [k v]])))
    (completing
     (fn [trie [k v]]
       (update trie k (fnil conj [v]) v)))
    (trie/make-trie)
    (->> (trie/children-at-depth (@context :trie) 0 1))))
  nil)
 (comment
  (time (initialize))
  (println 2)
  (take 5 (:flex-rhyme-trie @context))
  )
 (defn choose-next-word
  "Given an n-gram of [[word1 freq1] [word2 freq2]] chooses
  the next word based on markov data in trie.
  Could be improved by taking into account grammar and/or bidirectional context.
  The n-gram parameter is a list of trie entries
  For trie entries that are word/frequency pairs, it might look something like this.
  `[[sunshine 38] [</s> 509]]`
  But note that nothing in this function uses the frequency count from the passed in n-gram.
  It's just easier for the calling functions to pass them in like that."
  [{:keys [database trie] :as context} n-gram]
  (let [n-gram-ids (->> n-gram (map first) (map database))
        node (trie/lookup trie n-gram-ids)]
    (cond
      (= 0 (count n-gram-ids))
      (let [children (->> (trie/children trie)
                          (map #(get % [])))
            choice (math/weighted-selection second children)]
        [(database (first choice)) (second choice)])
      node
      (let [children (->> (trie/children node)
                          (map #(get % []))
                          (remove (fn [[id f]] (= id (first n-gram-ids)))))]
        (if (seq children)
          (let [children-freqs (into (sorted-map) (frequencies (map second children)))
                n-minus-1-gram-odds (/ (second (first children-freqs))
                                       (+ (second (get node []))
                                          (second (first children-freqs))))
                ;; Good-turing smoothing, take unseen ngram?
                take-n-minus-1-gram? (and (< 1 (count n-gram-ids))
                                          (< (rand) n-minus-1-gram-odds))]
            (if take-n-minus-1-gram?
              (choose-next-word context (butlast n-gram))
              (let [choice (math/weighted-selection second children)]
                [(database (first choice)) (second choice)])))
          (choose-next-word context (butlast n-gram))))
      :else
      (choose-next-word context (butlast n-gram)))))
 (defn valid-sentence? [phrase]
  (->> phrase
       (map first)
       (string/join " ")
       (#(string/replace % #"(<s>|</s>)" ""))
       (nlp/valid-sentence?)))
 (defn generate-sentence-backwards
  "Given a phrase of [w1 w2 w3] generates a sentence
  using a backwards markov."
  ([{:keys [database trie] :as context} phrase]
   (let [phrase (map (fn [w]
                       (let [id (database w)]
                         [w (second (get trie [id]))]))
                     phrase)]
     (loop [phrase' (loop [phrase phrase]
                      (if (= "<s>" (first (first phrase)))
                        phrase
                        (recur (cons (choose-next-word context (take 3 phrase))
                                     phrase))))]
       (if (valid-sentence? phrase')
         phrase'
         (recur (loop [phrase phrase]
                  (if (= "<s>" (first (first phrase)))
                    phrase
                    (recur (cons (choose-next-word context (take 3 phrase))
                                 phrase)))))))))
  )
 (comment
  (take 5 (:database @context))
  (map (:database @context) ["me" "bother"])
  (map (:database @context) ["bother me"])
  (first
   (filter
    valid-sentence?
    (repeatedly
     (fn []
       (generate-sentence-backwards
        @context
        ["bother" "me" "</s>"])))))
  (keys @context)
  (time (initialize))
  )
 (defn rhyme-from-scheme-2
  "Generate rhyme without the use of `weighted-selection/adjust-for-markov`."
  [])
--- a/src/com/owoga/prhyme/nlg/prhyme_nlg.clj
+++ b/src/com/owoga/prhyme/nlg/prhyme_nlg.clj
@ -6,7 +6,6 @@
            [com.owoga.phonetics.syllabify :as owoga.syllabify]
            [com.owoga.phonetics :as owoga.phonetics]
            [com.owoga.tightly-packed-trie.encoding :as encoding]
            [examples.core :as examples]
            [taoensso.nippy :as nippy]
            [com.owoga.prhyme.nlp.core :as nlp]
            [examples.tpt :as examples.tpt]
@ -18,6 +17,78 @@
            [com.owoga.tightly-packed-trie :as tpt]
            [com.owoga.prhyme.nlp.tag-sets.treebank-ii :as tb2]))
 ;;;; Utilities
 ;;
 ;;
 (defn phrase->flex-rhyme-phones
  "Takes a space-seperated string of words
  and returns the concatenation of the words
  vowel phones.
  Returns them in reversed order so they
  are ready to be used in a lookup of a rhyme trie.
  "
  [phrase]
  (->> phrase
       (#(string/split % #" "))
       (map (comp owoga.syllabify/syllabify first owoga.phonetics/get-phones))
       (map (partial reduce into []))
       (map #(filter (partial re-find #"\d") %))
       (flatten)
       (map #(string/replace % #"\d" ""))
       (reverse)))
 (defn take-words-amounting-to-at-least-n-syllables
  "This function is nice to grab the tail end of a sentence for making a good rhyme.
  If the sentence ends with a single-syllable word, like 'me', but a more
  interesting n-gram like 'bother me', then you might want to explore the rhymes
  available for the last N syllables. Sure, a word like 'poverty' would show up if you
  got all rhymes for 'me'. But you'd have to filter through a lot more less-than-great
  rhymes before you see it."
  [phrase n]
  (letfn [(phones [word]
            [word (first (owoga.phonetics/get-phones word))])
          (syllables [[word phones]]
            [word (owoga.syllabify/syllabify phones)])]
    (->> phrase
         (#(string/split % #" "))
         (map phones)
         (map syllables)
         (reduce
          (fn [result [word syllables]]
            (if (<= n (count (mapcat second result)))
              (reduced result)
              (conj result [word syllables])))
          [])
         (map first)
         (string/join " "))))
 (comment
  (take-words-amounting-to-at-least-n-syllables
   "police can bother me" 3);; => "police can"
  (take-words-amounting-to-at-least-n-syllables
   "police can bother me" 4);; => "police can bother"
  )
 (defn take-n-syllables
  "Returns the vowel sounds that make up the last n syllables.
  Doesn't return stress."
  [phrase n]
  (if (string? phrase)
    (->> phrase
         (phrase->flex-rhyme-phones)
         (take n)
         (reverse))
    (take-last n phrase)))
 (comment
  (take-n-syllables "bother me" 2);; => ("ER" "IY")
  )
 ;;;; Much of the code below is related to grammar generation.
 (defn update-values [m f & args]
  (reduce
   (fn [acc [k v]]
@ -298,27 +369,6 @@
      (recur (next parse-zipper)))))
 (comment
  (let [structure '(TOP (S (NP (DT) (JJ) (NN))
                           (VP (RB) (VBZ))
                           (NP (DT) (JJ) (NN))))
        structure (-> structure
                      zip/seq-zip
                      nlp/iter-zip
                      last)
        pos-freqs (examples/pos-paths->pos-freqs
                   examples/t1)]
    (repeatedly
     10
     (fn []
       (->> (generate-with-markov-with-custom-progression-n-2-pos-freqs
             zip/prev
             zip/next
             nil?
             zip/end?
             examples/pos-freqs-data-2
             structure
             examples/darkov-2)))))
  (timbre/set-level! :info)
  (timbre/set-level! :error)
@ -334,67 +384,6 @@
         pos-path->word-freqs
         pos->word-freqs
         target-parse-tree)))
  (time (def example-pos-freqs examples/example-pos-freqs))
  (nippy/thaw)
  (nippy/freeze-to-file "resources/1000-pos-path-freqs.nip" example-pos-freqs)
  (time (def example-structures examples/example-structures))
  (weighted-rand/weighted-selection-from-map
   example-structures)
  (take 5 examples/t2)
  (let [structure (weighted-rand/weighted-selection-from-map
                   examples/popular-structure-freq-data)
        structure (-> structure
                      zip/seq-zip
                      nlp/iter-zip
                      last)
        pos-freqs examples/pos-freqs-data-2]
    (repeatedly
     10
     (fn []
       (->> (generate-with-markov-with-custom-progression-n-2-pos-freqs
             zip/prev
             zip/next
             nil?
             zip/end?
             pos-freqs
             structure
             examples/darkov-2)
            nlp/leaf-nodes
            (string/join " ")))))
  (repeatedly
   10
   (fn []
     (let [structure (weighted-rand/weighted-selection-from-map
                      (->> examples/t2
                           (sort-by second)
                           (reverse)
                           (take 20)))
           structure (-> structure
                         zip/seq-zip
                         nlp/iter-zip
                         last)
           pos-freqs (examples/pos-paths->pos-freqs
                      examples/t1)]
       (repeatedly
        10
        (fn []
          (->> (generate-with-markov-with-custom-progression
                zip/prev
                zip/next
                nil?
                zip/end?
                examples/t1
                pos-freqs
                structure
                examples/darkov-2)
               nlp/leaf-nodes
               (string/join " ")))))))
  )
@ -442,12 +431,15 @@
            (update trie k (fnil inc 0)))
          trie
          entries)))
-      #_(trie/make-trie)
+      (trie/make-trie)
      test-trie
      (->> texts
           (drop 4000)
           (take 4000)))))
  (def test-trie (into (trie/make-trie) (nippy/thaw-from-file "resources/grammar-trie-take-8000.bin")))
  (def test-database (atom (nippy/thaw-from-file "resources/grammar-database-take-8000.bin")))
  )
 (defn children
@ -531,25 +523,63 @@
  (nippy/freeze-to-file "resources/grammar-database-take-8000.bin" @test-database)
  (->> (take 20 test-trie)
       (map (comp (partial map @test-database) first)))
  (->> (take 20 (reverse (sort-by second test-trie)))
       (map (fn [[a b]]
              [(map @test-database a) b])))
  ;; A sampling of the words that have been seen in the [TOP S NP NN] position.
  (->> (trie/lookup test-trie (map @test-database '[TOP S NP NN]))
       (map (comp @test-database first first))
       (drop 100)
       (take 5))
  ;; => ("sink" "lose" "deep" "well" "help")
  )
-(defn phrase->flex-rhyme-phones
+(defn zipper-last
-  "Takes a space-seperated string of words
+  [zipper]
-  and returns the concatenation of the words
+  (->> zipper
-  vowel phones.
+       (iterate zip/next)
       (take-while (complement zip/end?))
       last))
-  Returns them in reversed order so they
+(defn previous-leaf-loc
-  are ready to be used in a lookup of a rhyme trie.
+  [zipper]
-  "
+  (->> zipper
-  [phrase]
+       (iterate zip/prev)
-  (->> phrase
+       (take-while (complement nil?))
-       (#(string/split % #" "))
+       (filter #(and (symbol? (zip/node %))
-       (map (comp owoga.syllabify/syllabify first owoga.phonetics/get-phones))
+                     (zip/up %)
-       (map (partial reduce into []))
+                     (= 1 (count (zip/node (zip/up %))))))
-       (map #(filter (partial re-find #"\d") %))
+       (first)))
-       (flatten)
+
-       (map #(string/replace % #"\d" ""))
+(defn previous-leaf-part-of-speech
-       (reverse)))
+  [zipper]
  (->> zipper
       previous-leaf-loc
       (zip/path)
       (map first)
       (filter symbol?)))
 (defn nearest-ancestor-phrase
  [loc]
  (->> loc
       (iterate zip/prev)
       (take-while (complement nil?))
       (filter (comp tb2/phrases zip/node))
       (first)))
 (comment
  (nearest-ancestor-phrase
   (->> (zip/vector-zip
         '[NP [NN]])
        zip/down
        zip/right
        zip/down)))
 (defn markov-generate-grammar-with-rhyming-tail
  [grammar-trie grammar-database rhyme-trie rhyme-database rhyme-target zipper]
@ -645,6 +675,11 @@
   @test-database
   (zip/vector-zip [1]))
  (def test-sentence (markov-generate-sentence
                      test-trie
                      @test-database
                      (zip/vector-zip [1])))
  (repeatedly
   20
   #(->> (generate test-trie @test-database (zip/vector-zip [1]))
@ -666,13 +701,6 @@
       (zip/root (apply-fn loc)))
      (recur (next-fn (apply-fn loc))))))
 (defn zipper-last
  [zipper]
  (->> zipper
       (iterate zip/next)
       (take-while (complement zip/end?))
       last))
 (defn decode-fn
    "Decodes a variable-length encoded number from a byte-buffer.
  Zero gets decoded to nil."
@ -688,6 +716,13 @@
       (filter (complement zip/branch?))
       (map zip/node)))
 (def tpt (tpt/load-tightly-packed-trie-from-file
        (io/resource "dark-corpus-4-gram-backwards-tpt.bin")
        decode-fn))
 (def tpt-db (nippy/thaw-from-file (io/resource "dark-corpus-4-gram-backwards-db.bin")))
 (defn choose-with-n-gram-markov
  "Hard-coded to work with 4-gram. That's the </s> at the end."
  [zipper
@ -735,39 +770,7 @@
     choice]
    (first choice)))
 (defn previous-leaf-loc
  [zipper]
  (->> zipper
       (iterate zip/prev)
       (take-while (complement nil?))
       (filter #(and (symbol? (zip/node %))
                     (zip/up %)
                     (= 1 (count (zip/node (zip/up %))))))
       (first)))
 (defn previous-leaf-part-of-speech
  [zipper]
  (->> zipper
       previous-leaf-loc
       (zip/path)
       (map first)
       (filter symbol?)))
 (defn nearest-ancestor-phrase
  [loc]
  (->> loc
       (iterate zip/prev)
       (take-while (complement nil?))
       (filter (comp tb2/phrases zip/node))
       (first)))
 (comment
  (nearest-ancestor-phrase
   (->> (zip/vector-zip
         '[NP [NN]])
        zip/down
        zip/right
        zip/down)))
 (comment
  ;; Working backwards from a completed grammar tree that has
@ -796,8 +799,45 @@
                             (trie/lookup tpt)
                             (trie/children)
                             (map #(vector (tpt-db (.key %)) (get % []))))]
-    (choose-with-n-gram-markov
+    [(zip/node loc)
-     loc test-trie @test-database tpt tpt-db))
+     prev-pos
     prev-pos'
     n-gram
     n-gram'
     (choose-with-n-gram-markov
      loc test-trie @test-database tpt tpt-db)])
  (let [zipper (zip/vector-zip
                '[[TOP
                   [[VP
                     [[[VBN]]
                      [PP [[[TO]] [NP [[[NN]]]]]]
                      [PP [[[IN ["into"]]] [NP [[[PRP$ ["my"]]] [[NNS ["answers"]]]]]]]]]]]])
        loc (->> zipper
                 (iterate zip/next)
                 (filter #(= "into" (zip/node %)))
                 (first))
        prev-pos (previous-leaf-part-of-speech loc)
        prev-pos' (map @test-database prev-pos)
        n-gram (filter string? (rest-leafs loc))
        n-gram' (mapv tpt-db n-gram)
        grammar-children (->> (children test-trie @test-database prev-pos')
                              (map first)
                              (map @test-database))
        n-gram-children (->> n-gram'
                             (take 2)
                             (reverse)
                             (trie/lookup tpt)
                             (trie/children)
                             (map #(vector (tpt-db (.key %)) (get % []))))]
    [(zip/node loc)
     prev-pos
     prev-pos'
     n-gram
     n-gram'
     (choose-with-n-gram-markov
      loc test-trie @test-database tpt tpt-db)])
  (let [zipper (zip/vector-zip
                '[[TOP
@ -840,6 +880,7 @@
       choice]))
  (trie/lookup test-trie [1 59 3 5 5 17])
  (@test-database 1911)
  (def tpt (tpt/load-tightly-packed-trie-from-file
@ -847,6 +888,7 @@
            decode-fn))
  (def tpt-db (nippy/thaw-from-file (io/resource "dark-corpus-4-gram-backwards-db.bin")))
  (markov-generate-grammar test-trie @test-database (zip/vector-zip [1]))
  (-> (markov-generate-grammar test-trie @test-database (zip/vector-zip [1]))
@ -1044,6 +1086,7 @@
 (comment
  (generate-grammar-from [[NN ["taylor"]]])
  (map (comp (partial map @test-database) first) (take 5 test-trie))
  (@test-database 1)
--- a/src/com/owoga/prhyme/nlp/core.clj
+++ b/src/com/owoga/prhyme/nlp/core.clj
@ -15,7 +15,11 @@
                                 ParserFactory)
           (opennlp.tools.cmdline.parser ParserTool)))
-(comment tb2/phrases)
+(comment
  tb2/phrases
  (.exists (io/file (io/resource "models/en-token.bin")))
  )
 (def tokenize (nlp/make-tokenizer (io/resource "models/en-token.bin")))
 (def get-sentences (nlp/make-sentence-detector (io/resource "models/en-sent.bin")))
 (def parse (tb/make-treebank-parser (io/resource "models/en-parser-chunking.bin")))
@ -68,8 +72,8 @@
  (ParserFactory/create
   (ParserModel.
    (io/input-stream (io/resource "models/en-parser-chunking.bin")))
-   5
+   50
-   0.95))
+   0.90))
 (defn parse-probs [parses]
  (map #(.getProb %) parses))
@ -86,10 +90,32 @@
  (- (Math/log 0.001) (Math/log 0.01))
  (Math/E)
  (tokenize "Eric's testing.")
  (Math/log 0.9999)
  (Math/pow Math/E -0.5)
  (let [results (StringBuffer.)
        parses (ParserTool/parseLine "The dog ran fast ." custom-parser 1)]
    ((juxt parse-probs parse-strs) parses))
  (let [results (StringBuffer.)
        parses (ParserTool/parseLine "Eric is testing." custom-parser 1)]
    [((juxt parse-probs parse-strs) parses)
     (count parses)])
  (let [results (StringBuffer.)
        parses (ParserTool/parseLine "Eric is testing." custom-parser 2)]
    [((juxt parse-probs parse-strs) parses)
     (count parses)])
  (let [results (StringBuffer.)
        parses (ParserTool/parseLine "This is a good day." custom-parser 1)]
    [((juxt parse-probs parse-strs) parses)
     (count parses)])
  (let [results (StringBuffer.)
        parses (ParserTool/parseLine "The do dog run drive a." custom-parser 1)]
    ((juxt parse-probs parse-strs) parses))
  (let [results (StringBuffer.)
        parses (ParserTool/parseLine "Eric 's testing ." custom-parser 1)]
    (meta parses))
@ -146,6 +172,12 @@
       tb2/clauses
       boolean))
 (comment
  (->> "the lazy fox"
       vector
       parse)
  )
 (defn unmake-tree
  "Tokenizing and then parsing a sentence returns a string
  representation of the parse tree. This is a helper function
--- a/src/com/owoga/prhyme/util.clj
+++ b/src/com/owoga/prhyme/util.clj
@ -27,7 +27,10 @@
  (map str (.getPhones cmu-lexicon "two" nil)))
 (defn get-phones
-  "String must be lowercase."
+  "Gets phones for known or unknonwn words from the CMULexicon,
  removes the stress, and converts them to a format that matches the CMU Sphinx
  dictionary (capitalizes and replaces 'ax' with 'ah') String must be
  lowercase."
  [word]
  (->> (map str (.getPhones cmu-lexicon word nil))
       (map remove-stress)
@ -35,9 +38,9 @@
       (map string/upper-case)))
 (defn get-phones-with-stress
-  "String must be lowercase.
+  "Same as `get-phones` but leaves stress.
-  .getPhones only.
+  Note that this might not be the same stress that you'd see
-  Might be different from stress in cmu-dict"
+  in the CMU pronouncing dictionary."
  [word]
  (->> (map str (.getPhones cmu-lexicon word nil))
       (map convert-to-sphinx)