Add example code, more nlp options

5 years ago · 86546c7d01
parent f0ea2bc513
commit 86546c7d01
22 changed files with 287550 additions and 189 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -7,3 +7,4 @@ resources/dark-corpus-2.bin filter=lfs diff=lfs merge=lfs -text
 resources/models/en-parser-chunking.bin filter=lfs diff=lfs merge=lfs -text
 resources/dark-corpus-2.edn filter=lfs diff=lfs merge=lfs -text
 resources/dark-corpus-1.edn filter=lfs diff=lfs merge=lfs -text
+resources/models filter=lfs diff=lfs merge=lfs -text
--- a/deps.edn
+++ b/deps.edn
@ -13,5 +13,5 @@
        org.clojure/data.fressian {:mvn/version "1.0.0"}
        com.taoensso/nippy {:mvn/version "3.0.0"}
        com.taoensso/timbre {:mvn/version "4.10.0"}}
- :aliases {:dev {:extra-paths ["test"]
+ :aliases {:dev {:extra-paths ["test" "examples"]
                 :extra-deps {}}}}
--- a/dev/examples/core.clj
+++ b/dev/examples/core.clj
@ -0,0 +1,192 @@
+(ns examples.core
+  (:require [clojure.string :as string]
+            [clojure.set]
+            [com.owoga.prhyme.frp :as frp]
+            [com.owoga.prhyme.core :as prhyme]
+            [com.owoga.prhyme.data.bigrams :as bigrams]
+            [com.owoga.prhyme.gen :as gen]
+            [com.owoga.prhyme.data.dictionary :as dict]
+            [com.owoga.prhyme.data.thesaurus :as thesaurus]
+            [com.owoga.prhyme.data.darklyrics :as darklyrics]
+            [com.owoga.prhyme.generation.weighted-selection :as weighted]
+            [clojure.set :as set]
+            [clojure.zip :as zip]))
+
+(defn weight-fn [word target result]
+  (let [rimes (frp/consecutive-matching word target :rimes)
+        nuclei (frp/consecutive-matching word target :nuclei)
+        onsets (frp/consecutive-matching word target :onsets)
+        total (apply + (map count [rimes nuclei onsets]))]
+    total))
+
+(defn pred-fn [word target result]
+  (< 0 (weight-fn word target result)))
+
+(defn weight-popular [word target result]
+  (if (dict/popular (:normalized-word word))
+    10
+    1))
+
+(defn pred-popular [word target result]
+  (< 1 (weight-popular word target result)))
+
+(def words-by-rime (prhyme/words-by-rime*
+                      (filter
+                       (fn [[word & _]]
+                         (get
+                          dict/popular
+                          (string/lower-case word)))
+                       dict/cmu-dict)))
+
+(defn rime-1 [target]
+  (let [rime (last (:rimes target))]
+    (fn [x]
+      (= rime (last (:rimes x))))))
+
+(defn rime-2 [target]
+  (let [rime (last (butlast (:rimes target)))]
+    (fn [x]
+      (= rime (last (butlast (:rimes x)))))))
+
+(defn rime-member? [coll]
+  (let [coll (into #{} coll)]
+    (fn [x]
+      (coll (:normalized-word x)))))
+
+(defn rime-compare [& comparators]
+  (let [juxtcomp (apply juxt comparators)]
+    (fn [a b]
+      (let [a (juxtcomp a)
+            b (juxtcomp b)]
+        (compare a b)))))
+(def c
+  (fn [a b]
+    ((rime-compare
+      (rime-1 {:rimes '(1 2)})
+      (rime-2 {:rimes '(1 2)})
+      (rime-member? ["foo" "bar"]))
+     b a)))
+
+(comment
+  (let [coll [{:rimes '(3 2) :normalized-word "foo"}
+              {:rimes '(1 2) :normalized-word "foo"}
+              {:rimes '(4 5) :normalized-word "foo"}
+              {:rimes '(1 2) :normalized-word "buzz"}]]
+    (sort c coll))
+
+  (let [coll '("woman"
+               "union"
+               "passion"
+               "infatuation"
+               "emotion"
+               "disposition"
+               "communion"
+               "attraction"
+               "affection"
+               "adoration"
+               "admiration")
+        coll (map #(prhyme/phrase->word dict/popular %) coll)
+        target (prhyme/phrase->word dict/popular "devotion")
+        synonyms (thesaurus/synonyms "love" "heart")
+        comparisons (fn [target]
+                      (fn [a b]
+                        ((rime-compare
+                          (rime-1 target)
+                          (rime-2 target)
+                          (rime-member? synonyms))
+                         b a)))]
+    (sort (comparisons target) coll))
+
+  )
+
+(defn rhymestorm [& words]
+  (let [synonyms (->> (apply thesaurus/synonyms words)
+                      (filter #(get dict/popular %))
+                      (into #{}))
+        comparisons (fn [target]
+                      (fn [a b]
+                        ((rime-compare
+                          (rime-1 target)
+                          (rime-2 target)
+                          (rime-member? synonyms))
+                         b a)))]
+    (->> synonyms
+         (map
+          (fn [synonym]
+            (let [word (prhyme/phrase->word dict/prhyme-dict synonym)
+                  rhymes (get words-by-rime (last (:rimes word)))]
+              (when rhymes
+                (let [rhyming-words (map string/lower-case (prhyme/flatten-node rhymes))
+                      rhyming-synonyms (remove #{(:normalized-word word)} (filter synonyms rhyming-words))]
+                  [(:normalized-word word) rhyming-synonyms])))))
+         (remove (fn [[_ rhymes]]
+                   (empty? rhymes)))
+         (map (fn [[target rhymes]]
+                [target (->> rhymes
+                             (map prhyme/phrase->word dict/popular)
+                             (sort (comparisons (prhyme/phrase->word dict/popular target)))
+                             (map :normalized-word))]))
+         (into {}))))
+
+(comment
+  (rhymestorm "love")
+  (take 3 (drop 500 dict/prhyme-dict))
+  (take 3 dict/cmu-dict)
+  (take 3 dict/popular)
+
+  (let [node (get-in words-by-rime ['("AH" "V")])]
+    (->> (prhyme/flatten-node node)))
+
+  (let  [love-synonyms (thesaurus/thesaurus "love")
+         heart-synonyms (thesaurus/thesaurus "heart")]
+    (->> (clojure.set/intersection
+          (into #{} love-synonyms)
+          (into #{} heart-synonyms))
+         (map string/lower-case)
+         (filter #(dict/popular %))))
+
+  (let [synonyms (thesaurus/synonyms "love" "heart")]
+    synonyms)
+
+  (def love-rhymes
+    (let [synonyms (->> (thesaurus/synonyms "love" "heart")
+                        (filter #(get dict/popular %))
+                        (into #{}))]
+      (->>
+       (map
+        (fn [synonym]
+          (let [word (prhyme/phrase->word dict/prhyme-dict synonym)
+                rhymes (get words-by-rime (last (:rimes word)))]
+            (when rhymes
+              (let [rhyming-words (map string/lower-case (prhyme/flatten-node rhymes))
+                    rhyming-synonyms (filter synonyms rhyming-words)]
+                [(:normalized-word word) rhyming-synonyms]))))
+        synonyms)
+       (into {}))))
+
+  (count love-rhymes)
+  (get-in words-by-rime ['("AH" "V")])
+
+  (weight-fn
+   (first (filter #(= (:normalized-word %) "gotshal's") dict/prhyme-dict))
+   (prhyme/phrase->word dict/prhyme-dict "bye bye")
+   nil)
+
+  (take 10 darklyrics/darklyrics-markov-2)
+  (get darklyrics/darklyrics-markov-2 '("memory" "my"))
+  (repeatedly
+   5
+   (fn []
+     (let [rhymes (gen/selection-seq
+                   dict/prhyme-dict
+                   (comp (weighted/adjust-for-tail-rhyme 0.90)
+                         #_(weighted/adjust-for-rhymes 0.50)
+                         #_(weighted/adjust-for-fn :adj-rimes 0.80 pred-fn weight-fn)
+                         (weighted/adjust-for-fn :adj-popular 0.95 pred-popular weight-popular)
+                         (weighted/adjust-for-markov darklyrics/darklyrics-markov-2 0.99))
+                   (prhyme/phrase->word dict/prhyme-dict "happy birthday taylor my love"))]
+       (->> rhymes
+            (take 5)
+            (map :normalized-word)))))
+
+  )
--- a/resources/bigrams-with-counts.txt
+++ b/resources/bigrams-with-counts.txt
--- a/resources/models/en-pos-maxent.bin
+++ b/resources/models/en-pos-maxent.bin
--- a/resources/models/en-pos-perceptron.bin
+++ b/resources/models/en-pos-perceptron.bin
--- a/src/com/owoga/corpus/lovecraft.clj
+++ b/src/com/owoga/corpus/lovecraft.clj
@ -0,0 +1,339 @@
+(ns com.owoga.corpus.lovecraft
+  (:require [net.cgrand.enlive-html :as html]
+            [clojure.string :as string]
+            [com.owoga.prhyme.util.weighted-rand :as wr]
+            [com.owoga.prhyme.data.dictionary :as dict]
+            [com.owoga.prhyme.util :as util]
+            [com.owoga.prhyme.core :as prhyme]
+            [com.owoga.prhyme.util.nlp :as nlp]
+            [com.owoga.prhyme.gen :as gen]
+            [com.owoga.prhyme.generation.weighted-selection :as weighted-selection]
+            [taoensso.tufte :as tufte :refer [defnp p profiled profile]]
+            [com.owoga.prhyme.frp :as frp]
+            [clojure.java.io :as io]))
+
+(tufte/add-basic-println-handler! {})
+
+(def ^:dynamic *base-url* "https://www.hplovecraft.com/writings/texts/")
+
+(def words-map
+  (into {} (map #(vector (string/lower-case (:word %)) %) frp/words)))
+
+(defn fetch-url [url]
+  (html/html-resource (java.net.URL. url)))
+
+(defn links []
+  (map
+   #(str *base-url* (first (html/attr-values % :href)))
+   (html/select
+    (fetch-url *base-url*)
+    [:li :> [:a (html/attr? :href)]])))
+
+(defn contentful-sections [nodes]
+  (->> nodes
+       (map html/text)
+       (filter #(> (count %) 100))))
+
+(defn text-from-link [link]
+  (->> (html/select
+        (fetch-url link)
+        [:body])
+       (first)
+       (html/text)
+       ((fn [s] (string/replace s #"[\s\u00A0]+" " ")))))
+
+(defn cleanup [content]
+  (-> content
+      (string/replace #"Return to.*$" "")
+      (string/replace #"Home.*?This Site" "")
+      (string/replace #"[^a-zA-Z -]+" "")))
+
+(defn tokens [content]
+  (string/split content #"\s+"))
+
+(defn append-to-file [filepath text]
+  (with-open [w (io/writer filepath :append true)]
+    (.write w text)))
+
+(defn scrape []
+  (run!
+   (fn [link]
+     (->> (text-from-link link)
+          (cleanup)
+          (#(str % "\n"))
+          (append-to-file "lovecraft.txt")))
+   (take 10 (links))))
+
+(defn tokens-from-file [file]
+  (with-open [r (io/reader file)]
+    (tokens (slurp r))))
+
+(defn window [n]
+  (fn [coll]
+    (cond
+      (empty? coll) []
+      (< (count coll) n) []
+      :else (cons (take n coll)
+                  (lazy-seq ((window n) (drop n coll)))))))
+
+(defn markov [tokens]
+  (->> tokens
+       (map
+        (fn [token]
+          (let [k (butlast token)
+                v (last token)]
+            [k v])))
+       (reduce
+        (fn [a [k v]]
+          (update-in a [k v] (fnil inc 0)))
+        {})))
+
+(defn running-total
+  ([coll]
+   (running-total coll 0))
+  ([coll last-val]
+   (cond
+     (empty? coll) nil
+     :else (cons (+ last-val (first coll))
+                 (lazy-seq
+                  (running-total
+                   (rest coll)
+                   (+ last-val (first coll))))))))
+
+(defn weighted-rand [weights]
+  (let [running-weights (running-total weights)
+        rand-val (rand (last running-weights))]
+    (loop [i 0]
+      (if (> (nth running-weights i) rand-val)
+        i
+        (recur (inc i))))))
+
+(def word-set (into #{} (->> prhyme/words
+                             (map first)
+                             (map string/lower-case)
+                             (map #(string/replace % #"\(\d+\)" "")))))
+
+(defn normalize-tokens [tokens]
+  (->> tokens
+       (map string/lower-case)
+       (filter word-set)))
+
+(defn main []
+  (->> (tokens-from-file "lovecraft.txt")
+       (reverse)
+       (normalize-tokens)
+       ((window 2))
+       (markov)
+       (into {})))
+
+(defn synonym?
+  "Given a possibility, like [\"foo\" 3]
+  which says that foo follows a particular key with
+  a weight of 3, a word is a synonym of that possibility
+  if the word is a synonym ."
+  [p synonyms]
+  (synonyms p))
+
+(defn adjust-for-synonyms
+  "If a word is in a set of synonyms, adjust its weight upwards."
+  [synonyms]
+  (fn [possibilities]
+    (reduce
+     (fn [p s]
+       (if (s p)
+         (update p s #(* 5 %))
+         p))
+     possibilities
+     synonyms)))
+
+(defn adjust-for-rimes
+  [target-rime dictionary]
+  (fn [possibilities]
+    (into
+     {}
+     (map
+      (fn [[p v]]
+        (let [possibility (get dictionary p)
+              factor (count
+                      (frp/consecutive-matching
+                       target-rime
+                       possibility
+                       :rimes))]
+          [p (* v (max 1 (* factor 4)))]))
+      possibilities))))
+
+
+(defonce lovecraft-markov (read-string (slurp "lovecraft.edn")))
+
+(defn markov-key [key-fn]
+  (fn [text]
+    (key-fn text)))
+
+(defn gen-from [m p initial]
+  (loop [r (list initial)]
+    (cond
+      (p r) (recur (cons (m (list (first r))) r))
+      :else r)))
+
+
+(defn rhyming-words
+  "List of rhyming words sorted by quality of rhyme."
+  [target]
+  (let [target-phrase (->> target
+                           (prhyme/phrase->word frp/words)
+                           (#(assoc % :rimes? true)))]
+    (->> target-phrase
+         (#(assoc % :rimes? true))
+         (frp/prhyme frp/words)
+         (sort-by
+          #(- (count
+               (frp/consecutive-matching
+                %
+                target-phrase
+                :rimes)))))))
+
+(defn markov-rhymes [markov-data rhyming-words]
+  (->> (map
+        (fn [word]
+          (->> word
+               :word
+               string/lower-case
+               (#(string/replace % #"\(\d+\)" ""))
+               (#(vector % (get markov-data (list %))))))
+        rhyming-words)
+       (into #{})
+       (remove
+        (fn [[w p]]
+          (nil? p)))))
+
+(defn adjust-for-over-syllables
+  "Adjust weights to prefer not going over the number
+  of syllables of the target word."
+  [target]
+  (fn [words]
+    (p :adjust-for-syllables
+       (map
+        (fn [word]
+          (if (or (nil? (:syllable-count word))
+                  (nil? (:syllables target)))
+            (println word target))
+          (cond
+            (= (:syllable-count word) (count (:syllables target)))
+            (as-> word word
+              (assoc word :weight (* 3 (:weight word)))
+              (assoc word :adjusted-for-syllables-factor 3))
+
+            (< (:syllable-count word) (count (:syllables target)))
+            (as-> word word
+              (assoc word :weight (* 2 (:weight word)))
+              (assoc word :adjusted-for-syllables-factor 2))
+
+            :else
+            (as-> word word
+              (assoc word :weight (* 1 (:weight word)))
+              (assoc word :adjusted-for-syllables-factor 1))))
+        words))))
+
+(defn adjust-for-membership-1
+  [set_ percent]
+  (let [ratio (- 1 percent)]
+    (fn [words]
+      (let [[members non-members]
+            ((juxt filter remove)
+             #(set_ (:normalized-word %))
+             words)
+            weight-non-members (apply + (map :weight non-members))
+            target-weight-members (* ratio weight-non-members)
+            count-members (count members)
+            adjustment-members (/ target-weight-members count-members)]
+        (concat
+         (map
+          (fn [member]
+            (as-> member member
+              (assoc member :weight (* adjustment-members (:weight member)))
+              (assoc member :adjustment-for-membership adjustment-members)))
+          members)
+         non-members)))))
+
+(defn adjust-for-membership [set_]
+  (fn [words]
+    (map
+     (fn [word]
+       (if (set_ (:normalized-word word))
+         (as-> word word
+           (assoc word :weight (* 2 (:weight word)))
+           (assoc word :adjust-for-membership-factor 2))
+         (assoc word :adjust-for-membership-factor 1)))
+     words)))
+
+(defn filter-for-membership [set_]
+  (fn [words]
+    (map
+     (fn [word]
+       (if-not (set_ (:normalized-word word))
+         (as-> word word
+           (assoc word :weight (* 0.01 (:weight word)))
+           (assoc word :filter-for-membership-factor 0.01))
+         word))
+     words)))
+
+(defn adjust-for-markov [markov-options]
+  (let [markov-set (into #{} (map first (keys markov-options)))]
+    (fn [words]
+      (let [result (map
+                    (fn [word]
+                      (if (markov-set (:normalized-word word))
+                        (as-> word word
+                          (assoc word :weight (* 100 (:weight word)))
+                          (assoc word :adjust-for-markov-factor 100))
+                        (assoc word :adjust-for-markov-factor 1)))
+                    words)]
+        result))))
+
+(comment
+  (let [markov-adjuster (adjust-for-markov (lovecraft-markov '("help")))]
+    (take 5 (markov-adjuster frp/words))))
+
+(defn adjust-for-membership-1
+  [set_ percent]
+  (let [ratio (- 1 percent)]
+    (fn [words]
+      (let [[members non-members]
+            ((juxt filter remove)
+             #(set_ (:normalized-word %))
+             words)
+            weight-non-members (apply + (map :weight non-members))
+            target-weight-members (* ratio weight-non-members)
+            count-members (count members)
+            adjustment-members (/ target-weight-members count-members)]
+        (concat
+         (map
+          (fn [member]
+            (as-> member member
+              (assoc member :weight (* adjustment-members (:weight member)))
+              (assoc member :adjustment-for-membership adjustment-members)))
+          members)
+         non-members)))))
+
+(defn adjust-for-markov-1
+  [markov-options percent]
+  (let [ratio (- 1 percent)]
+    (fn [words]
+      (if (nil? markov-options)
+        words
+        (let [[markovs non-markovs]
+              ((juxt filter remove)
+               #(markov-options (:normalized-word %))
+               words)
+              weight-non-markovs (apply + (map :weight non-markovs))
+              target-weight-markovs (* ratio weight-non-markovs)
+              count-markovs (count markovs)
+              adjustment-markovs (if (= 0 count-markovs) 1 (/ target-weight-markovs count-markovs))]
+          (concat
+           (map
+            (fn [markov]
+              (as-> markov markov
+                (assoc markov :weight (* adjustment-markovs (:weight markov)))
+                (assoc markov :adjustment-for-markov adjustment-markovs)))
+            markovs)
+           non-markovs))))))
--- a/src/com/owoga/corpus/markov.clj
+++ b/src/com/owoga/corpus/markov.clj
@ -1,5 +1,7 @@
 (ns com.owoga.corpus.markov
  (:require [com.owoga.prhyme.util :as util]
+            [com.owoga.prhyme.data.dictionary :as dict]
+            [com.owoga.prhyme.util.nlp :as nlp]
            [clojure.string :as string]
            [clojure.java.io :as io]))

@ -42,7 +44,7 @@
       (remove #(.isDirectory %))
       (map #(slurp %))
       (map clean-text)
-       (filter util/english?)
+       (filter dict/english?)
       (map #(string/split % #"\n+"))
       (flatten)
       (map #(string/split % #"\s+"))
@ -50,4 +52,23 @@
       (map #(util/extend-coll % nil 2))
       (map #(make-markov % 2))
       (apply merge-markov)
-       (util/write-markov "resources/dark-corpus-2.edn")))
+       #_(util/write-markov "resources/dark-corpus-2.edn")))
+
+(defn gen-pos-markov [directory]
+  (->> (file-seq (io/file directory))
+       (remove #(.isDirectory %))
+       (map #(slurp %))
+       (map clean-text)
+       (filter dict/english?)
+       (map #(string/split % #"\n+"))
+       (map (fn [lyrics] (filter #(nlp/valid-sentence? %) lyrics)))
+       (map #(remove nil? %))
+       (take 5)
+       (map (fn [lyrics]
+              (map #(nlp/tags nlp/prhyme-pos-tagger (nlp/tokenize %)) lyrics)))))
+
+(comment
+  (let [directory "dark-corpus/zero-hour/"]
+    (gen-pos-markov directory))
+
+  )
--- a/src/com/owoga/prhyme/core.clj
+++ b/src/com/owoga/prhyme/core.clj
@ -1,9 +1,8 @@
 (ns com.owoga.prhyme.core
-  (:require [clojure.string :as string]
-            [clojure.set :as set]
+  (:require [clojure.zip :as zip]
+            [clojure.string :as string]
            [com.owoga.prhyme.util :as u]
            [com.owoga.prhyme.syllabify :as s]
-            [com.owoga.prhyme.data.dictionary :as dict]
            [com.owoga.prhyme.data.phonetics :as phonetics]))

 ;;; Typical rhyme model (explanation of following 3 functions)
@ -60,9 +59,32 @@
                       (assoc :onsets (concat (:onsets merged)
                                              (:onsets (first phrase-words))))
                       (assoc :nuclei (concat (:nuclei merged)
-                                              (:nuclei (first phrase-words)))))
+                                              (:nuclei (first phrase-words))))
+                       (assoc :normalized-word (string/join
+                                                " "
+                                                [(:normalized-word merged)
+                                                 (:normalized-word (first phrase-words))])))
                   (rest phrase-words)))))

+(defrecord Word [word syllables syllable-count rimes onsets nuclei weight normalized-word])
+
+(defn cmu->prhyme-word [word phonemes]
+  (let [syllables (s/syllabify phonemes)
+        rimes (rimes syllables)
+        onsets (onset+nucleus syllables)
+        nuclei (nucleus syllables)]
+    (->Word
+     word
+     syllables
+     (count syllables)
+     rimes
+     onsets
+     nuclei
+     1
+     (-> word
+         string/lower-case
+         (string/replace #"\(\d+\)" "")))))
+
 (defn cmu->prhyme [[word & phonemes]]
  (let [syllables (s/syllabify phonemes)
        rimes (rimes syllables)
@ -79,6 +101,17 @@
                          string/lower-case
                          (string/replace #"\(\d+\)" ""))}))

+(defn make-phrase->Word
+  [phonemes-lookup]
+  (fn [phrase]
+    (->> (string/split phrase #"[ -]")
+         (map
+          (fn [phrase-word]
+            (let [phonemes (or (phonemes-lookup phrase-word)
+                               (u/get-phones phrase-word))]
+              (cmu->prhyme-word phrase-word phonemes))))
+         (merge-phrase-words phrase))))
+
 (defn phrase->word
  "Given a word like 'well-off' or a phrase like 'war on poverty', return a Word
  that has the correct syllables, rimes, onsets, and nucleus. This way we can
@ -86,15 +119,14 @@
  make up the phrase are in the dictionary. Returns nil if the word is not in
  the dictionary."
  [words phrase]
+  (let [word-set (into #{} (map :normalized-word words))]
    (->> (string/split phrase #"[ -]")
         (map (fn [phrase-word]
-              (let [word (first (filter (fn [word]
-                                          (= phrase-word (:norm-word word)))
-                                        words))]
+                (let [word (first (filter word-set phrase-word))]
                  (if (nil? word)
                    (cmu->prhyme (cons phrase-word (u/get-phones phrase-word)))
                    word))))
-       (merge-phrase-words phrase)))
+         (merge-phrase-words phrase))))

 (defn words-by-rime* [words]
  (let [words-with-rime (->> words
@ -120,7 +152,18 @@
                                 (cons val (:words existing)))
                       (rest words)))))))

-(def words-by-rime (words-by-rime* dict/cmu-dict))
+(defn prhyme-words-by-rime* [words]
+  (loop [by-rime {}
+         words words]
+    (let [key (reverse (:rimes (first words)))
+          val (first words)
+          existing (get-in by-rime key {:words '()})]
+      (cond
+        (empty? words) by-rime
+        :else (recur (assoc-in by-rime
+                               (concat key [:words])
+                               (cons val (:words existing)))
+                     (rest words))))))

 (defn words-by-onset-nucleus* [words]
  (let [words-with-onset-nucleus (->> words
@ -144,8 +187,6 @@
                                 (cons val (:words existing)))
                       (rest words)))))))

-(def words-by-onset-nucleus (words-by-onset-nucleus* words))
-
 (defn words-by-nucleus* [words]
  (let [words-with-nucleus (->> words
                                (map rest)
@ -170,8 +211,6 @@
                                 (cons val (:words existing)))
                       (rest words)))))))

-(def words-by-nucleus (words-by-nucleus* words))
-
 (defn words-by-syllables* [words]
  (loop [by-syllables {}
         words words]
@ -190,8 +229,6 @@
 (defn build-tree [words]
  (reduce add-word-to-tree {} words))

-(def phone-tree (build-tree words))
-
 (defn rhyme-node [rhyme-tree phonemes]
  (let [phonemes (reverse phonemes)
        node (get-in rhyme-tree phonemes)]
@ -249,63 +286,53 @@
  [data rime]
  (map (partial rhyming-word data) rime))

-(defn all-rhymes [syllables]
-  )
-(defn prhyme [phones]
-  (let [syllables (s/syllabify phones)
-        rhymes (remove #(some nil? %)
-                       (map (partial rhyming-words words-by-rime)
-                            (u/partitions (rimes syllables))))
-        onsets (remove #(some nil? %)
-                       (map (partial rhyming-words words-by-onset-nucleus)
-                            (u/partitions (onset+nucleus syllables))))
-        nuclei (remove #(some nil? %)
-                       (map (partial rhyming-words words-by-nucleus)
-                            (u/partitions (nucleus (reverse syllables)))))
-        popular-rhymes
-        (let [popular (into #{} (map string/upper-case popular))]
-          (remove #(some empty? %)
-                  (map (fn [rhyme]
-                         (map (fn [words-list]
-                                (set/intersection popular (into #{} words-list)))
-                              rhyme))
-                       rhymes)))]
-    {:rhymes popular-rhymes
-     :onsets onsets
-     :nuclei nuclei}))
+(defn deep-merge-with [& maps]
+  ((apply merge-with merge maps)))
+
+(defn flatten-node [node]
+  (let [zipper (zip/zipper
+                (fn branch? [node]
+                  (or (map? node) (map? (nth node 1))))
+                (fn children [node]
+                  (seq (if (map? node) node (nth node 1))))
+                (fn make-node [node children]
+                  (if (map? node)
+                    (into {} children)
+                    (assoc node 1 (into {} children))))
+                node)]
+    (->> zipper
+         (iterate zip/next)
+         (take-while #(not (zip/end? %)))
+         (drop 1)
+         (map zip/node)
+         (map #(apply hash-map %))
+         (map :words)
+         (remove nil?)
+         flatten)))
+
+(defn node-merge [result-value latter-value]
+  (cond
+    (map? result-value)
+    (merge-with node-merge result-value latter-value )
+
+    :else (concat result-value latter-value)))

 (comment
-  (take 10 popular)
-  (prhyme ["R" "OY" "AH" "L"])
-  (let [phones ["D" "R" "IY" "M" "S" "AE" "N" "D" "HH" "OW" "P" "S"]]
-    (prhyme phones))
+  (node-merge {:b 2 :c [1]} {:c [2]})
+
+  (let [m1 {:a {:a1 {:a2 2 :a3 4 :a4 {:a6 7 :a5 5}}}}
+        m2 {:a {:a1 {:a3 3 :a2 99 :a4 {:a5 195}}} :b 4}]
+    #_(merge-with
+     (fn [& maps]
+       (apply merge-with merge maps))
+     m1 m2)
+    (deep-merge m1 m2))
+
  (let [phones ["D" "R" "IY" "M" "S" "AE" "N" "D" "HH" "OW" "P" "S"]]
    (s/syllabify phones))
-  (let [phones ["AE" "N" "D" "HH" "OW" "P" "S"]]
-    (prhyme phones)
-    (get-in words-by-nucleus (nucleus (s/syllabify phones)))
-    (prhyme phones)
-    (u/partitions (nucleus (s/syllabify phones)))
-    (prhyme phones))
-  (let [phones ["T" "AY" "M" "T" "UW" "TH" "IH" "NG" "K"]]
-    (rimes (s/syllabify phones))
-    (prhyme phones))
-  (let [phones ["R" "UH" "N" "AW" "T" "AH" "F" "S" "L" "IY" "P"]]
-    (prhyme phones)
-    (s/syllabify phones))
-  (let [phones ["S" "L" "IY" "P"]]
-    (prhyme phones))
-  (let [phones ["AH" "F"]]
-    (prhyme phones))
-  (let [phones ["D" "OW" "N" "T" "F" "UH" "K" "W" "IH" "TH" "M" "IY"]]
-    (prhyme phones))
-  (prhyme ["B" "Y" "UW" "T" "IH" "F" "AH" "L" "G" "ER" "L"])
  (let [r (rimes (s/syllabify ["R" "OY" "AH" "L" "W" "IH" "TH" "CH" "IY" "Z"]))]
    (remove #(some nil? %) (map rhyming-words (u/partitions r))))

-  (let [r (rimes (s/syllabify ["B" "Y" "UW" "T" "IH" "F" "AH" "L" "G" "ER" "L"]))]
-    (remove #(some nil? %) (map (partial rhyming-words words-by-rime) (u/partitions r))))
-
  (get
   (->> words
        (filter-to-syllable-count 1)
--- a/src/com/owoga/prhyme/data/bigrams.clj
+++ b/src/com/owoga/prhyme/data/bigrams.clj
@ -0,0 +1,63 @@
+(ns com.owoga.prhyme.data.bigrams
+  (:require [clojure.java.io :as io]
+            [clojure.string :as string]
+            [com.owoga.prhyme.syllabify :refer [syllabify]]
+            [com.owoga.prhyme.data.dictionary :as dict]
+            [com.owoga.prhyme.util :as util]
+            [com.owoga.prhyme.core :as prhyme]))
+
+(def popular-bigrams
+  (->> (io/resource "bigrams-with-counts.txt")
+       io/reader
+       line-seq
+       (map #(string/split % #"\s+"))
+       (map #(map string/lower-case %))
+       (filter (fn [[w1 w2 num]]
+                 (and (dict/popular w1)
+                      (dict/popular w2))))
+       (map (partial take 2))))
+
+(def phrase->Word
+  (prhyme/make-phrase->Word
+   (into
+    {}
+    (map
+     (fn [[word & phonemes]]
+       [(string/lower-case word)
+        phonemes])
+     dict/cmu-dict))))
+
+(def popular-bigrams-with-syllables
+  (->> popular-bigrams
+       (map (partial string/join " "))
+       (map phrase->Word)))
+
+(def popular-bigram-words
+  (->> popular-bigrams
+       (map (partial string/join " "))
+       (map phrase->Word)))
+
+(def popular-mono-and-bi-grams
+  (concat
+   dict/prhyme-dict
+   popular-bigram-words))
+
+(def popular-mono-and-bi-grams-by-rime
+  (prhyme/prhyme-words-by-rime*
+   popular-mono-and-bi-grams))
+
+(comment
+  (->> (reverse (:rimes (phrase->Word "beautiful")))
+       (get-in popular-mono-and-bi-grams-by-rime)
+       (prhyme/flatten-node)
+       (remove #(re-matches #".*beautiful" (:word %)))
+       (map :word)
+       (map string/lower-case))
+  (get-in
+   popular-mono-and-bi-grams-by-rime
+   ['("AA") '("ER") '("IY")])
+  (take 5 popular-bigrams)
+  (take 5 popular-bigram-words)
+  (take-last 5 popular-bigram-words)
+  (prhyme/phrase->word dict/prhyme-dict "hi there"))
+(def words-and-bigrams())
--- a/src/com/owoga/prhyme/data/darklyrics.clj
+++ b/src/com/owoga/prhyme/data/darklyrics.clj
@ -0,0 +1,39 @@
+(ns com.owoga.prhyme.data.darklyrics
+  (:require [clojure.java.io :as io]
+            [taoensso.nippy :as nippy])
+  (:import [java.io DataInputStream ByteArrayOutputStream]))
+
+(defn thaw-from-file
+  "Convenience util: like `thaw`, but reads from `(clojure.java.io/file <file>)`.
+
+  To thaw from a resource on classpath (e.g in Leiningen `resources` dir):
+    (thaw-from-file (clojure.java.io/resource \"my-resource-name.npy\"))
+
+  See also `freeze-to-file`."
+  ([file          ] (thaw-from-file file nil))
+  ([file thaw-opts]
+   (let [xin (io/input-stream file)
+         xout (ByteArrayOutputStream.)]
+     (io/copy xin xout)
+     (nippy/thaw (.toByteArray xout) thaw-opts))))
+
+;; (thaw-from-file (io/resource "test.bin"))
+;; (bytes (byte-array (take 20 (range))))
+
+;; (byte-array (map (comp byte int) "ascii"))
+;; ;; => [97, 115, 99, 105, 105]
+;; (bytes (byte-array (map (comp byte int) "ascii")))
+;; ;; => [97, 115, 99, 105, 105]
+
+;; (let [xin (io/input-stream (io/resource "test.bin"))
+;;       xout (ByteArrayOutputStream.)]
+;;   (io/copy xin xout)
+;;   (nippy/thaw (.toByteArray xout)))
+
+;; (.fullyRead (io/input-stream (io/resource "test.bin")))
+
+;; (.getSize (io/input-stream (io/resource "dark-corpus-2.bin")))
+;; (def data (into {} (map vec (partition 2 (range 20)))))
+;; (nippy/freeze-to-file "resources/test.bin" data)
+(def darklyrics-markov-2
+  (thaw-from-file (io/resource "dark-corpus-2.bin")))
--- a/src/com/owoga/prhyme/data/dictionary.clj
+++ b/src/com/owoga/prhyme/data/dictionary.clj
@ -2,6 +2,7 @@
  (:require [clojure.string :as string]
            [clojure.java.io :as io]
            [clojure.set]
+            [com.owoga.prhyme.util :as util]
            [com.owoga.prhyme.core :as prhyme]))

 (def cmu-dict
@ -9,6 +10,25 @@
       (line-seq)
       (map #(string/split % #"[\t ]"))))

+(def spelling->phonemes
+  (loop [words cmu-dict
+         accum {}]
+    (let [word (first words)
+          key ((fnil util/clean-text "") (first word))]
+      (cond
+        (nil? word) accum
+        :else (recur (rest words)
+                     (update accum key (fnil conj []) (rest word)))))))
+
+(def phrase->Word
+  (into
+   {}
+   (map
+    (fn [[word & phonemes]]
+      [(string/lower-case word)
+       phonemes])
+    cmu-dict)))
+
 (def prhyme-dict
  (into [] (map prhyme/cmu->prhyme cmu-dict)))

--- a/src/com/owoga/prhyme/data/thesaurus.clj
+++ b/src/com/owoga/prhyme/data/thesaurus.clj
@ -1,9 +1,18 @@
 (ns com.owoga.prhyme.data.thesaurus
  (:require [clojure.string :as string]
-            [clojure.java.io :as io]))
+            [clojure.java.io :as io]
+            [clojure.set]))

 (def thesaurus
  (->> (line-seq (io/reader (io/resource "mthesaur.txt")))
       (map #(string/split % #","))
       (map #(vector (first %) (rest %)))
       (into {})))
+
+(defn synonyms
+  ([& words]
+   (->> words
+        (map string/lower-case)
+        (map #(get thesaurus %))
+        (map (fn [ws] (into #{} ws)))
+        (apply clojure.set/intersection))))
--- a/src/com/owoga/prhyme/frp.clj
+++ b/src/com/owoga/prhyme/frp.clj
@ -244,11 +244,11 @@
 (defn prhyme-many [dict phrase]
  (let [syllable-partitions
        (->> phrase
-             (:syllables (phrase->word dict phrase))
+             (:syllables (p/phrase->word dict phrase))
             (u/partitions)
             (map (fn [part]
                    (map (fn [syllables]
-                           (make-word
+                           (p/cmu->prhyme
                            (into
                             [(string/join " " (flatten (apply concat syllables)))]
                             (flatten syllables))))
--- a/src/com/owoga/prhyme/gen.clj
+++ b/src/com/owoga/prhyme/gen.clj
@ -43,14 +43,14 @@
      (if (>= (count result) target-markov-n)
        (let [markov-options (markov (->> result
                                          (take target-markov-n)
-                                          (map :norm-word)))
+                                          (map :normalized-word)))
              markov-option-avg (/ (apply + (vals markov-options))
                                   (max 1 (count markov-options)))]
          (if (nil? markov-options)
            [words target result]
            (let [[markovs non-markovs]
                  ((juxt filter remove)
-                   #(markov-options (:norm-word %))
+                   #(markov-options (:normalized-word %))
                   words)
                  weight-non-markovs (apply + (map :weight non-markovs))
                  target-weight-markovs (* 100 percent weight-non-markovs)
@ -59,7 +59,7 @@
              [(concat
                (map
                 (fn [m]
-                   (let [option (markov-options (:norm-word m))]
+                   (let [option (markov-options (:normalized-word m))]
                     (as-> m m
                       (assoc m :weight (* (/ option markov-option-avg) adjustment-markovs (:weight m)))
                       (assoc m :adjustment-for-markov (* (/ option markov-option-avg) adjustment-markovs)))))
@ -73,7 +73,7 @@
  [markov percent]
  (let [markov-n (count (first (first markov)))]
    (fn [[words target result]]
-      (let [key (let [k (map :norm-word (take markov-n result))]
+      (let [key (let [k (map :normalized-word (take markov-n result))]
                  (reverse
                   (if (> markov-n (count k))
                     (concat k (repeat (- markov-n (count k)) nil))
@ -85,7 +85,7 @@
          [words target result]
          (let [[markovs non-markovs]
                ((juxt filter remove)
-                 #(markov-options (:norm-word %))
+                 #(markov-options (:normalized-word %))
                 words)
                weight-non-markovs (apply + (map :weight non-markovs))
                target-weight-markovs (- (/ weight-non-markovs (- 1 percent))
@ -95,7 +95,7 @@
            [(concat
              (map
               (fn [m]
-                 (let [option (markov-options (:norm-word m))]
+                 (let [option (markov-options (:normalized-word m))]
                   (as-> m m
                     (assoc m :weight (* (/ option markov-option-avg) adjustment-markovs (:weight m)))
                     (assoc m :adjustment-for-markov (* (/ option markov-option-avg) adjustment-markovs)))))
@ -147,7 +147,7 @@
        (fn []
          (attempt-gen-target-by-syllable-count adj syllable-count words)))
       (filter #(= syllable-count (apply + (map :syllable-count %))))
-       (map #(map :norm-word %))
+       (map #(map :normalized-word %))
       (map #(string/join " " %))
       (filter nlp/valid-sentence?)
       first))
@ -174,6 +174,14 @@
     (cons selection
           (lazy-seq (selection-seq words adjust new-target new-result))))))

+(defn selection-stream
+  "Continuously make the first selection."
+  ([words adjust target]
+   (selection-stream words adjust target '()))
+  ([words adjust target result]
+   (let [[weighted-words _ _] (adjust [words target result])]
+     (repeatedly #(math/weighted-selection :weight weighted-words)))))
+
 (defn generate-prhyme [words adjust target stop?]
  (loop [result '()]))

@ -217,7 +225,7 @@
        (fn []
          (attempt-gen-rhyme-with-syllable-count adj syllable-count words target)))
       (filter #(= syllable-count (apply + (map :syllable-count %))))
-       (map #(map :norm-word %))
+       (map #(map :normalized-word %))
       (map #(string/join " " %))
       (filter nlp/valid-sentence?)
       first))
@ -232,7 +240,7 @@

 (defn sentence-stop [target]
  (fn [inner-target result]
-    (let [result-sentence (string/join " " (map :norm-word result))]
+    (let [result-sentence (string/join " " (map :normalized-word result))]
      (when-not (empty? result)
        (or (nlp/valid-sentence? result-sentence)
            (< (:syllable-count target)
@ -241,18 +249,18 @@

 (defn gen-prhymes [words adjust poem-lines]
  (let [words (map #(assoc % :weight 1) words)
-        words-map (into {} (map #(vector (:norm-word %) %) words))]
+        words-map (into {} (map #(vector (:normalized-word %) %) words))]
    (map (fn [line]
-           (let [target (phrase->word words line)
+           (let [target (prhyme/phrase->word words line)
                 stop (sentence-stop target)
                 r (prhymer words adjust target stop)]
-             (string/join " " (map #(:norm-word %) (first r)))))
+             (string/join " " (map #(:normalized-word %) (first r)))))
         poem-lines)))

 (defn phrase-syllable-count [phrase]
  (->> phrase
       (#(string/split % #" "))
-       (map (partial phrase->word frp/words))
+       (map (partial prhyme/phrase->word frp/words))
       (map :syllable-count)
       (apply +)))

@ -268,8 +276,8 @@
 (defn generate-rhyme-for-phrase
  [words adjust phrase]
  (let [words (map #(assoc % :weight 1) words)
-        words-map (into {} (map #(vector (:norm-word %) %) words))
-        target (phrase->word words phrase)]
+        words-map (into {} (map #(vector (:normalized-word %) %) words))
+        target (prhyme/phrase->word words phrase)]
    (prhymer words adjust target (syllable-stop target))))

 #_(defn generate-prhymes [poem]
@ -277,28 +285,28 @@
    (fn []
      (->> poem
           (map (fn [phrase]
-                  (let [target (phrase->word frp/popular phrase)]
+                  (let [target (prhyme/phrase->word frp/popular phrase)]
                    (first
                     (filter
                      #(and
                        (or (< 0.9 (rand))
-                            (nlp/valid-sentence? (string/join " " (map :norm-word %))))
+                            (nlp/valid-sentence? (string/join " " (map :normalized-word %))))
                        (= (:syllable-count target)
                           (apply + (map :syllable-count %))))
                      (r phrase))))))
-           (map (fn [line] (map #(:norm-word %) line)))
+           (map (fn [line] (map #(:normalized-word %) line)))
           (map #(string/join " " %))))))

 (defn generate-prhymes-darkov [words adj phrase]
-  (let [target (phrase->word words phrase)
+  (let [target (prhyme/phrase->word words phrase)
        r (generate-rhyme-for-phrase words adj target)]
    (first
     (filter
      #(and
        (or (< 0.9 (rand))
-            (nlp/valid-sentence? (string/join " " (map :norm-word %))))
+            (nlp/valid-sentence? (string/join " " (map :normalized-word %))))
        (= (:syllable-count target)
           (apply + (map :syllable-count %))))
      r))
-    (map (fn [line] (map #(:norm-word %) line)))
+    (map (fn [line] (map #(:normalized-word %) line)))
    (map #(string/join " " %))))
--- a/src/com/owoga/prhyme/generation/weighted_selection.clj
+++ b/src/com/owoga/prhyme/generation/weighted_selection.clj
@ -43,7 +43,7 @@
  [markov percent]
  (let [markov-n (count (first (first markov)))]
    (fn [[words target result]]
-      (let [key (let [k (map :norm-word (take markov-n result))]
+      (let [key (let [k (map :normalized-word (take markov-n result))]
                  (reverse
                   (if (> markov-n (count k))
                     (concat k (repeat (- markov-n (count k)) nil))
@ -55,7 +55,7 @@
          [words target result]
          (let [[markovs non-markovs]
                ((juxt filter remove)
-                 #(markov-options (:norm-word %))
+                 #(markov-options (:normalized-word %))
                 words)
                weight-non-markovs (apply + (map :weight non-markovs))
                target-weight-markovs (- (/ weight-non-markovs (- 1 percent))
@ -65,7 +65,7 @@
            [(concat
              (map
               (fn [m]
-                 (let [option (markov-options (:norm-word m))]
+                 (let [option (markov-options (:normalized-word m))]
                   (as-> m m
                     (assoc m :weight (* (/ option markov-option-avg) adjustment-markovs (:weight m)))
                     (assoc m :adjustment-for-markov (* (/ option markov-option-avg) adjustment-markovs)))))
@ -108,8 +108,34 @@
       result])))

 (defn adjust-for-tail-rhyme
+  "Only bump up rhyme probability if result is empty."
  [percent]
  (fn [[words target result]]
    (if (empty? result)
      ((adjust-for-rhymes percent) [words target result])
      [words target result])))
+
+(defn adjust-for-fn
+  "Weights words by whether or not they rhyme.
+  Once result contains something, becomes inactive. If you want to try to rhyme
+  every selection, you'll need a different function. This one will only rhyme
+  the tail of a target."
+  [key percent pred-fn weight-fn]
+  (fn [[words target result]]
+    (let [[matching non-matching] ((juxt filter remove) #(pred-fn % target result) words)
+          weight-non-matching (apply + (map :weight non-matching))
+          target-weight-matching (* 100 percent weight-non-matching)
+          count-matching (count matching)
+          adjustment-matching (if (= 0 count-matching)
+                               1
+                               (/ target-weight-matching count-matching))]
+      [(concat
+        (map
+         (fn [word]
+           (as-> word word
+             (assoc word :weight (* adjustment-matching (weight-fn word target result)))
+             (assoc word key adjustment-matching)))
+         matching)
+        non-matching)
+       target
+       result])))
--- a/src/com/owoga/prhyme/limerick.clj
+++ b/src/com/owoga/prhyme/limerick.clj
@ -0,0 +1,76 @@
+(ns com.owoga.prhyme.limerick
+  (:require [com.owoga.prhyme.gen :as gen]
+            [com.owoga.prhyme.generation.weighted-selection :as weighted-selection]
+            [clojure.string :as string]
+            [com.owoga.prhyme.core :as prhyme]
+            [com.owoga.prhyme.util :as util]))
+
+(defn rhyme-from-scheme
+  "scheme of format [[A 9] [A 9] [B 5] [B 5] [A 9]]"
+  [words markov scheme]
+  (loop [scheme scheme
+         rhymes {}
+         result []]
+    (cond
+      (empty? scheme) result
+      :else
+      (let [[pattern syllable-count] (first scheme)
+            banned-words (into #{} (->> result
+                                        (map #(string/split % #" "))
+                                        (map #(last %))))
+            adj (util/comp-rnil
+                 (weighted-selection/adjust-for-markov
+                  markov
+                  0.99)
+                 (when (rhymes pattern)
+                   (weighted-selection/adjust-for-tail-rhyme 0.99)))
+            rhyme (if (nil? (get rhymes pattern))
+                    (gen/gen-sentence-with-syllable-count
+                     adj
+                     syllable-count
+                     words)
+                    (gen/gen-rhyme-with-syllable-count
+                     adj
+                     syllable-count
+                     (remove #(banned-words (:normalized-word %)) words)
+                     (prhyme/phrase->word words (get rhymes pattern))))]
+        (recur (rest scheme)
+               (assoc rhymes pattern rhyme)
+               (conj result rhyme))))))
+
+(comment
+  (require '[com.owoga.prhyme.data.dictionary :as dict]
+           '[com.owoga.prhyme.data.darklyrics :refer [darklyrics-markov-2]]
+           '[clojure.java.io :as io])
+  (rhyme-from-scheme dict/prhyme-dict darklyrics-markov-2 '((A 8) (A 8) (B 5) (B 5) (A 8)))
+
+  )
+
+(comment
+  ["bishop larch smitten us dwell"
+   "solely first week in hell"
+   "and take that for three"
+   "come wrapped in glory"
+   "you ever leave it so well"]
+  ["romancing realized too late"
+   "my crown revive my withered state"
+   "reign is obsolete"
+   "i sit in the street"
+   "but nobody cares of my fate"]
+  ["flesh is hacked to get me sedate"
+   "demonstration obsessed with hate"
+   "justice will be written in stone"
+   "and you will be shown"
+   "bedrooms of icons suffocate"]
+  ["you will bow to their hungry gods"
+   "come will come whatever the odds"
+   "now we see the light"
+   "you can't put it right"
+   "recklessly chopping firing squads"]
+  ["untimely they fool their poor life"
+   "it wither away with this knife"
+   "hate is my virtue"
+   "my feelings are well overdue"
+   "war we await the afterlife"])
+
+
--- a/src/com/owoga/prhyme/lymeric.clj
+++ b/src/com/owoga/prhyme/lymeric.clj
@ -1,86 +0,0 @@
-(ns com.owoga.prhyme.lymeric
-  (:require [com.owoga.prhyme.gen :as gen]
-            [com.owoga.prhyme.generation.weighted-selection :as weighted-selection]
-            [clojure.string :as string]
-            [com.owoga.prhyme.core :as prhyme]
-            [com.owoga.prhyme.frp :as frp]
-            [com.owoga.prhyme.util :as util]
-            [taoensso.nippy :as nippy]
-            [clojure.java.io :as io]))
-
-
-(defn rhyme-from-scheme
-  "scheme of format [[A 9] [A 9] [B 5] [B 5] [A 9]]"
-  [rhymer markov scheme]
-  (let [base-words (map #(assoc % :weight 1) frp/popular)]
-    (loop [scheme scheme
-           rhymes {}
-           result []]
-      (cond
-        (empty? scheme) result
-        :else
-        (let [[pattern syllable-count] (first scheme)
-              banned-words (into #{} (->> result
-                                          (map #(string/split % #" "))
-                                          (map #(last %))))
-              adj (util/comp-rnil
-                   (weighted-selection/adjust-for-markov
-                    markov
-                    0.99)
-                   (when (rhymes pattern)
-                     (weighted-selection/adjust-for-tail-rhyme 0.99)))
-              rhyme (if (nil? (get rhymes pattern))
-                      (gen/gen-sentence-with-syllable-count
-                       adj
-                       syllable-count
-                       base-words)
-                      (gen/gen-rhyme-with-syllable-count
-                       adj
-                       syllable-count
-                       (remove #(banned-words (:norm-word %))
-                               base-words)
-                       (prhyme/phrase->word frp/words (get rhymes pattern))))]
-          (recur (rest scheme)
-                 (assoc rhymes pattern rhyme)
-                 (conj result rhyme)))))))
-
-(comment
-  (time (def darkov-2 (nippy/thaw-from-file (io/resource "darkov-2.bin"))))
-  (rhyme-from-scheme nil darkov-2 '((A 8) (A 8) (B 5) (B 5) (A 8)))
-
-  )
-
-(comment
-  (->> (gen/selection-seq
-        (map #(assoc % :weight 1) frp/words)
-        (weighted-selection/adjust-for-rhymes 0.99)
-        (prhyme/phrase->word frp/words "hi there my boy"))
-       (take 3))
-
-  ["bishop larch smitten us dwell"
-   "solely first week in hell"
-   "and take that for three"
-   "come wrapped in glory"
-   "you ever leave it so well"]
-  ["romancing realized too late"
-   "my crown revive my withered state"
-   "reign is obsolete"
-   "i sit in the street"
-   "but nobody cares of my fate"]
-  ["flesh is hacked to get me sedate"
-   "demonstration obsessed with hate"
-   "justice will be written in stone"
-   "and you will be shown"
-   "bedrooms of icons suffocate"]
-  ["you will bow to their hungry gods"
-   "come will come whatever the odds"
-   "now we see the light"
-   "you can't put it right"
-   "recklessly chopping firing squads"]
-  ["untimely they fool their poor life"
-   "it wither away with this knife"
-   "hate is my virtue"
-   "my feelings are well overdue"
-   "war we await the afterlife"])
-
-
--- a/src/com/owoga/prhyme/nlp/tag_sets/treebank.clj
+++ b/src/com/owoga/prhyme/nlp/tag_sets/treebank.clj
--- a/src/com/owoga/prhyme/nlp/tag_sets/treebank_ii.clj
+++ b/src/com/owoga/prhyme/nlp/tag_sets/treebank_ii.clj
@ -0,0 +1,81 @@
+(ns com.owoga.prhyme.nlp.tag-sets.treebank-ii)
+
+;;;; Penn Treebank II Constituent Tags
+;;;; http://www.surdeanu.info/mihai/teaching/ista555-fall13/readings/PennTreebankConstituents.html#MD
+(def clauses
+  {'S "simple declarative clause, i.e. one that is not introduced by a (possible empty) subordinating conjunction or a wh-word and that does not exhibit subject-verb inversion."
+   'SBAR "Clause introduced by a (possibly empty) subordinating conjunction."
+   'SBARQ "Direct question introduced by a wh-word or a wh-phrase. Indirect questions and relative clauses should be bracketed as SBAR, not SBARQ."
+   'SINV "Inverted declarative sentence, i.e. one in which the subject follows the tensed verb or modal."
+   'SQ "Inverted yes/no question, or main clause of a wh-question, following the wh-phrase in SBARQ."})
+
+(def phrases
+  {'ADJP "Adjective Phrase."
+   'ADVP "Adverb Phrase."
+   'CONJP "Conjunction Phrase."
+   'FRAG "Fragment."
+   'INTJ "Interjection. Corresponds approximately to the part-of-speech tag UH."
+   'LST "List marker. Includes surrounding punctuation."
+   'NAC "Not a Constituent; used to show the scope of certain prenominal modifiers within an NP."
+   'NP "Noun Phrase."
+   'NX "Used within certain complex NPs to mark the head of the NP. Corresponds very roughly to N-bar level but used quite differently."
+   'PP "Prepositional Phrase."
+   'PRN "Parenthetical."
+   'PRT "Particle. Category for words that should be tagged RP."
+   'QP "Quantifier Phrase (i.e. complex measure/amount phrase); used within NP."
+   'RRC "Reduced Relative Clause."
+   'UCP "Unlike Coordinated Phrase."
+   'VP "Vereb Phrase."
+   'WHADJP "Wh-adjective Phrase. Adjectival phrase containing a wh-adverb, as in how hot."
+   'WHAVP "Wh-adverb Phrase. Introduces a clause with an NP gap. May be null (containing the 0 complementizer) or lexical, containing a wh-adverb such as how or why."
+   'WHNP "Wh-noun Phrase. Introduces a clause with an NP gap. May be null (containing the 0 complementizer) or lexical, containing some wh-word, e.g. who, which book, whose daughter, none of which, or how many leopards."
+   'WHPP "Wh-prepositional Phrase. Prepositional phrase containing a wh-noun phrase (such as of which or by whose authority) that either introduces a PP gap or is contained by a WHNP."
+   'X "Unknown, uncertain, or unbracketable. X is often used for bracketing typos and in bracketing the...the-constructions."})
+
+(def words
+  {'CC "Coordinating conjunction"
+   'CD "Cardinal number"
+   'DT "Determiner"
+   'EX "Existential there"
+   'FW "Foreign word"
+   'IN "Preposition or subordinating conjunction"
+   'JJ "Adjective"
+   'JJR "Adjective, comparative"
+   'JJS "Adjective, superlative"
+   'LS "List item marker"
+   'MD "Modal"
+   'NN "Noun, singular or mass"
+   'NNS "Noun, plural"
+   'NNP "Proper noun, singular"
+   'NNPS "Proper noun, plural"
+   'PDT "Predeterminer"
+   'POS "Possessive ending"
+   'PRP "Personal pronoun"
+   'PRP$ "Possessive pronoun (prolog version PRP-S)"
+   'RB "Adverb"
+   'RBR "Adverb, comparative"
+   'RBS "Adverb, superlative"
+   'RP "Particle"
+   'SYM "Symbol"
+   'TO "to"
+   'UH "Interjection"
+   'VB "Verb, base form"
+   'VBD "Verb, past tense"
+   'VBG "Verb, gerund or present participle"
+   'VBN "Verb, past participle"
+   'VBP "Verb, non-3rd person singular present"
+   'VBZ "Verb, 3rd person singular present"
+   'WDT "Wh-determiner"
+   'WP "Wh-pronoun"
+   'WP$ "Possessive wh-pronoun (prolog version WP-S)"
+   'WRB "Wh-adverb"})
+
+(def form-function-discrepencies
+  {'-ADV "(adverbial) - marks a constituent
+  other than ADVP or PP when it is used adverbially (e.g. NPs or
+  free (\"headless\" relatives). However, constituents that themselves are
+  modifying an ADVP generally do not get -ADV. If a more specific tag is
+  available (for example, -TMP) then it is used alone and -ADV is implied. See
+  the Adverbials section."
+   '-NOM "(nominal) - marks free (\"headless\") relatives
+  and gerunds when they act nominally."})
--- a/src/com/owoga/prhyme/util.clj
+++ b/src/com/owoga/prhyme/util.clj
@ -21,6 +21,9 @@
    "ah"
    phoneme))

+(comment
+  (map str (.getPhones cmu-lexicon "two" nil)))
+
 (defn get-phones [word]
  (->> (map str (.getPhones cmu-lexicon word nil))
       (map remove-stress)
--- a/src/com/owoga/prhyme/util/nlp.clj
+++ b/src/com/owoga/prhyme/util/nlp.clj
@ -2,17 +2,55 @@
  (:require [opennlp.nlp :as nlp]
            [opennlp.treebank :as tb]
            [clojure.string :as string]
-            [clojure.java.io :as io]))
+            [clojure.java.io :as io]
+            [clojure.zip :as zip]
+            [com.owoga.prhyme.nlp.tag-sets.treebank-ii :as tb2])
+  (:import (opennlp.tools.postag POSModel POSTaggerME)))

 (def tokenize (nlp/make-tokenizer (io/resource "models/en-token.bin")))
 (def get-sentences (nlp/make-sentence-detector (io/resource "models/en-sent.bin")))
 (def parse (tb/make-treebank-parser (io/resource "models/en-parser-chunking.bin")))
+(def pos-tagger (nlp/make-pos-tagger (io/resource "models/en-pos-maxent.bin")))
+
+;;;; The tagger that onennlp.nlp gives us doesn't provide access
+;;;; to the probabilities of all tags. It gives us the probability of the
+;;;; top tag through some metadata. But to get probs for all tags, we
+;;;; need to implement our own tagger.
+(defprotocol Tagger
+  (tags [this sent])
+  (probs [this])
+  (top-k-sequences [this sent]))
+
+(defn make-pos-tagger
+  [modelfile]
+  (let [model (with-open [model-stream (io/input-stream modelfile)]
+                (POSModel. model-stream))
+        tagger (POSTaggerME. model)]
+    (reify Tagger
+      (tags [_ tokens]
+        (let [token-array (into-array String tokens)]
+          (map vector tokens (.tag tagger #^"[Ljava.lang.String;" token-array))))
+      (probs [_] (seq (.probs tagger)))
+      (top-k-sequences [_ tokens]
+        (let [token-array (into-array String tokens)]
+          (.topKSequences tagger #^"[Ljava.lang.String;" token-array))))))
+
+(def prhyme-pos-tagger (make-pos-tagger (io/resource "models/en-pos-maxent.bin")))
+
+(comment
+  (let [phrase "The feeling hurts."]
+    (map (juxt #(.getOutcomes %)
+               #(map float (.getProbs %)))
+         (top-k-sequences prhyme-pos-tagger (tokenize phrase))))
+  ;; => ([["DT" "NN" "VBZ" "."] (0.9758878 0.93964833 0.7375927 0.95285994)]
+  ;;     [["DT" "VBG" "VBZ" "."] (0.9758878 0.03690145 0.27251 0.9286113)])
+  )

 (defn valid-sentence?
  "Tokenizes and parses the phrase using OpenNLP models from
  http://opennlp.sourceforge.net/models-1.5/

-  If the parse tree has an 'S as the top-level tag, then
+  If the parse tree has an clause as the top-level tag, then
  we consider it a valid English sentence."
  [phrase]
  (->> phrase
@ -22,5 +60,151 @@
       parse
       first
       tb/make-tree
-       (#(= 'S (:tag (first (:chunk %)))))))
+       :chunk
+       first
+       :tag
+       tb2/clauses
+       boolean))
+
+(defn unmake-tree
+  "Tokenizing and then parsing a sentence returns a string
+  representation of the parse tree. This is a helper function
+  to make working with the parse tree more convenient. We
+  can use `opennlp.treebank/make-tree` to make a clojure map
+  representation of the tree, then we can `unmake` the tree
+  to turn it into a list representation of the tree that
+  we can easily use in a clojure zipper. (read-string almost works,
+  but falls apart when reading things like commas)."
+  [node]
+  (cond
+    (string? node) node
+    (map? node) (list (:tag node) (unmake-tree (:chunk node)))
+    :else (map unmake-tree node)))
+
+(comment
+  (let [phrase "Hello, Eric"]
+    (->> phrase
+         tokenize
+         (string/join " ")
+         vector
+         parse
+         (map tb/make-tree)
+         unmake-tree))
+  ;; => ((TOP ((S ((INTJ ((UH ("Hello")))) (, (",")) (. ("Eric")))))))
+  )
+
+(defn treebank-zipper
+  "Turns a bit of text into a parse tree into a zipper."
+  [text]
+  (let [tree (->> text
+                  tokenize
+                  (string/join " ")
+                  vector
+                  parse
+                  (map tb/make-tree)
+                  unmake-tree)]
+    (zip/zipper seq? seq (fn [_ c] c) tree)))
+
+(defn node-constituents
+  "Given a node of a parse tree, like ('NP (('PRP$ (\"my\" 'NN (\"name\"))))),
+  returns a list of the top-level node tag and its first-level child tags.
+  "
+  [node]
+  (list
+   (first node)
+   (if (every? string? (map first (rest node)))
+     nil
+     (map first (first (rest node))))))
+
+(defn phrase-constituents
+  "Given a bit of text that can be parsed into a treebank tree,
+  Get a sequence of the tags and their chunks.
+  For example:
+    My name is Eric.
+  Returns the sequence:
+    At the TOP tag, we have a 'S part-of-speech (a clause).
+    At the 'S tag, we have a 'NP, 'VP, '. (noun-phrase + verb-phrase + period)
+    At the 'NP tag, we have a 'PRP$, 'NN (personal-pronoun + singular-noun)
+    ...
+  "
+  [text]
+  (->> (treebank-zipper text)
+       (iterate zip/next)
+       (take-while (complement zip/end?))
+       (filter (complement zip/branch?))
+       (map zip/path)
+       (map last)
+       (map node-constituents)
+       (remove #(string? (first %)))))
+
+(comment
+  (phrase-constituents "My name is Eric.")
+  ;; => ((TOP (S)) (S (NP VP .)) (NP (PRP$ NN)) (VP (VBZ NP)) (NP (NNP)))
+  (phrase-constituents "How are you?")
+  ;; => ((TOP (SBARQ)) (SBARQ (WHADVP SQ .)) (WHADVP (WRB)) (SQ (VBP NP)) (NP (PRP))) 
+  )
+
+
+(defn pos-constituent-frequencies
+  "Frequencies of the parts of speech that make up phrases.
+  Example:
+    Clauses are made up of:
+      NounPhrase + VerbPhrase 2 times
+      Clause + CoordinatingConjuction + Clause 1 times
+    NounPhrases are made up of:
+      ProperNouns 2 times
+      PersonalPronoun + SingularNoun 3 times
+
+  Does not include frequencies for leaf words. By that I mean: A SingularNoun might
+  appear 5 times all together, 3 times as part of a PersonalPronoun + SingularNoun pair
+  and 2 times as part of an Adjective + SingularNoun pair, but the data structure returned
+  by this function won't include that 5 anywhere. This is due to the (remove #(nil? (second %)))
+  line. This data structure is used as a kind of markov selection process and we don't really
+  care how often the leafs are used. We just care about the ratio at which we should pick each
+  leaf from a given parent.
+  "
+  [texts]
+  (reduce
+   (fn [acc text]
+     (let [constituents (->> text
+                             phrase-constituents
+                             (remove #(nil? (second %))))]
+       (reduce
+        (fn [acc constituent]
+          (let [k1 (first constituent)
+                k2 (second constituent)]
+            (update-in acc [k1 k2] (fnil inc 0))))
+        acc
+        constituents)))
+   {}
+   texts))
+
+(comment
+  (pos-constituent-frequencies
+   ["My name is Eric."
+    "My hat is blue and I like cake."
+    "Your name is Taylor."
+    "How are you?"])
+  ;; => {TOP {(S) 3, (SBARQ) 1},
+  ;;     S {(NP VP .) 2, (S CC S .) 1, (NP VP) 2},
+  ;;     NP {(PRP$ NN) 3, (NNP) 2, (PRP) 2, (NN) 1},
+  ;;     VP {(VBZ NP) 2, (VBZ ADJP) 1, (VBP NP) 1},
+  ;;     ADJP {(JJ) 1},
+  ;;     SBARQ {(WHADVP SQ .) 1},
+  ;;     WHADVP {(WRB) 1},
+  ;;     SQ {(VBP NP) 1}}
+
+  (let [phrase "How are you today?"]
+    (->> phrase
+         tokenize
+         (string/join " ")
+         vector
+         parse
+         (map tb/make-tree)))
+
+  (let [phrase "I gave the cake to John at the store."]
+    (parse (tokenize phrase)))

+  (let [phrase "I've got a good feeling"]
+    (pos-tagger (tokenize phrase)))
+  )