Add example code, more nlp options

5 years ago · 86546c7d01
parent f0ea2bc513
commit 86546c7d01
22 changed files with 287550 additions and 189 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -7,3 +7,4 @@ resources/dark-corpus-2.bin filter=lfs diff=lfs merge=lfs -text
 resources/models/en-parser-chunking.bin filter=lfs diff=lfs merge=lfs -text
 resources/dark-corpus-2.edn filter=lfs diff=lfs merge=lfs -text
 resources/dark-corpus-1.edn filter=lfs diff=lfs merge=lfs -text
 resources/models filter=lfs diff=lfs merge=lfs -text
--- a/deps.edn
+++ b/deps.edn
@ -13,5 +13,5 @@
        org.clojure/data.fressian {:mvn/version "1.0.0"}
        com.taoensso/nippy {:mvn/version "3.0.0"}
        com.taoensso/timbre {:mvn/version "4.10.0"}}
- :aliases {:dev {:extra-paths ["test"]
+ :aliases {:dev {:extra-paths ["test" "examples"]
                 :extra-deps {}}}}
--- a/dev/examples/core.clj
+++ b/dev/examples/core.clj
@ -0,0 +1,192 @@
 (ns examples.core
  (:require [clojure.string :as string]
            [clojure.set]
            [com.owoga.prhyme.frp :as frp]
            [com.owoga.prhyme.core :as prhyme]
            [com.owoga.prhyme.data.bigrams :as bigrams]
            [com.owoga.prhyme.gen :as gen]
            [com.owoga.prhyme.data.dictionary :as dict]
            [com.owoga.prhyme.data.thesaurus :as thesaurus]
            [com.owoga.prhyme.data.darklyrics :as darklyrics]
            [com.owoga.prhyme.generation.weighted-selection :as weighted]
            [clojure.set :as set]
            [clojure.zip :as zip]))
 (defn weight-fn [word target result]
  (let [rimes (frp/consecutive-matching word target :rimes)
        nuclei (frp/consecutive-matching word target :nuclei)
        onsets (frp/consecutive-matching word target :onsets)
        total (apply + (map count [rimes nuclei onsets]))]
    total))
 (defn pred-fn [word target result]
  (< 0 (weight-fn word target result)))
 (defn weight-popular [word target result]
  (if (dict/popular (:normalized-word word))
    10
    1))
 (defn pred-popular [word target result]
  (< 1 (weight-popular word target result)))
 (def words-by-rime (prhyme/words-by-rime*
                      (filter
                       (fn [[word & _]]
                         (get
                          dict/popular
                          (string/lower-case word)))
                       dict/cmu-dict)))
 (defn rime-1 [target]
  (let [rime (last (:rimes target))]
    (fn [x]
      (= rime (last (:rimes x))))))
 (defn rime-2 [target]
  (let [rime (last (butlast (:rimes target)))]
    (fn [x]
      (= rime (last (butlast (:rimes x)))))))
 (defn rime-member? [coll]
  (let [coll (into #{} coll)]
    (fn [x]
      (coll (:normalized-word x)))))
 (defn rime-compare [& comparators]
  (let [juxtcomp (apply juxt comparators)]
    (fn [a b]
      (let [a (juxtcomp a)
            b (juxtcomp b)]
        (compare a b)))))
 (def c
  (fn [a b]
    ((rime-compare
      (rime-1 {:rimes '(1 2)})
      (rime-2 {:rimes '(1 2)})
      (rime-member? ["foo" "bar"]))
     b a)))
 (comment
  (let [coll [{:rimes '(3 2) :normalized-word "foo"}
              {:rimes '(1 2) :normalized-word "foo"}
              {:rimes '(4 5) :normalized-word "foo"}
              {:rimes '(1 2) :normalized-word "buzz"}]]
    (sort c coll))
  (let [coll '("woman"
               "union"
               "passion"
               "infatuation"
               "emotion"
               "disposition"
               "communion"
               "attraction"
               "affection"
               "adoration"
               "admiration")
        coll (map #(prhyme/phrase->word dict/popular %) coll)
        target (prhyme/phrase->word dict/popular "devotion")
        synonyms (thesaurus/synonyms "love" "heart")
        comparisons (fn [target]
                      (fn [a b]
                        ((rime-compare
                          (rime-1 target)
                          (rime-2 target)
                          (rime-member? synonyms))
                         b a)))]
    (sort (comparisons target) coll))
  )
 (defn rhymestorm [& words]
  (let [synonyms (->> (apply thesaurus/synonyms words)
                      (filter #(get dict/popular %))
                      (into #{}))
        comparisons (fn [target]
                      (fn [a b]
                        ((rime-compare
                          (rime-1 target)
                          (rime-2 target)
                          (rime-member? synonyms))
                         b a)))]
    (->> synonyms
         (map
          (fn [synonym]
            (let [word (prhyme/phrase->word dict/prhyme-dict synonym)
                  rhymes (get words-by-rime (last (:rimes word)))]
              (when rhymes
                (let [rhyming-words (map string/lower-case (prhyme/flatten-node rhymes))
                      rhyming-synonyms (remove #{(:normalized-word word)} (filter synonyms rhyming-words))]
                  [(:normalized-word word) rhyming-synonyms])))))
         (remove (fn [[_ rhymes]]
                   (empty? rhymes)))
         (map (fn [[target rhymes]]
                [target (->> rhymes
                             (map prhyme/phrase->word dict/popular)
                             (sort (comparisons (prhyme/phrase->word dict/popular target)))
                             (map :normalized-word))]))
         (into {}))))
 (comment
  (rhymestorm "love")
  (take 3 (drop 500 dict/prhyme-dict))
  (take 3 dict/cmu-dict)
  (take 3 dict/popular)
  (let [node (get-in words-by-rime ['("AH" "V")])]
    (->> (prhyme/flatten-node node)))
  (let  [love-synonyms (thesaurus/thesaurus "love")
         heart-synonyms (thesaurus/thesaurus "heart")]
    (->> (clojure.set/intersection
          (into #{} love-synonyms)
          (into #{} heart-synonyms))
         (map string/lower-case)
         (filter #(dict/popular %))))
  (let [synonyms (thesaurus/synonyms "love" "heart")]
    synonyms)
  (def love-rhymes
    (let [synonyms (->> (thesaurus/synonyms "love" "heart")
                        (filter #(get dict/popular %))
                        (into #{}))]
      (->>
       (map
        (fn [synonym]
          (let [word (prhyme/phrase->word dict/prhyme-dict synonym)
                rhymes (get words-by-rime (last (:rimes word)))]
            (when rhymes
              (let [rhyming-words (map string/lower-case (prhyme/flatten-node rhymes))
                    rhyming-synonyms (filter synonyms rhyming-words)]
                [(:normalized-word word) rhyming-synonyms]))))
        synonyms)
       (into {}))))
  (count love-rhymes)
  (get-in words-by-rime ['("AH" "V")])
  (weight-fn
   (first (filter #(= (:normalized-word %) "gotshal's") dict/prhyme-dict))
   (prhyme/phrase->word dict/prhyme-dict "bye bye")
   nil)
  (take 10 darklyrics/darklyrics-markov-2)
  (get darklyrics/darklyrics-markov-2 '("memory" "my"))
  (repeatedly
   5
   (fn []
     (let [rhymes (gen/selection-seq
                   dict/prhyme-dict
                   (comp (weighted/adjust-for-tail-rhyme 0.90)
                         #_(weighted/adjust-for-rhymes 0.50)
                         #_(weighted/adjust-for-fn :adj-rimes 0.80 pred-fn weight-fn)
                         (weighted/adjust-for-fn :adj-popular 0.95 pred-popular weight-popular)
                         (weighted/adjust-for-markov darklyrics/darklyrics-markov-2 0.99))
                   (prhyme/phrase->word dict/prhyme-dict "happy birthday taylor my love"))]
       (->> rhymes
            (take 5)
            (map :normalized-word)))))
  )
--- a/resources/bigrams-with-counts.txt
+++ b/resources/bigrams-with-counts.txt
--- a/resources/models/en-pos-maxent.bin
+++ b/resources/models/en-pos-maxent.bin
--- a/resources/models/en-pos-perceptron.bin
+++ b/resources/models/en-pos-perceptron.bin
--- a/src/com/owoga/corpus/lovecraft.clj
+++ b/src/com/owoga/corpus/lovecraft.clj
@ -0,0 +1,339 @@
 (ns com.owoga.corpus.lovecraft
  (:require [net.cgrand.enlive-html :as html]
            [clojure.string :as string]
            [com.owoga.prhyme.util.weighted-rand :as wr]
            [com.owoga.prhyme.data.dictionary :as dict]
            [com.owoga.prhyme.util :as util]
            [com.owoga.prhyme.core :as prhyme]
            [com.owoga.prhyme.util.nlp :as nlp]
            [com.owoga.prhyme.gen :as gen]
            [com.owoga.prhyme.generation.weighted-selection :as weighted-selection]
            [taoensso.tufte :as tufte :refer [defnp p profiled profile]]
            [com.owoga.prhyme.frp :as frp]
            [clojure.java.io :as io]))
 (tufte/add-basic-println-handler! {})
 (def ^:dynamic *base-url* "https://www.hplovecraft.com/writings/texts/")
 (def words-map
  (into {} (map #(vector (string/lower-case (:word %)) %) frp/words)))
 (defn fetch-url [url]
  (html/html-resource (java.net.URL. url)))
 (defn links []
  (map
   #(str *base-url* (first (html/attr-values % :href)))
   (html/select
    (fetch-url *base-url*)
    [:li :> [:a (html/attr? :href)]])))
 (defn contentful-sections [nodes]
  (->> nodes
       (map html/text)
       (filter #(> (count %) 100))))
 (defn text-from-link [link]
  (->> (html/select
        (fetch-url link)
        [:body])
       (first)
       (html/text)
       ((fn [s] (string/replace s #"[\s\u00A0]+" " ")))))
 (defn cleanup [content]
  (-> content
      (string/replace #"Return to.*$" "")
      (string/replace #"Home.*?This Site" "")
      (string/replace #"[^a-zA-Z -]+" "")))
 (defn tokens [content]
  (string/split content #"\s+"))
 (defn append-to-file [filepath text]
  (with-open [w (io/writer filepath :append true)]
    (.write w text)))
 (defn scrape []
  (run!
   (fn [link]
     (->> (text-from-link link)
          (cleanup)
          (#(str % "\n"))
          (append-to-file "lovecraft.txt")))
   (take 10 (links))))
 (defn tokens-from-file [file]
  (with-open [r (io/reader file)]
    (tokens (slurp r))))
 (defn window [n]
  (fn [coll]
    (cond
      (empty? coll) []
      (< (count coll) n) []
      :else (cons (take n coll)
                  (lazy-seq ((window n) (drop n coll)))))))
 (defn markov [tokens]
  (->> tokens
       (map
        (fn [token]
          (let [k (butlast token)
                v (last token)]
            [k v])))
       (reduce
        (fn [a [k v]]
          (update-in a [k v] (fnil inc 0)))
        {})))
 (defn running-total
  ([coll]
   (running-total coll 0))
  ([coll last-val]
   (cond
     (empty? coll) nil
     :else (cons (+ last-val (first coll))
                 (lazy-seq
                  (running-total
                   (rest coll)
                   (+ last-val (first coll))))))))
 (defn weighted-rand [weights]
  (let [running-weights (running-total weights)
        rand-val (rand (last running-weights))]
    (loop [i 0]
      (if (> (nth running-weights i) rand-val)
        i
        (recur (inc i))))))
 (def word-set (into #{} (->> prhyme/words
                             (map first)
                             (map string/lower-case)
                             (map #(string/replace % #"\(\d+\)" "")))))
 (defn normalize-tokens [tokens]
  (->> tokens
       (map string/lower-case)
       (filter word-set)))
 (defn main []
  (->> (tokens-from-file "lovecraft.txt")
       (reverse)
       (normalize-tokens)
       ((window 2))
       (markov)
       (into {})))
 (defn synonym?
  "Given a possibility, like [\"foo\" 3]
  which says that foo follows a particular key with
  a weight of 3, a word is a synonym of that possibility
  if the word is a synonym ."
  [p synonyms]
  (synonyms p))
 (defn adjust-for-synonyms
  "If a word is in a set of synonyms, adjust its weight upwards."
  [synonyms]
  (fn [possibilities]
    (reduce
     (fn [p s]
       (if (s p)
         (update p s #(* 5 %))
         p))
     possibilities
     synonyms)))
 (defn adjust-for-rimes
  [target-rime dictionary]
  (fn [possibilities]
    (into
     {}
     (map
      (fn [[p v]]
        (let [possibility (get dictionary p)
              factor (count
                      (frp/consecutive-matching
                       target-rime
                       possibility
                       :rimes))]
          [p (* v (max 1 (* factor 4)))]))
      possibilities))))
 (defonce lovecraft-markov (read-string (slurp "lovecraft.edn")))
 (defn markov-key [key-fn]
  (fn [text]
    (key-fn text)))
 (defn gen-from [m p initial]
  (loop [r (list initial)]
    (cond
      (p r) (recur (cons (m (list (first r))) r))
      :else r)))
 (defn rhyming-words
  "List of rhyming words sorted by quality of rhyme."
  [target]
  (let [target-phrase (->> target
                           (prhyme/phrase->word frp/words)
                           (#(assoc % :rimes? true)))]
    (->> target-phrase
         (#(assoc % :rimes? true))
         (frp/prhyme frp/words)
         (sort-by
          #(- (count
               (frp/consecutive-matching
                %
                target-phrase
                :rimes)))))))
 (defn markov-rhymes [markov-data rhyming-words]
  (->> (map
        (fn [word]
          (->> word
               :word
               string/lower-case
               (#(string/replace % #"\(\d+\)" ""))
               (#(vector % (get markov-data (list %))))))
        rhyming-words)
       (into #{})
       (remove
        (fn [[w p]]
          (nil? p)))))
 (defn adjust-for-over-syllables
  "Adjust weights to prefer not going over the number
  of syllables of the target word."
  [target]
  (fn [words]
    (p :adjust-for-syllables
       (map
        (fn [word]
          (if (or (nil? (:syllable-count word))
                  (nil? (:syllables target)))
            (println word target))
          (cond
            (= (:syllable-count word) (count (:syllables target)))
            (as-> word word
              (assoc word :weight (* 3 (:weight word)))
              (assoc word :adjusted-for-syllables-factor 3))
            (< (:syllable-count word) (count (:syllables target)))
            (as-> word word
              (assoc word :weight (* 2 (:weight word)))
              (assoc word :adjusted-for-syllables-factor 2))
            :else
            (as-> word word
              (assoc word :weight (* 1 (:weight word)))
              (assoc word :adjusted-for-syllables-factor 1))))
        words))))
 (defn adjust-for-membership-1
  [set_ percent]
  (let [ratio (- 1 percent)]
    (fn [words]
      (let [[members non-members]
            ((juxt filter remove)
             #(set_ (:normalized-word %))
             words)
            weight-non-members (apply + (map :weight non-members))
            target-weight-members (* ratio weight-non-members)
            count-members (count members)
            adjustment-members (/ target-weight-members count-members)]
        (concat
         (map
          (fn [member]
            (as-> member member
              (assoc member :weight (* adjustment-members (:weight member)))
              (assoc member :adjustment-for-membership adjustment-members)))
          members)
         non-members)))))
 (defn adjust-for-membership [set_]
  (fn [words]
    (map
     (fn [word]
       (if (set_ (:normalized-word word))
         (as-> word word
           (assoc word :weight (* 2 (:weight word)))
           (assoc word :adjust-for-membership-factor 2))
         (assoc word :adjust-for-membership-factor 1)))
     words)))
 (defn filter-for-membership [set_]
  (fn [words]
    (map
     (fn [word]
       (if-not (set_ (:normalized-word word))
         (as-> word word
           (assoc word :weight (* 0.01 (:weight word)))
           (assoc word :filter-for-membership-factor 0.01))
         word))
     words)))
 (defn adjust-for-markov [markov-options]
  (let [markov-set (into #{} (map first (keys markov-options)))]
    (fn [words]
      (let [result (map
                    (fn [word]
                      (if (markov-set (:normalized-word word))
                        (as-> word word
                          (assoc word :weight (* 100 (:weight word)))
                          (assoc word :adjust-for-markov-factor 100))
                        (assoc word :adjust-for-markov-factor 1)))
                    words)]
        result))))
 (comment
  (let [markov-adjuster (adjust-for-markov (lovecraft-markov '("help")))]
    (take 5 (markov-adjuster frp/words))))
 (defn adjust-for-membership-1
  [set_ percent]
  (let [ratio (- 1 percent)]
    (fn [words]
      (let [[members non-members]
            ((juxt filter remove)
             #(set_ (:normalized-word %))
             words)
            weight-non-members (apply + (map :weight non-members))
            target-weight-members (* ratio weight-non-members)
            count-members (count members)
            adjustment-members (/ target-weight-members count-members)]
        (concat
         (map
          (fn [member]
            (as-> member member
              (assoc member :weight (* adjustment-members (:weight member)))
              (assoc member :adjustment-for-membership adjustment-members)))
          members)
         non-members)))))
 (defn adjust-for-markov-1
  [markov-options percent]
  (let [ratio (- 1 percent)]
    (fn [words]
      (if (nil? markov-options)
        words
        (let [[markovs non-markovs]
              ((juxt filter remove)
               #(markov-options (:normalized-word %))
               words)
              weight-non-markovs (apply + (map :weight non-markovs))
              target-weight-markovs (* ratio weight-non-markovs)
              count-markovs (count markovs)
              adjustment-markovs (if (= 0 count-markovs) 1 (/ target-weight-markovs count-markovs))]
          (concat
           (map
            (fn [markov]
              (as-> markov markov
                (assoc markov :weight (* adjustment-markovs (:weight markov)))
                (assoc markov :adjustment-for-markov adjustment-markovs)))
            markovs)
           non-markovs))))))
--- a/src/com/owoga/corpus/markov.clj
+++ b/src/com/owoga/corpus/markov.clj
@ -1,5 +1,7 @@
 (ns com.owoga.corpus.markov
  (:require [com.owoga.prhyme.util :as util]
            [com.owoga.prhyme.data.dictionary :as dict]
            [com.owoga.prhyme.util.nlp :as nlp]
            [clojure.string :as string]
            [clojure.java.io :as io]))
@ -42,7 +44,7 @@
       (remove #(.isDirectory %))
       (map #(slurp %))
       (map clean-text)
-       (filter util/english?)
+       (filter dict/english?)
       (map #(string/split % #"\n+"))
       (flatten)
       (map #(string/split % #"\s+"))
@ -50,4 +52,23 @@
       (map #(util/extend-coll % nil 2))
       (map #(make-markov % 2))
       (apply merge-markov)
-       (util/write-markov "resources/dark-corpus-2.edn")))
+       #_(util/write-markov "resources/dark-corpus-2.edn")))
 (defn gen-pos-markov [directory]
  (->> (file-seq (io/file directory))
       (remove #(.isDirectory %))
       (map #(slurp %))
       (map clean-text)
       (filter dict/english?)
       (map #(string/split % #"\n+"))
       (map (fn [lyrics] (filter #(nlp/valid-sentence? %) lyrics)))
       (map #(remove nil? %))
       (take 5)
       (map (fn [lyrics]
              (map #(nlp/tags nlp/prhyme-pos-tagger (nlp/tokenize %)) lyrics)))))
 (comment
  (let [directory "dark-corpus/zero-hour/"]
    (gen-pos-markov directory))
  )
--- a/src/com/owoga/prhyme/core.clj
+++ b/src/com/owoga/prhyme/core.clj
@ -1,9 +1,8 @@
 (ns com.owoga.prhyme.core
-  (:require [clojure.string :as string]
+  (:require [clojure.zip :as zip]
-            [clojure.set :as set]
+            [clojure.string :as string]
            [com.owoga.prhyme.util :as u]
            [com.owoga.prhyme.syllabify :as s]
            [com.owoga.prhyme.data.dictionary :as dict]
            [com.owoga.prhyme.data.phonetics :as phonetics]))
 ;;; Typical rhyme model (explanation of following 3 functions)
@ -60,9 +59,32 @@
                       (assoc :onsets (concat (:onsets merged)
                                              (:onsets (first phrase-words))))
                       (assoc :nuclei (concat (:nuclei merged)
-                                              (:nuclei (first phrase-words)))))
+                                              (:nuclei (first phrase-words))))
                       (assoc :normalized-word (string/join
                                                " "
                                                [(:normalized-word merged)
                                                 (:normalized-word (first phrase-words))])))
                   (rest phrase-words)))))
 (defrecord Word [word syllables syllable-count rimes onsets nuclei weight normalized-word])
 (defn cmu->prhyme-word [word phonemes]
  (let [syllables (s/syllabify phonemes)
        rimes (rimes syllables)
        onsets (onset+nucleus syllables)
        nuclei (nucleus syllables)]
    (->Word
     word
     syllables
     (count syllables)
     rimes
     onsets
     nuclei
     1
     (-> word
         string/lower-case
         (string/replace #"\(\d+\)" "")))))
 (defn cmu->prhyme [[word & phonemes]]
  (let [syllables (s/syllabify phonemes)
        rimes (rimes syllables)
@ -79,6 +101,17 @@
                          string/lower-case
                          (string/replace #"\(\d+\)" ""))}))
 (defn make-phrase->Word
  [phonemes-lookup]
  (fn [phrase]
    (->> (string/split phrase #"[ -]")
         (map
          (fn [phrase-word]
            (let [phonemes (or (phonemes-lookup phrase-word)
                               (u/get-phones phrase-word))]
              (cmu->prhyme-word phrase-word phonemes))))
         (merge-phrase-words phrase))))
 (defn phrase->word
  "Given a word like 'well-off' or a phrase like 'war on poverty', return a Word
  that has the correct syllables, rimes, onsets, and nucleus. This way we can
@ -86,15 +119,14 @@
  make up the phrase are in the dictionary. Returns nil if the word is not in
  the dictionary."
  [words phrase]
  (let [word-set (into #{} (map :normalized-word words))]
    (->> (string/split phrase #"[ -]")
         (map (fn [phrase-word]
-              (let [word (first (filter (fn [word]
+                (let [word (first (filter word-set phrase-word))]
                                          (= phrase-word (:norm-word word)))
                                        words))]
                  (if (nil? word)
                    (cmu->prhyme (cons phrase-word (u/get-phones phrase-word)))
                    word))))
-       (merge-phrase-words phrase)))
+         (merge-phrase-words phrase))))
 (defn words-by-rime* [words]
  (let [words-with-rime (->> words
@ -120,7 +152,18 @@
                                 (cons val (:words existing)))
                       (rest words)))))))
-(def words-by-rime (words-by-rime* dict/cmu-dict))
+(defn prhyme-words-by-rime* [words]
  (loop [by-rime {}
         words words]
    (let [key (reverse (:rimes (first words)))
          val (first words)
          existing (get-in by-rime key {:words '()})]
      (cond
        (empty? words) by-rime
        :else (recur (assoc-in by-rime
                               (concat key [:words])
                               (cons val (:words existing)))
                     (rest words))))))
 (defn words-by-onset-nucleus* [words]
  (let [words-with-onset-nucleus (->> words
@ -144,8 +187,6 @@
                                 (cons val (:words existing)))
                       (rest words)))))))
 (def words-by-onset-nucleus (words-by-onset-nucleus* words))
 (defn words-by-nucleus* [words]
  (let [words-with-nucleus (->> words
                                (map rest)
@ -170,8 +211,6 @@
                                 (cons val (:words existing)))
                       (rest words)))))))
 (def words-by-nucleus (words-by-nucleus* words))
 (defn words-by-syllables* [words]
  (loop [by-syllables {}
         words words]
@ -190,8 +229,6 @@
 (defn build-tree [words]
  (reduce add-word-to-tree {} words))
 (def phone-tree (build-tree words))
 (defn rhyme-node [rhyme-tree phonemes]
  (let [phonemes (reverse phonemes)
        node (get-in rhyme-tree phonemes)]
@ -249,63 +286,53 @@
  [data rime]
  (map (partial rhyming-word data) rime))
-(defn all-rhymes [syllables]
+(defn deep-merge-with [& maps]
-  )
+  ((apply merge-with merge maps)))
-(defn prhyme [phones]
+
-  (let [syllables (s/syllabify phones)
+(defn flatten-node [node]
-        rhymes (remove #(some nil? %)
+  (let [zipper (zip/zipper
-                       (map (partial rhyming-words words-by-rime)
+                (fn branch? [node]
-                            (u/partitions (rimes syllables))))
+                  (or (map? node) (map? (nth node 1))))
-        onsets (remove #(some nil? %)
+                (fn children [node]
-                       (map (partial rhyming-words words-by-onset-nucleus)
+                  (seq (if (map? node) node (nth node 1))))
-                            (u/partitions (onset+nucleus syllables))))
+                (fn make-node [node children]
-        nuclei (remove #(some nil? %)
+                  (if (map? node)
-                       (map (partial rhyming-words words-by-nucleus)
+                    (into {} children)
-                            (u/partitions (nucleus (reverse syllables)))))
+                    (assoc node 1 (into {} children))))
-        popular-rhymes
+                node)]
-        (let [popular (into #{} (map string/upper-case popular))]
+    (->> zipper
-          (remove #(some empty? %)
+         (iterate zip/next)
-                  (map (fn [rhyme]
+         (take-while #(not (zip/end? %)))
-                         (map (fn [words-list]
+         (drop 1)
-                                (set/intersection popular (into #{} words-list)))
+         (map zip/node)
-                              rhyme))
+         (map #(apply hash-map %))
-                       rhymes)))]
+         (map :words)
-    {:rhymes popular-rhymes
+         (remove nil?)
-     :onsets onsets
+         flatten)))
-     :nuclei nuclei}))
+
 (defn node-merge [result-value latter-value]
  (cond
    (map? result-value)
    (merge-with node-merge result-value latter-value )
    :else (concat result-value latter-value)))
 (comment
-  (take 10 popular)
+  (node-merge {:b 2 :c [1]} {:c [2]})
-  (prhyme ["R" "OY" "AH" "L"])
+
-  (let [phones ["D" "R" "IY" "M" "S" "AE" "N" "D" "HH" "OW" "P" "S"]]
+  (let [m1 {:a {:a1 {:a2 2 :a3 4 :a4 {:a6 7 :a5 5}}}}
-    (prhyme phones))
+        m2 {:a {:a1 {:a3 3 :a2 99 :a4 {:a5 195}}} :b 4}]
    #_(merge-with
     (fn [& maps]
       (apply merge-with merge maps))
     m1 m2)
    (deep-merge m1 m2))
  (let [phones ["D" "R" "IY" "M" "S" "AE" "N" "D" "HH" "OW" "P" "S"]]
    (s/syllabify phones))
  (let [phones ["AE" "N" "D" "HH" "OW" "P" "S"]]
    (prhyme phones)
    (get-in words-by-nucleus (nucleus (s/syllabify phones)))
    (prhyme phones)
    (u/partitions (nucleus (s/syllabify phones)))
    (prhyme phones))
  (let [phones ["T" "AY" "M" "T" "UW" "TH" "IH" "NG" "K"]]
    (rimes (s/syllabify phones))
    (prhyme phones))
  (let [phones ["R" "UH" "N" "AW" "T" "AH" "F" "S" "L" "IY" "P"]]
    (prhyme phones)
    (s/syllabify phones))
  (let [phones ["S" "L" "IY" "P"]]
    (prhyme phones))
  (let [phones ["AH" "F"]]
    (prhyme phones))
  (let [phones ["D" "OW" "N" "T" "F" "UH" "K" "W" "IH" "TH" "M" "IY"]]
    (prhyme phones))
  (prhyme ["B" "Y" "UW" "T" "IH" "F" "AH" "L" "G" "ER" "L"])
  (let [r (rimes (s/syllabify ["R" "OY" "AH" "L" "W" "IH" "TH" "CH" "IY" "Z"]))]
    (remove #(some nil? %) (map rhyming-words (u/partitions r))))
  (let [r (rimes (s/syllabify ["B" "Y" "UW" "T" "IH" "F" "AH" "L" "G" "ER" "L"]))]
    (remove #(some nil? %) (map (partial rhyming-words words-by-rime) (u/partitions r))))
  (get
   (->> words
        (filter-to-syllable-count 1)
--- a/src/com/owoga/prhyme/data/bigrams.clj
+++ b/src/com/owoga/prhyme/data/bigrams.clj
@ -0,0 +1,63 @@
 (ns com.owoga.prhyme.data.bigrams
  (:require [clojure.java.io :as io]
            [clojure.string :as string]
            [com.owoga.prhyme.syllabify :refer [syllabify]]
            [com.owoga.prhyme.data.dictionary :as dict]
            [com.owoga.prhyme.util :as util]
            [com.owoga.prhyme.core :as prhyme]))
 (def popular-bigrams
  (->> (io/resource "bigrams-with-counts.txt")
       io/reader
       line-seq
       (map #(string/split % #"\s+"))
       (map #(map string/lower-case %))
       (filter (fn [[w1 w2 num]]
                 (and (dict/popular w1)
                      (dict/popular w2))))
       (map (partial take 2))))
 (def phrase->Word
  (prhyme/make-phrase->Word
   (into
    {}
    (map
     (fn [[word & phonemes]]
       [(string/lower-case word)
        phonemes])
     dict/cmu-dict))))
 (def popular-bigrams-with-syllables
  (->> popular-bigrams
       (map (partial string/join " "))
       (map phrase->Word)))
 (def popular-bigram-words
  (->> popular-bigrams
       (map (partial string/join " "))
       (map phrase->Word)))
 (def popular-mono-and-bi-grams
  (concat
   dict/prhyme-dict
   popular-bigram-words))
 (def popular-mono-and-bi-grams-by-rime
  (prhyme/prhyme-words-by-rime*
   popular-mono-and-bi-grams))
 (comment
  (->> (reverse (:rimes (phrase->Word "beautiful")))
       (get-in popular-mono-and-bi-grams-by-rime)
       (prhyme/flatten-node)
       (remove #(re-matches #".*beautiful" (:word %)))
       (map :word)
       (map string/lower-case))
  (get-in
   popular-mono-and-bi-grams-by-rime
   ['("AA") '("ER") '("IY")])
  (take 5 popular-bigrams)
  (take 5 popular-bigram-words)
  (take-last 5 popular-bigram-words)
  (prhyme/phrase->word dict/prhyme-dict "hi there"))
 (def words-and-bigrams())
--- a/src/com/owoga/prhyme/data/darklyrics.clj
+++ b/src/com/owoga/prhyme/data/darklyrics.clj
@ -0,0 +1,39 @@
 (ns com.owoga.prhyme.data.darklyrics
  (:require [clojure.java.io :as io]
            [taoensso.nippy :as nippy])
  (:import [java.io DataInputStream ByteArrayOutputStream]))
 (defn thaw-from-file
  "Convenience util: like `thaw`, but reads from `(clojure.java.io/file <file>)`.
  To thaw from a resource on classpath (e.g in Leiningen `resources` dir):
    (thaw-from-file (clojure.java.io/resource \"my-resource-name.npy\"))
  See also `freeze-to-file`."
  ([file          ] (thaw-from-file file nil))
  ([file thaw-opts]
   (let [xin (io/input-stream file)
         xout (ByteArrayOutputStream.)]
     (io/copy xin xout)
     (nippy/thaw (.toByteArray xout) thaw-opts))))
 ;; (thaw-from-file (io/resource "test.bin"))
 ;; (bytes (byte-array (take 20 (range))))
 ;; (byte-array (map (comp byte int) "ascii"))
 ;; ;; => [97, 115, 99, 105, 105]
 ;; (bytes (byte-array (map (comp byte int) "ascii")))
 ;; ;; => [97, 115, 99, 105, 105]
 ;; (let [xin (io/input-stream (io/resource "test.bin"))
 ;;       xout (ByteArrayOutputStream.)]
 ;;   (io/copy xin xout)
 ;;   (nippy/thaw (.toByteArray xout)))
 ;; (.fullyRead (io/input-stream (io/resource "test.bin")))
 ;; (.getSize (io/input-stream (io/resource "dark-corpus-2.bin")))
 ;; (def data (into {} (map vec (partition 2 (range 20)))))
 ;; (nippy/freeze-to-file "resources/test.bin" data)
 (def darklyrics-markov-2
  (thaw-from-file (io/resource "dark-corpus-2.bin")))
--- a/src/com/owoga/prhyme/data/dictionary.clj
+++ b/src/com/owoga/prhyme/data/dictionary.clj
@ -2,6 +2,7 @@
  (:require [clojure.string :as string]
            [clojure.java.io :as io]
            [clojure.set]
            [com.owoga.prhyme.util :as util]
            [com.owoga.prhyme.core :as prhyme]))
 (def cmu-dict
@ -9,6 +10,25 @@
       (line-seq)
       (map #(string/split % #"[\t ]"))))
 (def spelling->phonemes
  (loop [words cmu-dict
         accum {}]
    (let [word (first words)
          key ((fnil util/clean-text "") (first word))]
      (cond
        (nil? word) accum
        :else (recur (rest words)
                     (update accum key (fnil conj []) (rest word)))))))
 (def phrase->Word
  (into
   {}
   (map
    (fn [[word & phonemes]]
      [(string/lower-case word)
       phonemes])
    cmu-dict)))
 (def prhyme-dict
  (into [] (map prhyme/cmu->prhyme cmu-dict)))
--- a/src/com/owoga/prhyme/data/thesaurus.clj
+++ b/src/com/owoga/prhyme/data/thesaurus.clj
@ -1,9 +1,18 @@
 (ns com.owoga.prhyme.data.thesaurus
  (:require [clojure.string :as string]
-            [clojure.java.io :as io]))
+            [clojure.java.io :as io]
            [clojure.set]))
 (def thesaurus
  (->> (line-seq (io/reader (io/resource "mthesaur.txt")))
       (map #(string/split % #","))
       (map #(vector (first %) (rest %)))
       (into {})))
 (defn synonyms
  ([& words]
   (->> words
        (map string/lower-case)
        (map #(get thesaurus %))
        (map (fn [ws] (into #{} ws)))
        (apply clojure.set/intersection))))
--- a/src/com/owoga/prhyme/frp.clj
+++ b/src/com/owoga/prhyme/frp.clj
@ -244,11 +244,11 @@
 (defn prhyme-many [dict phrase]
  (let [syllable-partitions
        (->> phrase
-             (:syllables (phrase->word dict phrase))
+             (:syllables (p/phrase->word dict phrase))
             (u/partitions)
             (map (fn [part]
                    (map (fn [syllables]
-                           (make-word
+                           (p/cmu->prhyme
                            (into
                             [(string/join " " (flatten (apply concat syllables)))]
                             (flatten syllables))))
--- a/src/com/owoga/prhyme/gen.clj
+++ b/src/com/owoga/prhyme/gen.clj
@ -43,14 +43,14 @@
      (if (>= (count result) target-markov-n)
        (let [markov-options (markov (->> result
                                          (take target-markov-n)
-                                          (map :norm-word)))
+                                          (map :normalized-word)))
              markov-option-avg (/ (apply + (vals markov-options))
                                   (max 1 (count markov-options)))]
          (if (nil? markov-options)
            [words target result]
            (let [[markovs non-markovs]
                  ((juxt filter remove)
-                   #(markov-options (:norm-word %))
+                   #(markov-options (:normalized-word %))
                   words)
                  weight-non-markovs (apply + (map :weight non-markovs))
                  target-weight-markovs (* 100 percent weight-non-markovs)
@ -59,7 +59,7 @@
              [(concat
                (map
                 (fn [m]
-                   (let [option (markov-options (:norm-word m))]
+                   (let [option (markov-options (:normalized-word m))]
                     (as-> m m
                       (assoc m :weight (* (/ option markov-option-avg) adjustment-markovs (:weight m)))
                       (assoc m :adjustment-for-markov (* (/ option markov-option-avg) adjustment-markovs)))))
@ -73,7 +73,7 @@
  [markov percent]
  (let [markov-n (count (first (first markov)))]
    (fn [[words target result]]
-      (let [key (let [k (map :norm-word (take markov-n result))]
+      (let [key (let [k (map :normalized-word (take markov-n result))]
                  (reverse
                   (if (> markov-n (count k))
                     (concat k (repeat (- markov-n (count k)) nil))
@ -85,7 +85,7 @@
          [words target result]
          (let [[markovs non-markovs]
                ((juxt filter remove)
-                 #(markov-options (:norm-word %))
+                 #(markov-options (:normalized-word %))
                 words)
                weight-non-markovs (apply + (map :weight non-markovs))
                target-weight-markovs (- (/ weight-non-markovs (- 1 percent))
@ -95,7 +95,7 @@
            [(concat
              (map
               (fn [m]
-                 (let [option (markov-options (:norm-word m))]
+                 (let [option (markov-options (:normalized-word m))]
                   (as-> m m
                     (assoc m :weight (* (/ option markov-option-avg) adjustment-markovs (:weight m)))
                     (assoc m :adjustment-for-markov (* (/ option markov-option-avg) adjustment-markovs)))))
@ -147,7 +147,7 @@
        (fn []
          (attempt-gen-target-by-syllable-count adj syllable-count words)))
       (filter #(= syllable-count (apply + (map :syllable-count %))))
-       (map #(map :norm-word %))
+       (map #(map :normalized-word %))
       (map #(string/join " " %))
       (filter nlp/valid-sentence?)
       first))
@ -174,6 +174,14 @@
     (cons selection
           (lazy-seq (selection-seq words adjust new-target new-result))))))
 (defn selection-stream
  "Continuously make the first selection."
  ([words adjust target]
   (selection-stream words adjust target '()))
  ([words adjust target result]
   (let [[weighted-words _ _] (adjust [words target result])]
     (repeatedly #(math/weighted-selection :weight weighted-words)))))
 (defn generate-prhyme [words adjust target stop?]
  (loop [result '()]))
@ -217,7 +225,7 @@
        (fn []
          (attempt-gen-rhyme-with-syllable-count adj syllable-count words target)))
       (filter #(= syllable-count (apply + (map :syllable-count %))))
-       (map #(map :norm-word %))
+       (map #(map :normalized-word %))
       (map #(string/join " " %))
       (filter nlp/valid-sentence?)
       first))
@ -232,7 +240,7 @@
 (defn sentence-stop [target]
  (fn [inner-target result]
-    (let [result-sentence (string/join " " (map :norm-word result))]
+    (let [result-sentence (string/join " " (map :normalized-word result))]
      (when-not (empty? result)
        (or (nlp/valid-sentence? result-sentence)
            (< (:syllable-count target)
@ -241,18 +249,18 @@
 (defn gen-prhymes [words adjust poem-lines]
  (let [words (map #(assoc % :weight 1) words)
-        words-map (into {} (map #(vector (:norm-word %) %) words))]
+        words-map (into {} (map #(vector (:normalized-word %) %) words))]
    (map (fn [line]
-           (let [target (phrase->word words line)
+           (let [target (prhyme/phrase->word words line)
                 stop (sentence-stop target)
                 r (prhymer words adjust target stop)]
-             (string/join " " (map #(:norm-word %) (first r)))))
+             (string/join " " (map #(:normalized-word %) (first r)))))
         poem-lines)))
 (defn phrase-syllable-count [phrase]
  (->> phrase
       (#(string/split % #" "))
-       (map (partial phrase->word frp/words))
+       (map (partial prhyme/phrase->word frp/words))
       (map :syllable-count)
       (apply +)))
@ -268,8 +276,8 @@
 (defn generate-rhyme-for-phrase
  [words adjust phrase]
  (let [words (map #(assoc % :weight 1) words)
-        words-map (into {} (map #(vector (:norm-word %) %) words))
+        words-map (into {} (map #(vector (:normalized-word %) %) words))
-        target (phrase->word words phrase)]
+        target (prhyme/phrase->word words phrase)]
    (prhymer words adjust target (syllable-stop target))))
 #_(defn generate-prhymes [poem]
@ -277,28 +285,28 @@
    (fn []
      (->> poem
           (map (fn [phrase]
-                  (let [target (phrase->word frp/popular phrase)]
+                  (let [target (prhyme/phrase->word frp/popular phrase)]
                    (first
                     (filter
                      #(and
                        (or (< 0.9 (rand))
-                            (nlp/valid-sentence? (string/join " " (map :norm-word %))))
+                            (nlp/valid-sentence? (string/join " " (map :normalized-word %))))
                        (= (:syllable-count target)
                           (apply + (map :syllable-count %))))
                      (r phrase))))))
-           (map (fn [line] (map #(:norm-word %) line)))
+           (map (fn [line] (map #(:normalized-word %) line)))
           (map #(string/join " " %))))))
 (defn generate-prhymes-darkov [words adj phrase]
-  (let [target (phrase->word words phrase)
+  (let [target (prhyme/phrase->word words phrase)
        r (generate-rhyme-for-phrase words adj target)]
    (first
     (filter
      #(and
        (or (< 0.9 (rand))
-            (nlp/valid-sentence? (string/join " " (map :norm-word %))))
+            (nlp/valid-sentence? (string/join " " (map :normalized-word %))))
        (= (:syllable-count target)
           (apply + (map :syllable-count %))))
      r))
-    (map (fn [line] (map #(:norm-word %) line)))
+    (map (fn [line] (map #(:normalized-word %) line)))
    (map #(string/join " " %))))
--- a/src/com/owoga/prhyme/generation/weighted_selection.clj
+++ b/src/com/owoga/prhyme/generation/weighted_selection.clj
@ -43,7 +43,7 @@
  [markov percent]
  (let [markov-n (count (first (first markov)))]
    (fn [[words target result]]
-      (let [key (let [k (map :norm-word (take markov-n result))]
+      (let [key (let [k (map :normalized-word (take markov-n result))]
                  (reverse
                   (if (> markov-n (count k))
                     (concat k (repeat (- markov-n (count k)) nil))
@ -55,7 +55,7 @@
          [words target result]
          (let [[markovs non-markovs]
                ((juxt filter remove)
-                 #(markov-options (:norm-word %))
+                 #(markov-options (:normalized-word %))
                 words)
                weight-non-markovs (apply + (map :weight non-markovs))
                target-weight-markovs (- (/ weight-non-markovs (- 1 percent))
@ -65,7 +65,7 @@
            [(concat
              (map
               (fn [m]
-                 (let [option (markov-options (:norm-word m))]
+                 (let [option (markov-options (:normalized-word m))]
                   (as-> m m
                     (assoc m :weight (* (/ option markov-option-avg) adjustment-markovs (:weight m)))
                     (assoc m :adjustment-for-markov (* (/ option markov-option-avg) adjustment-markovs)))))
@ -108,8 +108,34 @@
       result])))
 (defn adjust-for-tail-rhyme
  "Only bump up rhyme probability if result is empty."
  [percent]
  (fn [[words target result]]
    (if (empty? result)
      ((adjust-for-rhymes percent) [words target result])
      [words target result])))
 (defn adjust-for-fn
  "Weights words by whether or not they rhyme.
  Once result contains something, becomes inactive. If you want to try to rhyme
  every selection, you'll need a different function. This one will only rhyme
  the tail of a target."
  [key percent pred-fn weight-fn]
  (fn [[words target result]]
    (let [[matching non-matching] ((juxt filter remove) #(pred-fn % target result) words)
          weight-non-matching (apply + (map :weight non-matching))
          target-weight-matching (* 100 percent weight-non-matching)
          count-matching (count matching)
          adjustment-matching (if (= 0 count-matching)
                               1
                               (/ target-weight-matching count-matching))]
      [(concat
        (map
         (fn [word]
           (as-> word word
             (assoc word :weight (* adjustment-matching (weight-fn word target result)))
             (assoc word key adjustment-matching)))
         matching)
        non-matching)
       target
       result])))
--- a/src/com/owoga/prhyme/limerick.clj
+++ b/src/com/owoga/prhyme/limerick.clj
@ -0,0 +1,76 @@
 (ns com.owoga.prhyme.limerick
  (:require [com.owoga.prhyme.gen :as gen]
            [com.owoga.prhyme.generation.weighted-selection :as weighted-selection]
            [clojure.string :as string]
            [com.owoga.prhyme.core :as prhyme]
            [com.owoga.prhyme.util :as util]))
 (defn rhyme-from-scheme
  "scheme of format [[A 9] [A 9] [B 5] [B 5] [A 9]]"
  [words markov scheme]
  (loop [scheme scheme
         rhymes {}
         result []]
    (cond
      (empty? scheme) result
      :else
      (let [[pattern syllable-count] (first scheme)
            banned-words (into #{} (->> result
                                        (map #(string/split % #" "))
                                        (map #(last %))))
            adj (util/comp-rnil
                 (weighted-selection/adjust-for-markov
                  markov
                  0.99)
                 (when (rhymes pattern)
                   (weighted-selection/adjust-for-tail-rhyme 0.99)))
            rhyme (if (nil? (get rhymes pattern))
                    (gen/gen-sentence-with-syllable-count
                     adj
                     syllable-count
                     words)
                    (gen/gen-rhyme-with-syllable-count
                     adj
                     syllable-count
                     (remove #(banned-words (:normalized-word %)) words)
                     (prhyme/phrase->word words (get rhymes pattern))))]
        (recur (rest scheme)
               (assoc rhymes pattern rhyme)
               (conj result rhyme))))))
 (comment
  (require '[com.owoga.prhyme.data.dictionary :as dict]
           '[com.owoga.prhyme.data.darklyrics :refer [darklyrics-markov-2]]
           '[clojure.java.io :as io])
  (rhyme-from-scheme dict/prhyme-dict darklyrics-markov-2 '((A 8) (A 8) (B 5) (B 5) (A 8)))
  )
 (comment
  ["bishop larch smitten us dwell"
   "solely first week in hell"
   "and take that for three"
   "come wrapped in glory"
   "you ever leave it so well"]
  ["romancing realized too late"
   "my crown revive my withered state"
   "reign is obsolete"
   "i sit in the street"
   "but nobody cares of my fate"]
  ["flesh is hacked to get me sedate"
   "demonstration obsessed with hate"
   "justice will be written in stone"
   "and you will be shown"
   "bedrooms of icons suffocate"]
  ["you will bow to their hungry gods"
   "come will come whatever the odds"
   "now we see the light"
   "you can't put it right"
   "recklessly chopping firing squads"]
  ["untimely they fool their poor life"
   "it wither away with this knife"
   "hate is my virtue"
   "my feelings are well overdue"
   "war we await the afterlife"])
--- a/src/com/owoga/prhyme/lymeric.clj
+++ b/src/com/owoga/prhyme/lymeric.clj
@ -1,86 +0,0 @@
 (ns com.owoga.prhyme.lymeric
  (:require [com.owoga.prhyme.gen :as gen]
            [com.owoga.prhyme.generation.weighted-selection :as weighted-selection]
            [clojure.string :as string]
            [com.owoga.prhyme.core :as prhyme]
            [com.owoga.prhyme.frp :as frp]
            [com.owoga.prhyme.util :as util]
            [taoensso.nippy :as nippy]
            [clojure.java.io :as io]))
 (defn rhyme-from-scheme
  "scheme of format [[A 9] [A 9] [B 5] [B 5] [A 9]]"
  [rhymer markov scheme]
  (let [base-words (map #(assoc % :weight 1) frp/popular)]
    (loop [scheme scheme
           rhymes {}
           result []]
      (cond
        (empty? scheme) result
        :else
        (let [[pattern syllable-count] (first scheme)
              banned-words (into #{} (->> result
                                          (map #(string/split % #" "))
                                          (map #(last %))))
              adj (util/comp-rnil
                   (weighted-selection/adjust-for-markov
                    markov
                    0.99)
                   (when (rhymes pattern)
                     (weighted-selection/adjust-for-tail-rhyme 0.99)))
              rhyme (if (nil? (get rhymes pattern))
                      (gen/gen-sentence-with-syllable-count
                       adj
                       syllable-count
                       base-words)
                      (gen/gen-rhyme-with-syllable-count
                       adj
                       syllable-count
                       (remove #(banned-words (:norm-word %))
                               base-words)
                       (prhyme/phrase->word frp/words (get rhymes pattern))))]
          (recur (rest scheme)
                 (assoc rhymes pattern rhyme)
                 (conj result rhyme)))))))
 (comment
  (time (def darkov-2 (nippy/thaw-from-file (io/resource "darkov-2.bin"))))
  (rhyme-from-scheme nil darkov-2 '((A 8) (A 8) (B 5) (B 5) (A 8)))
  )
 (comment
  (->> (gen/selection-seq
        (map #(assoc % :weight 1) frp/words)
        (weighted-selection/adjust-for-rhymes 0.99)
        (prhyme/phrase->word frp/words "hi there my boy"))
       (take 3))
  ["bishop larch smitten us dwell"
   "solely first week in hell"
   "and take that for three"
   "come wrapped in glory"
   "you ever leave it so well"]
  ["romancing realized too late"
   "my crown revive my withered state"
   "reign is obsolete"
   "i sit in the street"
   "but nobody cares of my fate"]
  ["flesh is hacked to get me sedate"
   "demonstration obsessed with hate"
   "justice will be written in stone"
   "and you will be shown"
   "bedrooms of icons suffocate"]
  ["you will bow to their hungry gods"
   "come will come whatever the odds"
   "now we see the light"
   "you can't put it right"
   "recklessly chopping firing squads"]
  ["untimely they fool their poor life"
   "it wither away with this knife"
   "hate is my virtue"
   "my feelings are well overdue"
   "war we await the afterlife"])
--- a/src/com/owoga/prhyme/nlp/tag_sets/treebank.clj
+++ b/src/com/owoga/prhyme/nlp/tag_sets/treebank.clj
--- a/src/com/owoga/prhyme/nlp/tag_sets/treebank_ii.clj
+++ b/src/com/owoga/prhyme/nlp/tag_sets/treebank_ii.clj
@ -0,0 +1,81 @@
 (ns com.owoga.prhyme.nlp.tag-sets.treebank-ii)
 ;;;; Penn Treebank II Constituent Tags
 ;;;; http://www.surdeanu.info/mihai/teaching/ista555-fall13/readings/PennTreebankConstituents.html#MD
 (def clauses
  {'S "simple declarative clause, i.e. one that is not introduced by a (possible empty) subordinating conjunction or a wh-word and that does not exhibit subject-verb inversion."
   'SBAR "Clause introduced by a (possibly empty) subordinating conjunction."
   'SBARQ "Direct question introduced by a wh-word or a wh-phrase. Indirect questions and relative clauses should be bracketed as SBAR, not SBARQ."
   'SINV "Inverted declarative sentence, i.e. one in which the subject follows the tensed verb or modal."
   'SQ "Inverted yes/no question, or main clause of a wh-question, following the wh-phrase in SBARQ."})
 (def phrases
  {'ADJP "Adjective Phrase."
   'ADVP "Adverb Phrase."
   'CONJP "Conjunction Phrase."
   'FRAG "Fragment."
   'INTJ "Interjection. Corresponds approximately to the part-of-speech tag UH."
   'LST "List marker. Includes surrounding punctuation."
   'NAC "Not a Constituent; used to show the scope of certain prenominal modifiers within an NP."
   'NP "Noun Phrase."
   'NX "Used within certain complex NPs to mark the head of the NP. Corresponds very roughly to N-bar level but used quite differently."
   'PP "Prepositional Phrase."
   'PRN "Parenthetical."
   'PRT "Particle. Category for words that should be tagged RP."
   'QP "Quantifier Phrase (i.e. complex measure/amount phrase); used within NP."
   'RRC "Reduced Relative Clause."
   'UCP "Unlike Coordinated Phrase."
   'VP "Vereb Phrase."
   'WHADJP "Wh-adjective Phrase. Adjectival phrase containing a wh-adverb, as in how hot."
   'WHAVP "Wh-adverb Phrase. Introduces a clause with an NP gap. May be null (containing the 0 complementizer) or lexical, containing a wh-adverb such as how or why."
   'WHNP "Wh-noun Phrase. Introduces a clause with an NP gap. May be null (containing the 0 complementizer) or lexical, containing some wh-word, e.g. who, which book, whose daughter, none of which, or how many leopards."
   'WHPP "Wh-prepositional Phrase. Prepositional phrase containing a wh-noun phrase (such as of which or by whose authority) that either introduces a PP gap or is contained by a WHNP."
   'X "Unknown, uncertain, or unbracketable. X is often used for bracketing typos and in bracketing the...the-constructions."})
 (def words
  {'CC "Coordinating conjunction"
   'CD "Cardinal number"
   'DT "Determiner"
   'EX "Existential there"
   'FW "Foreign word"
   'IN "Preposition or subordinating conjunction"
   'JJ "Adjective"
   'JJR "Adjective, comparative"
   'JJS "Adjective, superlative"
   'LS "List item marker"
   'MD "Modal"
   'NN "Noun, singular or mass"
   'NNS "Noun, plural"
   'NNP "Proper noun, singular"
   'NNPS "Proper noun, plural"
   'PDT "Predeterminer"
   'POS "Possessive ending"
   'PRP "Personal pronoun"
   'PRP$ "Possessive pronoun (prolog version PRP-S)"
   'RB "Adverb"
   'RBR "Adverb, comparative"
   'RBS "Adverb, superlative"
   'RP "Particle"
   'SYM "Symbol"
   'TO "to"
   'UH "Interjection"
   'VB "Verb, base form"
   'VBD "Verb, past tense"
   'VBG "Verb, gerund or present participle"
   'VBN "Verb, past participle"
   'VBP "Verb, non-3rd person singular present"
   'VBZ "Verb, 3rd person singular present"
   'WDT "Wh-determiner"
   'WP "Wh-pronoun"
   'WP$ "Possessive wh-pronoun (prolog version WP-S)"
   'WRB "Wh-adverb"})
 (def form-function-discrepencies
  {'-ADV "(adverbial) - marks a constituent
  other than ADVP or PP when it is used adverbially (e.g. NPs or
  free (\"headless\" relatives). However, constituents that themselves are
  modifying an ADVP generally do not get -ADV. If a more specific tag is
  available (for example, -TMP) then it is used alone and -ADV is implied. See
  the Adverbials section."
   '-NOM "(nominal) - marks free (\"headless\") relatives
  and gerunds when they act nominally."})
--- a/src/com/owoga/prhyme/util.clj
+++ b/src/com/owoga/prhyme/util.clj
@ -21,6 +21,9 @@
    "ah"
    phoneme))
 (comment
  (map str (.getPhones cmu-lexicon "two" nil)))
 (defn get-phones [word]
  (->> (map str (.getPhones cmu-lexicon word nil))
       (map remove-stress)
--- a/src/com/owoga/prhyme/util/nlp.clj
+++ b/src/com/owoga/prhyme/util/nlp.clj
@ -2,17 +2,55 @@
  (:require [opennlp.nlp :as nlp]
            [opennlp.treebank :as tb]
            [clojure.string :as string]
-            [clojure.java.io :as io]))
+            [clojure.java.io :as io]
            [clojure.zip :as zip]
            [com.owoga.prhyme.nlp.tag-sets.treebank-ii :as tb2])
  (:import (opennlp.tools.postag POSModel POSTaggerME)))
 (def tokenize (nlp/make-tokenizer (io/resource "models/en-token.bin")))
 (def get-sentences (nlp/make-sentence-detector (io/resource "models/en-sent.bin")))
 (def parse (tb/make-treebank-parser (io/resource "models/en-parser-chunking.bin")))
 (def pos-tagger (nlp/make-pos-tagger (io/resource "models/en-pos-maxent.bin")))
 ;;;; The tagger that onennlp.nlp gives us doesn't provide access
 ;;;; to the probabilities of all tags. It gives us the probability of the
 ;;;; top tag through some metadata. But to get probs for all tags, we
 ;;;; need to implement our own tagger.
 (defprotocol Tagger
  (tags [this sent])
  (probs [this])
  (top-k-sequences [this sent]))
 (defn make-pos-tagger
  [modelfile]
  (let [model (with-open [model-stream (io/input-stream modelfile)]
                (POSModel. model-stream))
        tagger (POSTaggerME. model)]
    (reify Tagger
      (tags [_ tokens]
        (let [token-array (into-array String tokens)]
          (map vector tokens (.tag tagger #^"[Ljava.lang.String;" token-array))))
      (probs [_] (seq (.probs tagger)))
      (top-k-sequences [_ tokens]
        (let [token-array (into-array String tokens)]
          (.topKSequences tagger #^"[Ljava.lang.String;" token-array))))))
 (def prhyme-pos-tagger (make-pos-tagger (io/resource "models/en-pos-maxent.bin")))
 (comment
  (let [phrase "The feeling hurts."]
    (map (juxt #(.getOutcomes %)
               #(map float (.getProbs %)))
         (top-k-sequences prhyme-pos-tagger (tokenize phrase))))
  ;; => ([["DT" "NN" "VBZ" "."] (0.9758878 0.93964833 0.7375927 0.95285994)]
  ;;     [["DT" "VBG" "VBZ" "."] (0.9758878 0.03690145 0.27251 0.9286113)])
  )
 (defn valid-sentence?
  "Tokenizes and parses the phrase using OpenNLP models from
  http://opennlp.sourceforge.net/models-1.5/
-  If the parse tree has an 'S as the top-level tag, then
+  If the parse tree has an clause as the top-level tag, then
  we consider it a valid English sentence."
  [phrase]
  (->> phrase
@ -22,5 +60,151 @@
       parse
       first
       tb/make-tree
-       (#(= 'S (:tag (first (:chunk %)))))))
+       :chunk
       first
       :tag
       tb2/clauses
       boolean))
 (defn unmake-tree
  "Tokenizing and then parsing a sentence returns a string
  representation of the parse tree. This is a helper function
  to make working with the parse tree more convenient. We
  can use `opennlp.treebank/make-tree` to make a clojure map
  representation of the tree, then we can `unmake` the tree
  to turn it into a list representation of the tree that
  we can easily use in a clojure zipper. (read-string almost works,
  but falls apart when reading things like commas)."
  [node]
  (cond
    (string? node) node
    (map? node) (list (:tag node) (unmake-tree (:chunk node)))
    :else (map unmake-tree node)))
 (comment
  (let [phrase "Hello, Eric"]
    (->> phrase
         tokenize
         (string/join " ")
         vector
         parse
         (map tb/make-tree)
         unmake-tree))
  ;; => ((TOP ((S ((INTJ ((UH ("Hello")))) (, (",")) (. ("Eric")))))))
  )
 (defn treebank-zipper
  "Turns a bit of text into a parse tree into a zipper."
  [text]
  (let [tree (->> text
                  tokenize
                  (string/join " ")
                  vector
                  parse
                  (map tb/make-tree)
                  unmake-tree)]
    (zip/zipper seq? seq (fn [_ c] c) tree)))
 (defn node-constituents
  "Given a node of a parse tree, like ('NP (('PRP$ (\"my\" 'NN (\"name\"))))),
  returns a list of the top-level node tag and its first-level child tags.
  "
  [node]
  (list
   (first node)
   (if (every? string? (map first (rest node)))
     nil
     (map first (first (rest node))))))
 (defn phrase-constituents
  "Given a bit of text that can be parsed into a treebank tree,
  Get a sequence of the tags and their chunks.
  For example:
    My name is Eric.
  Returns the sequence:
    At the TOP tag, we have a 'S part-of-speech (a clause).
    At the 'S tag, we have a 'NP, 'VP, '. (noun-phrase + verb-phrase + period)
    At the 'NP tag, we have a 'PRP$, 'NN (personal-pronoun + singular-noun)
    ...
  "
  [text]
  (->> (treebank-zipper text)
       (iterate zip/next)
       (take-while (complement zip/end?))
       (filter (complement zip/branch?))
       (map zip/path)
       (map last)
       (map node-constituents)
       (remove #(string? (first %)))))
 (comment
  (phrase-constituents "My name is Eric.")
  ;; => ((TOP (S)) (S (NP VP .)) (NP (PRP$ NN)) (VP (VBZ NP)) (NP (NNP)))
  (phrase-constituents "How are you?")
  ;; => ((TOP (SBARQ)) (SBARQ (WHADVP SQ .)) (WHADVP (WRB)) (SQ (VBP NP)) (NP (PRP))) 
  )
 (defn pos-constituent-frequencies
  "Frequencies of the parts of speech that make up phrases.
  Example:
    Clauses are made up of:
      NounPhrase + VerbPhrase 2 times
      Clause + CoordinatingConjuction + Clause 1 times
    NounPhrases are made up of:
      ProperNouns 2 times
      PersonalPronoun + SingularNoun 3 times
  Does not include frequencies for leaf words. By that I mean: A SingularNoun might
  appear 5 times all together, 3 times as part of a PersonalPronoun + SingularNoun pair
  and 2 times as part of an Adjective + SingularNoun pair, but the data structure returned
  by this function won't include that 5 anywhere. This is due to the (remove #(nil? (second %)))
  line. This data structure is used as a kind of markov selection process and we don't really
  care how often the leafs are used. We just care about the ratio at which we should pick each
  leaf from a given parent.
  "
  [texts]
  (reduce
   (fn [acc text]
     (let [constituents (->> text
                             phrase-constituents
                             (remove #(nil? (second %))))]
       (reduce
        (fn [acc constituent]
          (let [k1 (first constituent)
                k2 (second constituent)]
            (update-in acc [k1 k2] (fnil inc 0))))
        acc
        constituents)))
   {}
   texts))
 (comment
  (pos-constituent-frequencies
   ["My name is Eric."
    "My hat is blue and I like cake."
    "Your name is Taylor."
    "How are you?"])
  ;; => {TOP {(S) 3, (SBARQ) 1},
  ;;     S {(NP VP .) 2, (S CC S .) 1, (NP VP) 2},
  ;;     NP {(PRP$ NN) 3, (NNP) 2, (PRP) 2, (NN) 1},
  ;;     VP {(VBZ NP) 2, (VBZ ADJP) 1, (VBP NP) 1},
  ;;     ADJP {(JJ) 1},
  ;;     SBARQ {(WHADVP SQ .) 1},
  ;;     WHADVP {(WRB) 1},
  ;;     SQ {(VBP NP) 1}}
  (let [phrase "How are you today?"]
    (->> phrase
         tokenize
         (string/join " ")
         vector
         parse
         (map tb/make-tree)))
  (let [phrase "I gave the cake to John at the store."]
    (parse (tokenize phrase)))
  (let [phrase "I've got a good feeling"]
    (pos-tagger (tokenize phrase)))
  )