Add simple-nlg

5 years ago · c0ad5acefc
parent 86546c7d01
commit c0ad5acefc
5 changed files with 933 additions and 8 deletions
--- a/deps.edn
+++ b/deps.edn
@ -7,6 +7,7 @@
        inflections {:mvn/version "0.13.2"}
        com.taoensso/tufte {:mvn/version "2.2.0"}
        clojure-opennlp {:mvn/version "0.5.0"}
+        uk.ac.abdn/SimpleNLG {:mvn/version "4.5.0"}
        net.sf.sociaal/freetts {:mvn/version "1.2.2"}
        enlive {:mvn/version "1.1.6"}
        integrant {:mvn/version "0.8.0"}
--- a/src/com/owoga/corpus/markov.clj
+++ b/src/com/owoga/corpus/markov.clj
@ -1,7 +1,7 @@
 (ns com.owoga.corpus.markov
  (:require [com.owoga.prhyme.util :as util]
            [com.owoga.prhyme.data.dictionary :as dict]
-            [com.owoga.prhyme.util.nlp :as nlp]
+            [com.owoga.prhyme.nlp.core :as nlp]
            [clojure.string :as string]
            [clojure.java.io :as io]))

@ -63,12 +63,15 @@
       (map #(string/split % #"\n+"))
       (map (fn [lyrics] (filter #(nlp/valid-sentence? %) lyrics)))
       (map #(remove nil? %))
-       (take 5)
-       (map (fn [lyrics]
+       (take 400)
+       (flatten)
+       (nlp/pos-constituent-frequencies)
+       #_(map (fn [lyrics]
              (map #(nlp/tags nlp/prhyme-pos-tagger (nlp/tokenize %)) lyrics)))))

 (comment
-  (let [directory "dark-corpus/zero-hour/"]
-    (gen-pos-markov directory))
+  (time
+   (let [directory "dark-corpus/"]
+     (gen-pos-markov directory)))

  )
--- a/src/com/owoga/prhyme/data/dictionary.clj
+++ b/src/com/owoga/prhyme/data/dictionary.clj
@ -56,9 +56,9 @@
   (set (line-seq (io/reader (io/resource "nouns.txt"))))))

 (defn english? [text]
-  (let [words (string/split text #"\s+")
+  (let [word-set (into #{} (map :normalized-word prhyme-dict))
+        words (string/split text #"\s+")
        english-words
        (->> words
-             (filter #((into #{} (map :normalized-word prhyme-dict))
-                       (string/lower-case %))))]
+             (filter #(word-set (string/lower-case %))))]
    (< 0.7 (/ (count english-words) (max 1 (count words))))))
--- a/src/com/owoga/prhyme/nlg/core.clj
+++ b/src/com/owoga/prhyme/nlg/core.clj
@ -0,0 +1,162 @@
+(ns com.owoga.prhyme.nlg.core
+  (:import (simplenlg.framework NLGFactory)
+           (simplenlg.lexicon Lexicon)
+           (simplenlg.realiser.english Realiser)
+           (simplenlg.features Feature Form)))
+
+(def lexicon (Lexicon/getDefaultLexicon))
+(def nlg-factory (NLGFactory. lexicon))
+(def realiser (Realiser. lexicon))
+
+(defn realise [element]
+  (.realise realiser element))
+
+(let [text "my dog is happy"
+      sentence (.createSentence nlg-factory text)]
+  (.realiseSentence realiser sentence))
+
+(let [clause (.createClause nlg-factory)
+      _ (.setSubject clause "us")
+      _ (.setVerb clause "chase")
+      _ (.setObject clause "the monkey")]
+  (.realiseSentence realiser clause))
+
+(let [NP (.createNounPhrase nlg-factory "zipper")
+      _ (.setDeterminer NP "another")
+      VP1 (.createVerbPhrase nlg-factory "eat")
+      clause (.createSentence nlg-factory [NP VP1])]
+  (.realiseSentence realiser clause))
+
+(defn create-noun-phrase
+  ([]
+   (.createNounPhrase nlg-factory))
+  ([noun]
+   (.createNounPhrase nlg-factory noun))
+  ([specifier noun]
+   (.createNounPhrase nlg-factory specifier noun)))
+
+(defn set-determiner [noun-phrase determiner]
+  (.setDeterminer noun-phrase determiner)
+  noun-phrase)
+
+(defn set-noun [noun-phrase noun]
+  (.setNoun noun-phrase noun)
+  noun-phrase)
+
+(defn create-adjective-phrase
+  ([]
+   (.createAdjectivePhrase nlg-factory))
+  ([adjective]
+   (.createAdjectivePhrase nlg-factory adjective)))
+
+(defn create-adverb-phrase
+  ([]
+   (.createAdverbPhrase nlg-factory))
+  ([adverb]
+   (.createAdverbPhrase nlg-factory adverb)))
+
+(defn create-verb-phrase
+  ([]
+   (.createVerbPhrase nlg-factory))
+  ([verb]
+   (.createVerbPhrase nlg-factory verb)))
+
+(defn set-feature [element feature-name feature-value]
+  (.setFeature element feature-name feature-value)
+  element)
+
+(defn set-pre-modifier [element new-pre-modifier]
+  (.setPreModifier element new-pre-modifier)
+  element)
+
+(ns-unmap *ns* 'create-element)
+(defmulti create-element
+  (fn [children] (doall (map first children))))
+
+(defmethod create-element '(PRP$)
+  [[[_ child]]] (.createNounPhrase nlg-factory child))
+
+(defmethod create-element '(NN)
+  [[[_ child]]] (.createNounPhrase nlg-factory child))
+
+(defmethod create-element '(NNP)
+  [[[_ child]]]
+  (let [noun-phrase (.createNounPhrase nlg-factory child)]
+    (.setPlural noun-phrase true)
+    noun-phrase))
+
+;; A determiner by itself is usually a subject.
+;; i.e. "This is a test." <- "This" is tagged by the treebank parser
+;; as a determiner
+(defmethod create-element '(DT)
+  [[[_ child]]]
+  (.createNounPhrase nlg-factory child))
+
+(defmethod create-element '(DT NN NN)
+  [[DT NN1 NN2]]
+  (let [dt (create-element (list DT))
+        nn1 (create-element (list NN1))
+        nn2 (create-element (list NN2))
+        clause (.createNounPhrase nlg-factory)]
+    (.setSpecifier clause dt)
+    (.setPreModifier clause nn1)
+    (.setNoun clause nn2)
+    clause))
+
+(realise (create-element '((DT "a") (NN "sample") (NN "test"))))
+
+(defmethod create-element '(PRP$ NN)
+  [[prp$ nn]]
+  (-> (.createNounPhrase
+       nlg-factory
+       (create-element (list nn)))
+      (set-pre-modifier
+       (-> (create-element (list prp$))
+           (set-feature Feature/POSSESSIVE true)))))
+
+(defmethod create-element
+  '(NP)
+  [[[_ child]]]
+  (create-element child))
+
+(realise (create-element '((NP ((PRP$ "Eric") (NN "test"))))))
+
+(realise (create-element '((DT "This"))))
+(create-element '((NN "test")))
+(realise (create-element '((PRP$ "Eric") (NN "test"))))
+(realise (create-element '((NNP "tests"))))
+
+(defmethod create-element '(VB)
+  [[[_ child]]]
+  (.createVerbPhrase nlg-factory child))
+
+(realise (create-element '((VB "run"))))
+
+(defmethod create-element '(VBZ)
+  [[[_ child]]]
+  (.createVerbPhrase nlg-factory child))
+
+(defmethod create-element '(VBZ NP)
+  [[VBZ NP]]
+  (let [np (create-element (list NP))
+        vbz (create-element (list VBZ))
+        clause (.createClause nlg-factory)]
+    (.setObject clause np)
+    (.setVerb clause vbz)
+    clause))
+
+(comment
+  (realise (create-element '((VBZ "is") (NP ((NN "test"))))))
+  )
+
+(comment
+  (-> (create-noun-phrase)
+      (set-determiner "a")
+      (set-noun "test"))
+
+  (-> (create-verb-phrase "let")
+      (set-feature Feature/FORM Form/INFINITIVE)
+      (#(realise %)))
+
+
+  )
--- a/src/com/owoga/prhyme/nlp/core.clj
+++ b/src/com/owoga/prhyme/nlp/core.clj
@ -0,0 +1,759 @@
+(ns com.owoga.prhyme.nlp.core
+  (:require [opennlp.nlp :as nlp]
+            [opennlp.treebank :as tb]
+            [clojure.string :as string]
+            [clojure.java.io :as io]
+            [clojure.zip :as zip]
+            [com.owoga.prhyme.nlp.tag-sets.treebank-ii :as tb2])
+  (:import (opennlp.tools.postag POSModel POSTaggerME)))
+
+(def tokenize (nlp/make-tokenizer (io/resource "models/en-token.bin")))
+(def get-sentences (nlp/make-sentence-detector (io/resource "models/en-sent.bin")))
+(def parse (tb/make-treebank-parser (io/resource "models/en-parser-chunking.bin")))
+(def pos-tagger (nlp/make-pos-tagger (io/resource "models/en-pos-maxent.bin")))
+
+;;;; The tagger that onennlp.nlp gives us doesn't provide access
+;;;; to the probabilities of all tags. It gives us the probability of the
+;;;; top tag through some metadata. But to get probs for all tags, we
+;;;; need to implement our own tagger.
+(defprotocol Tagger
+  (tags [this sent])
+  (probs [this])
+  (top-k-sequences [this sent]))
+
+(defn make-pos-tagger
+  [modelfile]
+  (let [model (with-open [model-stream (io/input-stream modelfile)]
+                (POSModel. model-stream))
+        tagger (POSTaggerME. model)]
+    (reify Tagger
+      (tags [_ tokens]
+        (let [token-array (into-array String tokens)]
+          (map vector tokens (.tag tagger #^"[Ljava.lang.String;" token-array))))
+      (probs [_] (seq (.probs tagger)))
+      (top-k-sequences [_ tokens]
+        (let [token-array (into-array String tokens)]
+          (.topKSequences tagger #^"[Ljava.lang.String;" token-array))))))
+
+(def prhyme-pos-tagger (make-pos-tagger (io/resource "models/en-pos-maxent.bin")))
+
+(comment
+  (let [phrase "The feeling hurts."]
+    (map (juxt #(.getOutcomes %)
+               #(map float (.getProbs %)))
+         (top-k-sequences prhyme-pos-tagger (tokenize phrase))))
+  ;; => ([["DT" "NN" "VBZ" "."] (0.9758878 0.93964833 0.7375927 0.95285994)]
+  ;;     [["DT" "VBG" "VBZ" "."] (0.9758878 0.03690145 0.27251 0.9286113)])
+  (let [phrase "Blood falls."]
+    (->> phrase
+         tokenize
+         (top-k-sequences prhyme-pos-tagger)
+         (map (juxt #(.getOutcomes %)
+                    #(map float (.getProbs %))))))
+
+  )
+
+(defn deep-merge-with [f & maps]
+  (letfn [(m [& xs]
+            (if (some map? xs)
+              (apply merge-with m xs)
+              (apply f xs)))]
+    (reduce m maps)))
+
+(comment
+  (let [m1 {:a 1 :b {:b 2}}
+        m2 {:c 3 :b {:b 5}}]
+    (deep-merge-with + m1 m2))
+  ;; => {:a 1, :b {:b 7}, :c 3}
+  )
+
+(defn valid-sentence?
+  "Tokenizes and parses the phrase using OpenNLP models from
+  http://opennlp.sourceforge.net/models-1.5/
+
+  If the parse tree has an clause as the top-level tag, then
+  we consider it a valid English sentence."
+  [phrase]
+  (->> phrase
+       tokenize
+       (string/join " ")
+       vector
+       parse
+       first
+       tb/make-tree
+       :chunk
+       first
+       :tag
+       tb2/clauses
+       boolean))
+
+(defn unmake-tree
+  "Tokenizing and then parsing a sentence returns a string
+  representation of the parse tree. This is a helper function
+  to make working with the parse tree more convenient. We
+  can use `opennlp.treebank/make-tree` to make a clojure map
+  representation of the tree, then we can `unmake` the tree
+  to turn it into a list representation of the tree that
+  we can easily use in a clojure zipper. (read-string almost works,
+  but falls apart when reading things like commas)."
+  [node]
+  (cond
+    (string? node) node
+    (map? node) (list (:tag node) (unmake-tree (:chunk node)))
+    :else (map unmake-tree node)))
+
+(comment
+  (let [phrase "Hello, Eric"]
+    (->> phrase
+         tokenize
+         (string/join " ")
+         vector
+         parse
+         (map tb/make-tree)
+         unmake-tree))
+  ;; => ((TOP ((S ((INTJ ((UH ("Hello")))) (, (",")) (. ("Eric")))))))
+  )
+
+(defn treebank-zipper
+  "Turns a bit of text into a parse tree into a zipper."
+  [text]
+  (let [tree (->> text
+                  tokenize
+                  (string/join " ")
+                  vector
+                  parse
+                  (map tb/make-tree)
+                  unmake-tree)]
+    (zip/seq-zip tree)))
+
+(comment
+  (let [zipper (treebank-zipper "This is a zipper test.")]
+    zipper)
+  ;; => [((TOP
+  ;;       ((S
+  ;;         ((NP ((DT ("This"))))
+  ;;          (VP ((VBZ ("is")) (NP ((DT ("a")) (NN ("zipper")) (NN ("test"))))))
+  ;;          (. (".")))))))
+  ;;     nil]
+  )
+
+(defn treebank-zipper-1
+  "Turns a bit of text into a parse tree into a zipper."
+  [texts]
+  (let [tree (->> texts
+                  (map tokenize)
+                  (map (partial string/join " "))
+                  parse
+                  (map tb/make-tree)
+                  unmake-tree)]
+    (zip/seq-zip tree)))
+
+(defn seq-zip [zipper]
+  (->> zipper
+       (iterate zip/next)
+       (take-while (complement zip/end?))))
+
+(defn seq-nodes [zipper]
+  (->> zipper
+       seq-zip
+       (map zip/node)))
+
+(defn phrase-level? [node]
+  (and (seq? node)
+       (symbol? (first node))))
+
+(defn word-level? [node]
+  (and (seq? node)
+       (symbol? (first node))
+       (= 1 (count (second node)))
+       (string? (first (second node)))))
+
+(defn leaf-pos-paths [zipper]
+  (->> zipper
+       seq-zip
+       (filter (complement zip/branch?))
+       (map zip/path)
+       (map #(map first %))
+       (map
+        #(filter
+          (fn [v] (or (string? v)
+                      (symbol? v)))
+          %))
+       (filter #(string? (last %)))))
+
+(comment
+  (let [zipper (treebank-zipper-1 ["This is a zipper test."
+                                   "And this is another one."])]
+    (leaf-pos-paths zipper))
+;; => ((TOP S NP DT "This")
+;;     (TOP S VP VBZ "is")
+;;     (TOP S VP NP DT "a")
+;;     (TOP S VP NP NN "zipper")
+;;     (TOP S VP NP NN "test")
+;;     (TOP S . ".")
+;;     (TOP S CC "And")
+;;     (TOP S NP DT "this")
+;;     (TOP S VP VBZ "is")
+;;     (TOP S VP NP DT "another")
+;;     (TOP S VP NP CD "one")
+;;     (TOP S . ".")) 
+  )
+
+
+;;;; This is not sufficient
+;; You'll end up with a mapping that says a verb phrase can be a
+;; third-person-singular verb, like "is" followed by a
+;; noun-phrase. But "is" can't be followed by just any noun-phrase.
+;; It must be followed by a noun-phrase that starts with a determiner,
+;; like "a" or "the".
+(defn pos->children-freqs
+  "Takes a zipper representation of a parsed text.
+
+  Returns a map of parts-of-speech to a map of their constituent parts-of-speech
+  and the number of times each constituent is seen in a corpus.
+
+  For example, in the sentence 'This is a test and that is not a test.',
+  there are 4 noun phrases; 'this', 'a test', 'that', 'a test'.
+
+  Twice, the noun phrase is made up of a determiner (namely, 'this' and 'that').
+  Twice, the noun phras is made up of a determiner and a noun (namely, 'a test' and 'a test').
+
+  If the constituent part of speech is an actual word, then the key is the word.
+
+  For the sentence 'This is a test and that is not a test.', this function would include a kv pairs of:
+  {NP {(DT) 2 (DT NN) 2}
+   CC {'and' 1}
+   VBZ {'is' 2}
+   ,,,}
+
+  This data structure is useful for text generation. If you're generating a sentence and you
+  know that you need a coordinating conjunction, you can choose one in from the list weighted
+  by their frequencies.
+  "
+  [zipper]
+  (->> zipper
+       seq-nodes
+       (filter phrase-level?)
+       (map
+        (fn [[pos xs]]
+          (vector
+           pos
+           (if (string? (first xs))
+             (first xs)
+             (map first xs)))))
+       (map (fn [[k v]]
+              {k {v 1}}))
+       (apply deep-merge-with +)))
+
+(comment
+  (let [zipper (treebank-zipper-1 ["This is a test and that is not a test."
+                                   "My name is Eric."
+                                   "Go to the store."
+                                   "Your name is not Eric."
+                                   "This is a sample test."])
+        freqs (->> zipper
+                   pos->children-freqs)]
+    freqs)
+;; => {NP {(DT) 3, (DT NN) 3, (PRP$ NN) 2, (NNP) 2, (DT NN NN) 1},
+;;     VB {"Go" 1},
+;;     VBZ {"is" 5},
+;;     S {(S CC S .) 1, (NP VP) 2, (NP VP .) 3},
+;;     RB {"not" 2},
+;;     NNP {"Eric" 2},
+;;     TO {"to" 1},
+;;     DT {"This" 2, "a" 3, "that" 1, "the" 1},
+;;     TOP {(S) 4, (VP) 1},
+;;     CC {"and" 1},
+;;     NN {"test" 3, "name" 2, "store" 1, "sample" 1},
+;;     PP {(TO NP) 1},
+;;     VP {(VBZ NP) 3, (VBZ RB NP) 2, (VB PP .) 1},
+;;     PRP$ {"My" 1, "Your" 1},
+;;     . {"." 5}} 
+  )
+
+(defn generate-from-freqs
+  "freqs is a map of parts-of-speech to a map of their constituent parts-of-speech
+  and the number of times each constituent is seen in a corpus.
+
+  start is the part-of-speech to start generating for.
+  "
+  [freqs start]
+  (let [zipper (zip/seq-zip (list start))]
+    (loop [zipper zipper]
+      (cond
+        (zip/end? zipper) (zip/root zipper)
+
+        (zip/branch? zipper) (recur (zip/next zipper))
+
+        (freqs (zip/node zipper))
+        (recur
+         (zip/next
+          (zip/next
+           (zip/replace
+            zipper
+            (list
+             (zip/node zipper)
+             (first (rand-nth (seq (freqs (zip/node zipper))))))))))
+
+        :else (recur (zip/next zipper))))))
+
+(comment
+  (let [zipper (treebank-zipper-1 ["This is a zipper test in the car."
+                                   "And this is another one."
+                                   "Here are some sample sentences."
+                                   "Let's see what we can generate."
+                                   "This is a big adjective."
+                                   "That's an ugly sentence."
+                                   "The corpus will be simple to start."])
+        freqs (->> zipper
+                   pos->children-freqs)]
+    freqs
+    (->> (generate-from-freqs freqs 'TOP))
+    (->> (repeatedly
+          (fn []
+            (->> (generate-from-freqs freqs 'TOP)
+                 (zip/seq-zip)
+                 (iterate zip/next)
+                 (take-while (complement zip/end?))
+                 (filter (complement zip/branch?))
+                 (map zip/node)
+                 (filter string?)
+                 (string/join " "))))
+         (filter valid-sentence?)
+         (take 10)))
+  ;; => ("Here be what start big And This test see what be That one start . . That one ."
+  ;;     "Here are This ugly zipper ."
+  ;;     "And an is The sample ."
+  ;;     "generate simple see This corpus test to Let And a Let what an corpus adjective to be And another ugly car can see ugly start what this one to are . . . . . . ."
+  ;;     "This sample sentences Let big 's a sample adjective in we ."
+  ;;     "will generate what Let"
+  ;;     "Here be ugly That sentence generate a big test ."
+  ;;     "Here is this corpus sentences in a test this sample sentences ."
+  ;;     "And some ugly zipper 's 's ."
+  ;;     "Here generate big the adjective 's another corpus an .")
+  ;; => ((TOP
+  ;;      ((SINV
+  ;;        ((ADVP ((RB "Here")))
+  ;;         (VP
+  ;;          ((TO "to")
+  ;;           (VP
+  ;;            ((TO "to")
+  ;;             (VP
+  ;;              ((VB "Let")
+  ;;               (SBAR
+  ;;                ((WHNP ((WP "what")))
+  ;;                 (S
+  ;;                  ((NP ((DT "this") (CD "one")))
+  ;;                   (VP
+  ;;                    ((VBZ "is")
+  ;;                     (NP ((DT "The") (NN "adjective") (NN "car")))
+  ;;                     (PP
+  ;;                      ((IN "in")
+  ;;                       (NP
+  ;;                        ((DT "The") (NN "test") (NNS "sentences")))))))))))))))))
+  ;;         (NP ((DT "another") (CD "one")))
+  ;;         (. "."))))))
+
+
+
+  )
+
+
+(defn pos-freq [pos-path]
+  (loop [remaining-path (rest pos-path)
+         current-path [(first pos-path)]
+         result {}]
+    (cond
+      (string? (first remaining-path))
+      (-> result
+          (update-in
+           (conj current-path :words)
+           #(merge-with + % {(first remaining-path) 1}))
+          (update-in
+           current-path
+           #(merge-with + % {:freq 1})))
+
+      :else
+      (recur (rest remaining-path)
+             (conj current-path (first remaining-path))
+             (update-in
+              result
+              current-path
+              #(merge-with + % {:freq 1}))))))
+
+(comment
+  (let [pos-path '(TOP S NP DT "This")]
+    (pos-freq pos-path))
+  ;; => {TOP {:freq 1, S {:freq 1, NP {:freq 1, DT {:words {"This" 1}, :freq 1}}}}}
+  )
+
+
+
+(defn pos-freqs [pos-paths]
+  (apply deep-merge-with + pos-paths))
+
+(comment
+  (let [zipper (treebank-zipper-1 ["This is a zipper test."
+                                   "And this is another one."
+                                   "This is not a test."])]
+    (->> zipper
+         leaf-pos-paths
+         (map pos-freq)
+         (apply deep-merge-with +)))
+;; => {TOP
+;;     {:freq 18,
+;;      S
+;;      {:freq 18,
+;;       NP {:freq 3, DT {:words {"This" 2, "this" 1}, :freq 3}},
+;;       VP
+;;       {:freq 11,
+;;        VBZ {:words {"is" 3}, :freq 3},
+;;        NP
+;;        {:freq 7,
+;;         DT {:words {"a" 2, "another" 1}, :freq 3},
+;;         NN {:words {"zipper" 1, "test" 2}, :freq 3},
+;;         CD {:words {"one" 1}, :freq 1}},
+;;        RB {:words {"not" 1}, :freq 1}},
+;;       . {:words {"." 3}, :freq 3},
+;;       CC {:words {"And" 1}, :freq 1}}}} 
+  )
+(defn node-constituents
+  "Given a node of a parse tree, like ('NP (('PRP$ (\"my\" 'NN (\"name\"))))),
+  returns a list of the top-level node tag and its first-level child tags.
+  "
+  [node]
+  (list
+   (first node)
+   (if (every? string? (map first (rest node)))
+     nil
+     (map first (first (rest node))))))
+
+(defn node-constituents-with-tokens
+  "Given a node of a parse tree, like ('NP (('PRP$ (\"my\" 'NN (\"name\"))))),
+  returns a list of the top-level node tag and its first-level child tags.
+
+  Like the above, but instead of converting leaf tokens to nil, leave them
+  as-is. This can be used to merge a tree keeping the leaf tokens in a set
+  and that tree can be used for a kind of markov generation of new text.
+  "
+  [node]
+  (list
+   (first node)
+   (if (every? string? (map first (rest node)))
+     (first (first (rest node)))
+     (map first (first (rest node))))))
+
+(comment
+  (let [text "My name is Eric."
+        parse-tree (->> (treebank-zipper text)
+                        (iterate zip/next)
+                        (take-while (complement zip/end?))
+                        (filter (complement zip/branch?))
+                        (map zip/path)
+                        (map last)
+                        (map node-constituents-with-tokens))]
+    parse-tree)
+  )
+
+(defn phrase-constituents
+  "Given a bit of text that can be parsed into a treebank tree,
+  Get a sequence of the tags and their chunks.
+  For example:
+    My name is Eric.
+  Returns the sequence:
+    At the TOP tag, we have a 'S part-of-speech (a clause).
+    At the 'S tag, we have a 'NP, 'VP, '. (noun-phrase + verb-phrase + period)
+    At the 'NP tag, we have a 'PRP$, 'NN (personal-pronoun + singular-noun)
+    ...
+  "
+  [text]
+  (->> (treebank-zipper text)
+       (iterate zip/next)
+       (take-while (complement zip/end?))
+       (filter (complement zip/branch?))
+       (map zip/path)
+       (map last)
+       (map node-constituents)
+       (remove #(string? (first %)))))
+
+(comment
+  (phrase-constituents "My name is Eric.")
+  ;; => ((TOP (S)) (S (NP VP .)) (NP (PRP$ NN)) (VP (VBZ NP)) (NP (NNP)))
+  (phrase-constituents "How are you?")
+  ;; => ((TOP (SBARQ)) (SBARQ (WHADVP SQ .)) (WHADVP (WRB)) (SQ (VBP NP)) (NP (PRP)))
+  )
+
+
+(defn pos-constituent-frequencies
+  "Frequencies of the parts of speech that make up phrases.
+  Example:
+    Clauses are made up of:
+      NounPhrase + VerbPhrase 2 times
+      Clause + CoordinatingConjuction + Clause 1 times
+    NounPhrases are made up of:
+      ProperNouns 2 times
+      PersonalPronoun + SingularNoun 3 times
+
+  Does not include frequencies for leaf words. By that I mean: A SingularNoun might
+  appear 5 times all together, 3 times as part of a PersonalPronoun + SingularNoun pair
+  and 2 times as part of an Adjective + SingularNoun pair, but the data structure returned
+  by this function won't include that 5 anywhere. This is due to the (remove #(nil? (second %)))
+  line. This data structure is used as a kind of markov selection process and we don't really
+  care how often the leafs are used. We just care about the ratio at which we should pick each
+  leaf from a given parent.
+  "
+  [texts]
+  (reduce
+   (fn [acc text]
+     (let [constituents (->> text
+                             phrase-constituents
+                             (remove #(nil? (second %))))]
+       (reduce
+        (fn [acc constituent]
+          (let [k1 (first constituent)
+                k2 (second constituent)]
+            (update-in acc [k1 k2] (fnil inc 0))))
+        acc
+        constituents)))
+   {}
+   texts))
+
+(comment
+  (pos-constituent-frequencies
+   ["My name is Eric."
+    "My hat is blue and I like cake."
+    "Your name is Taylor."
+    "How are you?"])
+  ;; => {TOP {(S) 3, (SBARQ) 1},
+  ;;     S {(NP VP .) 2, (S CC S .) 1, (NP VP) 2},
+  ;;     NP {(PRP$ NN) 3, (NNP) 2, (PRP) 2, (NN) 1},
+  ;;     VP {(VBZ NP) 2, (VBZ ADJP) 1, (VBP NP) 1},
+  ;;     ADJP {(JJ) 1},
+  ;;     SBARQ {(WHADVP SQ .) 1},
+  ;;     WHADVP {(WRB) 1},
+  ;;     SQ {(VBP NP) 1}}
+
+  (let [phrase "How are you today?"]
+    (->> phrase
+         tokenize
+         (string/join " ")
+         vector
+         parse
+         (map tb/make-tree)))
+
+  (let [phrase "I gave the cake to John at the store."]
+    (parse (tokenize phrase)))
+
+  (let [phrase "I've got a good feeling"]
+    (pos-tagger (tokenize phrase)))
+  )
+
+(comment
+  (let [text "My name is Eric."
+        zipper (treebank-zipper text)]
+    (map identity (seq-zip zipper)))
+
+  )
+
+(defn zipper->freqs [zipper]
+  (let [coll ()]))
+
+(defn reduce-to-freqs [tree]
+  (reduce
+   (fn [acc [pos [children]]]
+     (let [ks (map first children)
+           vs (map rest children)]
+       (-> acc
+           (assoc pos (merge-with + {children 1}))
+           )))
+   {}
+   tree))
+
+(defn text->parsed-pos-freqs
+  [text]
+  (->> text
+       (map tokenize)
+       (map (partial string/join " "))
+       parse
+       (map tb/make-tree)
+       (map unmake-tree)))
+
+(comment
+  (let [text ["My name is Eric and my style is wild."
+              "Your name is unknown and where are you going?"]]
+    (text->parsed-pos-freqs text))
+
+  )
+
+(defn text->pos
+  "'My name is Eric.' -> (['my' 'PRP$'] ['name' 'NN'] ,,,)"
+  [text]
+  (->> text
+       tokenize
+       pos-tagger
+       (map #(vector (string/lower-case (first %)) (second %)))))
+
+(comment
+  (text->pos "My name is Eric.")
+  ;; => (["my" "PRP$"] ["name" "NN"] ["is" "VBZ"] ["eric" "NNP"] ["." "."])
+  )
+
+(defn text->pos-freqs
+  "Turn a line of text into a map of parts of speech to word frequencies.
+  Useful if you have a corpus and need to sample a particular part-of-speech."
+  [text]
+  (let [pos (text->pos text)]
+    (into
+     {}
+     (->> pos
+          (map reverse)
+          (reduce
+           (fn [acc [pos word]]
+             (update
+              acc
+              (symbol pos)
+              #(update % word (fnil inc 0))))
+           {})))))
+
+(comment
+  (text->pos-freqs "My name is Eric and my style is wild.")
+  ;; => {PRP$ {"my" 2},
+  ;;     NN {"name" 1, "style" 1},
+  ;;     VBZ {"is" 2},
+  ;;     NNP {"eric" 1},
+  ;;     CC {"and" 1},
+  ;;     RB {"wild" 1},
+  ;;     . {"." 1}}
+  )
+
+(defn texts->pos-freqs
+  "Convenience method for turning multiple lines of texts into a
+  tree with their pos-word-freqs merged."
+  [texts]
+  (->> texts
+       (map text->pos-freqs)
+       (apply
+        merge-with
+        (fn [old new]
+          (merge-with + old new)))))
+
+(comment
+  (let [texts ["My name is Eric and my style is wildish."
+               "Your name is unknown and your style is childish."]]
+    (texts->pos-freqs texts))
+  ;; => {PRP$ {"my" 2, "your" 2},
+  ;;     NN {"name" 2, "style" 2},
+  ;;     VBZ {"is" 4},
+  ;;     NNP {"eric" 1},
+  ;;     CC {"and" 2},
+  ;;     JJ {"wildish" 1, "unknown" 1, "childish" 1},
+  ;;     . {"." 2}}
+  )
+
+(defn pos-freqs->reverse
+  "For getting a map of words to their pos frequencies"
+  [tree]
+  (reduce
+   (fn [acc [pos word-freqs]]
+     (reduce
+      (fn [acc [word freq]]
+        (update acc word (fnil conj {}) {pos freq}))
+      acc
+      word-freqs))
+   {}
+   tree))
+
+(comment
+  (let [pos-freqs {'NN {"name" 2 "style" 2}
+                   'VBZ {"is" 4 "name" 1}}]
+    (pos-freqs->reverse pos-freqs))
+  ;; => {"name" {NN 2, VBZ 1}, "style" {NN 2}, "is" {VBZ 4}}
+  )
+
+(defn pos-tree->str [pos-tree]
+  (->> pos-tree
+       (map (fn [[k v]]
+              (cons (str k) (seq v))))
+       (map #(string/join " " %))
+       (string/join "\n")))
+
+(defn str->pos-tree [s]
+  (let [lines (string/split s #"\n")]
+    (into
+     {}
+     (map
+      (fn [line]
+        (->> line
+             (#(string/split % #" " 2))
+             ((fn [[k v]]
+                (vector (symbol k) (into #{} (string/split v #" ")))))))
+      lines))))
+
+(comment
+  (text->pos-freqs "My name is Eric.")
+
+  (->> ["My name is Eric."
+        "Your name is Taylor."
+        "Is your feeling hurt?"
+        "How are you feeling?"
+        "It is a lovely feeling."
+        "Your name is not Eric."
+        "Who is your mother and what does she do?"]
+       (pos-constituent-frequencies)
+       #_#_(apply
+        merge-with
+        (fn [a b]
+          (merge-with + a b)))
+       (pos-freqs->reverse))
+
+  )
+
+(defn reverse-children-to-parents
+  "We want to work from the reverse, or backwards, representation of pos-constituents-frequencies.
+  Pos-constituent-frequencies is good for generating text top-down. But we want
+  to generate text bottom-up for rhyming."
+  [pos-freqs]
+  (reduce
+   (fn [acc [k v]]
+     (reduce
+      (fn [acc [children freq]]
+        (update acc children assoc k freq))
+      acc
+      v))
+   {}
+   pos-freqs))
+
+(defn ways-to-make-a [pos pos-freqs]
+  (let [children (pos-freqs pos)]
+    (->> children
+         ((juxt keys vals))
+         (apply map vector)
+         (into {}))))
+
+(comment
+  (let [pos-freqs (->> ["My name is Eric."
+                        "Your name is Taylor."
+                        "Is your feeling hurt?"
+                        "How are you feeling?"
+                        "It is a lovely feeling."
+                        "Your name is not Eric."
+                        "Who is your mother and what does she do?"]
+                       (pos-constituent-frequencies))]
+    (->> pos-freqs
+         #_(ways-to-make-a 'S)
+         (#(get % 'NN))))
+
+  (let [pos-freqs (->> ["My name is Eric."
+                        "Your name is Taylor."
+                        "Is your feeling hurt?"
+                        "How are you feeling?"
+                        "It is a lovely feeling."
+                        "Your name is not Eric."
+                        "Who is your mother and what does she do?"]
+                       (pos-constituent-frequencies))]
+    (->> (reverse-children-to-parents pos-freqs)
+         (remove (fn [[k v]] (and (= 1 (count k))
+                                  (pos-freqs (first k)))))
+         (into {})))
+
+  )