Add comments to nlp/g code
parent c0ad5acefc
commit 0c7da21610
@@ -1,210 +0,0 @@
(ns com.owoga.prhyme.util.nlp
  (:require [opennlp.nlp :as nlp]
            [opennlp.treebank :as tb]
            [clojure.string :as string]
            [clojure.java.io :as io]
            [clojure.zip :as zip]
            [com.owoga.prhyme.nlp.tag-sets.treebank-ii :as tb2])
  (:import (opennlp.tools.postag POSModel POSTaggerME)))

(def tokenize (nlp/make-tokenizer (io/resource "models/en-token.bin")))
(def get-sentences (nlp/make-sentence-detector (io/resource "models/en-sent.bin")))
(def parse (tb/make-treebank-parser (io/resource "models/en-parser-chunking.bin")))
(def pos-tagger (nlp/make-pos-tagger (io/resource "models/en-pos-maxent.bin")))
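;; Quick sanity checks for the model-backed helpers above. The exact
;; output depends on the bundled OpenNLP models, so the values shown
;; here are only illustrative of the shape of the results.
(comment
  (tokenize "My name is Eric.")
  ;; => e.g. ("My" "name" "is" "Eric" ".")
  (get-sentences "My name is Eric. Your name is Taylor.")
  ;; => e.g. ("My name is Eric." "Your name is Taylor.")
  )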
;;;; The tagger that opennlp.nlp gives us doesn't provide access
;;;; to the probabilities of all tags. It gives us the probability of the
;;;; top tag through some metadata. But to get probs for all tags, we
;;;; need to implement our own tagger.
(defprotocol Tagger
  (tags [this sent])
  (probs [this])
  (top-k-sequences [this sent]))
(defn make-pos-tagger
  [modelfile]
  (let [model (with-open [model-stream (io/input-stream modelfile)]
                (POSModel. model-stream))
        tagger (POSTaggerME. model)]
    (reify Tagger
      (tags [_ tokens]
        (let [token-array (into-array String tokens)]
          (map vector tokens (.tag tagger #^"[Ljava.lang.String;" token-array))))
      (probs [_] (seq (.probs tagger)))
      (top-k-sequences [_ tokens]
        (let [token-array (into-array String tokens)]
          (.topKSequences tagger #^"[Ljava.lang.String;" token-array))))))

(def prhyme-pos-tagger (make-pos-tagger (io/resource "models/en-pos-maxent.bin")))
(comment
  (let [phrase "The feeling hurts."]
    (map (juxt #(.getOutcomes %)
               #(map float (.getProbs %)))
         (top-k-sequences prhyme-pos-tagger (tokenize phrase))))
  ;; => ([["DT" "NN" "VBZ" "."] (0.9758878 0.93964833 0.7375927 0.95285994)]
  ;;     [["DT" "VBG" "VBZ" "."] (0.9758878 0.03690145 0.27251 0.9286113)])
  )
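;; The `probs` method wraps POSTaggerME.probs, which reports the tag
;; probabilities for the sentence most recently passed to `tags`, so it is
;; only meaningful right after a `tags` call. A hedged sketch; the numbers
;; shown are placeholders rather than actual model output.
(comment
  (let [tagger prhyme-pos-tagger
        tagged (doall (tags tagger (tokenize "The feeling hurts.")))]
    [tagged (probs tagger)])
  ;; => [(["The" "DT"] ["feeling" "NN"] ["hurts" "VBZ"] ["." "."])
  ;;     (0.97... 0.93... 0.73... 0.95...)]
  )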
(defn valid-sentence?
  "Tokenizes and parses the phrase using OpenNLP models from
  http://opennlp.sourceforge.net/models-1.5/

  If the parse tree has a clause as the top-level tag, then
  we consider it a valid English sentence."
  [phrase]
  (->> phrase
       tokenize
       (string/join " ")
       vector
       parse
       first
       tb/make-tree
       :chunk
       first
       :tag
       tb2/clauses
       boolean))
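;; Hedged examples of `valid-sentence?`. With the standard chunking model,
;; "My name is Eric." parses to a clause (S) under TOP (see the
;; phrase-constituents examples below), so it should return true; a bare
;; noun-phrase fragment has no clause tag at the top level, so it should
;; return false. Actual results depend on the parser model.
(comment
  (valid-sentence? "My name is Eric.")
  ;; => true
  (valid-sentence? "the blue hat")
  ;; => false (likely parses to a fragment/noun phrase, not a clause)
  )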
(defn unmake-tree
  "Tokenizing and then parsing a sentence returns a string
  representation of the parse tree. This is a helper function
  to make working with the parse tree more convenient. We
  can use `opennlp.treebank/make-tree` to make a clojure map
  representation of the tree, then we can `unmake` the tree
  to turn it into a list representation of the tree that
  we can easily use in a clojure zipper. (read-string almost works,
  but falls apart when reading things like commas)."
  [node]
  (cond
    (string? node) node
    (map? node) (list (:tag node) (unmake-tree (:chunk node)))
    :else (map unmake-tree node)))
(comment
  (let [phrase "Hello, Eric"]
    (->> phrase
         tokenize
         (string/join " ")
         vector
         parse
         (map tb/make-tree)
         unmake-tree))
  ;; => ((TOP ((S ((INTJ ((UH ("Hello")))) (, (",")) (. ("Eric")))))))
  )
(defn treebank-zipper
  "Turns a bit of text into a parse tree into a zipper."
  [text]
  (let [tree (->> text
                  tokenize
                  (string/join " ")
                  vector
                  parse
                  (map tb/make-tree)
                  unmake-tree)]
    (zip/zipper seq? seq (fn [_ c] c) tree)))
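;; Hedged sketch of walking the zipper returned by `treebank-zipper`.
;; The expected result is based on the `unmake-tree` example above;
;; actual output depends on the parser model.
(comment
  (->> (treebank-zipper "Hello, Eric")
       (iterate zip/next)
       (take-while (complement zip/end?))
       (map zip/node)
       (filter string?))
  ;; => ("Hello" "," "Eric")
  )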
(defn node-constituents
  "Given a node of a parse tree, like ('NP (('PRP$ (\"my\")) ('NN (\"name\")))),
  returns a list of the top-level node tag and its first-level child tags."
  [node]
  (list
   (first node)
   (if (every? string? (map first (rest node)))
     nil
     (map first (first (rest node))))))
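;; Hedged examples of `node-constituents` on hand-written nodes shaped like
;; the lists produced by `unmake-tree`; these are not actual parser output.
(comment
  (node-constituents '(NP ((PRP$ ("my")) (NN ("name")))))
  ;; => (NP (PRP$ NN))
  (node-constituents '(NN ("name")))
  ;; => (NN nil)  ; children are all strings, i.e. a leaf word
  )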
(defn phrase-constituents
  "Given a bit of text that can be parsed into a treebank tree,
  get a sequence of the tags and their chunks.
  For example:
    My name is Eric.
  Returns the sequence:
    At the TOP tag, we have a 'S part-of-speech (a clause).
    At the 'S tag, we have a 'NP, 'VP, '. (noun-phrase + verb-phrase + period)
    At the 'NP tag, we have a 'PRP$, 'NN (personal-pronoun + singular-noun)
    ..."
  [text]
  (->> (treebank-zipper text)
       (iterate zip/next)
       (take-while (complement zip/end?))
       (filter (complement zip/branch?))
       (map zip/path)
       (map last)
       (map node-constituents)
       (remove #(string? (first %)))))
(comment
  (phrase-constituents "My name is Eric.")
  ;; => ((TOP (S)) (S (NP VP .)) (NP (PRP$ NN)) (VP (VBZ NP)) (NP (NNP)))
  (phrase-constituents "How are you?")
  ;; => ((TOP (SBARQ)) (SBARQ (WHADVP SQ .)) (WHADVP (WRB)) (SQ (VBP NP)) (NP (PRP)))
  )
(defn pos-constituent-frequencies
  "Frequencies of the parts of speech that make up phrases.
  Example:
    Clauses are made up of:
      NounPhrase + VerbPhrase 2 times
      Clause + CoordinatingConjunction + Clause 1 time
    NounPhrases are made up of:
      ProperNouns 2 times
      PersonalPronoun + SingularNoun 3 times

  Does not include frequencies for leaf words. By that I mean: a SingularNoun might
  appear 5 times altogether, 3 times as part of a PersonalPronoun + SingularNoun pair
  and 2 times as part of an Adjective + SingularNoun pair, but the data structure returned
  by this function won't include that 5 anywhere. This is due to the (remove #(nil? (second %)))
  line. This data structure is used as a kind of Markov selection process and we don't really
  care how often the leaves are used. We just care about the ratio at which we should pick each
  leaf from a given parent."
  [texts]
  (reduce
   (fn [acc text]
     (let [constituents (->> text
                             phrase-constituents
                             (remove #(nil? (second %))))]
       (reduce
        (fn [acc constituent]
          (let [k1 (first constituent)
                k2 (second constituent)]
            (update-in acc [k1 k2] (fnil inc 0))))
        acc
        constituents)))
   {}
   texts))
(comment
  (pos-constituent-frequencies
   ["My name is Eric."
    "My hat is blue and I like cake."
    "Your name is Taylor."
    "How are you?"])
  ;; => {TOP {(S) 3, (SBARQ) 1},
  ;;     S {(NP VP .) 2, (S CC S .) 1, (NP VP) 2},
  ;;     NP {(PRP$ NN) 3, (NNP) 2, (PRP) 2, (NN) 1},
  ;;     VP {(VBZ NP) 2, (VBZ ADJP) 1, (VBP NP) 1},
  ;;     ADJP {(JJ) 1},
  ;;     SBARQ {(WHADVP SQ .) 1},
  ;;     WHADVP {(WRB) 1},
  ;;     SQ {(VBP NP) 1}}

  (let [phrase "How are you today?"]
    (->> phrase
         tokenize
         (string/join " ")
         vector
         parse
         (map tb/make-tree)))

  (let [phrase "I gave the cake to John at the store."]
    (parse (tokenize phrase)))

  (let [phrase "I've got a good feeling"]
    (pos-tagger (tokenize phrase)))
  )