Add comments to nlp/g code

5 years ago · 0c7da21610
parent c0ad5acefc
commit 0c7da21610
6 changed files with 478 additions and 381 deletions
--- a/dev/examples/core.clj
+++ b/dev/examples/core.clj
@ -1,16 +1,21 @@
 (ns examples.core
  (:require [clojure.string :as string]
            [clojure.set]
            [clojure.java.io :as io]
            [com.owoga.prhyme.frp :as frp]
            [com.owoga.prhyme.util :as util]
            [com.owoga.prhyme.core :as prhyme]
            [com.owoga.prhyme.data.bigrams :as bigrams]
            [com.owoga.prhyme.gen :as gen]
            [com.owoga.prhyme.nlp.core :as nlp]
            [com.owoga.prhyme.nlg.core :as nlg]
            [com.owoga.prhyme.data.dictionary :as dict]
            [com.owoga.prhyme.data.thesaurus :as thesaurus]
            [com.owoga.prhyme.data.darklyrics :as darklyrics]
            [com.owoga.prhyme.generation.weighted-selection :as weighted]
            [clojure.set :as set]
-            [clojure.zip :as zip]))
+            [clojure.zip :as zip]
            [clojure.walk :as walk]))
 (defn weight-fn [word target result]
  (let [rimes (frp/consecutive-matching word target :rimes)
@ -190,3 +195,91 @@
            (map :normalized-word)))))
  )
 (defn remove-sentences-with-words-not-in-dictionary [dictionary]
  (let [dictionary (into #{} dictionary)]
    (fn [sentences]
      (->> sentences
           (map #(string/split % #" "))
           (remove #(some (complement dictionary) %))
           (remove #(some string/blank? %))
           (map #(string/join " " %))))))
 (defn dark-pos-freqs []
  (let [directory "dark-corpus"]
    (->> (file-seq (io/file directory))
         (remove #(.isDirectory %))
         (drop 10)
         (take 10)
         (map slurp)
         (map util/clean-text)
         (filter dict/english?)
         (map #(string/split % #"\n+"))
         (map (remove-sentences-with-words-not-in-dictionary dict/popular))
         (map nlp/treebank-zipper)
         (map nlp/leaf-pos-path-word-freqs)
         (apply nlp/deep-merge-with +))))
 (defn dark-structures []
  (let [directory "dark-corpus"]
    (->> (file-seq (io/file directory))
         (remove #(.isDirectory %))
         (take 1000)
         (map slurp)
         (map util/clean-text)
         (filter dict/english?)
         (map #(string/split % #"\n+"))
         (map #(remove string/blank? %))
         (map nlp/parse-to-simple-tree)
         (map nlp/parse-tree-sans-leaf-words)
         (map
          (fn [lines]
            (map
             (fn [line]
               (hash-map line 1))
             lines)))
         (map (partial merge-with +))
         flatten
         (apply merge-with +))))
 (comment
  (time (def example-pos-freqs (dark-pos-freqs)))
  example-pos-freqs
  (take 20 example-pos-freqs)
  (time (def example-structures (dark-structures)))
  (def common-example-structures
    (filter
     #(< 10 (second %))
     example-structures))
  (count common-example-structures)
  (let [structure (rand-nth (seq common-example-structures))
        zipper (zip/seq-zip (first structure))]
    (loop [zipper zipper]
      (let [path (map first (zip/path zipper))]
        (cond
          (zip/end? zipper) (zip/root zipper)
          (and (not-empty path)
               (example-pos-freqs path))
          (recur
           (-> zipper
               zip/up
               (zip/append-child
                (first
                 (rand-nth
                  (seq
                   (example-pos-freqs path)))))
               zip/down
               zip/next
               zip/next))
          :else (recur (zip/next zipper))))))
  (get-in {:a 1} '())
  (let [zipper (zip/seq-zip '(TOP (S (NP) (VB))))]
    (-> zipper
        zip/down
        zip/right
        zip/node))
  )
--- a/src/com/owoga/prhyme/gen.clj
+++ b/src/com/owoga/prhyme/gen.clj
@ -3,7 +3,7 @@
            [com.owoga.prhyme.util.math :as math]
            [com.owoga.prhyme.util :as util]
            [com.owoga.prhyme.util.weighted-rand :as weighted-rand]
-            [com.owoga.prhyme.util.nlp :as nlp]
+            [com.owoga.prhyme.nlp.core :as nlp]
            [com.owoga.prhyme.frp :as frp]
            [com.owoga.prhyme.core :as prhyme]))
--- a/src/com/owoga/prhyme/nlp/core.clj
+++ b/src/com/owoga/prhyme/nlp/core.clj
@ -4,7 +4,9 @@
            [clojure.string :as string]
            [clojure.java.io :as io]
            [clojure.zip :as zip]
-            [com.owoga.prhyme.nlp.tag-sets.treebank-ii :as tb2])
+            [com.owoga.prhyme.nlp.tag-sets.treebank-ii :as tb2]
            [com.owoga.prhyme.util.weighted-rand :as weighted-rand]
            [clojure.walk :as walk])
  (:import (opennlp.tools.postag POSModel POSTaggerME)))
 (def tokenize (nlp/make-tokenizer (io/resource "models/en-token.bin")))
@ -44,13 +46,6 @@
         (top-k-sequences prhyme-pos-tagger (tokenize phrase))))
  ;; => ([["DT" "NN" "VBZ" "."] (0.9758878 0.93964833 0.7375927 0.95285994)]
  ;;     [["DT" "VBG" "VBZ" "."] (0.9758878 0.03690145 0.27251 0.9286113)])
  (let [phrase "Blood falls."]
    (->> phrase
         tokenize
         (top-k-sequences prhyme-pos-tagger)
         (map (juxt #(.getOutcomes %)
                    #(map float (.getProbs %))))))
  )
 (defn deep-merge-with [f & maps]
@ -67,6 +62,7 @@
  ;; => {:a 1, :b {:b 7}, :c 3}
  )
 (defn valid-sentence?
  "Tokenizes and parses the phrase using OpenNLP models from
  http://opennlp.sourceforge.net/models-1.5/
@ -95,7 +91,16 @@
  representation of the tree, then we can `unmake` the tree
  to turn it into a list representation of the tree that
  we can easily use in a clojure zipper. (read-string almost works,
-  but falls apart when reading things like commas)."
+  but falls apart when reading things like commas).
  This doesn't map to the string returned by the parser.
  Children here are all nested inside a single list.
  For example: (NP ((DT 'a') (NN ('sample')) (NN 'test')))
  rather than: (NP (DT 'a') (NN 'sample') (NN 'test'))
  If you want a data structure that is in parity with the
  string returned by the parser, look at `parse-to-simple-tree`
  "
  [node]
  (cond
    (string? node) node
@ -103,42 +108,78 @@
    :else (map unmake-tree node)))
 (comment
-  (let [phrase "Hello, Eric"]
+  (let [text-lines ["This is a sample test."
-    (->> phrase
+                    "This is another line."]]
-         tokenize
+    (->> text-lines
-         (string/join " ")
+         (map tokenize)
-         vector
+         (map (partial string/join " "))
         parse
         (map tb/make-tree)
-         unmake-tree))
+         (map unmake-tree)))
-  ;; => ((TOP ((S ((INTJ ((UH ("Hello")))) (, (",")) (. ("Eric")))))))
+  ;; => ((TOP
  ;;      ((S
  ;;        ((NP ((DT ("This"))))
  ;;         (VP ((VBZ ("is")) (NP ((DT ("a")) (NN ("sample")) (NN ("test"))))))
  ;;         (. ("."))))))
  ;;     (TOP
  ;;      ((S
  ;;        ((NP ((DT ("This"))))
  ;;         (VP ((VBZ ("is")) (NP ((DT ("another")) (NN ("line"))))))
  ;;         (. (".")))))))
  )
-(defn treebank-zipper
+
-  "Turns a bit of text into a parse tree into a zipper."
+(defn parse-to-simple-tree
-  [text]
+  "Returns a Clojure data structure, a list, in the shape of a tree, that
-  (let [tree (->> text
+  matches the structure returned by the parser.
-                  tokenize
+
-                  (string/join " ")
+  The implementation takes a roundabout path. It uses `tb/make-tree` to
-                  vector
+  get a map/list data structure, it then unmakes the tree info a data
  structure that can be `walk`ed, it then walks the tree making modifications
  to bring it in line with the parse string structure.
  "
  [text-lines]
  (->> text-lines
       (map tokenize)
       (map (partial string/join " "))
       parse
       (map tb/make-tree)
-                  unmake-tree)]
+       (map unmake-tree)
-    (zip/seq-zip tree)))
+       (map
        #(walk/postwalk
          (fn [node]
            (cond
              (and (seq? node)
                   (symbol? (first node))
                   (and (seq? (second node))
                        (seq? (first (second node)))))
              (cons (first node) (second node))
              (and (seq? node)
                   (string? (first node)))
              (first node)
              :else node))
          %))))
 (comment
-  (let [zipper (treebank-zipper "This is a zipper test.")]
+  ;; This is what the parsed string looks like.
-    zipper)
+  (let [text-lines ["Hello, world!"]]
-  ;; => [((TOP
+    (->> text-lines (map tokenize) (map (partial string/join " ")) parse))
-  ;;       ((S
+  ;; => ["(TOP (FRAG (INTJ (UH Hello)) (, ,) (NP (NN world)) (. !)))"]
-  ;;         ((NP ((DT ("This"))))
+
-  ;;          (VP ((VBZ ("is")) (NP ((DT ("a")) (NN ("zipper")) (NN ("test"))))))
+  ;; And this is what the simple-tree Clojure data structure looks like.
-  ;;          (. (".")))))))
+  (let [text-lines ["Hello, world!"]]
-  ;;     nil]
+    (parse-to-simple-tree text-lines))
  ;; => ((TOP (FRAG (INTJ (UH "Hello")) (, ",") (NP (NN "world")) (. "!")))) 
  )
-(defn treebank-zipper-1
+(defn treebank-zipper
-  "Turns a bit of text into a parse tree into a zipper."
+  "Turns a bit of text into a parse tree into a zipper.
  Porcelain. If you have the simple tree data structure
  returned by `parse-to-simple-tree`, then you can just
  pass that directly to `zip/seq-zip`."
  [texts]
  (let [tree (->> texts
                  (map tokenize)
@ -148,14 +189,30 @@
                  unmake-tree)]
    (zip/seq-zip tree)))
-(defn seq-zip [zipper]
+(comment
  (let [texts ["Eric's test is difficult."]]
    (loop [zipper (treebank-zipper texts)]
      (cond
        (zip/end? zipper) (zip/root zipper)
        (= 'JJ (zip/node zipper)) (recur (-> zipper
                                             zip/next
                                             (zip/replace '("thorough"))))
        :else (recur (zip/next zipper)))))
  ;; => ((TOP
  ;;      ((S
  ;;        ((NP ((NP ((NNP ("Eric")) (POS ("'s")))) (NN ("test"))))
  ;;         (VP ((VBZ ("is")) (ADJP ((JJ ("thorough"))))))
  ;;         (. (".")))))))
  )
 (defn iter-zip [zipper]
  (->> zipper
       (iterate zip/next)
       (take-while (complement zip/end?))))
-(defn seq-nodes [zipper]
+(defn iter-nodes [zipper]
  (->> zipper
-       seq-zip
+       iter-zip
       (map zip/node)))
 (defn phrase-level? [node]
@ -168,9 +225,19 @@
       (= 1 (count (second node)))
       (string? (first (second node)))))
-(defn leaf-pos-paths [zipper]
+(defn leaf-pos-paths
  "Seq of the path down the parse tree to each leaf part-of-speech.
  Useful to aggregate over a corpus information like how often
  different words are used as different parts of speech.
  Benefit of having the entire path down the tree is that you
  can know things like 'A noun phrase as a child of a verb phrase
  never has a determiner.'
  "
  [zipper]
  (->> zipper
-       seq-zip
+       iter-zip
       (filter (complement zip/branch?))
       (map zip/path)
       (map #(map first %))
@ -182,23 +249,79 @@
       (filter #(string? (last %)))))
 (comment
-  (let [zipper (treebank-zipper-1 ["This is a zipper test."
+  (let [zipper (treebank-zipper ["Commas, they work."
-                                   "And this is another one."])]
+                                 "Eric's test is difficult."
                                 "Eric's testing the code."])]
    (leaf-pos-paths zipper))
-;; => ((TOP S NP DT "This")
+  ;; => ((TOP S NP NNP "Commas")
-;;     (TOP S VP VBZ "is")
+  ;;     (TOP S , ",")
-;;     (TOP S VP NP DT "a")
+  ;;     (TOP S NP PRP "they")
-;;     (TOP S VP NP NN "zipper")
+  ;;     (TOP S VP VBP "work")
 ;;     (TOP S VP NP NN "test")
  ;;     (TOP S . ".")
-;;     (TOP S CC "And")
+  ;;     (TOP S NP NP NNP "Eric")
-;;     (TOP S NP DT "this")
+  ;;     (TOP S NP NP POS "'s")
  ;;     (TOP S NP NN "test")
  ;;     (TOP S VP VBZ "is")
-;;     (TOP S VP NP DT "another")
+  ;;     (TOP S VP ADJP JJ "difficult")
-;;     (TOP S VP NP CD "one")
+  ;;     (TOP S . ".")
  ;;     (TOP S NP NNP "Eric")
  ;;     (TOP S VP VBZ "'s")
  ;;     (TOP S VP VP VBG "testing")
  ;;     (TOP S VP VP NP DT "the")
  ;;     (TOP S VP VP NP NN "code")
  ;;     (TOP S . "."))
  )
 (defn leaf-pos-path-word-freqs [zipper]
  (->> zipper
      leaf-pos-paths
      (map #(hash-map (butlast %) {(last %) 1}))
      (apply deep-merge-with +)))
 (comment
  (let [zipper (treebank-zipper ["Eric's test is difficult."
                                 "Eric's test is thorough."
                                 "Eric's testing."])]
    (leaf-pos-path-word-freqs zipper))
  ;; => {(TOP NP .) {"." 1},
  ;;     (TOP NP NP POS) {"'s" 1},
  ;;     (TOP S NP NP NNP) {"Eric" 2},
  ;;     (TOP NP NN) {"testing" 1},
  ;;     (TOP S VP VBZ) {"is" 2},
  ;;     (TOP S .) {"." 2},
  ;;     (TOP S NP NP POS) {"'s" 2},
  ;;     (TOP NP NP NNP) {"Eric" 1},
  ;;     (TOP S NP NN) {"test" 2},
  ;;     (TOP S VP ADJP JJ) {"difficult" 1, "thorough" 1}}
  )
 (defn parse-tree-sans-leaf-words
  "Takes a 'simple' parse tree (`parse-to-simple-tree`)
  Removes the leaf words from the tree.
  Useful if you want to work with the structure of something
  without caring about the actual words."
  [tree]
  (walk/postwalk
   (fn [node]
     (if (and (seq? node)
              (string? (second node)))
       (take 1 node)
       node))
   tree))
 (comment
  (let [tree (parse-to-simple-tree ["Eric's test is difficult."
                                    "Eric's test is thorough."
                                    "Eric's testing."
                                    "Eric is testing."])]
    (parse-tree-sans-leaf-words tree))
  ;; => ((TOP (S (NP (NP (NNP) (POS)) (NN)) (VP (VBZ) (ADJP (JJ))) (.)))
  ;;     (TOP (S (NP (NP (NNP) (POS)) (NN)) (VP (VBZ) (ADJP (JJ))) (.)))
  ;;     (TOP (NP (NP (NNP) (POS)) (NN) (.)))
  ;;     (TOP (S (NP (NNP)) (VP (VBZ) (VP (VBG))) (.))))
  )
 ;;;; This is not sufficient
 ;; You'll end up with a mapping that says a verb phrase can be a
@ -232,7 +355,7 @@
  "
  [zipper]
  (->> zipper
-       seq-nodes
+       iter-nodes
       (filter phrase-level?)
       (map
        (fn [[pos xs]]
@ -246,29 +369,47 @@
       (apply deep-merge-with +)))
 (comment
-  (let [zipper (treebank-zipper-1 ["This is a test and that is not a test."
+  (let [zipper (treebank-zipper ["Eric's test is difficult."
-                                   "My name is Eric."
+                                 "Eric's test is thorough."
-                                   "Go to the store."
+                                 "Eric's testing."])
                                   "Your name is not Eric."
                                   "This is a sample test."])
        freqs (->> zipper
                   pos->children-freqs)]
    freqs)
-;; => {NP {(DT) 3, (DT NN) 3, (PRP$ NN) 2, (NNP) 2, (DT NN NN) 1},
+  ;; => {NP {(NP NN) 2, (NNP POS) 3, (NP NN .) 1},
-;;     VB {"Go" 1},
+  ;;     ADJP {(JJ) 2},
-;;     VBZ {"is" 5},
+  ;;     VBZ {"is" 2},
-;;     S {(S CC S .) 1, (NP VP) 2, (NP VP .) 3},
+  ;;     S {(NP VP .) 2},
-;;     RB {"not" 2},
+  ;;     NNP {"Eric" 3},
-;;     NNP {"Eric" 2},
+  ;;     JJ {"difficult" 1, "thorough" 1},
-;;     TO {"to" 1},
+  ;;     TOP {(S) 2, (NP) 1},
-;;     DT {"This" 2, "a" 3, "that" 1, "the" 1},
+  ;;     POS {"'s" 3},
-;;     TOP {(S) 4, (VP) 1},
+  ;;     NN {"test" 2, "testing" 1},
-;;     CC {"and" 1},
+  ;;     VP {(VBZ ADJP) 2},
-;;     NN {"test" 3, "name" 2, "store" 1, "sample" 1},
+  ;;     . {"." 3}}
-;;     PP {(TO NP) 1},
+  )
-;;     VP {(VBZ NP) 3, (VBZ RB NP) 2, (VB PP .) 1},
+
-;;     PRP$ {"My" 1, "Your" 1},
+(defn remove-sentences-with-words-not-in-dictionary [dictionary]
-;;     . {"." 5}} 
+  (let [dictionary (into #{} dictionary)]
    (fn [sentences]
      (println sentences)
      (println dictionary)
      (->> sentences
           (map #(string/split % #" "))
           (remove #(some (complement dictionary) %))
           (remove #(some string/blank? %))
           (map #(string/join " " %))))))
 (comment
  (let [dictionary ["this" "is" "a" "test"]
        remove-fn (remove-sentences-with-words-not-in-dictionary
                   dictionary)
        sentences ["this is a foobar test"
                   "Eric is a test"
                   "this is a test"
                   "a test this is"]]
    (->> sentences
         remove-fn))
  ;; => ("this is a test" "a test this is")
  )
 (defn generate-from-freqs
@ -276,86 +417,78 @@
  and the number of times each constituent is seen in a corpus.
  start is the part-of-speech to start generating for.
  Creates a zipper that will initially have just the starting node.
  Iterates through the zipper with zip/next. When we encounter a node
  that exists in the frequencies map, then replace the node we are on
  with a value from the map.
  A problem with this is that we might not always get a valid structure.
  For example, a noun phrase might not start with a determiner 20% of the time,
  but 100% of the time when the noun phrase follows a verb phrase.
  So if we are just selecting based on frequency, we'll end up worse off than
  if we take into account the entire context we're in.
  "
  [freqs start]
  (let [zipper (zip/seq-zip (list start))]
    (loop [zipper zipper]
      (let [cur-freqs (freqs (zip/node zipper))]
        (cond
          (zip/end? zipper) (zip/root zipper)
          (zip/branch? zipper) (recur (zip/next zipper))
-        (freqs (zip/node zipper))
+          cur-freqs
          (let [selection (first (weighted-rand/weighted-selection second cur-freqs))]
            (recur
             (zip/next
              (zip/next
               (zip/replace
                zipper
-            (list
+                (list (zip/node zipper) selection))))))
             (zip/node zipper)
             (first (rand-nth (seq (freqs (zip/node zipper))))))))))
-        :else (recur (zip/next zipper))))))
+          :else (recur (zip/next zipper)))))))
 (comment
-  (let [zipper (treebank-zipper-1 ["This is a zipper test in the car."
+  (letfn [;; This is just a helper function to get the string leafs of a tree and
-                                   "And this is another one."
+          ;; join them into a single string.
-                                   "Here are some sample sentences."
+          (string-leaf-nodes [tree]
-                                   "Let's see what we can generate."
+            (->> tree
                                   "This is a big adjective."
                                   "That's an ugly sentence."
                                   "The corpus will be simple to start."])
        freqs (->> zipper
                   pos->children-freqs)]
    freqs
    (->> (generate-from-freqs freqs 'TOP))
    (->> (repeatedly
          (fn []
            (->> (generate-from-freqs freqs 'TOP)
                 (zip/seq-zip)
                 (iterate zip/next)
                 (take-while (complement zip/end?))
                 (filter (complement zip/branch?))
                 (map zip/node)
                 (filter string?)
-                 (string/join " "))))
+                 (string/join " ")))]
-         (filter valid-sentence?)
+    (let [ ;; Create an initial corpus.
-         (take 10)))
+          zipper (treebank-zipper ["This is a zipper test in the car."
-  ;; => ("Here be what start big And This test see what be That one start . . That one ."
+                                   "And this is another one."
-  ;;     "Here are This ugly zipper ."
+                                   "Here are some sample sentences."
-  ;;     "And an is The sample ."
+                                   "Let's see what we can generate."
-  ;;     "generate simple see This corpus test to Let And a Let what an corpus adjective to be And another ugly car can see ugly start what this one to are . . . . . . ."
+                                   "This is a big adjective."
-  ;;     "This sample sentences Let big 's a sample adjective in we ."
+                                   "That's a short adjective."
-  ;;     "will generate what Let"
+                                   "The corpus will be a simple start."])
-  ;;     "Here be ugly That sentence generate a big test ."
+          ;; Extract parts-of-speech frequencies from the corpus.
-  ;;     "Here is this corpus sentences in a test this sample sentences ."
+          freqs (pos->children-freqs zipper)]
-  ;;     "And some ugly zipper 's 's ."
+      ;; Let's generate a bunch of noun phrases.
-  ;;     "Here generate big the adjective 's another corpus an .")
+      (repeatedly
-  ;; => ((TOP
+       10
-  ;;      ((SINV
+       (fn []
-  ;;        ((ADVP ((RB "Here")))
+         (->> (generate-from-freqs freqs 'NP)
-  ;;         (VP
+              string-leaf-nodes)))))
-  ;;          ((TO "to")
+  ;; => ("another simple corpus"
-  ;;           (VP
+  ;;     "we"
-  ;;            ((TO "to")
+  ;;     "a big start"
-  ;;             (VP
+  ;;     "This start"
-  ;;              ((VB "Let")
+  ;;     "'s"
-  ;;               (SBAR
+  ;;     "a"
-  ;;                ((WHNP ((WP "what")))
+  ;;     "a test sentences"
-  ;;                 (S
+  ;;     "a sample"
-  ;;                  ((NP ((DT "this") (CD "one")))
+  ;;     "another car"
-  ;;                   (VP
+  ;;     "this car sentences")
  ;;                    ((VBZ "is")
  ;;                     (NP ((DT "The") (NN "adjective") (NN "car")))
  ;;                     (PP
  ;;                      ((IN "in")
  ;;                       (NP
  ;;                        ((DT "The") (NN "test") (NNS "sentences")))))))))))))))))
  ;;         (NP ((DT "another") (CD "one")))
  ;;         (. "."))))))
  )
@ -384,39 +517,119 @@
 (comment
  (let [pos-path '(TOP S NP DT "This")]
    (pos-freq pos-path))
  ;; => {TOP {:freq 1, S {:freq 1, NP {:freq 1, DT {:words {"This" 1}, :freq 1}}}}}
  )
 (defn pos-freqs [pos-paths]
  (apply deep-merge-with + pos-paths))
 (defn structure-freqs
  "Frequencies of entire parse trees."
  [text-lines]
  (let [parse-tree (-> text-lines
                       parse-to-simple-tree
                       parse-tree-sans-leaf-words)
        freqs (->> parse-tree
                   (map (fn [line] {line 1}))
                   (apply merge-with +))]
    freqs))
 (comment
-  (let [zipper (treebank-zipper-1 ["This is a zipper test."
+  (let [text-lines ["This is a test."
-                                   "And this is another one."
+                    "This is a line."
-                                   "This is not a test."])]
+                    "That is a test."
-    (->> zipper
+                    "You are a test."
-         leaf-pos-paths
+                    "This is not a test."
-         (map pos-freq)
+                    "I am a test."
-         (apply deep-merge-with +)))
+                    "This is a sample."
-;; => {TOP
+                    "Go to the store."
-;;     {:freq 18,
+                    "Run the test."
-;;      S
+                    "The test is running."]]
-;;      {:freq 18,
+    (structure-freqs text-lines))
-;;       NP {:freq 3, DT {:words {"This" 2, "this" 1}, :freq 3}},
+  ;; => {(TOP (S (NP (DT)) (VP (VBZ) (NP (DT) (NN))) (.))) 4,
-;;       VP
+  ;;     (TOP (S (NP (PRP)) (VP (VBP) (NP (DT) (NN))) (.))) 2,
-;;       {:freq 11,
+  ;;     (TOP (S (NP (DT)) (VP (VBZ) (RB) (NP (DT) (NN))) (.))) 1,
-;;        VBZ {:words {"is" 3}, :freq 3},
+  ;;     (TOP (VP (VB) (PP (TO) (NP (DT) (NN))) (.))) 1,
-;;        NP
+  ;;     (TOP (VP (VB) (NP (DT) (NN)) (.))) 1,
-;;        {:freq 7,
+  ;;     (TOP (S (NP (DT) (NN)) (VP (VBZ) (VP (VBG))) (.))) 1}
-;;         DT {:words {"a" 2, "another" 1}, :freq 3},
+  )
-;;         NN {:words {"zipper" 1, "test" 2}, :freq 3},
+
-;;         CD {:words {"one" 1}, :freq 1}},
+(defn generate-from-structure-and-pos-freqs
-;;        RB {:words {"not" 1}, :freq 1}},
+  "This is an attempt to improve on selecting words for a particular part-of-speech
-;;       . {:words {"." 3}, :freq 3},
+  based solely on the part of speech.
-;;       CC {:words {"And" 1}, :freq 1}}}} 
+
  By passing a `structure-freqs` map, we can choose from a collection of known-valid
  structures.
  S -> NP | VP | NP VP
  NP -> NN | DT NN | DT
  VP -> VB | VB NN
  If we go just based on frequencies, we might have a grammar that allows something like the above.
  But it might be such that when S is NP VP, then NP is NEVER DT.
  Thak's what supplying the structure frequency map can improve.
  "
  [structure pos-freqs]
  (let [zipper (zip/seq-zip structure)]
    (loop [zipper zipper]
      (let [path (map first (zip/path zipper))
            cur-freqs (pos-freqs path)]
        (cond
          (zip/end? zipper) (zip/root zipper)
          (zip/branch? zipper) (recur (zip/next zipper))
          cur-freqs
          (let [selection (first (weighted-rand/weighted-selection second cur-freqs))]
            (recur
             (zip/next
              (zip/next
               (zip/replace
                zipper
                (list (zip/node zipper) selection))))))
          ;; ???
          :else (recur (zip/next zipper)))))))
 (comment
  (let [corpus ["this is a test"
                "that is a test"
                "this is a sample"
                "that is some code"
                "there is a car"
                "those are some cars"
                "that is a dog"
                "it is a dog"]
        structure '(TOP (S (NP (DT)) (VP (VBZ) (NP (DT) (NN)))))
        pos-freqs (->> corpus
                       treebank-zipper
                       leaf-pos-path-word-freqs)]
    (repeatedly
     10
     (fn []
       (->> (generate-from-structure-and-pos-freqs
             structure
             pos-freqs)
            zip/seq-zip
            (iterate zip/next)
            (take-while (complement zip/end?))
            (filter #(string? (zip/node %)))
            (map zip/node)
            (string/join " ")))))
  ;; => ("this is some dog"
  ;;     "that is a test"
  ;;     "those is a dog"
  ;;     "those is a sample"
  ;;     "this is a test"
  ;;     "that is some test"
  ;;     "that is a car"
  ;;     "that is a car"
  ;;     "that is some dog"
  ;;     "this is a test")
  )
 (defn node-constituents
  "Given a node of a parse tree, like ('NP (('PRP$ (\"my\" 'NN (\"name\"))))),
  returns a list of the top-level node tag and its first-level child tags.
--- a/src/com/owoga/prhyme/util.clj
+++ b/src/com/owoga/prhyme/util.clj
@ -68,7 +68,7 @@
                  (lazy-seq ((window n) (drop n coll)))))))
 (defn clean-text [text]
-  (string/lower-case (string/replace text #"[^a-zA-Z'\-\s]" "")))
+  (string/lower-case (string/replace text #"[^a-zA-Z'\-\s]+" "")))
 (defn padr [val n coll]
  (concat coll (repeat n val)))
--- a/src/com/owoga/prhyme/util/nlp.clj
+++ b/src/com/owoga/prhyme/util/nlp.clj
@ -1,210 +0,0 @@
 (ns com.owoga.prhyme.util.nlp
  (:require [opennlp.nlp :as nlp]
            [opennlp.treebank :as tb]
            [clojure.string :as string]
            [clojure.java.io :as io]
            [clojure.zip :as zip]
            [com.owoga.prhyme.nlp.tag-sets.treebank-ii :as tb2])
  (:import (opennlp.tools.postag POSModel POSTaggerME)))
 (def tokenize (nlp/make-tokenizer (io/resource "models/en-token.bin")))
 (def get-sentences (nlp/make-sentence-detector (io/resource "models/en-sent.bin")))
 (def parse (tb/make-treebank-parser (io/resource "models/en-parser-chunking.bin")))
 (def pos-tagger (nlp/make-pos-tagger (io/resource "models/en-pos-maxent.bin")))
 ;;;; The tagger that onennlp.nlp gives us doesn't provide access
 ;;;; to the probabilities of all tags. It gives us the probability of the
 ;;;; top tag through some metadata. But to get probs for all tags, we
 ;;;; need to implement our own tagger.
 (defprotocol Tagger
  (tags [this sent])
  (probs [this])
  (top-k-sequences [this sent]))
 (defn make-pos-tagger
  [modelfile]
  (let [model (with-open [model-stream (io/input-stream modelfile)]
                (POSModel. model-stream))
        tagger (POSTaggerME. model)]
    (reify Tagger
      (tags [_ tokens]
        (let [token-array (into-array String tokens)]
          (map vector tokens (.tag tagger #^"[Ljava.lang.String;" token-array))))
      (probs [_] (seq (.probs tagger)))
      (top-k-sequences [_ tokens]
        (let [token-array (into-array String tokens)]
          (.topKSequences tagger #^"[Ljava.lang.String;" token-array))))))
 (def prhyme-pos-tagger (make-pos-tagger (io/resource "models/en-pos-maxent.bin")))
 (comment
  (let [phrase "The feeling hurts."]
    (map (juxt #(.getOutcomes %)
               #(map float (.getProbs %)))
         (top-k-sequences prhyme-pos-tagger (tokenize phrase))))
  ;; => ([["DT" "NN" "VBZ" "."] (0.9758878 0.93964833 0.7375927 0.95285994)]
  ;;     [["DT" "VBG" "VBZ" "."] (0.9758878 0.03690145 0.27251 0.9286113)])
  )
 (defn valid-sentence?
  "Tokenizes and parses the phrase using OpenNLP models from
  http://opennlp.sourceforge.net/models-1.5/
  If the parse tree has an clause as the top-level tag, then
  we consider it a valid English sentence."
  [phrase]
  (->> phrase
       tokenize
       (string/join " ")
       vector
       parse
       first
       tb/make-tree
       :chunk
       first
       :tag
       tb2/clauses
       boolean))
 (defn unmake-tree
  "Tokenizing and then parsing a sentence returns a string
  representation of the parse tree. This is a helper function
  to make working with the parse tree more convenient. We
  can use `opennlp.treebank/make-tree` to make a clojure map
  representation of the tree, then we can `unmake` the tree
  to turn it into a list representation of the tree that
  we can easily use in a clojure zipper. (read-string almost works,
  but falls apart when reading things like commas)."
  [node]
  (cond
    (string? node) node
    (map? node) (list (:tag node) (unmake-tree (:chunk node)))
    :else (map unmake-tree node)))
 (comment
  (let [phrase "Hello, Eric"]
    (->> phrase
         tokenize
         (string/join " ")
         vector
         parse
         (map tb/make-tree)
         unmake-tree))
  ;; => ((TOP ((S ((INTJ ((UH ("Hello")))) (, (",")) (. ("Eric")))))))
  )
 (defn treebank-zipper
  "Turns a bit of text into a parse tree into a zipper."
  [text]
  (let [tree (->> text
                  tokenize
                  (string/join " ")
                  vector
                  parse
                  (map tb/make-tree)
                  unmake-tree)]
    (zip/zipper seq? seq (fn [_ c] c) tree)))
 (defn node-constituents
  "Given a node of a parse tree, like ('NP (('PRP$ (\"my\" 'NN (\"name\"))))),
  returns a list of the top-level node tag and its first-level child tags.
  "
  [node]
  (list
   (first node)
   (if (every? string? (map first (rest node)))
     nil
     (map first (first (rest node))))))
 (defn phrase-constituents
  "Given a bit of text that can be parsed into a treebank tree,
  Get a sequence of the tags and their chunks.
  For example:
    My name is Eric.
  Returns the sequence:
    At the TOP tag, we have a 'S part-of-speech (a clause).
    At the 'S tag, we have a 'NP, 'VP, '. (noun-phrase + verb-phrase + period)
    At the 'NP tag, we have a 'PRP$, 'NN (personal-pronoun + singular-noun)
    ...
  "
  [text]
  (->> (treebank-zipper text)
       (iterate zip/next)
       (take-while (complement zip/end?))
       (filter (complement zip/branch?))
       (map zip/path)
       (map last)
       (map node-constituents)
       (remove #(string? (first %)))))
 (comment
  (phrase-constituents "My name is Eric.")
  ;; => ((TOP (S)) (S (NP VP .)) (NP (PRP$ NN)) (VP (VBZ NP)) (NP (NNP)))
  (phrase-constituents "How are you?")
  ;; => ((TOP (SBARQ)) (SBARQ (WHADVP SQ .)) (WHADVP (WRB)) (SQ (VBP NP)) (NP (PRP))) 
  )
 (defn pos-constituent-frequencies
  "Frequencies of the parts of speech that make up phrases.
  Example:
    Clauses are made up of:
      NounPhrase + VerbPhrase 2 times
      Clause + CoordinatingConjuction + Clause 1 times
    NounPhrases are made up of:
      ProperNouns 2 times
      PersonalPronoun + SingularNoun 3 times
  Does not include frequencies for leaf words. By that I mean: A SingularNoun might
  appear 5 times all together, 3 times as part of a PersonalPronoun + SingularNoun pair
  and 2 times as part of an Adjective + SingularNoun pair, but the data structure returned
  by this function won't include that 5 anywhere. This is due to the (remove #(nil? (second %)))
  line. This data structure is used as a kind of markov selection process and we don't really
  care how often the leafs are used. We just care about the ratio at which we should pick each
  leaf from a given parent.
  "
  [texts]
  (reduce
   (fn [acc text]
     (let [constituents (->> text
                             phrase-constituents
                             (remove #(nil? (second %))))]
       (reduce
        (fn [acc constituent]
          (let [k1 (first constituent)
                k2 (second constituent)]
            (update-in acc [k1 k2] (fnil inc 0))))
        acc
        constituents)))
   {}
   texts))
 (comment
  (pos-constituent-frequencies
   ["My name is Eric."
    "My hat is blue and I like cake."
    "Your name is Taylor."
    "How are you?"])
  ;; => {TOP {(S) 3, (SBARQ) 1},
  ;;     S {(NP VP .) 2, (S CC S .) 1, (NP VP) 2},
  ;;     NP {(PRP$ NN) 3, (NNP) 2, (PRP) 2, (NN) 1},
  ;;     VP {(VBZ NP) 2, (VBZ ADJP) 1, (VBP NP) 1},
  ;;     ADJP {(JJ) 1},
  ;;     SBARQ {(WHADVP SQ .) 1},
  ;;     WHADVP {(WRB) 1},
  ;;     SQ {(VBP NP) 1}}
  (let [phrase "How are you today?"]
    (->> phrase
         tokenize
         (string/join " ")
         vector
         parse
         (map tb/make-tree)))
  (let [phrase "I gave the cake to John at the store."]
    (parse (tokenize phrase)))
  (let [phrase "I've got a good feeling"]
    (pos-tagger (tokenize phrase)))
  )
--- a/src/com/owoga/prhyme/util/weighted_rand.clj
+++ b/src/com/owoga/prhyme/util/weighted_rand.clj
@ -78,7 +78,8 @@
         selection (nth coll index)]
     selection))
  ([key-fn coll]
-   (let [rng (from-weights (map key-fn coll))
+   (let [coll (seq coll)
         rng (from-weights (map key-fn coll))
         index (nextr rng nil)
         selection (nth coll index)]
     selection)))