From acd22d9b2d6f0b68408148a91a2d1586ecf6be9c Mon Sep 17 00:00:00 2001
From: Eric Ihli <eihli@owoga.com>
Date: Fri, 30 Apr 2021 18:37:48 -0500
Subject: [PATCH] Move generation code to nlg

---
 src/com/owoga/prhyme/data/dictionary.clj |   2 +-
 src/com/owoga/prhyme/data_transform.clj  | 280 +++++++++++------------
 src/com/owoga/prhyme/nlg/prhyme_nlg.clj  | 182 +++++++++++++++
 src/com/owoga/prhyme/nlp/core.clj        | 120 +++++++++-
 4 files changed, 434 insertions(+), 150 deletions(-)

diff --git a/src/com/owoga/prhyme/data/dictionary.clj b/src/com/owoga/prhyme/data/dictionary.clj
index c8b714b..45a8bc6 100644
--- a/src/com/owoga/prhyme/data/dictionary.clj
+++ b/src/com/owoga/prhyme/data/dictionary.clj
@@ -98,7 +98,7 @@
         english-words
         (->> words
              (filter #(word-set (string/lower-case %))))]
-    (< 0.7 (/ (count english-words) (max 1 (count words))))))
+    (< 0.8 (/ (count english-words) (max 1 (count words))))))
 
 (comment
   (let [phoneme-lookup (into
diff --git a/src/com/owoga/prhyme/data_transform.clj b/src/com/owoga/prhyme/data_transform.clj
index 5412caa..cd08aa9 100644
--- a/src/com/owoga/prhyme/data_transform.clj
+++ b/src/com/owoga/prhyme/data_transform.clj
@@ -2,7 +2,6 @@
   (:require [clojure.string :as string]
             [clojure.java.io :as io]
             [com.owoga.prhyme.data.dictionary :as dict]
-            [com.owoga.prhyme.nlp.core :as nlp]
             [com.owoga.trie :as trie]
             [com.owoga.tightly-packed-trie :as tpt]
             [com.owoga.tightly-packed-trie.encoding :as encoding]
@@ -97,21 +96,23 @@
                    k)]
       [k' 1])))
 
-(defn xf-part-of-speech-database
-  [database]
-  (fn [sentence]
-    (let [leafs (->> sentence
-                     nlp/treebank-zipper
-                     nlp/leaf-pos-path-word-freqs)]
-      (run!
-       (fn [[k v]]
-         (swap!
-          database
-          assoc
-          k
-          (merge-with + (@database k) v)))
-       leafs)
-      sentence)))
+(comment
+  ;; TODO: Move to nlp.core
+  (defn xf-part-of-speech-database
+   [database]
+   (fn [sentence]
+     (let [leafs (->> sentence
+                      nlp/treebank-zipper
+                      nlp/leaf-pos-path-word-freqs)]
+       (run!
+        (fn [[k v]]
+          (swap!
+           database
+           assoc
+           k
+           (merge-with + (@database k) v)))
+        leafs)
+       sentence))))
 
 (comment
   (let [database (atom {})]
@@ -206,32 +207,31 @@
       (recur (conj result [k v])
              (rest k)))))
 
-(defn process-text
-  "Processes text into key value pairs where
+(comment
+  ;; TODO: Move to nlp.core
+  (defn process-text
+   "Processes text into key value pairs where
   the keys are parts-of-speech paths and the values
   are the children at that path.
 
   Ready to be inserted into a trie."
-  [text]
-  (->> text
-       (split-text-into-sentences)
-       (map string/trim)
-       (remove empty?)
-       (mapv nlp/treebank-zipper)
-       (remove nil?)
-       (map nlp/parts-of-speech-trie-entries)
-       (mapv (fn [file]
-               (mapv (fn [line]
-                       (mapv vec line))
-                     file)))
-       (reduce into [])
-       (map flatten-trie-entry-to-all-subkeys)
-       (reduce into [])
-       (mapv normalize-text)
-       (mapv (fn [[k v]]
-               (clojure.lang.MapEntry. (into (vec k) [v]) v)))))
+   [text]
+   (->> text
+        (split-text-into-sentences)
+        (map string/trim)
+        (remove empty?)
+        (mapv nlp/treebank-zipper)
+        (remove nil?)
+        (map nlp/parts-of-speech-trie-entries)
+        (reduce into [])
+        (map flatten-trie-entry-to-all-subkeys)
+        (reduce into [])
+        (mapv normalize-text)
+        (mapv (fn [[k v]]
+                (clojure.lang.MapEntry. (into (vec k) [v]) v))))))
 
 (comment
+  (process-text (first texts))
   (flatten-trie-entry-to-all-subkeys
    '[(TOP S NP) (NP PP)])
   ;; => [[(TOP S NP) (NP PP)] [(S NP) (NP PP)] [(NP) (NP PP)]]
@@ -304,7 +304,7 @@
           trie
           entries)))
       (trie/make-trie)
-      (take 300 texts))))
+      (take 3000 texts))))
 
   (nippy/freeze-to-file "/tmp/test-trie.bin" (seq test-trie))
   (time
@@ -441,32 +441,6 @@
         (#(zip/insert-right % (zip/node z2)))
         (zip/root))))
 
-(defn generate
-  [trie database zipper]
-  (let [k (map first (zip/path zipper))]
-    (do (Thread/sleep 10) (println k))
-    (if (vector? (database (last k)))
-      (loop [zipper zipper]
-        (let [children (last (map first (zip/path zipper)))]
-          (Thread/sleep 50) (println children (zip/root zipper))
-          (if (empty? children)
-            zipper
-            (recur
-             (-> zipper
-                 zip/up
-                 (zip/append-child [(first children)])
-                 (zip/down)
-                 (zip/rightmost)
-                 (zip/down)
-                 (#(generate trie database %))
-                 (zip/up)
-                 (zip/up)
-                 (zip/down)
-                 (zip/replace (subvec 1 children)))))))
-      (zip/insert-right
-       zipper
-       (choose trie database k)))))
-
 (defn generate
   [trie database zipper]
   (cond
@@ -526,12 +500,14 @@
 
 (comment
   (trie/lookup test-trie [1])
-  (->> (generate test-trie @test-database (zip/vector-zip [1]))
-       (zip/vector-zip)
-       (iterate zip/next)
-       (take-while (complement zip/end?))
-       (map zip/node)
-       (filter string?))
+  (repeatedly
+   20
+   #(->> (generate test-trie @test-database (zip/vector-zip [1]))
+        (zip/vector-zip)
+        (iterate zip/next)
+        (take-while (complement zip/end?))
+        (map zip/node)
+        (filter string?)))
 
   (-> [:a [:b] [:b]]
       zip/vector-zip
@@ -649,43 +625,47 @@
 
   )
 
-(defn xf-grammar-database
-  [database]
-  (fn [sentence]
-    (let [leafs (->> sentence
-                     nlp/treebank-zipper
-                     nlp/leaf-pos-path-word-freqs)]
-      (run!
-       (fn [[k v]]
-         (swap!
-          database
-          assoc
-          k
-          (merge-with + (@database k) v)))
-       leafs)
-      sentence)))
-
-(defn file-seq->grammar-tree
-  [files]
-  (transduce
-   (comp
-    (xf-file-seq 0 1000)
-    (map slurp)
-    (map #(string/split % #"[\n+\?\.]"))
-    (map (partial transduce xf-tokenize conj))
-    (map (partial transduce xf-filter-english conj))
-    (map (partial remove empty?))
-    (remove empty?)
-    (map (partial transduce xf-untokenize conj))
-    (map nlp/grammar-tree-frequencies)
-    (map (partial into {})))
-   (fn
-     ([acc]
-      (sort-by (comp - second) acc))
-     ([acc m]
-      (merge-with + acc m)))
-   {}
-   files))
+(comment
+  ;; TODO: Move to nlp.core
+  (defn xf-grammar-database
+    [database]
+    (fn [sentence]
+      (let [leafs (->> sentence
+                       nlp/treebank-zipper
+                       nlp/leaf-pos-path-word-freqs)]
+        (run!
+         (fn [[k v]]
+           (swap!
+            database
+            assoc
+            k
+            (merge-with + (@database k) v)))
+         leafs)
+        sentence))))
+
+(comment
+  ;; TODO: remove or move to nlp.core
+  (defn file-seq->grammar-tree
+    [files]
+    (transduce
+     (comp
+      (xf-file-seq 0 1000)
+      (map slurp)
+      (map #(string/split % #"[\n+\?\.]"))
+      (map (partial transduce xf-tokenize conj))
+      (map (partial transduce xf-filter-english conj))
+      (map (partial remove empty?))
+      (remove empty?)
+      (map (partial transduce xf-untokenize conj))
+      (map nlp/grammar-tree-frequencies)
+      (map (partial into {})))
+     (fn
+       ([acc]
+        (sort-by (comp - second) acc))
+       ([acc m]
+        (merge-with + acc m)))
+     {}
+     files)))
 
 (comment
   (time
@@ -699,27 +679,29 @@
 
   )
 
-(defn file-seq->part-of-speech-freqs
-  [files]
-  (transduce
-   (comp
-    (xf-file-seq 0 1000)
-    (map slurp)
-    (map #(string/split % #"[\n+\?\.]"))
-    (map (partial transduce xf-tokenize conj))
-    (map (partial transduce xf-filter-english conj))
-    (map (partial remove empty?))
-    (remove empty?)
-    (map (partial transduce xf-untokenize conj))
-    (map (partial map nlp/treebank-zipper))
-    (map (partial map nlp/leaf-pos-path-word-freqs))
-    (map (partial reduce (fn [acc m]
-                           (nlp/deep-merge-with + acc m)) {})))
-   (completing
-    (fn [result input]
-      (nlp/deep-merge-with + result input)))
-   {}
-   files))
+(comment
+  ;; TODO: Remove or move to nlp.core
+  (defn file-seq->part-of-speech-freqs
+    [files]
+    (transduce
+     (comp
+      (xf-file-seq 0 1000)
+      (map slurp)
+      (map #(string/split % #"[\n+\?\.]"))
+      (map (partial transduce xf-tokenize conj))
+      (map (partial transduce xf-filter-english conj))
+      (map (partial remove empty?))
+      (remove empty?)
+      (map (partial transduce xf-untokenize conj))
+      (map (partial map nlp/treebank-zipper))
+      (map (partial map nlp/leaf-pos-path-word-freqs))
+      (map (partial reduce (fn [acc m]
+                             (nlp/deep-merge-with + acc m)) {})))
+     (completing
+      (fn [result input]
+        (nlp/deep-merge-with + result input)))
+     {}
+     files)))
 
 (comment
   (time (->> (file-seq->part-of-speech-freqs
@@ -732,24 +714,26 @@
   )
 
 
-(defn file-seq->parts-of-speech-trie
-  [files]
-  (transduce
-   (comp
-    (xf-file-seq 0 1000)
-    (map slurp)
-    (map #(string/split % #"[\n+\?\.]"))
-    (map (partial transduce xf-tokenize conj))
-    (map (partial transduce xf-filter-english conj))
-    (map (partial remove empty?))
-    (remove empty?)
-    (map (partial transduce xf-untokenize conj))
-    (map nlp/grammar-tree-frequencies)
-    (map (partial into {})))
-   (fn
-     ([acc]
-      (sort-by (comp - second) acc))
-     ([acc m]
-      (merge-with + acc m)))
-   {}
-   files))
+(comment
+  ;; TODO: Remove or move to nlp.core
+  (defn file-seq->parts-of-speech-trie
+    [files]
+    (transduce
+     (comp
+      (xf-file-seq 0 1000)
+      (map slurp)
+      (map #(string/split % #"[\n+\?\.]"))
+      (map (partial transduce xf-tokenize conj))
+      (map (partial transduce xf-filter-english conj))
+      (map (partial remove empty?))
+      (remove empty?)
+      (map (partial transduce xf-untokenize conj))
+      (map nlp/grammar-tree-frequencies)
+      (map (partial into {})))
+     (fn
+       ([acc]
+        (sort-by (comp - second) acc))
+       ([acc m]
+        (merge-with + acc m)))
+     {}
+     files)))
diff --git a/src/com/owoga/prhyme/nlg/prhyme_nlg.clj b/src/com/owoga/prhyme/nlg/prhyme_nlg.clj
index aa9580e..f00b547 100644
--- a/src/com/owoga/prhyme/nlg/prhyme_nlg.clj
+++ b/src/com/owoga/prhyme/nlg/prhyme_nlg.clj
@@ -2,9 +2,13 @@
   (:require [clojure.zip :as zip]
             [clojure.string :as string]
             [taoensso.timbre :as timbre]
+            [com.owoga.prhyme.util.math :as math]
             [examples.core :as examples]
             [taoensso.nippy :as nippy]
             [com.owoga.prhyme.nlp.core :as nlp]
+            [clojure.java.io :as io]
+            [com.owoga.prhyme.data-transform :as df]
+            [com.owoga.trie :as trie]
             [com.owoga.prhyme.util.weighted-rand :as weighted-rand]
             [clojure.set :as set]))
 
@@ -403,3 +407,181 @@
     [(TOP (NP (NP (NN)) (PP (IN) (NP (PRP$) (NN))))) 218]
     [(TOP (NP (JJ) (NNS))) 211]
     [(TOP (VB)) 204]))
+
+
+(comment
+  (def test-database (atom {::nlp/next-id 1}))
+
+  (def texts
+    (eduction
+     (comp (df/xf-file-seq 0 250000)
+           (map slurp))
+     (file-seq (io/file "dark-corpus"))))
+
+  (time
+   (def test-trie
+     (transduce
+      (comp
+       (map
+        (fn [text]
+          (try
+            (nlp/text->grammar-trie-map-entry text)
+            (catch Exception e
+              (throw e)))))
+       (map (partial map (nlp/make-database-stateful-xf test-database))))
+      (completing
+       (fn [trie entries]
+         (reduce
+          (fn [trie [k v]]
+            (update trie k (fnil inc 0)))
+          trie
+          entries)))
+      (trie/make-trie)
+      (take 300 texts))))
+
+  )
+
+(defn children
+  [trie database k]
+  (->> (trie/lookup trie k)
+       (trie/children)
+       (map #(vector (.key %) (get % [])))
+       (remove (comp nil? second))
+       (sort-by (comp - second))))
+
+(defn choose
+  [trie database k]
+  (math/weighted-selection
+   second
+   (children trie database k)))
+
+(defn markov-generate-grammar
+  [trie database zipper]
+  (cond
+    (zip/end? zipper)
+    (zip/root zipper)
+
+    (seqable? (zip/node zipper))
+    (recur trie database (zip/next zipper))
+
+    (symbol? (zip/node zipper))
+    (recur trie database (zip/next zipper))
+
+    (symbol? (database (zip/node zipper)))
+    (let [sym (database (zip/node zipper))
+          sym-path  (->> (map first (zip/path zipper))
+                         butlast
+                         (filter symbol?)
+                         (#(concat % (list sym))))
+          path (map database sym-path)
+          choice (first (choose trie database path))]
+      (recur
+       trie
+       database
+       (-> zipper
+           (zip/replace
+            [sym choice])
+           (zip/root)
+           (zip/vector-zip))))
+
+    (string? (database (zip/node zipper)))
+    (let [terminal (database (zip/node zipper))
+          path (->> (map first (zip/path zipper))
+                    butlast
+                    (filter symbol?))]
+      (recur
+       trie
+       database
+       (-> zipper
+           zip/remove
+           zip/root
+           zip/vector-zip)))
+
+    :else
+    (recur
+     trie
+     database
+     (-> zipper
+         (zip/replace
+          (mapv
+           database
+           (database (zip/node zipper))))
+         (zip/next)
+         (zip/root)
+         (zip/vector-zip)))))
+
+(comment
+  (markov-generate-grammar test-trie @test-database (zip/vector-zip [1]))
+
+  )
+
+(defn markov-generate-sentence
+  [trie database zipper]
+  (cond
+    (zip/end? zipper)
+    (zip/root zipper)
+
+    (seqable? (zip/node zipper))
+    (recur trie database (zip/next zipper))
+
+    (symbol? (zip/node zipper))
+    (recur trie database (zip/next zipper))
+
+    (symbol? (database (zip/node zipper)))
+    (let [sym (database (zip/node zipper))
+          sym-path  (->> (map first (zip/path zipper))
+                         butlast
+                         (filter symbol?)
+                         (#(concat % (list sym))))
+          path (map database sym-path)
+          choice (first (choose trie database path))]
+      (recur
+       trie
+       database
+       (-> zipper
+           (zip/replace
+            [sym choice])
+           (zip/root)
+           (zip/vector-zip))))
+
+    (string? (database (zip/node zipper)))
+    (let [terminal (database (zip/node zipper))
+          path (->> (map first (zip/path zipper))
+                    butlast
+                    (filter symbol?))]
+      (recur
+       trie
+       database
+       (-> zipper
+           (zip/replace
+            terminal)
+           (zip/next)
+           (zip/root)
+           (zip/vector-zip))))
+
+    :else
+    (recur
+     trie
+     database
+     (-> zipper
+         (zip/replace
+          (mapv
+           database
+           (database (zip/node zipper))))
+         (zip/next)
+         (zip/root)
+         (zip/vector-zip)))))
+
+(comment
+  (generate test-trie @test-database (zip/vector-zip [1]))
+
+  (repeatedly
+   20
+   #(->> (generate test-trie @test-database (zip/vector-zip [1]))
+         (zip/vector-zip)
+         (iterate zip/next)
+         (take-while (complement zip/end?))
+         (map zip/node)
+         (filter string?)))
+
+  )
diff --git a/src/com/owoga/prhyme/nlp/core.clj b/src/com/owoga/prhyme/nlp/core.clj
index a88629b..47d440f 100644
--- a/src/com/owoga/prhyme/nlp/core.clj
+++ b/src/com/owoga/prhyme/nlp/core.clj
@@ -2,11 +2,14 @@
   (:require [opennlp.nlp :as nlp]
             [opennlp.treebank :as tb]
             [clojure.string :as string]
+            [com.owoga.prhyme.data-transform :as df]
+            [com.owoga.trie :as trie]
             [clojure.java.io :as io]
             [clojure.zip :as zip]
             [com.owoga.prhyme.nlp.tag-sets.treebank-ii :as tb2]
             [com.owoga.prhyme.util.weighted-rand :as weighted-rand]
-            [clojure.walk :as walk])
+            [clojure.walk :as walk]
+            [com.owoga.prhyme.data.dictionary :as dict])
   (:import (opennlp.tools.postag POSModel POSTaggerME)
            (opennlp.tools.parser Parse ParserModel
                                  ParserFactory)
@@ -1209,3 +1212,118 @@
   ;;     [(TOP S NP PP NP NN) ("today")])
 
   )
+
+
+;;;; Grammar Trie
+;;
+;; Create a trie from treebank parsed grammar trees.
+
+(defn -split-text-into-sentences
+  "Splits text on newlines, periods, exclamation and question marks."
+  [text]
+  (->> text
+       (#(string/replace % #"([\.\?\!\n]+)" "$1\n"))
+       (string/split-lines)))
+
+(defn -flatten-trie-entry-to-all-subkeys
+  "Turns
+
+  [[k1 k2 k3] v]
+
+  into
+
+  [[[k1 k2 k3] v]]
+  [[k2 k3] v]]
+  [[k3] v]]]
+
+  This is useful for creating a trie from a grammar tree. It's
+  nice to know that k3 is a child of both [k1 k2] and [k2] so
+  if you need to generate a [k2] in isolation, you have
+  acces to [k1 k2] and [k4 k2] and [kn k2] etc... all under the
+  top-level key [k2].
+  "
+  [[k v]]
+  (loop [result []
+         k k]
+    (if (empty? k)
+      result
+      (recur (conj result [k v])
+             (rest k)))))
+
+(defn -normalize-text
+  [[k v]]
+  (if (string? (first v))
+    [k (string/lower-case (first v))]
+    [k v]))
+
+(defn english?
+  [text]
+  (->> text
+       (#(string/replace % #"\W" " "))
+       (#(string/replace % #" +" " "))
+       (#(string/split % #" "))
+       (every? #(dict/cmu-with-stress-map (string/lower-case %)))))
+
+(defn text->grammar-trie-map-entry
+  "Processes text into key value pairs where
+  the keys are parts-of-speech paths and the values
+  are the children at that path.
+
+  Ready to be inserted into a trie."
+  [text]
+  (->> text
+       (-split-text-into-sentences)
+       (map string/trim)
+       (remove empty?)
+       (mapv treebank-zipper)
+       (remove nil?)
+       (map parts-of-speech-trie-entries)
+       (reduce into [])
+       (map -flatten-trie-entry-to-all-subkeys)
+       (reduce into [])
+       (mapv -normalize-text)
+       (mapv (fn [[k v]]
+               (clojure.lang.MapEntry. (into (vec k) [v]) v)))))
+
+(defn -new-key
+  "Associates key with an auto-incrementing ID
+  and the ID with the key.
+
+  This 'database' is an atom that maps
+  keys to integer ids and integer ids to keys.
+
+  This lets us use integers throughout the trie data structure,
+  which ends up being a lot more efficient and prepares the trie
+  for being turned into a tightly-packed-trie."
+  [database k]
+  (let [next-id (@database ::next-id)]
+    (swap!
+     database
+     #(-> %
+          (assoc k next-id)
+          (assoc next-id k)
+          (update ::next-id inc)))
+    next-id))
+
+(defn make-database-stateful-xf
+  "This 'database' is an atom that maps
+  keys to integer ids and integer ids to keys.
+
+  This lets us use integers throughout the trie data structure,
+  which ends up being a lot more efficient and prepares the trie
+  for being turned into a tightly-packed-trie.
+
+  Takes an atom and returns a function that takes a Trie key/value.
+  When the returned function is called, it checks to see
+  if the key is in the database and if so it returns the associated id.
+  If not, it increments the id (which is stored in the database
+  under :next-id) and returns that new id."
+  [database]
+  (fn [[k v]]
+    (let [k' (mapv (fn [kn]
+                     (if-let [id (get @database kn)]
+                       id
+                       (-new-key database kn)))
+                   k)]
+      [k' 1])))
+