Move generation code to nlg

5 years ago · acd22d9b2d
parent 1c3a07708a
commit acd22d9b2d
4 changed files with 434 additions and 150 deletions
--- a/src/com/owoga/prhyme/data/dictionary.clj
+++ b/src/com/owoga/prhyme/data/dictionary.clj
@ -98,7 +98,7 @@
        english-words
        (->> words
             (filter #(word-set (string/lower-case %))))]
-    (< 0.7 (/ (count english-words) (max 1 (count words))))))
+    (< 0.8 (/ (count english-words) (max 1 (count words))))))
 (comment
  (let [phoneme-lookup (into
--- a/src/com/owoga/prhyme/data_transform.clj
+++ b/src/com/owoga/prhyme/data_transform.clj
@ -2,7 +2,6 @@
  (:require [clojure.string :as string]
            [clojure.java.io :as io]
            [com.owoga.prhyme.data.dictionary :as dict]
            [com.owoga.prhyme.nlp.core :as nlp]
            [com.owoga.trie :as trie]
            [com.owoga.tightly-packed-trie :as tpt]
            [com.owoga.tightly-packed-trie.encoding :as encoding]
@ -97,7 +96,9 @@
                   k)]
      [k' 1])))
-(defn xf-part-of-speech-database
+(comment
  ;; TODO: Move to nlp.core
  (defn xf-part-of-speech-database
   [database]
   (fn [sentence]
     (let [leafs (->> sentence
@ -111,7 +112,7 @@
           k
           (merge-with + (@database k) v)))
        leafs)
-      sentence)))
+       sentence))))
 (comment
  (let [database (atom {})]
@ -206,7 +207,9 @@
      (recur (conj result [k v])
             (rest k)))))
-(defn process-text
+(comment
  ;; TODO: Move to nlp.core
  (defn process-text
   "Processes text into key value pairs where
  the keys are parts-of-speech paths and the values
  are the children at that path.
@ -220,18 +223,15 @@
        (mapv nlp/treebank-zipper)
        (remove nil?)
        (map nlp/parts-of-speech-trie-entries)
       (mapv (fn [file]
               (mapv (fn [line]
                       (mapv vec line))
                     file)))
        (reduce into [])
        (map flatten-trie-entry-to-all-subkeys)
        (reduce into [])
        (mapv normalize-text)
        (mapv (fn [[k v]]
-               (clojure.lang.MapEntry. (into (vec k) [v]) v)))))
+                (clojure.lang.MapEntry. (into (vec k) [v]) v))))))
 (comment
  (process-text (first texts))
  (flatten-trie-entry-to-all-subkeys
   '[(TOP S NP) (NP PP)])
  ;; => [[(TOP S NP) (NP PP)] [(S NP) (NP PP)] [(NP) (NP PP)]]
@ -304,7 +304,7 @@
          trie
          entries)))
      (trie/make-trie)
-      (take 300 texts))))
+      (take 3000 texts))))
  (nippy/freeze-to-file "/tmp/test-trie.bin" (seq test-trie))
  (time
@ -441,32 +441,6 @@
        (#(zip/insert-right % (zip/node z2)))
        (zip/root))))
 (defn generate
  ;; Recursive, debug-instrumented expansion of `zipper` using `trie` and
  ;; `database`. Either walks a list of children taken from the path, or
  ;; inserts a single choice to the right of the current location.
  ;; NOTE(review): another `(defn generate ...)` follows directly after this
  ;; one in the file; if both are loaded, the later definition wins.
  [trie database zipper]
  ;; `k` is the first element of every ancestor node on the path from the
  ;; root down to the current location.
  (let [k (map first (zip/path zipper))]
    ;; Debug aid: short pause, then print the current path key.
    (do (Thread/sleep 10) (println k))
    (if (vector? (database (last k)))
      (loop [zipper zipper]
        ;; `children` is the first element of the nearest ancestor node.
        (let [children (last (map first (zip/path zipper)))]
          ;; Debug aid: pause and print the remaining children and the tree.
          (Thread/sleep 50) (println children (zip/root zipper))
          (if (empty? children)
            zipper
            (recur
             ;; Append the first remaining child as a new rightmost vector
             ;; under the parent, recurse into it, then climb back up and
             ;; replace the leftmost child of the parent.
             (-> zipper
                 zip/up
                 (zip/append-child [(first children)])
                 (zip/down)
                 (zip/rightmost)
                 (zip/down)
                 (#(generate trie database %))
                 (zip/up)
                 (zip/up)
                 (zip/down)
                 ;; NOTE(review): clojure.core/subvec takes the vector first
                 ;; (`(subvec v start)`); as written, `1` is passed as the
                 ;; vector and this call looks like it will throw.
                 ;; `(subvec children 1)` was probably intended -- confirm.
                 (zip/replace (subvec 1 children)))))))
      ;; Otherwise insert whatever `choose` returns for `k` as the right
      ;; sibling of the current location.
      (zip/insert-right
       zipper
       (choose trie database k)))))
 (defn generate
  [trie database zipper]
  (cond
@ -526,12 +500,14 @@
 (comment
  (trie/lookup test-trie [1])
-  (->> (generate test-trie @test-database (zip/vector-zip [1]))
+  (repeatedly
   20
   #(->> (generate test-trie @test-database (zip/vector-zip [1]))
        (zip/vector-zip)
        (iterate zip/next)
        (take-while (complement zip/end?))
        (map zip/node)
-       (filter string?))
+        (filter string?)))
  (-> [:a [:b] [:b]]
      zip/vector-zip
@ -649,7 +625,9 @@
  )
-(defn xf-grammar-database
+(comment
  ;; TODO: Move to nlp.core
  (defn xf-grammar-database
    [database]
    (fn [sentence]
      (let [leafs (->> sentence
@ -663,9 +641,11 @@
            k
            (merge-with + (@database k) v)))
         leafs)
-      sentence)))
+        sentence))))
-(defn file-seq->grammar-tree
+(comment
  ;; TODO: remove or move to nlp.core
  (defn file-seq->grammar-tree
    [files]
    (transduce
     (comp
@ -685,7 +665,7 @@
       ([acc m]
        (merge-with + acc m)))
     {}
-   files))
+     files)))
 (comment
  (time
@ -699,7 +679,9 @@
  )
-(defn file-seq->part-of-speech-freqs
+(comment
  ;; TODO: Remove or move to nlp.core
  (defn file-seq->part-of-speech-freqs
    [files]
    (transduce
     (comp
@ -719,7 +701,7 @@
      (fn [result input]
        (nlp/deep-merge-with + result input)))
     {}
-   files))
+     files)))
 (comment
  (time (->> (file-seq->part-of-speech-freqs
@ -732,7 +714,9 @@
  )
-(defn file-seq->parts-of-speech-trie
+(comment
  ;; TODO: Remove or move to nlp.core
  (defn file-seq->parts-of-speech-trie
    [files]
    (transduce
     (comp
@ -752,4 +736,4 @@
       ([acc m]
        (merge-with + acc m)))
     {}
-   files))
+     files)))
--- a/src/com/owoga/prhyme/nlg/prhyme_nlg.clj
+++ b/src/com/owoga/prhyme/nlg/prhyme_nlg.clj
@ -2,9 +2,13 @@
  (:require [clojure.zip :as zip]
            [clojure.string :as string]
            [taoensso.timbre :as timbre]
            [com.owoga.prhyme.util.math :as math]
            [examples.core :as examples]
            [taoensso.nippy :as nippy]
            [com.owoga.prhyme.nlp.core :as nlp]
            [clojure.java.io :as io]
            [com.owoga.prhyme.data-transform :as df]
            [com.owoga.trie :as trie]
            [com.owoga.prhyme.util.weighted-rand :as weighted-rand]
            [clojure.set :as set]))
@ -403,3 +407,181 @@
    [(TOP (NP (NP (NN)) (PP (IN) (NP (PRP$) (NN))))) 218]
    [(TOP (NP (JJ) (NNS))) 211]
    [(TOP (VB)) 204]))
 (comment
  ;; REPL scratch: build a small grammar trie from the corpus.

  ;; Two-way key<->id table consumed by nlp/make-database-stateful-xf.
  ;; The id counter lives under ::nlp/next-id and starts at 1.
  (def test-database (atom {::nlp/next-id 1}))

  ;; Slurped contents of the files under "dark-corpus", as selected by
  ;; (df/xf-file-seq 0 250000).
  (def texts
    (eduction
     (comp (df/xf-file-seq 0 250000)
           (map slurp))
     (file-seq (io/file "dark-corpus"))))

  ;; Count how often each id-encoded trie key occurs in the first 300 texts.
  (time
   (def test-trie
     (transduce
      (comp
       ;; Exceptions simply propagate; the previous try/catch only rethrew.
       (map nlp/text->grammar-trie-map-entry)
       (map (partial map (nlp/make-database-stateful-xf test-database))))
      (completing
       (fn [trie entries]
         (reduce
          (fn [trie [k _]]
            (update trie k (fnil inc 0)))
          trie
          entries)))
      (trie/make-trie)
      (take 300 texts))))
  )
 (defn children
  "Looks up the node stored at key `k` in `trie` and returns its
  immediate children as `[child-key value]` pairs, where value is
  what the child holds under the empty key `[]`. Children without such
  a value are dropped; the rest are ordered from the largest value to
  the smallest.

  `database` is accepted but not used."
  [trie database k]
  (let [node (trie/lookup trie k)
        pairs (for [child (trie/children node)
                    :let [pair [(.key child) (get child [])]]
                    :when (some? (second pair))]
                pair)]
    (sort-by #(- (second %)) pairs)))
 (defn choose
  "Selects one of the `[child-key value]` pairs that `children` returns
  for `k`. The pick is delegated to `math/weighted-selection`, with
  `second` (the pair's value) as the weight function."
  [trie database k]
  (let [candidates (children trie database k)]
    (math/weighted-selection second candidates)))
 (defn markov-generate-grammar
  "Expands the tree held by `zipper` and returns its root once the walk
  reaches the end.

  `database` is called as a function on each leaf node; what it returns
  decides how the node is handled:

  - a symbol: the node becomes `[sym choice]`, where `choice` is the
    first element of whatever `choose` picks from `trie` for the path
    of enclosing symbols plus `sym`, each mapped back through
    `database`
  - a string: the node is removed from the tree
  - anything else: the node is replaced by that value with every
    element mapped through `database`

  Nodes that are themselves seqable or symbols are stepped over. After
  every edit the tree is re-rooted and the walk restarts from the top."
  [trie database zipper]
  (cond
    ;; Walk finished: hand back the whole tree.
    (zip/end? zipper)
    (zip/root zipper)

    ;; Collections and already-placed symbols need no lookup.
    (seqable? (zip/node zipper))
    (recur trie database (zip/next zipper))

    (symbol? (zip/node zipper))
    (recur trie database (zip/next zipper))

    (symbol? (database (zip/node zipper)))
    (let [sym (database (zip/node zipper))
          ;; Head of every ancestor except the nearest one, keeping only
          ;; symbols, followed by the symbol just looked up.
          sym-path  (->> (map first (zip/path zipper))
                         butlast
                         (filter symbol?)
                         (#(concat % (list sym))))
          ;; Translate the symbol path through `database` for the trie.
          path (map database sym-path)
          choice (first (choose trie database path))]
      (recur
       trie
       database
       (-> zipper
           (zip/replace
            [sym choice])
           (zip/root)
           (zip/vector-zip))))

    ;; Terminals are dropped; only the grammar skeleton is kept.
    (string? (database (zip/node zipper)))
    (recur
     trie
     database
     (-> zipper
         zip/remove
         zip/root
         zip/vector-zip))

    :else
    (recur
     trie
     database
     (-> zipper
         (zip/replace
          (mapv
           database
           (database (zip/node zipper))))
         (zip/next)
         (zip/root)
         (zip/vector-zip)))))
 (comment
  ;; REPL example: expand a grammar tree starting from the vector [1].
  ;; Uses the `test-trie` and `test-database` vars that are defined in
  ;; the scratch `comment` block earlier in this file.
  (markov-generate-grammar test-trie @test-database (zip/vector-zip [1]))
  )
 (defn markov-generate-sentence
  "Expands the tree held by `zipper` and returns its root once the walk
  reaches the end.

  `database` is called as a function on each leaf node; what it returns
  decides how the node is handled:

  - a symbol: the node becomes `[sym choice]`, where `choice` is the
    first element of whatever `choose` picks from `trie` for the path
    of enclosing symbols plus `sym`, each mapped back through
    `database`
  - a string: the node is replaced by that string
  - anything else: the node is replaced by that value with every
    element mapped through `database`

  Nodes that are themselves seqable (this includes strings already
  placed in the tree) or symbols are stepped over. After every edit the
  tree is re-rooted and the walk restarts from the top."
  [trie database zipper]
  (cond
    ;; Walk finished: hand back the whole tree.
    (zip/end? zipper)
    (zip/root zipper)

    ;; Collections, placed strings and placed symbols need no lookup.
    (seqable? (zip/node zipper))
    (recur trie database (zip/next zipper))

    (symbol? (zip/node zipper))
    (recur trie database (zip/next zipper))

    (symbol? (database (zip/node zipper)))
    (let [sym (database (zip/node zipper))
          ;; Head of every ancestor except the nearest one, keeping only
          ;; symbols, followed by the symbol just looked up.
          sym-path  (->> (map first (zip/path zipper))
                         butlast
                         (filter symbol?)
                         (#(concat % (list sym))))
          ;; Translate the symbol path through `database` for the trie.
          path (map database sym-path)
          choice (first (choose trie database path))]
      (recur
       trie
       database
       (-> zipper
           (zip/replace
            [sym choice])
           (zip/root)
           (zip/vector-zip))))

    ;; Terminal: put the word itself into the tree.
    (string? (database (zip/node zipper)))
    (let [terminal (database (zip/node zipper))]
      (recur
       trie
       database
       (-> zipper
           (zip/replace
            terminal)
           (zip/next)
           (zip/root)
           (zip/vector-zip))))

    :else
    (recur
     trie
     database
     (-> zipper
         (zip/replace
          (mapv
           database
           (database (zip/node zipper))))
         (zip/next)
         (zip/root)
         (zip/vector-zip)))))
 (comment
  ;; REPL examples: generate one tree, then generate 20 trees and keep
  ;; only the string leaves of each.
  ;; NOTE(review): `generate` is not defined in the part of this file
  ;; shown here; the functions above are named `markov-generate-grammar`
  ;; and `markov-generate-sentence`. This may be a stale name -- confirm.
  (generate test-trie @test-database (zip/vector-zip [1]))
  (repeatedly
   20
   ;; Re-zip the generated tree, walk every node, keep the strings.
   #(->> (generate test-trie @test-database (zip/vector-zip [1]))
         (zip/vector-zip)
         (iterate zip/next)
         (take-while (complement zip/end?))
         (map zip/node)
         (filter string?)))
  )
--- a/src/com/owoga/prhyme/nlp/core.clj
+++ b/src/com/owoga/prhyme/nlp/core.clj
@ -2,11 +2,14 @@
  (:require [opennlp.nlp :as nlp]
            [opennlp.treebank :as tb]
            [clojure.string :as string]
            [com.owoga.prhyme.data-transform :as df]
            [com.owoga.trie :as trie]
            [clojure.java.io :as io]
            [clojure.zip :as zip]
            [com.owoga.prhyme.nlp.tag-sets.treebank-ii :as tb2]
            [com.owoga.prhyme.util.weighted-rand :as weighted-rand]
-            [clojure.walk :as walk])
+            [clojure.walk :as walk]
            [com.owoga.prhyme.data.dictionary :as dict])
  (:import (opennlp.tools.postag POSModel POSTaggerME)
           (opennlp.tools.parser Parse ParserModel
                                 ParserFactory)
@ -1209,3 +1212,118 @@
  ;;     [(TOP S NP PP NP NN) ("today")])
  )
 ;;;; Grammar Trie
 ;;
 ;; Create a trie from treebank parsed grammar trees.
 (defn -split-text-into-sentences
  "Breaks `text` into lines, ending a line after every run of periods,
  question marks, exclamation marks or newlines. The punctuation stays
  attached to the end of the sentence it closes."
  [text]
  (let [with-breaks (string/replace text #"([\.\?\!\n]+)" "$1\n")]
    (string/split-lines with-breaks)))
 (defn -flatten-trie-entry-to-all-subkeys
  "Expands one trie entry into one entry per suffix of its key.

  [[k1 k2 k3] v]

  becomes

  [[[k1 k2 k3] v]
   [[k2 k3] v]
   [[k3] v]]

  This is useful when building a trie from a grammar tree: k3 is then
  reachable as a child of [k1 k2] and also of [k2], so when [k2] has to
  be generated in isolation, everything that followed [k1 k2],
  [k4 k2], [kn k2] and so on is available under the top-level key
  [k2].

  An empty key yields an empty vector."
  [[k v]]
  (->> (iterate rest k)
       (take-while seq)
       (mapv (fn [suffix] [suffix v]))))
 (defn -normalize-text
  "Takes a `[k v]` entry. When the first element of `v` is a string,
  the value becomes that string lower-cased; otherwise the entry is
  returned with `v` untouched."
  [[k v]]
  (let [head (first v)]
    [k (if (string? head)
         (string/lower-case head)
         v)]))
 (defn english?
  "Returns true when `text` contains at least one word and every word,
  lower-cased, has a truthy entry in `dict/cmu-with-stress-map`.

  A word is a maximal run of regex word characters; everything else
  only separates words. Extracting the words directly (instead of
  replacing separators with spaces and splitting on a space) avoids the
  empty leading token that splitting produces when the text starts with
  punctuation or whitespace, which made such text fail the dictionary
  check regardless of its words.

  Text without any word (empty, whitespace-only, punctuation-only)
  returns false."
  [text]
  (let [words (re-seq #"\w+" text)]
    (boolean
     (and (seq words)
          (every? #(dict/cmu-with-stress-map (string/lower-case %))
                  words)))))
 (defn text->grammar-trie-map-entry
  "Turns raw `text` into a vector of map entries ready to be inserted
  into a trie.

  The text is split into trimmed, non-empty sentences; each sentence is
  parsed with `treebank-zipper` (nil results are discarded) and turned
  into parts-of-speech entries, which are then expanded to every key
  suffix and normalized. Each resulting `[k v]` becomes a map entry
  whose key is `k` with `v` appended and whose value is `v`."
  [text]
  (let [sentences (->> (-split-text-into-sentences text)
                       (map string/trim)
                       (remove empty?))
        parse-trees (remove nil? (mapv treebank-zipper sentences))
        entries (reduce into [] (map parts-of-speech-trie-entries parse-trees))
        suffix-entries (reduce into []
                               (map -flatten-trie-entry-to-all-subkeys entries))
        ->map-entry (fn [[k v]]
                      (clojure.lang.MapEntry. (conj (vec k) v) v))]
    (mapv (comp ->map-entry -normalize-text) suffix-entries)))
 (defn -new-key
  "Registers `k` in `database` under the next free integer id and
  returns that id.

  `database` is an atom holding a map that goes both ways: key -> id
  and id -> key. The next free id is stored in the same map under
  ::next-id and is incremented on every call. Using integers instead of
  the original keys keeps the trie compact and prepares it for being
  turned into a tightly-packed-trie.

  The id is read and consumed inside a single atomic update, so
  concurrent callers never receive the same id."
  [database k]
  (let [[before _after]
        (swap-vals!
         database
         (fn [db]
           ;; Read the counter from the value being updated, not from an
           ;; earlier deref, so a retried swap always sees a fresh id.
           (let [id (::next-id db)]
             (-> db
                 (assoc k id)
                 (assoc id k)
                 (update ::next-id inc)))))]
    ;; The counter as it was just before the winning update is the id
    ;; that update assigned to `k`.
    (::next-id before)))
 (defn make-database-stateful-xf
  "Builds a function that rewrites a trie entry's key into integer ids.

  `database` is an atom holding a map from keys to integer ids and
  from integer ids back to keys. Working with integers throughout the
  trie is more compact and prepares the trie for being turned into a
  tightly-packed-trie.

  The returned function takes a `[k v]` entry and looks up every
  element of `k` in the database. Elements that already have an id
  keep it; any other element is registered through `-new-key`, which
  assigns the id stored under ::next-id and advances that counter.

  Returns `[ids 1]`, where `ids` is the vector of ids for `k`. The
  incoming value is ignored."
  [database]
  (letfn [(id-for [part]
            (or (get @database part)
                (-new-key database part)))]
    (fn [[k _v]]
      [(mapv id-for k) 1])))