Add nippy for pos and grammar freqs

5 years ago · c193152a79
parent 75218e770b
commit c193152a79
11 changed files with 558 additions and 47 deletions
--- a/deps.edn
+++ b/deps.edn
@ -14,5 +14,5 @@
        org.clojure/data.fressian {:mvn/version "1.0.0"}
        com.taoensso/nippy {:mvn/version "3.0.0"}
        com.taoensso/timbre {:mvn/version "4.10.0"}}
- :aliases {:dev {:extra-paths ["test" "examples"]
+ :aliases {:dev {:extra-paths ["test" "examples" "dev"]
                 :extra-deps {}}}}
--- a/dev/examples/core.clj
+++ b/dev/examples/core.clj
@ -2,6 +2,8 @@
  (:require [clojure.string :as string]
            [clojure.set]
            [clojure.java.io :as io]
+            [taoensso.nippy :as nippy]
+            [taoensso.timbre :as timbre]
            [com.owoga.prhyme.frp :as frp]
            [com.owoga.prhyme.util :as util]
            [com.owoga.prhyme.core :as prhyme]
@ -205,53 +207,156 @@
           (remove #(some string/blank? %))
           (map #(string/join " " %))))))

-(defn dark-pos-freqs []
-  (let [directory "dark-corpus"]
-    (->> (file-seq (io/file directory))
-         (remove #(.isDirectory %))
-         (take 1000)
-         (map slurp)
-         (map util/clean-text)
-         (filter dict/english?)
-         (map #(string/split % #"\n+"))
-         (map (remove-sentences-with-words-not-in-dictionary dict/popular))
-         (remove empty?)
-         (remove #(some empty? %))
-         (map nlp/treebank-zipper)
-         (map nlp/leaf-pos-path-word-freqs)
-         (apply nlp/deep-merge-with +))))
-
-(defn dark-structures []
-  (let [directory "dark-corpus"]
-    (->> (file-seq (io/file directory))
-         (remove #(.isDirectory %))
-         (take 500)
-         (map slurp)
-         (map util/clean-text)
-         (filter dict/english?)
-         (map #(string/split % #"\n+"))
-         (map (remove-sentences-with-words-not-in-dictionary dict/popular))
-         (remove empty?)
-         (remove #(some empty? %))
-         (map nlp/parse-to-simple-tree)
-         (map nlp/parse-tree-sans-leaf-words)
-         (map
-          (fn [lines]
-            (map
-             (fn [line]
-               (hash-map line 1))
-             lines)))
-         (map (partial merge-with +))
-         flatten
-         (apply merge-with +))))
+(defn pos-path-freqs
+  "Seq of pos-path frequencies of each document.
+  To reduce, deep merge with +."
+  [documents]
+  (->> documents
+       (map slurp)
+       (map util/clean-text)
+       (filter dict/english?)
+       (map #(string/split % #"\n+"))
+       (map (remove-sentences-with-words-not-in-dictionary dict/popular))
+       (remove empty?)
+       (remove #(some empty? %))
+       (map nlp/treebank-zipper)
+       (map nlp/leaf-pos-path-word-freqs)))
+
+(defn structures
+  "Seq of structure frequencies of each document.
+  To reduce, merge with +."
+  [documents]
+  (->> documents
+       (map slurp)
+       (map util/clean-text)
+       (filter dict/english?)
+       (map #(string/split % #"\n+"))
+       (map (remove-sentences-with-words-not-in-dictionary dict/popular))
+       (remove empty?)
+       (remove #(some empty? %))
+       (map nlp/parse-to-simple-tree)
+       (map nlp/parse-tree-sans-leaf-words)
+       (map
+        (fn [lines]
+          (map #(hash-map % 1) lines)))
+       (map (partial apply merge-with +))))

 (defn weighted-selection-from-map [m]
  (first (weighted-rand/weighted-selection second (seq m))))

+(defn chunked-writing-pos-path-freqs
+  [documents chunk-size]
+  (let [chunks (range 0 (count documents) chunk-size)]
+    (run!
+     (fn [chunk]
+       (let [structure (->> documents
+                            (drop chunk)
+                            (take chunk-size)
+                            pos-path-freqs
+                            (reduce
+                             (fn [a v]
+                               (nlp/deep-merge-with + a v))
+                             {}))
+             filepath (format "resources/pos-freqs/%s.nip" chunk)]
+         (timbre/info (format "Writing to %s." filepath))
+         (nippy/freeze-to-file filepath structure)))
+     chunks)))
+
+(defn chunked-writing-structure-freqs
+  [documents chunk-size]
+  (let [chunks (range 0 (count documents) chunk-size)]
+    (run!
+     (fn [chunk]
+       (let [structure (->> documents
+                            (drop chunk)
+                            (take chunk-size)
+                            structures
+                            (reduce
+                             (fn [a v]
+                               (nlp/deep-merge-with + a v))
+                             {}))
+             filepath (format "resources/structure-freqs/%s.nip" chunk)]
+         (timbre/info (format "Writing to %s." filepath))
+         (nippy/freeze-to-file filepath structure)))
+     chunks)))
+
+(defn pos-paths->pos-freqs
+  "Convert pos paths, like {(TOP S NP NN) {'test' 5 'car' 3 ,,,}}
+  into a top-level pos freq map like {NN {'test' 25 'car' 8 ,,,}}.is"
+  [pos-paths]
+  (->> pos-paths
+       (map
+        (fn [[k v]]
+          (hash-map (last k) v)))
+       (reduce
+        (fn [a v]
+          (nlp/deep-merge-with + a v))
+        {})))
+
 (comment
-  (time (def example-pos-freqs (dark-pos-freqs)))
+  (take 5 darklyrics/darklyrics-markov-2)
+  (darklyrics/darklyrics-markov-2 '("time" "is"))
+  (def darkov-2 darklyrics/darklyrics-markov-2)
+  ;; => ([("profanity" "unholy") {"its" 2}]
+  ;;     [("ants" "triumph") {nil 1}]
+  ;;     [("hiding" "our") {"of" 1, "expose" 3, "above" 1}]
+  ;;     [("won't" "intervention") {"divine" 1, "an" 1}]
+  ;;     [("pines" "weeping") {"the" 1}])
+
+  (def structures (nippy/thaw-from-file "resources/structure-freqs/0.nip"))
+  (take 100 (reverse (sort-by second structures)))
+  (let [documents (->> "dark-corpus"
+                       io/file
+                       file-seq
+                       (remove #(.isDirectory %))
+                       (drop 5000)
+                       (take 10000))
+        chunk-size 5000]
+    (chunked-writing-pos-path-freqs
+     documents
+     chunk-size))
+
+  (def t1 (nippy/thaw-from-file "resources/pos-freqs/0.nip"))
+  (take 10 t1)
+  (let [path-freqs (pos-paths->pos-freqs t1)]
+    (take 10 path-freqs))

-  (time (def example-structures (dark-structures)))
+  (take 5 t1)
+  (take 10 (reverse (sort-by #(count (second %)) t1)))
+  (def t3 (nippy/thaw-from-file "resources/pos-freqs/400.nip"))
+  (def t2 (nippy/thaw-from-file "resources/pos-freqs/800.nip"))
+  (count (merge-with + t1 t2 t3))
+  ;; => 2353
+  (count t3)
+  ;; => 1013
+  (count t1)
+  ;; => 871
+  (count t2)
+  ;; => 676  (def corpus
+  (->> "dark-corpus"
+       io/file
+       file-seq
+       (remove #(.isDirectory %))))
+
+  (time
+   (def example-pos-freqs
+     (->> corpus
+          (take 100)
+          pos-path-freqs
+          (reduce
+           (fn [a v]
+             (nlp/deep-merge-with + a v))
+           {}))))
+
+  (time
+   (def example-structures
+     (->> corpus
+          (take 100)
+          structures
+          (reduce
+           (fn [a v]
+             (merge-with + a v))
+           {}))))

  (let [structure (weighted-selection-from-map example-structures)]
    (repeatedly
@ -262,6 +367,7 @@
             example-pos-freqs)
            nlp/leaf-nodes
            (string/join " ")))))
+
  ;; => ("then get your life"
  ;;     "sometimes lie my hand"
  ;;     "still become your chapter"
--- a/resources/pos-freqs/0.nip
+++ b/resources/pos-freqs/0.nip
--- a/resources/structure-freqs/0.nip
+++ b/resources/structure-freqs/0.nip
--- a/resources/structure-freqs/10000.nip
+++ b/resources/structure-freqs/10000.nip
--- a/resources/structure-freqs/5000.nip
+++ b/resources/structure-freqs/5000.nip
--- a/src/com/owoga/prhyme/corpus/db.clj
+++ b/src/com/owoga/prhyme/corpus/db.clj
@ -0,0 +1,2 @@
+(ns com.owoga.prhyme.corpus.db
+  (:require [integrant.core :as ig]))
--- a/src/com/owoga/prhyme/generation/weighted_selection.clj
+++ b/src/com/owoga/prhyme/generation/weighted_selection.clj
@ -74,6 +74,56 @@
             target
             result]))))))

+(defn adjust-for-markov-simple-structure
+  "Like the other adjust-for-markov, but the structure is simply
+  k/v pairs (element/weight).
+ 
+  Works with a markov data structure that was generated taking into account
+  sentence boundaries (represented as nils).
+
+  A key in the markov structure of '(nil) would have a value that represents all
+  words that have occurred in position 1 of the raw data.
+
+  A key of '(nil \"foo\") would have a value that represents all words
+  that occurred in position 2 following \"foo\"
+
+  Automatically detects the order (window size) of the markov model. Does this
+  by counting the length of the first key.
+  "
+  [markov percent]
+  (let [markov-n (count (first (first markov)))]
+    (fn [[words target result]]
+      (let [key (let [k (map :normalized-word (take markov-n result))]
+                  (reverse
+                   (if (> markov-n (count k))
+                     (concat k (repeat (- markov-n (count k)) nil))
+                     k)))
+            markov-options (markov key)
+            markov-option-avg (/ (apply + (vals markov-options))
+                                 (max 1 (count markov-options)))]
+        (if (nil? markov-options)
+          [words target result]
+          (let [[markovs non-markovs]
+                ((juxt filter remove)
+                 #(markov-options (:normalized-word %))
+                 words)
+                weight-non-markovs (apply + (map :weight non-markovs))
+                target-weight-markovs (- (/ weight-non-markovs (- 1 percent))
+                                         weight-non-markovs)
+                count-markovs (count markovs)
+                adjustment-markovs (if (= 0 count-markovs) 1 (/ target-weight-markovs count-markovs))]
+            [(concat
+              (map
+               (fn [m]
+                 (let [option (markov-options (:normalized-word m))]
+                   (as-> m m
+                     (assoc m :weight (* (/ option markov-option-avg) adjustment-markovs (:weight m)))
+                     (assoc m :adjustment-for-markov (* (/ option markov-option-avg) adjustment-markovs)))))
+               markovs)
+              non-markovs)
+             target
+             result]))))))
+
 (defn adjust-for-rhymes
  "Weights words by whether or not they rhyme.
  Once result contains something, becomes inactive. If you want to try to rhyme
--- a/src/com/owoga/prhyme/nlg/prhyme_nlg.clj
+++ b/src/com/owoga/prhyme/nlg/prhyme_nlg.clj
@ -0,0 +1,332 @@
+(ns com.owoga.prhyme.nlg.prhyme-nlg
+  (:require [clojure.zip :as zip]
+            [clojure.string :as string]
+            [taoensso.timbre :as timbre]
+            [examples.core :as examples]
+            [taoensso.nippy :as nippy]
+            [com.owoga.prhyme.nlp.core :as nlp]
+            [com.owoga.prhyme.util.weighted-rand :as weighted-rand]
+            [clojure.set :as set]))
+
+(defn update-values [m f & args]
+  (reduce
+   (fn [acc [k v]]
+     (assoc acc k (apply f v args)))
+   {}
+   m))
+
+(defn generate
+  [pos-path->word-freqs
+   pos->word-freqs
+   target-parse-tree]
+  (loop [parse-zipper (zip/seq-zip target-parse-tree)]
+    (cond
+      (zip/end? parse-zipper) (zip/root parse-zipper)
+
+      (zip/branch? parse-zipper)
+      (recur (zip/next parse-zipper))
+
+      (string? (zip/node parse-zipper))
+      (recur (zip/next parse-zipper))
+
+      (and (symbol? (zip/node parse-zipper))
+           (or (pos->word-freqs (zip/node parse-zipper))
+               (pos-path->word-freqs (seq (map first (zip/path parse-zipper))))))
+      (let [target-path (seq (map first (zip/path parse-zipper)))
+            target-pos (zip/node parse-zipper)
+            pos-path-word (pos-path->word-freqs target-path)
+            pos-word (pos->word-freqs target-pos)]
+        (timbre/info "Choosing POS for" target-path)
+        (let [selection (weighted-rand/weighted-selection-from-map
+                         (merge-with +
+                          (update-values pos-path-word * 2)
+                          pos-word))]
+          (timbre/info "from" (take 5
+                                    (merge-with +
+                                     (update-values pos-path-word * 2)
+                                     pos-word)))
+          (timbre/info "Chose " selection)
+          (recur
+           (-> parse-zipper
+               zip/up
+               (#(zip/replace % (list (zip/node (zip/down %)) selection)))
+               zip/next
+               zip/next))))
+
+      :else
+      (recur (zip/next parse-zipper)))))
+
+(defn next-word [zipper]
+  (->> zipper
+      nlp/iter-zip
+      (filter #(string? (zip/node %)))
+      first
+      (#(if (nil? %) :end (zip/node %)))))
+
+(defn next-two-words [nodes]
+  (timbre/info
+   (apply list (->> nodes
+                    (map zip/node))))
+  (->> nodes
+       (filter #(string? (zip/node %)))
+       (take 2)
+       (map #(if (nil? %) :end (zip/node %)))))
+
+(comment
+  (let [zipper (zip/seq-zip '(TOP (S (NN "Eric") (VBZ "is") (JJ "testing"))))]
+    (->> zipper
+         nlp/iter-zip
+         reverse
+         next-two-words))
+
+  )
+
+(defn set-last [zipper f]
+  (let [last-node (->> zipper
+                       (iterate zip/next)
+                       (take-while (complement zip/end?))
+                       last
+                       zip/prev)]
+    (-> last-node
+        (zip/replace (f last-node)))))
+
+(comment
+  (let [zipper (zip/seq-zip '(TOP (S (NN) (VBZ "is") (JJ))))]
+    (-> zipper
+        (set-last (fn [node] (list (zip/node (zip/next node)) "bad")))
+        zip/root))
+
+  )
+
+(defn generate-with-markov
+  [pos-path->word-freqs
+   pos->word-freqs
+   target-parse-tree
+   markov]
+  (loop [parse-zipper (zip/seq-zip target-parse-tree)]
+    (cond
+      (zip/end? parse-zipper) (zip/root parse-zipper)
+
+      (zip/branch? parse-zipper)
+      (recur (zip/next parse-zipper))
+
+      (string? (zip/node parse-zipper))
+      (recur (zip/next parse-zipper))
+
+      (and (symbol? (zip/node parse-zipper))
+           (or (pos->word-freqs (zip/node parse-zipper))
+               (pos-path->word-freqs (seq (map first (zip/path parse-zipper))))))
+      (let [target-path (seq (map first (zip/path parse-zipper)))
+            target-pos (zip/node parse-zipper)
+            pos-path-word (pos-path->word-freqs target-path)
+            pos-word (pos->word-freqs target-pos)
+            markov-options (markov (reverse (next-two-words (nlp/iter-zip
+                                                             parse-zipper
+                                                             zip/prev
+                                                             nil?))))]
+        (timbre/info "Markov options are"
+                     (apply list (next-two-words (nlp/iter-zip
+                                                  parse-zipper
+                                                  zip/prev
+                                                  nil?)))
+                     (apply list (take 3 markov-options)))
+        (timbre/info "Choosing POS for" target-path)
+        (let [selection (weighted-rand/weighted-selection-from-map
+                         (merge-with
+                          *
+                          (update-values markov-options * 10)
+                          (update-values pos-path-word * 2)
+                          pos-word))]
+          (timbre/info "from" (apply
+                               list
+                               (take
+                                5
+                                (merge-with
+                                 *
+                                 (update-values markov-options * 10)
+                                 (update-values pos-path-word * 2)
+                                 pos-word))))
+          (timbre/info "Chose " selection)
+          (recur
+           (-> parse-zipper
+               zip/up
+               (#(zip/replace % (list (zip/node (zip/down %)) selection)))
+               zip/next
+               zip/next))))
+
+      :else
+      (recur (zip/next parse-zipper)))))
+
+(defn generate-with-markov-with-custom-progression
+  "Sams as above, but with next/prev and stop fns"
+  [next
+   prev
+   next-stop?
+   prev-stop?
+   pos-path->word-freqs
+   pos->word-freqs
+   parse-zipper
+   markov]
+  (loop [parse-zipper parse-zipper]
+    (cond
+      (nil? (next parse-zipper)) (zip/root parse-zipper)
+     
+      (next-stop? parse-zipper) (zip/root parse-zipper)
+
+      (zip/branch? parse-zipper)
+      (recur (next parse-zipper))
+
+      (string? (zip/node parse-zipper))
+      (recur (next parse-zipper))
+
+      (and (symbol? (zip/node parse-zipper))
+           (or (pos->word-freqs (zip/node parse-zipper))
+               (pos-path->word-freqs (seq (map first (zip/path parse-zipper))))))
+      (let [target-path (seq (map first (zip/path parse-zipper)))
+            target-pos (zip/node parse-zipper)
+            pos-path-word (pos-path->word-freqs target-path)
+            pos-word (pos->word-freqs target-pos)
+            pos-map (merge-with
+                     (fn [a b] (* 1.5 (+ a b)))
+                     pos-path-word
+                     pos-word)
+            markov-options (markov (reverse
+                                    (next-two-words (nlp/iter-zip
+                                                     parse-zipper
+                                                     prev
+                                                     prev-stop?))))
+            selection-possibilities (merge-with
+                                     (fn [a b]
+                                       (let [max-pos (apply max (vals pos-map))]
+                                         (+ a b max-pos)))
+                                     pos-map
+                                     markov-options)]
+        (timbre/info "Markov options are"
+                     (apply list (next-two-words (nlp/iter-zip
+                                                  parse-zipper
+                                                  prev
+                                                  prev-stop?)))
+                     (apply list (take 10 markov-options)))
+        (timbre/info "Choosing POS for" target-path)
+        (let [selection (weighted-rand/weighted-selection-from-map
+                         selection-possibilities)]
+          (timbre/info
+           "Most likely selection possibilities"
+           (apply list (take 5 (reverse (sort-by second selection-possibilities)))))
+          (timbre/info "Chose " selection)
+          (recur
+           (-> parse-zipper
+               zip/up
+               (#(zip/replace % (list (zip/node (zip/down %)) selection)))
+               zip/down
+               next
+               next))))
+
+      :else
+      (recur (next parse-zipper)))))
+
+(comment
+  (let [structure '(TOP (S (NP (DT) (JJ) (NN))
+                           (VP (VBZ))
+                           (NP (DT) (JJ) (NN))))
+        structure (-> structure
+                      zip/seq-zip
+                      nlp/iter-zip
+                      last)
+        pos-freqs (examples/pos-paths->pos-freqs
+                   examples/t1)]
+    (repeatedly
+     10
+     (fn []
+       (->> (generate-with-markov-with-custom-progression
+             zip/prev
+             zip/next
+             nil?
+             zip/end?
+             examples/t1
+             pos-freqs
+             structure
+             examples/darkov-2)
+            nlp/leaf-nodes
+            (string/join " ")))))
+
+  (timbre/set-level! :info)
+  (timbre/set-level! :error)
+
+  (let [pos-path->word-freqs
+        {'(S N) {"Eric" 1 "Edgar" 2}
+         '(S V) {"tests" 2 "runs" 1}}
+        pos->word-freqs
+        {'N {"Edward" 1}
+         'V {"breaks" 1}}
+        target-parse-tree
+        '(S (N) (V))]
+    (-> (generate
+         pos-path->word-freqs
+         pos->word-freqs
+         target-parse-tree)))
+  (time (def example-pos-freqs examples/example-pos-freqs))
+  (nippy/thaw)
+  (nippy/freeze-to-file "resources/1000-pos-path-freqs.nip" example-pos-freqs)
+
+  (time (def example-structures examples/example-structures))
+  (weighted-rand/weighted-selection-from-map
+   example-structures)
+
+
+
+  (take 5 examples/t2)
+  (let [structure '(TOP (S (NP (DT) (JJ) (NN))
+                           (VP (VBZ))
+                           (NP (DT) (JJ) (NN))))
+        structure (-> structure
+                      zip/seq-zip
+                      nlp/iter-zip
+                      last)
+        pos-freqs (examples/pos-paths->pos-freqs
+                   examples/t1)]
+    (repeatedly
+     10
+     (fn []
+       (->> (generate-with-markov-with-custom-progression
+             zip/prev
+             zip/next
+             nil?
+             zip/end?
+             examples/t1
+             pos-freqs
+             structure
+             examples/darkov-2)
+            nlp/leaf-nodes
+            (string/join " ")))))
+
+  (repeatedly
+   10
+   (fn []
+     (let [structure (weighted-rand/weighted-selection-from-map
+                      (->> examples/t2
+                           (sort-by second)
+                           (reverse)
+                           (take 20)))
+           structure (-> structure
+                         zip/seq-zip
+                         nlp/iter-zip
+                         last)
+           pos-freqs (examples/pos-paths->pos-freqs
+                      examples/t1)]
+       (repeatedly
+        10
+        (fn []
+          (->> (generate-with-markov-with-custom-progression
+                zip/prev
+                zip/next
+                nil?
+                zip/end?
+                examples/t1
+                pos-freqs
+                structure
+                examples/darkov-2)
+               nlp/leaf-nodes
+               (string/join " ")))))))
+
+  )
--- a/src/com/owoga/prhyme/nlp/core.clj
+++ b/src/com/owoga/prhyme/nlp/core.clj
@ -49,6 +49,14 @@
         (top-k-sequences prhyme-pos-tagger (tokenize phrase))))
  ;; => ([["DT" "NN" "VBZ" "."] (0.9758878 0.93964833 0.7375927 0.95285994)]
  ;;     [["DT" "VBG" "VBZ" "."] (0.9758878 0.03690145 0.27251 0.9286113)])
+  (let [phrase "I am feeling the heat."]
+    (map (juxt #(.getOutcomes %)
+               #(map float (.getProbs %)))
+         (top-k-sequences prhyme-pos-tagger (tokenize phrase))))
+;; => ([["PRP" "VBP" "VBG" "DT" "NN" "."]
+;;      (0.9800125 0.9771906 0.9722519 0.9709216 0.9941198 0.98704773)]
+;;     [["PRP" "VBP" "NN" "DT" "NN" "."]
+;;      (0.9800125 0.9771906 0.01259052 0.76849043 0.99447477 0.98704773)])
  )

 ;;;; Custom parser to get access to top N parses
@ -84,7 +92,12 @@
    (apply map vector ((juxt parse-strs parse-probs) parses))))

 (comment
-  (parse-top-n "." 3)
+  (let [phrase "The feeling hurts."]
+    (->> phrase
+        tokenize
+        (string/join " ")
+        (#(parse-top-n % 10))))
+  (Math/pow Math/E -0.96)
  )

 (defn deep-merge-with [f & maps]
@ -244,10 +257,15 @@
  ;;         (. (".")))))))
  )

-(defn iter-zip [zipper]
-  (->> zipper
-       (iterate zip/next)
-       (take-while (complement zip/end?))))
+(defn iter-zip
+  ([zipper]
+   (->> zipper
+        (iterate zip/next)
+        (take-while (complement zip/end?))))
+  ([zipper n s]
+   (->> zipper
+        (iterate n)
+        (take-while (complement s)))))

 (defn iter-nodes [zipper]
  (->> zipper
--- a/src/com/owoga/prhyme/util/weighted_rand.clj
+++ b/src/com/owoga/prhyme/util/weighted_rand.clj
@ -97,3 +97,6 @@
                             (frequencies chosen))
                       (range (count ws)))]
    accuracy))
+
+(defn weighted-selection-from-map [m]
+  (first (weighted-selection second (seq m))))