diff --git a/deps.edn b/deps.edn index ce3aa9f..472b97f 100644 --- a/deps.edn +++ b/deps.edn @@ -14,5 +14,5 @@ org.clojure/data.fressian {:mvn/version "1.0.0"} com.taoensso/nippy {:mvn/version "3.0.0"} com.taoensso/timbre {:mvn/version "4.10.0"}} - :aliases {:dev {:extra-paths ["test" "examples"] + :aliases {:dev {:extra-paths ["test" "examples" "dev"] :extra-deps {}}}} diff --git a/dev/examples/core.clj b/dev/examples/core.clj index 98dc696..f388b1e 100644 --- a/dev/examples/core.clj +++ b/dev/examples/core.clj @@ -2,6 +2,8 @@ (:require [clojure.string :as string] [clojure.set] [clojure.java.io :as io] + [taoensso.nippy :as nippy] + [taoensso.timbre :as timbre] [com.owoga.prhyme.frp :as frp] [com.owoga.prhyme.util :as util] [com.owoga.prhyme.core :as prhyme] @@ -205,53 +207,156 @@ (remove #(some string/blank? %)) (map #(string/join " " %)))))) -(defn dark-pos-freqs [] - (let [directory "dark-corpus"] - (->> (file-seq (io/file directory)) - (remove #(.isDirectory %)) - (take 1000) - (map slurp) - (map util/clean-text) - (filter dict/english?) - (map #(string/split % #"\n+")) - (map (remove-sentences-with-words-not-in-dictionary dict/popular)) - (remove empty?) - (remove #(some empty? %)) - (map nlp/treebank-zipper) - (map nlp/leaf-pos-path-word-freqs) - (apply nlp/deep-merge-with +)))) - -(defn dark-structures [] - (let [directory "dark-corpus"] - (->> (file-seq (io/file directory)) - (remove #(.isDirectory %)) - (take 500) - (map slurp) - (map util/clean-text) - (filter dict/english?) - (map #(string/split % #"\n+")) - (map (remove-sentences-with-words-not-in-dictionary dict/popular)) - (remove empty?) - (remove #(some empty? %)) - (map nlp/parse-to-simple-tree) - (map nlp/parse-tree-sans-leaf-words) - (map - (fn [lines] - (map - (fn [line] - (hash-map line 1)) - lines))) - (map (partial merge-with +)) - flatten - (apply merge-with +)))) +(defn pos-path-freqs + "Seq of pos-path frequencies of each document. + To reduce, deep merge with +." + [documents] + (->> documents + (map slurp) + (map util/clean-text) + (filter dict/english?) + (map #(string/split % #"\n+")) + (map (remove-sentences-with-words-not-in-dictionary dict/popular)) + (remove empty?) + (remove #(some empty? %)) + (map nlp/treebank-zipper) + (map nlp/leaf-pos-path-word-freqs))) + +(defn structures + "Seq of structure frequencies of each document. + To reduce, merge with +." + [documents] + (->> documents + (map slurp) + (map util/clean-text) + (filter dict/english?) + (map #(string/split % #"\n+")) + (map (remove-sentences-with-words-not-in-dictionary dict/popular)) + (remove empty?) + (remove #(some empty? %)) + (map nlp/parse-to-simple-tree) + (map nlp/parse-tree-sans-leaf-words) + (map + (fn [lines] + (map #(hash-map % 1) lines))) + (map (partial apply merge-with +)))) (defn weighted-selection-from-map [m] (first (weighted-rand/weighted-selection second (seq m)))) +(defn chunked-writing-pos-path-freqs + [documents chunk-size] + (let [chunks (range 0 (count documents) chunk-size)] + (run! + (fn [chunk] + (let [structure (->> documents + (drop chunk) + (take chunk-size) + pos-path-freqs + (reduce + (fn [a v] + (nlp/deep-merge-with + a v)) + {})) + filepath (format "resources/pos-freqs/%s.nip" chunk)] + (timbre/info (format "Writing to %s." filepath)) + (nippy/freeze-to-file filepath structure))) + chunks))) + +(defn chunked-writing-structure-freqs + [documents chunk-size] + (let [chunks (range 0 (count documents) chunk-size)] + (run! + (fn [chunk] + (let [structure (->> documents + (drop chunk) + (take chunk-size) + structures + (reduce + (fn [a v] + (nlp/deep-merge-with + a v)) + {})) + filepath (format "resources/structure-freqs/%s.nip" chunk)] + (timbre/info (format "Writing to %s." filepath)) + (nippy/freeze-to-file filepath structure))) + chunks))) + +(defn pos-paths->pos-freqs + "Convert pos paths, like {(TOP S NP NN) {'test' 5 'car' 3 ,,,}} + into a top-level pos freq map like {NN {'test' 25 'car' 8 ,,,}}.is" + [pos-paths] + (->> pos-paths + (map + (fn [[k v]] + (hash-map (last k) v))) + (reduce + (fn [a v] + (nlp/deep-merge-with + a v)) + {}))) + (comment - (time (def example-pos-freqs (dark-pos-freqs))) + (take 5 darklyrics/darklyrics-markov-2) + (darklyrics/darklyrics-markov-2 '("time" "is")) + (def darkov-2 darklyrics/darklyrics-markov-2) + ;; => ([("profanity" "unholy") {"its" 2}] + ;; [("ants" "triumph") {nil 1}] + ;; [("hiding" "our") {"of" 1, "expose" 3, "above" 1}] + ;; [("won't" "intervention") {"divine" 1, "an" 1}] + ;; [("pines" "weeping") {"the" 1}]) + + (def structures (nippy/thaw-from-file "resources/structure-freqs/0.nip")) + (take 100 (reverse (sort-by second structures))) + (let [documents (->> "dark-corpus" + io/file + file-seq + (remove #(.isDirectory %)) + (drop 5000) + (take 10000)) + chunk-size 5000] + (chunked-writing-pos-path-freqs + documents + chunk-size)) + + (def t1 (nippy/thaw-from-file "resources/pos-freqs/0.nip")) + (take 10 t1) + (let [path-freqs (pos-paths->pos-freqs t1)] + (take 10 path-freqs)) - (time (def example-structures (dark-structures))) + (take 5 t1) + (take 10 (reverse (sort-by #(count (second %)) t1))) + (def t3 (nippy/thaw-from-file "resources/pos-freqs/400.nip")) + (def t2 (nippy/thaw-from-file "resources/pos-freqs/800.nip")) + (count (merge-with + t1 t2 t3)) + ;; => 2353 + (count t3) + ;; => 1013 + (count t1) + ;; => 871 + (count t2) + ;; => 676 (def corpus + (->> "dark-corpus" + io/file + file-seq + (remove #(.isDirectory %)))) + + (time + (def example-pos-freqs + (->> corpus + (take 100) + pos-path-freqs + (reduce + (fn [a v] + (nlp/deep-merge-with + a v)) + {})))) + + (time + (def example-structures + (->> corpus + (take 100) + structures + (reduce + (fn [a v] + (merge-with + a v)) + {})))) (let [structure (weighted-selection-from-map example-structures)] (repeatedly @@ -262,6 +367,7 @@ example-pos-freqs) nlp/leaf-nodes (string/join " "))))) + ;; => ("then get your life" ;; "sometimes lie my hand" ;; "still become your chapter" diff --git a/resources/pos-freqs/0.nip b/resources/pos-freqs/0.nip new file mode 100644 index 0000000..2f4e014 Binary files /dev/null and b/resources/pos-freqs/0.nip differ diff --git a/resources/structure-freqs/0.nip b/resources/structure-freqs/0.nip new file mode 100644 index 0000000..7e6f484 Binary files /dev/null and b/resources/structure-freqs/0.nip differ diff --git a/resources/structure-freqs/10000.nip b/resources/structure-freqs/10000.nip new file mode 100644 index 0000000..a73409e Binary files /dev/null and b/resources/structure-freqs/10000.nip differ diff --git a/resources/structure-freqs/5000.nip b/resources/structure-freqs/5000.nip new file mode 100644 index 0000000..902521b Binary files /dev/null and b/resources/structure-freqs/5000.nip differ diff --git a/src/com/owoga/prhyme/corpus/db.clj b/src/com/owoga/prhyme/corpus/db.clj new file mode 100644 index 0000000..b63c25e --- /dev/null +++ b/src/com/owoga/prhyme/corpus/db.clj @@ -0,0 +1,2 @@ +(ns com.owoga.prhyme.corpus.db + (:require [integrant.core :as ig])) diff --git a/src/com/owoga/prhyme/generation/weighted_selection.clj b/src/com/owoga/prhyme/generation/weighted_selection.clj index d6ae47e..1bf74e6 100644 --- a/src/com/owoga/prhyme/generation/weighted_selection.clj +++ b/src/com/owoga/prhyme/generation/weighted_selection.clj @@ -74,6 +74,56 @@ target result])))))) +(defn adjust-for-markov-simple-structure + "Like the other adjust-for-markov, but the structure is simply + k/v pairs (element/weight). + + Works with a markov data structure that was generated taking into account + sentence boundaries (represented as nils). + + A key in the markov structure of '(nil) would have a value that represents all + words that have occurred in position 1 of the raw data. + + A key of '(nil \"foo\") would have a value that represents all words + that occurred in position 2 following \"foo\" + + Automatically detects the order (window size) of the markov model. Does this + by counting the length of the first key. + " + [markov percent] + (let [markov-n (count (first (first markov)))] + (fn [[words target result]] + (let [key (let [k (map :normalized-word (take markov-n result))] + (reverse + (if (> markov-n (count k)) + (concat k (repeat (- markov-n (count k)) nil)) + k))) + markov-options (markov key) + markov-option-avg (/ (apply + (vals markov-options)) + (max 1 (count markov-options)))] + (if (nil? markov-options) + [words target result] + (let [[markovs non-markovs] + ((juxt filter remove) + #(markov-options (:normalized-word %)) + words) + weight-non-markovs (apply + (map :weight non-markovs)) + target-weight-markovs (- (/ weight-non-markovs (- 1 percent)) + weight-non-markovs) + count-markovs (count markovs) + adjustment-markovs (if (= 0 count-markovs) 1 (/ target-weight-markovs count-markovs))] + [(concat + (map + (fn [m] + (let [option (markov-options (:normalized-word m))] + (as-> m m + (assoc m :weight (* (/ option markov-option-avg) adjustment-markovs (:weight m))) + (assoc m :adjustment-for-markov (* (/ option markov-option-avg) adjustment-markovs))))) + markovs) + non-markovs) + target + result])))))) + (defn adjust-for-rhymes "Weights words by whether or not they rhyme. Once result contains something, becomes inactive. If you want to try to rhyme diff --git a/src/com/owoga/prhyme/nlg/prhyme_nlg.clj b/src/com/owoga/prhyme/nlg/prhyme_nlg.clj new file mode 100644 index 0000000..d67ab61 --- /dev/null +++ b/src/com/owoga/prhyme/nlg/prhyme_nlg.clj @@ -0,0 +1,332 @@ +(ns com.owoga.prhyme.nlg.prhyme-nlg + (:require [clojure.zip :as zip] + [clojure.string :as string] + [taoensso.timbre :as timbre] + [examples.core :as examples] + [taoensso.nippy :as nippy] + [com.owoga.prhyme.nlp.core :as nlp] + [com.owoga.prhyme.util.weighted-rand :as weighted-rand] + [clojure.set :as set])) + +(defn update-values [m f & args] + (reduce + (fn [acc [k v]] + (assoc acc k (apply f v args))) + {} + m)) + +(defn generate + [pos-path->word-freqs + pos->word-freqs + target-parse-tree] + (loop [parse-zipper (zip/seq-zip target-parse-tree)] + (cond + (zip/end? parse-zipper) (zip/root parse-zipper) + + (zip/branch? parse-zipper) + (recur (zip/next parse-zipper)) + + (string? (zip/node parse-zipper)) + (recur (zip/next parse-zipper)) + + (and (symbol? (zip/node parse-zipper)) + (or (pos->word-freqs (zip/node parse-zipper)) + (pos-path->word-freqs (seq (map first (zip/path parse-zipper)))))) + (let [target-path (seq (map first (zip/path parse-zipper))) + target-pos (zip/node parse-zipper) + pos-path-word (pos-path->word-freqs target-path) + pos-word (pos->word-freqs target-pos)] + (timbre/info "Choosing POS for" target-path) + (let [selection (weighted-rand/weighted-selection-from-map + (merge-with + + (update-values pos-path-word * 2) + pos-word))] + (timbre/info "from" (take 5 + (merge-with + + (update-values pos-path-word * 2) + pos-word))) + (timbre/info "Chose " selection) + (recur + (-> parse-zipper + zip/up + (#(zip/replace % (list (zip/node (zip/down %)) selection))) + zip/next + zip/next)))) + + :else + (recur (zip/next parse-zipper))))) + +(defn next-word [zipper] + (->> zipper + nlp/iter-zip + (filter #(string? (zip/node %))) + first + (#(if (nil? %) :end (zip/node %))))) + +(defn next-two-words [nodes] + (timbre/info + (apply list (->> nodes + (map zip/node)))) + (->> nodes + (filter #(string? (zip/node %))) + (take 2) + (map #(if (nil? %) :end (zip/node %))))) + +(comment + (let [zipper (zip/seq-zip '(TOP (S (NN "Eric") (VBZ "is") (JJ "testing"))))] + (->> zipper + nlp/iter-zip + reverse + next-two-words)) + + ) + +(defn set-last [zipper f] + (let [last-node (->> zipper + (iterate zip/next) + (take-while (complement zip/end?)) + last + zip/prev)] + (-> last-node + (zip/replace (f last-node))))) + +(comment + (let [zipper (zip/seq-zip '(TOP (S (NN) (VBZ "is") (JJ))))] + (-> zipper + (set-last (fn [node] (list (zip/node (zip/next node)) "bad"))) + zip/root)) + + ) + +(defn generate-with-markov + [pos-path->word-freqs + pos->word-freqs + target-parse-tree + markov] + (loop [parse-zipper (zip/seq-zip target-parse-tree)] + (cond + (zip/end? parse-zipper) (zip/root parse-zipper) + + (zip/branch? parse-zipper) + (recur (zip/next parse-zipper)) + + (string? (zip/node parse-zipper)) + (recur (zip/next parse-zipper)) + + (and (symbol? (zip/node parse-zipper)) + (or (pos->word-freqs (zip/node parse-zipper)) + (pos-path->word-freqs (seq (map first (zip/path parse-zipper)))))) + (let [target-path (seq (map first (zip/path parse-zipper))) + target-pos (zip/node parse-zipper) + pos-path-word (pos-path->word-freqs target-path) + pos-word (pos->word-freqs target-pos) + markov-options (markov (reverse (next-two-words (nlp/iter-zip + parse-zipper + zip/prev + nil?))))] + (timbre/info "Markov options are" + (apply list (next-two-words (nlp/iter-zip + parse-zipper + zip/prev + nil?))) + (apply list (take 3 markov-options))) + (timbre/info "Choosing POS for" target-path) + (let [selection (weighted-rand/weighted-selection-from-map + (merge-with + * + (update-values markov-options * 10) + (update-values pos-path-word * 2) + pos-word))] + (timbre/info "from" (apply + list + (take + 5 + (merge-with + * + (update-values markov-options * 10) + (update-values pos-path-word * 2) + pos-word)))) + (timbre/info "Chose " selection) + (recur + (-> parse-zipper + zip/up + (#(zip/replace % (list (zip/node (zip/down %)) selection))) + zip/next + zip/next)))) + + :else + (recur (zip/next parse-zipper))))) + +(defn generate-with-markov-with-custom-progression + "Sams as above, but with next/prev and stop fns" + [next + prev + next-stop? + prev-stop? + pos-path->word-freqs + pos->word-freqs + parse-zipper + markov] + (loop [parse-zipper parse-zipper] + (cond + (nil? (next parse-zipper)) (zip/root parse-zipper) + + (next-stop? parse-zipper) (zip/root parse-zipper) + + (zip/branch? parse-zipper) + (recur (next parse-zipper)) + + (string? (zip/node parse-zipper)) + (recur (next parse-zipper)) + + (and (symbol? (zip/node parse-zipper)) + (or (pos->word-freqs (zip/node parse-zipper)) + (pos-path->word-freqs (seq (map first (zip/path parse-zipper)))))) + (let [target-path (seq (map first (zip/path parse-zipper))) + target-pos (zip/node parse-zipper) + pos-path-word (pos-path->word-freqs target-path) + pos-word (pos->word-freqs target-pos) + pos-map (merge-with + (fn [a b] (* 1.5 (+ a b))) + pos-path-word + pos-word) + markov-options (markov (reverse + (next-two-words (nlp/iter-zip + parse-zipper + prev + prev-stop?)))) + selection-possibilities (merge-with + (fn [a b] + (let [max-pos (apply max (vals pos-map))] + (+ a b max-pos))) + pos-map + markov-options)] + (timbre/info "Markov options are" + (apply list (next-two-words (nlp/iter-zip + parse-zipper + prev + prev-stop?))) + (apply list (take 10 markov-options))) + (timbre/info "Choosing POS for" target-path) + (let [selection (weighted-rand/weighted-selection-from-map + selection-possibilities)] + (timbre/info + "Most likely selection possibilities" + (apply list (take 5 (reverse (sort-by second selection-possibilities))))) + (timbre/info "Chose " selection) + (recur + (-> parse-zipper + zip/up + (#(zip/replace % (list (zip/node (zip/down %)) selection))) + zip/down + next + next)))) + + :else + (recur (next parse-zipper))))) + +(comment + (let [structure '(TOP (S (NP (DT) (JJ) (NN)) + (VP (VBZ)) + (NP (DT) (JJ) (NN)))) + structure (-> structure + zip/seq-zip + nlp/iter-zip + last) + pos-freqs (examples/pos-paths->pos-freqs + examples/t1)] + (repeatedly + 10 + (fn [] + (->> (generate-with-markov-with-custom-progression + zip/prev + zip/next + nil? + zip/end? + examples/t1 + pos-freqs + structure + examples/darkov-2) + nlp/leaf-nodes + (string/join " "))))) + + (timbre/set-level! :info) + (timbre/set-level! :error) + + (let [pos-path->word-freqs + {'(S N) {"Eric" 1 "Edgar" 2} + '(S V) {"tests" 2 "runs" 1}} + pos->word-freqs + {'N {"Edward" 1} + 'V {"breaks" 1}} + target-parse-tree + '(S (N) (V))] + (-> (generate + pos-path->word-freqs + pos->word-freqs + target-parse-tree))) + (time (def example-pos-freqs examples/example-pos-freqs)) + (nippy/thaw) + (nippy/freeze-to-file "resources/1000-pos-path-freqs.nip" example-pos-freqs) + + (time (def example-structures examples/example-structures)) + (weighted-rand/weighted-selection-from-map + example-structures) + + + + (take 5 examples/t2) + (let [structure '(TOP (S (NP (DT) (JJ) (NN)) + (VP (VBZ)) + (NP (DT) (JJ) (NN)))) + structure (-> structure + zip/seq-zip + nlp/iter-zip + last) + pos-freqs (examples/pos-paths->pos-freqs + examples/t1)] + (repeatedly + 10 + (fn [] + (->> (generate-with-markov-with-custom-progression + zip/prev + zip/next + nil? + zip/end? + examples/t1 + pos-freqs + structure + examples/darkov-2) + nlp/leaf-nodes + (string/join " "))))) + + (repeatedly + 10 + (fn [] + (let [structure (weighted-rand/weighted-selection-from-map + (->> examples/t2 + (sort-by second) + (reverse) + (take 20))) + structure (-> structure + zip/seq-zip + nlp/iter-zip + last) + pos-freqs (examples/pos-paths->pos-freqs + examples/t1)] + (repeatedly + 10 + (fn [] + (->> (generate-with-markov-with-custom-progression + zip/prev + zip/next + nil? + zip/end? + examples/t1 + pos-freqs + structure + examples/darkov-2) + nlp/leaf-nodes + (string/join " "))))))) + + ) diff --git a/src/com/owoga/prhyme/nlp/core.clj b/src/com/owoga/prhyme/nlp/core.clj index 26736f0..54f33d0 100644 --- a/src/com/owoga/prhyme/nlp/core.clj +++ b/src/com/owoga/prhyme/nlp/core.clj @@ -49,6 +49,14 @@ (top-k-sequences prhyme-pos-tagger (tokenize phrase)))) ;; => ([["DT" "NN" "VBZ" "."] (0.9758878 0.93964833 0.7375927 0.95285994)] ;; [["DT" "VBG" "VBZ" "."] (0.9758878 0.03690145 0.27251 0.9286113)]) + (let [phrase "I am feeling the heat."] + (map (juxt #(.getOutcomes %) + #(map float (.getProbs %))) + (top-k-sequences prhyme-pos-tagger (tokenize phrase)))) +;; => ([["PRP" "VBP" "VBG" "DT" "NN" "."] +;; (0.9800125 0.9771906 0.9722519 0.9709216 0.9941198 0.98704773)] +;; [["PRP" "VBP" "NN" "DT" "NN" "."] +;; (0.9800125 0.9771906 0.01259052 0.76849043 0.99447477 0.98704773)]) ) ;;;; Custom parser to get access to top N parses @@ -84,7 +92,12 @@ (apply map vector ((juxt parse-strs parse-probs) parses)))) (comment - (parse-top-n "." 3) + (let [phrase "The feeling hurts."] + (->> phrase + tokenize + (string/join " ") + (#(parse-top-n % 10)))) + (Math/pow Math/E -0.96) ) (defn deep-merge-with [f & maps] @@ -244,10 +257,15 @@ ;; (. ("."))))))) ) -(defn iter-zip [zipper] - (->> zipper - (iterate zip/next) - (take-while (complement zip/end?)))) +(defn iter-zip + ([zipper] + (->> zipper + (iterate zip/next) + (take-while (complement zip/end?)))) + ([zipper n s] + (->> zipper + (iterate n) + (take-while (complement s))))) (defn iter-nodes [zipper] (->> zipper diff --git a/src/com/owoga/prhyme/util/weighted_rand.clj b/src/com/owoga/prhyme/util/weighted_rand.clj index 871d11f..f0f432b 100644 --- a/src/com/owoga/prhyme/util/weighted_rand.clj +++ b/src/com/owoga/prhyme/util/weighted_rand.clj @@ -97,3 +97,6 @@ (frequencies chosen)) (range (count ws)))] accuracy)) + +(defn weighted-selection-from-map [m] + (first (weighted-selection second (seq m))))