Add nippy for pos and grammar freqs

main
Eric Ihli 4 years ago
parent 75218e770b
commit c193152a79

@ -14,5 +14,5 @@
org.clojure/data.fressian {:mvn/version "1.0.0"} org.clojure/data.fressian {:mvn/version "1.0.0"}
com.taoensso/nippy {:mvn/version "3.0.0"} com.taoensso/nippy {:mvn/version "3.0.0"}
com.taoensso/timbre {:mvn/version "4.10.0"}} com.taoensso/timbre {:mvn/version "4.10.0"}}
:aliases {:dev {:extra-paths ["test" "examples"] :aliases {:dev {:extra-paths ["test" "examples" "dev"]
:extra-deps {}}}} :extra-deps {}}}}

@ -2,6 +2,8 @@
(:require [clojure.string :as string] (:require [clojure.string :as string]
[clojure.set] [clojure.set]
[clojure.java.io :as io] [clojure.java.io :as io]
[taoensso.nippy :as nippy]
[taoensso.timbre :as timbre]
[com.owoga.prhyme.frp :as frp] [com.owoga.prhyme.frp :as frp]
[com.owoga.prhyme.util :as util] [com.owoga.prhyme.util :as util]
[com.owoga.prhyme.core :as prhyme] [com.owoga.prhyme.core :as prhyme]
@ -205,53 +207,156 @@
(remove #(some string/blank? %)) (remove #(some string/blank? %))
(map #(string/join " " %)))))) (map #(string/join " " %))))))
(defn dark-pos-freqs [] (defn pos-path-freqs
(let [directory "dark-corpus"] "Seq of pos-path frequencies of each document.
(->> (file-seq (io/file directory)) To reduce, deep merge with +."
(remove #(.isDirectory %)) [documents]
(take 1000) (->> documents
(map slurp) (map slurp)
(map util/clean-text) (map util/clean-text)
(filter dict/english?) (filter dict/english?)
(map #(string/split % #"\n+")) (map #(string/split % #"\n+"))
(map (remove-sentences-with-words-not-in-dictionary dict/popular)) (map (remove-sentences-with-words-not-in-dictionary dict/popular))
(remove empty?) (remove empty?)
(remove #(some empty? %)) (remove #(some empty? %))
(map nlp/treebank-zipper) (map nlp/treebank-zipper)
(map nlp/leaf-pos-path-word-freqs) (map nlp/leaf-pos-path-word-freqs)))
(apply nlp/deep-merge-with +))))
(defn structures
(defn dark-structures [] "Seq of structure frequencies of each document.
(let [directory "dark-corpus"] To reduce, merge with +."
(->> (file-seq (io/file directory)) [documents]
(remove #(.isDirectory %)) (->> documents
(take 500) (map slurp)
(map slurp) (map util/clean-text)
(map util/clean-text) (filter dict/english?)
(filter dict/english?) (map #(string/split % #"\n+"))
(map #(string/split % #"\n+")) (map (remove-sentences-with-words-not-in-dictionary dict/popular))
(map (remove-sentences-with-words-not-in-dictionary dict/popular)) (remove empty?)
(remove empty?) (remove #(some empty? %))
(remove #(some empty? %)) (map nlp/parse-to-simple-tree)
(map nlp/parse-to-simple-tree) (map nlp/parse-tree-sans-leaf-words)
(map nlp/parse-tree-sans-leaf-words) (map
(map (fn [lines]
(fn [lines] (map #(hash-map % 1) lines)))
(map (map (partial apply merge-with +))))
(fn [line]
(hash-map line 1))
lines)))
(map (partial merge-with +))
flatten
(apply merge-with +))))
(defn weighted-selection-from-map [m] (defn weighted-selection-from-map [m]
(first (weighted-rand/weighted-selection second (seq m)))) (first (weighted-rand/weighted-selection second (seq m))))
(defn chunked-writing-pos-path-freqs
[documents chunk-size]
(let [chunks (range 0 (count documents) chunk-size)]
(run!
(fn [chunk]
(let [structure (->> documents
(drop chunk)
(take chunk-size)
pos-path-freqs
(reduce
(fn [a v]
(nlp/deep-merge-with + a v))
{}))
filepath (format "resources/pos-freqs/%s.nip" chunk)]
(timbre/info (format "Writing to %s." filepath))
(nippy/freeze-to-file filepath structure)))
chunks)))
(defn chunked-writing-structure-freqs
[documents chunk-size]
(let [chunks (range 0 (count documents) chunk-size)]
(run!
(fn [chunk]
(let [structure (->> documents
(drop chunk)
(take chunk-size)
structures
(reduce
(fn [a v]
(nlp/deep-merge-with + a v))
{}))
filepath (format "resources/structure-freqs/%s.nip" chunk)]
(timbre/info (format "Writing to %s." filepath))
(nippy/freeze-to-file filepath structure)))
chunks)))
(defn pos-paths->pos-freqs
"Convert pos paths, like {(TOP S NP NN) {'test' 5 'car' 3 ,,,}}
into a top-level pos freq map like {NN {'test' 25 'car' 8 ,,,}}.is"
[pos-paths]
(->> pos-paths
(map
(fn [[k v]]
(hash-map (last k) v)))
(reduce
(fn [a v]
(nlp/deep-merge-with + a v))
{})))
(comment (comment
(time (def example-pos-freqs (dark-pos-freqs))) (take 5 darklyrics/darklyrics-markov-2)
(darklyrics/darklyrics-markov-2 '("time" "is"))
(def darkov-2 darklyrics/darklyrics-markov-2)
;; => ([("profanity" "unholy") {"its" 2}]
;; [("ants" "triumph") {nil 1}]
;; [("hiding" "our") {"of" 1, "expose" 3, "above" 1}]
;; [("won't" "intervention") {"divine" 1, "an" 1}]
;; [("pines" "weeping") {"the" 1}])
(def structures (nippy/thaw-from-file "resources/structure-freqs/0.nip"))
(take 100 (reverse (sort-by second structures)))
(let [documents (->> "dark-corpus"
io/file
file-seq
(remove #(.isDirectory %))
(drop 5000)
(take 10000))
chunk-size 5000]
(chunked-writing-pos-path-freqs
documents
chunk-size))
(def t1 (nippy/thaw-from-file "resources/pos-freqs/0.nip"))
(take 10 t1)
(let [path-freqs (pos-paths->pos-freqs t1)]
(take 10 path-freqs))
(time (def example-structures (dark-structures))) (take 5 t1)
(take 10 (reverse (sort-by #(count (second %)) t1)))
(def t3 (nippy/thaw-from-file "resources/pos-freqs/400.nip"))
(def t2 (nippy/thaw-from-file "resources/pos-freqs/800.nip"))
(count (merge-with + t1 t2 t3))
;; => 2353
(count t3)
;; => 1013
(count t1)
;; => 871
(count t2)
;; => 676 (def corpus
(->> "dark-corpus"
io/file
file-seq
(remove #(.isDirectory %))))
(time
(def example-pos-freqs
(->> corpus
(take 100)
pos-path-freqs
(reduce
(fn [a v]
(nlp/deep-merge-with + a v))
{}))))
(time
(def example-structures
(->> corpus
(take 100)
structures
(reduce
(fn [a v]
(merge-with + a v))
{}))))
(let [structure (weighted-selection-from-map example-structures)] (let [structure (weighted-selection-from-map example-structures)]
(repeatedly (repeatedly
@ -262,6 +367,7 @@
example-pos-freqs) example-pos-freqs)
nlp/leaf-nodes nlp/leaf-nodes
(string/join " "))))) (string/join " ")))))
;; => ("then get your life" ;; => ("then get your life"
;; "sometimes lie my hand" ;; "sometimes lie my hand"
;; "still become your chapter" ;; "still become your chapter"

Binary file not shown.

Binary file not shown.

@ -0,0 +1,2 @@
(ns com.owoga.prhyme.corpus.db
(:require [integrant.core :as ig]))

@ -74,6 +74,56 @@
target target
result])))))) result]))))))
(defn adjust-for-markov-simple-structure
"Like the other adjust-for-markov, but the structure is simply
k/v pairs (element/weight).
Works with a markov data structure that was generated taking into account
sentence boundaries (represented as nils).
A key in the markov structure of '(nil) would have a value that represents all
words that have occurred in position 1 of the raw data.
A key of '(nil \"foo\") would have a value that represents all words
that occurred in position 2 following \"foo\"
Automatically detects the order (window size) of the markov model. Does this
by counting the length of the first key.
"
[markov percent]
(let [markov-n (count (first (first markov)))]
(fn [[words target result]]
(let [key (let [k (map :normalized-word (take markov-n result))]
(reverse
(if (> markov-n (count k))
(concat k (repeat (- markov-n (count k)) nil))
k)))
markov-options (markov key)
markov-option-avg (/ (apply + (vals markov-options))
(max 1 (count markov-options)))]
(if (nil? markov-options)
[words target result]
(let [[markovs non-markovs]
((juxt filter remove)
#(markov-options (:normalized-word %))
words)
weight-non-markovs (apply + (map :weight non-markovs))
target-weight-markovs (- (/ weight-non-markovs (- 1 percent))
weight-non-markovs)
count-markovs (count markovs)
adjustment-markovs (if (= 0 count-markovs) 1 (/ target-weight-markovs count-markovs))]
[(concat
(map
(fn [m]
(let [option (markov-options (:normalized-word m))]
(as-> m m
(assoc m :weight (* (/ option markov-option-avg) adjustment-markovs (:weight m)))
(assoc m :adjustment-for-markov (* (/ option markov-option-avg) adjustment-markovs)))))
markovs)
non-markovs)
target
result]))))))
(defn adjust-for-rhymes (defn adjust-for-rhymes
"Weights words by whether or not they rhyme. "Weights words by whether or not they rhyme.
Once result contains something, becomes inactive. If you want to try to rhyme Once result contains something, becomes inactive. If you want to try to rhyme

@ -0,0 +1,332 @@
(ns com.owoga.prhyme.nlg.prhyme-nlg
(:require [clojure.zip :as zip]
[clojure.string :as string]
[taoensso.timbre :as timbre]
[examples.core :as examples]
[taoensso.nippy :as nippy]
[com.owoga.prhyme.nlp.core :as nlp]
[com.owoga.prhyme.util.weighted-rand :as weighted-rand]
[clojure.set :as set]))
(defn update-values [m f & args]
(reduce
(fn [acc [k v]]
(assoc acc k (apply f v args)))
{}
m))
(defn generate
[pos-path->word-freqs
pos->word-freqs
target-parse-tree]
(loop [parse-zipper (zip/seq-zip target-parse-tree)]
(cond
(zip/end? parse-zipper) (zip/root parse-zipper)
(zip/branch? parse-zipper)
(recur (zip/next parse-zipper))
(string? (zip/node parse-zipper))
(recur (zip/next parse-zipper))
(and (symbol? (zip/node parse-zipper))
(or (pos->word-freqs (zip/node parse-zipper))
(pos-path->word-freqs (seq (map first (zip/path parse-zipper))))))
(let [target-path (seq (map first (zip/path parse-zipper)))
target-pos (zip/node parse-zipper)
pos-path-word (pos-path->word-freqs target-path)
pos-word (pos->word-freqs target-pos)]
(timbre/info "Choosing POS for" target-path)
(let [selection (weighted-rand/weighted-selection-from-map
(merge-with +
(update-values pos-path-word * 2)
pos-word))]
(timbre/info "from" (take 5
(merge-with +
(update-values pos-path-word * 2)
pos-word)))
(timbre/info "Chose " selection)
(recur
(-> parse-zipper
zip/up
(#(zip/replace % (list (zip/node (zip/down %)) selection)))
zip/next
zip/next))))
:else
(recur (zip/next parse-zipper)))))
(defn next-word [zipper]
(->> zipper
nlp/iter-zip
(filter #(string? (zip/node %)))
first
(#(if (nil? %) :end (zip/node %)))))
(defn next-two-words [nodes]
(timbre/info
(apply list (->> nodes
(map zip/node))))
(->> nodes
(filter #(string? (zip/node %)))
(take 2)
(map #(if (nil? %) :end (zip/node %)))))
(comment
(let [zipper (zip/seq-zip '(TOP (S (NN "Eric") (VBZ "is") (JJ "testing"))))]
(->> zipper
nlp/iter-zip
reverse
next-two-words))
)
(defn set-last [zipper f]
(let [last-node (->> zipper
(iterate zip/next)
(take-while (complement zip/end?))
last
zip/prev)]
(-> last-node
(zip/replace (f last-node)))))
(comment
(let [zipper (zip/seq-zip '(TOP (S (NN) (VBZ "is") (JJ))))]
(-> zipper
(set-last (fn [node] (list (zip/node (zip/next node)) "bad")))
zip/root))
)
(defn generate-with-markov
[pos-path->word-freqs
pos->word-freqs
target-parse-tree
markov]
(loop [parse-zipper (zip/seq-zip target-parse-tree)]
(cond
(zip/end? parse-zipper) (zip/root parse-zipper)
(zip/branch? parse-zipper)
(recur (zip/next parse-zipper))
(string? (zip/node parse-zipper))
(recur (zip/next parse-zipper))
(and (symbol? (zip/node parse-zipper))
(or (pos->word-freqs (zip/node parse-zipper))
(pos-path->word-freqs (seq (map first (zip/path parse-zipper))))))
(let [target-path (seq (map first (zip/path parse-zipper)))
target-pos (zip/node parse-zipper)
pos-path-word (pos-path->word-freqs target-path)
pos-word (pos->word-freqs target-pos)
markov-options (markov (reverse (next-two-words (nlp/iter-zip
parse-zipper
zip/prev
nil?))))]
(timbre/info "Markov options are"
(apply list (next-two-words (nlp/iter-zip
parse-zipper
zip/prev
nil?)))
(apply list (take 3 markov-options)))
(timbre/info "Choosing POS for" target-path)
(let [selection (weighted-rand/weighted-selection-from-map
(merge-with
*
(update-values markov-options * 10)
(update-values pos-path-word * 2)
pos-word))]
(timbre/info "from" (apply
list
(take
5
(merge-with
*
(update-values markov-options * 10)
(update-values pos-path-word * 2)
pos-word))))
(timbre/info "Chose " selection)
(recur
(-> parse-zipper
zip/up
(#(zip/replace % (list (zip/node (zip/down %)) selection)))
zip/next
zip/next))))
:else
(recur (zip/next parse-zipper)))))
(defn generate-with-markov-with-custom-progression
"Sams as above, but with next/prev and stop fns"
[next
prev
next-stop?
prev-stop?
pos-path->word-freqs
pos->word-freqs
parse-zipper
markov]
(loop [parse-zipper parse-zipper]
(cond
(nil? (next parse-zipper)) (zip/root parse-zipper)
(next-stop? parse-zipper) (zip/root parse-zipper)
(zip/branch? parse-zipper)
(recur (next parse-zipper))
(string? (zip/node parse-zipper))
(recur (next parse-zipper))
(and (symbol? (zip/node parse-zipper))
(or (pos->word-freqs (zip/node parse-zipper))
(pos-path->word-freqs (seq (map first (zip/path parse-zipper))))))
(let [target-path (seq (map first (zip/path parse-zipper)))
target-pos (zip/node parse-zipper)
pos-path-word (pos-path->word-freqs target-path)
pos-word (pos->word-freqs target-pos)
pos-map (merge-with
(fn [a b] (* 1.5 (+ a b)))
pos-path-word
pos-word)
markov-options (markov (reverse
(next-two-words (nlp/iter-zip
parse-zipper
prev
prev-stop?))))
selection-possibilities (merge-with
(fn [a b]
(let [max-pos (apply max (vals pos-map))]
(+ a b max-pos)))
pos-map
markov-options)]
(timbre/info "Markov options are"
(apply list (next-two-words (nlp/iter-zip
parse-zipper
prev
prev-stop?)))
(apply list (take 10 markov-options)))
(timbre/info "Choosing POS for" target-path)
(let [selection (weighted-rand/weighted-selection-from-map
selection-possibilities)]
(timbre/info
"Most likely selection possibilities"
(apply list (take 5 (reverse (sort-by second selection-possibilities)))))
(timbre/info "Chose " selection)
(recur
(-> parse-zipper
zip/up
(#(zip/replace % (list (zip/node (zip/down %)) selection)))
zip/down
next
next))))
:else
(recur (next parse-zipper)))))
(comment
(let [structure '(TOP (S (NP (DT) (JJ) (NN))
(VP (VBZ))
(NP (DT) (JJ) (NN))))
structure (-> structure
zip/seq-zip
nlp/iter-zip
last)
pos-freqs (examples/pos-paths->pos-freqs
examples/t1)]
(repeatedly
10
(fn []
(->> (generate-with-markov-with-custom-progression
zip/prev
zip/next
nil?
zip/end?
examples/t1
pos-freqs
structure
examples/darkov-2)
nlp/leaf-nodes
(string/join " ")))))
(timbre/set-level! :info)
(timbre/set-level! :error)
(let [pos-path->word-freqs
{'(S N) {"Eric" 1 "Edgar" 2}
'(S V) {"tests" 2 "runs" 1}}
pos->word-freqs
{'N {"Edward" 1}
'V {"breaks" 1}}
target-parse-tree
'(S (N) (V))]
(-> (generate
pos-path->word-freqs
pos->word-freqs
target-parse-tree)))
(time (def example-pos-freqs examples/example-pos-freqs))
(nippy/thaw)
(nippy/freeze-to-file "resources/1000-pos-path-freqs.nip" example-pos-freqs)
(time (def example-structures examples/example-structures))
(weighted-rand/weighted-selection-from-map
example-structures)
(take 5 examples/t2)
(let [structure '(TOP (S (NP (DT) (JJ) (NN))
(VP (VBZ))
(NP (DT) (JJ) (NN))))
structure (-> structure
zip/seq-zip
nlp/iter-zip
last)
pos-freqs (examples/pos-paths->pos-freqs
examples/t1)]
(repeatedly
10
(fn []
(->> (generate-with-markov-with-custom-progression
zip/prev
zip/next
nil?
zip/end?
examples/t1
pos-freqs
structure
examples/darkov-2)
nlp/leaf-nodes
(string/join " ")))))
(repeatedly
10
(fn []
(let [structure (weighted-rand/weighted-selection-from-map
(->> examples/t2
(sort-by second)
(reverse)
(take 20)))
structure (-> structure
zip/seq-zip
nlp/iter-zip
last)
pos-freqs (examples/pos-paths->pos-freqs
examples/t1)]
(repeatedly
10
(fn []
(->> (generate-with-markov-with-custom-progression
zip/prev
zip/next
nil?
zip/end?
examples/t1
pos-freqs
structure
examples/darkov-2)
nlp/leaf-nodes
(string/join " ")))))))
)

@ -49,6 +49,14 @@
(top-k-sequences prhyme-pos-tagger (tokenize phrase)))) (top-k-sequences prhyme-pos-tagger (tokenize phrase))))
;; => ([["DT" "NN" "VBZ" "."] (0.9758878 0.93964833 0.7375927 0.95285994)] ;; => ([["DT" "NN" "VBZ" "."] (0.9758878 0.93964833 0.7375927 0.95285994)]
;; [["DT" "VBG" "VBZ" "."] (0.9758878 0.03690145 0.27251 0.9286113)]) ;; [["DT" "VBG" "VBZ" "."] (0.9758878 0.03690145 0.27251 0.9286113)])
(let [phrase "I am feeling the heat."]
(map (juxt #(.getOutcomes %)
#(map float (.getProbs %)))
(top-k-sequences prhyme-pos-tagger (tokenize phrase))))
;; => ([["PRP" "VBP" "VBG" "DT" "NN" "."]
;; (0.9800125 0.9771906 0.9722519 0.9709216 0.9941198 0.98704773)]
;; [["PRP" "VBP" "NN" "DT" "NN" "."]
;; (0.9800125 0.9771906 0.01259052 0.76849043 0.99447477 0.98704773)])
) )
;;;; Custom parser to get access to top N parses ;;;; Custom parser to get access to top N parses
@ -84,7 +92,12 @@
(apply map vector ((juxt parse-strs parse-probs) parses)))) (apply map vector ((juxt parse-strs parse-probs) parses))))
(comment (comment
(parse-top-n "." 3) (let [phrase "The feeling hurts."]
(->> phrase
tokenize
(string/join " ")
(#(parse-top-n % 10))))
(Math/pow Math/E -0.96)
) )
(defn deep-merge-with [f & maps] (defn deep-merge-with [f & maps]
@ -244,10 +257,15 @@
;; (. ("."))))))) ;; (. (".")))))))
) )
(defn iter-zip [zipper] (defn iter-zip
(->> zipper ([zipper]
(iterate zip/next) (->> zipper
(take-while (complement zip/end?)))) (iterate zip/next)
(take-while (complement zip/end?))))
([zipper n s]
(->> zipper
(iterate n)
(take-while (complement s)))))
(defn iter-nodes [zipper] (defn iter-nodes [zipper]
(->> zipper (->> zipper

@ -97,3 +97,6 @@
(frequencies chosen)) (frequencies chosen))
(range (count ws)))] (range (count ws)))]
accuracy)) accuracy))
(defn weighted-selection-from-map [m]
(first (weighted-selection second (seq m))))

Loading…
Cancel
Save