Add nippy for pos and grammar freqs

main
Eric Ihli 4 years ago
parent 75218e770b
commit c193152a79

@ -14,5 +14,5 @@
org.clojure/data.fressian {:mvn/version "1.0.0"}
com.taoensso/nippy {:mvn/version "3.0.0"}
com.taoensso/timbre {:mvn/version "4.10.0"}}
:aliases {:dev {:extra-paths ["test" "examples"]
:aliases {:dev {:extra-paths ["test" "examples" "dev"]
:extra-deps {}}}}

@ -2,6 +2,8 @@
(:require [clojure.string :as string]
[clojure.set]
[clojure.java.io :as io]
[taoensso.nippy :as nippy]
[taoensso.timbre :as timbre]
[com.owoga.prhyme.frp :as frp]
[com.owoga.prhyme.util :as util]
[com.owoga.prhyme.core :as prhyme]
@ -205,53 +207,156 @@
(remove #(some string/blank? %))
(map #(string/join " " %))))))
(defn dark-pos-freqs []
(let [directory "dark-corpus"]
(->> (file-seq (io/file directory))
(remove #(.isDirectory %))
(take 1000)
(map slurp)
(map util/clean-text)
(filter dict/english?)
(map #(string/split % #"\n+"))
(map (remove-sentences-with-words-not-in-dictionary dict/popular))
(remove empty?)
(remove #(some empty? %))
(map nlp/treebank-zipper)
(map nlp/leaf-pos-path-word-freqs)
(apply nlp/deep-merge-with +))))
(defn dark-structures []
(let [directory "dark-corpus"]
(->> (file-seq (io/file directory))
(remove #(.isDirectory %))
(take 500)
(map slurp)
(map util/clean-text)
(filter dict/english?)
(map #(string/split % #"\n+"))
(map (remove-sentences-with-words-not-in-dictionary dict/popular))
(remove empty?)
(remove #(some empty? %))
(map nlp/parse-to-simple-tree)
(map nlp/parse-tree-sans-leaf-words)
(map
(fn [lines]
(map
(fn [line]
(hash-map line 1))
lines)))
(map (partial merge-with +))
flatten
(apply merge-with +))))
(defn pos-path-freqs
"Seq of pos-path frequencies of each document.
To reduce, deep merge with +."
[documents]
(->> documents
(map slurp)
(map util/clean-text)
(filter dict/english?)
(map #(string/split % #"\n+"))
(map (remove-sentences-with-words-not-in-dictionary dict/popular))
(remove empty?)
(remove #(some empty? %))
(map nlp/treebank-zipper)
(map nlp/leaf-pos-path-word-freqs)))
(defn structures
"Seq of structure frequencies of each document.
To reduce, merge with +."
[documents]
(->> documents
(map slurp)
(map util/clean-text)
(filter dict/english?)
(map #(string/split % #"\n+"))
(map (remove-sentences-with-words-not-in-dictionary dict/popular))
(remove empty?)
(remove #(some empty? %))
(map nlp/parse-to-simple-tree)
(map nlp/parse-tree-sans-leaf-words)
(map
(fn [lines]
(map #(hash-map % 1) lines)))
(map (partial apply merge-with +))))
(defn weighted-selection-from-map [m]
(first (weighted-rand/weighted-selection second (seq m))))
(defn chunked-writing-pos-path-freqs
[documents chunk-size]
(let [chunks (range 0 (count documents) chunk-size)]
(run!
(fn [chunk]
(let [structure (->> documents
(drop chunk)
(take chunk-size)
pos-path-freqs
(reduce
(fn [a v]
(nlp/deep-merge-with + a v))
{}))
filepath (format "resources/pos-freqs/%s.nip" chunk)]
(timbre/info (format "Writing to %s." filepath))
(nippy/freeze-to-file filepath structure)))
chunks)))
(defn chunked-writing-structure-freqs
[documents chunk-size]
(let [chunks (range 0 (count documents) chunk-size)]
(run!
(fn [chunk]
(let [structure (->> documents
(drop chunk)
(take chunk-size)
structures
(reduce
(fn [a v]
(nlp/deep-merge-with + a v))
{}))
filepath (format "resources/structure-freqs/%s.nip" chunk)]
(timbre/info (format "Writing to %s." filepath))
(nippy/freeze-to-file filepath structure)))
chunks)))
(defn pos-paths->pos-freqs
"Convert pos paths, like {(TOP S NP NN) {'test' 5 'car' 3 ,,,}}
into a top-level pos freq map like {NN {'test' 25 'car' 8 ,,,}}.is"
[pos-paths]
(->> pos-paths
(map
(fn [[k v]]
(hash-map (last k) v)))
(reduce
(fn [a v]
(nlp/deep-merge-with + a v))
{})))
(comment
(time (def example-pos-freqs (dark-pos-freqs)))
(take 5 darklyrics/darklyrics-markov-2)
(darklyrics/darklyrics-markov-2 '("time" "is"))
(def darkov-2 darklyrics/darklyrics-markov-2)
;; => ([("profanity" "unholy") {"its" 2}]
;; [("ants" "triumph") {nil 1}]
;; [("hiding" "our") {"of" 1, "expose" 3, "above" 1}]
;; [("won't" "intervention") {"divine" 1, "an" 1}]
;; [("pines" "weeping") {"the" 1}])
(def structures (nippy/thaw-from-file "resources/structure-freqs/0.nip"))
(take 100 (reverse (sort-by second structures)))
(let [documents (->> "dark-corpus"
io/file
file-seq
(remove #(.isDirectory %))
(drop 5000)
(take 10000))
chunk-size 5000]
(chunked-writing-pos-path-freqs
documents
chunk-size))
(def t1 (nippy/thaw-from-file "resources/pos-freqs/0.nip"))
(take 10 t1)
(let [path-freqs (pos-paths->pos-freqs t1)]
(take 10 path-freqs))
(time (def example-structures (dark-structures)))
(take 5 t1)
(take 10 (reverse (sort-by #(count (second %)) t1)))
(def t3 (nippy/thaw-from-file "resources/pos-freqs/400.nip"))
(def t2 (nippy/thaw-from-file "resources/pos-freqs/800.nip"))
(count (merge-with + t1 t2 t3))
;; => 2353
(count t3)
;; => 1013
(count t1)
;; => 871
(count t2)
;; => 676 (def corpus
(->> "dark-corpus"
io/file
file-seq
(remove #(.isDirectory %))))
(time
(def example-pos-freqs
(->> corpus
(take 100)
pos-path-freqs
(reduce
(fn [a v]
(nlp/deep-merge-with + a v))
{}))))
(time
(def example-structures
(->> corpus
(take 100)
structures
(reduce
(fn [a v]
(merge-with + a v))
{}))))
(let [structure (weighted-selection-from-map example-structures)]
(repeatedly
@ -262,6 +367,7 @@
example-pos-freqs)
nlp/leaf-nodes
(string/join " ")))))
;; => ("then get your life"
;; "sometimes lie my hand"
;; "still become your chapter"

Binary file not shown.

Binary file not shown.

@ -0,0 +1,2 @@
(ns com.owoga.prhyme.corpus.db
(:require [integrant.core :as ig]))

@ -74,6 +74,56 @@
target
result]))))))
(defn adjust-for-markov-simple-structure
"Like the other adjust-for-markov, but the structure is simply
k/v pairs (element/weight).
Works with a markov data structure that was generated taking into account
sentence boundaries (represented as nils).
A key in the markov structure of '(nil) would have a value that represents all
words that have occurred in position 1 of the raw data.
A key of '(nil \"foo\") would have a value that represents all words
that occurred in position 2 following \"foo\"
Automatically detects the order (window size) of the markov model. Does this
by counting the length of the first key.
"
[markov percent]
(let [markov-n (count (first (first markov)))]
(fn [[words target result]]
(let [key (let [k (map :normalized-word (take markov-n result))]
(reverse
(if (> markov-n (count k))
(concat k (repeat (- markov-n (count k)) nil))
k)))
markov-options (markov key)
markov-option-avg (/ (apply + (vals markov-options))
(max 1 (count markov-options)))]
(if (nil? markov-options)
[words target result]
(let [[markovs non-markovs]
((juxt filter remove)
#(markov-options (:normalized-word %))
words)
weight-non-markovs (apply + (map :weight non-markovs))
target-weight-markovs (- (/ weight-non-markovs (- 1 percent))
weight-non-markovs)
count-markovs (count markovs)
adjustment-markovs (if (= 0 count-markovs) 1 (/ target-weight-markovs count-markovs))]
[(concat
(map
(fn [m]
(let [option (markov-options (:normalized-word m))]
(as-> m m
(assoc m :weight (* (/ option markov-option-avg) adjustment-markovs (:weight m)))
(assoc m :adjustment-for-markov (* (/ option markov-option-avg) adjustment-markovs)))))
markovs)
non-markovs)
target
result]))))))
(defn adjust-for-rhymes
"Weights words by whether or not they rhyme.
Once result contains something, becomes inactive. If you want to try to rhyme

@ -0,0 +1,332 @@
(ns com.owoga.prhyme.nlg.prhyme-nlg
(:require [clojure.zip :as zip]
[clojure.string :as string]
[taoensso.timbre :as timbre]
[examples.core :as examples]
[taoensso.nippy :as nippy]
[com.owoga.prhyme.nlp.core :as nlp]
[com.owoga.prhyme.util.weighted-rand :as weighted-rand]
[clojure.set :as set]))
(defn update-values [m f & args]
(reduce
(fn [acc [k v]]
(assoc acc k (apply f v args)))
{}
m))
(defn generate
[pos-path->word-freqs
pos->word-freqs
target-parse-tree]
(loop [parse-zipper (zip/seq-zip target-parse-tree)]
(cond
(zip/end? parse-zipper) (zip/root parse-zipper)
(zip/branch? parse-zipper)
(recur (zip/next parse-zipper))
(string? (zip/node parse-zipper))
(recur (zip/next parse-zipper))
(and (symbol? (zip/node parse-zipper))
(or (pos->word-freqs (zip/node parse-zipper))
(pos-path->word-freqs (seq (map first (zip/path parse-zipper))))))
(let [target-path (seq (map first (zip/path parse-zipper)))
target-pos (zip/node parse-zipper)
pos-path-word (pos-path->word-freqs target-path)
pos-word (pos->word-freqs target-pos)]
(timbre/info "Choosing POS for" target-path)
(let [selection (weighted-rand/weighted-selection-from-map
(merge-with +
(update-values pos-path-word * 2)
pos-word))]
(timbre/info "from" (take 5
(merge-with +
(update-values pos-path-word * 2)
pos-word)))
(timbre/info "Chose " selection)
(recur
(-> parse-zipper
zip/up
(#(zip/replace % (list (zip/node (zip/down %)) selection)))
zip/next
zip/next))))
:else
(recur (zip/next parse-zipper)))))
(defn next-word [zipper]
(->> zipper
nlp/iter-zip
(filter #(string? (zip/node %)))
first
(#(if (nil? %) :end (zip/node %)))))
(defn next-two-words [nodes]
(timbre/info
(apply list (->> nodes
(map zip/node))))
(->> nodes
(filter #(string? (zip/node %)))
(take 2)
(map #(if (nil? %) :end (zip/node %)))))
(comment
(let [zipper (zip/seq-zip '(TOP (S (NN "Eric") (VBZ "is") (JJ "testing"))))]
(->> zipper
nlp/iter-zip
reverse
next-two-words))
)
(defn set-last [zipper f]
(let [last-node (->> zipper
(iterate zip/next)
(take-while (complement zip/end?))
last
zip/prev)]
(-> last-node
(zip/replace (f last-node)))))
(comment
(let [zipper (zip/seq-zip '(TOP (S (NN) (VBZ "is") (JJ))))]
(-> zipper
(set-last (fn [node] (list (zip/node (zip/next node)) "bad")))
zip/root))
)
(defn generate-with-markov
[pos-path->word-freqs
pos->word-freqs
target-parse-tree
markov]
(loop [parse-zipper (zip/seq-zip target-parse-tree)]
(cond
(zip/end? parse-zipper) (zip/root parse-zipper)
(zip/branch? parse-zipper)
(recur (zip/next parse-zipper))
(string? (zip/node parse-zipper))
(recur (zip/next parse-zipper))
(and (symbol? (zip/node parse-zipper))
(or (pos->word-freqs (zip/node parse-zipper))
(pos-path->word-freqs (seq (map first (zip/path parse-zipper))))))
(let [target-path (seq (map first (zip/path parse-zipper)))
target-pos (zip/node parse-zipper)
pos-path-word (pos-path->word-freqs target-path)
pos-word (pos->word-freqs target-pos)
markov-options (markov (reverse (next-two-words (nlp/iter-zip
parse-zipper
zip/prev
nil?))))]
(timbre/info "Markov options are"
(apply list (next-two-words (nlp/iter-zip
parse-zipper
zip/prev
nil?)))
(apply list (take 3 markov-options)))
(timbre/info "Choosing POS for" target-path)
(let [selection (weighted-rand/weighted-selection-from-map
(merge-with
*
(update-values markov-options * 10)
(update-values pos-path-word * 2)
pos-word))]
(timbre/info "from" (apply
list
(take
5
(merge-with
*
(update-values markov-options * 10)
(update-values pos-path-word * 2)
pos-word))))
(timbre/info "Chose " selection)
(recur
(-> parse-zipper
zip/up
(#(zip/replace % (list (zip/node (zip/down %)) selection)))
zip/next
zip/next))))
:else
(recur (zip/next parse-zipper)))))
(defn generate-with-markov-with-custom-progression
"Sams as above, but with next/prev and stop fns"
[next
prev
next-stop?
prev-stop?
pos-path->word-freqs
pos->word-freqs
parse-zipper
markov]
(loop [parse-zipper parse-zipper]
(cond
(nil? (next parse-zipper)) (zip/root parse-zipper)
(next-stop? parse-zipper) (zip/root parse-zipper)
(zip/branch? parse-zipper)
(recur (next parse-zipper))
(string? (zip/node parse-zipper))
(recur (next parse-zipper))
(and (symbol? (zip/node parse-zipper))
(or (pos->word-freqs (zip/node parse-zipper))
(pos-path->word-freqs (seq (map first (zip/path parse-zipper))))))
(let [target-path (seq (map first (zip/path parse-zipper)))
target-pos (zip/node parse-zipper)
pos-path-word (pos-path->word-freqs target-path)
pos-word (pos->word-freqs target-pos)
pos-map (merge-with
(fn [a b] (* 1.5 (+ a b)))
pos-path-word
pos-word)
markov-options (markov (reverse
(next-two-words (nlp/iter-zip
parse-zipper
prev
prev-stop?))))
selection-possibilities (merge-with
(fn [a b]
(let [max-pos (apply max (vals pos-map))]
(+ a b max-pos)))
pos-map
markov-options)]
(timbre/info "Markov options are"
(apply list (next-two-words (nlp/iter-zip
parse-zipper
prev
prev-stop?)))
(apply list (take 10 markov-options)))
(timbre/info "Choosing POS for" target-path)
(let [selection (weighted-rand/weighted-selection-from-map
selection-possibilities)]
(timbre/info
"Most likely selection possibilities"
(apply list (take 5 (reverse (sort-by second selection-possibilities)))))
(timbre/info "Chose " selection)
(recur
(-> parse-zipper
zip/up
(#(zip/replace % (list (zip/node (zip/down %)) selection)))
zip/down
next
next))))
:else
(recur (next parse-zipper)))))
(comment
(let [structure '(TOP (S (NP (DT) (JJ) (NN))
(VP (VBZ))
(NP (DT) (JJ) (NN))))
structure (-> structure
zip/seq-zip
nlp/iter-zip
last)
pos-freqs (examples/pos-paths->pos-freqs
examples/t1)]
(repeatedly
10
(fn []
(->> (generate-with-markov-with-custom-progression
zip/prev
zip/next
nil?
zip/end?
examples/t1
pos-freqs
structure
examples/darkov-2)
nlp/leaf-nodes
(string/join " ")))))
(timbre/set-level! :info)
(timbre/set-level! :error)
(let [pos-path->word-freqs
{'(S N) {"Eric" 1 "Edgar" 2}
'(S V) {"tests" 2 "runs" 1}}
pos->word-freqs
{'N {"Edward" 1}
'V {"breaks" 1}}
target-parse-tree
'(S (N) (V))]
(-> (generate
pos-path->word-freqs
pos->word-freqs
target-parse-tree)))
(time (def example-pos-freqs examples/example-pos-freqs))
(nippy/thaw)
(nippy/freeze-to-file "resources/1000-pos-path-freqs.nip" example-pos-freqs)
(time (def example-structures examples/example-structures))
(weighted-rand/weighted-selection-from-map
example-structures)
(take 5 examples/t2)
(let [structure '(TOP (S (NP (DT) (JJ) (NN))
(VP (VBZ))
(NP (DT) (JJ) (NN))))
structure (-> structure
zip/seq-zip
nlp/iter-zip
last)
pos-freqs (examples/pos-paths->pos-freqs
examples/t1)]
(repeatedly
10
(fn []
(->> (generate-with-markov-with-custom-progression
zip/prev
zip/next
nil?
zip/end?
examples/t1
pos-freqs
structure
examples/darkov-2)
nlp/leaf-nodes
(string/join " ")))))
(repeatedly
10
(fn []
(let [structure (weighted-rand/weighted-selection-from-map
(->> examples/t2
(sort-by second)
(reverse)
(take 20)))
structure (-> structure
zip/seq-zip
nlp/iter-zip
last)
pos-freqs (examples/pos-paths->pos-freqs
examples/t1)]
(repeatedly
10
(fn []
(->> (generate-with-markov-with-custom-progression
zip/prev
zip/next
nil?
zip/end?
examples/t1
pos-freqs
structure
examples/darkov-2)
nlp/leaf-nodes
(string/join " ")))))))
)

@ -49,6 +49,14 @@
(top-k-sequences prhyme-pos-tagger (tokenize phrase))))
;; => ([["DT" "NN" "VBZ" "."] (0.9758878 0.93964833 0.7375927 0.95285994)]
;; [["DT" "VBG" "VBZ" "."] (0.9758878 0.03690145 0.27251 0.9286113)])
(let [phrase "I am feeling the heat."]
(map (juxt #(.getOutcomes %)
#(map float (.getProbs %)))
(top-k-sequences prhyme-pos-tagger (tokenize phrase))))
;; => ([["PRP" "VBP" "VBG" "DT" "NN" "."]
;; (0.9800125 0.9771906 0.9722519 0.9709216 0.9941198 0.98704773)]
;; [["PRP" "VBP" "NN" "DT" "NN" "."]
;; (0.9800125 0.9771906 0.01259052 0.76849043 0.99447477 0.98704773)])
)
;;;; Custom parser to get access to top N parses
@ -84,7 +92,12 @@
(apply map vector ((juxt parse-strs parse-probs) parses))))
(comment
(parse-top-n "." 3)
(let [phrase "The feeling hurts."]
(->> phrase
tokenize
(string/join " ")
(#(parse-top-n % 10))))
(Math/pow Math/E -0.96)
)
(defn deep-merge-with [f & maps]
@ -244,10 +257,15 @@
;; (. (".")))))))
)
(defn iter-zip [zipper]
(->> zipper
(iterate zip/next)
(take-while (complement zip/end?))))
(defn iter-zip
([zipper]
(->> zipper
(iterate zip/next)
(take-while (complement zip/end?))))
([zipper n s]
(->> zipper
(iterate n)
(take-while (complement s)))))
(defn iter-nodes [zipper]
(->> zipper

@ -97,3 +97,6 @@
(frequencies chosen))
(range (count ws)))]
accuracy))
(defn weighted-selection-from-map [m]
(first (weighted-selection second (seq m))))

Loading…
Cancel
Save